Repository: Shenyi-Z/ToCa Branch: main Commit: e84096ffd85a Files: 713 Total size: 6.6 MB Directory structure: gitextract_nz69m5ai/ ├── COCO_caption_prompts_30k.txt ├── DiT-ToCa/ │ ├── cache_functions/ │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── cache_cutfresh.py │ │ ├── cache_init.py │ │ ├── cal_type.py │ │ ├── force_init.py │ │ ├── force_scheduler.py │ │ ├── fresh_ratio_scheduler.py │ │ ├── global_force_fresh.py │ │ ├── score_evaluate.py │ │ ├── scores.py │ │ ├── token_merge.py │ │ └── update_cache.py │ ├── diffusion/ │ │ ├── __init__.py │ │ ├── diffusion_utils.py │ │ ├── gaussian_diffusion.py │ │ ├── respace.py │ │ └── timestep_sampler.py │ ├── download.py │ ├── environment-dit.yml │ ├── models.py │ ├── sample.py │ ├── sample_ddp.py │ └── train.py ├── DrawBench200.txt ├── LICENSE ├── Open-Sora/ │ ├── Dockerfile │ ├── LICENSE │ ├── README.md │ ├── assets/ │ │ └── texts/ │ │ ├── VBench/ │ │ │ ├── all_category.txt │ │ │ ├── all_dimension.txt │ │ │ ├── all_i2v.txt │ │ │ ├── prompts_per_category/ │ │ │ │ ├── animal.txt │ │ │ │ ├── architecture.txt │ │ │ │ ├── food.txt │ │ │ │ ├── human.txt │ │ │ │ ├── lifestyle.txt │ │ │ │ ├── plant.txt │ │ │ │ ├── scenery.txt │ │ │ │ └── vehicles.txt │ │ │ └── prompts_per_dimension/ │ │ │ ├── appearance_style.txt │ │ │ ├── color.txt │ │ │ ├── human_action.txt │ │ │ ├── multiple_objects.txt │ │ │ ├── object_class.txt │ │ │ ├── overall_consistency.txt │ │ │ ├── scene.txt │ │ │ ├── spatial_relationship.txt │ │ │ ├── subject_consistency.txt │ │ │ ├── temporal_flickering.txt │ │ │ └── temporal_style.txt │ │ ├── imagenet_id.txt │ │ ├── imagenet_labels.txt │ │ ├── rand_types.txt │ │ ├── t2i_samples.txt │ │ ├── t2i_sigma.txt │ │ ├── t2v_car.txt │ │ ├── t2v_latte.txt │ │ ├── t2v_pllava.txt │ │ ├── t2v_ref.txt │ │ ├── t2v_samples.txt │ │ ├── t2v_short.txt │ │ ├── t2v_sora.txt │ │ ├── ucf101_id.txt │ │ └── ucf101_labels.txt │ ├── build/ │ │ └── lib/ │ │ ├── opensora/ │ │ │ ├── acceleration/ │ │ │ │ ├── __init__.py │ │ │ │ ├── 
checkpoint.py │ │ │ │ ├── communications.py │ │ │ │ ├── parallel_states.py │ │ │ │ ├── plugin.py │ │ │ │ └── shardformer/ │ │ │ │ ├── __init__.py │ │ │ │ ├── modeling/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── t5.py │ │ │ │ └── policy/ │ │ │ │ ├── __init__.py │ │ │ │ └── t5_encoder.py │ │ │ ├── datasets/ │ │ │ │ ├── __init__.py │ │ │ │ ├── aspect.py │ │ │ │ ├── bucket.py │ │ │ │ ├── dataloader.py │ │ │ │ ├── datasets.py │ │ │ │ ├── read_video.py │ │ │ │ ├── sampler.py │ │ │ │ ├── utils.py │ │ │ │ └── video_transforms.py │ │ │ └── models/ │ │ │ ├── cache_functions/ │ │ │ │ ├── __init__.py │ │ │ │ ├── attention.py │ │ │ │ ├── cache_cutfresh.py │ │ │ │ ├── cache_init.py │ │ │ │ ├── force_init.py │ │ │ │ ├── force_scheduler.py │ │ │ │ ├── fresh_ratio_scheduler.py │ │ │ │ ├── global_force_fresh.py │ │ │ │ ├── score_evaluate.py │ │ │ │ ├── scores.py │ │ │ │ ├── token_merge.py │ │ │ │ └── update_cache.py │ │ │ ├── dit/ │ │ │ │ ├── __init__.py │ │ │ │ └── dit.py │ │ │ ├── latte/ │ │ │ │ ├── __init__.py │ │ │ │ └── latte.py │ │ │ ├── layers/ │ │ │ │ ├── __init__.py │ │ │ │ └── blocks.py │ │ │ ├── pixart/ │ │ │ │ ├── __init__.py │ │ │ │ ├── pixart.py │ │ │ │ └── pixart_sigma.py │ │ │ ├── stdit/ │ │ │ │ ├── __init__.py │ │ │ │ ├── stdit.py │ │ │ │ ├── stdit2.py │ │ │ │ ├── stdit3 copy.py │ │ │ │ └── stdit3.py │ │ │ └── text_encoder/ │ │ │ ├── __init__.py │ │ │ ├── classes.py │ │ │ ├── clip.py │ │ │ └── t5.py │ │ ├── tools/ │ │ │ ├── caption/ │ │ │ │ ├── __init__.py │ │ │ │ ├── acceleration/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── llava/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── policies/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── llama.py │ │ │ │ │ └── mistral.py │ │ │ │ ├── camera_motion/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── camera_motion.py │ │ │ │ │ ├── detect.py │ │ │ │ │ ├── utils.py │ │ │ │ │ └── visualizer.py │ │ │ │ ├── camera_motion_detect.py │ │ │ │ ├── caption_gpt4.py │ │ │ │ ├── caption_llama3.py │ │ │ │ ├── caption_llava.py │ │ │ │ └── utils.py │ │ │ ├── datasets/ │ 
│ │ │ ├── __init__.py │ │ │ │ ├── analyze.py │ │ │ │ ├── convert.py │ │ │ │ ├── datautil.py │ │ │ │ ├── filter_panda10m.py │ │ │ │ ├── split.py │ │ │ │ ├── transform.py │ │ │ │ └── utils.py │ │ │ ├── frame_interpolation/ │ │ │ │ ├── __init__.py │ │ │ │ ├── interpolation.py │ │ │ │ ├── networks/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── amt_g.py │ │ │ │ │ └── blocks/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── feat_enc.py │ │ │ │ │ ├── ifrnet.py │ │ │ │ │ ├── multi_flow.py │ │ │ │ │ └── raft.py │ │ │ │ └── utils/ │ │ │ │ ├── __init__.py │ │ │ │ ├── dist_utils.py │ │ │ │ ├── flow_utils.py │ │ │ │ └── utils.py │ │ │ ├── scene_cut/ │ │ │ │ ├── __init__.py │ │ │ │ ├── convert_id_to_path.py │ │ │ │ ├── cut.py │ │ │ │ └── scene_detect.py │ │ │ └── scoring/ │ │ │ ├── aesthetic/ │ │ │ │ ├── __init__.py │ │ │ │ └── inference.py │ │ │ └── matching/ │ │ │ ├── __init__.py │ │ │ └── inference.py │ │ ├── vbench/ │ │ │ ├── __init__.py │ │ │ ├── aesthetic_quality.py │ │ │ ├── appearance_style.py │ │ │ ├── background_consistency.py │ │ │ ├── cli/ │ │ │ │ ├── __init__.py │ │ │ │ ├── evaluate.py │ │ │ │ ├── static_filter.py │ │ │ │ └── vbench.py │ │ │ ├── color.py │ │ │ ├── dynamic_degree.py │ │ │ ├── human_action.py │ │ │ ├── imaging_quality.py │ │ │ ├── motion_smoothness.py │ │ │ ├── multiple_objects.py │ │ │ ├── object_class.py │ │ │ ├── overall_consistency.py │ │ │ ├── scene.py │ │ │ ├── spatial_relationship.py │ │ │ ├── subject_consistency.py │ │ │ ├── temporal_flickering.py │ │ │ ├── temporal_style.py │ │ │ ├── third_pary/ │ │ │ │ ├── 0.txt │ │ │ │ ├── RAFT/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── core/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── corr.py │ │ │ │ │ ├── datasets.py │ │ │ │ │ ├── extractor.py │ │ │ │ │ ├── raft.py │ │ │ │ │ ├── update.py │ │ │ │ │ └── utils_core/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── augmentor.py │ │ │ │ │ ├── flow_viz.py │ │ │ │ │ ├── frame_utils.py │ │ │ │ │ └── utils.py │ │ │ │ ├── ViCLIP/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── simple_tokenizer.py │ │ 
│ │ │ ├── viclip.py │ │ │ │ │ ├── viclip_text.py │ │ │ │ │ └── viclip_vision.py │ │ │ │ ├── __init__.py │ │ │ │ ├── amt/ │ │ │ │ │ ├── benchmarks/ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── adobe240.py │ │ │ │ │ │ ├── gopro.py │ │ │ │ │ │ ├── snu_film.py │ │ │ │ │ │ ├── speed_parameters.py │ │ │ │ │ │ ├── ucf101.py │ │ │ │ │ │ ├── vimeo90k.py │ │ │ │ │ │ ├── vimeo90k_tta.py │ │ │ │ │ │ └── xiph.py │ │ │ │ │ ├── datasets/ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── adobe_datasets.py │ │ │ │ │ │ ├── gopro_datasets.py │ │ │ │ │ │ └── vimeo_datasets.py │ │ │ │ │ ├── flow_generation/ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── gen_flow.py │ │ │ │ │ │ └── liteflownet/ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── run.py │ │ │ │ │ ├── losses/ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── loss.py │ │ │ │ │ ├── metrics/ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── psnr_ssim.py │ │ │ │ │ └── networks/ │ │ │ │ │ ├── AMT-G.py │ │ │ │ │ ├── AMT-L.py │ │ │ │ │ ├── AMT-S.py │ │ │ │ │ └── blocks/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── feat_enc.py │ │ │ │ │ ├── ifrnet.py │ │ │ │ │ ├── multi_flow.py │ │ │ │ │ └── raft.py │ │ │ │ ├── grit_model.py │ │ │ │ ├── grit_src/ │ │ │ │ │ └── centernet2/ │ │ │ │ │ └── centernet/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── config.py │ │ │ │ │ └── modeling/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── backbone/ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── bifpn.py │ │ │ │ │ │ ├── bifpn_fcos.py │ │ │ │ │ │ ├── dla.py │ │ │ │ │ │ ├── dlafpn.py │ │ │ │ │ │ ├── fpn_p5.py │ │ │ │ │ │ └── res2net.py │ │ │ │ │ ├── debug.py │ │ │ │ │ ├── dense_heads/ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── centernet.py │ │ │ │ │ │ ├── centernet_head.py │ │ │ │ │ │ └── utils.py │ │ │ │ │ ├── layers/ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── deform_conv.py │ │ │ │ │ │ ├── heatmap_focal_loss.py │ │ │ │ │ │ ├── iou_loss.py │ │ │ │ │ │ └── ml_nms.py │ │ │ │ │ ├── meta_arch/ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── centernet_detector.py │ │ │ │ │ └── roi_heads/ │ │ │ │ │ ├── 
__init__.py │ │ │ │ │ ├── custom_fast_rcnn.py │ │ │ │ │ ├── custom_roi_heads.py │ │ │ │ │ └── fed_loss.py │ │ │ │ ├── tag2Text/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── med.py │ │ │ │ │ ├── swin_transformer.py │ │ │ │ │ ├── tag2text.py │ │ │ │ │ ├── tag_class.py │ │ │ │ │ └── vit.py │ │ │ │ └── umt/ │ │ │ │ ├── __init__.py │ │ │ │ ├── datasets/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── build.py │ │ │ │ │ ├── kinetics.py │ │ │ │ │ ├── kinetics_sparse.py │ │ │ │ │ ├── mae.py │ │ │ │ │ ├── masking_generator.py │ │ │ │ │ ├── mixup.py │ │ │ │ │ ├── rand_augment.py │ │ │ │ │ ├── random_erasing.py │ │ │ │ │ ├── ssv2.py │ │ │ │ │ ├── transforms.py │ │ │ │ │ ├── video_transforms.py │ │ │ │ │ └── volume_transforms.py │ │ │ │ ├── functional.py │ │ │ │ └── models/ │ │ │ │ ├── __init__.py │ │ │ │ ├── clip.py │ │ │ │ ├── modeling_finetune.py │ │ │ │ ├── modeling_pretrain.py │ │ │ │ └── modeling_pretrain_umt.py │ │ │ └── utils.py │ │ └── vbench2_beta_i2v/ │ │ ├── __init__.py │ │ ├── camera_motion.py │ │ ├── crop_to_diff_ratio.py │ │ ├── i2v_background.py │ │ ├── i2v_subject.py │ │ └── utils.py │ ├── configs/ │ │ ├── dit/ │ │ │ ├── inference/ │ │ │ │ ├── 16x256x256.py │ │ │ │ ├── 1x256x256-class.py │ │ │ │ └── 1x256x256.py │ │ │ └── train/ │ │ │ ├── 16x256x256.py │ │ │ └── 1x256x256.py │ │ ├── latte/ │ │ │ ├── inference/ │ │ │ │ ├── 16x256x256-class.py │ │ │ │ └── 16x256x256.py │ │ │ └── train/ │ │ │ └── 16x256x256.py │ │ ├── opensora/ │ │ │ ├── inference/ │ │ │ │ ├── 16x256x256.py │ │ │ │ ├── 16x512x512-rflow.py │ │ │ │ ├── 16x512x512.py │ │ │ │ └── 64x512x512.py │ │ │ └── train/ │ │ │ ├── 16x256x256-mask.py │ │ │ ├── 16x256x256-spee-rflow.py │ │ │ ├── 16x256x256-spee.py │ │ │ ├── 16x256x256.py │ │ │ ├── 16x512x512.py │ │ │ ├── 360x512x512.py │ │ │ ├── 64x512x512-sp.py │ │ │ └── 64x512x512.py │ │ ├── opensora-v1-1/ │ │ │ ├── inference/ │ │ │ │ ├── sample-ref.py │ │ │ │ └── sample.py │ │ │ └── train/ │ │ │ ├── benchmark.py │ │ │ ├── image.py │ │ │ ├── image_rflow.py │ │ │ ├── 
stage1.py │ │ │ ├── stage2.py │ │ │ ├── stage3.py │ │ │ └── video.py │ │ └── opensora-v1-2/ │ │ └── inference/ │ │ └── sample.py │ ├── docs/ │ │ ├── acceleration.md │ │ ├── commands.md │ │ ├── config.md │ │ ├── data_processing.md │ │ ├── datasets.md │ │ ├── installation.md │ │ ├── report_01.md │ │ ├── report_02.md │ │ ├── report_03.md │ │ ├── structure.md │ │ ├── vae.md │ │ └── zh_CN/ │ │ ├── README.md │ │ ├── READMEv1.1.md │ │ ├── acceleration.md │ │ ├── commands.md │ │ ├── datasets.md │ │ ├── report_v1.md │ │ ├── report_v2.md │ │ ├── report_v3.md │ │ ├── structure.md │ │ └── vae.md │ ├── environment-opensora.yml │ ├── eval/ │ │ ├── README.md │ │ ├── human_eval/ │ │ │ ├── generate.sh │ │ │ └── launch.sh │ │ ├── loss/ │ │ │ ├── eval_loss.py │ │ │ ├── launch.sh │ │ │ └── tabulate_rl_loss.py │ │ ├── sample.sh │ │ ├── vae/ │ │ │ ├── cal_flolpips.py │ │ │ ├── cal_lpips.py │ │ │ ├── cal_psnr.py │ │ │ ├── cal_ssim.py │ │ │ ├── eval_common_metric.py │ │ │ ├── flolpips/ │ │ │ │ ├── correlation/ │ │ │ │ │ └── correlation.py │ │ │ │ ├── flolpips.py │ │ │ │ ├── pretrained_networks.py │ │ │ │ ├── pwcnet.py │ │ │ │ └── utils.py │ │ │ └── script/ │ │ │ └── eval.sh │ │ ├── vbench/ │ │ │ ├── VBench_full_info.json │ │ │ ├── calc_vbench.py │ │ │ ├── launch.sh │ │ │ ├── launch_calc.sh │ │ │ └── tabulate_vbench_scores.py │ │ └── vbench_i2v/ │ │ ├── calc_vbench_i2v.py │ │ ├── json_to_txt.py │ │ ├── launch.sh │ │ └── launch_calc.sh │ ├── gradio/ │ │ ├── README.md │ │ ├── app.py │ │ └── requirements.txt │ ├── notebooks/ │ │ ├── inference.ipynb │ │ └── launch.ipynb │ ├── opensora/ │ │ ├── __init__.py │ │ ├── acceleration/ │ │ │ ├── __init__.py │ │ │ ├── checkpoint.py │ │ │ ├── communications.py │ │ │ ├── parallel_states.py │ │ │ ├── plugin.py │ │ │ └── shardformer/ │ │ │ ├── __init__.py │ │ │ ├── modeling/ │ │ │ │ ├── __init__.py │ │ │ │ └── t5.py │ │ │ └── policy/ │ │ │ ├── __init__.py │ │ │ └── t5_encoder.py │ │ ├── datasets/ │ │ │ ├── __init__.py │ │ │ ├── aspect.py │ │ │ ├── bucket.py 
│ │ │ ├── dataloader.py │ │ │ ├── datasets.py │ │ │ ├── read_video.py │ │ │ ├── sampler.py │ │ │ ├── utils.py │ │ │ └── video_transforms.py │ │ ├── models/ │ │ │ ├── __init__.py │ │ │ ├── cache_functions/ │ │ │ │ ├── __init__.py │ │ │ │ ├── attention.py │ │ │ │ ├── cache_cutfresh.py │ │ │ │ ├── cache_init.py │ │ │ │ ├── force_init.py │ │ │ │ ├── force_scheduler.py │ │ │ │ ├── fresh_ratio_scheduler.py │ │ │ │ ├── global_force_fresh.py │ │ │ │ ├── score_evaluate.py │ │ │ │ ├── scores.py │ │ │ │ ├── token_merge.py │ │ │ │ └── update_cache.py │ │ │ ├── dit/ │ │ │ │ ├── __init__.py │ │ │ │ └── dit.py │ │ │ ├── latte/ │ │ │ │ ├── __init__.py │ │ │ │ └── latte.py │ │ │ ├── layers/ │ │ │ │ ├── __init__.py │ │ │ │ └── blocks.py │ │ │ ├── pixart/ │ │ │ │ └── pixart.py │ │ │ ├── stdit/ │ │ │ │ ├── __init__.py │ │ │ │ ├── stdit.py │ │ │ │ ├── stdit2.py │ │ │ │ └── stdit3.py │ │ │ ├── text_encoder/ │ │ │ │ ├── __init__.py │ │ │ │ ├── classes.py │ │ │ │ ├── clip.py │ │ │ │ └── t5.py │ │ │ └── vae/ │ │ │ ├── __init__.py │ │ │ ├── discriminator.py │ │ │ ├── losses.py │ │ │ ├── lpips.py │ │ │ ├── utils.py │ │ │ ├── vae.py │ │ │ ├── vae_temporal.py │ │ │ └── video_sdxl/ │ │ │ └── blocks.py │ │ ├── registry.py │ │ ├── schedulers/ │ │ │ ├── __init__.py │ │ │ ├── dpms/ │ │ │ │ ├── __init__.py │ │ │ │ └── dpm_solver.py │ │ │ ├── iddpm/ │ │ │ │ ├── __init__.py │ │ │ │ ├── diffusion_utils.py │ │ │ │ ├── gaussian_diffusion.py │ │ │ │ ├── respace.py │ │ │ │ ├── speed.py │ │ │ │ └── timestep_sampler.py │ │ │ └── rf/ │ │ │ ├── __init__.py │ │ │ └── rectified_flow.py │ │ └── utils/ │ │ ├── __init__.py │ │ ├── ckpt_utils.py │ │ ├── config_utils.py │ │ ├── inference_utils.py │ │ ├── lr_scheduler.py │ │ ├── misc.py │ │ └── train_utils.py │ ├── opensora.egg-info/ │ │ ├── PKG-INFO │ │ ├── SOURCES.txt │ │ ├── dependency_links.txt │ │ ├── requires.txt │ │ └── top_level.txt │ ├── pyproject.toml │ ├── requirements/ │ │ ├── requirements-cu121.txt │ │ ├── requirements-data.txt │ │ ├── 
requirements-eval.txt │ │ ├── requirements-pllava.txt │ │ ├── requirements-vae.txt │ │ └── requirements.txt │ ├── scripts/ │ │ ├── inference.py │ │ ├── inference_vae.py │ │ └── misc/ │ │ ├── extract_feat.py │ │ └── launch_extract_feat.sh │ ├── setup.py │ ├── tests/ │ │ ├── test_attn.py │ │ └── test_lr_scheduler.py │ └── tools/ │ ├── __init__.py │ ├── caption/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── acceleration/ │ │ │ ├── __init__.py │ │ │ └── llava/ │ │ │ ├── __init__.py │ │ │ └── policies/ │ │ │ ├── __init__.py │ │ │ ├── llama.py │ │ │ └── mistral.py │ │ ├── camera_motion/ │ │ │ ├── __init__.py │ │ │ ├── camera_motion.py │ │ │ ├── detect.py │ │ │ ├── requirements.txt │ │ │ ├── utils.py │ │ │ └── visualizer.py │ │ ├── camera_motion_detect.py │ │ ├── caption_gpt4.py │ │ ├── caption_llama3.py │ │ ├── caption_llava.py │ │ ├── pllava_dir/ │ │ │ └── caption_pllava.py │ │ └── utils.py │ ├── datasets/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── analyze.py │ │ ├── convert.py │ │ ├── datautil.py │ │ ├── filter_panda10m.py │ │ ├── split.py │ │ ├── transform.py │ │ └── utils.py │ ├── frame_interpolation/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── interpolation.py │ │ ├── networks/ │ │ │ ├── __init__.py │ │ │ ├── amt_g.py │ │ │ └── blocks/ │ │ │ ├── __init__.py │ │ │ ├── feat_enc.py │ │ │ ├── ifrnet.py │ │ │ ├── multi_flow.py │ │ │ └── raft.py │ │ └── utils/ │ │ ├── __init__.py │ │ ├── dist_utils.py │ │ ├── flow_utils.py │ │ └── utils.py │ ├── scene_cut/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── convert_id_to_path.py │ │ ├── cut.py │ │ └── scene_detect.py │ └── scoring/ │ ├── README.md │ ├── __init__.py │ ├── aesthetic/ │ │ ├── __init__.py │ │ └── inference.py │ ├── matching/ │ │ ├── __init__.py │ │ └── inference.py │ ├── ocr/ │ │ ├── __init__.py │ │ ├── dbnetpp.py │ │ └── inference.py │ └── optical_flow/ │ ├── __init__.py │ ├── inference.py │ └── unimatch/ │ ├── __init__.py │ ├── attention.py │ ├── backbone.py │ ├── geometry.py │ ├── matching.py │ ├── 
position.py │ ├── reg_refine.py │ ├── transformer.py │ ├── trident_conv.py │ ├── unimatch.py │ └── utils.py ├── PixArt-alpha-ToCa/ │ ├── Dockerfile │ ├── README(PixArt-alpha).md │ ├── app/ │ │ ├── app.py │ │ ├── app_512.py │ │ ├── app_controlnet.py │ │ ├── app_lcm.py │ │ ├── style.css │ │ └── style_controlnet.css │ ├── asset/ │ │ ├── docs/ │ │ │ ├── pixart-dreambooth.md │ │ │ ├── pixart.md │ │ │ ├── pixart_comfyui.md │ │ │ ├── pixart_controlnet.md │ │ │ ├── pixart_inpaint.md │ │ │ ├── pixart_lcm.md │ │ │ └── sasolver.md │ │ ├── examples.py │ │ └── samples.txt │ ├── configs/ │ │ ├── PixArt_xl2_internal.py │ │ ├── PixArt_xl2_sam.py │ │ ├── pixart_app_config/ │ │ │ ├── PixArt_xl2_img1024_controlHed.py │ │ │ ├── PixArt_xl2_img1024_dreambooth.py │ │ │ └── PixArt_xl2_img512_controlHed.py │ │ └── pixart_config/ │ │ ├── PixArt_xl2_img1024_internal.py │ │ ├── PixArt_xl2_img1024_internalms.py │ │ ├── PixArt_xl2_img1024_lcm.py │ │ ├── PixArt_xl2_img256_SAM.py │ │ ├── PixArt_xl2_img256_internal.py │ │ ├── PixArt_xl2_img512_internal.py │ │ └── PixArt_xl2_img512_internalms.py │ ├── diffusion/ │ │ ├── __init__.py │ │ ├── data/ │ │ │ ├── __init__.py │ │ │ ├── builder.py │ │ │ ├── datasets/ │ │ │ │ ├── Dreambooth.py │ │ │ │ ├── InternalData.py │ │ │ │ ├── InternalData_ms.py │ │ │ │ ├── SA.py │ │ │ │ ├── __init__.py │ │ │ │ ├── pixart_control.py │ │ │ │ └── utils.py │ │ │ └── transforms.py │ │ ├── dpm_solver.py │ │ ├── iddpm.py │ │ ├── lcm_scheduler.py │ │ ├── model/ │ │ │ ├── __init__.py │ │ │ ├── builder.py │ │ │ ├── cache_functions/ │ │ │ │ ├── __init__.py │ │ │ │ ├── attention.py │ │ │ │ ├── cache_cutfresh.py │ │ │ │ ├── cache_init.py │ │ │ │ ├── force_init.py │ │ │ │ ├── force_scheduler.py │ │ │ │ ├── fresh_ratio_scheduler.py │ │ │ │ ├── global_force_fresh.py │ │ │ │ ├── score_evaluate.py │ │ │ │ ├── scores.py │ │ │ │ ├── token_merge.py │ │ │ │ └── update_cache.py │ │ │ ├── diffusion_utils.py │ │ │ ├── dpm_solver.py │ │ │ ├── edm_sample.py │ │ │ ├── gaussian_diffusion.py │ │ │ 
├── hed.py │ │ │ ├── llava/ │ │ │ │ ├── __init__.py │ │ │ │ ├── llava_mpt.py │ │ │ │ └── mpt/ │ │ │ │ ├── attention.py │ │ │ │ ├── blocks.py │ │ │ │ ├── configuration_mpt.py │ │ │ │ ├── modeling_mpt.py │ │ │ │ ├── norm.py │ │ │ │ └── param_init_fns.py │ │ │ ├── nets/ │ │ │ │ ├── PixArt.py │ │ │ │ ├── PixArtMS.py │ │ │ │ ├── PixArt_blocks.py │ │ │ │ ├── __init__.py │ │ │ │ └── pixart_controlnet.py │ │ │ ├── respace.py │ │ │ ├── sa_solver.py │ │ │ ├── t5.py │ │ │ ├── timestep_sampler.py │ │ │ └── utils.py │ │ ├── sa_sampler.py │ │ ├── sa_solver_diffusers.py │ │ └── utils/ │ │ ├── __init__.py │ │ ├── checkpoint.py │ │ ├── data_sampler.py │ │ ├── dist_utils.py │ │ ├── logger.py │ │ ├── lr_scheduler.py │ │ ├── misc.py │ │ └── optimizer.py │ ├── docker-compose.yml │ ├── docker-entrypoint.sh │ ├── docker-readme.md │ ├── environment-pixart.yml │ ├── environment.yml │ ├── notebooks/ │ │ ├── PixArt_xl2_img512_internal_for_pokemon_sample_training.py │ │ ├── convert-checkpoint-to-diffusers.ipynb │ │ ├── infer.ipynb │ │ └── train.ipynb │ ├── requirements.txt │ ├── scripts/ │ │ ├── infer_pixart_8_bits.py │ │ ├── inference.py │ │ ├── inference_ddp.py │ │ ├── inference_lcm.py │ │ ├── interface.py │ │ ├── interface_controlnet.py │ │ ├── pipeline_pixart_inpaint.py │ │ └── pipeline_pixart_reference.py │ ├── timing_analysis.py │ ├── timing_info.json │ ├── tools/ │ │ ├── VLM_caption_lightning.py │ │ ├── convert_pixart_alpha_to_diffusers.py │ │ ├── download.py │ │ └── extract_features.py │ ├── train.sh │ ├── train_latents.py │ └── train_scripts/ │ ├── train.py │ ├── train_controlnet.py │ ├── train_diffusers.py │ ├── train_dreambooth.py │ ├── train_pixart_lcm.py │ ├── train_pixart_lcm_lora.py │ └── train_pixart_lora_hf.py ├── PixArt-alpha-ToCa-tools/ │ └── clip_score.py ├── README.md └── flux-ToCa/ ├── .gitignore ├── LICENSE ├── README.md ├── demo_gr.py ├── demo_st.py ├── demo_st_fill.py ├── docs/ │ ├── fill.md │ ├── image-variation.md │ ├── structural-conditioning.md │ └── 
text-to-image.md ├── model_cards/ │ ├── FLUX.1-dev.md │ └── FLUX.1-schnell.md ├── model_licenses/ │ ├── LICENSE-FLUX1-dev │ └── LICENSE-FLUX1-schnell ├── pyproject.toml ├── setup.py └── src/ ├── flux/ │ ├── __init__.py │ ├── __main__.py │ ├── _version.py │ ├── api.py │ ├── cli.py │ ├── cli_control.py │ ├── cli_fill.py │ ├── cli_redux.py │ ├── ideas/ │ │ ├── __init__.py │ │ └── cache_denoise.py │ ├── math.py │ ├── model.py │ ├── modules/ │ │ ├── autoencoder.py │ │ ├── cache_functions/ │ │ │ ├── __init__.py │ │ │ ├── attention.py │ │ │ ├── cache_cutfresh.py │ │ │ ├── cache_init.py │ │ │ ├── cal_type.py │ │ │ ├── force_init.py │ │ │ ├── force_scheduler.py │ │ │ ├── fresh_ratio_scheduler.py │ │ │ ├── global_force_fresh.py │ │ │ ├── score_evaluate.py │ │ │ ├── scores.py │ │ │ ├── support_set_selection.py │ │ │ ├── token_merge.py │ │ │ └── update_cache.py │ │ ├── conditioner.py │ │ ├── image_embedders.py │ │ ├── layers.py │ │ └── lora.py │ ├── sampling.py │ └── util.py ├── geneval_flux.py └── sample.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: COCO_caption_prompts_30k.txt ================================================ A man about to return a serve with his tennis racket A horse drawn carriage in a historic city. Two gray fire hydrants sitting next to each other at a park. A goat with horns is standing in a grassy field. Several buses parked under a carport in a parking lot. A bowl of soup with bread and a cup of coffee. Three polar bears walk across a snowy field. A glass mosaic vase filled with colorful flowers. A bunch of apples in large trays on top of wooden crates. A girl does a skateboard trick in the air. A dog rests in the grass next to a fire hydrant. A giraffe standing in the shade of a near by tree. a plate of chilies with carrots and peas A motorcycle parked in front of a red brick wall. 
A tour bus stopped near a mountain while people gather nearby A young girl and her dad play with kites in a park. a man who appears to be herding sheep is closing two big fence doors A man with glasses holding a glass of wine. Group of skiers and different colored outfits on Ace Eastlynn. A fortune note on a tea bag next to a bagel. a man walks along the beach with a surfboard The view of a clean toilet surrounded by marble tile. a couple of people that are skateboarding down the road A couple of cars are riding down the street from a window view. A pet crate and a lot of tools and wires. a man holding a kite and a dog in a field. Four women with snowboards and gear are posing for a photograph near some snowy mounds. The boy and his dog are posing for the camera. The streets and the double decker bus are lit up in the night. A small vase has a good luck plant in it. a man riding on the back of a bike on dirt ground. A group of people standing on the street. The girl is riding her skateboard while using her cell phone. A person doing a trick on a skateboard in the road A lone zebra is walking in tall green grass. A fuzzy image of some people on skate boards. A man throws a frisbee to another man with two children. a number of people in a body of water with a small boat A complete train set, with tracks, buildings, and three piece train. A couple of buses parked in front of a building. A girl wearing a wet suit surfing in the ocean. A picture of something and it appears like sustenance. A man on a skateboard on a concrete lip. A person walking next to a horse at a horse show. A man with a beard looks pensive and wears a tie. A blue vase filled with colorful flowers sitting on the ground. A street name on a sign built into a curb. Two zebras are behind a fence on green grass. Young person on the street skateboarding wearing a helmet. A toilet with a red seat in a small bathroom with red tiles. Woman lying in an unmowed field with a frisbee. 
Cars and a bus driving down a busy road. A blurry image of some car lights on a dark night. A smiling young man stands beneath an Obama street sign. this is a train riding under a bridge A group of giraffes eating bark off trees. A horse is looking in the living room window of a farmhouse. A man holding a frisbee on a beach next to another man. a mixture of black and white sheep in a dried out field A chocolate caked covered in strawberries sitting next to a knife. Two people standing next to each other on a snow covered slope. A young boy is flying a kite in the park. Baseball pitcher in the process of pitching in a baseball game. Two men are in the water on a boat. A fighting plane turning sideways in a cloudy sky. Several red roosters together in a small area. A few small boats sail down a waterway. A dog and a cat are looking at the snowy front yard through a glass door A small child sitting on a shelf with teddy bears. One giraffe standing behind a dead tree branch. A bright green kite with a scary monster face flying high A man holding a tennis racket and staring at the camera with pride. This is a thing that is straightforward and plain. A large open field with small bushes and trees, and a giraffe standing in the middle of the field. Two women who are sitting at a table together. A small dog eating out of a bowl on the floor. a cow that is standing up eating a pan Two people smile as they ride on an elephant. some city workers work on a car crash A giraffe walking past a tree on a dirt landscape. A tow truck vehicle on a street in a city area. Professional baseball player winding up to pitch the ball. A dog is wearing a baseball hat over it's eyes. A bearded man standing in front of bookcases A horse drawn carriage traveling away fro ma very large cathedral. a man is standing inside of a food truck There is a stop sign with two road signs on top of it. A girl looking a a beautiful view of the Rockies. A very big high ceiling room with a yellow fan. 
A man in an office chair looking at a laptop next to a glass of wine. a brown desk a keyboard a computer and a monitor and speakers A woman talking on her cell phone while walking. An abandoned train with lots of graffiti painted on it. a person bending down cutting another persons hair A plane at an airport with a truck driving past there are many men sitting at a small table A girl looking inside a living cartoon refrigerator. Two sheep are in a dirt outdoor enclosure. People look on as a ball heads towards a batter. Several glazed donuts are lined up on a tray. A guy standing on a snowboard in the snow. There is now image here to provide a caption for. A man practicing baseball on a field. A bedroom that has a large computer desk in it. A close up of a pizza with spinach on it. a living room with a couch a tv and a table A man checks his cell phone as he walks to his car in the parking lot. Several people on skis in the snow outside of a lodge. A small pink beanie hat next to a cell phone. A mother handing her son a piece of cake on a paper plate. a man in a tuxedo sits at a table and uses a laptop A young man holding a Nintendo Wii game controller. A wrap of some sort on a plate with potatoes A motor boat next to a beach and others in the background A fruit bowl sitting on a table with bananas and apples. Three people running around in grass playing Frisbee. A person sits down to their meal of a sandwich on a croissant with a side of french fries. A man is holding up an old cellphone a couple of animals standing in a field a cup on a table next to a tv A large brick building with a tall tower containing a clock near the top A figurine of a little boy riding a snow board in yellow pants. A memorial set on a fence by the ocean with flowers and teddy bears. A group of white sheep eating from blue bowls A couple of kids laying in a bed with an umbrella. A small table set with pastries and tea A rainbow lorikeet parrot eats sun flower seeds. 
A white plate topped with a sandwich and a salad. A woman getting ready to hit a tennis ball. A tennis player jumps into the air and swings his racket. A mustached man is standing in front of a larger mustache. Street signs showing the intersection of Eight Mile and Shadyside A white desk has a computer, keyboard, globe and green phone. A man with bandaged hands lying in bed. A baseball player is trying to hit the ball. a woman gets ready to pet a big horse A herd of black cows grazing in a field. A very pretty girl looking at her cell phone. A man rides a bicycle across a wet intersection. a man with a bat walks as other look on Two zebras are running through some high grass. A baseball stadium full of fans while two teams play ball. there is two pictures of a female tennis player There are flowers that are in a vase filled with rocks. Fruit and vegetables are hanging in a metal basket. A boat that is sitting in the water. A living room with a couch, TV, and fireplace. some people walking up a snowy hill with skis A woman is eating spam off of a plate with a camera next to her. A person that is playing in a tennis game. A red truck in street next to wall and buildings. A light with multiple bulbs is on a tall post. Grey toned elephant head closeup with grass and hill background. A man in striped shirt sitting on a fire hydrant. A person on a skateboard and bike at a skate park. a close up of a red tie and a white and blue shirt A kid about to ride his skateboard down a pool. LIVING ROOM WITH COUCH, TABLE, END TABLE, LAMP, PHONE, AND MIRROR A happy boy is waiting outside with his suitcase. Two men stand together using their cell phones. A group of men cutting into a celebratory cake A man eating a donut wrapped in tissue paper. A bedroom with a bed with blue cover and blue curtains, and a pair of shoes on the wooden floor. A penguin is standing and pecking at a teddy bear left in the snow. Two brown bears sitting on top of a black and white checkered bed. 
Two lambs with black heads look out from a gate. This person is holding a cell phone while standing on the sidewalk. There are a group of people snow boarding on the hill A jet on a runway near other jets. A baseball player is getting ready to go to bat. Man on a black motorcycle wearing a helmet. Horse standing in dead grass area near fenced field. someone is skiing through the trees by themself. The zebras are grazing in the field together. The cut sandwich has meat, lettuce and tomato. A small dog sitting on a stuffed animal teddy bear A person holding a slice of pizza in their hands. A man wearing a hat eating a hotdog at a sporting event. A painting of a man sitting next to a woman near the ocean. People are standing in front of a small store a train on tracks at the front of the train depot Elephants moving along on a very open field of some sort. A cat sits curiously perched in an empty cup Large collection of scissors attached with price tags. An older man and two kids sitting on a bench. Two motorcycle riders are riding a motorcycle bike. People watching motor cross bike riders racing on a field Two small children stand together scrubbing an elephant. a close up of a shirtless man wearing a neck tie The view of a distant mountain taken from an airplane window. A train on tracks in a city with high rises. PEOPLE BOARDING A BUS PARKED ON A STREET. A woman holds a decorative umbrella and walks with a man. A couple of cows standing on top of a grass covered field. A crossed eyed man holding a remote in his mouth. A single young giraffe stands and looks forward. two giraffes standing next to several huge rocks A person is flying a kite on the beach. Two giraffes are standing next to a tall fence. there is a person holding up a nokia lumia phone A boy sitting at a table while he puts something in his mouth. A man sitting in front of a laptop computer in an office. A woman is on the side of a mountain in ski gear. 
A view of a bed from across the room, it has a TV tray on it. View of traffic signal against a dark sky that looks like rain. A boy being affectionate with a baby on a bed. A man with a top hat on and a carrot in his mouth. There is a stove and a sink in a narrow kitchen. A hand picking up a bunch of bananas from a display. A cow and a person on a horse in the dirt. Many doughnuts on a display in a store. umbrellas, trees and a hut line a sandy beach A man eating a hot dog on a tray Some stacked with much sublime sustenance ready to eat. Seven people smile as they pose with tennis rackets. A school bus is parked by a street sign. A train coming out of an enclosure under a snowy mountain. A woman holding a surfboard walking into the ocean toward a dog. Three people on snowboards riding down a snowy slope. A black dog on a leash holding a frisbee in his mouth. A woman water boarding in a lake near land. A red stop light on a street at night. A kitchen with wood cabinets and white countertops. A bathroom decorated in pink ceramic tile and wallpaper.. A platter on a table that has pizza on it. A snowboarding is doing tricks on a ramp. A desk with a computer, a keyboard, a mouse, a bobble head, speakers and a lava lamp on it. A man in a crowd balancing a skateboard on top of his head A bathroom with a paper dispenser, toilet roll and garbage bin. A seagull majestically flying through the air over the ocean. A bed topped with a colorful blanket and lots of papers. A person looking into a convex mirror on the front of a school bus. wild animals graze in fields in front of a lake and snow-covered mountains The side dish of the meal consists of a macaroni salad. A plated meal on a table with flowers. THERE ARE TWO ZEBRAS THAT ARE STANDING BY EACH OTHER a cross walk sign in a busy city as light up the walk symbol A small cooked pizza on a dining table. 
a close up of a bunch of bananas and a container of garlic a couple of people sit on a horse pulled cart A wooden park bench sitting in front of a window. Two white vases on a shelf next to a window. Wearing shorts, a man holds up a snowboard while standing in the snow. Two people playing tennis in a neighborhood park. An outdoor area that has a glass top table with a plate on it and a blue vase with flowers in it. An empty bathroom with toilet and pictures on the wall. Two yellow bowls of food containing broccoli and potatoes. Group of people paddling boats on water in front of a city. A child holds an object while someone else cuts it A man approaching a water ski jump holding on to a wire. a toilet with a shower near by with tiled walls A skateboarder performs a trick while being photographed. A white fire hydrant near the address 700 Jones Street. A bed with red sheets on it and messy blanket and a lap top. A yellow school bus traveling a dark road. A boy wearing a helmet and using a skateboard on the sidewalk. A woman holding a knife over an unconscious woman. A close-up of a man brushing his teeth. a man sits next to a child as he uses a computer a kitchen view of cabinets a stove microwave adn refridgerator A dog and a railroad official and one person in train yard. A motorcycle is parked inside of a building. Two elephants that are walking in the dirt. A baseball player throws a pitch while others watch from the dugout. A group of people walking around an area together. people on a small boat in a body of water a train station and a train removing much smoke There is a elephant in the grass, there are also trees in the background. A small dog sitting on top of a couch cushion. A security officer using a segway as a footrest The mostly eaten pizza slice is next to olive pieces. A baseball player starting his run for first base. A man standing on top of a tennis court holding a racquet. A post clock is positioned on the sidewalk with flags in the background. 
Cars drive down a multi-lane road and pass businesses. Inside of a living room with a sofa and several tables. Two small suitcase is sitting in front of a white sheet. a living room with couches and a table Elephant with a brown eye hyper focused in the camera. A man in a white shirt has his hands on another man's shirt collar. A picture of five african american's sitting on a bench and chair. A counter holds tomatoes, bananas, pineapples and other fruits. A girl and a woman watching a candle being lit up on top of a cake. Pair of giraffes walking on grassy area in enclosure. A red fire hydrant next to some stones. a yellow and red apple and some bananas A green and yellow rain on tracks with building in background. A large elephant is standing in a fenced in enclosure. A boy on a boogie board in the snow. Photos of sports memorabilia including shirts, caps, and baseball bats. A young child mutton busting at a rodeo event The bald man leaning against the tree holds his face in place with one gloved hand. A young man with a neck tie untied around his neck Sheep are running across a green field of grass. a simple, normal toilet with the lid closed A cat look through a window at a dog. Busy city street with red signs on the traffic lights. several cows one lying and one standing in a dirt field A plastic container sitting on top of a table. An old fashioned television and a newer electronic gadget sitting on top. A white pickup truck sitting in front of two wooden scaffolding. A child biting into a piece of pizza The people are over by the cows in the water A couple of cows standing on a lush green field. A view along the transept of an older style church. The bed in the room has been made with a large purple blanket. A table that has people sitting around it with food in front of them. a group of small dogs are staring out of the window Two people sitting on a park bench near trees. A table has a bowl, candle, and Christmas decorations on it. 
A Skyteam Delta airline passenger jet taking off from an airport. a black white yellow blue green and red kite and a person A STREET SIGN POINTING IN WITH A TREE AND BUILDING IN THE BACKGROUND Two different slices of pizza with tissue paper under it on a paper plate. People riding a small boat under a very large bridge. A group of people injured and covered in blood. Three fire hydrants that are green stands near a parking sign at night. A striped giraffe is grazing in the grass. three chickens some water a fence and trees Four men play tennis together on a sunny day. A counter filled with coffee, cookies, and bagels. a baseball player with a bat on a field A green truck with a canvas tarp over the bed. A man stands in a large kitchen holding a coffee mug. An old motorcycle standing in a grassy field. A table with a white plate of food that includes salad and sausage. A cat on top of the counter sitting next to vegetables. A small green leafy plant in the ground. Two people eating hot dogs on a busy sidewalk. A bird perched on the limb of a tree a person riding a snow board on a snowy surface A boy doing a trick on a skateboard off a rail. Two woman are standing behind a large teddy bear. A couple of people standing in the water under a kite. A close up of the front of an old locomotive. The man in the hat is carrying an umbrella. A red brick building next to a green door. A dog looking at a book called "The Marriage of True Minds" by Stephen Evans. This is an image of an Air Canada plane flying. A bird perched on a plant in the middle of a forest. A living room filled with brown furniture on top of hard wood flooring. A black computer keyboard in a dim room The traffic lights are clearly visible for us to see. A couple of birds sitting on top of a rocky beach cliff. A couple of people throwing a Frisbee in a field. 
A sandwich with a dipping sauce served on a plate Road sign for the corner of Jackson and Montgomery A large group of giraffes roaming around in an enclosure. Stainless steel industrial stove sitting in a white and black kitchen. A little girl is posing for a picture and holding an umbrella. A couple of people with surfboards on a beach. a woman is standing in front of a giraffe A man in a blue shirt looking at his cell phone. A man on a skate board jumps high in the air for a trick. A woman sitting on the grass behind a pile of stuffed animals. The barbecue sandwich is on a plate near a glass of wine. Wrapped utensils are a part of a sterile and healthy meal. A metallic refrigerator freezer sitting in a kitchen. The child is putting the tooth brush in his mouth. A white comforter with a toy, book and child shirt on the top of it. A picture of a room in a house. The wildflower is sitting in the glass of water, A girl showing parrots to a group of children Two zebras fighting outside in an opening near some trees. Two surfers walk down the beach holding their boards. A group of people in a courtyard next to a pavilion. A renovated kitchen with wooden cabinets and white refridgerator Some soldiers are standing in line for food a man on a surfboard surfing a wave Two giraffes are standing together in the wild. The boat and the truck are parked by the dock. Oranges and bananas sitting in a stack together. An intersection with traffic lights and a street sign. Three horses standing close but in an open field. A person is riding a motorcycle in the mud. A statue of a giraffe is in a Children 's Hospital. A man is standing in a room with something yellow. A mother elephant and her tiny calf walk through the trees. A breakfast plate with eggs and meats, served with a gourmet coffee. a black and white photo of a person holding a skate board A boy swinging a baseball bat at a ball. A meter maid car is by a fire hydrant. A person flying a kite over another person on a roof. 
A car parked behind a wooden bench. A young skateboarder rides down the street alone. A row of bikes parked along a sidewalk beside some cars The bathroom in the home was just cleaned. A circus act with five elephants and some women put on a show. There is a muffin with white frosting and walnut bits on it. A curious giraffe leaning over into a car at a zoo field. Two signs one with the speed limit and one telling what freeway is which way. a lady that is on a tennis court with a racket A man sitting on his couch using his laptop A statue of a cowboy on a horse in the middle of building. There are street signs that show a direction of travel A couple of small beds and mirror in a room. A couple of large jetliner sitting on top of an airport tarmac. The room is decorated in terra cotta tile. A white plate topped with a pizza next to a bowl of salad. Many suit cases are stacked on top one another A gray and white cat sitting in front of a mirror. A dog riding on the back of a motorcycle down a street. A coffe and plate of bread sit next to a pillar. a red and white tail of a large plane Several pilots walking as a group across a street. A man riding a skateboard on the side of a ramp. a woman walking by a display with teddy beas and bottles A street sign on a light pole near on a city street. A male standing behind eight pieces of luggage. a vintage black and white picture of a train Two colorful umbrellas open against a blue sky. A small bookshelf is filled with books and decorative items. Several people walking in the snow, some carrying skis. a bottle of whiskey and a bottle in a brown bag on top of a fridge Women waiting for luggage at an airport luggage carousel. Two sheep standing side by side at a petting zoo. A pizza pie sitting on a board on a table A couple of airplanes that are on a runway. a bathroom with white walls and brown tile A woman walks down the water with a surfboard. this is an image of a train with black smoke. 
A lady is on the entrance of a train holding her luggage. A man is catching a frisbee while playing a game. A skateboarder with his skateboard is sitting on the side of a ramp. On a bright day, a young elephant in partial shade near a tree. Young people stand near a bus with a large amount of luggage. A bear costume cutting some cake with a Park ranger. a large clock reading 5 54 on the side of a building An unfurnished room contains a sleeping bag on the carpet. A little girl that is standing next to a horse. Two white bowls with vegetables, meats and herbs and chopsticks nearby. A kitchen with a stove, refrigerator and cabinets. a small child is sitting on a bench outside Brown, white, and black rams eating on a hill. A skateboarder performing in front of a crowd riding a rail. This is an image of scooters and bicycles. A cell phone peeks out of a crocheted cell phone holder. A kitchen that has white cabinets and black counters. Man on cellphone behind curtain while art displayed in front. Two men in bucket hats taking frisbees out of a frisbee golf bucket. A snowboarder grabs his board while high up in the air. A woman's eyes are hidden by the cast of a shadow. A black and white cat sitting in a bathroom sink. A stop sign in the grass beside an old farm silo. An Asian family that is eating pizza together. A yellow train traveling through the green countryside. A man and woman are standing beside each other playing a video game. Two cats lounging on the back of a couch. This picture shows sand, water, and some type of silver and red pole equipment. A cut dog in a basket with orange ears. An adult and child are skiing in the snow. Two children playing with the knob on money meters. Flowers sitting in a glass vase on a desk. A laptop computer is seen sitting next to a television. A small bathroom has a vanity, mirror, toilet and bathtub. 
a black and white photo of people eating a group of zebra standing in the sand in a fenced area A man standing by a kitchen counter doing something A bird is sitting on a silver truck A mother and son sitting in a bed with two cats A large piece of meat surrounded by vegetables. White oatmeal sitting next to toast, coffee, and orange juice. A bathroom with a shower combination tub and sink. A white and brown cat sitting on the shelf in a cabinet. A man is driving a small train with children. A man and a woman cutting a sheet cake with a knife. a man cooking some hot dogs on a grill some kind of chicken, rice, and vegetable dish on a pizza tray being served to a man. One man stands on top of the train while another man stands on the platform. A piece of cake is served on a plate. a close up of a child on a skate board Fruit juice is spilled all over a counter next to a knife and two pastries. A junk pile that looks to be piled with old bathroom sinks. Two black and yellow circular clocks affixed to an office building. There is a single bed in an old room with a window. An a kitchen is being cleaned and decluttered. Colorado Rockies' pitcher about to release ball from mound. A bird perched on top of a tree branch under a light blue sky. There's a desk with a laptop, phone charging, and other various electronics. The two men are standing outside by the tail of the airplane. A couple of brown horses pulling people on a wagon. A small plane flying over an ocean with waves The large SUV drives along a busy street. A row of motorcycles parked on a city street. a kitchen decorated with a couple american flags A room with a chair, a piano, and a laptop. A cat sitting on the side of a car door window. Two pelicans on the sidewalk in the foreground with several more in the water in the background A black and white photo of a man and woman sitting on a bench. A picture of a thick crust pizza and a bottle of wine, setting on a table. 
A MAN WEARING A SUIT AND A TIE STARING Male surfer riding a large wave with sun low in the sky A snowboarder hitting a trick on a trail, jumping over a person. The man is feeding the elephant with milk A bathroom with white fixtures and blue accessories two brown horses in a field gazing around Several people that are playing video games together. A plate with beans, broccoli, small sausages, fork and a small container. A black and grey double decker bus next to a building. A smoking women in a scarf makes a phone call. A person is traveling down the road on a motorcycle. A group of men in colorful jackets skiing down a hill A man and two others skiing across a snow covered field. A man is standing over a black motorcycle. A KITHCEH WITH A MICROWAVE SINK AND REFG A jar of peanuts and a cell phone sit on a laptop computer on a cluttered table top. A woman holding her child so she can see her birthday cake. there are two giraffes that seem to be embracing each other A table with wine glasses and people on the counter A wooden bench sitting on top of a green grass covered ground. A long couch with many pillows, a table and some seat cushions around it. Six people are paddle boarding in the ocean. Two young men play a game of soccer on a field. A black and white image of tennis players. This is an aerial view of a tennis player hitting the ball. An item is capture here in the photo. A cow grazing in a field next to a fence. An individual is taken in this very picture. People are outside flying kites in the sun. a person riding a skate board on a street There is a flower display in the corner of a room A man pulling a sled behind him while using ski poles. A person playing a game of tennis and other people watching. A decorative congratulations cake for a graduating student. A group of young people standing next to each other on top of a field. a semi truck loaded to the top with sheep A white truck crosses an intersection behind a traffic light. 
Two street signs sitting on top of a metal pole. The man in the red shirt is going to hit the ball with his racket. A shop called Pendulum with a clock out front. a traffic light on the side walk of a city street Two signs above a blue pole under a blue sky. A skateboarder is getting ready to skate down a ramp. A man wearing a black vest and black glasses. A bicyclist stopped beside a fence feeding or petting sheep. An orange and white bus crossing under a blue footbridge. A flock of birds landing in a field of grass A man reaching into a bucket near an elephant while another elephant stands near a pond in the back. A cat standing on the keyboard of a laptop. An empty and open silver metallic refrigerator in a kitchen Couple of people about to share kiss in front of wooden building A person para-sailing in the water with mountains in the background. a yellow taxi riding down a street that has a building with clock a snall toilet and a sink in a bathroom A zebra standing on a grassy pasture in the daytime. A group of people playing a game with remote controllers. A fighter jet with two streams of smoke coming out the back. A bike parked on the side of a city street. A glass plate topped with sliced apples and caramel. A player swings at the ball during a baseball game. A dog and man sit on rocks by water. A bunch of food on a tortia in some foil A woman tennis player is in a cropped photo. A plate of chicken, rice, and some vegetables. A man is staring at the viewer while a man plays a guitar and a woman sticks her thumb up sitting on a busy sidewalk. A large jetliner sitting on top of a runway. A person presenting a birthday cake to another. A woman with a child on skis go down the snow. A fridge in the middle of some cabinets Kitchen knives and scissors are stored in a wooden holder. A man speaks to some children on a farm. Two zebras are walking along a path outdoors. A woman lying in a bed looking at a laptop. A lot of motorcycle people that are on the road. 
A dog sits on a rug with its eyes closed. A pick up truck parked near a strange house A deformed orange sitting on top of an orange tree. peeled banana sits on a table uneaten and ripe A clock hangs from the wall of a beat up room A woman sitting on a chair blow drying her hair. A white plane with two people standing in front of it A young girl with a cape holds onto a kite. The puppy is eating food from the tiny bowl. A hummingbird is floating next to the feeder. A man riding a surfboard on a wave in the ocean. A werid skirt like outfit on a person. A security officer is setting up traffic guiding signs A tropical beach with a banana tree in the forefront. Military officer in dress uniform with many medals. A boy doing a trick on a skateboard on a ramp. A young boy is eating a meal in his pajamas. This is a vintage photo with four men in it. A dog is laying on the bed like a person. Players react to the ball being hit at a baseball game. A toilet with its lid raised in a stall. Young boy and his plastic skateboard at home Set of toy animals sitting in front of a red wooden wagon. A silly brown dog wears sunglasses as it sits in a car Broccoli is on a cutting board and is being cut in to smaller pieces. an image of a tennis racket and tennis ball A snowboarder sitting down with his snowboard on his feet. A tennis player pauses during a game in a public tennis court. A group of people on skis and snowboards outside. a couple of trains parked on some tracks under a closed roof Red double decker buses on a city roundabout. there is a game that is ging on at thte gym and people aer looking The man is wearing a tee shirt and a tie-dyed tie. A group of motor cycles parked on the street A set of two pictures showing a group of young people standing under a gazebo and next to surfboards. A man is shown feeding an baby elephant. There is no image here to provide a caption for. Three zebras are shown in a black and white photo. A Chinese public train waiting at the station. 
A man walking along the shore with a surf board. Two giraffes looking at a photographer inside of a barn. A dog looks up at a flying disk. A group of men standing on a city street. A dog stares intently off to the left in front of a glaring TV. A small pizza sitting in a frying pan of food. A mirror that is on a tiled wall. A black and white photo of people waiting at a boat ramp Modern jet airplanes lined up on the runway ready for take off Two white cows sitting in a farm area. A stop sigm at an intersection with some graffitti on it. A seagull at the beach with food in its mouth A passenger bus that is driving down the street. An umbrella strapped to the cross bar of a bicycle View off the wingtip of a passenger airliner on a taxiway. A cute little girl smiles for the camera A city with traffic lights, cars and buses. A man skate boarding in a pool with another man looking on. a young woman cuts up some food on a trey Two men skateboarding with a light and a camera. A group of drinking glasses sit along a bar, with two people nearby. A person on a motorcycle on a track near another person. A food item is shown on a napkin. two giraffes are standing in the open field. A man riding the back of a brown horse. A bunch of statues that are in the grass. A man riding a snowboard down a snow covered slope. A man swinging a baseball bat at a ball. A cat sticking its head out of a cement wall looking up. a few cowboys stand watching some animals outside The woman is riding the horse on the course. a woman sitting on a wooden park bench smiling at the camera A cat sitting on a wooden chair in a room. The animals are grazing on the wheat grain Two men playing frisbee in a park Two people standing in a market by a fruit stand. Several cars are seen going down a city street. A grown elephant and a young elephant roam freely together in an open field. Woman in sunglasses hugging a red fire hydrant. 
a teen standing on a skateboard while riding part of the wall a close up of a cat laying in a luggage bag A big family pose for a picture with a surfboard two motorcycles line up as they lean against some seats Two puffins sitting in some grass on a mountain. Many young men pose for a picture. a person showing cellphones on sale in a shop A YOUNG GIRL ON A SKATEBOARD IN A PARKING LOT A man is posing excitedly on a surfboard. A very nice boat on the water with a dog on it. A young man holding a Wii mote plays a video game A stack of suitcases stacked in a front lawn. A gang of bikers riding down a street. Little girls play soccer on a field on a sunny day. A living room and dining area with hard wood floors. A bunch of ripe oranges are stacked neatly on top of each other. Someone is making a sandwich consisting of carrots and alfalfa sprouts. A brown cardboard box filled with bananas, apples, oranges and kiwis. The young kids are playing a game of soccer. The kitchen bar is near a dining room table. A food container with five sections filled with various items. A woman sits on a brick wall, holding her umbrella, looking out at the city. Woman talking on cellphone in a dining room. A small industrial machine car on train tracks. A desk witgmh a telephone, laptop, cell phone and a book on it. a man in glasses is playing with a white controller A BIG BOX OF TOMATO AND BASIL PIZZA. A large group of people on a field playing soccer. A yellow and black train is on a train track. A black and white cat curled up on a brown checked sofa. A man is talking a picture of a bus. Two horses standing near each other in a field A lone swan swims in a river near a bridge. Woman playing tennis with bleachers in the background. A vintage photograph of a war plane flying Two food items are displayed on separate plates. A brown and gold fire hydrant in front of a brick building. 
THIS IS A PICTURE OF A FEW ZEBRAS GRAZING IN A LARGE ENCLOSER an image of umbrellas lined up with tables A group of sheep walk along a dirt path. A desk with a pc, monitor, laptop, mouse, and stuffed animal A gray and white cat laying on it's back with it's head looking up in a open drawer. A man walking a bike near a train station. Photo of a living room with a Christmas tree in the corner. A toiler and some buckets in a small room. Large group of ships tied together at a peer. An empty road with a red stoplight that spans voer the road. A black dog playing in the ocean while barking. A dark gray bird flying towards a palm tree A motor home parked along side an outdoor flea market. A closeup of a wine glass and a wine bottle A boy jumping up over a bench on a skateboard. A huge, captive fish gapes his mouth open at a woman taking a photograph in an aquarium. a person reading a book and cooking food on a stove Onlookers watch as a skateboarder performs a jump. Some wooden benches are in the middle of the forest. A man standing on top of a sidewalk holding a skateboard. A laptop sitting on a living space table with a spacious desert view. four colorful vases of different types are sitting on a shelf. A close up of a stop light positioned against a high rise building. A room features two identical beds with stools at the end. A computer screen showing photos on it while a smokestack is visible out the window Night shot of skateboarders in wide open area with lights above. A couple of giraffe standing next to a zebra near a rock wall. A table holding a white plate with bananas and a brown glass. A bus that is travelling on a road in a town that has many houses and buildings. A pair of giraffes grazing on hay by a fence. A couple of plates of sausages, broccoli and purple food. A family of giraffe on a wild field next to zebras. A large cow standing in a grassy field near other cows. A tennis player is jumping and reaching to hit the ball. 
A couple of people on a wall playing with a Frisbee. a laptop on the ground near a turn table An old clock with a flower design in a small room. a kitchen with a refrigerator near a sink Men loading luggage from a train onto a cart. A fan closely watches the professional baseball batter Little boy and girl sitting on the porch eating their meal. A man standing behind a display case filled with jewelry. A black and white photo of a man fixing anthers tie. a person preparing an authentic pizza on a wooden spoon A street sign next to a traffic sign next to tall buildings.. Herd of wild cattle walking along the beach A city street with people out and two large buses a small dog with some glasses over its eyes a dormitory consisting of many beds lined up along the wall A kitchen has a stove,microwave, and wooden cabinets. An electric commuter train on the tracks under a cloudy sky The person is wearing black clothes, shoes, and hose. There is a child that is walking in the gradd Hundreds of bicycle enthusiasts embark on a race on a city street. A man in the air skateboarding at the park A close up of a Harley Davidson parked on the road. A man holding video game controllers and playing. Gray and white bird with red crest using bird feeder. A white car passing a person in a black jacket. a large train is going down the tracks outside Two ponies together standing on a mountainous terrain. A small bathroom with a shower, sink, and wooden medicine cabinet. Small train coming out of a tunnel on an overpass. A group of people who are standing in the dirt. bread with banana milk and nutella on a table A clean and tidy kitchen with a stove, dishwasher, microwave, widow and a door. A cat sleeping on top of an open laptop computer. Two motorcycle riders talking on the side of the road. A bunch of people is watching something and a man in a brown and blue stripped shirt has his fingers in his ears. A ukelele is passed over a table with cake and lots of food. 
A giraffe is standing near its fenced area observing. A pasture with sheep in front of a large home A large group of birds sitting on metal pipes in the water. A group of people sit down at a table to share a meal. A man pinning a number to a child's shirt. A dog sleeping on bed against the wall. a room with wood flooring filled with furniture. Two zebras stand close to each other in a field. a person on a bicycle a bus a truck and a child a vandalized stop sign in the dark with a sky background a red and yellow trains engine pulling its cars and some tracks A woman sitting on a bed with a laptop. Two people with surfboards are standing in a sandy parking lot. A PERSON JUMPING FROM A SLOPE ON A SNOWBOARD Long bamboo poles with umbrella tops in front of the sky A stove is away from the wall in a kitchen area. A trolley driving down a street lined with tall buildings. A giraffe looking concerned on a grass field. Two metal lamps are placed beside a window. A close-up photo of a pool table with a man playing. A group of different parking meters displayed together. A boy is doing a trick on a skateboard. A black and white photo of a city street with old cars and people on it. a number of people standing around a large group of luggage bags piece of cake with a plate and fork Large giraffe roams in the lush green vegetation. a blue and pink kit with streamers flying in a clear blue sky A man pushes a brightly smiling little girl on a swing. Two stuffed bears that are next to each other. A car that is parked in some snow. A zebra standing in some brush without leaves. A bunch of people walking across the air field to get to their plane. A man hitting a tennis ball with a racquet. a close up of stuffed animal with metal pieces on his chest A person is on skis in a very snowy place. airport coming in to dock at the airport a person on a city street operating a cell phone Two man standing near each other in a park. A man is eating a hot dog and talking to a young girl. 
A person bending over to adjust a child's skis. A woman in a boat eating a sandwich. A person standing on a mountain top with some skis. A group of cattle walking across a lush green field. two giraffes standing under a tree to get some shade An owner plays tug-of-war with their Golden Retriever A dog looking up and running to catch a frisbee. an open toilet on the side walk of a street A man with a very bright orange hat sitting in a car. A traffic light next to a busy street in front of a brick highrise building. a person standing near a bush near an elephant A woman that is standing on a sidewalk. Three park benches are in a garden type setting. a bathroom with a toilet and a bath tub A flower vase in the center of the kitchen table. A bull walks up to a pile of wood and a teddy bear. a couple of coaches in a cluttered living room THREE BOYS RIDING THEIR BICYCLES ON A STREET. there is a male baseball player about to throw the ball A couple of people standing around holding snowboards. A photograph of a highly decorated cake on a table. A passenger bus that is driving down the street. A large building with a railroad crossing near it A bike is covered and parked on a street. A small bird in a tree with red fruit on the tree A cat looking out a window at a bird. High school girls soccer game action shot of green versus red team. A person with a lighter lighting several sticks. Reflection of a school bus in its own side view mirror. two people in costume pose for a photo brown bathroom with white toilet and white sink A pretty young lady eating a hot dog on a bun. An elderly man blowing out birthday candles on his cake. A plate filled with breakfast foods sits on top of a wooden table. This clean bathroom has a tile floor and a brown toilet lid A woman with a painted face is on a phone. A green bird bath decorated with various jewels A fancy clock graces the corner of this old building. A cat sitting in front of a television watching a hockey game. 
A chair and a blue umbrella are attached to a wheel. People sitting around an oval table in a restaurant posing for a photo. A street scene with a truck and trailer in the foreground. A locomotive on train tracks in a wooded countryside. A train riding pass a platform and buildings. a stuffed sandwich with meat, cheese and pickles A man riding on top of a board on a wave. A man and woman making a cut into a wedding cake Some cows stand beneath the shade of some trees. Pink lunch box with compartments for all types of food two white birds flying over the sea water The lobby has a few people in it but for the most part it isn't very busy. A double oven with one side completely full and the other empty Man are standing near a couch holding Wii controllers. A pizza on a board with a pizza cutter A BABY EATING A MEAL WITH HIS TOY DRUMMER BEAR A tan dog laying next to a park bench. A skateboarder performing a jump off the edge of a stone wall. a cat sits on the floor looking at the camera THERE IS A CITY BUS ON THE STREET a couple of people that are sitting on a bench A person sitting in bed with a dog on his lap. A seaplane is docked near a residential area. An umbrella and rain boots in a corner A laptop and a desktop computer sitting on a table. A zebra and a giraffe walking in opposite directions. Two children staring out a window while on public transportation. Giraffe and zebra grazing in a field next to plants. A stop sign is shown on the side of a corner. A modernist kitchen, with a white and aluminum color theme. a large room that has a big kitchen table in it A baseball player hitting a baseball with a bat. A baseball player running to catch a baseball during a game. A large white bus on a city street. A woman putting post it notes on a wall in a room. The baseball player is practicing his swing for his favorite game. Assorted pastries and tongs have been arranged above stacks of plates. A cellphone and a remote control sitting on top of a book. 
A woman is skiing down a high mountain. A man standing on a tennis court holding a racquet. A herd of elephants with birds at sunset. A white plate topped with broccoli and meat covered in sauce. A woman is smiling and holding a monkey. A black and white photo shows workers working on a road. A small girl holding skies in the snow A person is running with a kite in the air. People watching two elephants from behind a cement platform. A man flips a skateboard while doing a trick. The toothbrushes have a holder on the bathroom sink. a train station with a train sitting parked in it A guy on a skateboard at the top of a concrete bowl. Many stuffed animals hanging and sticking to a tree. Several "One Way" Signs are placed near an "All Way" Stop sign. A man riding a surfboard on a wave in the ocean. Two young boys eating carrots while sitting on a bed. a long train is going down the rail road outside Five benches in the park in an area surrounded by trees some people are pushing a truck in a lot A group of small children having a birthday party. A dish contains carrots, onions and other vegetables. The two baseball players are walking on the sidewalk. A room with decorations on a shelf and a painting on the wall. a close up of a laptop on a desk A laptop computer is sitting on a table top. a couple of people on skate boards do a trick A closeup photo of a bulldog wearing an Army style hat. A sprinkled doughnut with pink icing sitting on a plate. A kitchen area with refrigerator in the background and a sink and stovetop oven on the side surrounded with wooden cabinets. two zebras are in their pen at the zoo A baseball player holding a bat on top of a field. A couple of skateboards, two sitting on the sidewalk, the other on the board. A red stop sign sitting above a traffic light. A desk with two computers, phone, and other accoutrements. A teenager in wild clothing playing a video game Blue, pink, purple, and yellow flowers are in a red vase. 
A hot dog with a large amount of cheese. A city street filled with traffic at night The box of a dozen donuts has two different flavors. A living room setting with furniture and lamp The three men are dressed in costumes. A red double-decker bus driving down the road. two giraffes and one is eating some food An iPod and a laptop computer on a desk an orange and white cat and its orange play toy The closeup of a clock on the face of a tower. Large poster on wall behind white commode in dark tiled bathroom. A dish with shrimp and cucumbers and lettuce. A man with a skateboard that is up in the air. Children are looking at a zebra in an enclosure. People are sitting on a motorcycle with a woman standing behind them Oriental woman preparing to put a toothbrush into her mouth. A table with a laptop, bag of coffee and cellphone on it. A girl excited about a cake at her table. Dog laying down on a grey and yellow striped couch. The top of a desk with a keyboard, computer and phone. A coal fired train with passengers behind a split rail fence. A collection of fine furniture is displayed in a room. A transit train badly in need of a paint job A plate with a roasted carrot and broccoli on it. A woman holding a tennis racquet prepares to play tennis. A person gets ready to swing their racket. a girafee looking around by some people A girl stands on a bed and appears to be crying. A very large building with a tower near some water. a close up of person sitting with a laptop a cream colored dog lying on a brown carpet. White toilet with a shower with a tree on it beside it. A catcher has his mitt out as a baseball batter swings his bat and hits the ball. A man who is sitting at a table with a plate in front of him. a lady covering herself from rain with an umbrella A man wearing a gas mask and a suit and yellow striped tie. A glass bowl filled with noodle salad on to of a table. Two elephants are standing on the grass near some trees. 
Three boys sitting in chairs with game controllers. A city intersection with a sign redirecting traffic. Nice looking front room with brown furniture to decorate it with. A train with a red caboose sitting on tracks. Military looking truck parked in an old warehouse The living room actually features several different colors. Small pizza sits on a plate on a restaurant table. A man holding a bat next to a catcher and umpire. The teddy bear was left on the empty bench. very many pizzas in a plate in a kitchen A black microwave on a cabinet in a hotel room. Man flying a tailed kite high into the sky A man that has a gold tie on. The outdoor furniture with a table umbrella is made out of wood. A family sitting at a booth in a restaurant looking up. Two women standing on tree stumps with a boy and a teddy bear A group of men let their horses drink from a fountain. The girl is eating her pizza with a fork. a grey dog seated on a chair of a vehicle A brown cat lying at the back of a car A group of kids standing beside an opened fire hydrant. A herd of giraffe running across a field. A person rides a horse in front of a large group of people. A woman sitting with her legs crossed on a bench in a green field. A man riding a skateboard on top of a cement park. BRIGHT RED FIRE HYDRANT WITH A SIGN NEXT TO IT The cat is laying partially in the light with its eyes closed. A couple of men playing soccer against each other on a field. Two people work on a shed while standing on a tractor. there are two grizzly bears walking down the gravel road Two men running and playing baseball with plate and grass A couple women with remotes in a room. A red trolley passing by a group of people under umbrellas. A young girl standing on top of a tennis court holding a racquet. A large open concept living room leads into the dining room. A person with a red umbrella walking towards a bike chained to a lamppost. 
A bunch of broccoli spread out on a table A closeup view of food on a plate a dried out tree with fruit hanging on it A small and a large teddy bear sitting in plants View from the stands of sparsely attended tennis match A man that is sitting down next to a cops motorcycle. a brown table with a toaster a plate and a black microwave A snowy road with snow covered trees on which a skier is traveling. two guys are outside moving a refrigerator An Asian woman in front of a body of water with two umbrellas A man standing on top of a blue tennis court. Signs at a city intersection indicate no turning is allowed. Two elephants walking in water next to grassy area. A man on a skateboard doing a trick. Woman in a white uniform holding a pencil to wall. A small boat with several flags on it moving across the water. a big building with a clock built inside of it A market has many fruits and vegetables on display for sale there is a large truck and a yellow truck behind it a store front has many stuffed animals on display a tennis getting ready to hit the tennis ball A group of sheep walking down a path with a few stopping to eat grass along the side. A small cat laying on a couch in a room. A fire hydrant behind a gate on the sidewalk. Giraffes eating leaves from bushes near logs on sand. A man sitting in a chair looking at someone's food. A zebra with its mouth open and lip in the air. A man holding skis and poles and walking up a snow covered mountain slope. A man is cross country skiing on a bright day. a woman in white is cutting into a cake A person surfing a white water rapid. A glass vase with yellow flowers in it A rear view mirror on the side of a car reflecting a mountain range. Two sheep are in a barn standing next to each other. A double Decker bus is traveling down a street. An old steam locomotive waits at a country station. 
A snow covered parking lot meter in front of a building London double decker bus in motion on street The horse is grazing in the fenced coral. a zebra is laying down in some dirt A man in his skiing gear is on his board looking on. a guy on a surfboard with a kite attached to it. A sheep with its new born lamb in a field. A toy ship made out of Legos is attached to the side of the refrigerator. A person up in the air on a skateboard. Elderly man sitting on a bench facing the beach. A bathroom has pictures hanging on the wall. A small hot pink bathroom with a few touches of royal blue on the toilet is shown. A father and son in a kitchen preparing a meal. The parking meters are posted beside a cement wall. these men are playing a sport in a field A large city bus making a turn at a crosswalk with a clock tower behind it. Two woman play Wii video game with wireless controllers. A square of cheesecake on a marble cutting board with a two-pronged fork. A plant in a blue cup on a windowsill. A bright red bench sitting in front of a decorated store front. A man has some food hanging out of his mouth. A group of people standing around each other near a street. A child at a table sitting in front of a birthday cake. two giraffes standing next to some trees Many people are standing next to a very large plane with its bottom doors open A man in black shirt and white shorts playing tennis. Two different transit trains can be seen in this photograph. A group of people around a table with a blue tablecloth A bedroom with two beds and a table with a lamp. A bear jumping into a pool of water. a man with a napkin at his neck eating a dangling food An elephant walking by a group of ATVs. Two people that are sitting on a table. Two photos of a young man in a suit and tie A young boy with a helmet rolling down the street on his skateboard Commercial airplane flying in the air on a cloudy day. The neat bedroom has a large window in it. 
Two cows on a grass covered hillside on a sunny day. bathroom with white toilet and white sink berside each oher A bowl filled with yellow bananas and green apples. An aircraft soars over a beach near a city. a person sitting on a bench while the rest look somehwere else A lot of people that are on a sidewalk. The sun is blocked by a statue holding a round object. A man with an umbrella hat stands next to another man. Four horses walking across grass on a lake with mountains in the background. Outside view of the MGM Grand in Las Vegas with people sitting and walking. a brown and white cat is looking in a mirror with glowing yellow eyes A fork sticks out of a parking meter. A man in a bathroom on an airplane. Bunches of bananas in yellow and green hang from a ceiling. A white and red double decker bus on street next to car. ACCIDENT SCENE WITH FIRE TRUCK, AMBULANCE, VEHICLES, AND PEOPLE View of jet airliner taking off over tree top. A train sits on the tracks while people stand near by. A professional baseball player about to hit the ball A full view of an outdoor space with many things to see. a red motorcycle with a windshield parked on the sidewalk A dog with a tiara on and his head rested on an armrest. Two guys standing on the right hand side of a motorcycle. A bath tub in the center of a large bathroom. A stop sign and two fire hydrants set up in the woods. A condiment filled hotdog is in a red basket next to an iced beverage. Two slices of pizza on a table with one beverage. a store sits in front of a fire hydrant A set of three cow statues siting above a crowded walkway. A large plate of doughnuts on a table. a bunch of sheep in a field eating A child on a blanket with an apple. A woman leads a race horse down a cobblestone path. A man drinking a beverage with his sandwich. A person is squatting by a banana tree. 
a small group of fish about to be cleaned Two men standing in a parking lot dressed in business atire An arrangement of fruits and vegetables are laid out on a counter. A kitchen with a large wooden table and clutter on it's counter. A slightly dirty room that has green items on the floor A cat is laying on the lap of a man playing video games. People sitting around a long table using laptops A bear is next to a body of water outside. Girl on phone looking up a statue of Ronald McDonald. White living room furniture looks very modern and clean. A hose sitting next to a fire hydrant on the street. A large clock stands on a post on a city street. Two giraffes standing in a rocky area by a river. A group of women are gathered by a long table of food. A stop sign that has been covered with graffiti. A dog with its mouth open about to ear pizza crust . A shelf with donuts being sold six for five dollars. A muscular man surfing on a vast blue ocean Herd of elephants crossing a water hole next to another herd of elephants. A collaboration of people in different pictures doing things Airplanes on the tarmac in the rain at an island airport A person jumping a skateboard at a skateboard competition. there is a man pointing up standing on a building A boy and dog sitting on a recliner the boy looking at a laptop. Men on horses herding a group of cows down a road. A skier looks up to the camera above her. A row of urinals with air freshener boxes on a wall. a person standing in a door way and a horse in the foreground A baseball batter readies himself waiting for the pitch. kids out on the field playing soccer together A man flying through the air while riding a surfboard on a wave. Man in red shirt standing in front of a man holding a frisbee. Two mountain peaks rise above a large meadow. A bike in view in a living room with a Christmas Tree in the background. 
a group of horses standing next to a tree in an open field A player sliding onto bass while an opposing player tries to grab the ball at a baseball game. a big bear that is staring at a camera To people sit on benches in the rain, holding umbrellas. a person in a kitchen with a pizza A mans toilet attached to a black pole. Someone holding a stuffed teddy bear in their arm. A crowded city street is full of big umbrellas. A naked baby is on a bench in a backyard. people standing outside of a building with a fire truck A meat filled pizza sitting on a pan on a table. A rainbow colored kite caught in the branches of a tree. A young boy riding a skateboard up a ramp. She is eating a sandwich and having a drink. A kitchen with marble counter tops and black appliances a stop sign graffiti written on the front a train yard with several stopped trains waiting to go Street signs, including a stop sign, where someone wrote "Don't stop believin!" Two pillows sitting on the ground next to furniture. A person is holding something donut shaped in their hands. A woman with glasses and a scarf skateboards along Hollywood's Walk of Fame. A glider gliding in the sky over the ocean. a group of people sit around a table The pastry has a substance in the middle of it A group of baseball player congratulating each other. A man strikes a tennis ball during game. a number of people on horses playing polo some baseball players on some grass and some trees Two people climbing a mountain on their skiis A view of a oven with the food flipped over in it. A woman sitting at a table while using a laptop. A counter topped with lots of pizza and sandwiches. Several zebras walking in the shade near some trees. A kitchen has light wood and shiny floors. Plate of food including chicken, pasta and vegetables. Girl competing in a horse competition at the county fair. A white train sitting in a train station next to a Bologna sign. A zebra standing on top of a grass field. 
a motor bike sits parked on a cracked street A number of food items and two beverage atop a wooden table. Vases of flowers sit among plates of pastries. An airplane is on a snowy runway at an airport. A dark colored dog sitting on blanket and looking up. A bunch of different fruits sitting in baskets and on a table. Vegetables and fruits are on a brown cutting board. A low angle view of a church clock. A man and woman sitting by a pile of bananas a person sitting in a boat with a dalmation A group of people near a table full of bananas. A large flock of sheep are in a grassy meadow. A person sitting on a machine with wheels in the middle above a pedal. This lamp is standing near a wall that is painted red. A toddler laying in a bed with pink sheets. A person on skateboard in a parking ramp looking area. An old, dilapidated toilet with a broken seat A smiling clothed man sitting on a toilet. A huge white and blue airplane sits on the runway. Two men shaking hands and one being presented with a key. Stuffed animals on a shelf with some books. All of the donuts each have a different flavor of icing. A close of a bobble head doll with a computer in the background. A bathroom with toilet glass sink, mirror and extra toilet paper. A very cute orange cat laying with some shoes. A clock tower sitting behind an illuminated star display on a tree. A paper plate topped with a slice of cake next to a spoon. a shop with a bunch of signs sitting out front A room with wooden floors and wooden walls A school bus that is made by Chevrolet has a few bumper stickers. A suitcase full of random assorted food items A living room with couches, a table, and a fireplace. A plane takes off from a runway while a large building stands in the background. A trolley with people on tracks in a rural area. The batter prepares to hit the ball, while the fans watch from the side. Young snowboarder spending time on slope in ski area. 
a sink with a microwave oven on top of it A delta airlines jet sitting next to a truck on a runway. Two people standing in the reflection of a mirror. Two children on a soccer field kicking a soccer ball during a game. Group of motorcycle riders looking over traffic on the street A man holds an enormous sandwich in front of his face. Group of motorcyclist riding motorcycles down a highway. A man skiing down a snow covered ski slope with two ski poles. Small boy in green shirt touching a yellow fire hydrant. A small herd of cows near a water bank. Decorated coffee cup with spoon next to miniature bicycle. A man standing next to a woman in a kitchen preparing food. two street signs on a pole on a sidewalk next to a street. A man walks out of a colorful train onto a platform. A pudgy man holds a huge hot dog and chips. Donuts and a cell phone laying on a table. there is a man with tattoos talking on the phone A man with a racket goes to hit a tennis ball. A high speed train pulls into a platform while people watch. A bed in a room with two windows. two plates side by side, one with a roll and jam. A young zebra stands away from the zebra in the light. There is a white stove with pans on top of it and next to it, a refrigerator. A lady sitting at a kitchen table alone. A tennis player serving a tennis ball on the court. A typical living room with couch, glass coffee table, television and water dispenser. A cop leading a gang of bikers down a street. A couple of giraffe standing next to each other in a forest. Laptop computer sitting on a table with a sticky note on it. An animal grazing under a wide, gnarled tree. A woman laying on a surf board is riding a wave. a group of people next to a train with a sky background A cat staring at a another cat hidden in a travel bag Several people gathered around a table that has a cake on it. A coffee cup sitting on a pad of paper next to a keyboard. A white plate is filled with a variety of doughnuts. 
An aisle in a store that is selling holiday items. The player is hitting the ball with strength. The man is holding a glass pan full of liquid mix. There is a row of cows with baby cows next to them. The black cat is turning away from the large computer screen. The window to the store has graphics on it. A plate of food and a cup of coffee. a girl with a microphone talking about a cow Carrots and cucumber on wooden cutting board near knives. 2 buses and numerous cars move down the street. a male in a brown shirt sitting on a bench with a laptop A painting of several flowers in a vase sitting on a shiny surface. A large yellow dump truck driving on top of a sandy beach. A person is riding a horse inside an obstacle course. A man is standing near a computer giving a presentation. A black cat with green eyes rests on colorful blankets. PERSON GOING FOR THE RETURN ON A TENNIS COURT a view of a keyboard, remotes sitting on a desk A man riding a skateboard down the side of a ramp. the sandwich is on the plate and has been cut in two A man is standing at the base of a ski hill. A refrigerator and a stove in a kitchen. The bottom of a large airplane flying overhead. a man wearing a yellow snow jacket and black snow pants snow boarding. A piece of chocolate cake is in a plastic container. A small airplane flying over a body of water. A child holding the hand of an adult while moving on skis. People are walking on cobblestone with umbrellas and shadows. A child is under the covers reading a book. a plate with some food in it on a table. A man is riding on skis down a snowy mountain. A batter, catcher and umpire in a baseball game. A line of black and white cows are lined up and grazing. a woman is working at a pastry shop A cat is sitting on top of an entertainment system A woman sitting on a bench with a mean look on her face A large group of people on a grass field. A person sitting in front of a laptop computer. 
a desk with a bag and a bunch of other things sitting on the floor A dog runs alongside a skateboard with one paw on. A skateboarder balances on his skateboard, then balances on the board at the edge of a low wall. A man riding skis down a snow covered ski slope. a close up of a plate of food with broccoli A man wearing a wet suit riding a wave on a surfboard. a police on a big white horse in front of a retail store A small cat standing by a mirror on the ground. A bedroom with a white bed on a frame next to a window. A white table topped with lots of plates and food. A suitcase surrounded by some items on a floor A man holding a Wii game controller while standing in a living room. A fried dish is pictured on a plate. A tall clock tower sitting on at the end of a street. A herd of wild elephants walking along a dry grass filled hillside. A group of people sitting at a table with plates and soda. four woman standing next to each other with bike helmets on and holding bananas A woman walks in the road shimmering with rain past the city lights. A man making a cut into a celebratory cake There are cars parked along the side of the snowy street. A red crafted bird is pasted to a parking sign. close up of a cow standing on the other side of a barbed wire fence A small boat rests on wooden planks by the water. Man on skateboard on top of wall in factory. A plate of food featuring burger patties, potatoes and carrots. A girl lying in bed and playing a handheld game. Two clocks on post next to building in street. Looking down at cookies baking in a home oven Colorful flags hanging lined up in a row. A black and white image of an older air plane. A stuffed animal dog birching out in front of people at the beach A box is full of old items as a tribute to Forrest Gump. A vase that has flowers in it on the table. A dog on a bed looking at something. A man on a surfboard that just caught a wave A bird stands next to many black benches. 
A woman with makeup bruises is in a suitcase. a man lifting the lid of a square shaped toilet A man is pulled by an unseen boat while water skiing. Small cup of baked brownies being scooped out into small snack sized dishes. a group of people sitting at a table to eat at the beach a couple of people that are in a kitchen The train can be seen through a chain link fence. Birds stand on a side walk under the large trees. A view of a great room consisting of a living room and dinning room. a street light with street signs in front of trees A photo of some cows standing in a field. many luggage bags near each other on the ground A line of people in suits holding roses. A BATHROOM WITH A TOILET AND A SINK That collage of nude women probably means this bathroom belongs to a guy. A man standing in a room holding a drink and a game controller. A white toilet and sink in a room. A child showing a banana to the camera. A close up of a cats profile is shown. A man is standing next to another man who is laying on the floor A man is looking at a bus stop sign A baseball player holding a bat standing near home plate. A man holding a laptop sitting beside a woman with a small child. a toilet with a remote control mounted on the side A couple of women riding on the back of a horse drawn carriage. A man at a campground eating a sandwich. A man sitting at a table with food and beverages in front of him. A bride, groom, and minister at a wedding ceremony. A TV sitting on top of a stand in a living room. Four people riding on horses along the beach shore line A train engine carrying carts across a bridge over water. A black train sits on the tracks as people stop to admire it. A kitchen with two stoves, an island, and appliances. a woman walking down a crosswalk next to woman riding a skateboard A double length metro bus drives down a city street. A tray covered in tin foil on top of a counter. A clock sitting on top of a sidewalk. A park bench by a body of water. 
Two cats laying next to a cup of coffee. A hairy man is holding a frisbee on the beach. Two people hold their colorful pastries next to each other. Many skiers are going up a snow hill A picture of a baseball game being played in a stadium. A collage of pastries, and a boxed of donuts. A triple layer cake sitting on top of a table. A red and yellow traffic sign sitting on the side of a road. A bowl with red and green apples and an orange. A living room with furniture, television thrown rug and a window. a small child standing in a living room eating something a dog moving towards the horses at the mountains man with skull decorated surfboard eyeing the ocean A young man standing on top of a snow board in the snow. A cyclist passes a bus while it picks up passengers. a big living room with stained glass windows leading to a piano Men's doubles tennis players shaking hands on the court A white cat is sitting on a white sofa. A woman wearing a white shit and apron standing by a man in front of a traffic light. A white plane sitting on top of a runway near a building. An old black and white photo of a man holding skis. Cows in pasture within a fence on a field. a broken toilet bowl base overturned in a shrubbery next to dirt and rocks. Bathroom with a toilet, glass sink and a mirror. A couple of animals grazing on a dry grass field. Electronic and personal items from a back pack laid out neatly Female flying a kite in an open field. a woman posing on a bench in front of stony ruins four giraffe stand at a tree all with their noses stuck into some kind of nest A group of elephants walking through the street with a pepsi stand in the background. A man holding white surf board on the beach A walk in shower next to a tub in a bathroom. A kids baseball game with a runner sliding into home A black, blond and white cat crouches on the side of a table with a cake on it . A man on a bike balancing quite a bit on his head. 
A group of female soccer players at the pitch playing Horse jumping over an obstacle on a course with a rider. Man cooking marshmallows over an electric stove with fancy tongs. A plane on a runway drives off to the air A group of people playing Wii in a family room. Flowers in a vase full of water next to a window. Many different piece of luggage that are open on the floor. two people with two dogs on a surf board and one dog swimming a plate filled with assorted veggies and cheese A train engine is sitting at a train station. A couple of people on skis in the snow. A bathroom with a toilet, towel rack and a tub in it. A man in wet suit surfing wave on a surfboard. A cat is curled up on a bed beside a remote control. A couch with a cat and toy teddy bears on it. a number of horses standing near one another A child in snow gear and skis on a ski slope. An old truck with no passenger door with tires and body painted in different colors. A close up of a plant center surrounded by leaves A couple of kids are skate boarding down a street. A yellow train is stopped against a barrier on the tracks. The soccer player is kicking the ball while a crowd watches. A silver pan filled with food on top of a stove top. A man riding a dirt bike on top of a sandy beach. A cat is stretched out on a couch under a window. Close up images of bikes parked next to the highway. a table with some glasses of beer and some pizzas on it Two horses giving each other a loving nose kiss. A monkey with a banana sitting in the dirt. A table that has a plate of food and a glass of wine on it. A bowl filled with ice cream, sprinkles, cherries and other toppings. half a dozen giraffe in a wooded area Two skiiers jump down a snowy slope towards a ski lodge. A full, black and white coffee cup held in front of a computer keyboard. This kitchen has white cabinets and counters and silver appliances a man that is walking down a sidewalk A beautiful young woman riding a pink skateboard. 
A commuter train makes a left-hand track change to change direction. A beautiful Asian girl with a white rose in her black hair. She is holding an open blue umbrella over her head. A man riding a motorcycle on the street. a group of people holding wooden utensils a smiling at he camera A bear made out of gummy bears in a candy store. This aerial shot shows several people using a cross walk while holding umbrellas. A group of people sitting on a couch in front of a cluttered table. A white bed topped with pillows sitting next to a wooden night stand. A boy is playing tennis with other people in the background Many people on the beach with large colorful kites flying in the air. A women is holding an ID and holding a pair of scissors to it. The elephant is walking outside by himself along the wall. A woman with nice legs laying next to a purple umbrella. A group of teddy bears all dressed in Pilgrim and Halloween outfits. A man in light clothing stands near a boy with sunglasses and jeans and they are both by a white glider. Display of ornamental vases and figurines with oranges stacked on stands. Athlete in motion during attended competition on gray and blue court. A large jetliner flying over a body of water. A kitchen with stove, refrigerator, and cabinets in it. Two people riding bicycles alongside the river on a sunny day An older gentleman flies a kite on the beach. A variety of items are spread out on the bed. a bathroom with a sink right next to the shower Two girls in pink robes standing in front of a television. Trays of party food lined up on a table. A copious amount of food are served up in the kitchen wares. A white horse is out eating in a field Three donuts piled together on a small plate. An airplane on the runway either just landed or ready to take off. The cat was laying in the sun on top of the zippered bag. A train with two cars is on a railroad track that splits into several directions. A desk nook area has a desk, a chair and a book shelf. 
a man is arranging a set of appetizers on a tray A train is parked at a depot on the tracks. A man hitting a tennis ball with a racquet on a court. a person riding a skate board at a skate park A silver truck driving past a giant arch from a mcdonalds. A man in green shirt riding on an elephant. A man is wearing a robe and a tie. A man is standing in front of a grill with an umbrella. A young boy aims his video game controller as a man watches. A group of cows standing around in an open field. Orange and white cat laying down and chewing on some cups. Woman on tennis court grasping racket with both hands. A piece of pizza being held in a persons hands. a red fire hydrant at the corner of a street A green and red semi trailer truck front without a load. A couple of men standing on a lush green park playing a game of frisbee. the duck is looking over the side walk A group of men standing next to each other. A zebra in an open ground near a bench. A toddler brushing his teeth and gums at the sink. A group of zebras that are standing in the grass. Man mid swing playing Tennis on tennis court Boxed hotdog, fries and a drink are set out for daytime reading. Wooden mantle holding two vases of flowers and a picture. Tilted pic of a mountain road with a street sign. A couple of people kneeling over a pile of snow. A bird with outstretched blue wings is sitting on some bird feeder. a kitchen with a table a stove and an oven A red stop sign sitting on the side of a road. A bed sitting in bedroom under a picture. a living room with big couches and a ceiling fan A person with glasses on the phone in a restaurant. An elephant is standing in a grassy field in front of trees. Four boxes of donuts of various descriptions on a table The baseball pitcher has wound up his arm to pitch that ball. A little girl is holding a Minnie Mouse umbrella above her head. A man is looking at hanging fruit arrangements. There is plenty of clutter by the computer on the desk. 
A cat in a bathroom stands on the rim of the toilet. a red fire hydrant with two nozzles on it A giraffe is standing in a field with a group of zebras. Someone who is holding a hot dog in front of a box of teddy bears. a man in a uniform is cutting a cake A man on cellphone and woman walking by building. a teen girl sitting at a table with some pizza in front of her The ingredients represented in the meal might include pineapple. A small group of men playing with a frisbee. Several people are swimming in a blue lake A train emits thick steam as it moves on the rails through a flowing plains. An orange tabby cat stands in a doorway with a bookshelf in the background. A group of skateboarders standing around while another skates. A family of elephants stand close to each other. some yellow signs attached to a building wall A variety of boats are shown in the water. A woman speaks into her microphone while looking at the cow. A green umbrella over some chairs and tables The pitcher is winding up to make the pitch. A very cute girl holding up some scissors. A male and two females jumping to catch a Frisbee. There is a train moving along a railroad track. A couple of people that are sitting on a bench. The man is watching hockey on his computer. Man on a boat carrying large quantities of cabbages. a parking meter that has been drawn on A group of people standing around a white cake. A white and black potted plant with a mirror behind it. some kids are standing outside with an umbrella A train car traveling on a bridge over water. A boy with two marks on his back stands on a skateboard. a surfer runs into the waves on a beaching with his surfboard a tattooed man with a skateboard thinking about doing a trick Two cats who are laying down on a bed. A plate of food, that appears to a small omelet and other pieces of meat. A dog that is sitting in a window. A cake shaped like an elephant squishing a horse. 
A WOMAN SITTING ON A BENCH EATING PIZZA WITH A LITTLE BOY a pepperoni pizza sitting on an oven done cooking A pitcher standing on a mound on top of a baseball field. A man riding a snowboard down a snow covered slope. A man kneeling down next to a little girl. A dog with a frisbee in its mouth is jumping over a man lying on the ground. A giraffe looking over the corral fence in his zoo habitat. A horse and foal are standing in the meadow. This is a picture of a black furry cat on a laptop. The man is riding down the ramp on his skateboard. A couple of people walking in a parking lot by several motorcycles. A man holding an umbrella light on a beach. A man in a red and white baseball uniform holds out a bat toward a baseball on a baseball field. A man standing in a kitchen in front of a stove top white oven. A small bathroom with a toilet and flushing system. A living room with a fireplace and contemporary furnishings. A pizza that is sitting on a table. A cat in a chair peeking above the table's edge at a drink. Leaves sitting on a street next to a parking meter. Young man gliding along rail on his skateboard. A mother and child carry kites through a park. A blender with something in it to blend A batter is getting ready to take a swing. A man flying through the air while swinging from a pole. A small green boat at a dock in the water. A desert topped with whipped cream is sitting on a plate. Two women who are riding in a horse drawn carriage. A vase sitting on top of a roof top. Pizza sitting on top of a table next to a couple of wine glasses. a kitchen with a refrigerator a sink and a stove Blue and purple vase sitting not he side of a white wall. A boy eating a doughnut in a diner. A line of traffic beside a metro bullet style train indoors. A woman is on snow skis on top of a mountain. A large black bear walking through a forest. a bench dedicated to someone with a weird edge She is eating a slice and watching the small countertop TV. 
A river with rocks in the middle and a train trestle in the background. A boy in a yellow shirt is riding the edge of a half-pipe on his skateboard. A plane flying over the beach with a mountain in the background. the man has returned a server of a tennis ball A large Japan Airlines jet landing on a runway. The back legs of a cat dangling over a keyboard. there is a large wooden platform bed in this room Residential bathroom with wooden cabinet and mirror next to shower. A dog plays with a frizbee in a pile of snow A man is standing up, taking a shot of the water, while a pigeon looks on. A man wearing a bandana, holding a skateboard. A woman is riding the waves on a surfboard. A giraffe is in a field of grass eating leaves off a tree. A black and a white horse are grazing in a green pasture. Kites being flown from the water in the ocean A man is riding a bike while using a cell phone. a group of people are at a market A room with a table, chairs and a doll in it. The woman is eating breakfast in the kitchen. This shows an innovative Apple device and a keyboard. a small bus sits parked as a kid runs across the street a building with some windows next to a street A man surfing a nice wave on a bright day with a ship in the background. A wet bear stands in the river looking for fish to eat A white table topped with a flower surrounded by chairs. a woman skiing down a ski slope in the slope A smiling man stirring something in a kitchen. a male with a beard a book and a child in bed A bird sitting next to a dried cob of corn. Man walking up the side of a mountain with his skis on. an oven outfitted with several Christmas lights A girl on a bicycle is stopped before crossing traffic. Man sitting on chair in kitchen with baked pizza on table. A little girl buying a small teddy bear. four men in an office working on their computers A skateboarder is riding the green ramp. A skateboarder performs a trick in a skate park. 
Several people are talking next to a yellow plane in a hangar. A person points a remote control at the television. A brown and white dog laying on top of a green field. A red and black motorcycle with people in the background. A boy that is on top of a skateboard. Three giraffes tower above trees and brush as they feed. There are small trees with oranges growing on them. Older black and white photo of a woman playing baseball and swinging a bat. A white frisbee laying on top of a dirt field. Group of giraffes standing by a pile of wood in an exhibit. Two brown bears walking on an unpaved forest road. A bear walking on a fallen tree in the woods. Two horses that are pulling a piece of farm equipment. People on a tarmac board a Qantas airplane. Up close to a giraffe in its natural habitat. A red truck is parked on the lawn of this house A man's feet resting on a skateboard A child standing between two luggage carts behind a car. People sitting around a table as someone puts stuff in a blender. A city intersection displays a clock on a long tall stand. Four giraffes encircle the palm tree within the fence. a stop sign with some graffit on it A lit birthday cake has some penguin candles. A building displaying a clock showing the time to be 6 oclock. Some very pretty whit bowls with some food in them. A man is eating a peanut butter and jelly sandwich. A black and white cat sitting on top of cabinets. An elephant stands in weeds with trees in the background. A man and woman sitting on a motorcycle. A shirtless man reading a book and eating. Park bench near tree during fall in open area. Two people standing on the beach with a kite. A man making a surprised face is getting a hair cut. A young boy learns the meaning of the word strike. There is a batch of doughnuts being made a man sitting at a table with a plate full of food a white horse is standing near a train The bus is parked at the bus stop. 
a man taking a picture of a truck parked next to a building A group of young people standing next to each other on a beach. A train traveling down tracks next to a power grid. Three giraffes in a field with an Egyptian theme in the background. Someone on a snowboard holding the bottom of it in mid air. A gourmet style pizza with a variety of vegetables. Cows graze in a field in front of a lake. A room with a sunny window contains a bed and a desk. A toddler holding an electric toothbrush to his mouth. a close up of a clock on a pole with a wind tool A white refrigerator and cabinets in a grey kitchen. A partially eaten plate of eggs, bacon and toast. Garbage and police trucks on a city street A small gray goat standing on large rocks. An orange and yellow flower sitting in a see through humming bird. A yellow and orange fire hydrant in front of a building. A group of snowboarders snowboarding down a mountain. Man stands up on his bike and looks up next to a parked car. A herd of animals standing on top of rocks. A bus is traveling down a street near a building. A few birds are on the roof of a house. A woman sitting on a rail next to skis A kitchen with a microwave, cabinets, stove and dishes on the counter. A child skier standing at the bottom of a slope. A teddy bear sitting on a tricycle on a sidewalk next to a flower bed. A man standing next to a woman in ski equipment. An acrobatic dog catching a frisbee mid air. A blender is full of food being prepared to puree. Girls walking in a park talking and taking pictures. A group of people sitting around a living room together. a couple of people that have tennis rackets in hand A group of people sitting outside at a restaurant table. A woman standing over a stove cooking food. A little league pitcher standing in a field holding a catchers mitt. A colorful doll-house bedroom with one girl doll occupant. People walk across a footbridge that stretches over a river. 
A person holding a dog who is looking at it's self in a mirror. Overripe bananas on plates with breakfast food packages. Two double decker buses passing each other on the street There are several people riding mopeds and motorcycles traveling down the street. A man holding a flying disc in a park. A large man is holding a black suitcase. An elephant throws dirt on his back with his trunk Two giraffe, two zebra, a monkey, and two flamingo are searching for food. a young boy wearing ski equipment in the snow. A surfer is on the water and is waiting for a wave. A herd of elephants walking through a lush green field. A woman with a tennis racket tosses a tennis ball. Police car parked behind a car illegally parked at fire hydrant. A herd of elephants splashing and playing in a waterfall. A group of people sitting at a table with beverages in front of a window with ocean view. A woman with a pen is writing while a man in a tie is watching. A bathroom scene with the sink and shower. A cheesecake on a plate with a croissant behind it. a woman reaching up her arm as she looks at tennis ball Round mirrors above clean sinks in a public bathroom. A man playing tennis, ready with racket in hand. A photo taken in a mirror showing the side of a truck. a man in a surfer suit walks down a street A gray airplane with metal petals on the wings takes off from an airport. A European tour bus with luggage on top on a brick city street. A close up of a glass bowl full of small oranges. A very beautiful woman wearing a black hat, black shirt and tie. An ipod plugged into a dock inside of a kitchen. A man sitting on a bench waiting to get a ride from a bus. an elephant behind a fence at the zoo A cat sleeping on a bed with its head on a teddy bear. A person viewing a picture on their cellphone. A meal with two plates full of broccoli and other items. A refrigerator adorned with several magnets and clippings. A few people are getting to know one another in affection. 
A street light turned green on a dark street. Small bird sitting on a skateboard posed in front of dark blue background cloth. A man leading a horse around the town. A sign that has a camel on it. Color fruit is on a stand including pears and apples. A giraffe standing alone next to some trees. A man is holding a military medal in a bar. A silver and green train stopped at a train station near kids. A couple of knitting books sit on a couch. Single zebra standing in a field of semi dried grass. some blue and orange surfboards on the sand water and rocks Small child in baseball uniform standing next to players. A boy swinging a bat at a ball on a field. The man is on a horse pointing his finger. A small elephant lawn decoration near a plant. A child dressed in random clothing standing barefoot in the kitchen. A woman in short shorts standing next to a young man. some people some buildings and some are flying kites The back end of three zebras walking in a group. A man in a red shirt motions toward his cell phone. a corner of a building with the name of the street on it. A man is checking his cell phone while snowboarding. A sandwich in a basket accompanied by a beer and a lollipop. The ships are all docked on the beach by the water. Skiers on a snowy slope are high above a small town. A dirty nasty urinal in a very dark rest room. A family gathered at the table eating breafast a black and silver trains engine and a car and grass A steer walking down a busy market street. A pizza with several toppings sliced and ready to eat. A child dressed in a skeleton Halloween costume. Teenage boy about to catch the flying Frisbee. Soldiers with guns in the back of trucks in a parade. A boat that has been beached on the shore. A couple of hot dogs sitting next to a basket of fries. Four meals have been placed on a table with beverages. A photo of a couple singing karaoke. Person in black surfing a wave near the beach. 
there are two woman that are walking in the street under a umbrella A black and white photograph of a skater performing a trick. A herd of sheep grazing on a lush green field. A group of people riding bikes down a street. two high school soccer teams play against each other. A silver fire hydrant stands in the grass next to shrubbery. Two people with umbrellas stand at the fence looking over the water. A woman stands in line at an airport. A big vase with flowers near a cup and window. A person in a park playing with a frisbee. A television sits above a fireplace in a living room. A single tall flower in a green glass vase sitting on a windowsill. a close up of a bench surrounded by plant life A black stereo speaker near a computer monitor and mouse. A woman standing with a donut and a candy apple in her hands. A red train sits on the rail road tracks. a young woman sitting at a table resting her elbow on the table An elephant in dirt area next to a booth. Two giraffes eating leaves off the trees in the woods A variety of fruits and vegetables sit on a table. Young male baseball player in full uniform and glove alone posing. a really sad picture of some men with guns sitting next to a dead zebra. A small plane with the cockpit open and landing gear down A woman is standing next to a display of giraffes. A living room filled with lots of furniture and a TV. The woman in red sunglasses is walking in snow with ski poles. a close up of a zebra in a field of wheat A group of people riding horses through a small village. A sandwich on toast with potato chips on the side. A brown bear is grazing in the grass. A multi-colored umbrella that is blocking out the sun There is a laptop on a crowded desk. A tennis player is on one foot hitting a tennis ball. A zebra grazing on dry grass in a field. Variety of meat and produce displayed for meal preparation. A red stop sign on the side of a building. A black and red train engine with train cars behind it. 
The lady on the bicycle is waiting for the light to change. a person with one foot in a snowboard a bike shop with various bikes in it A woman is holding an umbrella over her head A couple of guys playing video games inside This is a meal made for two people. a small little plate that has some fruit on it A large pair of scissors on display next to plaques. Delicious looking meal of vegetables, cheese and meat on bread. A public bus near a curb on a wet day. A red traffic light hanging on a street pole. a shirtless male surfer is carrying a white board A man in yellow vest on motorcycle next to a building. A large tree sitting on top of green grass. a person is sitting on a couch while on a laptop A white bathroom with corner shower and tiled floor. A sausage sits on a takeout plate with spicy carrots. A bunch of books that are on a bed. A large red truck visible through the rear view mirror of a car. A young girls soccer team posing for a picture. Two lines of bicycles parked on a brick surface. A snowboarder mid-air above a ramp outside in the snow. A polar bear goes bobbing for fish at the zoo A cat lies in a crib next to a small child. Man pushing a cart loaded with luggage in an airport check in line. A red and yellow high speed passenger train rolling along the track. A large, ancient looking clock tower rises above a neighboring structure. A laptop on a table with a white cloth at an art auction in a hotel ballroom. A white bird with wings spread under a cloudy sky many people sitting on the ground with a big container in front of them Stainless refrigerator and microwave on the counter of a kitchen. A guy with headphones does a trick with a skate board. A picture to people and horses in the water. A baseball player holding a bat over home plate A Christmas tree sitting inside of a living room. A woman sitting on a bed talking on the phone. A subway train with the doors wide open next to a bench and pole. The clock face on the exterior of a building. 
Some animals that are hanging out in the dirt. A lady holding a camera up near a big black dog. a small giraffe that is next to some rocks A dog playing with a toy in the snow A half full glass of red wine on a table. A group photo of men and boys from the Goodmayes Boys School dated April 1929. A white and brown dog laying on carpet under a desk. I can see one tennis player but I cannot see the other. A couple holding wine glasses and holding up a tag reading USQ. Two soccer teams playing a soccer match in a stadium. a group of people playing frisbee in a field a cat laying down stretched out near a laptop An orange kitten is hiding under a blue blanket. Eight dishes on a platter, each with a different food item A room with two side by side beds, one of the nightstand lamps are on. An oven with fire in it and ashes around it. a small child standing above a skateboard on a tiled patio a black cat sleeping on some bags of carrots A cat is looking at a cluttered computer desk. Adult elephants crossing roadway with young in native land. A young guy is surfing in the ocean. Cars and a truck lined up across a train car A man poses in front of some green wood panels. A donut frying in oil along a conveyor belt. Some hands are coming from the closet and reaching for a sleeping woman. A pizza slice is being removed from a pie. The umbrella is ready to be installed at the restaurant. Man with ponytail digging out condiment for sandwich in hand A plate full of food sitting on the table next to a fork, orange, cup and salt and pepper shakers. The student is trying to relax on the floor. there is a man sleeping on a mattress outside A packet of ramen, remote control, cigarettes and a lighter on table. A deser plate with cake ice cream and fruit on it. A table topped with a toothbrush and other items next to a wall. Cars and buses seen through the reflection of a window Green cake with a pair of pink pigs next to it. A young woman is eating a piece of pizza. 
A clock tower at seven forty three in the afternoon. Young boys walking on wet pavement with umbrellas. A clear tube containing a flower sits on the floor. Two men sitting on a couch one who is holding a remote. Three colored beached chairs, yellow, red and blue by the ocean Antique warplane surrounded by safety cones near person. A couple of boats floating along a river. a man is holding a baby and playing with a laptop a street sign outside near a flag pole a rusty flatbed truck sitting by a building A black fire hydrant that has two exits. An appliance is standing next to cabinets in a kitchen. The red clock is displayed for the people can see a close up of a slice of cake on a plate A hot dog covered in cheese on top of a plate. A cat laying on top of a pair of shoes. A laptop on top of a box on a table A picture of some food and some coffee. A room with chairs and a couch next to a fireplace A very dimly lit dining area with some pretty flowers. A tennis player wearing all white reaches hi racket up to a ball. Kites laying on the beach on a sunny day a small candle lit beside a placemat and some glasses A woman standing alone holding an open umbrella over her head. There is a place of food on a white table. A cat and some people on a grass field. a train stopped at a train station with people near by two cows in a body of water near a field A cat sitting on a dresser with a person in the mirror behind it A pallet holds a display of fresh vegetables. Two geese and their babies stand together outside. A military plane is flying upward in the sky. People standing next to sheep and feeding them. Rows of green bananas on a tree with big green leaves. A young woman riding a horse holding a flag Two boys sitting on a bed playing a video game. A meal that looks like falafel and hummous. Three sheep are grazing freely in the open field A vehicle near a stop sign with a poster. A living room minimal furniture and a large window. 
A view of a bus stop from across the street. a person dressed in ski gear in the snow coming down a mountain side Dual digital parking meters are in place and waiting for a visitor. A female soccer player sits in the bleachers holding her ball. a black and white picture of a white man singing a song A man and woman smile while standing beside each other. There are several holiday teddy bears in a shop window. Wall with tools hanging on hooks and two litter boxes under alcove A girl sitting at a table full of bananas. A man brushes his teeth with a toothbrush. A tablet PC decorated with a picture of a girl and three baby pandas. PIZZA, SPOON, BOWL, COFFEE POT ON TOP OF STOVE A dog in a grass field with a Frisbee. A person on the beach flying a kite. An elephant is being taken down a road in the back of a truck A black cat staring out the window behind a computer A man with a black tie smiling and holding a white box. A puzzle picture of a baseball player batting a ball A cardboard cutout of two boys kicking a soccer ball A pizza sitting on top of a white plate on a table. An old fashioned refrigerator in a kitchen next to an old fashioned stove. A mom and a baby who is holding a teddy bear A white horse looking through the window of a tall brick building. A big orange truck driving down a street. A wooden bench written 'CITY OF LONDON' at the park A man smiling while slicing into a cake. A bed with white sheets and a night stand. A stop sign at the intersection of fifth avenue and fifth street. The brown dog is riding a wave on a blue surfboard. A boy and a group of sheep walking away in dirt field with trucks in background. A picture of a very nice clean living room. A person with glasses holds a Frisbee standing in the grass. Locomotive pulling cars on tracks in outdoor area. a cute happy bright yellow and red bird sitting on a tree branch A team of horses hitched and ready to pull a wagon. 
A man riding a skateboard while a child sits on the front of it a newly shaved sheep walks away from it shaven fur there are any kites that are being flown in the sky Two elephants are in front of a muddy waterway trampling in the wet dirt. Zebras are grazing on grass by a car. a row of three ambulances with white and yello paint this kitchen is all white and all white appliances A person sitting in a bed with a laptop before them Someone is wind sailing out at the beach A man is sitting on a park bench speaking on his cellphone. A laptop in front of a computer on a desk and a blue chair with a colorful blanket on top of it. Two men standing and holding video game controllers. A 18 wheeler truck on a highway carrying a large over-sized covered load. A WOODEN HAND MADE KEYBOARD WITH A MOUSE A view of an airplane traveling across the bright sky. A dog laying in the grass next to the sidewalk. A baseball player is swinging high in front of the readied umpire and catcher. a couple of people on the beach playing with their kites A homemade pizza with toppings served on a plate Two giraffes in a grassy field with trees in the background. A flock of sheep are crossing the street next to the cars. A couple is sleeping in a bed with red sheets. A young girl smiles for a picture at the beach. A parking sign and a fire hydrant. A red stop sign sitting under a green street sign. a close up of a bird on a beach near water a girl dressed in red shirt and black pants playing tennis A girl with glasses curled up under a colorful, crocheted blanket Some women are talking next to some sheep. A laptop computer and mouse on top of a desk. The refrigerator and the kitchen is being cleaned. Different kinds of food rest on a plate. A hot dog lays on a white paper next to a can of juice. The living room has two couches and an easy chair. A woman on the beach has a pink hat and umbrella. Horses and carriages are lined up along a walkway awaiting customers. 
A piece of wood with bananas and forks on it Man in white shirt and scarf throwing a frisbee. a large field full of sheep out in the outdoors There is a man about to fall off his skateboard A group of people standing together for a gathering. A long train yard full of different equipment A man with a bunch of plates in front of him by a red house with an open door. A tile bathroom with a large mirror on the back wall. a person sliding in to home plate when the guy didn't catch the ball Several potty pieces with a white background and blue design painted on and one is adorned with feathers. A big city bus parked right beside a building. Young black cat lying on desk with head on keyboard. a close up of a baseball player with a glove a man that has a wii remote in his hand An applegate hot dog is placed in a bun. A row of matching planters are arranged on an outside colorful wall. Group of people walking through a city at night. an image of a tray of food on a table a case in a bakery full of doughnuts of different flavors A vegetable succotash has cashews, broccoli and sauce. a image of a dessert on a plate with toppings A man with lots of tattoos sitting in front of a bowl of food. Adult wearing white shirt and tie holding baby in outdoor scene. a pair of gray scissors hanging on a nail and another black item A horse is standing in the green mountainside grass. A person is flying a kite on a beach. a toilet in a wooden themed bathr oom is open A clock on a tall brick and white tower. A series of images of skateboarders skating and jumping. a couch in the living room near some stairs Trio of large birds sitting next to each other on wooden perch. A painting of a woman sitting in a chair with a laptop computer. A couple of small birds on a wooden pole. This is a collection of different kinds of hot dogs and french fries. Person wearing all white leaned up against a wall with a yellow sign. 
A woman at a table in a restaurant A black and white image of a young woman sitting on a grassy knoll using her lap top. A person that is playing a tennis game. A baseball game is being played before a crowd. A mini refrigerator stocked with bottles and cans of alcohol and soft drinks. A young boy holds a kite in a grassy park. The young man races toward the yellow frisbee. An orange cat licking a blue pair of shoes. A close-up of a desert with cookies, ice cream and a cherry. Two semi cabs are parked neatly beside one another in a park area. a dog begging for food off a table A living room with low lights, a couch and a tv. A television stand has a television and vases on it. A young boy holding a tennis racquet near a house. A clock, bird with missile, american flag at an area that looks like a flea market. Three women at a party posing for a photo. A display of a man with striped tie and a bird on his shoulder utilizing two Instamatic photos Three inset pictures including bottled water, small pizza, and cup of coffee. A man riding a motorcycle down a street next to a train car. Older man rides on a carriage pulled by two horses Several men are unloading trunks from a Model T. A tray with a hot dog, fries, ketchup and mustard on it. A piece of cake is seen on a clean, white plate. Small boy holding a kite over his head waiting. A firetruck without emergency lights on cruising through an intersection. A TV sitting on top of a brown couch next to a pool. A black and white photo of a person surfing. The picture is from underneath the water. A train with a red engine in the countryside. A man riding a skateboard down a street in front of a red car. A trio of men throwing a Frisbee in a field. A motorcycle stood up in a forest with melting snow. a green fire hydrant siting by a yellow pole A pile of garbage sitting on the curb in front of a wall. a few pieces of pizza on a pan Skiers pause for a photo before hitting the slopes. 
A woman plays tennis in a tennis court A zebra standing by a log and container eating grass. two stuffed teddy bears sitting in a chair A cat stands alert on a park bench. Two pedestrian walk signals are lit up at night. A cow licking its side in an enclosure Two female tennis players walking in opposite directions on the tennis court. A man stands in front of a Jamaican food truck in a city. An airplane flying a in clear sky above a light. a man is skating around a cement skate park A orange and yellow freight train traveling down the tracks. A stew pot holding carrots, celery, and squash. A blender has some sort of liquid inside. A lady dressed with a pink hat and unique clothing snow boarding. A green airplane flying over a lush green field. Two children at a skateboard park under a blue sky. A lamp that is on in the corner of a living room. A man holding a tennis racquet on top of a tennis court. A man dressed like a zombie with other zombies around him. An older man and a younger boy play a video game. A clock is shown on top of a building. A small baby is biting into a banana. A very tall clock tower towering above a city at night. A man in glasses wearing a suit and vest. a group of women gathered together side by side in front of a table with pastries on it A man standing on a tennis court holding a racquet and a ball. Three people riding ponies and horses in a residential area. a woman and horse walking behind a giant pickup truck An image of half a bathroom and half stairs. A calico cat is laying on a laptop computer. a man and woman are outside taking a picture together An elephant and it's trainers interact with each other. An arial vierw of a building with a clock tower. A woman and two men are having a conversation. A far off picture of birds flying above a field. A man and two women with Wii video game controllers. A couch that is in a living room with pillows on it. There are three adult giraffees that are walking in the park. 
many brown and black sheep bushes grass rocks and trees Several women sit at a table tasting wine A cat lays down around some stuffed animals. a person on a motor bike drives down a street Young man wearing a suit and tie standing inside a building. there is a tall sign that is on the side of a building A busy city intersection with people and cars A close up of a clock reading 1028 and 54 seconds. A train traveling past two cars on a road in a rural area. a person on a surf board rides in the water Two elephants bathing in a man made environment. Various tools are sitting on the table together A small bike laying beside a fire hydrant. White sheep are grazing in a green pasture. Male surfer on a red and blue surf board. A mother elephant and her baby are standing alongside a dry water pool. The view of green mountains and a valley from a cockpit. A man and a young boy riding on a donkey while people move behind them. A ORANGE WITH A WINE BOTTLE ON THE COUNTER so many elephants moving near some waters in the forest An image of a baseball player getting ready to take a swing at the ball. A long river runs alongside the train tracks. A street sign points in the direction of the road. Several signs can be read at a pillar in the fence. A large bear is sitting near a rock in an enclosure. three people walking a dog in the snow a guy standing by a fench with his skateboard A mom and two smaller sheep in a large green field. A women reading a red book in her bed. A man riding a motorcycle with another person during a sunny day. A table in a restaurant covered in plates and mugs. The man is playing tennis at a very high level. A fire hydrant sits in a small grassy island near the sidewalk. An orange kitten laying in a chair with a stuffed bear. A train is making its way around a snow dusted corner track. Blueberry stuffed beanie teddy bear sitting on a table. A black and white zebra stands next to a tree. 
Assortment of laptop computers displayed on table with backpacks full of electronic cords. Two pictures of a stoplight, one is green and one is red. A large orange bus stopped next to another bus. a couple of different types of signs on the outside A group of beautiful woman walking down a street in bathing suits. a large air plane flying in the sky Several toy SUV's alongside a toy bus on a highway. Two parents are helping a baby put on a hat. Three horses are pulling a wagon full of hay. A kitchen knife on a cutting board with vegetables and spices beside it. A boy and a girl posing for a picture. A variety of Domino's pizzas and a business man selecting a piece. Two dogs playing tug of war over a frisbee A gentleman laying on the couch while talking on the phone. This bathroom is all white and has a framed mirror on the wall Attractive landscape with picture frames and large white vase. There is a man cutting something up over what looks like a homemade pizza. a bunch of bananas hanging on a wall A female sitting at a table cutting a cake. A woman sitting at a table with a plate of food. Large black towel sitting any Penwith hey with people looking at it. A van is driving through an alley way. A stop sign is standing on the side of the road in front of houses. A white fire hydrant is in front of an old couch sitting on a sidewalk in front of a house. A few mack trucks in a parking lot. A person wearing all black does a one handed hand stand as he holds a skateboard on his feet. a close up of a cat laying on a desk A flooded street with the water up to the traffic lights. A baseball player waits at the plate for the pitch. A large green bus transporting passengers through a city A kitchen with lots of counter space and a black oven stove top. A table with a stack of orange cups by orange scissors. A dog is standing on a tile floor. A large red bus parked in a stationary position. A boy skate boarding down some steps . 
An angled photograph of people flying kites at the beach on a sunny day. Two dogs are laying next to a bike. A long silver train traveling through a wooded area. A light red fire hydrant on the corner of a street. A person crouching next to a pair of motorcycles Busy stadium with many people outside near vendor trucks. Two urinals in a restroom with multicolored tile. A person watching a sheepdog chase a white disc across a green field with mist covered mountains in the background. Several people holding umbrellas are lined up near a fence. A stoplight and street signs beside old buildings A tennis player getting ready to swing her racket. a close up of a drink on a table near a laptop Several pieces of ancient pottery and stoneware on display in an exhibit. A woman standing in a room holding a Wii game controller. a group of people with surf board standing on some snow the ice cream vendor is talking on his cell phone A pile of TVs sitting next to a brick building. an image of street signs on a residential A guy on a skateboard in front of a water fountain. A giant cake decorated with round discs on a table a woman on skis is standing in snow with her dog A woman casually reaches up to hit a tennis ball. A fat kid enthusiastically enjoying a pizza from a big pan. An old rusty fire hydrant sitting in the grass near a picnic table. A new kitchen that has just been built. A sleek, modern toilet has a backlight and granite counter for storage. A teddy bear posed sitting holding a book A train running on train tracks through the wilderness. A white toilet sits in a bathroom, with the lid open. a bathroom with dark tiling in iit and a pink bathtub A machine is on a folding table in a small kitchen. A person riding a skateboard down a street. A man kiteboarding over a large body of water. A man working on a laptop computer at a desk. Two people pulling a luggage cart down a sidewalk. A white plate topped with meat and vegetables. 
Skiers come down a snowy hill in a row two glasses of juiced carrots and apples on a white cutting board A man is seen in a mirror in a bathroom. A person is holdiing a kite in a field. A sepia colored room shows vintage furniture with a tendency to the frilly, including a bed with a curtained balcony and a chair, both in matching floral pattern, and a dress form. a little tourist train pulling three cars of passengers A mirror is shown with a man driving in it. A moped parked in front of a yellow wall and traffic sign. A flock of sheep sitting in the middle of a field. A skier kicking up a spray of snow. Yellow and blue fire hydrant in front of a movie theater. A teenager does not make any expression as he rides a skate board. A man holding two small green birds in his right hand. There is a baby elephant with its parent a truck parked on a beach near water The neatly made bed is beside an open window. a big grizzly bear looks toward the camersa A renovated propeller airplane flying in a blue sky A giraffe that is eating a piece of food near another giraffe. A group photo has smiling people and one dog. Brick houses with brown stairs stand near a wide sidewalk by a line of trees. A large building with windows and cars parked below An asian woman holds a sub sandwich near her mouth. A pizza cut up into many pieces on a white plate. A red truck with a trailer attached, is parked near a red house. A plate of food with eggs, meat, salad and a fruit cup on it. A small table with cups and saucers and a clock on it. A boy flies a kite on a beach near colorful tents. A very cute cat laying on a desk. Several Billabong surfboards make up a nice display. A cat looking at the television with flowers on the screen. A group of people in a park playing frisbee. An airplane hanging from the ceiling of a building. A table topped with lots of different types of fruit. A small child's bed sitting next to a window. Tennis player with white outfit holding a racket. 
A person wind sailing next to a person para sailing. A man is prepared to get on a wake board A horse pulled carriage on a open street. a couple of bowls of food sitting on a table A refrigerator is shut with black duct tape. a man standing on a surfboard in the water A large clock rests on the side of a brick building. A picture of a vase with colorful flowers in it. A giraffe walks the grasslands by himself at sunset. A dog heading into the water near a horse. A woman rared back at a tennis ball with a racquet. a person at a table with a plate of food A small child on a bed looking at a lap top computer. A group of people are looking at something or someonr A bride waits for something while holding her bouquet. A white cloth with scissors, a needle, thread, and measuring tape resting on top. A tennis player gets ready to hit the ball as a crown watches from the bleachers. A white truck has a vision sign on it. A clock on a steeple of a tall building. a shop with some wine bottles sitting on a counter The kitten is nesting inside the empty bowl. A gondola like boat crossing over a bridge A large clock tower with a roman numeral clock on it's side. A woman on a court with a tennis racket. There are three dishes and a vase with two roses on the table. A boat sailing on a massive lake surrounded by mountains. A woman in a long dress talks on her cell phone. A white plate topped with a hamburger next to fries. A man riding a surfboard in the ocean on a wave. A LADY FEEDING HER CAT WITH A SPOON. a man standing at the beach with a surfboard and a paddle a white building and some people flying a kite A guy standing in the grass is ready to throw something. grandma watching two kids playing a video game A bowl of soup and a sandwich plate on a table. A photo taken from a boat with a long bridge in the background. An average hotel room with twin occupancy capabilities. A woman is eating a pita on the street. A baseball player holding a baseball bat in a game. 
A serving dish has meat and greens in it. two boys with painted faces laying in a bed Two men sitting at a table with plates in front of them. Wooly horse and sheep dog face each other down A white pitcher filled with orange and purple flowers. A lady bent over with her tennis racket while another girl looks down court. A grey and red train next to a train station. A person standing in a bathroom next to a white toilet. A counter with various baking ingredients that include bananas, butter and oats. A small plane is dwarfed by the larger ones in the background. A couple of dogs standing outside of a wrecked car. Motocross rider displaying aerial tricks on nice day. A male child swinging his bat at a ball, another child behind him as the catcher. a man holds parts of a broken television A lone bench sits atop a hill looking over the river. Donuts are going through the mechanical glaze machine. A white boat sitting next to a dock near a white building. People sitting on the side of a street next to suitcases. A group of baseball player standing on top of a baseball field. A man wearing a brown suit and brown tie. A train on some tracks with power lines above it. A boy in a blue and white shirt playing tennis on a brown tennis court. A toy town with a train on the tracks passing a signal. A couple of sheep standing on top of a grass hillside. A man on a stage with ski poles in his hands. Chef stirring large pot on top of stove. A young boy is playing tennis at the tennis courts. A boy riding a skateboard and doing a trick. Two girls compete in a game involving a frisbee. The dog is sitting in a chair beside a bright window. A close up of someone's feet on a skateboard. A white dog is on top of a bed looking into a box. Brightly colored oranges, pear and apple in a colander. Suitcase containing many compact clothes for just one person a person on a tennis court holding a rackett Three men on a field playing a sports game. 
A bunch of people walking on wet sidewalk by buildings. A woman holding a tray of food in a kitchen. The sink counter of the small bathroom is made of wood. The girl is standing with her laptop in her hand Three people on snowboards on the slope of a mountain. A man standing next to his guitar case talking on his cell phone. a small boy in a black shirt a brown and black dog and a bed There is an airplane flying by a mountain. A toilet stall with green marble walls and a painting. a toilet a tub some pipes and a window A black and white photo of a train pulling into the station A lady is running with a tennis racket on a tennis court. A meal of hot dogs and stuffed vegetables a person cutting a cake on a table A woman is sitting down with the light turned down low to take a picture of herself with her cell phone. A couple of white horses walking along side a rocky hillside. The motorcyclist is traveling down the busy street. Park bench on snowy elevated viewing area above city. A man in a green tennis outfit hits a tennis ball with his raquet. a man that is standing up on a stage A room with a picture on a wall and a vase near the window with flowers in it. A train moving along a track, approaching a light signal. a small toy truck with a cat peering through a window Bed with yellow blanket against a wall with hard wood floor. A group of people are painting a bench in the park. The pitcher just threw the ball to the batter at the baseball game. A classic clock sits on a wooden table. A giraffe on the dirt looks tall among the trees. a living room with couches covered by sheets A man in a suit and tie is playing a key board. a number of oranges in a tree on branches near leaves A hat is sitting on the top of a bed. A man eating a hot dog on top of a bun. Several birds are standing in a large nest. Two woolly sheep in front of a wooden fence and barn A couple of trick planes flying by each other. 
Guy in shorts and a cap ride along top of wall with his skateboard A person skiing downhill in the white snow. Two men looking at a plane on a runway. two woman sitting on the ground one is on a cell phone Two Asian men standing in a office with business suits on. A bunch of used appliances sitting on the street a clock attached to a green pole on a building a sheep standing in the grass next to a fene A row of passenger buses traveling down a lone road. A black bear lying down near many trees. a couple of houses that are next to each other Several broccoli plants planted next to a wall. Train cars sit on the tracks next to a platform. A group of lambs are running in the opposite direction of a dog who lays barking. Fluffy white cat laying on a lightly colored bed. A living room filled with furniture and a flat screen TV. An employ looking kitchen has a black refrigerator. The bathroom is mostly a red color. It looks very old. A plate of food with a salad and very large chicken sandwich. an image of a cat next to the feet of a person A green train sitting along side a train station platform. A man lays in a hospital bed while holding a teddy bear. A man stands behind the counter of a restaurant. A skate boarder jumps off a curb into the street. A lit up display of teddy bears of different colors and sizes. A cap that is sitting on a blanket next to a remote control. a couple of cats are laying on a bench a group of people standing around a metal briefcase Several types of wild animals grazing in an open field. A man sitting at a table with a pizza in front of him. A large long train on a steel track. Baseball player preparing to strike ball from the pitcher during game. The woman is in a ski racing down the path. A vase filled with flowers sitting on top of a counter. An old parking meter sets with time expired in front of a parked vehicle. A large white polar bear sitting on top of a rocky ground. A kitchen that has white cabinets and a white oven. 
a close up of a child near an opened refrigerator a bedroom with a large window cover with shiny curtains Some traffic lights suspended over a road by some parked cars and houses. A row of parked motorcycles on the side of a street. A goat with red painted horns on its head A refrigerator packed with lots of food and drinks. A surfer falling in a wave with other surfers nearby. A huge cargo ship sits empty in a bay reflecting blue skies. A man that is sitting at a table. A slice of pizza with cheese and golden crust. The pizza is topped with broccoli and onions. a bunch of plates of food no a table A tower that has a clock on it. A man surfboards on a wave in muddy water. A woman is skiing near a bunch of trees. A caution light and traffic cones set up to block a street. a black cat is sitting on a green bench A narrow city city features colorful buildings and a large green bus with cars behind it. A man standing next to a beautiful woman. a kitten laying on a bed next to some phones A big piece of bread is placed on a white plate. A small white dog begging at a door to come inside. A doughnut on a plate and a banana. A plate that has a cooked pizza on it. a man carving the turkey for thanksgiving dinner Three sheep graze in front of a barn. There is an old street sign leading against a building. a bear in teh middle of a grassy field a green and white street sign and a traffic light Traffic light signaling green at the train tracks A cell phone held open in a bathroom. A teddy bear sitting in a very unusual spot high up A man touches a hammer to the center of a clock. A group of three people sitting on top of a green couch. A couple of military men cutting up a giant sheet cake. A person eating food from a large white dish on a desk. A man standing in front of a microphone wearing a suit and tie. A cow laying on top of a grass covered field. The side of an airplane that is parked, and an Air China sign on the side of the plane. 
Shadows dominate the landscape in this dark, dreary scene. A snowboarder is boarding next to a chairlift. A woman in a tiara cutting a birthday cake at a party a desk with a keyboard mouse monitor and a tv A giraffe is crouching in the grass next to a tree. A cat sits on the table next to a bowl. two adults holding a baby while wearing ski wear and standing on a snow bank. Giraffe looking through a set of bars in a cage. a white building with a white clock and some trees there is a red stop sign on this street pole The tall zebra is following slightly behind the shorter one. A skier is headed down the steep slope. A surfer riding a small ocean wave on his surfboard. A young woman is holding a cell phone open next to her face. a brown horse with a white stripe on its head A cat is sitting by a single shoe. A single giraffe that is walking in a field. A group of people posing for a picture. A person is standing on the beach holding a kite. A group of people in grassy field with kites in the sky. Two women playing tennis on a tennis court. There is a man interacting with a black dog. A parking meter on the curb of a city street A ceramic object with blue flowers on it. The dog went all the way into the water to fetch the hat. Cows and a sheep eating food from a red box. there is a piece of cake and a fruit on a green late There is a stop sign covered in snow. a couple of people holding a martini in their hand An airplane on a runway with another plane flying overhead and a truck nearby A very tall white clock tower towering over a lake. A corner of a rest room with a shower with glass walls. Three sheep laying in hay in a gated area. A group of people sit at a table with food. A desktop computer monitor sitting on top of a desk next to a mouse. A lot of carrots on a wood board for sale. A man standing on top of his head while riding a skateboard. A man dressed in a military style uniform shaking another mans hand. A empty living room that has a table in the center. 
A kitchen that has pots on the stove. A large two story boat floating in a lake surrounded by mountains. A person on a motorcycle is doing a wheelie two white black and brown dogs are lying on a red couch a vast, grassy field with animals in the distance a gray cat is sitting on a wooden bench An iPod with ear buds and a mouse near a book and keyboard. A black leather case containing several pairs of scissors. Several boats that are moored at a dock a man standing on the corner and people walking down the sidewalk A woman holds a colorful kite in a city park a close up of a person playing nintendo wii A commander cuts a cake at a military function. A boy looking at the camera while sitting at a wooden table. A large ship making it's way through the water. A woman in a bustier holding a stuffed animal. an image of two men standing in front of a Christmas tree A vase with flowers sitting next to a glass tomato. The lamps are on next to the pull out couch. Black and white photograph of two people on a moped. A kitchen with white cabinets and a stove on the counter top. President Barrack Obama standing in front of a crowd while giving a speech. A boy smiling as a large spider walks on his arm. A giraffe standing outside of a building next to a tree. The back of a moving truck that has a man standing on a lift with a royalty style chair next to him. A duck is swimming in the pond to the next destination. A bathroom with some of the wall removed during a renovation. a couple of little kids in baseball clothes stand next to each other A group of people flying kites over a beach. a person siting on a bench with a dog near by a black cat sitting on top of a black suitcase on a bed Two computer monitors, two keyboards and two CPU's on a desk. A cat looking inquisitively over the top of a car seat. planes and cars sitting on an airplane tarmac Trucks and cars going down a commercial retail street in a city A young woman is preparing to hit a tennis ball. 
A man that is at the beach jumping in the air. There are many books and magazines in the small room. A framed wedding picture on a crowded wooden table. A child holds a string over the water. The train has spots of rust that are obscuring the graffiti. A series of photographs depicting bathroom before and after minor changes. Dark cabinets around a white two doors refrigerator. A man holds a card and wine glass with a woman who also holds a wine glass. A casserole containing broccoli and topped with cheese. Woman standing behind open refrigerator door in modern kitchen. A turquoise and orange station wagon with two surf boards on its top. A tennis player in black shorts and white shirt looks up and holds back a red racket. a train is passing over water on a bridge A woman wearing a maroon sweater standing in front of crates. A flock of black-faced sheep near a watering trough on a rural hillside. Two giraffes standing outside while people watch them Table centerpiece of a tall wine glass shaped vase with flowers A man is holding a large piece of pizza. The young child is close enough to pet the cow. One horse has taken the lead in the race. A skier struggles in deep snow with their lost ski. A collection of different smart phones on a table. Note with listed items on white refrigerator in kitchen area. A man with a backpack and coat walks by a bus. An magazine photo of a restroom toilet and sink. A man choosing a piece of pizza from two boxes A scenic view overlooking the water at night or early morning Seagull flying through marina with many boats around . A blue bird standing on the ground among large green leaves A male skateboarder does tricks on a half-pipe course. People ride on the back of an elephant while being guided along with other elephants. A peeled banana on the front of a car. Three people ski in a row in the snow. A woman riding a wave in a wet suit on a surfboard. A beautiful young woman brushing her teeth in a bathroom. 
A scene containing a couch with flowers and a mirror. An automobile with a timer attached on a city street. An elephant placing its trunk on some plants and some people watching. Several elephants dressed for the circus are in line next to people. Two sheep sitting behind a fenced in area A man flying through the air while riding a skateboard. A close shot of a pizza plate with a rubber on it. A bear observing something on the ground of a field. A dog in a river chasing a red ball that is thrown into the water. a person handing a child a plate of food A bride and groom are cutting their wedding cake. Two dump trucks driving down a two lane road with a white pick up approaching from the opposite direction. Lady using oxygen in bed with a little dog. An elegant kitchen has an attached stone fireplace Two zebras in a field are eating grass. there are many computer monitors and things on this desk A man leaps in the air while on his ski board. A woman dressed in a button up white shirt, suit and necktie. The lid is up on the toilet bowl. A dog is resting on the window sill of the building. People milling about outside in a busy city A young boy wearing goggles and a billed hat holding a stick. A group of people walking under a leafy green tree. A motorcyclist with a female rider in the back and a dog in a sidecar. a woman poses in front of a giant pizza A woman holding her hand over a giant pizza Sun setting on a dark street and buildings. Giraffe holding it's head mid way with a wooden gate behind it. There is an elephant that is lying in the grass A giraffe presses his head against another giraffe. A plate with a pastry on it, topped with whipped cream. The airplane is waiting at the airport for passengers. A group of people on bicycles riding down a road. A man and a woman stand in a field with cows and horses. A bathroom in the process of demolition. Black and white photo of a small air craft. A person standing next to some old junk appliances. 
a tennis player in a black shirt is wiping his face A couple of plastic containers filled with lots of food. A man is holding a banana in his hand. a cow in a field looking into a camera A view of a cell phone and a watch on a table. A bowl with sliced avocado, eggs and tomatoes. A empty water bottle sitting on the corner of a wooden bench. A bathroom that has a broken wall in the shower. A full course meal with meat and mixed vegetables. A young man in a kitchen shapes dough into balls. A couple of elephants standing next to each other on a dirt field. A couple of dogs walking through a large body of water. Two flat computer keyboards laying on a table elephants in the wild surrounding a large tree A large park with people flying kites in the sky. A man with glasses on and a suit and tie. A traffic light on a street corner with shops behind it. A boy soaring into the air, doing tricks on his skateboard. Smiling people are holding a large white snowboard. A black cat and a "K" sitting on a green bench. An attractive young woman holding an umbrella under a tree. Grass roofed umbrellas on a bay with cliffs A person holding a banana in front of a basket containing fruit. two dogs playing in the snow as a one person wearing black uses a snow board to go down a hill. Two cows are standing on the end of a boat A picture of several street signs on a post. A man walking with his surfboard on the beach. a person on a beach with a kite flying in the sky The cat stands on the edge of a bed looking at television. The large scissors are sitting alone on the counter. A pair of scissors and some stick like things in a bag on a wooden table. two horses in a field of grass near bushes Infant in a high chair eating a chocolate frosted chocolate cupcake. A skateboarder jumping with two others behind him. A painting that shows a vase with flowers and a table. A person riding a horse, jumping it over an obstacle. A family of zebras standing together at a zoo. 
There is a train attached between two buildings as a walkway. A black headed sheep sitting in a field looking onward. Two people embrace while walking down the street under a pink umbrella a room that has all kinds of christmas deco in it A crowd of people standing around an old fashioned train engine. A couple of very small cute kids in the rest room. A BLACK AND WHITE PICTURE OF TWO WOMEN BASEBALL PLAYERS A baseball pitcher delivering a pitch to a batter. A Harry Potter novel is set next to a plate of eggs and toast. a person milking a cow next to a wall a woman watching a dog jump up for a frisbee A zebra grazing on top of a grass covered field. A large metal pan filled with peeled food items. A woman at a baseball game talking on her phone. White truck with painted words parked at night. a white box with 12 sugar glazed donuts A large black bear about to take a swim in a pool A train has been painted with Christmas decorations and lights. The large crowd watches a skateboarder descend a rail on a stair case. A group of people having a picnic on the beach. A little dog that has a frisbee in their mouth. A small cup cake and a knife on a plate. A frosted doughnut with sprinkles on a table. A man in a yellow jacket is snowboarding The flowers are in a vase on the table. A close up of a vary unique looking vase in front of the tree. A little girl standing and holding a remote in her hand. A baseball player holding his arm up with a ball in his hand. A pitcher in the middle of delivering a pitch. some kind of room with some weird things in it a bear partially submerged in a body of water Young girl making funny face in residential home. A table with a plate of food, utensils and some other items A plate of cooked broccoli on a long white platter, next to a dipping sauce. People inspecting a large, shiny semi trailer truck at a park a couple of men compete for a frisbe Artistic black and white photo of man on a motorcycle. 
a cake made to look like two trains A group of oranges stacked in a wooden bucket. Chilled beverage in glass bottle next to orange halves. A woman in playing with a green frisbee at the beach. A stuffed bear sitting in a chair with napkins and cup. A display of wild animals inside a building. a big train passes under a big bridge A man sitting on a bench next to a man. Two kitchen stools sitting in front of an island in a kitchen A bowl of fruit and a plate on a table. A phone and a computer on a kitchen counter. Someone having dinner in a dimly lit restaurant with wine. A pole, light and traffic signal have all been painted green. Cat sitting inside kitchen cabinet, near the dishes. A red and white bus driving on the street A living room scene with chairs, lamp and a clock. A girl carrying a kite walks along a beach. Two children in a fire truck amusement park ride. A group of plates with grilled meat, bread, and appetizers. A man in a blue blindfold reaches a doughnut tied to a string with his mouth. A seagull wading in the surf at the waters edge. People enjoy a day at a mountain lake. Police tow truck parked on a city street in front of stores Living room with TV playing and view of a hand in the picture. People stand beneath umbrellas on a flooded road. a dog jumping in the air with a frisbee in its mouth A tree with a white low trees sign hanging off of it's side. A man in a dirt field next to a group of sheep. A baby girl wearing a red shirt holding a tooth brush in her mouth. An upstairs bathroom is pictured in this image. Woman laying down on a mattress at a store. A skateboarder performing a trick on the edge of a ramp. Man in a kilt and woman and white dress cutting into a cake. A faded stop sign near a street side. A snow covered wood bench in a park Major League Baseball player taking a very fast pitch from the pitcher A young toddler playing in a suitcase on a bed. 
a group of people playing frisby in an open field Woman kissing little girl's cheek under umbrella indoors. A man is surprised by a very large doughnut. People bringing in a loaded boat of vegetables to the market. Two skiiers ski down a mountain in front of a village while it is snowing. a close up of a pizza on a wooden spoon A Bus Stop sign peeking out from a vined wall Two green animal food bowls sitting on a tile floor in a room being refinished Two people standing near the water holding surfboards. The man is standing in the snow with his snow board. a close up of a broccoli plant with leaves A cow is standing near a fence in a field. A sign that is standing in a parking lot. A boy in white shirt playing with a Nintendo Wii controller. a red umbrella is inside out in a city A woman sitting on a bench while reading a magazine. A dog riding on the back of a horse. A catcher and an umpire near home plate. there is a blue and black bus stopped at a bus stop A young girl riding a skateboard behind a man on a bike. some baseball players are playing baseball on a field A black-and-white photo of a person sleeping in a bed. Two people dressed a refrigerators walk down a crowded street. a sink a picture a mirror and white tiles Young boy inspects a picture on a table with construction paper materials. A man is standing by a movie poster talking on his phone.. A sesame sandwich sits on a white plate with a cup of coffee. A kitchen with a white stove top oven and a refrigerator. A man sitting in a chair with a canned beverage in hand. A motorcycle rider bends down on a track. Several sandwiches sliced and neatly arranged on a white plate. A street light pole with many street signs and warning signs. The girl in a tan skirt is sitting on a bed. There are electronics and other music equipment around a desk. a person in a field with a kite flying in the air Woman talking on cellphone in front of personal computer. 
A red stop sign mounted to a wooden pole A red and yellow fire hydrant in an open field. A couple of giraffe standing on a grass covered hillside. A guy on a tennis court holding a raquet. an image of a woman at a ski slope Rear-view of a horse as it grazes near concrete. a cat is sitting on a white keyboard The plate is loaded down with a lot of food. A pair of leather chairs beside a table and matching couch. a hot dog sitting on top of a mound of french fries A young boy appears hesitant to eat some broccoli. A woman with glasses making a call with a cell phone Two bears laying against wood with a sign. A plate that has broccoli and other food on it. A blue and silver cell phone with an accessory. They've gotten off the bus to stretch their legs for a few minutes. Horses stand saddled in their paddock near the beach. A man swinging a baseball bat on a field. Chicken with sauce and broccoli is served in a serving dish. A couple of boys playing frisbee against each other. Large white birds with black beaks sit atop benches. Two women who are standing under an umbrella. A plate of food, bread and salad, sits on a chair. a big boat is going down a small river A caramel apple is sitting next to a jar of french fries. yes we have no bananas we have no bananas today a man gives children food to feed an elephant A table covered in different kinds of baked goods A bunch of dead, stuffed wild animals on display. Two men loading up the back of a truck There is a mug with a fork in it and an unidentifiable liquid. two red double decked buses side by side two people riding motorcycles on a street at night A woman is looking at ribbon for children participating in an art activity. A box of pepperoni pizza already has two pieces missing. A room filled with people sitting at tables eating food. A cutting board with pizza and a glass of wine A woman with a purple umbrella stands on a brick street. A man dressed in all white is posing on a motorcycle. 
A mixture of food and drinks sitting on a table outside. A man sitting on a horse in the sun A man is sitting on a picnic table next to s ski slope. A woman on a bed kissing a mans face. A young boy puts together a kite on the floor. A store has displays of pans and other things. Several people in a large building that is filled with luggage tagged with yellow tags. a perishing square tent set up with a bicycle A carnival occurred on a beautifully sunny day. A herd at of zebras are grazing in the field A cow poking its head between skinny tree trunks. a skier at high speed coming down the mountain A bunch of vegetables and fruit arranged on a table. A large slice of angel food cake sitting on top of a plate. a baseball player swinging a baseball bat at a game Two guys in suits are having a conversation at the couch. A large truck driving down a city road. a whole bunch with luggage standing outside Multiple people standing in the water on a beach. Two airplanes lined up on pavement near a building. A man lounging in a computer room with a laptop on his lap. A man carrying a kite while walking on the beach. A young elephant walking in tall grass behind a larger elephant A snowboarder leans into the snow with their board. A lady with green hair and red boots sitting in the grass near a horse. a man looking at the chocolate on his fingers A group of children running after a soccer ball. Woman approaching the door of a train at a station. A group of people sitting on a bench in front of a restaurant. A lot of animals that are in the grass. some people are walking down the street with each other A nutty cake is sitting in the grass. A car sitting in the middle of the grass in the rain. A lot of boats parked in a large body of water. A man in a police uniform sitting on a horse by a traffic light. A flipped image of 2 toned room with a small chandelier. Light green and white painted fire hydrant with people walking in background. 
A man and a woman sitting on adjacent couches focus on their laptops. A golden bath area with a chandelier and blue and white bathtub. The clock is below the dome of the tower. A close up of a full cooked pizza pie. two elephants are drinking some water on a sunny day A small glass of liquid sits on a table. A sandwich and salad on a plate sitting on a black table. A woman is getting ready to hit a tennis ball. Two motorcycles are parked next to each other. A professional baseball player about to pitch the ball A modern kitchen with a large window by the sink. there are two men riding a motorcycle and holding a umbrella Two racks on top of a white counter topped with cup cakes. A person with a skateboard on a ramp. The little girl is standing between the low shrub and the fire plug A man and a woman beside bicycles with orange train cars behind them. A snowboarder riding in the air above the snow. A man and woman in the middle of a conversation. A woman skier in costume at the beginning of a race. A store window with stuffed teddy bears in it A man bent down fixing a toilet . Group of motorcycle riders being led by a police car. Red passenger train passing over top of a bridge. A small coyote is seen in the back of some tall grass. A large red chair in front of a building. A man is on snow skis on top of a mountain. A baseball player is ready to swing at a pitch. A woman serves out a sauce for her dinner party. A young man is preparing to throw a Frisbee. A brown and black dog holding onto a couple of crushed water bottles. a nice black back splash a plant some body oils and a black Kleenex box A beautiful young woman laying on top of a bed next to a dog. Large pizza sitting on a table next to beer glasses. A fire hydrant with a blurry view in the back of it. An infant is sitting in front of a computer. A cook placing two pies in the oven. A group of people standing around each other in front of a building. A skateboarder performs a trick on a small ledge. 
A boy on his skateboard at the top of a skateboard ramp. a women that is playing tennis on a court Very small remote control that fits in the palm of your hand. A huge elephant is walking down the road. A woman standing next to a man while wearing a short dress. A MAN WITH KIDS ARE ON THE FLOOR A kitchen with an island that has place settings A moose is getting some shade outside an old building. Jockey on black horse being walked around infield. A person cutting into a plate of food on a table. A long train is going down one of many tracks. Several stacks of different types of books on a bed. A giraffe laying on lush green grass next to trees. Two glazed and one chocolate doughnut placed on a napkin. a police officer rides a motorcycle on a walkway A child stands next to a window near a bear. BABY IN BLUE JEAN OVERALLS HOLDING A CELL PHONE A view of a Sony remote, next to a laptop. A person on a skateboard is riding up a ramp two people riding skis on a snowy slope A baseball player up to bat swinging at a baseball. A little girl laying in bed holding a book next to a black cat. A woman cuts a cake while two dogs watch closely. A fritter and a donut on a white bag next to a donut box. Some chefs working together in a big kitchen. A picture of a person brushing her teeth. The pizza is topped with very unusual ingredients. Bicycle parked at meter outside large building with column. Three of six people standing and sitting at a restaurant table are on cell phones. A brown teddy bear sitting next to a wall with a painting. A small room cluttered with piles of books, a portable TV, and stereo equipment. A yellow rectangle sign stating that pedestrian priority crossing is ahead. A restaurant clock displays the time of ten twenty. THERE IS A METER POST ON THE STREET A group of men that are in the back of a truck. A man in a orange and yellow outfit juggling tennis rackets. A sign that is on the side of a building. A horse wearing a pink hat pulling a carriage. 
Many people are scattered together near an Orange stand. Two men playing a video game inside of a room A woman riding a surf board through the waves. A pretty young woman walking a bike with a small dog in a basket. A man standing in front of a brown horse. A very big group of happy looking people posing together. Different kites flying around in a field with a bunch of people. A wall with a black and gold clock and walkway above. A plate full of a lot of good food ready to eat. A slice of cake with icing on three sides, a knife and fork beside it, on a wooden table surface with a knot area visible in the wood. man jumping up super high in a grey jacket A person on a sidewalk holding a kite for the camera. A very big pretty green vase with some flowers. A polar bear standing near a tree on grass A kitchen island has a farmhouse sink on it. A sheep looks at the camera, by the side of the road. Two pieces of bread coated with a dark brown spread. The train car is used as an office by railroad personnel. City scene of cars at sunset going past stoplights. A red and white street sign that reads "no parking any time." The towel bar is above the toilet in the bathroom. Baseball batter hitting ball standing near catcher with mitt. Two skiis and poles stand upright in the snow. This kitchen table has fruits and vegetables on it A cluttered room with a televisions that is surrounded by shelves that have various games and supplies all over them. A view of a narrow kitchen with the only light coming from a glass door. A woman standing on the sidewalk, looking at her phone. A tall clock sitting next to a barren tree. A vase that has flowers inside of it on a glass table. A man in a blue shirt, blue hat and gray shorts playing tennis. A person is near a row of luggage carts as one man pushes a cart. A boy blowing out candles on a birthday cake. A small locomotive on small train tracks with people inside. A girl is standing against a wall in a room. 
A breakfast plate of scrambled eggs and fruit. Baseball player swinging a bat during a game a large plane is parked at the runway A man in a wet suit on a surfboard in the water. Clock tower sitting in a pier with clear blue water. Large group of motorcycles on brick street with trees Woman standing on a surfboard in calm water. Man riding a horse in a foreign country. Woman on a bus looking out the window at another orange bus. A rusty parking meter that is empty A woman with blonde hair sitting at a bench in front of a building. a man is riding a snowboard in the snow A giraffe hiding behind a grove of very tall trees. An elephant with no tusks walking in the woods kicking up dirt. A person in a baseball uniform about to catch a flying baseball. A pizza that is sitting on a table. A man eating a hot dog and holding up a dollar bill. A plane up in the sky viewed from below labeled "Cityjet.com" A cat on the toilet peeks its head into the bowl. A group of people gathered around a table outdoors having a meeting. There are several people in this funny boat. A man slicing pieces of bread with a knife. A stack of pancakes covered in blueberries and whip cream. A man is standing close to a tv playing video game bowling. Two elderly men preparing a motorbike for a journey. The motorcyclists turn the corner of the road next to the home. A cat is playing with the bottom part of the umbrella A traffic light and cars on a street. A bride and groom teddy bear each in a coffee cup on a saucer. A laptop computer sits on a girl's lap. A street sign that reads, "right turn only." A baseball player is preparing to swing while several people watch. A man sitting behind a group of different wines ready to taste them. A black cat rubbing up against a woman laying on a surfboard. a person walking on a train station platform A long train traveling past a forest near a road. 
a herd of elephants walking down a path in some tall grass A silver railroad train traveling down the tracks A man with glasses showing off two cell phones. A dog looking cautiously at its reflection in a mirror like object. A man using his laptop sitting on the balcony with a water view Plate of food with a variety of vegetables. A blue bullet train stopped at a train station. A red fire hydrant between two potted plants. What can only be described as an interesting and presumably authentic dish. The man has a tennis racket in his hand. A large jet liner sitting on top of a runway. A batter, catcher and baseman during a baseball game. The train drives between the forest trees. A green city bus traveling by a parked truck. A woman is using the ingredients to make sushi. Two men in a living room holding the Nintendo Wii remote. A clock with roman numerals hanging on the wall next to flower patterned drapes. A tray with carrots, snap beans, mash potatoes and an egg. A dressmakers dummy with hat, coat and tie. A plate of food has carrots and broccoli. A mini stagecoach being pulled by one horse and driven by one driver. A young girl holding a tennis racquet on a tennis court. A dog is in mid air with a frisbee in its mouth. A skier in the air coming off a jump with a mountain in the background. A cutting board topped with fruits and vegetables. A pizza in it box siting on a table with a side dish. A little bird standing a the twig of a tree. A polar bear laying down on rocks by some water. A woman sitting by a man at a restaurant eating food A group of young men standing on a sandy beach. People sitting and walking in the patio and grass area of a building with tented sitting tables and lawn chairs. A man serving a tennis ball on top of a tennis court. A white toilet sitting on a sidewalk outside. Multiple skateboarders in the same outfit ride in a demonstration. A fit young woman enjoying a game of tennis. A decorative cake with several layers and an animal on top. 
A brown dog laying in an open suitcase on floor. A man bending over on a tennis court. A slice of cake baside a fancy beverage on a wooden tray. a car with luggage bags on the roof and in a trailer A couple of men standing next to each other holding snow boards. The bathroom is equipped with many electronic devices. a person standing with a tooth brush and tooth paste Two giraffes are standing together in a field. a small cat gets petted in front of a laptop A cardinal sitting on a small branch of a cherry tree. There is a man wearing a dress shirt and tie. A man about to sit at a restaurant table with a woman. Many people around a chocolate birthday cake with candles. A man riding down the side of a skateboard ramp. A man sits on a broken toilet as people walk by. A person flying a kite on a sunny day. A couple of elephants standing next to each other. A rainbow that is above a street corner. Swans gather in the middle of a parking lot. The skier is repairing his ski on the slope. Four dishes of food are organized on a counter. a kitchen with a double sink, stove and counter top A small child is enjoying a donut at the table. These two people are using the phones at a parade A paper plate holding a piece of cake. A beautiful young lady standing next to a man on a tennis court. A clean white bathroom with a simple mirror above the vanity. A man standing next to friends eating food. A bathroom toilet with a phone hanging on the wall. a guy that is on a surfboard flying in the sky Signs and wooden poles stand in front of houses and lawns. A mouse pad sitting on top of a desk under a mouse. A couple of trains move side by side down the tracks. A man in a yellow shirt stands in a dirt circle. Two horses standing on the grass near a body of water. Two giraffes are in the foreground and there is a zebra in the background. A young man riding a skate board up a ramp. A small horse with his eyes closed standing on snow covered ground. 
A bottle of liquor called Granite next to a half-filled glass. A flock of sheep standing around in the middle of a pen. A mans face coming out of a chili dog with a fez. Several boats in a river with people in each boat. People sitting in a subway station that is in black and white. Two kids are holding sprinkled doughnuts at the table. The man is sitting down resting before his tennis match. A bowl of beans sitting on next to a sandwich. Six people in a boat rowing on a body of water. a plane flying by below a slighly cloudy sky Old style bed has a cross on the headboard A dog jumping up in the air to catch a frisbee in it's mouth. A group of young ladies kicking around a soccer ball. Two back-lit computer monitors and a keyboard and mouse. A beautiful woman sitting at a table next to two pizzas. There is a person jumping high on a snowboard. A big orange cat sitting on a wooden bench. A child sitting at a table with a hot dog. hungry dog inching it's way toward the donut. A brown ottoman sits near a black counter in a vacant room. A pretty young lady carrying two large donuts in a restaurant. A woman is getting ready to dive in to some donuts while two guys watch. A couple cut their wedding cake while the bride makes a face for the camera. A large animal laying on top of a lush green field. Rows of books on bookshelves in a library setting A room with two woman, a dog laying on the floor and table and chairs in it. A healthy meal with various fruits and vegetables. A man that is on the side of the wall with a skateboard. An adult and young horse interacting in a field of grass. two trains on a track near a platform A dog is in the air catching a frisbee with a crowd watching. a group of people stand by watching a group of elephants Man with racquet about to hit ball tennis ball. A man riding on the back of a white horse. kids are enjoying a nice game of soccer A man getting ready to take a picture in a field. Several people in a canoe with oars on the river. 
A tower that has a clock on the side of it. A large clock is next to a pillar. Cat staring at something while sitting on porch. a woman plays a video game in a living room a man snow boarding on a ledge with a snowy field behind him A woman walking in the rain with an umbrella Various toys and items on carpet that includes wallets and a camera. Person holding a string with a white kite on the other end. Young skier posing for photo in alpine ski area. A baseball player holds the ball in his glove using his other hand. A red motor bike is being repaired in the driveway. A little boy on a skateboard on the road. The long plate has cookies, fruit, and chocolate on it. A group of people in a living room playing video games A man in white jacket rowing a yellow surfboard on water. a person that is riding around on a horse A woman walking in a muddy field carrying an umbrella. There is a birthday cake covered in this guys face. Several people who are waling on a dirt road. a nicely decorated living room with a big mirror above the fireplace A person holding a little girl next to a sheep. A town full of street signs connected to a building . Fish, small potatoes and broccoli are arranged on the plate. a large black sheep who has been shaved The large detailed cathedral has a clock on it. A ship in the ocean with a seagull and another bird standing on things on the boat. Three helicopters are flying through the clouded sky. cows with ear tags standing in a field a laptop projecting an image on to a flat screen television a person cooking a pizza in an outdoor grill Parasails in the wind in front of a bridge on a gloomy day. Two men on bicycles riding on the street A group of skateboarders watch a skater perform a trick. A rear view mirror view shows a truck coming up behind. This is a traffic light signaling green in a downtown area. A man jumping over a blue park bench. A small infant holds a soft toy bat. A frisbee that is laying down in the sand. 
A man with wide eyes eating a muffin. a giraffe grazing in a high line of shrubbery. A batter, catcher, and umpire anticipating the pitch. A big boat on the water near the shore. A woman standing on a tennis court holding a racket. there are two urinals in a public bathroom A silhouette of a horse is seen against the back drop of the sea. One lonely person on the platform waiting for train to open for boarding A tennis player prepares to hit the tennis ball A black bear walking through a zoo exhibit. A cat sprawled out over the top of a laptop computer keyboard. People walking along snow and trees with skis on. A woman walking a gray horse around a field. A dark vase is holding pink flowers in front of a window. A parking meter reads "90" minutes on the window. a man being feed a cake seated on a yellow chair Two briefcases are stacked up on a desk chair. Three horse and buggies are parked out in front of a building that has three steeples. a woman writing something down on paper while the laptop sits on the table A green parking meter on a city street. A group of people on bicycles next to a passing train. A woman is getting in her car on a busy street. An empty classroom with four unoccupied desks and writing on a chalkboard. People are standing by a small passenger train. A elephant being ridden by a little boy. an image of a man riding on a skateboard two computer monitors are sitting on the computer desk A train on one of multiple parallel tracks passes under a bridge. A pink and a blue toothbrush are on a white background. A person on some skis in the snow. A person on a skateboard going up a small ramp. Guy jumps high in the air on his skateboard off the hill ramp Man shopping at a grocery store at the produce section. Three children posing with their tennis rackets at a tennis court. a keyboard a persons hand a mouse and a monitor One building has a clock and the other one doesn't. 
a close up of a train on a train track There is an image of a bear jumping on another bear. Several trucks and cars are driving on a muddy road. its raining so all the people are carrying umbrellas a lady and a man getting ready to fly a kite Motorized scooter parked in front of a gated roadway. A computer screen and keyboard on a desk. A stuffed teddy bear sitting next to hay holds a stuffed dog. Boats are docked by houses on the shore side. A young boy riding a skateboard at a skate park. A man holding a phone that has a picture on it of a man holding a phone. two people in a body of water near a pier A couple of young boys riding on the back of wooden bikes. A man is standing in the snow with skis and ski poles. Wavy wooden seat bench with a sidewalk, grass, and stones A dog lying down on a couch next to a nightstand with a wedding picture on top. it is extremely foggy and theres a truck on the road A recliner chair sitting next to a table with a lamp. a person in a kitchen preparing food on a plate Two men in black wet suits ride surfboards on small waves. A tennis player running to hit the ball. A man on stilts is holding a pink, polka dot unbrella over a woman in colorful clothes in a park setting and there is a crowd milling about. A white bathroom has an aqua colored container. A very small child on a surf board near a big fake wave. A large airplane flying through a cloudy sky. A woman starts to remove something from the oven. A man watching a single engine plane make an approach to land. A row of retro kitchen designs in various colors. A kid with a baseball bat on a field. a person standing outside of a building with an umbrella A stove pulled away from the wall in a kitchen. Two pink roses sitting inside of a blue vase on a table. A military jet is parked on the runway of the airport A group of women hanging around a long table a close up of a large and a small zebra A surfboard is recycled into a unique planter. 
Several colorful trains are parked at a station. A man holding a surf board standing on rocks. A black cat resting on a bed wearing a tiny winter hat. A black and white dog is catching a Frisbee in the air. A man eating food from a napkin in his hand. A giraffe with an object in its mouth. A baseball player up at bat in a game in a stadium. People walking in the rain on a city street some with umbrellas. A group of sheep sitting on the ground around a bench. A crowd of people standing on snow covered ground. A group of people crossing a street while holding umbrellas. A glass cup holding three toothbrushes next to a wall. Two pictures of the same woman playing tennis. The room has a large china cabinet, and two couches. Two guys reach high to catch the Frisbee. People walking their dogs on a park trail. A large wooden clock by a window in a room. A brown dog standing next to a toilet in a bathroom. Pink and white flowers in a blue vase. A group of three horses standing on a lush green field. A group of people eating at a table raise their glasses Two people walking and talking on a huge air strip. A man riding a skateboard on top of a ramp. this is a mulicolored stripe sun umbrella near a palm tree A group of skiers trekking through the snow A clock that is on the side of a tower. A boy is holding a video game controller in his hands in a living room. A stool is inside of a walk in shower. Some very pretty giraffes standing by a big fence. A sink is shown in front of a frame covered wall. Two jets landed at an airport facility with many service trucks. an old school bus sits in a field in a retro photo A woman standing over a cake with a knife. Two young boys play a game of frisbee. A young woman plays with a frisbee indoors. A baseball player about to receive a pitch in a stadium full of people. A man near a curb with a bag and a box. A picked off cake somewhat resembles the original design. A crucifix is on the wall next to a clock. 
A close up view of a small broccoli plant. A police officer is riding his motorcycle on duty A cat sniffing a small teddy bear laying on the floor. Two birds are standing on park benches outdoors. Someone chopping up foods and placing them in bowls and plates. A tow truck hauls a jeep along a busy street. Woman walking beside a man riding a horse in a yellow shirt. Adult in suit and tie with markings across back of hands. A small older bus parked alongside a roadway and another behind it. The horse is tied to a tree in this snowy yard. A living room consisting of windows, rugs, chairs, and a coffee table. A man holding a kite next to another man. A bird standing on the sand near a body of water. The bathroom in the house is clean and ready to use. a toilet with metal walls and a sign A woman takes a picture of the newspaper with her phone. A large red bus on a city street. a lemon sitting on top of some fish with veggies and rice on the side A woman stands behind her luggage next to a building. A group of men holding up a white and yellow frosted cake. A few birds wading in some shallow water. Herd of cows, walking in a drinking from, a river. a man in sunglasses and a pilots uniform looking down A skateboarder grinding down a red and black ramp. Two men sitting at a restaurant table holding up a tray of pastries. A food entree is shown on a platter. This kitchen has wooden ceilings and two tables Few horses out in the distance eating grass A picture of a black bag with a motorcycle in the background, on a dirt path. a train with green and purple on it on a track a small vase sitting on the table with flowers inside A stop sign outside of a building with the word Liberty on it. Two older gentlemen sitting on a public bench in a park. a close up of a pizza on a pan A woman with a dog throws a frisbee to a hill. 
The inside of a bicycle store with numerous related items on display two sinks a toilet mirrors and a counter A baby sitting on a couch next to a brown teddy bear wearing a t shirt. A dog standing next to a sheep behind a fence. A man is in a field flying a kite. A airplane with striped wings is in the air. A man in snow gear in skis at the side of a snow slope. Small desk with electronic equipment in office type room. A plane is flying over a gas station preparing for landing. A man spins around with his forehead on a baseball bat. A group of men tours a building that has had fire damage. A white sink and toilet in a small bathroom. A man sitting on a sofa using a laptop with his dog curled up next to his shoulder. A man and a woman dress up in costumes. A horse is standing behind behind a fence A red, white and blue train on the tracks beneath a mostly cloudy sky. A person doing tricks on a skate board at a skate park A motorcycle sits parked in the corner parking space. A professional baseball player in a white uniform on a baseball field with a bat in his hand. We are looking at a photo of buses in a demolition derby. A man rides a snowboard down a snowy hill. A baseball player takes a high grip on the bat as a catcher scrambles for the ball. Two guys are excited to be catching a Frisbee in this tournament. A glass coffee table sits in a living room. The man wearing a hat holds a kite near many other kite fliers. A green plate topped with pasta, broccoli and a salad. Two shorn sheep graze on tall green grass in a sunny pasture. A dog on top of a building yawning and an airplane above him. Various contents laid out on a wooden table a number of people near many bunches of bananas A dog is chasing a frisbee in a park. A man standing on a tennis court holding a racquet. A toddler with a frisbee in his hand. four children in a living room with one of the children holding a game controller before a television. The men are playing a game of baseball in the yard. 
A clean living room containing a couch, three tables and other decorations. A female tennis player is holding the racquet, ready to swing. Two plastic containers sitting on a table filled with food. A man standing beside a robot with a camera around his kneck a kitchen with a sink on a counter top A woman holding an umbrella while a man walks behind her. Men are eating hotdogs during an eating competition. A bed sits in a white room with a window view of a nearby house. A bus that is parked inside of a building. A gigantic bird statue outside of a building a street people cars trees and buildings and police there is a man with a uniform at the supermarket Two bananas that grew as one speckled banana A older woman is fixing a younger man's tie. Kitty fast asleep on its back on the bed. A BED NEATLY MADE WITH A TABLE NEXT TO IT a giraffe sticks out his blue tongue at zoo visitors Two giraffe sitting on a dirty lot next to a forest. A cat snooping in a bag on a bathroom counter. Looking up at the belly of a jet airpline A young many getting ready to serve a tennis ball on a clay court. An old man is flying his kite in the middle of no where. Dog curled up in the bed under covers A living room area with couches facing a television and windows on the side wall and on the wall behind the television. Several people are skiing along on a snowy field. Some bottles and glasses of wine surrounding entrees. A woman takes a bite of her sandwich. Food and a cutting board sitting on a table. Here is a stop sign with graffiti written on it. a young man is doing a skateboard jump. A cake on a table with other desserts and pastries. A baseball player swings at the pitch being thrown. A zebra standing in a dry field of grass. This is a nighttime image of a church A baby in a crib with a nighlight The brown and white cat is sitting by the computer. Three pies on separate plates on a table. Outdoor subway train pulling into an empty station. 
Sugar covered doughnuts are heaped in a pile. A man standing on the grass preparing to throw a frisbee. A man seated with a mouse, a keyboard and cell phone in front A calf is laying in a pen as people gather outside to look. Traffic is stopped on the road because of a red light. A truck parked on the beach next to the lifeguard station. A man is on his cell phone outside under an umbrella. A colander sitting on a countertop in a kitchen next to a microwave. A man in black shirt doing a trick on a skateboard. A flight of red brick stairs which lead to an antique bench and a view of historical brick buildings. a number of small boats in a body of water A table topped with a giant penny and a tray full of vegetables. A dirty kitchen stove with a timer located on at it's center. An electric pole with a single light and two street signs next to a 5 eleven sign. A yellow train parked at the end of the tracks A guard with a dog walking around a bus in a parking lot. A cutting board with a beet cut in half A tall giraffe rests amidst green grass and trees. A man is skiing in a snowy forest. A tennis player dives for an incoming ball on the court. Smiling man enthusiastically hugging a plush teddy bear. A full view of some tall buildings in the downtown. A bird standing on debris in the water A young boy is skateboarding in the middle of a parking lot. A man playing tennis with a racket in one hand. A skateboarder riding on a park bench, on a cloudy day. A black cat holding a Nintendo Wii controller. Man smiling while displaying food item in kitchen area. Two people stand by a motorcycle and a van. A black and white cat walks near someone's legs. A group of people gather near a motorcycle. A man talking to another man sitting at a table in front of a laptop. A kitchen with refrigerator, sink, and a curtain over a doorway. Two teams are playing a frisbee sports game. Carrots, cauliflower and broccoli sitting in a clear container. 
A man eating food off of a plate A tennis player hits the ball up from the racket. A large jetliner sitting on top of an airport tarmac. People sitting on a bench above the water. A kitchen that has wooden cabinets and a kettle on the stove. An orange and white kitten laying on a chair. a cook standing in a restaurant kitchen while making a meal A woman smiles while posing wearing skis and holding ski poles. A woman in grey sweater lighting a candle at table with hotdogs. A skateboarder performing a trick on a ramp. Man and a rowboat next to a misty mountain lake Several cut up carrots boiling in a pot of water. A bus all lite up inside and out with diffrent colors . A television and a cabinet on both side. Parents and children dressed up as Santa Claus sit on a park bench. A simple yellow vase holds two red and white tulips. some people and a long thin boat water and houses Baseball player coming in for the base, while catcher readies himself to get the ball first A person on a skateboard and a person walking a bike. A glass sitting on top of a wooden table. next to a keyboard. Commuter buses parked in a lot outside an apartment complex. a table full of vegetables and fruits stacked on top of each other A plate with a sandwich with people in the background. a police officer on a red motorcycle in the street The child is pushing a cart full of luggage bags. A young toddler is standing next to the high toilet. A female nurse is standing next to a man in a bed. a blue and white road sign written in japanese A small elephant standing next to a wooden tree. A lone person snowboards down an empty slope. An open laptop computer sitting on top of a bed. a street light on a street next to a tree lined median. a close up of a bench near a ledge with a statue An antique car being towed on a flatbed trailer Two guys are having fun while playing the Wii. The young person in a cap is grabbing his skateboard during a jump. 
A dog attempting to use its mouth to pick up a pair of scissors from the floor. this is a pizza and a fork on a table A group of women with umbrellas and a couple without umbrellas outside in the rain. a woman tosses a frisbee in a public park A train on an overpass over a parking lot. three kids at the field playing frisbee together A tennis player is in motion with his racquet raised. A man who is holding a skateboard in his hands. a close up of a sink with many dishes in a tray A man on a phone with a sandwich in his hand A red fire hydrant sitting in the middle of a forest covered in snow. Two little girls sitting in the grass with toys. A kitchen scene with focus on the microwave and oven. A dog sitting on its dog bed in the middle of a living room. An adult zebra and a young zebra stand together in a zoo enclosure. A man is on a paddleboard in rough waters a man holding a teddy bear standing next to a basketball. A man flies a kite at the beach. A sandy beach topped with lots of people and tens. One cat lying on the floor, and another with its front paws up on a stool A man holding a plastic gun in his hand. A train is passing along a hillside during the day. A baby in pajamas outside, sitting on a skate board and waving to someone. A street sign hanging from the side of a pole. This is a photo of a hotel bathroom, all nice and neat. A pen of five sheep surrounded by other pens. A person in a large yellow and purple train. a large air plane on a run way A herd of sheep shares the road with cars and motorcycles. A paper mache "bandit" piece of artwork stuck to a pole under a "neighborhood watch area" sign. People standing in line by several food trucks parked on the street A man holding a huge slice of cheese pizza with a crying kid on his lap. Man on purple tennis court swinging at a ball. A knight riding a horse and greeting a crowd as he clutches his shield. A broom is hanging on the wall of a house. Picnic tables under a pavilion in a park. 
An assortment of thrift store objects, including two vases and some miniature carousel horses. A man riding through the air on top of skateboard. A young child is palying with some books. A giraffe standing up with its legs crossed. LOTS OF PEOPLE ON THE STREET IN A DIFFERENT DECADE A tennis player reacts during a match in a tennis court. A man is standing by a trolley station as one approaches. A brown couch sitting in front of a flat screen TV. a plaza filled with a lot of birds and some cows The produce section of a grocery store a couple of boats sit parked by a dock A white and red moped is parked on the sidewalk. A young woman with a smile on her face rides her bicycle down the street past some parked cars. Four Blue Angel jets are flying in formation. Traffic sign and barricades on roadway near large city building. A zebra stands alone in some tall grass. A blue and yellow train in the train station. Several people sit on top of an elephant as another person watches. A white toilet in a small tile walled bathroom two people setting up a table of food a girl is looking at her cellphone and a blond boy a big tower that has a clock on top Someone who is enjoying some rest on the edge of the pier. A group of people flying kites on a field. A man that is standing on a court with a racquet. A sheet cake with a tractor frosted on it. A very clean, modern and minimalistic style bedroom. A woman using a Wii controller and playing a game. Two giraffes in a zoo enclosure with a zebra. A man wearing a glove pitching a baseball. A woman with her face painted white and decorated with a dragon. A traffic signal underneath some tall buildings. A rear view mirror on the side of a car door. A bedroom scene with focus on the window with the bed in the reflection. Four men playing doubles tennis on a court. A Chanel sign sitting behind a display window at a store. A man riding a motorcycle with another man hang off it's side. 
A dirt road through the woods with a rolling suitcase. A black motorcycle is outside of a house The children watch the man and woman cut the cake. many people in a boat in the water and trees Glassed in bathroom with the sinks on the outside. A truck waits in traffic next to a wooded area in the city. A man standing next to a cat in a kitchen in front of a laptop computer. A bathroom door is open showing a shower with the shower curtain mostly open. a brown table and chairs and a vase with flowers Several students sit at a conference table with their laptops. A photo of a jetty and a body of water. A surfer wipes out as the waves break. Toddler getting ready to hit a t-ball with his bat. Young lady falling to the ground catching a frisbee. THERE ARE TWO ANIMALS THAT ARE PLAYING TOGETHER Large brown bear sitting next to rocks in open area. two boys with a skateboard and a bicylce A young boy holding a Nintendo Wii game controller next to a lego controller. An umpire in the field, talking to the batter. A group of people riding an elephant on a dirt road. A group of trucks and cars are coming out of a tunnel. A man walks past a weathered structure and parking meters. A small hotel room with a king bed. A male standing up with a Wii remote control strapped on his hand. A guy is performing a trick on his skateboard. A person standing on the river bank by bushes A picture of an open air zone that looks incredible. A man pulling a piece of something from a machine A large stuffed bear is sitting on the ground with a cup next to it. A man in a beige suit with graying hair. a couple of vehicles on a busy city street A man stands with his produce in baskets. Adult preparing to catch flying disc in open area near trees and water. People stand in line for an ice cream truck. A man in a the middle of a bunch of cows. A female tennis player hitting the ball with racket People on a busy sidewalk, some on bicycles. A fat ass sitting on a toilet with lady magazines. 
A small white tow truck parked to another small white tow truck. A man without a shirt squatting on top of a skateboard. A man on a motorcycle driving beside a van. There are signs on a cobble stone sidewalk Black and white photograph of a man on subway with bicycle. A stop sign in an older residential neighborhood is marked with graffiti. a couple of large pizzas that are sitting on the table The man is spraying down the toilet to clean it. a bunch of people watch a person do a trick on a skate board a man on a beach tries to fly a geometrically made kite an image of a clock on a tower high in the air A large black dog standing near an open suitcase. A man standing behind a white frisbee on a green field. A black-and-white photo with a colored red double-decker bus. Line of people behind plastic fence with umbrellas The street signs are clearly visible for us to see. A pair of red scissors sitting on top of a piece of paper. A baby feeding cake to a man with a fork. SEVERAL PEOPLE STANDING AROUND APPEARING TO BE LOOKING AT SOMETHING skateboarder in red helmet jumping on skateboard ramp A person working at an airport, outside of an airplane. A birthday cake for a child is sitting on the table with candles. A dirty, overturned motorbike lays in the mud. An airplane in the middle of a field with some jeeps parked near it The front of a store that has a large teddy bear in the window. Old suitcases piled to the ceiling on a luggage cart make art in an airport A giraffe standing on a lush green field. The white and black dog is in front of an open refrigerator A small boy next to a table upon which sits a birthday cake shaped like a racecar. A train runs down a track past run down buildings. A man that is standing on a platform with a frisbee. A black plate with a hot dog and fries Two pieces of pastry sitting on a mat next to a spoon and fork. A bunch of green bananas is on a tree. A woman holding a video game controller and grinning. 
A pan of some kind of food cooking in an oven A man in a suit and colorful tie in a park. A close up of a girls boots as she sits on the counter. a black and white photo of a cow near a tree Assorted vegetable along with cheese and nuts for food preperation. Two tennis players are doubled in their pursuit of the two tennis balls. A man on his skis on a snowy slope. A comparison photo showing bathroom in regular view and in stretched version. Small yellow container attached to a digital camera. Two slices of square pizza on a plate with a fork. a man in a tie holding a cup A street scene of an intersection with a street light. Three wild cows in a field on a nice day. A small sink inside of a very clean and white bathroom. a small machine is sitting by a cliff A wall with words painted on the glass and people behind it. A laptop sitting in someones lap and a dog lying on the floor. A little girl posing for a photo next to an elephant. A man is at an outdoor table under an umbrella. A bathroom window reveals a snowy day outside. Animals grazing in grass in front of an industrial landscape. I THINK THIS IS A RECEPTION HALL AND ITS FULLY DECORATED A red frisbee has been thrown by a man. A blender filled with flower and eggs on top of a counter. Colorful collection of fruits and vegetables with some type of "baby" decorations A white boat on water with brick wall next to it. A baseball player up to bat and missing a hit. A park bench sitting in a snow covered field A woman cutting thru a pastry on a white cutting surface. A vase of flowers one being a large sunflower in front of a brick wall. A woman biting into a sandwich with a happy look on her face. A man sitting on a couch playing with a game system. Two people walking down the sidewalk with a "wrong way" street sign directly above them A couple of horses standing on top of a grass covered field. An elephant is drinking with its trunk at the watering hole. 
a close up of slices of bread on a table near a spoon A woman with eyeglasses in a kitchen with bowls, spoon and glasses A person is eating at the kitchen table. Two women are facing each other and one is blow drying her hair. a drawing of a big fancy court house two elephants walking in an stone enclosure. A cake with a train on a track and a ground made of cookies. A baby is laying on a bed with the cat next to and a man is looking over in the mirror. A supply truck in a snowy area driving towards a tunnel. A bunch of carrots are on a plate next to broccoli. A fire hydrant on a sidewalk next to grass A stirfry containing broccoli, carrots and other vegetables. A wooden bench surrounded by potted plants in front of a house. a close up of a teddy bear on a balcony The storefront of a bakery that has been painted green. Three stuffed animals hanging on the structure of a train track. Children get off a school bus as a crossing guard stands among them. Two horses underneath a canopy of green trees Group of mixed vegetables sitting on a counter top in a kitchen. a close up of a plate with a sandwich A carnival atmosphere is effected by these colorful stuffed creatures. delivery truck dropping off delivery at train depot A young child laying down in bed with one arm raised up that is wrapped up. A woman in black jacket with umbrella on a sidewalk. Different types of fruit are shown on the counter. Two individuals sit on motorcycles on a busy street in the rain. two people riding motorcycles, one sliver one white two girls and a boy standing in a kitchen a couple of birds stand on top of a rock A man in a white shirt with a red tie is standing in front of a door way. a couple of small beds are in a room together A white bedroom decorated with low level furniture People looking at zebra and cows behind fences at a zoo. A man standing on one leg on a baseball mound. This tennis enthusiast, not using correct form, is practicing on the court in the city. 
A large eating and living area inside a house A lady walking with an umbrella during the day on a rainy day. A person on a skateboard on the ground of a park. A person sitting on a bicycle at night outside of a shop. Two different types of dogs sitting together on a bed People are riding in a boat on a lake. Two horses rubbing necks together in a field A zebra is chasing another zebra inside an enclosure. Two zebra standing next to each other in a field. A white toilet sitting in a bathroom next to a wall. A woman standing on skis and holding poles in the snow. A GROUP OF SHEEP WITH MOST OF THEM IN INDIVIDUAL CAGES An empty alley with a street sign at the end of it. A sandwich that has several toppings on it. The man is kneeling low to hit the tennis ball. A lone giraffe bending over to graze within an enclosure A man driving a scooter with two women sitting on the back of it. A very up close picture of a sign. Someone is holding a toy bear in the mans face. A tour boat on a man made lake under a blue sky A man hitting a tennis ball with a racquet. A young boy is doing a trick on a skateboard. A man working in a market surrounded by produce and meats. A dog sits in a suitcase with a doll. this is a group of people eating together A player is swinging at the ball in a baseball game. A blue doorway with a clock mounted above it. a teddy bear sitting in a hanging basket A man talking on the phone while he walks down the street. The bear is wondering about in the woods. Two wooden desks holding keyboards and computer monitors. A man cross country snow skiing near a wooden post. There is a young child sitting on the floor in front of the refrigerator. This bedroom features a bed and chest of drawers. Two men stand next to a horse near a bus. A long bookshelf behind the head of a bed. A large fire truck with a water tank on the side of the road. A group of people carrying bags of luggage through a lobby. 
a baseball player swinging a baseball bat at a ball There are people riding bikes on the street. A LARGE PAINTING ON SIDE OF BUILDING WALL OF TOASTERS a small bird sitting perched on a chain link fence Tennis players at match standing over net shaking hands. A city street lined with very tall buildings. A woman in white shirt holding a kite on beach. A table with a sandwich, sandwich makings and glasses of red wine. The unique meal includes both carrots and peppers. a kitchen with maple cabinets and black appliances A wine bottle is being used as a flower vase. The alleyway is lined with many parked motorcycles. a woman is walking down the street in a sweater Woman eating a really large sandwich at a dinner table. BEAUTIFUL VIEW OF A BLUE SKY TOWERING OVER WHITE CAPPED MOUNTAINS a white plate with three donuts and two drinks a boat docked at a wooden dock on a lake A young man uses a fork to eat food. Motorcyclists perform a pyramid stunt in a darkened auditorium. A person sitting down in front of a laptop. there is a young girl that is feeding a giraffe Cattle grazing and eating grass while looking at the camera. A child, wearing a cat costume and umbrella, stands before a brick building. The little league ball player is posing for his picture. Three men sitting on top of a green bench. A bouquet of flowers is stuffed inside an arrangement of wine glasses on a table. Baseball pitcher in the middle of a windup. A vase sitting on top of a plastic pedestal. A mannequin wearing a jock strap, unbuttoned shirt and tie A couple of men sitting on the side of the street. An old flip cell phone inside a cozy. two kids battle over a soccer ball while on a field A red motorcycle on display at a show. A stuffed animal sits atop a barbed wire fence. A man sitting in a car on a cellphone. 
A tennis player on the court stepping backwards in preparation to swing A bedroom with a fluffy comforter and lights above the headboard this is a man skiing down a hill An older male with white hair holding a flip phone. a female tennis player diving to hit the ball A red and white plane sitting on a runway. A group of people standing around each other. Mom has to help him eat his hot dog and bun. A kitchen counter with some bananas and eggs. A group of people who are sitting on horses. A small vase of pink and yellow flowers next to a candle holder. A variety of vegetables laid out on a kitchen counter. It is out in the open with various things in viewpoint. An advertisement for Samsung Galaxy Golden cell phone A man wearing headphones looking at the camera. Children in the snow with skies and snowboards. an exhibit featuring various animals under a wooden roof A big bear standing out in the shade and sun light A city is lit up at twilight near a river and a clock tower is lit up in the distance as a large boat is seen on the river. a number of giraffes in a field near one another Two soccer player on opposing teams playing soccer. An Asiana Airlines plane taxiing at an airport. A man riding a red motorcycle on a street next to a crowd. A blue street sign in an Asian language and English. Pink lunchbox filled with fruit and vegetables and snacks. A white refrigerator on the side of a road next to cars. Various vegetables in a roasting pan in an oven A giraffe walking through a lush green field. A bowl filled with apples, limes and lemons. A man riding a gray elephant holding a ball in it's trunk. A bus stops on a street corner as pedestrians walk down the street. a bridge lit up with some blue lights He is heading for the beach with his surfboard. A couple of kids looking out of a window on a subway car. Two dogs are watching a television set intently. This is a picture of a persons garage sale. 
An elephant under trees in the night time A chair and debt with laptop, monitor and a cat. A black, brown, and white cat is near a laptop. A teddy bear wearing green hat and jacket. Several baseball bats leaning against a fence with a short hanging from the fence. A person in red jacket skiing down a hill between trees. A woman dressed in costume is sitting on the motorcycle A woman on a tennis court is hitting a ball with a racquet. A man in a traditional African outfit gestures while a black cow is in the background. A lone skier is seen on the slopes on a cloudy day. Children flank an old pickup truck in a parade. There is a bird statue and clocks outside of an apartment building. A baseball player holding a baseball bat over his shoulder. A plate of food that includes meat and broccoli. a person wearing an apron in front of kitchen appliance The hot dog is loaded with many toppings. A large jetliner flying through a gray sky under clouds. A bunch of ripe bananas sitting on top of a table. some baseball players are playing baseball and some trees A man talking on a cell phone while sitting down. 2 zebras outside eating grass in a wide open space A man in India herds a number of cows on the street. A laptop and desktop computer on a desk with a light on next to them. A group of baseball players standing on top of a field. A train that is driving through some houses. Cat and dog in the windowsill of a building. A desktop computer is displayed at a wooden table. A large group poses for a photo in their ski gear. A man sits in the snow while breaking from snowboarding A man standing and talking on a phone in a courtyard. Automobiles stopped at an intersection because of a passing train. A giraffe standing on a stretch of sand at a zoo. a woman wearing a crown and a young boy smile at a table with a cake An older man and a boy are on the beach with their surf boards. A bench next to a lamp post on a cobble stone street. 
Two parking meters that are nearly covered with snow Cars present at an intersection with traffic lights. A sign indicates when parking is off limits on West 25 12 street. A Pizza with red peppers, zucchini and cheese. A bird with a red face is standing on a rock. A dog laying on the back of a couch. a kitchen is decorated with american flags a tropical bread on a branch surrounded by trees A zebra standing in a dirt field next to green plants. A colorful plate with a pizza sitting on top of it. the people on the beach are flying kits over head A man in white baseball uniform throwing a pitch. A boy with a helmet on eating food across from a bicycle. A large giraffe standing in a dry brush field. Woman in grey and blue throwing a frisbee. Dozens of people on a grassy field flying kites. A man that is standing in the snow. A cat is standing on a toilet with its front paws inside. a small baby is biting into some food Several pieces of pottery in the process of being painted. Various types of flowers sitting inside of a vase. A little girl holding a colorful umbrella next to a penguin. A red piece of luggage sitting on top of a bed. A black and white shot shows evergreens, bare shade trees, and bushes that slightly obstruct the view of a building with a low roof in comparison to its clock tower, which stands more than twice as tall as the evergreens, against a grey sky. A very long street with traffic under some cloudy skies. The men are racing on skis on the snow covered race course. some white and brown signs a tree and a building The corner of college street and 5th street Miniature Poors on the side of the road in a rural mountain. a "use crosswalk" sign on a post in front of a rain-covered street a person holding up a cell phone A man sitting on a couch has two cats on his lap. Military jet on tarmac near wooded area on cloudy day. A desk with a monitor, keyboard, and laptop on it. A plate with a piece of cake and a spoon on it. 
A table with family photos, sentimental mementos, and a potted plant A couple of buses that are on the lot. Group of four ladies sitting at table overlooking parking lot Flowers in a window box sit in front a closed window. People behind a barricade watch a man ride a motorcycle. The bathroom has been cleaned and is ready to use. a woman on a tennis court holding her tennis racket up to hit the ball A couple of men sitting at a table with pizza. An orange cat grooming themself underneath a piece of furniture. a yellow hall with a brown floor and a mirror a cat sitting on an organ looking out the window One adult giraffe and two kid giraffes standing in the woods. A man in cowboy hat on horse next to cattle. A trolley at a train station at night. A soccer player in front of the goal holding a soccer ball. A couple of horses standing in a grass field. A young person with an umbrella is crossing a busy intersection. Several closeup shots of giraffes near a fence. A airplane that is sitting on a runway. two female tennis players are playing tennis on a court A pair of workers unloading the back of a pickup truck. A glass table contains a bowl of spheres and two fancy vases. A man stands with several ripe and unripe bananas. A cellphone next to a laptop computer. There is a man on skis in the snow. A humongous jumbo jet is on the airport runway. Several people are flying kites in a field. A black and white dog catches a Frisbee in the grass A glass vase filled with different colored flowers. A street on a city at night that says "Obama". A large silver truck with a tractor parked on it's flat bed. A large knife sticking out of an apple in front of a blood soaked wall. A man who is swinging a tennis racket. The orange and white cat is wearing a bow tie. A huge group of people stand outside several buildings, holding umbrellas of various colors Small bird feeding near chair in grassy area. A cat laying on the ledge of a window. 
A white and brown sandpiper with a long, black beak lifts up one leg. A city street with people, cars and police. two long haired cats laying on a bed beside each other The man is cutting bell peppers near a large pot on the stove. A man sitting at a table with a large plate of breakfast food on it. a person standing at a tennis court holding a tennis racket A traffic light near a building on red. Suitcases revolving around on an airport baggage belt. A baseball player throwing a pitch into the field A tow truck and fire truck are at the scene of the accident An ocean view with people water skiing using parachutes. A cat looks happy while sitting in a bowl. A child is flying a kite while sitting in a yard. The fire truck red and the green pastures make it look just like Christmas. Two people sit near many luggage bags using laptops. A meal of noodles and broccoli being held by chopsticks. A woman carries a basket of bananas on her head while some men stand around. boy skateboarding next to a graffitti covered wall A lad and a lady patting their favorite horse. People are sitting on surfboards in the water. a couple of people that are walking in some grass People on a boat on a lake and two people jumping into the water. People standing on a dock near a elephant on a phonton boat. I am unable to see the image above. a man that is sitting at a table with a laptop a little boy that is eating a pizza A baby zebra is standing in a pen A large truck sits on the dock as a boat pulls up. The bathroom has a shower area, toilet and sink. A display case in a store filled with lots of efferent foods. an extreme close up of many different types of bottles A boy standing on the grass as bicyclists ride by. Four men riding horses playing a round of cricket. A man feeding a baby her bottle with a smile. A bunch of candles that are on a cake. a woman sitting on a wooden bench in the middle of nowhere A very tall white clock tower sitting under a blue sky. 
A bathroom with white toilet and walls and blue accent bars. A woman is holding an umbrella while walking down a flooded street. Two apples and a bowl and jar of applesauce on a cloth. a line of very tall buildings next to a clock tower A group of people who are standing outside. The apple and banana are on the table. Four trucks are parked in front of a paint store. A small sandwich made on fresh bread with lettuce and mayonaise A cat that is in a white sink. A group of people sit on a couch in front of a kitchen. A city street with busy traffic including a yellow bus, many cars and a person ridding a bicycle. a number of baseball players with bats A soldier who is standing near a goat to feed it. A woman taking a hard swing at the tennis ball. A person watches tv in a room with a couch and a laptop A bed with two pillows and a backpack leaning against it A yellow and orange double decker bus is shown. A man surfing in the ocean as the sun sets. A living room with white furniture and a small wooden table. A group of athletes engage in an organized game of ultimate frisbee. A cat on the floor next to a room with a sink A living room tastefully decorated with flowers on the coffee table 1 12 loaded hot dogs and veggie side A person standing next to a chair with two tennis rackets. A man is swinging a bat at a baseball game A toy fire truck sitting on top of a wooden table. A man is riding a skateboard in an underground parking garage. There is a wood bench in the garden. A bright computer screen inside of a room. Windsurfer kites are seen from above the beach. A kitchen with white counters, sink, and stove. The elephant is attempting to complete the difficult trick. A small white plate of food on a table. Two giraffes and one other animal grazing in a field. a man holding a tennis racket on a tennis court. a close up of a young person holding a kite A sleepy tortoise cat laying in front of the monitor. A man at a kitchen counter preparing food. 
An old diesel truck driving down the path next to freeway A beach volleyball game with a kite flying in the background. Several baby elephants standing on a plain on the side of a river. A large group of people sitting in the sun a number of food trucks parked near one another Semi trucks on a parking lot with orange cones. A photo taken in a car looking at a dog in the back seat. Young man surfing a fairly good size wave BLACK AND WHITE PHOTO OF A MAN AND A WOMAN Trunk and small chest in cream colored room. A baby girl with beautiful blue eyes standing next to a brown teddy bear. Men playing soccer on a field at dusk. a building with a clock at the top. A couple of clocks mounted to the side of a wall. A young person riding a skateboard up the side of a ramp. A man standing on a tennis court holding a tennis racquet A white plate of food on a table. two donuts left in a box of donuts on a counter A woman wearing skis with her black dog in he snow. A young boy smiles while holding a hot dog. a pizza in an oven not yet cooked Barry Bonds holding onto a baseball with the number 754 written next to him A kitchen scene with focus on the pantry and a clock. Several planes fly through the sky, close together A smiling woman holding her cel phone up and open beside her face. A small dog getting a bath by it's owner. A man is riding his horse on the field near the blue trash cans. A woman holding a cat in her arm. A fake large cow that is standing in the snow. A tennis player is bending over and reaching to hit the ball. A long mirror is above the sink in a small bathroom. A view of a city and a body of water from a plane. A person that is driving on the street. A twin engine aircraft is flying in the sky. People are being served at the outdoor restaurant I am unable to see the image above. The lights of a vehicle streak across a modern bridge. two people are cutting into a cake with forks Several zebras from behind standing on grass plain with distant trees. 
a plate filled with grapes and some sliced apples, kiwi and oranges Four young people crowded in a bathroom brushing their teeth happily. A boat docked at the shore of a lake. Some people with wine glasses are smiling and laughing. People with umbrellas looking towards the grassy area A peacock standing near some metal grill fence A young man in a tan suit and shoes Two sheep on the top of a hill covered in grass. There is a tennis player holding a tennis racket People near a stone building with a clock tower. Two pictures of a woman talking on he phone at a coffee shop Baby elephant alone by a tree in the evening. Patty on a whole grain bun served over salad. A doughnut sits on a napkin, with red frosting and one missing bite. A cat sitting by a microwave under a cabinet. an image of a man in the middle of playing baseball a zebra walking on a dirt path near a fence Woman riding bike with basket and walking dog. Scissors with a blue handle are in a plastic package. The skateboarder is trying his latest aerial trick. Several parked bikes sitting in the grass near a tree. An airplane is flying high in a blue sky. a large plane is sitting on a runway A dog staring at a camera while laying on a bed. a picture of public restrooms taken from the outside The man is flying his kite high in the sky. A bowl that has food and a spoon in it. A bunch of children wearing winter gear playing ball in the snow a close up of a zebra near a car window There is a sign that warns people of work ahead A man is jumping in the air to catch a frisbee. A red bus driving down a street near a building. a man holding a cell phone so someone else can look at it A fire hydrant stands on the sidewalk in between two poles. A stoplight controlling traffic in an urban intersection A group of men riding on the back of horses. 
A black and white cat sitting between bottles and a furniture leg a group of people sitting on a bench in front of some blooming flowers A girl in a striped shirt and red skirt playing tennis. carrots rice and potatoes in a bowl with a spoon A man with a helmet on coming up a ramp. People riding horses on the sand of a beach. A person on a surfboard in the water. A man wearing a hat while standing next to a purple teddy bear. A black and white photo of sheep grazing near an old fashioned car. A surfer is surfing in the ocean. a young woman holding a cell phone in her right hand A urinal in the men's bathroom and a small sink. A kitchen view of a dining table with a bowl with bananas. A train pulling through a grassy area with two children near. A skateboarder in mid jump while others look on. A couple of horses standing next to each other. A robot built from a Lego robotics kit Two tall birds are standing in some mulch. A gaming system plugged into an electric source. A person with cold weather gear on while skiing in the snow. A woman on a horse silhouetted by the sun behind. The two people are facing away from the screen A man holding a tennis racquet and tennis ball. palm trees in front of a building and mountains in the background A person on a snowboard sitting in the snow. A turkey sandwich and an apple are on a plate. A view inside of a room with a television. A brown kitchen table with chairs and brown high bar chairs. A couple of small statue sculptures on display in a garden. A small giraffe stands alone in some thick grass. A number of little league baseball players gathered in the dugout. A bowl of oranges with several on table around bowl. A dog running through the grass holding a frisbee in his mouth. Two people stand in the snow with their skis on their backpacks. A red train on the track in between two buildings. A group of baseball players is crowded at the mound. The stop sign has two street names posted above it. 
The woman is rocking the newborn baby and smiling. Birds in the air in a circle with ocean and mountains and city in background A car near a toilet sitting on a sidewalk. a wine bottle and a small vase sitting next to a tiny pizza Boats sit in the lake next to one another. A yellow stoplight with a smiley face drawn on the lens. A narrow kitchen with a refrigerator at the end of it. a man in a red and black leather jacket on a motorcycle A man in blue jersey throwing a baseball. Three women sitting on a surfboard in the water. there is a large building under construction and many parking meters a big teddy bear behind a glass wall A living room area with some couches and a television a large group of zebras under a shingled roof A piece of meat covered in marinara sauce, cheese and herbs. A brown and black puppy sits in the sun. SOMEONE BRUSHING THEIR DOGS TEETH WITH A TOOTH BRUSH Several people sit on park benches by the water. A man is taking a picture of a toilet from outside the restroom door. A very happy surfer guy is loving life as he hangs ten on a beautiful wave. A cat looking at the camera with a funny expression. A sign post with two street signs and a stop sign a paper plate holding a slice of veggie and sausage pizza A train engine carrying carts over a hill side. Skier on red and black skis jumping near a mountain. an image of a bathroom scene with lots of hair products on counter A public transit train going through a station. A yellow fireplug in front of a blue street pole and store window. an image of a man riding a horse A STOP sign that has been written on with paint. Girl in a pink scarf eating a pastry at a table. a surfer in a wet suit is surfing in a sunny day Two people are walking up a snow covered hill. A bird standing on the rocks in front of water. A counter top in a kitchen with various items on it. White people looking ridiculous playing wii and drinking beer. 
A man bareback riding his bike down the street Small fishing boats lazily drift in the bay. Three baseball players are standing by a base and smiling. A table topped with lots of different types of cakes. A train is approaching an opposite side boarding area. A horse is standing in the grass with its head over a fence. A puppy chases its tail, next to a mirror. The two elephants are eating their grass for dinner. A man is standing next to a city fence. a person riding a surf board on a wave An image of a man wearing a baseball glove and leather jacket. An assortment of foods on white and blue plates. A little girl sits with a piece of cake A red and yellow double decker bus at a bus stop. A dog that is laying on the bed. A dog and someone laying on a bed in a bedroom A man standing in a snowy forest wearing skis. A simple vase with a few flowers in it . a child on a tennis court getting ready to swing a tennis racket A box that has different kinds of donuts. A baseball player swinging his bat, while the catcher and one spectator look on. A man is steaming his clothes in a bathroom. A man rides behind a horse during a race. A man in black gear skiing down the hill Old photograph of baseball team posed on a set of steps A woman at a crosswalk that has a green light. A young man riding a skateboard on top of a rail. a young lady holding her kitten and kissing its head A man in a red shirt and sunglasses is playing frisbee. Up close view of two zebras in a zoo. a monorail making it's way down the track above a bunch of cars A chocolate donut filled with cream and custard. a large crowd of people in a park, a good portion of them are flying kites. an open field with some people flying two different kites A professional snow boarder flying through the air Seafood with pasta and broccoli is on a plate. A young person is biting into a hotdog A couple of birds sitting on top of a large clock. A woman leaning in and smiling at the camera. A double decker bus is on a street. 
A cat laying on the bed looking at the camera A busy intersection with traffic captured in motion. a pair of pet bowls on a mat is next to a screen A man riding a skateboard on piece of concrete in a park. There are two people posing and one man is holding a banana A FIELD AREA WITH GREEN GRASS AND TING BUILDINGS There is a couple standing among some fountains A mp3 player sitting on top of a speaker system. Large sunflower displayed in colorful vase on table. A group of skiers watch as one members does a trick. A man riding down a snow covered slope on skis. A single giraffe standing by a tree and some rocks. Man with tennis racquet, soccer ball, golf club, and hockey stick. An airport is full of people's luggage and no one is there to claim it. A man riding a horse in an arena with a bull. A base with yellow pink and orange daisies in it. A cat sitting on a table next to a vase with flowers. There is a train sitting on the tracks. A white toilet sitting next to a bathroom sink. A bag and it's contents sitting beside it on a floor. A colorful display of hundreds of small teddy bears is featured. a bed with a shelf above it with items and luggage A living room with a chair, fireplace and mirror. A tray of food sits on an outdoor table. A double sided parking meter buried in the snow. A black Macbook on top of a stand A stop sign near a Star Bucks Coffee Shop. A girl in dress sitting on a park bench. A man standing in water holding a fishing rod. A blue and green hummingbird seems to hang in the air with its wings together and outstretched. The man in the chair is playing a video game. The airplane is on the runway t the airport. A plate with some eggs toast and bacon on it. Two horses graze in a large grassy field. A bowl with rice and a side of broccoli in it. There are two giraffes standing in the wild next to trees. A cow grazing upon a hill on a foggy day a child lying in a children's bed next to a wicker basket dresser. 
A piece of birthday cake is sitting on a plate. Three people ride on an elephant in front of a forest. The fork is attached to the dinner tray. A cutting board with chopped carrots and apples. A parachute floats in the sky above the ocean a man on a snow board riding through the snow Zebras, and elephant and another animal standing near water. The young child with the missing tooth is holding up a new tooth brush. A man standing in a tool shed running water from a wooden sink. A woman is cross country skiing in a forest. A boy in jeans spreads his arms wide as he balances his skateboard on the edge of a pool. A girl is sitting at a table in front of a skate board and helmet. Two double deckers buses travelling on a city street. A girl with a curious look in front of broccoli and chicken. a bronze statue is looking at a clock on a building A bike in front of a scenic welcome sign. A kid wearing a Georgetown Day Shirt has a baseball glove in his hand. a black and white cat is sitting on a yellow and red chair A living room with two red chairs on front of a television set. A plate of food sitting with a very elegant setup to it. The remains of the breakfast table from above A picture of a person laying on a bed. black and white picture of elephants in a fenced water tank Two people playing a board game involving cards and chips. The giraffe is standing by itself by the gate. A large jetliner flying through a clear blue sky. BEDROOM WITH BED, DRESSER, TV, LAMP AND OPEN WINDOW A woman dressed in white holding a colorful umbrella. two girls sitting in a restaurant eating noodles A man surfs a small wave on an overcast day. fruit hanging from a tree with trees in the background A desk with several computers and electronics on it. A couch sitting in front of a rub on a hard wood floor. A man that is on a surfboard sitting on a wave. The people are piling on to a large truck bed. A plane on the tarmac with airport personnel. 
A man in a suit and a woman in a dress standing side by side. The men are walking with each other wearing ties. The guy is on the computer while there is a girl on the bed. A large clock in a mass transit station. People in a room with one man working on a laptop while another looks on. Two horses standing together in an open field near some mountains. a couple of players are out in a baseball field A couple of small birds and a building. A woman smiling while showing off her cell phone. A baby on the floor biting into a remote. The teenager is taking a picture of her male friend with her cell phone. A prepared plate of dinner has meat and broccoli. A man is doing tricks on his skateboard, him and it up in the air A green and beige bus sitting on display behind a traffic light. there is a small boy getting help brushing his teeth A hand that is that touching a dog. A lady making something in a home kitchen of some sort. A man sitting in the middle of a fresh produce stand. Large oak desk with laptop, keyboard, and pictures on shelves. A small bed in a room with lacy curtains on the window. a blue billboard sign in a busy city A black and white dog on shore of a beach. a trey with some fruit inside of it A cup of coffee and a banana are setting on this desk. Birds are sitting on the arms of poolside chairs. A remote and a container is sitting on a table. A person skiing down a snowy mountain slope. A doorway view of a bed window and doorway to another room. A man that is standing in front of a television. a man lighting the candles on a birthday cake A group of people flying kites over a lush green field. An airplane is flying through the sky during the day. A baseball game is being played on the grass. Several ducks are out in the middle of a lake. There are some people walking on a beach with surfboards. A stove and some books in a kitchen. A small pizza with burnt edges and fresh toppings. A living room features a gray and yellow couch, and wooden furniture. 
a man with a hat on a bicycle beside a tractor A pan filled with celery, onions, and carrots. a person standing near a small motorcycle on a city street A remote control that is laid on a piece of furniture's cushion, which is ripped and exposing springs and wood. A man wearing gear on his feet walking in the grass. some bananas oranges apples and other fruits and a bowl THIS IS A TRAIN GOING THROUGH THE MIDDLE OF THE WOODS Living room with large television and lit fireplace. A herd of zebra standing on top of a lush green field. a bunch of fruit is laying on a table A large white bus on the side of the road. Various street signs next to wall with a building in the background. There is rice, broccoli mac and cheese, and turkey on the plate. Some people are holding a union Jack umbrella. A street sign saying Major Street with an arrow pointing to the right. Sauce covered pizza in a box on a wooden table. A couple of people standing next to each other. A group of people standing and holding wii remotes. A man taking a swing at a baseball A bedroom with bed, chair, table and bookcase. Two large trucks traveling in the side view mirror of a car. Pancake breakfast on wooden table with blue and white mat. A cow walking by a creek with two swans swimming in it. A stuffed bear sitting on top of a window sill. A group of sheep sit on top of hay bales. A dog wears goggles while sitting in the side car of a motorcycle. A boat sitting on the beach next to a van. Two traffic lights facing opposite directions with a street sign atop the same pole. A red umbrella and chair are by the ocean. A woman hitting a tennis ball with a racquet. A bathroom with a pink toilet and pink tile. A young man sitting at a simple desk with a laptop computer and bed in the background. a boy swinging a tennis racquet at a tennis ball on a tennis court A white truck is carrying three motorcycles on the road. A tennis player is hitting the ball on a tennis court. 
A man that is wearing a tie and is standing while smiling. Someone's hand holding up a glass of wine. Person of a surf board riding a wave in the ocean. there is a male tennis player playing on the court Some giraffes are walking around near some bushes. A group of people traveling uphill on a snowy mountain. Four people sit at a table full of pizza. A street with many cars and busses in a city Plate of food with mixed vegetables and a side of meat. The motorcyclist is happy to be on the road. A young girl smiles as she holds a cell phone. The person in an apron is arranging boxes of fruit. A bird sitting on top of a tall metal weather vein. Thee zebras graze in the middle of the zoo. The tower has a clock displayed in order to tell time. a jet that is parked on a runway a man standing on a red boat out on a large body of water. a cat looks out of a room while on a step a white cat is sticking his head out of some iron bars A pitcher gets ready to throw in a baseball game. Two two birds are sitting on a rock. Five giraffe stand around a pole eating hay. a baseball player holding a baseball bat inside a stadium A woman holding a sign sitting on top of a truck. A truck full of luggage has the hood opened Motorcycles are going around a track leaned over. A drink cooler with bottles of water, juice, and soda. A train on one of two of the train tracks. Indian woman selling bananas while others look at stand. a female tennis player in a white dress is playing tennis A baseball player standing on top of a field. A vase of colorful flowers sitting on a table. A white toilet commode sits on a tile floor. Assortment of fruits with pastry and beverage displayed on table. A skiier slides down a snowy mountain on his board. An orange van with vehicles behind are sitting on the road. a little grey teddy bear with a missing eye sitting by a tree stump A teddy bear in a top hat and bow tie with the message "Me To You" A couch sitting next to a white fire hydrant. 
A man with a frisbee in his hand in the woods. a few people that are playing with a white frizbe A person is choosing produce to bag at an outdoor market A stop sign in Arabic, in a desolate location. A couple of women sharing a toast at a table. a black and white photo with a bench grass and trees A house renovation showing an unfinished room next to a kitchen. two buses moving on the street besides residential houses An alley with a person on a bike and a girl walking. A pita is topped with onoins, carrots, and bacon. A toilet has been fitted with a system to potty train a cat. A cute dog has large pink ears and eyes. Big green monument with a clock on top. Someone going down a hill on a pair of skis A white sink sitting under a bathroom mirror. A train going by a platform in a train station. A couple of boxes filled with hot dogs and fries. a group of bike riders going past a yellow bus Different types of luggage trunks stacked up together a kitchen with a stove and a glass door The sink and counter is in the grouplab. A man in a gray hat and sunglasses on a cell phone an image of a man posing with surfboard A baseball player standing next to a woman. A double decker bus pulling up to a bus stop. Four young skateboarders are holding onto the back of a bus. A long train traveling down tracks with rusted cars. A display rack of a variety of tools in packages. A bird in a dark room perched on a stack of books. a television is turned on in a living room A black and white picture has a posing crowd. There is a phone on top of a calculator two black and white clocks a tan building and a white and blue bus A motorcycle stopped on the road during nighttime in the city. The two men stand next to each other looking out on the beach. Two men standing on a sandy beach holding surfboards. a man uses a bat gets ready to try and hit a ball A television that is on with a white man talking and campaign signs A gigantic size pizza on a table in front of a woman. 
this is a trck driving over an overpass A woman with her dog are seated on a bench. Two cows in a grass field with a blue sky and clouds in the background. a bridge over a body of water near a building A yellow parking meter on the side of the street. The couple is sitting at the table talking with friends. A person riding a horse next to a big black and brown dog. a number of people in a small boat with a car A woman standing between two cows on a field. This refrigerator has a monitor on its door. A bus drives through a street with an arch. A sink and bath in a small room. A child with stuffed animals in the background People playing a game with a Frisbee outside. On that point are a bunch of individuals celebrating. a man about to take a swing at a base ball. A small plastic container of rice and vegetables with a few crackers. A woman cutting the hair of a boy whose sitting in a toy airplane. A variety of cookbooks stuffed into and around a microwave A baseball player swinging a bat with a catcher and umpire behind him. A group of skiers trudging up a snow covered hill. Smoke billows from two smoke stacks of a steam engine boat. Two Indian men decorate two different birthday cakes A person riding a yellow motorcycle on a track. TWO KIDS ARE PLAYING INT HE ROOOM A bird that is sitting by some water. A large dog holding something yellow in its mouth. A bunch of birds that are standing in the grass. A police office who is sitting on a motorcycle. A large metal clock and some bright lights. a very large pizza that is on a wooden table A brown bear and a white bunny sitting next to each other. a clock with religious icons painted on a wall The surfing board is on the sand on the beach A boy and his dog are playing in the snow. a man uses a knife to chop up some carrots A large jetliner sitting on top of a tarmac. A man wearing a hat carrying two lamps in a field. A group of people stand in a dimly lit area between roads. 
a stop sign and street sign on an pole a horse is standing with his owner next to a tree. landscape of a snow covered field and mountains A man in the park with a frisbee. Group of double decker buses on road near crane. a photo of a man in the credits of a film A large blue train going down the rail road tracks This is the front of a mobile library. Black and white of man crossing to old style building with clock tower, possibly in Cuba as cars 1950's vintage. A lone zebra grazes on grass in a pasture. Two gentlemen discussing something being viewed on one of their phone screens. A man sitting on top of a couch holding a game controller. A man is in the water with a beautifully painted surfboard. A living room complete with a couch sliding door and a window. A view of a downtown area, looks very rural. A bison and her babies walking through a field. Red Light at a street intersection with people present on the corner A man and woman having a drink on a docked boat. A woman and a baby are looking at laptops. This is a boy on skateboard about to go down a ramp A girl in yellow dress eating a piece of cake on table. a very nice draw showing a vase with flowers An oven is shown with all of the burners in use. Two young girls making pizza on a counter top. A bathroom under construction with a white tub next to a toilet. A person walking a dog on the beach a beagle with it's tongue sticking out standing by a water bowl A stack of luggage by a curb and parked car. A person that is in the water doing a trick. A guy in a white t-shirt rides on his skateboard. Airplane being loaded sitting on the tarmac at the airport. A jumbo sized stuffed teddy bear waits on a wheeled dolly. A man in competition gear on a red snowboard going down a hill. Seven circus elephants, on their hind legs, leaning on each other, with a standing elephant in the middle of the line. A female tennis-player with her racket in-hand in front of a crowd of onlookers. 
Three sheep in a field of grass near a steep hill. A man is surfing on a wave in the ocean. Two blue suitcases right next to each other A brick wall and several warning signs nearby. some slices and pieces of yellow bananas on a towel A row of luggage sitting on a wooden floor. A giraffe licking a fence post while standing in a coral. A bathroom with a toilet, television and bathtub in it. a bus that is parked on a very large hill The woman is holding a teddy bear in her arms. Four cows in a pen on a sunny day A baby elephant stands near its mother. A person with a hat sitting down with an instrument between their legs. Large assortment of traffic signals in outdoor area. A plate of noodles, beans, broccoli and an egg roll. Three Zebras standing in front of a gate. A man with a surfboard walking across a bridge towards the ocean Snow skiers enjoying the slopes in the mountains. Partially eaten donut with glazed topping on wax paper a cat that is on a couch and lap top a woman riding a bike down the street A plate of food containing meat and vegetables. A man sampling donuts and ice creams for a birthday party There is a computer on the work desk. A large jetliner taking off from a runway. An older man holding up a handkerchief with an image of a woman in a bikini. A room with furniture and a fire place. Adult woman walking on sidewalk near yellow fire hydrant in city. three people in Japanese clothing, two are carrying umbrellas and all are wearing sandals and they are walking past parked bikes. A baseball player attempts a slide as a catcher and umpire look on. a man holding a brush standing in a room Trays of food that include couscous, apples, and raisins. a cat in a blue hat is laying down A large orange cat sleeping on a pair of shoes. a person riding a skate board ata skate park a sign showing no birds allowed while a beautiful bird stand there A piece of cake in a plastic container next to a large cookie. A man in a blue shirt preparing to throw a Frisbee. 
A plate of meat, bread, and vegetables on a table. A group of people standing around a chicken coup. a man at the beach holding a surf board A room with wooden floors and white walls A young woman sitting on a city bench talking on a cell phone A GPS device on top of a counter next to a book. A person surfing in shallow waves near the shore. A youth baseball team is grouped together for a photo. A LOT OF PEOPLE WALKING THROUGH A BUSY SQUARE A train with one of its doors open. A man sitting down at a table using a computer. Olive green vintage military truck, six wheeled. Sculptures of zebras stand in the brush and grass. A clock tower lit up at night with an array of bells at the top. A white plate of food that includes an artichoke and bread. a close up of a vase with flowers on a table The baby is helping its mom on the internet. A laptop and books sitting on white sheets on a bed. A man laying on top of a bench on a dirt field. A new tv on top of an old tv. people standing in line beside a food truck A skateboarder rides the rail in an urban area. An Asian building with a satellite dish on the roof. A bowl full of something that appears to be nuts, which can be eaten with chopsticks. Two people are pictured standing in front of an apartment building. A train traveling down train tracks next to a forest. The bathroom only has a broken door, a broken toilet, and a broken window. two people playing with a dog on a leash A metal bowl containing five oranges in sunlight. An indoor fruit market with citrus and tropical fruit. A living room that has a couch and television set on a table. A beautiful woman standing next to a man holding a Nintendo Wii controller. A scenic view of Big Ben in the evening hours. A wooden table holding various bowls and food. The dog lies down next to the parked motorcycle. A red and white fire hydrant was given eyes. a woman in glasses sits in front of a laptop Surfer riding a large wave next to a platform with people standing on it. 
A chair is outside of a window that a woman is cleaning by a bathtub. a living room with a lot of chairs and a big entertainment center Stairs that have some fading green paint on them. A man with a meal and drink at a round table. A pizza with various toppings is sitting on a wooden slate. Two coffee mugs by an orange juice and a juice glass. A group picture of young men and women at an event at night. The group of people in business suits is standing beside a large poster. A smiling man poses with a healthy cow A large commercial airplane taking off for flight A woman holding a carrot in one of her hands. A woman taking a picture of herself in a mirror. a close up of two pots of food on a stove The hand is holding a controller for the video game console. A hotel with a large blue poll lined with lawn chairs covered in umbrella. Two pieces of fruit, an apple and an orange. A tennis player in action on the court. A woman is sheering the coat off of a sheep. The woman in the mirror is taking a picture of herself and the dog. A flock of birds sitting on top of a tree. A woman with an umbrella on a bicycle Two bulls are resting on the sand next to a boat. A man with a dog plays a game on his Wii. A man snowboarding down a hill while wearing a coat, goggles and a hat. A man is standing near a car with some luggage while another stands near by. Someone's shoe stuck to a stop sign in the city. A large clock sitting on a sidewalk in front of a brick building. A man and woman are playing an interactive video game with controllers. a small yellow Cessna plane flying on a clear day A monitor, keyboard, coffee cup, and plastic bottle sit on a table. Two polar bears in the snow surrounded by trees. a bunch of cattle are standing in a grassy field Two pieces of bread with sauce on them next to a bowl of chicken salad. A smiling man eating at a table with people behind him. A stove top oven sitting next to a mixer. 
a cat lying on the floor in front of a mirror Four giraffes standing in the grass in their enclosure. A small engine plane making a slight right turn over farmland. An older man is on the soccer field with the ball. The two sheep are enjoying their time in the hay. A skiier approaches a huge snowball at a ski resort A man doing a belly flop onto a bed Boats floating on top of a large lake. A surfer is surfing the waves in the ocean. A man standing on his surfboard riding a small wave there is a male safer that is seen riding a wave A wooden shop stand loaded with drinks and food. A large tanker truck driving down a road. A girl trying to fly a kite with a face. Standing man and woman near a dining table full of food. A group of people hiking along a mountain line on the peak. We are looking down on to a small bathroom. A parking tole on the side of the street with snow on it A passanger bus stopped in front of another passanger bus ready to pick up passangers. A very intricately designed old tower clock, with people coming through the arched doorway at the bottom of the photograph. a small teddy bear dressed in clothing A bathroom scene is pictured in this image. Two sheep are lying on the ground under a tree. a number of luggage bags on a cart in a lobby A white cat sitting under an open umbrella A kitchen with woodwork cabinetry and pendant lights is displayed. A blue couch with a bunch of pillows on it A bus stop next to a curvy road surrounded by traffic lights. A man surfing rocking waves in the water. A blue eyed dog panting as he walks by. A team playing baseball on a baseball diamond. A baby in grey shirt sitting on a toilet net to a tub. The bird is perched on the gate by the mountains. A woman wearing a red shirt blow drying her hair. An airplane sits on the tarmac at an airport. a woman looking at her phone in a crowded area a room that has a couple of different computers A man holding a slice of pizza up to his mouth. 
A crowd of people standing underneath round lit orbs. a man made pond inside some kind of enclosure A man flying through the air while riding skis. a green motorcycle is parked next to another one A sign cheering on the Colts sports team. Two woman on an island pose for the cameras. A pizza sits inside of a box on foil. a bat and a ball on the ground next to a flower There are seven chairs around the round table. A person wearing a helmet riding a skateboard while a person stands in the background looking off into the distance. Two skateboarding decks mounted on a grey wall. Two zebras in an enclosure walking a dirt path. A large room has a stone fireplace with candles inside. A group of giraffe standing around a tree. A pair of giraffes looking opposite directions in a forest. A basket with a stuffed teddy bear hangs outside. People sitting at a table with laptops and books. A picture of three computer screens with two on. A young girl eating something while wearing 2 different shoes. Two men are working in a commercial kitchen to cook food. a man with a bike sitting on a bench in front of some trees A man in thought sitting in front of a laptop with a pen in his hand. a close up of a child eating a banana People standing at a park with a large yellow kite. A foyer of a home that is leading to a dining room. A black bear relaxing on a hammock supported by chains. Old blue bus with bicycles parked on roadway near green space. A man lifting up a lid on a toilet in a bathroom. A group of people playing Wii and smiling. Two birds are sitting on the ground together. Several boats are docked in a harbor. A sign advertising a reptile sale in May one zebra drinking at a pond and another standing A rain covered street filled with heavy traffic. A large decorated bus has a couple of folks standing by it. A close up of a doughnut covered in red, white, and blue sprinkles. A purse sitting next to it's contents on top of a table. 
A zebra standing in a forest next to a large boulder. A bowl of fruit on a table next to a letter and a plate with two tomatoes. A bear in the arms on a heart pillow. There is a laptop sitting on a computer desk a lady taking a picture of a long horned mountain goat A railroad train heading toward a traffic light on the tracks a line of people that have horse and wagons A skier stands posing on a flat area in front of the lodge. A young man holding a cell phone with two hands. A big sign telling people to stop eating animals next to a building with cars parked outside Fire hydrant on a corner with a smile painted on it. Youngster on a skateboard, trying simple tip up stunt. A woman getting food out of an oven while another woman stands by. A small city bus with advertising on the side and back A hotel bathroom with focus on the toilet. A yellow yawning dog laying on the ground A black bear is crouched down in the water. A couple of men standing next to each other on a lush green field. Flower are placed in a vase covered in shells. a little boy with a suit, tie and glasses A wooden shelf holding a microwave and small refrigerator A giraffe sitting on a grassy patch of land. A man sleeping in a bed by a dog and remote controller. A jetliner sitting on top of an airport runway. A man is standing talking on the telephone Two skiers are standing at the top of a hill. A table that has a pink hardcase carrier on it, along with several smaller containers. People are walking along a sidewalk with their luggage. Baseball player in the motion of swinging his bat at the plate. A woman is talking on a cell phone outside. some water and a person is flying a kite A keyboard that is sitting next to a mouse. The table has two wine glasses, a bottle of wine, and a vase sitting on it. A man that is on a skateboard on a sidewalk. A melted looking lay pot sitting on top of a spindle. 
A black and white image of three people on a bench a big lake with some boats out in it A young man using a laptop computer with a large monitor. A small bird sitting on top of a tree branch. A airplane that is sitting on a tarmac. The living room in the wooded house is empty. a very clean bathroom with a walk in shower An old, large clock hanging off of a building. Two zebras standing side by side to each other in a zoo pen. Two young boys carrying red and white surfboard. a big airplane that is parked in teh woods An assortment of muffins are on sale in a Japanese store. A close-up of a bear swimming in the water. A young man is playing tennis on a court. a woman sits on an ornate wooden bed with fancy bedding Man sitting at table with pizza and beer in restaurant. Several pizzas are lined up on a table. A man driving a motorcycle with a sports car trailer and two dogs sitting in it. a big building with a clock inside of it sitting in front of a water way A man riding skis down a snow covered slope. Three men admiring motorcycles in a sidewalk exhibit. The inside view of a bus and its passengers. A side view of a train passing through a mountain trail. Looking down at a partially eaten salad sandwich in paper A black and white dog jumping up catching a Frisbee. A smiling little girl hugging a teddy bear. a tennis player swinging a racket on a court A man standing in front of a flat screen TV. The yellow mustang car is sitting on the side of the sign. A dog sitting on a chair underneath a painting. Group of people outside and one pointing up to the sky. Two teenage girls wearing hats are smiling for the camera. A plate and fork with toast and vegetables on a chair. A group of giraffes on a path near a few trees. A laptop computer, some speakers, a cellphone, empty pill package, and bowl of chili are on a narrow table. A man is sitting in the grass holding four cell phones. 
an asian man pitching the ball during a baseball game Young baby in crib laughing with bear a person in a scarf and suit sitting outdoors on a bench A street sign with two signs hanging off of it's sides. a teenager playing in a skate park with a skateboard A police motorcyclist with a flag is riding while a large crowd watches. A man doing a trick on a skateboard off a rail. A person on a surfboard in the water. The bed has yellow sheets with sheep on them Two urinals next to each other in a bathroom. A motorcycle is parked in front of two garages. A person in skis stands over snowy ground. A little boy playing a Wii game in his living room. A man in a black shirt plays on an ocean wave. A small kitchen that has small kitchen appliances. a man and a woman playing tennis on a tv. Bowls of lettuce, pepper, chapati and other foodstuffs A dog is laying on a pillow holding his toy. Two male tennis players, one has on a white hat. They look like they were mid conversation. Many people are dressed as zombies covered in blood. A white bathroom toilet sitting next to a urinal. A dog sits on a couch with pillows. A young boy is playing frisbee in a park. a closeup of a person's hands as it plays with a Wii controller in front of booklet with Mario and Luigi characters a note sitting between a couple baskets of oranges A couple of men standing next to a man and his brown horse. A bathroom with a hole in the ground for a toilet. A man behind a cashier holding a red pen We are looking at a simple clock tower. A man eating a doughnut at his computer keyboard. A lab with refrigerators and a man sitting nearby in an office. a CGI photo of a animal sitting on some vegetables A hodge podge of colors and patterns decorate a bathroom. A child and an adult are paddle boarding in the ocean. A parking area with trees next to a stadium. A person communicating with two phones at once. A laptop computer that is sitting on desk that has a lot of clutter on it. 
A young girl smiles brightly over a chocolate birthday cake. A person putting their foot up to a skateboard. A person's handing pressing a button on a WiiMoted for the Nintendo Wii gaming system. Man walking on beach near ocean carrying surfboard and holding para sail handle. A photo of an old white building with a clock tower. A large grey airplane flying through the sky in the daytime. A collage of photos including a restaurant, waterfall and a rose. a baseball player holding a bat in the batters box A man riding a skateboard on a street next to a park. A mother and her baby zebra grazing on dry grass. A woman petting a giraffe that's leaning over a rail. A black laptop sitting on a desk next to a remote controller. A plate containing three sandwiches, fries, and ketchup on the side. A building that has a clock on the front of it. A parasailer on the water with sky line in the far distance. The group of zebra are eating and there are small birds in their pen. A long-haired man with a beard is wearing a suit and tie. Cross country skiers travel through the snow during a race. A plate filled with cooked vegetables and meat. a person jumping a skate board in the air A bedroom with a day bed next to a window. A giraffe comes close to a visitor in its enclosure. A laptop computer sitting next to a computer monitor. The buses and trolley cars run on the same street.. Apples and pears are in a box with grains and a bowl. People are marching down the freeway with a banner. a man is sitting with a laptop box on him A full view of a flower vase with drinks and cups. a baby giraffe stands in a area with some birds A man wearing a silver tie near a clock. A small bathroom with tiles appears clean and organized. An airplane engine is seen passing a mountain in the distance. A couple of brown horses walking down a street next to buildings. A woman taking a bite of pizza in a restaurant. A group of people use paddles while standing on boards. 
A baseball game is being played on a dirt and grass field. A dust plane is pulling sharply up into the sky while leaving a trail. A man on a horse monument in front of a building. A girl with a bun is sitting on a scooter type motorcycle. The stop sign is clearly visible for all of us to see. Advertising image with writing backed by bags of oranges. A umpire signaling safe at a baseball game as a man slides into the home plate A person and a child ride on a skateboard in this black and white photo. a man with a tennis racket is running on a court A plate contains chips and a sandwich. A man in a suit standing next to a control board and computer. A bus travels down a busy street in a crowded city. a traffic light with two street signs on the top of it A picture of an outdoor area that looks great. A man giving a thumbs up behind a computer screen Men keep watch on a herd of goats. A group of elephants gather around in a field. A street post with street signs and lights on it A plate of food with broccoli, radishes, rice and chips. The cat looks through the door that is cracked open. A man in snow shoes and his dog on a snowy path in the woods. a building is shown with a big clock in it Five stuffed teddy bears sitting in a row. A group of people standing together under an outdoor hut. A clean kitchen has dark brown cabinets and white appliances. A person is riding a horse on the sandy beach. a couple of stuffed animals sit next to each other A variety of vases are shown on a table top. This is an image of a baseball game with players at home plate. A man sitting on a bench next to potted flower. Two people on bicycles riding in street with signage in the foreground. A salad made with yellow pepper strips and green sprouts sits on a square white plate. The adult elephant stands near a large toy ball. A skier with a backpack pauses to enjoy the view A baseball player prepares to swing at the ball. 
tHERE IS A HOT DOG INSEAD OF A BREAD HOT DOG BUN Two men sit atop motorcycles and two men sit in sidecars. A surfboard sitting in the sand on the beach. A cell phone sits beside a small crocheted change purse. A collection of green vegetables sits on a table. A toilet and shower-bath combo in a small restroom A room with a television, couch, chair, tables and potted plants. Three oranges sitting on a dark black surface. A black cat laying down on top of a refrigerator. A book opened sitting next to some mushroom ornaments and a vase. A very narrow busy road of shops with a lot of people. Some teddy bears hanging from chains on a sale rack Zucchini, summer squash and broccoli are mounded in baskets. A large flock of birds fly through the sky. A baseball player is holding his bat, and blowing a bubble. A tennis player readies herself to receive a serve. a tennis player hitting a serve on the court a round table of people with drinks and a cake A snowboarder and several skiers at the top of a run. Several people around a boat on the beach with an umbrella shade. The giraffes were outside the building in a pen. A couple of people sitting on a bench. A woman in a dark blue jacket playing Frisbee. Zodiac on back of large boat in a lake. Young people painting a mural on a traffic divider. A woman street skiing with a helmet on putting on her gloves. A closeup of several ripe bananas clustered together. A woman wearing a bandanna and ugly sun glasses. a family in a small row boat in a river A pile of trash sitting on a boat next to an umbrella. A man holding a banana over his face. a kneeling woman taking a photo of her black dog Woman eating a hot dog while walking down a street. a white toilet two rolls of toilet paper and a phone A large airplane is on a runway with clouds in the distance. Child at bat in Little League baseball while teammate watches from first base. 
Two chairs and a small birds below it this bathroom has white sinks and black counters A line of people sitting on benches in a courtyard. A large concrete skyscraper on a sunny cloudless day.. A hand holding a PDA with a illuminated keyboard. A man sitting in a motorcycle poses with his arms outstretched. A man balancing a bike on a bench. A trash can is sitting next to a lowered curb. Dog laying down partially covered by a comforter. A yellow double-decker bus next to a traffic light. An empty, clean toilet stall with a stack of toilet paper. A group of women sitting on the floor eating food. a dog looking out a window with it's reflection in the mirror A blue and white street sign above fence and water. A baseball player balancing the ball on his left hand. A close up of a man petting an elephant. A murdered monkeys head sitting in a white bowl next to bananas. there is a cup of coffee and a half eaten sandwich on the table A kitchen has a plain white fridge in the corner. a plate of french fries, two sliced sandwiches, and a pickle A man using an outdoor oven to cook a pizza The sink is on the island of a large kitchen. A cat is standing in the corner of the room It is dusk, and the skiers have abandoned their skis and snow boards for social interaction. A little girl cutting a piece of paper with blue scissors. A group of people who are walking with umbrellas. The flowers in a vase are dying. A woman with short brown hair getting ready to bite into a hot dog. a dog is floating on top of a water. A baby and a young boy are inside of a rolling suitcase. A black and white street sign that reads "end bird." A teddy bear sitting on the edge of a toilet seat in a bathroom. A woman is placing a flower into a cake. A unfunished bed in the corner of a room. One boy watches as another kid performs a skateboard stunt A puppy cuddles with a shoe on a couch. An adult giraffe extending it's tongue over a fence. 
a couple of planes flying through the air A bathroom with a white toilet and window over the toilet. a person in a red jacket skiing along a path A living room with a large book shelf filled with books. a bed room with a neatly made bed and two lamps Three people standing near a table with several glasses on it. A small, white dog laying on a bed with a stuffed toy. a toothbrush on a table with a bunch of scissors A pedestrian walk light is lit up on the corner of West 3th St. and Seventh Ave. A man placing a tie on a womans neck A man flys through the air on a snowboard A little kitty on the bed using a laptop. Evening view of traffic light intersection with cars with headlights on and a building and trees. A close up view of a hand on a keyboard by a monitor. Here is a image of an zoo animals. A tire sitting on top of a green fire hydrant. A subway sandwich on top of plate and napkin. A group of people riding on the backs of horses. Clock tower ascending into overcast sky from buildings below A family of four playing a Wii game. A lot of sheep eating grass in a ranch. Many people are scattered together at the air port. A plate with a drink and a variety of deserts. A large pizza on a plate on top of a dining table. The adjacent farm land hills attest to the height of the soaring kite. A small bird is perched on an empty bird feeder. A shot of a desk with two computer monitors with a teddy bear on top of one of the monitors. A cheese pizza is on a tray with pieces missing. a couple of zebras watch a giraffe walk through the grass In the evening a large amount of open umbrellas are together. It is almost like the dog is flying in order to catch that Frisbee. A pile of busted up toilets and sinks laying on the ground. A person wearing a helmet and riding a motorcycle. A cat laying on a couch in a room. A young skateboarder performs a trick on the stairway. two elephants in a field near a tree A child sitting on a wood bench typing on a toy-like laptop. 
A man who is looking at his cell phone. A large plane with propellers high up in the sky. A double deck bus driving down the street. A bunch of people heading to a big plane. A youth baseball player throws a baseball outside. A horse that is enclosed eating grass during the day. The young man on the skateboard is practicing his tricks. A hallway with piles of luggage and other things. A cat under a table on a wooden floor playing with a jar. A very tall clock tower towering over a city. Seven people are posing for an old time photo in a large kitchen. A laptop computer and a desktop computer sitting on a desk. A cat sitting in a flower pot with no flowers. A bike and a large pile of luggage sitting under a sign. Two giraffes with dried grass and trees in gray light A large red stop sign on a street. A train that is going by a train stop. A woman wearing blue crosses the street on a bike. A bathroom with sinks, mirrors and a towel dispenser. Several people on a beach one is parasailing , one has been wind surfing , and some are gathering up a picnic. A woman and her son using an old iMac computer A herd of sheep laying down next to each other on grass. A couple of men sitting next to each other. A large white double decker bus parked at a bus stop. A table is adorned with red, yellow and green fruits and vegetables. A picture of a sidewalk in front of stores. close up of a toilet that looks like it is smiling a fire truck at an intersection resting on its side A couple of people carrying luggage through the snow. A bird perches beneath a multitude of clocks some baseball players playing as people watch on The couple are posing for a picture while he is brushing his teeth. Two men riding an elephant driven by a boy. Wide angle view of a girl in a living room watching television. A bathroom with the door opened to a toilet and separate sink and vanity area. Two people sitting back to back on a train. A red double bus is traveling down the road. 
A man and woman on a couch playing the Nintendo Wii. Two zebra standing next to each other on a hill. A WOMAN TAKING A PICTURE OF CAT IN THE BATHROOM A family open Christmas presents near a Christmas tree. A sleeping woman cuddling a cat in bed. Pedestrians walk on the sidewalk of a busy city A man is sitting by a river and brushing his teeth. A clean and bright kitchen with hard wood floors. A professional baseball player holding a bat on the baseball field. a bird in a tree branch with green leaves A plate with stir fried noodles, broccoli, beef and carrots. a couple of bowls with some fruit in them Two people on horseback are posing while the horses gallop on a beach shore. Man standing on shoreline by ocean holding surfboard A large white tank sitting on top of a green lawn. A white table with two laptops and a bag on it. Two hot dogs on buns next to a glass of water. A pizza toped with cheese and met on a wooden table. A photograph of a kitchen in the day. A group of giraffes standing around their enclosure A bedroom with a large bed with a white comforter. A boat floating out in the ocean next to a shore. A man on skis walks on the ground. a bedroom with two big beds covered with green blankets A ,man holding a boys legs learning to surf A male surfer performs a stunt in the ocean A bear reading a Christmas book in four separate shots A pizza with pepperoni and sausage sits on a baking pan. A group of kids that are sitting around a table. A bathroom with a white toilet and white tub. A morotcycle sits parked near a curb where two people are walking. Motorcycles standing in a row in a museum. Picture of an exterior place that looks wonderful. A toddler eats cake with his hands in his high chair. A thing leafy green tree branch with many oranges. There are many cows on both sides of the road. Two zebras are standing close in a field. A CD case is sitting on a bench. A cat laying in a bathroom sink, looking at the camera. People browse and relax in a wine store. 
a man sits on a bench while petting his dog Yellow fire hydrant with a blue top sits on sparsely cut green grass. A couple of giraffe standing next to each other near a fence. A person with their feet on a coffee table in a living room. A man sitting in a motorized raft in the water. in a baron field a heard of zebras move about. 2 seem to be fighting A smile white dog by a bike on the road. a close up of a person playing nintendo wii A multi layer platter filled with different types of cup cakes. A woman with pink hair riding a motorcycle. A plate with different vegetables and bread on it. Group of four zebras standing in a field of grass. A man holding a Frisbee about to throw it. The baseball player at bat is hitting the ball Three different road signs are stacked on top each other, as a man on a bike approaches. Two children are on surfboards in the water. A woman is seen in the kitchen cooking on a white stove A bus sitting on the side of the road. A man riding a skateboard down a street next to a tree. A dog that is wearing a dog collar smiling A small child holding a remote and a remote controller. A person on a snowboard riding it in the snow. A purple bird perched on a tree branch. Double decker bus that is blue and green A man with a playful look standing by a dessert. A skateboarder performing a trick on his skateboard. A man on skis on a snow covered slope. A man riding on the back of a bike. A man holding something with some beakers on a table like a science experiment. Three people are looking at their cell phones and drinking wine. A small child and a baby are lying down together. A baby is laying down with a teddy bear. A group of young adults play frisbee in a park. Car parked in front of a donut store. there is a baseball game on and a player is preparing to run A close up view of Italian mini hoagie sandwich. A close up of a man's hand holding a cell phone on his lap. A man playing baseball prepares to run after batting. 
a herd of cattle on the field grazing A bed with a purple bedspread on it in a room with a picture on the wall. Two very large vehicles side by side on a street. Two red two-story buses are parked outside of a building. a dish of food some small plates and a wooden fork and spoon A picture of a restaurant interior is taken through a fish-eye lens. A young girl standing on a field with a flock of birds. A man skiing down a hill covered in snow. A man standing on top of a field holding a bat. Two bottles of champagne sit in an empty fridge. A heart shaped cake with bear decorations on a pedestal. A trash can and a white toilet in a room. A couple of giraffe standing on either side of a tree. A made-up bed in a drab-colored hotel room. Two dogs are tugging on the same Frisbee. A man holds up a hot dog covered in toppings. a person wearing shirt and tie and looking up. A yellow packet sits on a wooden bench. This photo is shot from a side angle, capturing the dog looking out of the car window. A dog looking out the car window as seen in the side mirror. A group of people eating and drinking in a restaurant A clock tower overlooks the city and tells the time A hedge row with rock pillars and a blue gate with sheep behind the gate and a mountain in the background. A ship is coming in to port and is about to be docked. A red towel hanging in a black and white bathroom. A skier is posing in skis and with poles. A woman riding a wave on top of a surfboard. Children in suits and ties are standing together The bathroom is clean and ready to be used. Some green bananas and coconuts are sitting on a picnic table. Several elephants walking together in a line near water. A group of baseball player playing a game of baseball. A fully tiled bathroom with a bathtub and bowl type sink, and a wooden framed mirror. A crowd of people standing on loading platform between two trains. A cart by some water loaded with old traveling trunks. A stop sign is shown behind two trees. 
A man on a tennis court about to hit a ball A furniture store display, with a chair and set out A couple of people on the snow putting skis on. A building sitting along side of a street. a couple of zebras are standing in a field Women and a child in a boat made of tree trunks A woman holding a small boy while a man feeds him some rainbow colored cake. Smiling orange shirted sports fan using cell phone. A vase filled with lots of different colored flowers. An oriental temple of some sort somewhere in the world. A man is wind sailing on the lake. A woman holds a baby on her arm and both are looking forward at an enclosed area with two giraffes in it. Two baseball players from different teams holding their baseball caps against their chests. A shirtless man on a beach with a disc in his hand. Men and women standing and crouching in front of a door. A child is holding a baseball bat at a game. A man is holding something up that says PPK two people riding motorcycles on a city street A cat sitting on a towel that's covering a plastic chair. a black and orange cat in a shoe box and shoes A small bird on an orange chair back. An SUV parked in front of bock of businesses. a person riding a skate board on a street a stop light a md del line road A little boy is eating a donut with white frosting and blue candy. A zebra running through the brush tail swinging there is a man and a woman posing in a kitchen An old fashion oven is shown in dim lighting. A long exposure photograph of a tattoo'd man skateboarding. A group of friends gather on a hill to enjoy a day of sking A single tulip is seen in a small vase. Small bathroom with a shower with red curtains on it. A truck traveling down the street near a fire hydrant. A hand is near a pizza that sits on a silver platter. A woman jumping up from a wooden park bench. A double decker bus waits at a bus stop. 
a box holds some gloves and old-fashioned photographs, with ties hanging above The police officer is riding the motorcycle threw the streets. A silver vase sits on a wood surface with sprigs of silver leaves in it next to a leafy green plant. A bowl has a dish that contains broccoli and mushrooms. A zebra bends over to pick up a stick off of the ground. A bunch of bruised apples sitting on the cement A big green Ford F250 Pickup truck parked in the city lot A dinner of a pork sandwich and french fries, with beer as a beverage. Street sign on pole outside of building with windows. A window with so Michael light coming inside A boy riding a skateboard down a hill. A man with short grey hair talking on a cell phone. A crowd is shown walking on the street. a man that is on a skateboard on a ramp Three mountain goats sit and stand on a rocky cliff. A woman on a sidewalk against a wall on a cell phone. Two children are playing frisbee on the beach. A young man spreads his arms to steady himself in mid air, as he and his skate board soar over the pavement below the concrete stairs. The two cats are laying on top of the computer desk. The huge airliner has four engines on it's wings. A double decked bus from behind in front of building. A tall giraffe is observed by people at a zoo. A snowboarder makes a somersault on a snowy course. A child drinking from a bottle in one hand and holding a remote control in another. A man sits at a table and takes a drink of his beverage. A flower bouquet in a glass vase and some writing on the photo. an old picture of a person riding a bicycle A man stands next to a very small plane. The yellow fire hydrant is rusted on the sidewalk. People walking and waiting around a baggage claim area. a person wearing a suit and tie A man who is holding a tennis racket. A black dog laying on bed with a striped comforter. A microwave oven door with a light bulb on inside it. Car's driving on a city street lined with houses. 
There is a vase with red, yellow, and orange roses. A cheesesteak with a bite out of it along with someone else holding one. Two teams compete for the ball during a soccer game. A couple of neon signs sitting above a bar. A kitchen stove with a microwave in the cabinet above. A set of electronics and appliances sitting next to each other. A pole with a clock on the top of it and a building in the background. A very uo close and personal look at a sugar glazed donut. A lady tennis player is bent over slightly and off the ground. A basket filled with items on top of a table. A stack of plates is adorned with pictures of round cats. Horses pulling carriages on the sidewalk along the ocean or a large lake A women who is swinging a tennis racket as two others watch. A white vase filled with different colored flowers. A man posing next to a couple of bikes on a street. A girl sits between a mans legs on a skateboard. a man is in the air on a skateboard A red sports car parked next to a truck. Three men ice skating in a line while one juggles a Frisbee on his head. A group of people riding skis on top of a ski slope. A bride and groom are sitting outside on a bench. a couple of guys that are standing up with a wii remote A parking meter on the side of the road. Empty wrought iron bench outside the house on a tile base. A cat drinking water from a toilet in a bathroom. People are sitting and eating in a cafeteria. A group of three people riding waves on surfboards. asian woman with umbrella smiles at the camera A train pulling several train cars full of coal. a desk with a monitor a keyboard and a mouse Man serving sliced pizza in brightly lit kitchen. Planes lined up at the airport arrival gates on a snowy tarmac. a bunch of kites being flown in the sky A surfer stands with his board on its back in the water. A subway car is coming down the tracks A woman walking on a stone wall near two giraffes and a zebra. A man standing on top of a snow covered slope with a snowboard. 
The fire hydrant is by the building on the grass. a silver car is parked in a lot A cat is sitting on the dashboard of a car. A plate of salad at a table setting with a glass of wine. Horse statue displayed on stand in park setting with trees and flowers. A child mixing food in a bowl on a table. A pair of skiers sitting down looking at the scenery from a top of a hill. Very finely made vases with painted designs on them. A man is in a restaurant eating sandwiches off paper. a close up of a bowl of broccoli on a table Accident scene with a fire truck tilted on its site. A room that has a couch, chair, and table in it. A plate of meat, broccoli, rolls and rice with gravy. A health hazard sign closing a beach to watersports with a sailboat in the background. Two men laying on the ground near parked motorcycles. A small airplane that is flying in the air near the airport A dog laying on the ground with a pink frisbee in it's mouth. Little boys playing soccer together on a field. A group of people wearing orange are standing next to a VW bus. two men standing in a room near two microwaves Fishing boats docked in a harbor with mountains in the background. A train can be seen in the foreground and a shipping dock in the background. A blue ceramic vase with fresh flowers on a window sill. Tall buildings surrounding an alley way with birds flying over it. Some people in white overalls working with some metal bars. A table in the kitchen of a building with screen walls There is no image here to provide a caption for. Clean plates, cups, and spoons drying on a towel. Two people seated at a table with other people in the background. Two birds are standing among leaves and sticks. A snowboarder holding a board while looking at the mountain. A large wave with some people on surfboards Someone is surfing the breakers under a sky filled with fluffy clouds. A sign with a gnome crossing symbol on it. Two people playing a video game on a large television. 
A busy city street with a traffic light on it. A black and white photo of a cow running in the desert. Some people are flying a kite at the beach. A man sitting on the floor with a laptop as others walk by. An old bus being driven by a beard man. a clock is up near a statue of a bird A man shaving in a large bathroom mirror. A man stands at a counter with food items. A skier skiing between poles on a ski course. two girraffe standing in an open field with their necks crossed A bus with the windows broken down sitting in the open area. A girl on a surfboard that is on the ground A freeway is busy in the late evening. A garden with yellow flowers on a sunny day. A child skier is headed down a small slope on their skies. A group of people stand in shallow water near a wind farm. A white toilet is shown in an all black bathroom. A laptop is powered and sitting next to a mouse and a cell phone. A train that is on the rails in a station. A man rides an elephant as it crosses a river. A boy is eating a dessert on a table. A woman, man and child standing near a food truck. two hands are holding white video game controllers A fresh fruit plate with grapes and oranges. Two female tennis players shaking hands over the net. A group of people are on a platform above giraffes. The boy is sitting on his blue suitcase. A large boat is carrying a smaller boat through the water. A small clock on a pole in front of a building. Two beach chairs with towels draped over on a beach. Lawn area outside a McDonalds, no customers, appearing closed. Some men are putting lots of bananas into piles A cat is looking up next to a large television. A man dressed up in renaissance clothing talking on a cell phone. Four soldiers and retired officer jointly cutting ceremonial cake. A cat sitting on the edge of an open car window. There is a woman that is riding on a bike A traffic light with a red light and an arrow pointing to the right. A group of horses stand beside water and grass. 
A group of bikers riding down a busy city street A large cat is laying belly up on the bed A fire hydrant is sitting on a sidewalk. A lone zebra standing next to a sheep in an enclosure. A woman and a man with a surf board on the beach. A red fire hydrant raised up in the grass. Tagged animals are grazing on grass in a field A photographer with his nice camera walking in a dirty road Person in a black wetsuit and gorilla mask carrying a surfboard on a beach. A woman sitting on a curb with her feet on top of a skateboard. A room with a large clock sitting next to a wall. A motorcycle sits on the side of a building. A young surfer rides the side of a wave. A meal of broccoli and some kind of meat. A bunch of cows in a field with a man standing near the fence. a vandalized stop sign on a city street near a pole Old style computer with keyboard and mouse sitting on rug. A close shot of two separate trains. A heavy set woman wearing a gray sweater holds a brown teddy bear. A kitchen with a counter, refrigerator and a dishwasher. A hotel looking room has another room through the door. A grass yard that has a large sheep laying down on the grass next to a dog. A woman and a little girl in blue are making pancakes and another person with her hands are putting on some cheese. A large brown dog standing on the side of a small road. A bunch of people who are standing around a table. A dog is lying on the couch with its head on the arm A bird sitting on a branch looking away. There are moving motor vehicles on the road. A woman playing tennis on a clay court. A blue and gray commuter bus traveling through a shopping district. A person sitting at the edge of the surf in a wet suit. a black and white cat wearing a neck tie A giraffe is in the wild standing next to a tree. A group of people sitting around a table. a suit case on the floor with a hat on it A little girl sitting on a bed with a teddy bear. 
a person in a blue jacket and is rowing a red kayak A paper plate holds two slices of pizza. The giraffe is walking beside the chain linked fence. Two elephants are walking through the mud in a clearing. A turkey that is cooking in a large roaster oven on the counter. A man eating a piece of pizza on top of a plate. A cat sits on top of a laptop computer. A room of people standing around playing video games A small black and white dog with its head on its paws A small beige dog with short curly hair. No dogs, only teacup poodles OK sign and fire hydrant. a couple of people are typing on their cellphones A woman is holding a phone and sitting in a chair. A pan has fruit and vegetables on it. A white dog stands on the back of a sofa. A bedroom scene with focus on a bed and a teddy bear. Men work on the basket of a hot air balloon. A hand holding a small orange Japanese umbrella. a statue of a cat sits next to some scissors A great view of a street in the picture. a man is doing a trick on a skateboard A man standing on a tennis court holding a racquet. An armchair with a stuffed bear on it on the sidewalk. One small and large giraffe standing next to each other. A cat sitting on top of a grey cloth and next to two staplers. A older man enjoying a variety of pastries and breads. A couple of fans have painted their faces red in a large crowd. A person holding a hot dog on top of a bun. Woman sitting near a table eating a cake. point of view shot of man using a small urinal in bathroom Man throwing a disc at a bush park. A zebra eating grass on a sunny day. a person riding a surf board on a wave A vehicle with Melbourne Tigers painted on the side of it. an orange small van and a white surfboard A gray remote is sitting next to a black remote on top of burgundy fabric. A man sitting on top of the snow holding skis. People are meeting around a circular table. A train stops at a vacant train station. 
a ball player holding a bet and some business men some people in a room with tables and two are playing a video game Two women on a park bench looking at a digital camera. A group if people that are sitting on a park bench. A red fire truck toy on a table. A man and woman on their cell phones by an umbrella. A dog is watching a man ride a skateboard on his stomach. A carousel view shows the circular.. lighted center and several rides, including horses, a giraffe and an elephant. A man is holding a large pepperoni pizza. two birds standing together on a rock A table with utensils, glass, plate of bread and salad, and stones, on a stone patio with chaise. A group of people standing around each other near a tent. A group of elephants that are in a field. there is a flower in the glass vase on display A tennis player in a blue shirt runs toward a ball. Two men are skating on their skateboards in the middle of the afternoon. Cows grazing on the side of a mountain covered in green grass and trees. A small locomotive engine blowing a cloud of steam. a couple of people on a tennis court pose for a picture A person wearing stiletto heels laying in a bed. A bride and groom exchange a fork-full of cake on their wedding day. A nerdy woman brushing her teeth with a friend nearby. There are lots of kites in the sky by the beach outside. A man throwing a Frisbee on a sandy area. Two oranges on a cutting board with a zester full of rind on a counter with pot in back. A TV sitting on top of a wooden dresser. A dog with closed eyes sitting on a cushion. a close up of a cat walking on a brick surface there is a cow that is drinking water from a hose A woman helping along man put on a tie. A boy laying on his side typing on a laptop computer. A pair of dogs lie down beside each other on a bed. THERE IS A SINK AND A SCREEN DOOR IN THE HOSUE Four individuals on a basket ball court, one of them holding a tennis racket. People standing in the sand flying colorful kites. 
A large fancy clock on a building showing the time of 1255pm. A blurred picture of a laptop and a box of tissues. A couple of one way street signs hanging on a traffic light. A bull and two calves block a vehicle from going down a road. a white plate with meat and a green vegetable on a glass table A subway train painted with graffiti pulls up to a platform. A person surfing a wave on a yellow surf board in the ocean. A dog is sitting on a work bench in a shop. Crowd of people at public market in urban setting. A man uses an oar as another man looks on a adult sheep stands by a tree as some baby sheep look on An old man with a tooth brush head under his nose, mimicking Hitler a little girl with her teddy bear sitting in front of a morror A group of older people sitting next to each other eating cake. A police motorcycle is parked at a festival procession. A guy walking on a field holding a Frisbee. an image of a person making a video game character A small bathroom, with only the toilet and sink visible a person riding a skate board at a skate park A bulletin board filled with blue pamphlets on a city bus. A small bird sitting on a thin tree branch. Two people and their dogs skiing along a trail in the woods. A living room with red carpet and blue couches. A man sitting at a desk with a laptop and a coffee mug. A herd of sheep and cattle standing on a lush green hillside. A woman in plaid shirt looking at a bird on a ledge. three baseball players standing around a base A large airplane flying high up in the sky. Sunset seen across an expanse of calm water A man is holding an umbrella beside a truck. Stalks in a ceramic vase against a mustard background A man in white hoodie sitting in front of a leather couch. The baby elephant is walking with a small object in it's trunk. Male surfer in a wet suit, on a board, about to be overcome by a breaking wave. Baby elephant with ears spread standing in front of larger elephant. 
A photo of stuffed large animals taken through glass. Two plates full of breakfast foods are next to cups of coffee. The two slices of toast each have cheese on them. Two young men trying to catch the same frisbee. A flotilla of small boats circle around water buoys. Two colorful parrots perched on a tree branch. a group of people watching a baseball game A photo of a yellow and green fire hydrant. A yellow teddy bear on a little girls bed A train station with a red,white yellow and blue train pulling in on the tracks there are many cows that are laying in this barn White dog sticking its face inside a white toilet bowl. Two guys in a bar eating pizza and drinking beer. A white carriage with a white horse carries passengers through a city square. A man crouched down with a camera next to a small white horse three laptop computers sit on a table in front of a television playing the opening scene of a Star Wars movie a teddy bear with a hat placed on his head A brown horse with pink and black harness stands before a business with a short white fence. a bathroom with a very dirty toilet and sink A stainless steel sink is next to shelving in a room. A small dog is beside a laptop computer. A long row of buses driving bumper to bumper near trees. Road sigh on wooden pole shown upside down next to white wall. Two leather clad motorcycle riders on a paved road. baked round bready pieces of food piled on a plate next to bowls of vegetables and other sauces A red vase with dozens of roses sitting on a piano. A man performs a trick on a skateboard in a skate park. Snow-dusted evergreens and rolling hills mark the distance, while in the foreground a hunched over skier moves through a dip between two snow-packed slopes. A dog is running on the beach sand. A lone horse shades himself under some trees. A pan with pizza and its cutter on it sit on a stove top. A half eaten cake sits on the table with a knife. 
Three people water skiing at the same time while folks in another boat watch A woman in a bar is wearing a tube dress. A neat and modular kitchen with electronic gadgets and dining table and two chairs. A clock tower and other buildings in a city. A chocolate cake sitting on top of a table. A woman and a man playing an interactive video game. Some men hugging each other and a person with plates on their head and shoulders. Several people are sitting around a table having a business meeting. A woman is checking her phone outside on a fall day. A young boy who is eating a chocolate doughnut. A man who is sitting at a bar. A man wearing a red neck tie and a blue jacket. A display shows hummus and vegetables on white trays. A black train sits on the tracks of a station. Table with food on it including bananas and rice. A skate boarder is doing stunts on a bench. A hand is cutting into a large white cake. An old truck with a broken side view mirror. A lady with a brown hat and long white socks sitting on a wood bench. Sun peering through leaves of a grand land scape in distance A group of friends posing for a picture at a deli. A large passenger jet on an airport runway near the coast. A man standing next to a white horse. A baby girl is using two brushes to arrange an older child's hair. A young girl is sitting on a bench in front of a rock cliff. A dog laying on a bed under a pillow. An adorable grinning girl laying in bed between ms piggie and kermet the frog. Coffee and powdered sugar doughnut on a woven cloth. Two pieces of meat covered with gravy next to broccoli on a plate. A flock of birds sits on top of a large giraffe. A statue of man sitting on a bench overlooking the ocean. a train on a track near a platform with people near by THERE IS AN INSIDE OF A KITCHEN IWTH A STOVE AND A DOOR A baseball player is holding a base ball bat at the game A young man ridding a surfboard down the rapids of a river. A red stop sign under a street light. 
One man is on skis and another man is behind him as they stand in the snow near a pond as a group of onlookers stand off to the side. A toy truck sits on top of a table. The small bathroom has an electronic toilet near the sink. A clean handicap restroom with plenty of toilet paper. Two multicolored cows cross the road very slowly. A airplane in a field with a freight train going by in the distance An equestrian riding on the back of a horse at an event. An picture of an old building with two towers and a clock is taken from below. a computer that is sitting inside of a room Three young people sitting at a table and enjoying some lunch. a brick building with a blue sign on it in front of a metal pole A group of people posing for a photograph at a black tie event. A man sitting in a chair while working on her laptop. A man is jumping as he tilts his skateboard to the side with his feet. A hand holds a ball in a green sign that sits on a post. a woman stretches high to hit a shot in tennis a close up of a dog laying on a bed The people are drinking from cups and smiling. Two ovens next to a plastic bucket and trash container. A giraffe enjoying the company of another giraffe. A warning sign in front of train tracks. a laptop with some other electronics on top of it A bench is in front of a flower bed. Three airplanes are lined up for take off. A couch is sitting outside on the curb by the pole. A cat relaxes in a suitcase next to a pile of clothes. Cooked broccolini and greens on a white plate. A leaning stop sign has a street sign on top. there are many vegetables sitting on this counter Children looking at a zoo giraffe and its baby a motor bike sits parked on some ply wood A donation station on the side of the street A group of vehicles in street area next to a building. A sumo wrestler is shown wielding a baseball bat and awaiting an incoming pitch. A bathroom sink with no mirror behind it A shiny new racket looks down upon the worn shoes of a tennis player. 
Three people in the distance riding horses along the beach. A young skier headed down to the ski lodge. The beat up car is parked beside the building with a statue of several men. A large long train on a steel track. A dog is on the grass playing frisbee. Two zebras chasing each other in an enclosure. There is an animal walking along the hill. A group of long horn bulls in a field. A man is laying on the floor of a hotel room next to an open suitcase. A view of a dock with a lighthouse in the background. a person holding a hot dog with mustard and ketchup. A man on skis on a mountain trail. There is a hotdog and a side dish on a plate. A black and white picture of a jumbo jet parked on a runway. A black and white photograph of something I cannot quite make out. A zebra standing on a field next to lush green trees. The man is sitting on the bench by himself. A girl is petting a cow through a fence. A woman wearing a short skirt kneeling on a tennis court. Two black, white and orange stand on the grass near a cliff. the baseball players are talking on the field. People are walking on the street by a homeless person. A grey clock tower above grassy area and building. An elephant standing next to a body of water. A person trying to fly a kite on a beach. A dog with a bandana and goggles sits on a red motorcycle. A pile of identical teddy bears lays on top of some pillows. A couple of red double decker buses sandwiching a small white bus. A grove orange trees filled with juicy oranges. A man riding a snowboard down a snow covered slope. People and cattle standing at the waters edge on a bright sunny day. a little boy retrieving a mans banner flags that have broken an overhead view of people at desks working on their computers A person in a wet suite running beside the water, holding a surfboard. A pair of seagulls resting on the top of a lake. A close-up photo of a piece of broccoli upside down. A large tall tower with a clock on the top. 
A snow boarder in mid trick with the Hilton in the background A bear and a dog sitting together on a hillside. A person that is catching a ball in a baseball game. a guy standing in front a large building holding a tennis racket Three people sitting on a bench in front of a lake. A bus driving down a city street next to tall buildings. A couple of little kids sitting in the grass. A black plate topped with lasagna and garlic bread. a bus is driving down a snowy road near the days inn and suites A vegetable and fruit stand on display at the market Several bundles of green and yellow banana's hanging around a table. A group of people who are sitting on couches. Boy performing a trick in mid air on a snowboard The people are cooking out and have hot dogs on the grill. THIS IS A PHOTO OF A SITTING AREA WHERE SOMEONE HAS PLACED THERE LAPTOP A young child sitting in front of a pizza. A Siberian Husky dog is being brushed while he lies on the floor. A toilet area with bright and colorful wallpaper. A group of people playing a video game on Nintendo Wii. There are plenty of apples to choose from in this outdoor market. A woman and a small child watch a train as it passes. a lock on a door under a window Two vases of flowers are sitting on a counter top with bears. A class room full of students and they are on their laptops. An old stoplight with a clock and a troll doll next to it. Buses are parked near a field with a fence. A lady is pulling her luggage through a terminal. A group of sheep gathers under a tree in a grassy field. A BOY JUMPING OFF A CRATE WITH HIS SKATE BOARD A black and white cat sitting on a suitcase. An older man downhill skiing down a slope. Large canyon surrounded by a series of trees. A waiter lighting candles on a cake at a restaurant. A man with skiing gear on top of a mountain with snow A man skateboarding on a skate ramp at night. a person spraying a horse with a water hose. A man and a woman ties a boat to a wall. 
I am unable to see the image above. A black and green vintage engine moves along tracks by a station. a see through sun roof cover is being used A person in a room with knives and scissors hanging on the wall. A train pulling several carts traveling down the rail road tracks. A table with various meets, breads and tomatoes on it. The two ladies is outside talking in the rain A woman in a plaid cap cross-country skiing in a group. A small child with a kite walking on a beach. a man sitting in a lawn chair in the snow A window with a bench is under a staircase. a sign on a pole advertising free bus rides a bunch of students stand around on the field behind some school buildings, playing Frisbee A family who are selling bananas in a portable cart. a person is holding an old cellphone outside Girl in a dress throwing a red frisbee. A group of people in the ocean on surf boards. A large crowd of people gather in a square with Capitol Hill in the distance. A plane with a sign attached to it flies high over an ocean beach. The kitchenette has a stove, microwave, and sink. A red, white and blue train filled with passengers. a close up of a vandalized stop sign A woman playing tennis holding a pink umbrella a man and woman recline in a bed, each with their own laptop There is a purse on the floor with its contents spilled out A bunch of surfboards are standing in a room. A man and a woman are cutting a cake while others watch. kitchen with a wooden kitchen island and checkered floor The two elephants walk next to each other in front of the water. A bus filled with little monitors displaying video. A laptop sitting on a couch with cell phone on a table. A tile floor in a bathroom with a urial. A jet airliner is on a runway on a cloudy day. A vase of flowers sitting on a checkered table cloth Two vases filled with white and purple flowers. People walk up the stairs to get on a small airplane. The young child is eating from a spoon. 
An end table with a vase, remote, phone, candle and wedding picture. large tourist clock near a body of water. A parked motor bike on the side of a the street. A bowl of fruit is presented with a pitcher of water. A woman that is hold a device in her hand while standing on a court. A strawberry shaped cell phone holder hangs from a belt loop. A silver train traveling into a train station next to a platform. Two women, one with glasses standing next to a sheep pen A man is in the park with a hula hoop. A car that has the front of it open. A street scene with signs and people on bikes. A plane is flying in the air nearby a mini van and rental truck. A bird perched in the top of a leafless tree. A stream with rocks outside by train, with hills and evergreens. A picture of a gross looking cheese pizza. A door is open on a white subway train View from behind of two women under umbrellas A man or woman skiing down a snowy hill. There are sheep grazing together in the grass. an image of two horses at an outdoor park A woman and two men on skis on a snowy hillside surrounded by trees Two women are riding motorcycles down the street. Two street signs on top of a metal pole. a brown horse with a brown nose laying down A hazard yellow Navy plane sits in a hangar. A couple of people playing Frisbee out side. Several remote controls piled up on a flat surface The people are moving across the snowy mountainside. recovery tow truck towing a bus from a parking lot A living room setting with two bookcases with books a small refrigerator on the floor next to a freezer Zebras are grazing next to a car in a field. The curious kitten is looking down into the bathtub. a yoilet is int he middle of a clean bathroom A young girl puts a TV remote control to her face like a cell phone A man engaging in a game of tennis on a court. A clock on a red building letting people know the time. A bunch of green bananas tops a large bloom. A brightly painted bus pulls out of a parking space. 
Front half of a commercial airplane on a runway closeup with dusky sky. A vase of flowers sits near a window with a blind. A street sign in front of an old building in Ottawa, Canada some elephants are standing around some water A computer mouse, mousepad, and computer keyboard on a table. This is a picture of a popular sking mountain. A woman walking on a tennis court holding a racquet. A man that is sitting down holding a telephone. A sand area that has various sets of vehicle tire tracks on it and one beach umbrella open and set up in the sand. an old man holding an umbrella next to a bare tree People fishing and enlarge Mountainlake with trees lining the shoreline A tractor sits on the back of a large truck in front of a clock tower. A boy feeding a giraffe something green with palm trees in the background The clock tower sits in the middle of the pavilion. A sign indicates directions of travel in a circle An elephant standing in dirt under a tree. There is a giraffe that is standing at the fence and someone is petting the giraffe A young dog lies on a freshly made bed. Three green birds perched on a limb with the sky in the background. Three good friends having a bit of lunch and drinks together. a person walking on a city street with signs and poles a baseball player holding a bat on the field a group of people on a paddle boat in the water Some bananas are placed on a cutting board along with some yogurt and a package of creel. A white toaster in the middle of an asphalt road. We see a close up of a vegetable ad pasta salad. A couple of men playing a game of frisbee. a diced up credit card next to some scissors A blue and red airplane is flying in blue skies THIS IS A UP CLOSE PHOTO OF A PLATE OF FOOD Red traffic light at intersection on paved four lane road. a cargo truck is loading a train with luggage Kites flown in large grassy open area with numerous onlookers. Motion capture shots of a person riding a snowboard. A large long train on a steel track. 
Two black bears being kept in an enclosure A man snowboarding down a snowy and hilly slope A female tennis player gets ready to begin play. One tennis racket is place on top of the other one. A man on his motorcycle is attempting to mount before taking off. a man holds his racket out while on the tennis court. A man in white shirt and shorts playing a game of tennis. A baseball player holding a bat while standing next to home plate. A cat lying on carpet with its head on a banana. A Macbook is placed on top of a book. Many brown and white cows are in a dusty field. A woman holding a tennis racquet walking near a little child and a man. a person holing a hot dog with onions on it A black and white photo of a glass bakery shelf. A man on a boat preparing his fishing pole. A football game on TV reflects in a bathroom mirror. A baseball player holding a bat next to home plate. A couple buses are parked in a parking lot at night. A light brown dog laying on a leather sofa. THERE IS A BABY THAT IS IN FRONT OF THE REFIGERATOR DOOR A man is filming something on a cellphone A little boy holds a toothbrush in the bath. A photo of a bathroom sinks and tub taken in a mirror. A cat sleeping on the contents of a piece of luggage. A laptop, mouse, cell phone and a notepad sitting on a table. Two giraffes grazing the in wilderness with a mountain in the background A rusted locomotive on a hot summer day. A group of men standing next to each other holding snowboards. A man in a baseball uniform walking on a field. A train travels along the platform in a train station. The table has most of the items needed to keep in the repair shoulder bag. A bird is perched on the branch of a tree. A metal stove sits under a granite countertop in a kitchen. A metal refrigerator freezer inside of a kitchen. A person in winter clothing, a helmet and skis, doing a trick i the air with jos skis crossed. 
there is someone holding a remote in there hand There is a rowboat out on the water in this sepia tinted photo. A bathroom with a shower and toilet decorated in pink and green. A woman in black standing near a bus stop A black and white motorcycle parked on the sidewalk outside a store. Lights shine on two matching, white, pedestal sinks. A man with a dog in his backpack walks down an aisle on a bus a clock on a building with a sky background A new silver motorbike parked in a garage. A black dog laying on a bed under a blanket. A truck that is parked on the side of the street. A guy with a broom and dog stands on a surfboard. A young person laying down on a surfboard riding a wave. a man and a woman are playing a video game together A boy is sitting on a hospital bed. Zebra alone in a field of dry vegetation. A chef preparing food inside of a kitchen near a window. some people on some grass playing frisbee and some trees A young boy standing on his tip toes playing a game on the Wii. a surfer laying on their board about to catch a wave A clock at 7 during a hazy autumn time of year Two laptops sit atop a desk on either side of a phone. A man and a woman sit together on a bench. dressed up toilets in a toilet competition on fake grass A group of sheep are out in the fog wandering. A baeball player I l9e standing in a field A brand new black stove in a primarily white colored kitchen. A herd of cows walking across a river. A baseball player slides in to base to try and take it. The plane is flying very low to the ground. A parked red and white motorcycle is shown from closeup. Black British fighter jet doing a barrel roll. a rusty old truck sitting in an overgrown field A pair of men looking at a tablet perched on a table. A pizza sitting on a table outside a sotre front with bread in the display window A cat lies on a rug and chews on a banana. A bed is in a bedroom with two lamps on nightstands. Woman as seen through window of red vehicle. 
Spectators watch as a skateboarder performs a trick on a ramp. A girl is on her cellphone surrounded by fruit Two people using cleaning brushes on an outdoor monument. a lone zebra stands between some trees with a zoo sign in the background. There is a row of parking meters on the street. A baseball player throwing a baseball during a game. Cargo train is traveling on a track next to a forest. There is a man putting bread on a shelf. a tennis player swinging a racket at a ball A large commercial airline taking off from the airport. Tables of laptops are visited by various people. A person riding a skateboard through the air on a ramp. A woman swinging her tennis racket on a tennis court. A surfer balances on a surfboard in the ocean. A steeple of a large building, with a clock on it. An airport runway with a jet airplane ready for takeoff. A pair of people sit at a table with food and drinks. A laptop computer on a desk beside a paperback book. Bowls of soup sitting next to oranges and limes on a table. A fire hydrant is shown on a sidewalk with a brick building nearby. A large tray filled with tasty looking food. a boy throwing a Frisbee at night in a park a close up of a man with a mustache smiling A flock of ducks swimming on a lake together. there is a man in the water and a boat next to him A very crowded busy street with many signs hanging from tall buildings. a surfer is out at sea riding a wave A man on a skateboard passing by large glass windows. A train is on a bridge next to buildings. These are special repair vehicles used on train tracks. Person laying on bed by a window reading a book. a group of people seated around a dining table outdoors. An old fashion setup with cakes, candy, and tea. A skateboarder doing a trick in a parking lot. A bench with memorial bears and flowers on it. Up close picture of baseball batter wearing gloves and helmet. A woman crosses the street while she talks on her cellphone. 
Young adult male is surfing and riding the waves. a surfer walking down the beach looking at the ocean The young man is practicing on his skateboard. There is a person holding a Wii remote in their right hand while holding the nun-chuck in their left. A man riding a wave on a skateboard in an ocean. A cat is laying on top of something on the side of the road. A man is talking on the phone while working on the computer. an older person passing out plates of food to young people A person on a snowboard is sitting in the snow. A team of baseball players are posing for a group picture. A plate of food with broccoli and different kinds of pasta. A woman laying on top of a surfboard next to a black cat. A bedroom filled with bunk beds and a latter. A few kittens in a bowl in a white void A small dog wearing a colorful sweater leaning out a car window. an open pizza box sitting on top of a stove A donut with red, white, and black sprinkles. A woman running to hit a tennis ball with her racket. a baseball player holding a bat in a batters box Military plane is being flown by a pilot A cow stands next to a calf inside a fence. there is a man on the beach that is flying a kite Three boys playing a soccer game on a green soccer field. A bowl of corn chowder with broccoli, and a spoon on the side. an image of a group of people surfing A man surfing with a photoshopped character on board in front of him Two zebras grazing in front of a large bird A zebra grazing on long dry grass in a field. A man and woman looking at their cell phones. A dog has a frisbee in his mouth outside A young man swinging around a Wii remote. a field full of windsocks and cars parked in the background A couple of deer standing next to a zebra on a grass field. A stylistic pot and vase sit on top of a mantle. Two stacks of towels are laid out on a bed. A classic car waiting at a 3-way stop sign. A bathroom scene complete with a toilet, sink and bath tub. Two women sitting next to each other on luggage. 
A dog sits waiting while his owner cuts some meat. A living room with chairs and a wall of windows looking to a patio. Room with a lamp on a wooden computer desk. A woman is reading a book with her head cupped in her hand, as she sits in front of a park. Two towels hanging over a shower rack in a well-lit bathroom. Five jets flying in the sky and making colored smoke. A young child eats a hot dog on a bun. A train passing by a field that has been cleared. A cat who is sitting in front of a keyboard. Snowboarder going down the side of the mountain of snow. A bird is sitting perched on a branch. There is a black cat that is sitting on top of a toilet a little cat standing on the lap of a man sitting in a chair a kid is riding on a surfboard at the beach A small dog sits on the driver's seat of a car. Two toilet stalls in a bathroom with a black and white checkered floor. A giraffe standing on top of a grass covered field. Two people siting together on a bench matching . Several people seated at a table with pizza. a small white hand held remove control device. A person driving a motor bike through the sand on the beach flying a kite. Two frames of a woman with a tennis racquet. A woman with blue hair and a giant toothbrush Four guys with game remotes playing a video game. A photo taken from a plane looking down at the mountains. An airplane moves along a runway at an airport. two guys sitting next to each other with laptops A man jumping to strike a tennis board with his tennis racket A piece of cake sitting on top of a plate. A kite surfer flying above the ocean in his wetsuit Men and women sitting under umbrellas on the beach. Vehicles at night on a highway near a large hotel. a double decker bus is parked in front of a store A large teddy bear sits next to a red wall inside a toy store. An abandoned red train car in dirt lot. Two women sharing a plate of breakfast are happy. A porceline toilet sits outside on a sidewalk. 
This is a train that is parked near a building. A warped photo of an unoccupied bathroom in a home. The manufacturers box for the Nintendo wii on the floor Four people carrying luggage turning for a picture A dog with a purple Frisbee in its mouth. People standing on a sidewalk near a parked bus at night. 6 people gather and socialize in a kitchen. A bird flies over the water near an island. Green salad with broccoli and peas with fork and bowl a woman laying on a bed with a sleeping cat and poodle A man posing in his office work cloths A sturdy, small brown horse looks back as he walks through the hot sands. A young blonde boy sheers the wool of a sheep. a group of pictures of the same table with multiple trays of food a man is standing on a skateboard around people A fuzzy picture of a man on skis An old refrigerator is near shelves of bottles. A herd of sheep standing on top of a lush green hillside. Two puppies playing in the green grass of their yard. A teddy bear is sitting outside on a chair near flowers. a large kitchen with fancy counters and white cabinets The pizza has more sauce than cheese and pepperonis. A picture of a group of people surrounded by bananas. A person in a purple shirt plays frisbee golf. A stop sign and street sign stand on the corner of a street. The woman on the horse is racing the course. A rusted stop sign attached to a school bus A cow that is standing in the grass. A zebra taking a drink out of a basin at the zoo. A white horse pulling a horse carriage down a street. Cat laying on the floor wearing a tie around his neck Two white plates topped with french toast and fruit. A sub sandwich on a white plate on a table. A woman sitting in a car while her dog hangs out the window. A sheep in a field overlooking a lake and forest of trees. A pizza that is laying on a table. A young boy catches a soccer ball in his house Laptop and mouse sits on desk in front of computer monitor A pole with several street signs outside of a building. 
A boat floating on a river that runs through a city. A phone case, with a phone hanging on a belt loop. A young man standing next to a skateboard. A man sits holding jewelry near a woman. A cat sits on the seat of a motorcycle. A picture of an airplane flying high in the sky. A pickup truck with a camper is in a parking lot. A boy skateboarding down the a busy street A train sits on the tracks by the platform. a microwave sits on a stands with a vase on it A food entree is served on a plate with skewers. A man sitting next to a Wii machine with a Wii controller in his hand. A building outdoors on a town street near some street signs. a man wearing a wet suit in turbulent water A group of cows walking across a grass covered field. The modified school buses are in a muddy arena. An assortment of different pottery on elevated shelves A parking meter in front of building windows. A woman stands with her green and black luggage. People with a without surfboards watching a surfer in the water. a blue vase holding some flowers next to a wall with a border A clock tower in roundabout next to an ocean. A half eaten pizza sitting on a table next to stuffed animals. A street riddled with garbage and people walking, sitting and standing around it. A woman inside of a room with many items plugged into wall outlets. A child riding a bicycle with a lady sitting behind him. A kitchen table and bench made from a door A black bird standing in the green grass. a couple of women take a photo of a bath room Plane on the tar mat of an airport. an image of a woman eating food at the restaurant A view from the street of two traffic lights and a building. A clean bedroom with a tidy bed and large windows. A man is on a field kicking a soccer ball. A CITY BUS IS PARKED ON THE SIDE WALK Many different fruits that have been organized by types. A set of three piles of ripe bananas. a very big bus moving on the street with no people A newly married couple touching a strange mans hand. 
Two business men with colorful ties looking to the right. A breakfast plate including potatoes, biscuits and gravy. Two white toilets in a alley with a tiled wall. A old fashioned colonial dining room hutch and an anniversary clock on a shelf on the wall. stop light placed near the ground beside a white building. A man walks under an umbrella for The Bitter End. The streetlight has several different colored lights. A woman sitting at a couch with two cats looking out a window. A boat is parked on the side of the dock. A hand sitting on an open laptop computer. a couple of people that are playing a wii Three women are enjoying an outdoor lunch on a sunny day. A large black and white statue of a cow. A large building in the background with a clock and tower on the top of it and people walking in front of it down a sidewalk and paved area. A surfer is bent over riding the wave in to shore. an elephant in an enclosure at the zoo is walking A man in a tie and vest looks seriously at the camera. Several different kinds of vegetables on a black countertop. A woman stands next to a traffic light. Eight busses are parked in front of a field. A white refrigerator freezer combo sitting in a kitchen. Two older people walking two dogs on the beach with surfers in the water. The people are dancing down the street with umbrellas. a close up of a person in bed with a book A cake sitting on top of a plate with a knife A lady with a dog is talking to a lady and man. A toothbrush with round and straight bristles on it. Four people standing next to a net holding racquets. A clock that looks like it has melted sitting on the edge of a shelf. A peach cobbler is made in pizza style. someone sitting on the couch while they use their laptop Two elephants with grass in front of them in an enclosure. A very attractive and neatly kept bed room decorated in red . Meat and cooked vegetables served on a white plate. Man herding some skinny cows in a street. 
a tennis player stretching to hit a serve Bicycles in the bed of a pickup truck. A black cat sitting on top of a bathroom sink. a black cat is hiding in a box with shoes A bouquet of flowers in a blue vase contains roses and large leaves. A woman is talking on the phone and leaning on he xar A kitchen with a black stove top oven. Airplane flying over the top of a White Castle. A man walking his dog on a quiet country road. A man poses with a cane and purple hat in front of a woman carrying an umbrella. A man speaking to an audience in an auditorium. A long train traveling along train tracks in a train yard. a hand is holding a silver cellphone against a white background A flock of birds standing on top of a grass covered field. A picture of a bench outside by the water. A dog that is sitting by a computer. THERE IS A BLACK BEAR THAT S WALKING IN TEH DEN Man up at bat in a baseball game. A couple of people pouring a glass of wine. A bird is perched on a large rock near the shore. A pot on the range with different types of vegetables. some cupboards with a microwave sitting on top A person is holding a doughnut with coconut on it. The confused man is trying to read the sign. A brown and white cat sitting on top of a desk. A view of a sign on the side of a building. A body of water near a city with ice chunks. A plate with a chicken breast, ear of corn and broccoli with sprinkled parmesan cheese. a boy performing a skateboard trick in a skate bowl at night A baseball stadium with a crowd watching as a man holds his bat and another man throws a ball. A bus is stopped while three people are crossing. An adult and a baby zebra are walking through the grass. A man on a tennis court holding a tennis racquet. a yellow blue red and silver train engine and some tracks A small toilet in a wood walled bathroom A tennis player, playing in a stadium, in mid air. A tan dog's head poking out from a dark colored backpack. A bike is propped up against a building. 
A professional tennis player walks at the back of the court. Two horses grazing on green grass in a fenced in area. this is a close up picture of a roosters neck A cat that has just come through a doggie door. A dog in an open doorway with a pile of green bananas in front of the house. A skateboarder performing a trick next to a bike rider. A boy in a hat is smiling while holding a Wii controller. A cat lays between two parked bicycles in a black and white photo. The batter on the Ray's baseball team is celebrating a run, giving the incoming runner his outstretched palm. The woman is playing a video game on tv. The woman is flying the kite on the walkway next to the water. A bathroom with a wooden vanity and large wall mirror. An elephant with seat on its back standing by a fence. A little boy running on the beach with a kite. a desk with a ton of televisions and monitors on it A dog standing by a truck pulling a trailer. Boat that just crossed under a bridge on the waterway of a city. A beautiful young lady standing next to another beautiful lady and a man. a person riding a horse on a beach a young child standing in the kitchen next to an oven Two birthday cakes sitting on table beside each other. Bright and shiny red motorcycle parked on the street. Two people sitting on a bench by a tree outside a building. A man holding a tennis racquet pretending it's a guitar. A dog is lying on a bed with a red blanket. Teddy bears of all colors are in a big pile. Three boys walking along the beach carrying surfboards. Two bathroom sinks under two mirrors next to paper towel dispenser. A very cute cat sitting in a corner. A very big pretty bird by the water. Two small children ski down a snowy tree lined slope. Two women hold umbrellas outside a store with a young girl. Woman in a folding chair with surfboard beside her on the beach. A small bird perched on a fir tree Four zebra stand near each other looking at the ground. A woman prepares a fruit smoothie inside a blender. 
a street view of cars parked alongside parking meters on a one way street A man who is looking at a giraffe in an enclosure. A white bowl that includes carrots and broccoli. A white pickup truck is parked in a parking lot. Several planes are admired in an airplane museum. A woman walking down a street on a sidewalk. Two cats perch on the roof of a car. People are walking down the sidewalk in a storm, The sheep are scattered to graze in the field. a young kid stands in front of a granite table with a train on it A pink cell phone sitting beside a tree. A woman is milking a cow into a metal pail. Boy doing a skateboard stunt with feet and board off the ground. People line up in the snow for pizza and soda. A woman holding a umbrella over her head. Group of people watching something with man recording in room Two large pizzas covered in sauce and cheese. A billboard on the side of building features a bull. Two plates filled with hot dogs sitting on a wooden counter next to drinks. An airplane sitting on the tarmac with several service trucks around it. A glass table with pink flowers and green plants. There is a funny picture on the screen of the laptop. A dining table is set with many different dishes A bakery shop displays an assortment of cakes in a vintage case. A girl displaying a sad expression while she eats. A wooden trunk sitting outside with stickers on it. A city view shows architecture and people walking. a cat resting on top of a luggage bag resting on a bench seat A person holding an umbrella leans out the train door. There are two computer screens next to a lap top on a desk An artist's rendering of birds flying past a lighthouse. A boat is traveling on rough waters in the ocean. A cobble stone path through a park leading to a bench. A beautiful red haired lady preparing food in a kitchen. The guest of the wedding are gathered in a house. A train that is sitting on the tracks. A herd of elephants walking across a stony river. 
A smart phone is very companct and handheld. A surfer riding a wave in the ocean, performing a trick. some snow coming down on some street signs and trees Several women sitting in front of a birthday cake and laughing. A colorful railroad train arriving at a station. A piece of cooked pizza that is on a plate. there are two street name signs on a street pole a man standing in the park while holding onto a frisbee A box that is filled with oranges in the grass. A view of a sign that reads steep descent on it. A young man on his skateboard next to a rail. A plate of cookies, a bowl of carrots and blue frosted muffins A blue basket filled with bunches of ripe banana. A woman prepares a large pan of food. a sandwich with a bunch of mushrooms on a plate a person sitting wearing a suit and tie blue and yellow train carts on the tracks Multi-colored stuffed animals standing side by side in a shop. A guy with a pet sits in a parking lot a man is looking into an oven opening A herd of elephants walking across a field. A woman standing next to a man holding a cake filled with lit candles. Modern bathroom with two sinks a toilet and a shower A person holding a controller aiming it at a tv. Sleepy dog guarding two remote controls on the couch. Two checkered chairs and a clock in a room He is flying over the steps on his skateboard. An open box of pizza with toppings on a counter A truck pulling out of a parking lot onto the street Slices of vegetable pizza arranged on a white platter. This is a bride and groom cutting their cake The front view of a bathroom toilet inside a stall. a man is watching a television on the floor A teddy bear is sitting down wearing a bow. A person is standing in the intersection of a street. Two men in purple rush to catch a frisbee. A group of competitive cross country skiiers in a race. a white bathroom a sink toilet and tub A person that is looking at something down the street. The flat bed truck has a huge roll of tape on the back. 
a man swimming on a large wave in the ocean. A cartoon version of a bed and bedstand A caved in street with a bench in the hole A yellow train on the track at the train station. a cat is being fed by it's owner in a bed. Two young boys in shorts at park with hands raised. A young boy pulling a pink piece of luggage. A man who is holding a surfboard and walking in the water. A young surfer riding a very nice wave. Four photographs of a man shaving his face. A painting of a dog holding a dead duck in it's mouth. Some people walk on the sidewalk near a busy intersection. Row boat sitting in the middle of a lake by building A man riding a skateboard down the side of a ramp. there are many people that are flying kites A piece of cake with a fork and one and a half apples on the plate. A young man is riding his skateboard on the road. this is a man ridinbg down a hill on skis An elephant crossing the road behind a car that has just passed there is a pink rose in a glass vase A couple of men walking with a large elephant. a train on a train track on a city street A tennis player getting ready to serve a tennis ball. A man on a surfboard riding a wave. Some vegetables in a stew of some sort. A man riding a skateboard up the side of a ramp. People seated on a stone bench on cell phones. A photograph of a giraffe in the wild. People sitting at a bar with a lady turned smiling at the camera. Two men standing in front of a TV playing with a Wii. A woman flying a kite and holding onto kite string. Two people jumping up to catch a frisbee A man sitting in a chair playing a guitar in front of a microphone. a black gray and white cat is sitting in a sink Large motorized model plane parked beside air field. A young child at the table with a birthday cake and three candles. A very big display of many kinds of pastries. A baby sitting on a females lap staring into the camera. A boat tied up to the pier next to other boats on a clear day. A large clock outside of a window building. 
This is an image of a giraffe with a city in the background. A skateboarder has his feet off the board before a landing Cattle walking in open rutted field on sunny day. A girl making a "peace sign" with her hand and a woman holding a big black suitcase. A laptop is next to a desktop compute near a window. A basket ball player is posing in front of a basket. A large sandwich being cut by a person Large, mild waves are coursing towards two boats. A giant clock is on the wall of a brick building between two windows. A skier stands next to skis stuck into the snow. A plate that has a sub sandwich on it. A table has a plant next to the glass doorway in the kitchen. An old fashioned passenger train traveling through the countryside. a person sitting at a bench with a skate board A woman tossing a frisbee on a lush green field. Jockey riding a race horse on a runway. A horse grazing in a field witha blanket over its back. Two men in suits with one man leaning on a railing. A man and child sit on the floor with game controllers in their hands A blue and green plaid tie with a flag pin on it. A tofu and broccoli dish simmering on the stove A couple of chow dogs sitting in a car looking onward. A crowd is watching a woman play tennis. A small boy with a birthday hat on holding a tennis racket. The view from a motorcyclist's point of view, looking down a street. A man playing a guitar and other musical instruments Batter winds up ready to hit the baseball A chocolate caked frosted and topped with blueberries on a metal cake plate. A large clock mounted to the side of a pillar. A bird sticks its head into the water underneath a layer of plants. A baseball player standing in front of an A's poster. Woman in maroon shirt holding up a bagel. A couple wearing skis at a ski slope A white toilet sitting next to a large window. A red and black motorcycle parked on the sidewalk a yellow cat going after some corn on the cob A baby cow with his ears tagged with yellow markers. 
A red and gold painted fire hydrant on the street a cat sits on a wooden cluttered table Three Asian takes on hot dogs on display. A gray haired man is wearing a blue shirt and has a tie draped around his neck. a cake with a section missing sitting next to a burning candle These two riders are far ahead of the ones behind them. A woman talks on a cellphone while holding a pen. A plastic container filled with sliced carrots next to a yellow object. A girl swings a net a tennis ball. A cargo train that is traveling down railroad tracks. many small boats in a large body of water An older zebra and younger one nuzzle in a field The clock on the post has faces on four sides. A woman holding her head out the side of a train. A person on a motorcycle with a stuffed animal on back. A flower is put into strange pots next to a plate. Two zebras stand in the grass together near a fence. A man in a surf board shaping studio. very long and nice buses standing at the zebra crossing a window shoing a man standing alone on a train platform A red fire hydrant stands in the dirt of a stone platform. A box containing three round doughnuts and a fritter varies vegetables sitting on a black counter top An orange cat laying on top of a black piece of luggage. there is a man playing with a frisbee on the field A large market display of citrus fruits including navel oranges and clementines. A plate of assorted desserts and dessert sauces and a bowl of ice cream. someone holding a half eaten hot dog that has mustard and ketchup A bowl full of soap with a bowl of vegetables on the side There is a rug on the lid of a toilet and another rug in front of the toilet. A black and white modern bathroom showing he sink and mirror A hospital bed next to a blue chair a room wit ha chair a bed multiple windows It's a very elegant looking bathroom with double sinks a large mirror and a tub. A baby holding a spoon and looking at a pair of scissors. A woman rides a horse through a grassy field. 
a group of zebra drinking from a trough together A skate boarder reaches the top of a steep barrier. A plaque on the floor in front of a chair and grandfather clock. a bus painted in white, blue and yellow Some is holding a bottle of wine next to a huge hot dog covered in chili. A cardboard box containing a reef of glazed donuts. A bathtub with candles lit up around it and a stool next to it. A man wearing a blue shirt and an orange and black neck tie. A car parked next to a brick sidewalk on a street at night. A man is flying a kite on a clear day A picture of a lot of kites in the air. A black and white image of a young men on his skateboard. A girl using her laptop computer on her bed. The young man hurls his frisbee towards the metal structure. A clock above two pink colored stone arches Vegetables being displayed with each other in arrangement. A game strategy is hatched by the boy NOT wearing the boat like a hat. A little boy wearing a bib eating a doughnut. A bathroom with a urinal and tiled walls. A photo of a living room with a purple chair thete A little girl that is flying a butterfly kite. a man eating food at an airport terminal A child wearing pajamas holding a brown teddy bear. A slice of pizza with lots of vegetables on the top of it. a bowl with some fruit inside of it a man is on a surfboard with a dog a number of people standing near one another wearing suits and ties The kitchen counter is cleaned off and ready for us to use. Two young people sit next to a bunch of snowboards. An airport filled with planes sitting on tarmacs. A man on a horse during a race jumps over a hurdle The little girl is sitting in the chair eating candy. Two zebra standing in the trees next to a fence. The girl in purple is using her phone. A double decker bus driving down a city street Four sets of legs with one standing on a skateboard in the dirt A table filled with food on a patio A cross country skier traveling down a slight slope. 
A man and horse near a painted man wearing shorts. A man in a black coat sitting on a bench at night. a city street with bicyclists, double-decker buses, and many lights A book mobile bus from a library sitting by a street side. A man sitting in front of a tv with a Wii remote in his hand. A group of young children are petting a horse near the gate. The keys 1, 4, 7, and 8 are clearly visible on the remote. A laptop sitting on a bed near a window a rocking chair siting in a house next to a green lamp A busy street is crowded with umbrellas on a rainy day. A little kid standing on a household appliance A fridge with a bunch of papers hanging on it. A pole that has different types of signs pointing. Woman leans over as she serve the tennis ball back to the other side A toy monkey sits on a desk beside a laptop. A man in a suit and tie is smiling. A young lady with blue hair is holding her phone, posing for the camera. A group of horseback riders walk down a trail. Two giraffes eating the leaves off a tree. A flat screen TV mounted on a brick wall in a living room. A close up of the luggage claim at an airport with many suitcases. A baby with a teddy bear looking over his shoulder. The skier is jumping into the air above a half pipe. A tow truck driving down a rural road. Two street signs located above a stop sign. A group of actors and stage workers on the set of a TV show. a tennis player about to hit a tennis ball. The man is posing for a picture on his motorcycle. A person riding a board on top of a wave. A man in a wetsuit surfs a churning wave. a living room that has a couch and a chair in it A hand is seen pulling a piece of food from a toaster. A man and a woman stand under an umbrella at a street crossing on a rainy day. A group of people riding on the back of an elephant. A pulled pork sandwich with a pickle slice. A virtual woman in a rainjacket, carrying an umbrella. A yellow cat is among the camping equiptment. 
A pigeon stands on a window ledge overlooking a street. Two men are checking out several wines in a crowded room. Several sheep herding towards an outdoor pen on a county side. Yellow train on the tracks running parallel to the trees. people are taking samples of wines in a room next to an outdoor area where people are sitting Large white passenger bus parked in a parking lot. an elephant standing by some trees with it's trunk in the air A dad or grandpa looking at a child both are smiling. A close shot of a unique looking plate of food. a couple of bears are sitting near a glass A black and white dog examines something on the ground. a big propeller plan flying through the air A group of street signs in a display case in a room. A small green vehicle model is on display next to a busy city street. This is an old picture of a train at the station in Boyne City A group of people sitting on a yellow couch playing a video game. A zebra standing next to a tree on a dirt lot. A lady with a young girl standing in front of a few english muffins. A happy girl is showing off her Nintendo Wii. A man in a crowded room gazes into the distance. A series of street signs in French on a city street. A person in a cross guard uniform directing traffic. A man sitting in a chair with a laptop computer. A "One Way" street sign pointing to the right. A man standing next to a news stand on a street. A red bus parks in front of a building by a large tree. A white busted up toilet sitting on it's side. A young girl standing over a soccer ball. A gray vanity with three spigots in a public restroom. A small refrigerator sitting on top of a wooden counter. A boy is sitting at a table eating. A group of people is playing frisbee in a field There is a cat drinking from a faucet a close up of a bench near many plant life This is the head of a giraffe standing in a fenced in area. A plastic male doll is sitting on a toothbrush on its holder. Multiple items on a metal bar near an outlet. 
A big building with a large clock at the top of it . The street pole contains traffic and street signs. Two representatives from two different governments shake hands. A large black bear traveling across a grass covered field. A game of baseball being played in front of a large crowd at a stadium. A park with kites flying in the air A chair and a couch in a small room. There are vegetables that look like they have seasoning on them Two people riding horses down a sandy beach. A bunch of busses are in a lot. A plate filled with a chees filled meat sandwich with sauce. A bowl of soup, rice and fish by a woman. A group of scooters parked next to an old building. A man doing a trick on a skateboard off of a rail. A boat is running in the water with a low sun in the sky. The jazz band is taking part in a parade. A bunch of seagulls eating on the beach. A little girl enjoying a sweet confection and awaiting a sugar rush. Two giraffes stand in an open area with water and other animals in background. a vase full of colorful flowers in a bedroom A group of people riding boats on top of a lake. A computer desk with a computer on it and a chair in front of it. A tennis player trying to hit the ball. a very large animal submerged in water with two people near it An adult and a child sleeping in a bed. A lady with blue pants and grey sweatshirt playing tennis. a giraffe standing next to a tree with more trees in the back ground A closeup shot of several zebras standing together. a baseball player wearing green and yellow wearing his glove a giraffe standing on a field near a bush Woman getting ready to hit a ball on a grass court. A young lady taliking on a cellphone in the hallway at school. A woman holding a skateboard posing for a photo. A black and white dog on a brown tile floor next to counter. A black train traveling past a train station. a tray that has a plate and a bowl with food on it A woman sitting on a couch in a living room. 
A stop sign with lights lit up all around it. TWO TRAYS FULL OF FOOD SITTING ON THE TABLE AT A RESTAURANT A train on train tracks that run parallel to many other train tracks. A group of zebras are standing in a field. A young child in a white dress holds a teddy bear while standing outside. A man with a tie and a work badge Five loaded hotdogs surrounding a tray of cheese fries setting on a round table. A train goes through an intersection with traffic lights to stop traffic. an image of two men that are walking down the street A large sheep and a smaller sheep graze from a field. A small bulldog sleeping on a bed while wearing a pirate hat. A brown dog carrying a black frisbee in its mouth A long row of wood and wrought iron benches along a sidewalk. a young woman with a slice of pizza in her mouth A pitcher throws the ball towards the batter at a game. A cute little girl sleeping in a wooden framed bed. A horse connected to riding equipment walking in the street. A car parked on top of the curb next to a meter pole A date book is next to a phone, calculator, and a keyboard. a cat sitting in the refrigerator next to a gallon of skim milk and a bottle of gatorade A man is prepping a turkey in front of a bottle of wine. A wooden stop sign in a rural area Small children playing with toys and stuffed animals People sitting and eating in a restaurant. A group of young kids playing soccer on a grassy field. Tables and beach chairs on a sandy beach. Some unfinished looking wood is in a white bathroom. A baseball player slides into base while another leaps over him. A boy is jumping into the air on a skateboard. Children on a tennis court holding a tennis racket and tennis ball. a person riding a skate board at a skate park A young child standing at a table with a plate of food. a couple of elephants walk in a caged area The gentleman is taking a selfie while riding his motorcycle. The dog is looking at the toy bird being held by him. 
A bus driving on a street with people approaching it in the mountains. A man is peeing and has his behind exposed. A figurine with a plastic witches head is standing in front of a computer keyboard. a bi plane with a nazi flag on the tail A dog is standing in the grass with its tongue out. A red car is parked by a parking meter. A man sits on a blue and black motorcycle. a young man holds a snow board a tennis player swinging a racket at a ball A COUPLE WEARING YELLOW DRESS STANDING NEAR TWO HORSES. A television is on the beach near the ocean. A white bear sniffing on to some rocks a black and gray cat is sitting on a toilet A bike tire and a boy with a skateboard A sandwich with chicken and lettuce is on the table. Various sized knives are hung on a wall magnet. Two red traffic lights lit at a street corner A rhino and a baby elephant by a river. A girl is drawing on a birthday cake. A herd of cows is standing in a grassy field. A man laying on a bed bent like a pretzel. A brick wall with a blue and white sign next to arc. There is a building with surfboards outside of it. An old image of a pickup truck broken down on the side of the road. a mama goat and her baby walking on a slope Two men standing in a store aisle with one holding a baseball bat. A cat and dog are laying on a red rug. Train on tracks riding pass bus and couple cars on the street A bunch of animals out on the field. A woman standing on a beach throws a frisbee. Three workers stand behind a colorful fruit stand. There are several boats docked at the dock. A man standing next to a large elephant. traffic lights besides the road with so many vehicles An eagle soars through the sky near trees. a bathroom with a corner bath tub and duel sink. A man wearing ski gear and skiing downhill in the snow. large brown elephant making his surrounding look so small Standing in the ocean waves, a man flies a kite. a twin engine airplane stored at aviation museum. Skiers grouped up in front of a vancouver sign. 
A very large group of people are sitting at tables. Man and woman enjoying video game in living room. Several people cross-country ski on a snowy mountain. A man stands in a room with a cardboard box sitting on a chair. A man holding a racquet preparing to serve a tennis ball in front of a crowd. A plate with cooked meat and vegetables served on it. A toilet is standing in a room with a picture frame on top of it. A Yorkshire Terrier is looking out the window of a house. A person does a trick off a ledge on a skateboard. A very big airplane that is making a turn in the sky. A man lunging forward towards a frisbee next to three other men. an old black and white photo of a large building some big black cows in a grassy field A line at an airport with people and their luggage a nice neighborhood with some green grass in it A giraffe standing on top of a lush green field. Two skateboarders are racing through an obstacle course. Two giraffes and a zebra roam in a preserve area Little Asian girl holding a wii remote control. A computer monitor, keyboard, phone and various papers sit on a desk. A blue suitcase is leaning against a post on the street while a man walks by. A VW long van parked on wood strips on a grassy area. A skier pauses near the side of the course. Old passenger train making its way down from a rocky hill. Two women are standing playing with a nintendo wii. Large group of clothing sitting on top of each other. A mug of hot beverage sitting by a computer. a lake with a lot of boats on it A partially open door with a bathroom behind it. Many people are walking around in this square. Two flowers are on the blanket across a bed. Ingredients for a tasty bite, including peanut butter, oats, banana, preserves and syrup. Looking at a barge cross a channel of water under a cloudy sky a man putting a pan of food into an oven Several people boarding an old fashioned airplane in a field. Black dog jumping up at big screen television. 
A person is holding a banana that they are peeling. Box of dollar bills tooth brushes pills and spoons. A hotel bedroom with balcony overlooking the ocean. horses stand around on a neighborhood street in front of a car A large crowd is watching a baseball game. A zebra is grazing on scarce grass in front of a rock wall. Various luggage tagged and stored on numbered shelves A herd of cattle standing next to each other on a dirt field. A child dips broccoli in dressing before eating. Two black cats looking out of a window. A dog in a grassy area with eyes on a flying frisbee. A young male plays with a green frisbee. A room with some big equipment and a toilet. an elephant with a seat on it's back A boat that is sitting in the water. A bed next to two mirrors on the floor. a tour bus with a wi-fi notice parked on the side of the road A mostly white bathroom has a black toilet seat. A man on rollerblades at a crosswalk holding a sign that says slow. some buildings and a clock tower with two white clocks there is a herd of animals running infront of a man a person that is holding up a frizbee People hanging out in a kitchen eating and drinking Two animals that are looking at something in the wall. A banana and a vanilla bean are next to a shot glass. A young girl holds a Frisbee at a park. A person flying a kite on a beach Steak sits on a plate with broccoli and mashed potatoes, next to a glass of water. a number of people riding skis on a snowy slope A house plant on a sink in a bathroom. an image of a female tennis player returning a serve Perhaps he's a magician who will pull a rabbit out of that hat. The man is using the toilet with the bathroom door open. A person looks at their reflection in a bathroom mirror. Two adult giraffes and a baby giraffe are in a cage. A grupo of people in a field with tents flying kites. An asian dish topped with sesame seeds. Two dogs and a cat on a boat at edge of water. 
Several plates of foods including strawberries and vegetables are next to a sippy cup. A very long limo with a bunch of farm animals on top of it. A couple of umbrellas in a small room. A girl walking and talking on a cell phone. A man with skis walks through a snowy area. A view of a bathroom with a yellow towel sitting on the shower. A red truck moving towards a busy highway. Two people are walking towards some motorcycles to leave a market consisting of umbrellas over tables. A close up photo of parking meter on a street. A ship of people cruising along the water. A couple of giraffes are standing in the wild. A tray topped with sandwiches and cut up apples. A person on a snow board performing a jump on a mountainside. A large passenger jet flying through a cloudy blue sky. A man swinging a tennis racket during a tennis match. A computer desktop with a keyboard and monitor. a small white vase is on a table A bridge stands over a river before a city sky line. A group of people riding skis across a snow covered slope. A cyclist rides through a tree-lined path in the park. A man drinking from a glass on top of a night stand. A giraffe standing at a dirt road eating off a tree branch. A pooh bear is sitting upright holding a honey pot. a living room with a fireplace and a big brown chair A man riding a motorcycle across a lush green park. a couple of beds sit inside of a room a herd of giraffes on a dry grassy plain A group of cats sitting on top of a chair. Some strawberries floating in a bowl of pudding with sparklers added. A tablet, a laptop and a computer on a desk Bathroom with granite counter top and single sink. A train traveling along a rocky mountain side. A man and a child are dancing by the water. a girl and a dog are sitting on a bed hugging A man and a woman standing next to each other holding tennis racquet. A cat scanning the floor in front of an orange bucket A wet polar bear holding a green cone in its mouth. 
The pink Frisbee is laying on the snow covered ground. Two computers on a desk in a small bedroom a close up of a table with an ipod headphones and a remote A destroyed toilet and sink lying on the ground. two people on stage performing a song to a crowd Two females sitting on BMW motorcycles under a tent. A boy is holding his hands out as he jumps with his skateboard. A boat filled with produce and people floats on a river. A public official helping to feed some school children a healthy lunch. A red parking meter sits on the sidewalk. A man holding up his tennis racket . The shirtless man plays frisbee in the water. A group of people sitting at a restaurant table with food. A wall with vines and old tools strewn on it. A very big nice looking truck on a street. A slug crawling on the seat of a toilet A pole holding a traffic sign at an intersection. A radish on a cutting board next to a knife A bride and groom walking next to one another. I am unable to see an image above. A couple of women holding up a cake together. A fire hydrant at a intersection at night. A baby pressing a key on a laptop. there are many different pies on this table a bunch of people are sitting in a busy room people in a large sleigh being pulled by horses a close up of a person throwing a pair of scissors A blue vase with sunflowers and other flowers The snowboarder is standing on a conveyor belt with others. A couple of men holding a bunch of baby sheep standing next to each other. Three people in work uniforms and visors standing together in front of various types of donuts. a close up of the front end of a school bus A young child walking down a street past two nets blocking a road. Dog fetching a frisbee in a rough field. A group of kids standing in a forest. A person wearing a wedding ring has their hand on a teddy bear. a very decorated work cubicle with a laptop A catcher and a batter playing baseball in a park. A guy jumping through the air with a Frisbee in the air. 
Two large green and white jumbo jet planes on the tarmac. Twisted bars of metal connected to a tall building. The double sink in the bathroom is nice and clean. a horse and foal grazing on dry grass. A train traveling over a bridge spanning a river. Bunches of fruit growing on native trees shown on cloudy day. A man gets ready to hit a tennis ball with a racket. A lamp on a table in a livingroom And upload picture of some food in a bowl. Lady with a slice of piece in front of a stack of pizza boxes. A train is on the train track, which is surrounded by trees with autumn foliage. A kitchen area with a refrigerator, table and doorway. The person is taking a high jump on their skis. The skiers are getting ready to go on their run, A red two level bus with front damage to it being towed down a street. A small bathroom with a yellow toilet, sink area and shower. Seagull on rock with ocean and lighthouse in the background. Men in a teaching kitchen discussing all the visible prepared food. a wooden desk with a black and silver computer A train is going through the pretty country side. A herd of elephants drink from a river as two wander away from the group. A group of people standing in the snow with skis A picture of a person and a motorcycle on the street. A white bowl of food with a spoon. three giraffes standing up near some dry plants. A couple of men riding motorcycles behind a herd of sheep. A girl in a bikini sits on a towel at the beach and holds a pastry. A large computer screen with keyboard on a small desk in a corner. The zebra is standing behind the rocks in the exhibit. A zebra eating hay out of a container near a rock. A boy prepares to swing his bat during a baseball game. A foyer furnished with a sofa, arm chairs, and end tables. A motorcyclist is being followed by a familiar face. 
A person sitting at a wooden table with pizza, and some other foods on a brown paper bag in front of him A MAN IS RIDING AMOTOR BIKE IN THE CITY The giraffe looks like he is in the wild. a couple of giraffes that are outside a brick building A bathroom with hand towel, mirror, and sink. A man surfing on his surf board against the waves A statue is set on top of some banisters. a man walking through the water holding a surfboard A man in a car who is on a cell phone. A bathroom counter with a sink and various cosmetics and toiletries. A blender sits on the concrete next to some greenery. an animal in a field behind a fence A metro bus approaches an intersection where a traffic cop is directing traffic. a table covered in vegetables of all sizes and colors A traffic light with a pedestrian crossing sign on it's sides. A torn apart bathroom with a toilet in a bathtub. A man does a trick at a skating course. A double decker bus and a truck driving next to each other. Boats at a dock near a large hotel. Black and white photograph of a tennis team and their coaches Wooden benches are lined along the edge of the water. a group of people standing in the snow next to a building a couple of red lights are on a pole a man standing on a tennis court holding a racket A chef is instructing two women on how to slice vegetables. A woman looking up at the kite that she is flying. A red and yellow fire hydrant with the lid taken off. A man in a green shirt holds an appliance while another man stands by. A small brown teddy bear sitting on a white bed leaning on pillows. Three people sitting in the snow with snowboard on their feet. a very tall tree in the field with nice flowers A young boy standing next to a giraffe he can pet A young boy with a fish hat eats a snack. Skiers and snowboarders mill about on a mountain. A young girl standing in front of a book shelf holding a red tie A male skier navigates a course at the Vancouver Winter Olympics. 
a kitchen that has a bunch of people in it Happy girl in a green shirt holds onto her suitcase. a bath room with a toilet and a window A young man playing a ball game on a cement basketball court. Multiple boats sitting dormant on a lake bay. A bunch of people riding motorcycles down a road Assorted food items displayed in white dish on wooden table. A red double decker bus traveling down a city street Chicken wrap cut in half displayed on wooden board near silverware. a man in a wet suit rides on a surf board Two horses graze in a pasture in the setting sun. An airplane is flying in the air on a clear day. some people are riding elephants in the jungle A man hitting tennis balls on a blue painted tennis court. A person with a laptop sitting in front of a window. A baseball player runs across home plate after hitting the ball A purple swamphen with a red crest on its head walks on the ground. A delicious looking hotdog sits in cardboard with tons of toppings. It's strange to see a bow tie with a military uniform. a black cat sitting on a cement patio An infant in a high chair covered in pink glop Bicyclists ride down the sidewalk in front of several stores. Some zebras that are sitting on the ground next to each other. People watching a horse race image is fuzzy. A bedroom with a bed and other furniture in it A blurry photo of meat patties on a big meat patty a buffet in a restaurant with some big crocks glasses and bins of other foods a group of zebras standing around a food trough to eat Brown gull in water on beach littered with seaweed. A group of people wait for the start of the race on their bikes. a man sitting in a chair watching another man pretend to be an elephant while playing with a child on the floor A minimalist bedroom with low furniture and a quote on the wall. Two birds sitting on top of a branch on a tree. City bus next to traffic cones in the far right lane of a busy freeway. A cat reaching up to grab a feather on a string. 
a person leaning on a stop sign with a skate board A woman in a blue sweater sitting at a table with food. A child making a silly face over a tray of donuts. The bed is located on the edge of the beach. A bald man is using a surfboard to ride a waves. Broccoli next to some meat on a small plate. Shelves filled with pots, pans, and cooking utensils. Two large boats sitting on a docking area in the evening. A man jumping on a dirt bike while another man watches A cow laying down in a grass field. a blender sits on a counter top unplugged A bicycle leaning against an old white building. Three men, one caring a skateboard, are wearing matching t-shirts. The clear shelves on a green wall that have vases with designs on each shelf. Helmets should always be worn by motorcycle riders and passengers. two people on a field wearing baseball equipment A man puts on his jacket while standing near snow skis and poles. A blender sits on a kitchen counter surrounded by baking supplies. A small child wrapped in a towel brushing their teeth. Green and white airplane sitting on a runway by the ocean. A keyboard and monitor on a corner desk. Two couples getting ready for a tennis match. A street sign showing the words Gay Street. A man in brown shirt standing in a kitchen. People cut a cake outside for a celebration. A large passenger plane is parked on the runway. a close up of a red fire hydrant with a chain on it A young child running down a rain covered walk way with an umbrella. Couple people walking up the snowy hill wearing skis A dog is sitting down in front of a mirror a group of guys standing out on the road Four people standing on a balcony with a clock Why would the cow be grazing in front of those homes? A woman is taking a picture in the bathroom. a little boy and his father skii down a big hil A man with camera watching a group of giraffes Herd of sheep standing on pasture with stone buildings in the background. 
a bunch of people walk on a beach to the water A myriad of wind socks blowing in the wind. A trolley rolling down the tracks in a forest. A box of doughnuts and pastries with strips of bacon. A woman on a cell phone at a station. A pair of hand slicing carrots with a large knife. A woman is sitting down in her kitchen to feed her young child. A horse drawn trolley sitting in the middle of a street. a smiling woman holding onto a pizza box A picture of a computer sitting on the floor. Two giraffes are found wandering around the buildings. Jockeys on horses riding on a racing track. The newborn baby is sleeping next to a teddy bear. A girl riding on the back of a scooter on a cobbled road. A girl in pink ski gear that is sitting in the snow. A couple of people sitting on a wooden bench. A flat screen TV sitting across the way from a laptop. Two horses graze in a field surrounded by barbed wire. Mass transit train waiting for passengers at the station. Four women sit on a park bench with groceries. A baseball player in a blue jersey standing ready with a catcher's mitt A larger commercial jet is flying in the air. a large pizza is sitting on a pan A cow walking on the beach towards people on lounge chairs this is stuffed teddy bears sitting in the grass A man a woman are standing together holding tennis rackets. A blue and white double decker bus on side of street. Person in a parka taking pictures with a mobile phone camera. A cat next to a grocery bag on the hardwood floor in a kitchen. A man riding a paddle board into a massive wave in the ocean. Man and woman in a bedroom holding up Wii controllers. A group of people on some skis in the snow. A woman working in commercial kitchen with stainless steel appliances. A man sitting on a horse drawn carriage A girl in black jacket drinking milk and eating pizza. A deep marble bathtub under an ornate mirror. a woman posing on the street for a photo A horse pulling a carriage down a street with other people. 
People are huddled together under umbrellas on the beach. a person sitting on steps talking on a phone A large passenger jet flying through a cloudy sky. A cat is lying on its back in a man's lap. Several people are snowboarding off the top of a snow covered truck. The adult elephant stands idly in his zoo habitat. A man is holding a baseball bat while wearing a muddy outfit. A woman is sitting with a suitcase on some train tracks. A train traveling over a bridge over a freeway. a couple of chairs are around a table outside A man in vest and bow tie standing over a keyboard. A cat sleeps on a pile of discarded shoes. An open cell phone in a person's hand. There is a dog walking down a path near grass. Three giraffes behind a wire fence next to a tree. A tennis player turns her racket sideways as she returns the ball. A horse grazes by itself on a grassy plain. People playing frisbee out on the lawn, on diving for it. Some type of bed outside on the beach Virgin Ameican Airline planes with passenger boarding bridges attached. A woman walking a horse down a trail. Two polar bears playing in the ice and snow. A wide shot of a modern kitchen with a glass table in the foreground. a photo of an old tall cathedral and bell tower. A parking meter and a umbrella on a street. A vase sitting on a table filled with flowers. Damaged bathroom with a toilet, sink, and damaged window. A batter up to swing in a baseball game Boats docked in the water in a marina. A young giraffe running across the road on an African plain. a picture of a bulding with a open window and clock. A plate topped with two pieces of cake and strawberry. A man with a helmet is on a surfboard Two men shaking hands while standing on a tennis court. Visitors walk beneath huge airplanes on display in a hangar. This train is riding a rail near some water A stop sign that has another sign saying all way under it. 
People standing on the street holding umbrellas near buildings A knitted cap sits upon a red hat stand. A desktop computer sitting on top of a desk. The sandwich has chicken, melted cheese, and tomato inside. A diesel locomotive approaching a rural grade crossing. A Nashville bus with a big ad for Coors Light on the side. Three people standing at a baggage claim at an airport. five zebras standing in a row in the wild a woman holding an umbrella on the street A pizza cutter being used as a spatula. A male flying a kite on a sunny day. A group of people standing on top of a lush green field. a person sending a text on her phone A hand is holding a pack of Japanese donuts. A couple of red chairs against a wall Smiling and smirky people are in a small kitchen. A building with a clock tower on it A steam locomotive with passenger cars crosses a bridge over a channel A female tennis player hitting the ball. Various zebra in dirt field with mountains in the background. A black and white image of a vehicle that is decorated like a dog. a bucket of oranges sitting next to a bike A freshly made bed resting on a tiled floor. Two people in the middle of a skiing trail with trees lined on each side of the trail. a street-side market with colorful plastic furniture. A dessert that consists of a piece of cake and some ice cream. A toddler in a kitchen trying to use a vacuum cleaner. Three sheeps are grazing in a small field. A very cute small dog laying on a big couch. a table and chairs with silverware and plates a pan and bowl of food Stuffed bears sit in the window of a store. People and carts loaded with suitcases on a train platform. A boat with a bed set by a set of windows. a messy living room with the television on. A living room with leather couch, settee and chair, rustic tables and a cowhide rug. A red flower vase placed next to a clock on a window sill. 
A glass vase full of dried dead roses Two googly eyes and a Santa beard placed on a microwave oven A tabby cat sleeping on a wooden island in an old looking kitchen. Three giraffes in the wild stand by shrubs. A young woman wearing a white hat in a commercial kitchen chopping lettuce. A person who is standing in front of a laptop. A couch and a chair in a room. A flat screen television mounted above a fireplace. A cart with a load of suitcases pile on it. A white sandwich has pink meat in it. A bunch of people are on stage and the guy in white is doing something to the one child who is holding his skateboard and next to him is a child in a red helmet. A banana plant with a large flower and unripe bananas. A yellow and grey train on tracks beneath a traffic signal A very nice looking trolley car on a city street. a person wearing a black coat and a tie with bolt designs on it. A giraffe stand alone in a zoo during the day. There is a long wooden bench with a fountain in the middle of the area. A couple pieces of food that are on a table. A adult elephant and a couple children in the water. Taking a moments rest on their cross country ski trip. A bunch of cut meat sitting on a cutting board. there is a luggage that is sitting on metal outside Four people are posing for the camera with flags behind them. a living room with book bottles a lamp and television set A white plate topped with two different type of food. A woman standing at a counter using a blender. this man is jumping high over the grass People are gathered to watch two women, one who is doing the splits. Two men are standing and talking alongside an old fire company van. a young girl is getting her temperature taken A green tennis ball bouncing on a wood tennis racket. A cat sitting on a chair looking straight at the camera. A Studio apartment with minimal furniture and a refrigerator. A giraffe is walking through a grassy field. 
A room in a home that has a small table with one chair on the side and another piece of furniture in the next area. A living room with white walls and stained wood furniture. zebras stand next to each other in the zoo. A train station that has a train pulled into it. A stop sign in front of a Google building. A woman standing in a grass field with a cell phone. a close up of a dog on a desk near a monitor A street sign that reads Ronald Reagan Allee. A white plate topped with a piece of toast and eggs. A cut in half bagel sandwich sitting on top of a plate. A couple of bowls are on a counter by a man. A man leaning on a fire hydrant on a city corner a girl and a dog looking angry in a photo The corner of a batch room with a white sink and red shower curtain. a close up of a cell phone on a table near earbuds A person holding a red phone nest to a flower filled plant. A couple of cars that are in the dirt. A pastry is decorated in a lattice style on a piece of burlap with a knife. Herd of goats in grassy area with herder. A semi truck pulling a trailer filled with logs. A couple of people standing with a umbrella. a man is in a store making donuts with flour People at the table getting sandwiches to put on their plates Four men and two women sitting on two different benches. a whole bunch of bananas cut up in a large bowl a male in a black shirt taking a photo in a mirror and a sink A large passenger jet flying through a blue sky. The display case has many different scissors on it. A red fire hydrant sitting on a slab of cement in a patch of grass. A cat sits on the couch next to the remote. a close up of a dog laying down with a chew toy A man holding his cell phone in front of him in his left hand. A boxer dog faces the camera while sitting on a computer chair. A man riding a snow board down a snow covered slope. A man sitting in the back of a van talking on a cellphone. 
a black and white clock on a gold and black tower a baseball player that has a ball in his hand A truck parked on the street with a man getting out a woman hitting a ball at the end of a tennis court A bathroom with sink and a toilet in it. Two plates of food that include potatoes, broccoli and sausage. Two gentleman are playing on the Wii. A living room has a fireplace and bookcases in it. A soccer player chasing a ball in the air. Two cows standing close together on a grass field. A laptop computer sitting on top of a wooden desk. A knife being slid into a wooden block. a man pouring liquid in a line of glasses on a table with a hat on his head Several people in a field, some are flying kites. Some items sit next to the door. A food truck parked along side the street A man swinging at a tennis ball with a racquet. a fire house with a grass field in the back of it A brown cow under a tree in a grassy area. The insect is flying around on the porch. A woman puts her market shopping in her motor scooter seat Four hotdogs in buns sitting on a white platter. A group of boys wearing white shirts, black ties and red caps. a woman hitting a tennis ball with her racket A transit bus pulling through a shopping area. A cluttered restaurant has a boy at a table with a phone. Two girls sitting at a table near a dishwasher. A desk topped with a laptop computer and speakers. A passenger train is going down the track and people are in the car. A group of sheep grazing in a large open field. A teddy bear is next to a banana in the air. A bowl of fruit is on the floor in front of some feet. A woman riding down the side of a skateboard ramp. A beautiful girl playing a game of Frisbee with an orange Frisbee. A plate with a slice of cake on top of it next to a fork. A man loading luggage onto a machine as it comes off a plane. A man and boy are talking behind a rickshaw. A woman dressed in colorful clothing preparing a meal. A vegetable pizza on a plate on a table. 
A small, black cat sleeps next to a mouse and keyboard. A black dog laying on a rug next to a TV. A stop sign with street sign at an intersection. two woman standing in a kitchen by astove this bathroom is all white and has a white toilet and a tub Large dog laying on top of a bed and looking up at mirror. A room with some windows and a clock and a air sign. Group of men with skateboard celebrating while in grassy park. Several skis and snowboards laying around in the snow. A dining and kitchen area with high wood ceilings. Two older men throwing a ball on a baseball diamond. A dark frame surrounds the window and mirror in this bathroom. Two children playing Wii while adults look on. some toy buildings a fire engine and a police car A tablet is set in front of a Dell computer screen. Airport baggage handlers loading luggage into a cart. A picture of an animal catching a frisbee. Two women laugh and show movement in the picture. There are keyboard keys on a wooden table. A wooden cutting board topped with a sandwich with a knife. A woman in green cardigan with brown dog at a table. a big bear walks through some grass A young woman riding on a brown house through a course. a black and silver motorcycle is parked and some people A busy street in the city on a sunny day a baby elephant walks through some shallow water A boat traveling along a river surrounded by grass fields. Two people are crossing the street as they are heading towards the stop sign. A person performs a jump on a hill on their snowboard. A shelf that has a wedding photo on it with flowers. A gentleman is trying to pull off a skateboarding trick. A laptop and a keyboard are on a computer desk. A small boy skateboarding in a city mall A pile of broccoli with a sprout sticking out of the top. some people playing soccer while a crowd watches them a couple of birds that are standing on a beach a bunch of bottles are in the fridge A white cup of coffee sitting on top of a wooden table. 
A green cloth holding a white tray full of food. An open laptop computer sitting on top of a bed next to a mouse. The cathedral has two clocks on each of it's walls. He person that is doing a skateboard trick. Two pieces of pizza on a plate with a small servor. Several cars and a motorcycle are parked in an alley. Blue bullet train waiting at the train station A plate of some sort of a vegetarian pizza dish. a man with a hat holding a baseball bat an elephant is scratching his head on a tree a train engine and box cars on the track Two people sitting on a bench with their dog. A man smiling on skis in the snow. A cow running on to a road near a town A large wooden structure displaying boxes of fruit. A bird is sitting on the top of a log. A young boy riding a small skateboard on a pile of dirt. An SUV driving on a rain soaked roadway past a red stop sign. A very big building with a clock on it. An unmade bed is covered by a comforter and a bowl. A picture of a street light through a rainy lens. A man is about to hit a tennis ball during a match. Several women in a kitchen preparing many identical meals. A man and woman are walking and the man is pulling a suitcase. A woman riding a horse wearing a white outfit and helmet with yellow stars on it. A black and white traffic sign under a cloudy sky. A group of people sitting around a long white table. A bowl of pasta sits on a table with a candle. A line of buses parked along a wall by a building. a semi truck driving on a road with a sky background A very tidy living room with a white couch with pillows on it. A stop sign in English as well as some other language. Dozens of brightly colored kites lined up on a beach. Freshly cooked lobsters served at home with vegetables and salad Some people stand on the beach and others go in the water. The people are riding motorcycles on a racecourse. A person sits on the road near their motorcycle. A Kenyan Airways airplane sits on the runway. 
A male tennis player stands with his racket poised. a large air plane on a run way A white cats sleeps on the seat of a chair. A beautiful woman holding a hunk of cake. A field with large clear balls and a large amount of people in the bleachers. An action shot of a moving bus on the street at night A man is skateboarding in front of two women. A stove top oven with a couple of pots and pans. a hand is holding a kites string and a flying kite Colorful graffiti on an old Canadian train car. There is a picture of a traffic sign with north and south arrows in the foreground and a graveyard in the back ground on a projectable slide. A bedroom with a balcony in a hotel A fire hydrant is attached to a building wall. A living room with a white circular table in it's center. small boats in a large body of water A fluffy dog is walking up the beach A man on a court swinging a tennis racket. Two custom pizzas with different and interesting toppings. Zebras and wort hogs living together on the plains. A wooden statue of a man near a window of stacked donuts. A couple of shaggy haired sheep grazing in a field. A group of surfers walking along the beach with surfboards A group of people skateboarding in park area next to palm trees. A grass level shot of a small heard of zebras in the wild. A child is tossing a baseball to another child with a wooden bat. A man is spraying an elephant with a water hose. A horse-drawn carriage traveling down a city road. Two giraffes standing together next to a wall. Couple of attentive enthused women playing Nintendo wii A group of women standing around a cake cutting slices. A street scene with a couple taxis lined up. A child in a vehicle holding some toys. Two fire trucks from Seattle sitting in a lot. An older gentlemen reads in his hotel room A man surfs down the waves of a beach A herd of horses grazing on bales of hay. A large clock constructed of landscaping plants and flowers on a small rise. A stop sign that is next to some plants. 
A flying bird seen through a liquid filled feeder. A close up side view of a zebras face A man on a water board speeding down the ocean. Four adult elephants and a younger elephant walk through dry soil. a couple of anmails standing next to a truck A giraffe is overlooking a barren plain, behind trees. A woman and child walk, holding hands, under the large freeway sign. Two people are eating pizza at a dinner table. A cat eating at something dead on the beach. A cluttered desk with a laptop and discs sitting on it. A cat in a bathroom sits on the lid of the commode. A little girl with a broken arm standing in her bathroom. A man is skateboarding near the parked cars, A stuffed animal dog sitting between to trash cans. A refrigerator with magnets on it sitting beside a trash can. A milking a cow in the middle of a pen. A black cat lying down on a laptop. A red stop sign next to a brick building. a sheep eating hay next to a log cabin. a group of zebras on a farm in a field Two people are in shallow water with horses. A hot pizza on the table is loaded with pepperoni and cheese and sausage. Two boys sit on chairs and play video games. A lot of fishing boats have a lot of men off loading their catch. Group of people sitting in auditorium with a screen. A person is holding two spoons over the sink. A young boy who is surfing on a surfboard. An assortment of computer devices resting on a large wooden table. a fenced in park on a city street a couple of zebras are grazing on some dead grass A baseball player taking a swing at a ball a boy on a skateboard is skateboarding on the ramp doing a trick There are adult bears that is sitting in a den A small cell phone sitting next to a glass of Pepsi. A couple of bananas hanging from a metal hook. a bag that is filled with pens and scissors People sitting outside along a concrete wall on a sunny day. A large tiger cat sits on a chair. A man riding on the back of a motorcycle. A bike parked out a store front with a lot of boxes. 
a group of people under a tent celebrating something The young men are playing a game of baseball. A yellow door is detached from a refrigerator outside A gyro and french fries with a drink displayed on a table. Large hotel room with a king sized bed and large view of the ocean. There is some chicken with cherry tomatoes and edamame. A bed with white pillows next to a wall. A person with a blue, red, and green plaid umbrella A room with a bed, chairs and various boxes. A bunch of little kids playing a game of soccer. An airplane parked on a runway in the day time. The children are fascinated with the making of the cake. Ties of various sizes and colors are hanging on a portable shelf. A cat is sitting on the floor while watching television. Three different colored apples and a banana next to one another. A boy playing tennis on a tennis court swings his racket. A skateboarder up in the air over a snowy hill. Man in grey uniform during a baseball game. A woman standing next to two baby elephants. Two bicycle riders are on a trail through the woods. A cat sleeping on top of a brown chair in a yard. A mostly empty train station with two trains ready to depart. A man and a woman holding remote controllers in front of a television. Sun is coming through a window in a living room. A young boy riding a surfboard on a wave in the ocean. A pretty little girl flying a kite on a lush green field. A picture of a police man riding on a motorcycle. A parking meter with a picture of a bicycle on it. The thin pizza is sitting on the plate. Tan suitcase behind a match magazine and CD. Five sausage, egg, and cheese egg muffins. A woman with pink hair walking next to a man with a suitcase. A group of skiers pose on a snowy slope. An outdoor table and chair setting on the curb The old time fire engine joining the parade. A tiled mosaic empty shower stall with bathroom mirror. 
a desk with a laptop and a desktop on it A young man on a skateboard maneuvers around traffic cones a train on the railroad near a forested area a person on a bike rides next to a city street Skier on slope in alpine mountain area on sunny day. a man in a suit carrying a drink and a red and white sign A glass of alcohol sitting next to an open laptop computer. This old wooden fishing boat appears to be permanently dry docked. THERE ARE PIZZA THAT IS ON THE TABLE A male tennis player bouncing a tennis ball. People standing around in the street talking near buildings. A bus and cars sit on a street. A closeup of a empty boat surrounded by dark waters. a black gray and white cat is sitting on a bookshelf a black silver white blue red an orange parking meter and a hand flipping it off pitcher with grey and white shirt throwing a pitch A slice of pizza is on a round white plate. three motorcycle riders some dry trees and a few green trees Three people, one in a suit, are posing for the camera. A toilet and trash can behind a wall in the bathroom a group of people pose for a picture at a wedding A man is sitting on a bench, taking in the city. A man sitting on a white chair on top of a tennis court. In the station people are standing and talking. A man and some giraffe standing in a field. A zebra with a left side pose while standing in a field. The man is dressed in a suit and tie posing for a photo. A man wearing eye glasses is staring at the camera in front of a room. A white and blue vase with a peach rose in it. a black and white photo of a tooth brush in a cup A group of sheep and some birds in a fenced in area. a small child is looking at the kite flying. A yellow bus driving down a street next to a ball building. A person with blue hair takes a photo of themselves. An adult with a child riding skis down a small hill. The group of three friends are sitting on a fallen tree in the woods. A black cat laying on a parked car. 
The man sits cross legged while typing on a laptop. A couple of women riding on top of a blue motorcycle. A clean looking bathroom has a white shower curtain. An open door on a public transportation system. A small clean simple bathroom contains a sink tub and toliet A clock displays the time on a brick building People walking on a beach, many carrying surfboards a person riding a skate board at a skate park There are two people and two motorcycles by a brick building. A little girl out on the beach with a fish kite. a little bedroom with some curtains blocking the window A brown horse grazing in field behind a fence. Hundreds of people cycling in front of several skyscrapers this is a sandwich and french fries on a plate A man is swinging a baseball bat on the field. a close up of a cup with tooth brushes a bunch of people sitting under a umbrella A small dog buried in the covers of a bed. Two man preparing their surfboards to go surfing. Two giraffes standing in front of a wooden wall A computer on a desk with two cds lying on top of the keyboard. A small cat sitting on the edge of a toilet seat looking into the toilet. Someone skiing down a hill on the ski slope. A cartoon of a person surfing a big wave Urban street with storefronts and parked trucks, on a rainy day. A man takes a bite out of some sort of food. A man riding on top of a wave on a surfboard. A remote control lying on a wooden table a gothic clock tower beneath a blue sky Two skateboarders are riding on a slanted walkway. A group of kites flying through a blue sky. A ripe banana sitting on a table next to an apple. Some passenger buses that are driving down the street. A little boy is at a dining table in public. A double decker bus driving down a street next to a tall building. Pizza and appetizers with a side of ranch dipping sauce. A large statue of an Italian chef wearing an orange tie. 
a man standing at the beach in the water holding a kite A group of people in a circle, while holding tennis rackets and standing on a hard surface tennis court. A dog that is running on the grass with a Fribee in its mouth. A clock that is sitting on the side of a tower. A bed with a colorful blanket sitting under a picture. A stop sign on a corner with water and snow covered mountains in the distance. Snowboarder bundled up in winter clothing while on slope. An old plane is sitting on a runway. An old clock is seen on a foggy street. A man cross country skiing through the woods. A parrot sitting on a person's hand while eating fruit. A group of eople binding over fastening their ski boots. A young boy holding onto a parking meter. An elephant roaming the grassy areas in his natural habitat. A few people standing on a court playing tennis. A couple of bears are outside, both on logs. A spiral glass water feature showpieces a commercial bathroom. A man attending to food by a pile of fruits and vegetables. A baseball player with a mitt on one hand. a person jumping a skate board in the air A man who is eating a glazed doughnut. A computer on a countertop with a tangle of cords behind it. A tray of food consisting of vegetables meat and rice. A unique style bed with red covers and a mirror behind it. A large and a small teddy bear at the teddy bear museum. A person water skiing behind a boat full of people. A man on skis standing at the base of a mountain. A container with a variety of vegetables, desserts, breads and other types of foods, with one spoon on top of the food items. A pair of plush animals dressed in halloween costumes. A black dog in a yard jumps up toward a yellow Frisbee. A horse reflected in the surface of water a toddler standing while holding onto a toilet and reaching for a towel Two men with suitcases and a lady nearby. A black and white dog curiously looking at something on a counter. 
an adult and two children snow skiers snow and trees A group of children running after a soccer ball A blurry image of yellow flowers with a fence in the background. The long meat and cheese sandwich is wrapped in plastic. A man standing in front of his tv. A train riding a group of people around. This is a plate holding a double decker sandwich. A giraffe grazing from a tall tree next to a rock. A youth baseball team and their coach poses for a photo on the field some people are sitting under umbrellas at the beach A surfer carries his board through the snow, and rides a wave. Apples and leaves on the ground with a cat in the background. A dim living room with modern furniture and potted plants. a big group of people that are standing under a shelter A street sign on a pole on the side of the road. a woman taking a picture of her microwave A living room with everything in it labeled people watching young boys playing a game of some sort The young man is practicing his tricks on his skateboard. Lambs in a sheltered place are eating and laying around. A living room scene with the television and a Christmas tree. A living room has two couches and a television. Two men who are looking at a passenger jet. A man sleeping in a bed with two cats. A woman bending over holding and kissing her cat. A person jumping in the air on a skateboard. The stop light reads green, and there are two huge buildings in the back. The back ends and legs of three elephants, including a baby, are seen on the side of a road. A baseball player wearing the number thirteen at home plate. Two large white commercial airliners on an airport runway. Nested measuring cups and spoons on a gray surface. A man playing with a Frisbee in a gym. A crowd of people carrying umbrellas across a rain soaked street. three fourths of a pizza with meats and vegetables on a pizza pan A woman and a man standing with a horse in a boat and a dog laying next to it. 
A dog is sitting on a counter in what looks like a factory setting. A large tow truck drives down the street. A silver, stainless steel refrigerator in a kitchen A living room filled with furniture in front of a fire place. A woman with green lace underwear is walking away as tennis balls are hanging all around her. A photo of a dirty bathroom with a sink and toilet. A gray bird perched on top of a tree branch. An elderly man and a teen play video games together. A keyboard and monitor on a wood desk People sit and wait, looking at papers and phones. Traffic light in a blank space with lit green light. A woman at a table eating with two pizzas. A man standing next to a woman holding an umbrella. a man that is looking into a stove A desk with ruler, whole punch and scissors on it A bathroom that has a mirror in it. A long empty road with an over pass bridge. A group of young children sitting around a table eating food. A man is skateboarding down a path next to some grass. A child at a store display selling green bananas. an adult feeding a baby some cake a suit coat shirt and tie hanging on hooks A zebra walking through a green field of grass. Two photos containing food with hot dogs and pastries. A picture of some delicious pizza ready to be eaten. A small white and brown bird resting on a twig. A herd of black cattle grazing on a lush green field. A toilet in the bathroom with a wheel in the window. A CUP OF COFFEE AND A PASTRY ON A TABLE A black and white cat is sitting in front of fall foliage. Two adult males enjoy playing a videogame together. a living room with a low ceiling and it has a couple of couches The personal sized pizza on the plate has many toppings. A little toddler boy sleeping on his couch with a remote in his hand a cat dressed with a collar and tie decorated with irish symbols Little boy in boat with two halves of a banana in mouth. 
a man on a pay phone holding his hand out to someone A bunch of food sitting on a plate with a spoon A giraffe leans its neck as it walks through the bush. A toddler brushing her teeth with an electronic toothbrush. Little girl with a group of children watches a show. The tray has fries, meats and vegetables. A long row of train carts sitting in a yard of tracks. a woman in a dress prepares to hit a tennis ball there is a woman sitting on a couch holding a piece of cake Several sausages cooking on a grill with glowing charcoal. Black and white photo of a large clock located outside. this person is doing his work on two computers a dog and a person stand on an edge with a mountainn in the back ground A table with plates of food and an orange on it A hanging street sign that says Rockefeller Plaza. Two people with remotes in a living room. A couple of people on a field playing baseball. an elephant resting in the water next to the shore area Black and white photograph of a woman surrounded by pigeons on a city street a couple of animals are standing in a field some boats going down a tree lined canal An overhead view of a lot containing many parked, empty buses. A sign indicating Florida Avenue and another one stating the speed limit is 35. A guy holding a piece of food up to his mouth. an image of a dog with one paw out the window A book is open and kept in front of a soft toy. A disturbing doll sits next to a clock in a mirrored image. people bicycling down a city street in daylight A group of people sitting next to each other on a bench. A person walking down a sidewalk carrying a back pack. A reflection a person catching a frisbee in a mirror like object. These military guy is celebrating something big with a nice cake. a very big elephant with some clothes on carrying three people Two bears coming out of the woods to a road. Broccoli and a deep fried food lay on a black and white plate. There are two elephants standing next to each other. 
Three cows in a barn eating food off the ground. A truck driving through and intersection waiting on a pedestrian to finish crossing the street. A car at the light getting ready to go because the light is green. A book setting on a green bench in a park. A small herd of cows grazing along a path on the side of a hill. Helmeted and uniformed military men travel together on horseback. two men in suits standing next to each other Three people are standing in front of a truck while another is in the background. A person that is going out some candles. A man and a dog on a skate board. The serving counter of a restaurant is quiet. A man tossing a teddy bear off the side of a bridge with a parachute. Two men sitting at a table with a very large pizza. A couple of ships in the water by some buildings. A big building on some grassy field during the day. a boy laying down on a surfboard in the water A plate of food that has pita bread, green peppers and tomatoes. A man that is sitting on a moped. A picture of an oven with food baking inside. A man riding down a snow covered ski slope on skis. The travelers stare outside a tram as it approaches a giraffe standing by the roadside. A man filling jugs with water from a bathroom sink. A white cake with blueberries and oranges on top. Leaves and purple flowers come out of a brown vase on a desk. A toddler is running through a kitchen while some adults stand close by. A cat laying on top of a wooden desk near a monitor. A couple of people on a field with a Frisbee. A view of the street signs "W 122 St.", "Seminary Row", and "Broadway" in front of an old red brick building. A skier lifts their ski poles in the air on a slope, with other skiers nearby. A protest sign painted like a stop sign stating "stop harper" The face of a dairy cow in a pen. 
a person ina field playing with a frisbee with trees nearby There is a person flying a kite at the beach A bowl of raw fruit on a table by a painting A plane flying over waves and a small island. A open suitcase containing shoes with a table on top. A man poses with a pinwheel against the blue sky. A modern looking living room in an apartment. People walking down the stone sidewalk in the rain. A group of people are around a dining table. A lot of people that are in the street. A zebra standing near a tree in a field A man eating chocolate donuts and a woman smiling next to him. A group of people posing around a woman holding a cake. a person getting ready to swing on something A pretty young lady carrying a white umbrella. A flamboyant man wearing a tight green marching band uniform. Someone has set up a make-shift photography workshop in the field. A gray train is on a track on a hill near water. Two giraffes stand back to back and eat leaves Few persons are seen on zebra crossing on road and an elephant with a banner is there. A small herd of cows stand in a high mountain meadow. Two elephants are in the middle of a circus ring. A person holding an unusually thin Chiquita banana. The bathroom is in the process of being worked on. A plate full of broccoli with fries and carrots A horse is grazing in a grassy field with a view of mountains. a wet black dog has some sand on its nose A man sitting in a wheel chair under an umbrella on a busy street. A white fire hydrant sitting outside a building with a mural painted on it. A herd of animals standing in a large field. A row of white toilets sitting on top of a dirt ground. A tennis player is lifting is bending his leg off the ground and reaching his arm up in order to hit the ball. A young boy in a wetsuit on a surfboard. Two young women playing a game of soccer. A plane sitting on a runway beside water. There are people skiing next to a dog. 
A grey and white cat sitting in a sink a person with a large afro and glasses A man holding up a kite so it catches the wind. A country scene has a rocky trail leading to a body of water. A hand holding a bagel covered in almonds. A girl taking a bite of a slice of pizza. A middle-aged man in a suit with messy black hair. barren clean white kitchen with white appliances and stainless steel sink a plate full of vegetables with seasonings sprinkled on top A young man in blue jacket riding skateboard in snow. a hydrant in a place near some houses A female tennis player jumping up to hit the ball. An open laptop computer sitting on top of an office desk. A picture of a building with a very nice clock. There is a large plate of tomatoes and a pan of sliced tomatoes A car that is sitting near a green street light. A young woman using a cel phone, in a college tank top. A sink and some counters in a small room. A man riding a skateboard on top of a ramp. A couple of young people standing in front of a TV. A person putting some food on a white plate. a person at a table with a plate of pizza Bright blue train carriage awaiting passengers in Peru Two slices of plain pizza are sitting on a plate. A grey table with a white plate of food. two people sitting on a bench near trees a guy in the photo looks sad and dark A group of people seated using cellphones, three ladies with handbags The side view of a man with coffee casts a shadow as he ponders at his laptop A large brown dog sitting next to a frisbee. A woman in a white and green tennis dress setting up her shot. Three women in a kitchen at a table full of food. Two giraffes in their pen at the zoo. Very pretty clock with the base surrounding by brick floor A small transport truck with a white trailer. An open faced sandwich, chips and sauce are on a plate. White dish piled with ham slices and broccoli. A train with lots of red cars traveling down tracks. A man on the beach has a large umbrella. 
A picture of a man in a green baseball uniform batting for his team. two brown and white birds sitting on a roof A herd of flamingo birds in the water near a construction site. a boy in a baseball uniform standing in a field A red and black long van is parked in a parking lot. A giraffe next to the road on a safari ride. A group of cups that are sitting on a table. Biplane flying over blue ocean next to coastline. There is snow on top of the snow board. An open living room with hardwood floors and a vase of flowers A picture of a bathroom with white tile walls and a window with white blinds. A herd of sheep roam in the grass. A sidewalk area with a red fire hydrant near a light pole A slow moving subway train that is going down the track. A woman hitting a tennis ball with her racquet. A young child sitting on a leather couch holding a controller. A restaurant is filled with many people and newspapers. A couple of donuts are on the plate, ready to be eaten. A stove top cooking in a pot and frying pan A street sign with the name of a street on it, and next to it is a post with various names up and down the post. close up of a bulding in the mirror of a vehicle A living room with a fire place and lots of furniture. a bunch of bright lighted signs and mopeds on a street a man with a helmet is touching some food A silver hippy van and a bus for vegans. A Southwest airplane is parked on the runway. a close up of a person holding two birds many different clock on a shelf near a wall The small bathroom has a glass shower door. Train traveling through countryside near tall brick structure. Two girls outside, one flying a kite and one sitting down. A kitchen scene with focus on the sink and counter with vegetables. A cat sitting on a windowsill next to a painted pumpkin. A family on skis posing for a picture. The big rig truck is parked in the parking lot. A cat saying on a sofa with many pillows. A group of animals grazing on grass in a field. 
A man riding a snowboard down a snow covered ski slope. two people on a beach next to a large body of water A banana with a face written on it in front of a mirror. A table topped with three plates of food. An old ornamental building features many beautiful windows and a clock. A light blue airliners is parked on the tarmac. A man handing out slices of pizza to protesters A bus and a car travelling in the same direction on a sunny day. A computer mouse is placed next to a computer keyboard. Somebody is sleeping in the bed next to the clock on the table. a baseball player holding a bat on a field A young bearded man holding a partially eaten hot dog. Woman in white and black outfit on a tennis court. A white bus driving down a street next to people. A sofa with pillows next to floor and blue rug. A smiling woman holds a banana up in the air. A yellow tractor digging next to a yellow and red fire hydrant. A snow boarder is going down an indoor slope. a couple of trains sit parked as it overlooks a city Group of people holding up umbrellas in front of cactus. Female tennis player preparing to serve the ball A man riding a skateboard doing a trick. A double-decker Liverpool Street bus on a city street A person cutting a pizza with toppings into slices A large bird sitting on top of a metal spire. A tennis player raising his racket to hit a ball. A tray with food on a table A person takes a picture with their cell phone. A white toilet sitting in a bathroom next to a TP roller. A stop sign that has some foreign words written on it. A skateboarder rides his board in a concrete pool. Soup presented in a bowl on a plate. Two buses under a large open structure at a station. A baby elephant reaching for grey bag at a zoo. Multiple pictures of a man brushing his teeth. A salad to be eaten with wooden chopsticks and a drink. A couple of men in police uniforms sitting on horses. Two children sitting in the grass eating food A herd of elephants walking across a river. 
A line of stuffed animals in a child's room. Plate of food that are on top of a table. A computer is sitting on a computer desk on the far side of the room. A giraffe sticking its head through the rails of a wooden fence. A white and black bus on street next to a building. A woman talks on her cell phone as she skates down the sidewalk. Two zebra and other animals grazing the grass. a lady that has kids in her lap at the table Two men jumping to catch a Frisbee while people watch them playing. A man wearing all blue with an oil can walking around a train engine. A young man walking down a sidewalk pulling his travel bag as others watch. A street view of a protest and a woman with her fist raised. a cat on a bed with dishes on top of it A green and white airplane behind a fence. A yellow fire hydrant on the side of the street. A tall obelisk sitting next to a tall white building. A herd of zebra standing behind a wire fence. A cat relaxes in this tan leather chair. A group of people standing around a table filled with fruits and vegetables. A man with a racket walks on a court. a person is playing tennis outside on a court A man in a black jacket riding a skateboard on the street. A group of young men standing next to each other on a field. Cutting board with various fruits, utensils and spices. A little dog is running around an outside shopping stand. Two tangerines and a banana atop a blue plastic bowl. Several traffic lights are seen near a busy highway. A yellow and white train traveling down train tracks. A group of people sitting on horses in a row. A bench outdoors on a path near a fence. An animal stands in grass on a hillside on a sunny day. An almost empty box with a partially eating doughnut and a knife in it. A clock tower next to a large building LARGE SANDWICH CUT IN HALF ON A PLATE. A lone train is parked on the train tracks at the station. People skiing down a slope with many moguls. 
two blue and white trains buildings and some wires A street scene with people and cars on the street. Young professional looking man looking at the camera. A large lizard float is rising in the air. A young girl with a tennis racket is in a parking lot. a person wearing a dress and riding skis indoors A WOMAN IS NEAR A CAMEL WITH A UMBRELLA Box of cereal sitting next to a box of donuts. A woman on a skateboard riding on the sidewalk A red passenger bus makes its way past Big Ben in London. A tooth brush in a blue glass sitting on a counter. a person in a tie and suit sitting before a white plate of food and wine glass. A man approaches an intersection in the rain. A man sits at a table that has a surfboard propped against it. People in business suits standing in front of a building. Several elephants walking around grassy area in the wild. A tennis player holding a a racket on a tennis court. A single seagull swimming towards a rocky shore. A man in beard and glasses with a red and white suit on. A man skiing down the side of a snow covered slope. A view inside a refrigerator that is completely packed with food. A lamp post with traffic signal, street light and street signs. Baseball players playing on a professional baseball field. A spacious bathroom with lots of lower cabinets and a toilet. an image of a guy that is walking by a train Stop sign is above a red triangle sign next to a barb wire fence. A hand holding a water bottle in front of a cat. four plants being grown outside in a planter A surf boarder stands as he rides a wave. A museum display featuring professional baseball jersey and bat. A zebra standing in grass next to trees. The old man is talking on the phone. An unmade single bed in an upstairs bedroom in the early afternoon Young skate boarder doing a nearly vertical stunt The cat lies next to a cat sitting inside of a sport bag. A partly eaten pizza and a fork with wine on the table. 
An older man sitting at a small table about to eat a slice of pizza. A man surfing down a rushing rivers wave A man walks a dog near a large bus. A bike rack full of bikes and people every were. a small yellow boat set in the water by large rocks Giraffes in the wild on a sunny day A woman stirring a large metal pot of food. A pitcher preparing to throw a base ball. a plate with half eaten food on it A large white bear walking across a river. A bike parked next to a parking meter on the side of a street. A white bowl topped with a sandwich filled with meat and veggies. two black horses are grazing on green grass in the field. People milling about a bus terminal getting ready to board. The cat is balancing on top of the door. A man holding a white and black umbrella in a large parking lot. people sitting on a bench facing the water. Two men sitting down at a both eating . Several people standing around and looking at a vintage plane. A white plate topped with vegetables.covered in sauce. A girl is riding a surfboard in the water. A bright red motorcycle parked with other motorcycles beneath streetlights. The street sign is in the middle of the flood waters. A stuffed panda bear is sitting on a bench near a Buddha statue. Small bird standing on rope near open ocean. A plate topped with pasta, meat and broccoli. A woman with a snowboard jumping in the air. Cars are driving through the intersection underneath traffic signals. A person holding a tennis racket on a tennis court. two baseball players standing close to the base A black and brown dog rests on a couch. A black dog sitting in the middle of a bathroom. a motorcycle with a boot on the back wheel A woman and girl watching donuts being made through a window. a scooter with a rifle bag parked in front of a fence A nun sharing pizza with two young men. A snowboarder is on a board and is jumping in the air. A little boy in a inter tube at a water park. Two black cooling rack shoaling pieces of pizza. 
Room with patterned carpet and wallpaper and dark wood furnishings. A door to a bedroom is open with a wooden dresser in view. A car parked in a lot with a surf board strapped to the top. A man, women, and child sitting at a table. 4 seagulls stand on rusty rods with people in a boat in the background. A train is stopped in its tracks next to a building and cars. Various items on white surface including a cellphone, keys and camera. A monkey hanging from ropes eating bananas strung to it. a group of children playing soccer on an open field A rose, an entry way to a forest, a water fall and a lounge sign are in a series of photos. a wedding cake with a picture on it A man sitting on bed looking at a television and person in mirror. A couple of large birds standing by some eggs. A hotel bathroom has a granite vanity with a big mirror. A young man is holding up a skateboard. An umbrella is strapped to a blue bike. A couple of pairs of skis in the snow. A sliced panned pizza on a table ready to be served. Three giraffes stand in front of blurry trees. A green passenger train stops in a station to pick up passengers. Three pedestrians crossing a street at a stop light. a man with a beard a deer and a pink fire hydrant Two male tennis players meeting at the net for a high five. A very tall brick building sitting next to a traffic light. A man jumps and reaches for a frisbee A group of people sitting around a table. Man holding up a plate with a brownie in the shape of a spaceship. A small garden area features a few springs of growth and a small busy plant and a few bricks. A traffic light with stormy skies in the background. A girl thinks she is being funny while eating pizza. The surfer is barely hanging on to his surfboard. A stop sign that has spikes sticking out of it. A plate of food that includes chicken and broccoli. A white toilet in a very small bathroom. A crowd of people watching a baseball game where a batter just hit a ball. 
A person holds a pink frosted donut with jimmies. some white black and brown sheep in their pen A black truck is driving on an open sandy area A snowboarder flying up in the air with the sun behind him. a table with a blender and a glass on it This young girl is learning to throw a frisbee. a couple of geese are on the water A row of boats on a river with trees in the background. a red fire hydrant next to a stone brick wall A man sits on a bed talking with hand gestures. person walking down the sidewalk at night in rain A giraffe standing next to trees on the plains. Three horses on a green pasture with an old building in the background. a couple of boats are sitting in the water A man flying a double string kite in a large grassy area. A man in a ball cap riding on a mule. An old red truck is driving by the water. An assortment of vegetables sit out on the cutting board. The two zebra stand in a black and white photo. A girl swings a raqcuet at a tennis ball. A young girl sitting with a young boy at a table with food. A silver and black train passing under a bridge. A person that is going out in the water. a large red bus is at a stop an open kitchen and living room in a daylit house A young boy standing on top of a rug in a living room. The head and arm of a person flying a kite. A busy market sports colorful umbrellas that shade the vendors. A bird is perched atop a computer monitor. A chocolate cake is being frosted with chocolate frosting. A large dog has a collar with clock on it. the truck is going up the hill in the snow Adult with laptop with dog lying next to him. A white plate topped with fries and sausages. An old man n his computer in front of the fire. A worker performs maintenance on a fire hydrant. Some donut are on a round white plate. a cat standing on some rocks next to some bushes the bus is blue and is stopped. Some people are standing waiting for it A man on a surfboard is riding a huge wave with his feet out and arms extended. 
A living room filled with furniture and a rug. Two small birds in a large green grassy field. two older people stand next to a statue of a horse head A boy in the stands of a baseball game biting into a hot dog. A very pretty shallow stream in the woods. A busy intersection in the city is full of people and signs. A gray minivan on the curb at W 38th st in a big city. Man playing racquetball about to hit a ball. A couple of chairs sitting on top of the back of a truck. Two men standing next to each other holding giant sugar donuts. pepole eating at a restaurannt meat and veggies A close up of a parking meter by a parked car. A living room with windows all around it . A photo of an old clock tower next to some buildings. Several stuffed animals sitting in wooden boxes outside. Several long boarders are riding long boards down a quiet street. The animals are roaming in the backyard outside int he grass. A kitchen with furniture and decor in it. Crowd of people standing around while someone flies a kite A man watches another man that has numerous bananas on his head. a fake mouse is in a box of doughnuts A sink with several faucets and a large circular basin. Two park benches with one man sitting in woods. A large white bed covered in two white pillows. A group of people that are sitting on benches. A living room features a wood ceiling, stone fireplace and large glass window. a desk with a monitor keyboard and mouse A young boy about to hit a large ball with a large baseball-like bat A woman stands behind a cake and baking decorations. The dining table and chairs are outside the small kitchen. A cat is sitting in front of some steps a male skateboarder in a black shirt doing a trick An elephant moves is gesturing toward a bus. Many people are sitting around tables with dinner plates on them. Two men standing in a living room holding Wii controllers in their hands. A person walking on the shore with a surfboard under their arm. 
A kite made like an airplane flying above several American flags. an image of two bags set on a hotel bed Keyboard, sunglasses, book, pen, and various items on a table. Collection of books scattered all over a bed. This blurry picture has a male in a suit in it. a woman wearing a cowboy hat face to face with a horse A beautiful young lady sitting on a park next next to an old man. A yellow and red fire hydrant in a yard. A little boy is waving at the runway as a plane is sitting waiting for takeoff. a lit candle sitting next to a plate filled with food a picture of a hang glider on a beach a clock that has two figures sitting on a mantle A male and a female holding up their cellphones An empty bench is on the curb side of a grassy area. A man in casual wear holding a baseball type bat. A close up of a person's hand with a scissors cutting something wet. Wine glasses sit in a row on a wooden ledge A large grassy field with giraffes and a few other animals. Smiling child with a tooth brush in hand. Pitcher at mound throwing ball to baseman near runner and umpire. A luggage cart stacked with a very tall pile of luggage. a road with many traffic lights and cars driving Two people flying a kite in a park Wildlife standing near water area in natural setting. Two businessmen talk over a cup of coffee. A bunch of construction barriers near an old, worn down building. A flock of birds flying over a body of water. A tennis player hitting the ball with the racket. An overturned skateboard lying on a grassy field. Two boys are playing catch with a frisbee. A black and white zebra grazing on grass. A cat sitting on top of a chair. A clock that is in between two windows on a building. A stop sign set on the inner curve of a curving dirt road. A cat sitting on the floor by three shoes. A man holding a ski board and parasail rope. A tennis doubles team with one player in the air, her racquet in motion. A person standing on a surfboard riding a wave. 
Seagull in the sand near a boat launch A public restroom with focus on two urinals. an image of a man carrying luggage in a cart A CGI man sitting on top of a CGI hospital bed. Child sitting down in a chair eating a sandwich. The cat is looking at the television screen. A small child stands in a shopping cart with an umbrella. A plain piece of bread resting on a wooden plate. A train door from the inside of the car with exit signs and grab bars. The man is sitting on the bench typing on his laptop a train car sits parked as people stand next to it Food trucks serve customers in the parking lot at the event. A man and woman seated at a table in a restaurant. A number of train tracks with a train on it A plate full of spinach salad with dressing a couple of people that are playing with a Frisbee A wooden stand with many types of fruit. A white plate holding two pieces of cake on a table. A giraffe handler training a giraffe at a zoo. A small sink area is packed with items. A pole with many different stop lights in different directions. A tree filled with lots of fruit and leaves. A brown horse standing on a lush green field. A brown towel that is sitting on a tub next to a toilet. A blue bowl containing various fruits such as apples and bananas. A bear sits next to another bear on a white blanket A dish of vegetables mixed together in a bowl. A boy is doing a trick on his skateboard. A large bus on a open city street. A clock with a colorful drawing on it. a person riding a surf board in a body of water Caucasian and African-American business men standing in line to buy 'Po-Boys from a catering truck. An old picture shows a man up to bat on home plate. A plate of bread , eggs , and bacon . Many people and a few cows are spending some time in the water and on the shore. A man and a woman cut a cake together. A bear walks in the bushes and plants in the wild. A man is riding a horse in front of several buildings. 
a basket of apples oranges and avacado on a table The cup that contains a toothbrush, toothpastes are placed next to the mirror. A red car and red motorcycle parked at a curb near a woman walking with an umbrella. A toddler is sitting in the bathroom sink playing with toothbrushes. A woman posing next to a double layer stack of donuts. Man intercepts man over a game of frisbee Man surfing on an ocean wave in the summer time. An aerial view of a city and waterway with ships in the water and a bridge. A bunch of books that are lined next to a clock. A man in a blue suit eating a hot dog in a gym. Pink and white flowers planted in an outside area. A skateboarder rides a ramp in a skate park. A dog laying on a couch with a Frisbee. The drink in the glass is garnished with toothpicks and rosemary. a man dangling in the air over the ocean A chocolate and ice-cream dessert in a restaurant a single person walking the beach with a dog The train is traveling down the tracks by the station. A room filled with computers and laptops on a desk. A group of people sitting around a table with clutter on top of it. A bedroom containing a bed without sheets and a dresser. A couch is made into a bed in a room with a desk. A player at bat in a baseball game. A boy is flying a kite on the beach. a woman in a gray top is cooking outdoors A man sitting with his back to a dining table, with a laptop on his lap. A young guy standing by a tree while playing outdoor activities. A couple of white parrots perched on top of a tree branch. A man holding up a tennis racket as he coughs into his arm . A young man is tilting a skateboard up with his feet. A woman is sitting on the curb with a decorated parking meter. A frozen pizza box with the cooked pizza lying next to it. A brown vase sitting inside of rocks next to a set of green plants. A half eaten sandwich is wrapped in white paper. A bald man with a mustache wearing a suit. This is a yellow and blue double decker bus. 
A picture of a wooden hedge hog clock with a price tag of twelve dollars. Four people with a birthday cake on a table. young children getting healthy food from a table. Some street signs point directions to various places A Water Dept sign is placed in front of the fire hydrant. A plate has beef on it near a glass of wine. A cat looks back over its shoulder while laying on top of a fuzzy white blanket. A train engine carrying carts down a track to a station. The electrical components of an oven are being tested with a multimeter. Young man wearing shorts throws a frisbee among trees. The woman is playing with a wii controller A person on a skateboard does a trick in a bowl. A man that is holding a knife and a pot with broccoli. The Halloween display includes a spiderweb and lots of pumpkins. There is a neatly made bed in a bedroom of a log cabin. White swans swimming in a harbor with docked boats. A novel is on the seat of a green metal bench. A man standing and posing for a pic in formal wear. A few airplanes on the runway at the airport Baseball team holding batting practice on the field A man with glasses talking into a microphone. There is a hanging clock in the hallway of the home. THERE IS A GIRAFEE THAT IS WALKING IN THE WOODS A surfer takes a ride on a wave near a mountain. A freshly baked pizza resting on a table. A man sitting at a table with a glass of juice in his hand. a kitchen with a counter some chairs and a sink A woman riding a skateboard in the street behind a man on a bicycle. a couple of guys that have emt equipment A man wearing skis holding two ski pose on top of a snow covered slope. A small potted bonsai plant is on the floor getting licked by a cat. A parking meter that has a blonde wig on it A large truck driving down a busy road with the back full if dirt. A clown talking on a phone next to a building. A tennis player on the court holding a tennis racket. A plate of fries and a hot dog sandwich. A double decker bus stopped at a bus stop. 
TWO BUSINESS MEN WITH TIES ON CONVERSING OUTSIDE A BUILDING A bed sitting in a room next to a wooden door. A view of a person's hand on a computer mouse. A park bench in the woods with a bag on it A cat is looking out of the window. four giraffes standing in a field 2 are facing forwards A pile of oranges sitting inside of a basket. On this table there are mugs of hot chocolate with shapes and half eaten donuts on plates. a teddy bear nailed to a tree suspended above garbage A room with a toilet, a door and shoes in it. A public bathroom area with orange tile walls. A watch and class with a beverage sitting on a wooden table. Room with many hanging clothes, a bed and dresser. A photo of a group of bikes behind a bus. Pendant lights illuminate a bathroom sink for two. A prepared pizza is sitting on an appliance. three people standing in a room and eating food. A plate that has a sandwich and a bowl of fruits on it. A woman posing with a bat and wearing a batting helmet. A man wearing a white lab coat walking a cow down a field. A pony grazing on grass in front of a lighthouse. Man spreading peanut butter on an English muffin A train is traveling down a track in the middle of an arid plain. A woman is playing Frisbee with two dogs. a vintage photo of man standing in the middle of some waves Personal pan pizza on a wooden table top A woman is standing in front of a stove A man stares at a cake with candles A surfer is riding a yellow surf board as he hits the waves. Several cars driving towards a public market. a traffic light and a street sign on poles A church steeple rising high in the sky. A jet airplane flying in the daytime sky. A tablet sits on a table with two pizzas. People walking down a sidewalk on a street. A yellow fire hydrant near a grassy field. a man in a black jacket is holding a hot dog with mustard Two friends are eating an extremely large pizza. There is a toilet and a bathtub in a bathroom. 
A guy skateboarding indoors in front of a crowd of people. A cat that is looking out of a window. there is a woman holding a baby and a pizza pie on the table A desk with laptop, mug, paper and a monitor. A woman with a tennis raquet prepared to hit the ball. Three shelf deli display case with bottle beverages on top. The kitchen has five beams running across the ceiling. A woman grabbing a piece of cake off the top of a plate. A sandwich and a salad are on a tray on a wooden table. a kid stands on a hillside while flying a kite A man in a red snow jacket is on skis. A woman standing next to a little girl playing a game on Nintendo Wii. a woman is playing tennis on a court A crowd of people standing outside of a brown brick building. A woman is preparing to bite into a sandwich. A young boy tying paper kites to a string stretched across a room. A dog that is sitting down by a bench. An airplane sitting on the runway in the snow. A man about to put a leash on a large cow. People are lined up along a train station waiting for a train. Young man on a skateboard approaching a street. A red and white fire hydrant on a sidewalk at the park. A man and woman walking across the lawn carrying an umbrella. a person riding a motorcycle on a city street a young kid performs a trick on a skate board Men playing soccer on a field at night. A couple of women riding skis on top of snow covered ground. Two ladies using the Nintendo Wii in a living room. Some vegetables on the ground are in planters. The clock on the side of the building is also a sculpture. She appears to be hanging on the street sign. A giraffe on a large plain with herd animals in the background. A man is about to swing a baseball bat. A child is in the snow with one ski on and one off. The bed red couch from the Mc Donalds commercial sitting in a living room with a fireplace next to it. A sign above a white stove and refrigerator next to it. Some babies playing in the bath tub one holding a tooth brush. 
A man drives by on a person holds onto a ladder below an airplane A lone zebra standing in the middle of a field. A woman is playing tennis on a fenced outdoor court. Well decorated restroom with sink and chair for sitting. Skateboarder in the motion of turning on his skateboard. A large grey horse is behind a wooden fence. a mountain with a bunch of animals next to it A dozen people smiling for the camera at a large wooden table in a restaurant. A black dog laying on top of a rug on a hardwood floor. Two giraffes in an enclosure are bent over peering at visitors. A man riding a skateboard up the side of a ramp. A little girl that is sitting in front of a laptop. A professional female tennis player engaged in competition on grass. Two computers are sitting on a brown desk. Beach umbrellas and chairs next to each other. A person who is on a barrel on a snowboard. A man standing outside holding a sausage dog in his hand beside the food stand. the mirror is showing a picture of the microwave in the kitchen a man is making some food in a kitchen A snow filled street with a stop sign on the corner. This is a wide perspective of a room in a region. a close up of slices of pizza on a plate A pair of boats stacked up on a beach. A man doing a jump on a skateboard A bunch of cars driving through down town New York City. A group of people are flying kites in a field. A large airplane sits on the runway at the airport. A group of people sitting down at a dining room table next to dishes. A group of young people playing a game of soccer. A guy riding the an incoming wave on a surfboard a little bird sitting on a ledge as it looks at the window green peppers red peppers a tomato corn and hot peppers Biathelete skiing forward with her rifle on her back. Zebras racing each other in their zoo enclosure A beautiful young bride standing next to a her husband as they prepare to cut a cake. Pizza with pepperoni, mushrooms, olives and sausage on a pizza pan. 
Bicycles and a motorcycle parked on a city sidewalk. A loading truck carrying boxes and a Stop sign A man jumping up to catch a frisbee A living room scene with a large window. a close up of a doughnut covered in sprinkles A person with a bike and a dog on a leash, boarding a train. a close up of a person wearing a shirt and bow tie A kite flying over a sandy brown beach. A group of people enjoying a day at the beach. A kid standing in the dirt with some fruit. A group of people are together in the snow on skis. a person in an open area flying a kite in the sky a train on a train station and people walking near by An old fire hydrant casts a shadow on the sidewalk. A man driving a yellow car on the road A kid is playing on some toy drums A street with many signs on the corner A merry go round with lots of colorful giraffe and other animals. People are playing ultimate frisbee and someone is about to catch it A woman that is sitting on a bike. Two women trying to compete for a Frisbee during a game. The child in the black helmet is swinging at a tee ball stand. a public transit bus on a city street with people near by A laptop computer sitting on a cluttered desk. A pedestrian sign has been devised in comic fashion. A man wearing a pair of glasses and a tie. Two dogs near a carry-on bag on a tile floor. A dog wearing a bandana rides a skateboard. Mini pizzas on shelves waiting to be bake. there is a white toilet that is broken on the street A messy bed in a room with large glass windows. A black and white view of a clock tower with a ferris wheel in back. People at an outdoor market under a canopy. A computer monitor in a home style office A woman reaches out to pet a giraffe who stands in confinement with his companion behind a fence. A man on a cell phone resting his legs on his luggage A black and white photo of a man walking around with an umbrella. a tall giraffe standing in front of a wood fence A clock tower with lighted clock faces, against a twilight sky. 
The sign on the sidewalk shows a U turn. Two people are aiming controllers at the television set while other sit on the sofa watching. A kitchen with steel dishwasher, refrigerator, cabinets and microwave. Two men play Frisbee in the sand while others watch. A brown and black dog laying on top of a wooden seat. A blurry image of a knife cutting into frosted cake. A man in surf gear walking down a crowded street. A man in a warehouse riding some moving object. people flying very high and waving their hands A man cross country skiing in the country. a building with some really big and fancy clocks on the side of it A close up of a giraffe with its face against a pole. A person flies a kite in a field. Some cooked vegetables are sitting on a plate. A plate of food and some cups of drink on a table. Several slices of pepperoni pizza sliced into squares. A large fed ex plane flying over mountains. A banana, tomato and apple laying on a desk The laptop is connected to a full size keyboard to make an effective work station. A giraffe is standing in the bushes and tilting its head. A man riding a skateboard while flying over a board. a broken up DVD in front of a keyboard Two people in a room playing a game of Wii. a white horse sniffing the hand of a person in front of them Some chopped vegetables layed out on a pan A woman throwing a tennis ball up in the air to serve it. A baby elephant following an adult elephant by a fence. Small group of people playing video games in a living room. A woman in pink dress playing a game of tennis with people in background. A group of airplanes fly through the sky. Skateboarder in purple shirt riding on top of his board. Several young Asian people are snowboarding and skiing. A kitchen with appliances that include a sink, dishwasher and a refrigerator. Four giraffes are standing next to a bare tree. Three beds with clothes laying folded on each one. A man has his neck covered by clothing. The warning sign is below two street name signs. 
A group of people watching kites being flown in a park. A man and a woman eating donuts and having drinks. A jockey sitting on the back of a horse a red and white sign in front of a white house A man rides a donkey pulling a trailer of hay A variety of sandwiches on a table with photos on it. A dog with it's nose on a couch and an open laptop A young boy is sitting on the wooden bench. Clouds loom over the city skyline with a clocktower in the front. A group of men holding cell phones down at their waists. A guy that has a burrito in his hand and is eating the burrito. Stuffed animals are sitting on top of bookcases. A woman playing a game of tennis on a tennis court. A close up photo of a baked food in a pan on a stove. A white bowl filled with different colored vegetables. A pile of carrots and broccoli next to green onion. a surfer in a wet suit is surfing on a white board A lady wearing a hat talking on a cell phone. A female equestrian is riding her horse in a show arena. A little dog sitting on a wooden bench. A single skier is the only person for miles of flat snow. Someone is displaying a colorful pinstripe wallpaper on a cell phone. A very rusty old car near some pretty flowers. A group of men sitting next to each other holding cell phones. A cat that is laying down on a couch next to a remote. A man is jumping and guarding in mid air while another guy is throwing the frisbee. A bear laying inside a decaying mass of some sort. Two train cars are beneath some trees on the top of an incline. A man is doing a trick on a skateboard. A laptop computer is on a table in a nice back yard. A jar of food on a wooden table. Thee people stand in a lot while one holds an umbrella. TWO BALL PLAYERS ON THE FIELD, ONE RUNNING TO BASE Men standing and one pointing to an object on a street. A man swinging a baseball bat as another looks on. The baseball team getting ready to walk off the field. A small kitten walking on a laptop keyboard. 
A CITY HAS A CLOCK ON ITS BUILDING Train that is very aerodynamic in its appearance A person wearing skis, standing in the snow. Two plates of broccoli are sitting next to each other. A woman sitting at a table across from an entree of beef. A line of food trucks parked on a city street. A yellow commuter train pulling into a station. A large picture of a man with a mustache and a bird on his shoulder. A large group of people at a table using laptops. Bird sitting atop a wooden railing among the trees. A guy holding a cellphone from a display. A view of a street with multiple store fronts. A woman helping a small child on snow skis. a plate filled with assored meat, some fruit and veggiesm and a roll A person crouched over on open lid toilet A man using scissors to cut white paper. Look at how high the snowboarder is in the air. A large yellow and brown boat floating on a body of water. close up of a large stuffed pasta shell and vegetables on a plate A line of bicycles beside a street where a bus is stopping for passengers. A ginger cat sits and looks out a window A holiday cake with holly designs on it. A woman feeding a giraffe under a tent. A clock on the side of a church tower. an image of a girl walking on the sand on the beach A promotional photograph of professional MLB player Travis Buck. There are several hot dogs on this plate along with two sides. Baseball players are watching as a hitter hits a baseball. Several pictures of someone baking using an old school outdoor wood fired stove. A duck swims along a large body of water. Large group of motorcycle riders coming down the street with flags. A horse is walking down the street alone. a small child is playing in a field A blender and a glass on a counter top. A red stops sign stands on a grassy island that has grass and is near a street. A baseball player up to bat during a baseball game. a multi-colored boat with tents sitting on the water A break room with a sink and a microwave. 
A couple of toilets sitting in a bathroom. A locomotive on tacks with smoke coming out of it's stack. A group of baseball players standing on top of a field. Small herd of sheep walking and grazing in fenced farm field. Sheep are grazing on fresh leafy vegetables that have been given to them. An incoming train is approaching a railroad crossing. A SURFING BOARD STAND WITH A PERSON STANDING NEAR BY. A boy that is holding a bat in the grass. A family gathered around a dinner table getting plates of food. An older man is holding luggage outside a transport center A girl is standing next to a horse. A giraffe stands next to a lone tree in a grassy area. A white building sitting below a brown tile roof. Woman in center of dirt intersection holding pink umbrella. A pizza is shown displayed on a plate. A green road sign with a bike painted on it. Someone holds a bottle of mayonnaise near a hashbrown sandwich. a young person riding a skate board on a wooden surface Two cows with heads through bars eating hay. Two large elephants walking behind a wire fence on green grass. A towel rack in a bathroom topped with two stuffed animals. Woman in bathing suit sitting on a beach chair, drinking a soda. Two sheep in a grassy field with a rabbit nearby The person in the bodysuit is surfing a wave. A small plane flying through a blue sky. a old jar that is sitting on the ground Many pedestrians are navigating around a street corner a man in a suit standing in an office An orange cat is sitting on a bag. A landscape photo of a large swimming pool area. A cat outside a window looking at a Buddha statue. A batter has just hit the ball but has not dropped the bat yet to run. Trolleys in the mountains travel through the snow. A photo of a woman sitting on a train on her cell phone. A plane is parked and being examined by several men. A group of skateboarders atop a concrete surface. 
A man with sunglasses dressed in a suit and tie There is a baseball game going on, the hitter is about to hit the ball. People look on as an airborne snowboarder competes. a glass wall to a shower in a bathroom A water skier holds on to a rope being towed by a boat An unmade bed in front of a poster on the wall. Three people on horse back at a rural road intersection. A woman walking around a living room next to a TV. A douhnut and coffee are on a table. A person covered with snow on the mountain with skis Three women sit on the beach with two of them holding onto some umbrellas. A woman in a red bandana slicing a banana. A man is paddle surfing alongside his dog. A plate full of half eaten food with utensils. a person in red is snowboarding on a hill a dog sits in front of a window on a bed A classic building in the background frames a stoplight. A group of men standing on top of a baseball field. Many people sitting under umbrellas on a sunny beach Two zebras standing by a log in a grassy field while people in a car watch. A woman on a court swinging a tennis racket. A person in a red shirt is riding a skateboard. Three different vases are on a shelf. A woman in a red dress talking on the phone. Carrots fresh from the ground with dirt and gardening gloves A fireman is getting water out of a boot. Two men are holding video game controllers preparing to play. Some guys are watching two others playing the Wii. A young person in plaid doing snowboard tricks a iced cake that has been cut up with a server resting on the plate next to it The living room looks into a small, well organized bedroom. Cooked broccoli and beans are a side dish. A group of people standing in the sand with a kite. A group of people sitting around a wooden table in front of a projection screen. A person looks on as two other people prepare to fly a kite. this is a person flying a kite in the water Man looking at a screen while holding a Wii controller in his hand. 
A man with a tennis ball sticking out of his skull. A donut factory with donuts on a conveyor belt A building with a clock tower and a light blue roof. A store shelf filled with different heart shaped boxes. A man is smiling as he eats his passover dinner. A city as the sun sets with a gas station next to a traffic light. Model car sitting on a table next to a slice of chocolate cake. A man and his shadow on a red tennis court while the man swings a tennis racket. A birthday party for a baby with it's parents There is a big room with furniture and items inside. Two wine glasses sitting on top of a table. Two zebras face each other and graze an open field. an image of a guy that is on skiis A little girl riding a pair of skis on top of a conveyor belt. A pastrami sandwich being held by someone A fanciful dressed piece of pizza on a plate. A small Frisbee is lying in the water. A man holding a small white dog while wearing a black hat. some white birds flying over very long grass A red double decker bus parked near a curb Two computers are side by side on a desk. barefoot little boy holding a hairbrush in his hand A boy throwing out a pitch in a ball game. Some sport players are competing in the Frisbee game and having fun. A train platform with passengers and two stationary trains. Two horses trot on a field with their handlers. A white cow makes a face as he stands near a stone wall. An open top double decker bus driving down a street. a desk with a laptop and a monitor and keyboard on top A giraffe standing in an open field next to some rocks. A group of three people sitting on a couch. A vase with ref flowers in it on a table. Poised to slice into an iced multi-layer cake. A bench next to a small pond with a white bird standing in the water. A black cat underneath a umbrella in a room. There are many birds flying near the boat. A room with a wooden desk and matching shelves A clean and tidy kitchen counter with nothing on the counter. 
A couple of girls standing in a livin groom holding Wii controllers. Inside a restroom stall, a rag floats in the toilet water. A closeup of a deep dish pizza in a restaurant, Seven vases sit displayed on top of pedestals. Men are in a life raft which is beside a ship. A large boat with people on the back in the water. A giraffe bust hanging by a Rain Forest Cafe Sign. a man that is skateboarding on a ramp some forks people and a white cake Man riding on the back of a painted elephant. Two women with clear umbrellas stand near two people in uniforms near a building with a thatched roof. A motorcycle parked in front of green doors. a plate holding a slice of broccoli pizza next to a bottle of beer Two mean getting ready to hug each other while standing in a classroom. A young man preparing to throw a frisbee. A man on a surfboard surfing in the ocean. A close-up picture of some food on paper plates A baseball player at home plate with a crowd of onlookers watching Some children are playing game in the room. Twin beds with pillows, and a lamp and vase A table full with a display of cupcakes and donuts. A chicken sandwich and french fries are on this plate. A vintage tennis team posing together on the court. A group of trucks on a mountain side trail just sitting there. Someone skateboarding in the park and doing a trick in the air. a cat that is laying down on some carrots A black and tan dog laying peacefully on a sofa A pole with a lot of street light signs on it. A table with many fruits and vegetables, including carrots, potatoes, squash and apples to name a few. The brown dog is waiting for his owner to play frisbee. A shot from the crowd of a player during a tennis match. The train is stopped on the tracks to pick up passengers. Some hotdogs and plates are on a table. Black train cars on tracks next to trees. A man with a helmet on, on skis at the top of a slope. A vase with yellow flowers sits upon a red and blue table cloth. 
An office desk with several monitors and birthday balloons A sleepy dog wearing a cowboy hat in the back seat of a car A truck hauls a group of tractors down the road. a large clock resting on a poll by some trees An Italian meal with marinara sauce served on a long tray. A town square with a statue in the middle. Large variety of fruits and vegetables on display at a market. The complete perspective of a washroom with numerous things to see. An object that looks like a dog sitting by a miniature cell phone. A MAN IS PACKING UP SKIES ON THE SNOW LAND A group of people holding candles on a sidewalk in the snow. a big yellow school bus shown through the rear view of another school bus A woman on the phone standing in the kitchen with her mouth open A bathroom with shower, sink and a mirror. Player and referee at tennis match on red court. a man holding a bat gets ready to swing it A BATHROOM THAT IS IN SERIOUS NEED OF A REMODEL Father, mother, and young son playing in the water. A man in a tennis match is swinging his tennis racket. A cow resting on the side of the road. A dark bathroom with a white bathtub and a white toilet. The street sign has numerous street names on it. A giraffe walking through a zoo type enclosure. A stuffed monkey sitting alone on a bench. A guy doing tricks on his kate board A red table topped with two plates with slices of pizza. A man and a woman with three dogs read the menu outside of the deli. A group of people walking down the street in what appears to be a marketplace. A red box on a pole with a solar panel on top. A white plate holding a sandwich and fried potatoes. A train stopped in a station with people walking towards it with luggage. Lady standing in front of two couches with a remote control in her hand. A bunch of airplanes are parked on the runway. Several small white boats on the open water. A couple of people on surfboards in the water. A view of a restroom urinal covered in filth. 
the start of a broccoli stalk in the garden A toilet in front of a window, and next to the shower are shown A person on a field swinging a baseball bat. A cat that is cleaning its paws while sitting on a suitcase. A bear lays on a pile of food A pair of giraffes standing in a pen at a zoo. Three men stand in front of a beige building and the man in the middle who wears a hat holds a white Frisbee. Young girl gets ready to blow out candles as family watches A young man doing a skateboard trick while others watch. A boy is eating a slice of pizza at a table. A man holds scissors to his protruding tongue, as if to cut it off. a sign for Bras Basah Road next to a pedestrian stopwalk A man standing in a field holding a small parachute. A microwave oven mounted into the side of a wall. The city street is quiet this time of night. A young boy standing on the top of a sky slope. Several kites of different colors laying on the sand on the beach. Workers in uniforms next to a truck and construction equipment A bowl of vegetables with a silver spoon. People sitting at a table and eating soup. An orange and white cat chasing a feather An Australian Shepherd herds cattle in a pen. Doorway view of a bathroom with a toilet and window. Two women standing under an umbrella having a conversation. A picture of some people playing with a frisbee. A cat playing with a shoelace of a tennis shoe. A man riding down a snow covered ski slope on skis. Two donkeys are standing together. One is facing out and the other one has his head bent. A cat is lying in a houseplant on the window sill A couple of glass items that are in a room. A plate filled with lots of different types of food. The cow is hoping for a way out of the fence. a small white and red plane parked at an airport Three men standing together while on of them handing another one a frisbee. A female surfer stands on her board in the water. The extra long passenger bus is entering the intersection. 
A dog is crouched down beside a toilet looking up at the paper. A bicycle is parked between a welcome sign and a street light. A close up of two teddy bears hanging from two strings on a hook. four sheep grazing in a open snow pack Two men are seen eating something standing on the street a large building with people outside looking around a bed room that has a couple of beds in it this is several zebras in the grass running A girl laying down on the couch holding something in her hand. A railroad train pulled into the station with people boarding A family riding on the back of an elephant across a field. A stop sign affixed to a cyprus tree in a body of water. A bed made up with flowered comforter in a room with two windows. A group of Asian people seated around a restaurant table. Several people in ski gear standing in the snow and in front of trees. one brown cow and one black cow standing in mud A large open living room with a decorative rug. A train moving along a track outside during the day. A man is holding an apple in an advertisement. A giraffe and a zebra grazing the grass. A desktop and a laptop on a desk. A number of signs hanging from buildings. A group of people sitting around a restaurant table. A person on a field with a baseball bat. A jar filled with different types of fruit on a table. A giraffe is standing in a grassy field. A bowl of food is sitting on a table beside a glass of wine. Three men holding snowboards on top of a mountain A lone cow walking in a large field near houses. a toilet a bathtub a rack bottles and a shower curtain THERE IS A MAN THAT IS PLAYING BASE BALL ON THE FIELD A skateboarder with a hat is skating down a ramp. A group of kids at a skateboard park doing tricks A small boy on a guys lap with a toy guitar. A parking meter sits in the foreground before a church and other large buildings. Laptops, keyboards, and other computer equipment on display. A short boy with a penguin backpack stares at a large bear in the zoo. 
A cat laying on top of a laptop computer. A cat is sitting on a wooden surface behind a vase of flowers. A large propeller airplane flying through a blue sky. A baseball player getting ready to hit with a catcher and umpire at a game. Barrack Obama eating a hot dog with his young blond boy toy. there is a plate that has meat and rice on it Tennis players stand together for a group photo. A smaller giraffe is standing in the green grass. A wooden bathroom with a wooden toilet next to a window. A man stands with a tennis racket on turf. Two gulls perch on a mossy concrete wall overlooking the sea. Two little boys sitting at a restaurant table with an adult. A man holding a cabinet in a kitchen. There is a mountain behind the light house. A young woman taking a picture with her phone. A train is parked near a platform at the station. a couple of buses parked behind the other in the street outside some buildings A boy, three dogs and a frisbee in a dried up creek bed Many people are walking around the dock near numerous ships. A living room filled with furniture and a large TV. A white stove top oven with two tea pots on top of it. A small gathering in the living room with drinks being served. A sign saying no drinks allowed is hanging A giraffe is standing with his front legs apart. A room that has stained glass windows separating another room. An instructor pointing at something on top of a screen. a building with a clock tower near other buildings A woman holding a baby near a long horn steer. A crowd gathered for a small-town parade looks on as the next float comes down the street. A brown dog with it's head hanging out of a window. A man sitting on a concrete structure on the beach. A young man riding a skateboard down a curvy road. A living room with a sofa and built in tables. A guy leading a bunch of people in a choir. A large long train on a steel track. The two hot dogs are prepared and ready on the plate. An airplane is flying high in a blue sky. 
A man standing on a field talking on a phone under two colorful kites. The man is carrying the bananas down the road. A sailboat is floating outside on a lake. A wide building with many glass partitions has a front pavement with standing and milling people, some of whom are headed to the open door of a bus also resting on the pavement. Two people are lying in a bed with a computer. A big bus and other traffic on a busy city street. No parking signs hanging on a pole. The man in black came up to the brightly colored food truck. A group of travelers wait to receive their luggage. The motocross driver races down the dirt hill. A very comfortable looking bed with big plush pillows. A blue boat skims the ocean with a crew of several people. A pair of racing motorcycles coming to a start line. A hand lifting a slice of pizza off a pan. Several people are sitting at a restaurant as staff work. A tennis player reacts to hitting a ball. Two giraffes, one is closer and larger then the other, appearing to be curious about the photographer. A banana sitting on top of a white plate. A man and woman look at a piece of paper A skateboarder rides his board at a skate-park. Stuffed animals displayed on table with assorted items. Many kites are lying on the field on a cloudy day. A woman holds a little girl's hand while cross-country skiing a desk with multiple monitors and a laptop A beautiful black and white dog catching a frisbee in midair a group of people walk through a rain storm A group of children are standing in line. An orange sign with black lettering near a city street. A person attempts to para-sail with a parachute. A man is riding a wave on a surfboard Several people on the beach with chairs and umbrellas. Two zebras with one of them laying his head on the back of the other A group of people in a park watch a man in a green sweatshirt and hat catch a white frisbee. 
Horses bumbled up next to each other in an enclosure a male sitting on a toilet with a laptop Several potted plants in front of a window. Man playing tennis in motion with crowd and tennis court A man standing on a tennis court holding a racquet. a bus that is filled with people crammed together A picture of a stainless steel stove that is in someone's kitchen. A surfer in a bodysuit rides a wave. A picture of a toilet taken from above it. a large monitor and a small laptop are on a desk A bench that looks like a round hut. Commuter bus on roadway at night in city setting. Lunch recipe calls for whole eggs baked inside bread, served with tomatoes on the vine. A handsome sink on a long pedestal in a bathroom a man on a horse rides through the streets while others watch A group of black and white cows are on the grass. a toilet a tub a brown wooden floor and a mirror Three cars traveling down a street in front of a large building. A white table that has black chairs in a kitchen. two zebras standing together in a field a by a small tree A close up of a fire hydrant with a skyscraper in the background. A person standing next to a pole working on a traffic signal. This is a cluttered room with alot of boxes of stuff. A toddler pulls himself up next to a toilet A white toilet sitting in a bathroom next to a wall. A bicycle parked near parking meters both covered in snow. A man is jumping near a ramp on a skateboard. A surfer looks back as another surfer catches a wave. A small train is going through a bushy field. A beautiful woman taking a picture with her smart phone. A group of people flying kites over a sandy beach. A man holding a bat on the beach looks down a woman is cutting a fourth of July cake while two other girls watch A table has a handbag, brush, mints, wallet, and cell phone on it. A man sitting on a stone wall talking on a cell phone. A smiling man holds a bunch of freshly picked bananas A person crossing a street next to a crosswalk. 
A small blue car parked outside a house The woman is posing for a picture on the side of the road. two hands are toasting some wine glasses and a person in a black jacket A pizza with spinach on top of the sauce and cheese A group of people walking through building with large umbrellas. a pastry with some powdered sugar on top of it A crosswalk signal with a lighted red figure. Two trains on the track at a railway. A montage of people shaving and cutting their hair. A plate of pizza sitting on a table ready to serve. A yellow fire hydrant is on a city curb. A close up view of a mirror reflecting cars parked on a street. Asian man and woman sitting and looking at cell phones Man removing a pizza from a home oven with a peel. An old restaurant in Lucerne that apparently has wonderful wiener schnitzel The little girl is eating lunch and having milk. A bedroom with a bed under two framed paintings. several young students working at a desk with multiple computers A man cutting a cake on top of a table. A large group of sheep stand near the water all looking down eating A white and brown cow eating grass in a field. Smiling friends posing over a bag of donuts A behind the scenes look at a photoshoot for a bunch of bananas A zebra in a fenced in area next to a man. A man in a grey apron with a sandwich full of barbecue. A large teddy bear with pink camouflage on the street. A very tasty looking pizza sitting on a table ready to be eaten. A black TV sitting on top of a desk next to a couch. Two horses eating grass by a body of water. A beautiful blond haired woman talking on a cell phone. A tourist looks at sheep grazing in a yard These motorcyclists are waving their American and Marine flags Buses and cars stopped at a traffic light. Close up of metal post with a walk signal and a Do Not Enter sign with profane graffiti with building behind. A crowd watches a softball player with a red helmet. A man about to hit a tennis ball with a racket. 
A man doing a trick on a skateboard while people watch. Three adults watch a child holding a toy doll. A cat that is eating some food on the ground. A table with a book camera and shells A dark colored cat standing on a wood floor. Two white ferries passing each other on a body of water. Orange cat walking across two red suitcases stacked on floor. A stop sign on the corner in front of a row of stores. A cat that is sitting near a sink. A toothbrush is sitting on a sink that has the words mystery toothbrush on it. A baseball player takes a swing at a pitch. a small boat in a large body of water a man sits on a bench while holding on to a dog City two way street with cars lined up on both sides. Four different food dishes including rice and chicken. A man is wearing a blue shirt with a black coat and a gold tie. A black cat sitting on top of a red couch. 2 farm cows stand on a baron field Two female skiers are standing in the snow wearing purple attire. A sheep grazing in a field above a pond. a pizza that is in a pan that is on a table A man on a skateboard riding over a hill. A bathroom with a large green plant growing on the wall. The group of people walking in the city have umbrellas up. A pretty yellow city bus on a wide street. The pre-school child is trying to kiss the toddler. A large group of people playing frisbee with onlookers. A black-and-white shot of a woman in a dress holding a tennis racket. We have a distorted view of a bus and a pillar. An upward photo of a man in suit staring in the distance with another man holding a finger up. A small teapot is on a plain wooden table. A kitten is laying on a laptop watching a video. A girl wearing glasses posing for the camera while holding a tennis racket. A woman is painting a green fire hydrant. A couple of people standing in a room. a man on a skate board does a trick in the air A woman riding on the back of a brown horse. 
a little kid is looking at some doughnuts under a display Man in a field walking behind two Clydesdale horses. The zebra and giraffe gaze into the open meadow. A person with a pink umbrella and a suitcase next to a taxi cab. Signs showing different street signs on the corner of the street. a display of a giant bear standing in the middle of a shop A stop sign in an area with grass, trees and small buildings. A man stands near a podium in a gray suit and blue tie. a woman standing in a kitchen while preparing food. a person holding a kitten and feeding it milk A young person is playing a soccer game. Two small dogs look around in the yard. A large truck is parked on a street. A man in the water on a surfboard. A train covered with snow sits in a train station. Some bananas are for sale at a store. A cat sitting on top of a television Slivers of cut, sun-dried tomatoes lay to the left of a pair of food shears there are uncut tomatoes on the right. A giraffe is walking near a fence at a zoo. A commuter train stops at a train station with it's doors open. Wild animals walking in large open field and path. A person is standing in front of a store mannequin in the dark. Several young boys are playing a baseball game. Somebody is having in the peaceful of the picture. Bus, cars and a motorcyle all stopped in the street 2 Motorcycles are sitting in an empty office A female tennis player swinging to hit a tennis ball on the court. A man looking down next to several hanging bunches of bananas. A woman in black shirt and skirt playing a game of tennis. A little girl smiles next to a foil wrapped cake. a zebra is eating grass in a stable an emaciated man wearing tie standing erect showing teeth. a motorcycle that has some sticks on his back A bride and a groom look ridiculous as they stuff cake into each other's mouth. A table topped with paint and construction tools. Lady wearing a hat and sunglasses riding on an animal. 
two people at a bar holding drinks A three story white building with cars parked on the street in front of it. An apple is carved with facial features and teeth two young children in a garden eating greens Woman and her dog tends to the herd of sheep a man riding on an elephant near a stream of water. some people are traveling down the street in a city A stop sign leans to the right at a small town intersection The man is just getting ready to serve the tennis ball. a mechanical robot holding a base ball bat A giraffe bends over to nibble grass in a rock and lawn area at the zoo. luggage is packed and lined up for traveling people dressed in costumes at a ski resort Three people are having a cook off in the kitchen. The skateboarder is learning how to complete his trick. A kitchen counter with a lot of empty bottles on it. People are sitting in chairs with laptops, papers, and cups. A multi-hued teddy bear wearing a royal robe and blue ribbon. A group of giraffes feeding next to a tree in a caged area. A beautiful woman in a bikini surfing with her dog. A black handled toothbrush with new bristles on it. Many different types of small boats on the water. a security officer sitting on a fence while talking on a cell phone and holding onto a segway Several kinds of doughnuts are in a cardboard box. A girl is going to the field with her soccer ball. A dog lying on a couch while wearing a collar. A multi colored train parked on a train track Blender on a messy counter in a kitchen filled with food. A slice of pizza with vegetables sitting on a plate near a drink. A man wearing a black helmet swings his baseball bat. Broccoli and waffles with a mushroom sauce on a plate with a spoon beside it. A pan has a slice of pizza left in it. Toy cars line the parking lot of a toy setup. there is a baby elephant standing in a field with tall grass A humming bird flying over a red bird feeder. A boy and a girl with a blue frisbee. A man on a bicycle passing by a taxi. 
A baseball player swings and makes contact with the ball. This is a large kite flying high in the sky. A man standing on a tennis court holding a tennis racquet. Tall green pine trees in back of large grassy field. A bench that is by some trees and grass. A man is standing in a semi-dark room making a call on a cell. A white vase of flowers sits on a wood table. A few empty boats at a river ride Two men holding surfboards while standing in the ocean. Dog on skateboard wearing t-shirt during parade event. A toilet and sink sit in an empty bathroom. a sink sitting in front of a bathroom mirror A compact bathroom with a shower and a mirror. A shaggy dog lying on a green and blue blanket. Elephant with young rider standing next to adult elephant near parking area. A small bathroom has green walls and beige floor tiles. A plate of pizza on top of the table Someone is frosting a cake that is on a glass plate. A computer monitor and speakers on top of a desk. A snow skier is being pulled by a rope overhead. A table with pies being made and a person standing near a wall with pots and pans hanging on the wall. A large fleet of boats in a large body of water A man wearing a white shirt and tie. a bunch of food is on a white plate A teenager has his feet off the ground holding an umbrella. Two children stand near a large teddy bear. People in a market shopping for fresh produce. A brightly colored quilt on a bed in a furnished bedroom. A train that is yellow is moving down the tracks. A living room that has wooden shelves with many movies on them. A man riding a motorcycle over two cars. A black and white checkered bathroom with toilet four motor cycle cops on a city street A white toilet sitting inside of a red bathroom stall. A surfer raises his arms for balance on a wave Three people on a bench are smiling and waving. 
a group of lambs walk across a grassy plain a person riding a surf board on a body of water A young elephant at a watering hole with other elephants in the background. A white polar bear standing on a concrete surface. A man jumping a motorcycle over a row of parked cars. A man that is doing a trick on a skateboard. A bathroom with a shower, toilet, and multiple sinks. A CAT IS SITTING NEAR A TOILET SEAT A group of four men riding horses holding flags. This tech wizard leaves all options open, equipping his computer area with both a laptop and desktop machine. a black and white cat a hand and a laptop Four people playing a game with a frisbee in a grassy area. A reflective mirror at the junction of two hallways. A very big dining table with some people at it. The surf boarder is coming out of the water. A man in an old-fashioned baseball uniform hits a ball with a bat. People are wearing hats with umbrellas attached to them. a man is in the air riding a skateboard outside A parking meter that is placed on a sidewalk. A stuffed teddy bear and memo sitting on a bunch. two zebras close to one another inside of a fence black and white stripped poles with stop lights attached A dog with his leash attached to a bench A bathroom sink, mirror, soap containers and a towel shelf below. Large statue holding a black and white umbrella. A little, brown bird on a tree branch many different bikes on a city street A wooden and metal park bench sets at the side of a path. There are many traffic lights on this busy street. Pedestrian traffic and advertising in an Italian airport A group of people on street in snow next to cars. A young man who is drinking a glass of wine. A building and cars parked in a lot. Kitchen with white cabinets and refrigerator and black countertop. The person who decorated this bathroom likes cats. A large room has many different planes displayed. A man is painting on the side of a wooden compartment. People waiting to cross at a busy intersection. 
A baseball player standing on the pitcher's mound The street view of an average city street. Electrical plugs are coming out of a box on top of a box. A herd of zebra standing on top of a dirt and rock field. a boy with glasses a cheese pizza with onions on a silver platter Several house barges lined up on a river. A policeman on a motorcycle waiting on the street An orange cat on carpet outside of a door. A picture of some kids playing a soccer game. A bi-plane in the sky on a sunny day. A couple of small white bears on some rocks. A donkey joins a group of zebras around a water trough. A man has a ponytail on top of his head The man is playing baseball on the baseball field. A submarine sandwich sitting on a white dinner plate. A girl in grey jacket and tie standing on a street. A couple of kids hovering over a pizza sitting on top of a wooden cutting board. A woman in a black dress swings a tennis racket a man takes a bite of a doughnut THERE ARE YELLOW TOWELS IN THE BATHROOM HANGING The woman is standing in the kitchen empty. A man on a couch talking on a cordless telephone a group of people at a park playing with a white frisbee Apples and oranges are being sold in a market. A cat is standing on top of TV near a huge bookcase. A man is standing next to a tall surfboard. A city filled with traffic next to a tall building. A red fire hydrant with the paint chipping off, next to a wire cable fence. The room has red wall, white carpet and matching furniture. Rain makes the brick streets shiny and dramatic a close up of a person using a cell phone A cat that is wearing a festive hat. a cozy living room with a couch and two chairs, a coffee table and lamp a baby standing in a suitcase and a mom a plane flying high in the air below a blue sky A giraffe in an enclosure standing by a tree. Train traveling on tracks near populated area near waterway. a man is snow boarding down a hill at night Young girls sit at a table making paper kites. 
a table with a calculator and phone siting on it A man is baking something in a portable miniature oven. A bathroom with blue tile in the midst of restoration. A brown and white cow standing next to a stream. The woman is posing on her bed with clothes. A couple cross country skiing with their dog. Open bottles of various wines on a glass table A very cute small child brushing its teeth. A spotted dog and a black cat hanging out in a bedroom. Two men are playing a video game with a motion controller. A close-up shot of a zebra eating grass. people walking in front of a building a woman filling a bear at a build a bear type place A bathroom with a bathtub, sink, mirror, and toilet paper roll. A bunch of very cute cows going down a road. A purple frisbee is shown flying high above the sand. Two pictures hung on a refrigerator by magnets. Kites flying on the sandy beach on a sunny day. Cluttered apartment with a large T.V. and a great view. a man holding a tennis racket and ball A green double decker bus called "Green Rovers" A man doing a skateboard trick on some stairs. A vase with kanji holds flowers and is displayed next to a purple mug, a mug with a dog, and a white mug. Two women walking on a train platform an old diesel locomotive coming upon a track switch an airplane next to a large body of water Group of table and desktop laptops sitting on a workbench. A small bathroom photo focused on the toilet A pizza with shrimp and basil on a table A black and white photograph of a traffic intersection. A man riding skis down the side of a snow slope. Somebody left the toilet seat and lid up. An adult stands by a young child on a fake cow. A young man eating a sanwich while working on a laptop. A small dog sitting next to a wall in a hallway. A long black train sitting on top of railroad tracks. A computer mouse sitting next to a laptop computer. A pair of zebras runs in tall grass. A picture filled with many things all inside. 
A woman looking into a mirror while blow drying her hair. A woman is giving her dog a bath. A bride and groom are slicing a wedding cake A pack of zebra standing in a field next to an ostrich. Veterans riding in the back of a military truck. A person with glasses on the skateboard as others watch A chef at a pizzeria behind the counter A brown dog in a grassy field with a purple frisbee. A stuffed animal is in a porcelain sink. Group of children and adults playing a video game. A toy train track is set up with two trains, houses, and a tractor. a male in a tan shirt is playing a video game A girl is smiling while riding a gondala. A broken fence seen through a broken window. three men and a woman pose for a picture on the tennis court A man and woman put ketchup on a hot dog bun. A man riding on a skateboard on a sunny day. Brown bear laying down on a log of wood in the forrest. two men holding wii controllers in a living room A woman is standing by a truck smiling at simetjing A bird is sitting idly near some flowers. A woman walks in front of a horse next to a red trailer. The bowl has broccoli, celery, and lemon slices in it. a white stove top oven siting next to refrigerator. A snowboarder in winter gear riding a snowboard and a steep slope that is snow covered. A couple sheep on a steep grassy hill. An outdoor image of a fence at a dog park with a fire hydrant A bear has just taken a dip in the water a person riding a skate board jumping in the air A group of skiers trekking up a hillside in a snow storm. A meter on the street reads a time of zero. A kid is holding a controller on a coach A clock and its reflection placed near a sidewalk. Outdoor table set with wine and breads in the center. White cat sitting on sandy area near walkway. A man in suit and tie wearing a white beanie. An aircraft that is inside of a building. An ostrich in a zoo a long with three zebras. A little boy holding up a packaged electric toothbrush and smiling. 
Many people play sports in a grass field. The banana in the car seat is aging and browning. A traffic light sitting on the side of a road. A close up image of a little girl getting her hair done. Towels stored under a bathroom sink with a glass countertop. A couple of men standing next to each other holding glasses. A person standing in front of a stove top in a kitchen. A person standing next to a building holding an open red umbrella. A cat with a peculiar look sitting on a bench. A man riding between two oxen as they travel through water. Two black crows sit atop two tree branches A man laying alongside of a white toilet near a sink. A bedroom with an almost empty bookshelf and desk Two people holding up cell phones with photos of a young man and woman. Young lambs with adult in fenced grassy area. An airplane flying in the sky during the day. A long-haired grey tabby cat resting on a sofa. There is a long line of cars behind the rearview mirror. A lady wearing a white shirt trying to tie a tie. Skateboarder riding through the middle of park benches. A man with no shirt rides a skateboard over a ledge of a skateboard park. A soccer player in the midst of kicking a soccer ball. The cool dog is riding on a motorcycle. Two surfers walk onto the beach from the water. A man is sitting on a black couch with a cat. A soda can sitting next to a laptop and remote control. A man on skis on a snowy trail. a giraffe looking over fence, at person walking away. there is a young girl and her mother boarding a plane An outside bathroom carved of wood with a toilet and sink. A bed covered in clutter and clothing with blankets. The young person sits on bench seeing the tranquil lake An open laptop computer sitting on top of a desk. An unattended office containing several computers and a chair. Several people sitting around together eating and drinking at a venue. A group of people that are on a soccer field. A woman swinging a tennis racket on a court. 
a person stands while holding on to a pole Skier with backpack down hill skiing in the sun A yellow and red train traveling down train tracks. A walk in shower sitting next to a white sink. This is a cake and a fork in laying in a plate. An apple is being cut with a sharp knife. A motorbike parked on a road with a man. A snowboarder goes airborne over a snowy hill. Person on skateboard in mid air with color lights above. children holding stuffed animals and a parent holding a baby A young girl climbing on a painted fire hydrant Two people that are skiing together in the snow. People standing behind a clock in a clock tower filled with massive golden bells. Four persons are skating on the skate board on snow. A room of chairs and sofa with red stairs next to it. A very large semi truck on a wide road. Someone taking a slefie with a large camera in a large mirror. A window looking out at a brick building A pile of luggage on top of a cart A cart filled with lots of luggage driving down a street. Man wearing riding gear sitting on parked motorcycle. A group of people who are skiing on a snowy hill. A man posing for the camera holding a skateboard. a group of peeled oranges with purple flowers on top of them a person is holding a baseball bat by a brick wall A grey cat sits on an office chair in a home office. Umbrellas litter a sandy beach next to a beautiful blue ocean. Several kites sit on the ground, with a few people in the background. A pink bicycle leaning against a fence near a river. A dock that is separating the harbor from the ocean. A yellow and silver train pulling away from a train station. A microwave or other small kitchen appliance is seen from behind. A piece of toast and grapefruit half is on a tray. A stack of four oranges on a table. a person that is standing in a kitchen next to a icebox Two elephants are walking through trees side by side. Pasta with a mixture of different vegetables sitting on a plate. 
zebras and antelope graze on the planes next to shrubs Game pitching plungers into a toilet in a field. A male is skateboarding in an outdoor skate park near the ocean with many people standing nearby. A woman with purple hair taking a picture of herself in a mirror. a man dressed as jesus holding a cell phone Two brown dogs lying on a burgundy comforter. Two people in a public bathroom painted red. A woman in a bra laying on a white surface. A meal of beef, broccoli, and mushrooms is eaten with chopsticks. Small sailboats are sitting on the water all over the lake. A fire hydrant outside a shop with graffiti. A gentleman is walking through the boardwalk with his surfboard. A cat looking intently out of a window. A sliced chocolate desert covered in powered sugar A grey stripped cat on a table in a room with many books. a tangering sitting on top of some bananas The clock is located near the body of water. A man bending over scooping food into a pan. A park bench surrounded by a green forest of trees. A person that is holding a kite in his hand. a airplane that is flying through the sky over some snow A cat looking out from a box designed like a bus. Some cars that are driving through an intersection. A dog catching a frisbee with a man in the background. a bunch of sports items sit in the grass A city street with business signs on buildings Two people watching a small jet on the tar mat of a airport. A poster behind a gate against a fire hydrant A couple of green street signs sitting above a stop sign. A baseball player holding a bat on a baseball field. Two men are playing ball with some elephants. A pizza in a pan sitting on top of a wooden table. A cowboy leads a cow through a paddock. A man twirling a yellow frisbee with his finger A wide eyed teddy bear with a scarf is sitting on checkered bedding. A man gets his picture taken at a ski resort Old fashioned kitchen featuring a two compartment sink. A group of elephants are walking away from water. 
A little kid with a uniform, glove and hat on during a baseball game. A bunch of oranges hanging from an orange tree. A woman sitting on a bench while talking on her phone. a small couch overed with blankes and pinapple designed pillows A dog laying on its back on a made bed. a black and white photo with two males on cellphones Various types of apples and other fruits at a market Someone getting food from plates with a bunch of different foods on them a male in a red tie and some other people A plate with steak, vegetables, and rice being served. A bed is shown next to a stand and TV. A pair of pizzas sit on trays with ingredients on top Jet airplane parked on a cement runway under a large white cloud. A small herd of cows with halters and bells tied to a cable fence. there is a woman playing with a dog with a toy donut A white bed with black pillows and a patterned throw. A refrigerator door is open and full of condiments, food and drink. a metallic suit case in front of a couch A black cow and a brown cow walk near a motorcycle on a village street. A baby sitting in a chair getting a haircut. The police officer is observing the airplane in flight. Large red bed in room with dresser and futon. a close up of a young baseball player touching his cap frontal view of airplane with cockpit facing on white airplane a close up of two stuffed animals siting on a table THERE ARE PEOLE SITTING IN A WAITNG ROOM a man is cooking some food on a grill A car with a wheel lock on its wheel next to a parking meter. A woman sitting next to a child on a couch. A tall giraffe eating leaves from a tree a man standing at the edge of a tennis court getting ready to serve A teddy bear sitting outside in a chair. Black and white photograph of a man sitting at a bench. A kitten laying on a man's lap while a woman plays with a Wii controller. A young child smiling for a picture, she has a plate of cake in front of her. A man holding a tennis racquet on a tennis court. 
a teenager attempting a jump on his skateboard A man in blue shirt walking on street with building in the background. A table topped with a pizza surrounded by people. these people are waiting for a train at a station A little baby that is sleeping on someone. A group of people gathered together, one holding up an umbrella. People are loading onto an old red, yellow, and green train. Individual plates of sausage sushi with ketchup packets A dog follows a cyclist along parked cars. A lot of red apples are put in a box. A couple of giraffe standing under a tall umbrella. The inside of a bathroom leading out to the hall way and a room across. A guy skateboarding on a big ramp somewhere. A middle aged lady is decorating a cupcake. A baseball player has just thrown a ball. A neat and clean kitchen with cooking range,microwave. Two city buses traveling down a rain covered road way. A fry pan with a mixture of vegetables in it. A lot of food that are growing on a tree. City scene with parked buses and people walking on the sidewalk. A street shows several street lights and an empty intersection. A couch with clothes and items scattered allover a two story bus on a busy urban street Several men are playing baseball on a baseball diamond. A train rides down the tracks near a hilly area. A black and white photo of a dog standing happily on a horse. A woman sits on top of a motorbike. The dining room has four chairs at the table, and a hard wood floor. A kid is sitting on a skateboard with another kid behind them. The people are having a group meal at the table. a desk with a laptop, some speakers and a mouse on it A zebra standing next to a group of three trees. Two pedestrians underneath their umbrellas walk across an open plaza in a rainstorm. Two brown horses pulling a black carriage and driver. A guy in a big grassy field flying a kite. A cat holding a toothbrush in its paw and chewing on it. 
a man that is skiing down a snowy hill A fluffy quiche or pizza is loaded with vegetables on top. A brown teddy bear holding a glass vase in front of a grave. A big commercial plane parked by some vehicles. Two urinals in a tiled bathroom with windows. a man and a woman standing in the living room with her holding a remote A stop light tells motorists to go across the intersection A person is showing their feet near a book and headphones. a couple of large planes are on a runway a couple of chairs sit under a umbrella Four dogs are sitting together on the bed. A bunch of green bananas hangs from the ceiling of an outdoor structure. A young man that is standing by a big pile of luggage. A skier in an orange jacket looks out over a snowy valley. A skateboarder is balancing on the rim of a bowl. A three dimensional rendering of a woman sitting on a giraffe. A yellow cat sleeping on the hood of a black car parked in the garage. a fire hydrant on a city side walk Several young soccer players playing soccer on a field. a woman in a white top some lights and a cake Two medium sized dogs sitting next to each other. A thin pizza is on a plate with a spatula under it. A piece of art hanging from a yellow wall in a living room. A woman and child sitting on the bed with an open book. A group of men playing frisbee on a field A dog lies down and waits on sand at a beach. Three vases of different sizes and shapes all holding pink flowers A dining room features both chairs and a bench. Several just baked cakes on top of a stove two plates some food and a fork knife and spoon A bed that is unmade next to some plants. A man standing next to a smile giraffe. Two teddy bears sit on a rocking chair. A microwave oven on a mini fridge in a room. A city bus coming up at the corner and someone is waiting for it. A couple of zebra standing next to each other on a field. Brown cabinets and dual mirrors and sinks in a bathroom. 
An ostrich watches as a giraffe leans over as it eats some bark from a tree. A pizza cook getting ready to cook some pizza in the oven. A tennis player makes a strong return during a match Teenage girls with skateboards at night in front of a restaurant. A jar of water with a flower inside. Four luggage bags are stacked close to each other. A bathroom with a large mirror above a white sink. An old train is on the track near a small shed. a man holding his cell phone to his ear THERE IS A DOG THAT IS IN THE POOL WITH PEOPLE A bear is swimming in a cold river. an image of a man with other men on skiis a desk with a laptop a monitor and a keyboard A bus parked outside with Asian characters on it. a man getting ready to hit a tennis ball A clock tower with a statue in front of it. Five surf boards arranged in an arc on a grassy area. A double-decker bus with few passengers aboard drives down the road. A little boy against a wall while holding a tennis ball and tennis racket. A skier skiing past a tree at Snowbird ski resort. People walk on the sidewalk near the buildings. A snow boarder laying in the snow after a run Goats and geese standing near each other in howling pen. A man sitting on the bed watching tv A large green train covered in graffiti. A dog is seated in the living room watching tv A man has his hand around a zebra as they stare at each other. Some very pretty zebras grazing in the grass. Motion blur photograph of a busy city esplanade at night A bathroom with a toilet, sink and a window in it. A green bowl of corn and broccoli in a white stew with a spoon and a biscuit next to it. A siamese cat laying on top of a white sink. a person riding a large skate board on a street A flock of sheep standing in a grassy field looking at the camera. Shot of a small bathroom with a bathtub and a toilet. A picture with no head but a suit and tie and flower A sink with dishes in it and lined by various bottles. The umbrella's on the street are decorated with messages. 
The woman is sitting alone on the bench reading a book. A man kneeling down on a baseball field pitching a baseball. An adorable little girl holding a brown teddy bear next to a wooden table. there is a pair of slightly rusted scissors in a rusted handle A guy sitting on a big bright purple bench with some headphones. A group of people standing on top of a snow covered field. A bedroom is bright with colorful accents in it. A large zebra and small zebra are standing by a tree. Pedestrians with umbrellas cross a rainy street corner. A U-Haul truck with a driver sits in a grassy field. A red teddy bear sitting in a chair with potted plants all around. A dining room table with some beautiful plants sitting on top of it. a man in a black jacket standing by a red and black motorcycle A woman talking on a cell phone and looking into the distance A man on a skateboard going over a black box at a skate park. A cake sitting on top of a plate with a knife in it. A wooden table with a remote control that reads "control a woman." Large public transportation bus stopping to let passengers on and off. Close up of white USAF fighter jets in a blue sky A bunch of vegetables sprinkled with pepper sitting beside each other People ridding elephants and one is holding a camera. The side of a truck that has spray paint on it. A large shower head in a bathroom shower. tree are two woman standing in the rain under a pink umbrella A slice of vegetable casserole on a plate. A person with a hat standing by a parking meter. Three motorcycles stop at an intersection at an oriental restaurant. Two people with boards riding a ski lift. Motocross rider going around a bend on the track. A woman hitting a tennis ball on a tennis court. A gray and white kitten walking through a square hole. Two people in a room with assorted luggage A dog is wearing a paper hat with a star. A large bear in a river with some rocks. A giraffe is posing close to the camera in its enclosure. 
A large elephant with a couple people on the top. A helicopter that is sitting with its back wheels on the ground. A male skier dressed in orange and black performing an airborne stunt The contents of a back pack are spread out on the floor. Black and white bags above people on a field. a young man rides a horse down a paved pedestrian area in a town the man is swinging the bat at the ball A sculpture of a man reading a newspaper sitting at a bench. A boy and girl riding bicycles with a small dog. Five people just got off that gray bus. A young male is riding his skateboard in his empty pool. A commuter train passing by a field of wild flowers. A skier cutting a turn on a slope. the hitter prepares the to hit the pitch A suitcase that is packed to the brim with things. A park with trees, bushes, walkways and benches in front of a skyline of buildings. A stop sign on a piece of paper. A woman sits at a table in a wooden cabin next to a lamp A toy model train station with a train on some tracks. An umbrella on a beach with a towel. The bus has the lights on as it travels down the road. A group of people standing on a field under a cloudy blue sky. A man that is standing on a board in the water. A big crowded beach with some guys playing with a disc. The room has a television and sports jerseys. Three buses in a row that are different colored. A mantle with several glass vases of flowers. A man holding a baby girl while seated in a cafe. A comics page from the paper lies on the floor of a bathroom stall. A plane preparing to take off on an overcast day. A cup full of toothbrushes and tooth paste. A bedroom with a bed, radiator and laptop. A man works on an old steam engine train. A large yellow school bus driving down a road through a park. Filtered photograph of a man jumping on a skateboard. Two people next to a bench at a dock above the water. A couple of cats relaxing with each other on the bed. A mom and her kids ride together on an elephant. 
An assortment of shaped kites flying in the sky. Two horse drawn carriages traveling towards a big house. A bunch of big colorful kites flying high in the sky. A parked pick up truck with a flame design on the hood. Jetliner with "Saturn" on the side flying over a body of water A woman wearing a net on her head holding a box in a kitchen. Some old guys in funny costumes on some fake horses. A Kingfisher plant parked at an airport with a food service truck in front. A Eastcote welcome sign in a suburban neighborhood A man standing next to a yellow and orange fire hydrant. A woman cuts a cake at the table with a red cloth. A smiling man with a goatee sits in the backseat of a vehicle surrounded by luggage. Players at center court with camera man during tennis match. Two large white sheep standing on a lush green field. a stop that has been defaced with graffiti A black and red train engine next to train station. A cup with a straw in front of a laptop. The baseball player is sliding into the base as another player is blocking it. A bus sitting parked next to a building with people in it. a plane flying high in the sky on a cloudy day A very shaggy ram and a smaller lamb in the grass Two men that are shaking hands behind a table. A very large commercial air plane on the tarmac. A man riding a motorcycle driving through a mountain side. Some kids are outdoors playing baseball during the day. a single giraffe stands tall in field of bright green grass A group of four giraffes standing next to each other. Bottles, cans, and foodstuffs within a wall's recess s close up of two dogs eating cake off of a table A man is talking on a phone while standing in the street. A white metal piece of artwork in the city. Someone is riding a white horse with a grey mane. A skateboarder heads down a decorated ramp against a panorama that includes an overcast sky, a line of trees and a field of snow dotted with people in winter clothing. 
A man in a large room with baskets and pottery There is a red car being towed on a truck A woman holds a string in her hand on a beach. A white table with umbrella and two chairs on a deck near a railing. Two people cycling on a road as others walk by A giraffe standing next to several tree branches. A woman in a seat is on her laptop. Two women in bathing suits next to a cat with planes flying across A man in sunglasses holding a sub sandwich A close up of a bowl of vegetables containing broccoli and carrots. An open door shows a small bathroom space with a toilet and a shower while a sink sits near the open door. A child hugging a stuffed animal while surrounded by stuffed animals. A plane with stairs next to it sitting in a large lot. a close up of a traffic light on a city street Some guys in a dark room playing a game on a big TV. A cat laying on top of tie dyed pillow. A variety of healthy foods arranged on a table top. A room with a bed, fan and a dining table and chairs. a close up of a cat sitting on a pillar There are flowered vases and framed pictures set against a wall with balloons hanging above it. There are several modern lavatories in the rest room. Traffic light on a long yellow pole in front of apartment balconies. A man in a ski suit sitting in the snow with a snowboard. Several employees are standing behind the bar of a restaurant. A car turns the corner of an intersection in the rain. A red toy train stopped on tracks near toy figurines. empty train cars sit in a snow-covered deserted train lot Three bears stand together near a fence. A woman that is standing up with a doughnut. Bikers and pedestrians populate a street featuring many shops and stands. Long billed bird standing in green weeded area of fodder. A pair of surfers carry their boards along the shore. The person rides in a yellow motorboat with a dog. A boy in a blue shirt catching a frisbee. 
A plate full of meat and broccoli on top of a table A single zebra walking by some water in the dirt. A lush green field topped with lots of vases. A pizza that is sitting on a plate. Men playing recreational basketball on a hot day Two young men and a dog standing on a snowy road. Two people are playing Wii games in the living room. Couple standing in snow on skis posing for the camera The clock has beautiful gold detials on the face. A man holding a kite string as a woman releases the kite. A birthday cake has an airplane on it. a labrador retriever bring a frisbee back for his owner A meat sandwich on a bun with a side of Brussels sprouts. A zebra and a giraffe foraging together by some trees. The red and white train is relatively short in length. A person laying down with a book in one hand and a cell phone in another. a person in a costume standing talking on a cell phone A tall building sitting next to a bunch of trees. Bathroom sinks and a mirror lit by sunlight coming through a small window. There is a giraffe that is looking at something A woman standing on top of a green field next to two men. A male tennis player on a court with a racket and ball. Adult elephant standing near a multi-wired electric fence. Different markings sitting on a bag on the floor. A pair of giraffe are walking in a field in Africa. A group of three zebra standing next to each other. Three vases that are red with flowers on them are on display. The home office features several important business tools. A grey tiger cat staring at himself in the mirror. an overview of a marketplace sale with child toys Several surf boarders at a city wave pool. a man lays down on a surf board as he paddles through the water two teddy bears sitting on a chair and wearing costumes A small single sink in a home bathroom cluttered with items. This painting shows a perplexed fellow staring at a laptop computer. a small bathroom with a sink and a toilet the toilet lid is raised. 
A modern living room in a cabin with food. A bunch of horses are walking two by two down a road in a city with a few riders. A little league batter await a pitch at home plate. An up close shot of a woman wearing a badge on a lanyard opening a banana. A couple of people riding skis down a snow covered slope. a plate of meat and bananas on a table Various different animals that are standing in the grass. A dog laying on the floor chewing a toy while a man laying on a couch watches. A father and a daughter flying a kite in a park. A train on the tracks blowing smoke out of the engine. A group of ninjas wearing all black hold up small white fans. Two people in orange jackets smile as they ski up a road. A Michael Jackson birthday set is shown in gems A woman is standing outside in the snow holding a snowboard. Two zebras cross a dirt road outside a village. A banana laying next to a plastic container with lid. A small living room area with black furniture and curtains. a kitchen with brown cabinets and a big door A guy with a cast does some flips with a skateboard A giraffe towers over thorny treetops in the day. A little boy that is holding an umbrella. a couple of people play a game of wii A living room scene complete with two couches. A family plays with a Frisbee on cobblestones near the water. a dog in a field with a frisbee in its mouth a polar bear standing next to a cliff A snowboarder gets some big air off a ramp. An airplane sits alone on an empty tarmac. A small family of Giraffes are together near a couple of trees. 2 professional tennis players competing in a game of tennis A herd of sheep standing in a muddy pen with a chicken. A slice of cake with a single birthday candle sits on a plate. A bird is jumping off of a branch. A teddy bear sitting on the ground next to a garbage container. a bench that is outside in the woods A lady is sitting in a restaurant eating while holding a jar of peanut butter containing a comb. 
A very fancy wooden mantle clock with ornate design. A large white boat floating on top of the ocean. A dog sitting at a picnic table peeking out from behind someone's legs. A DOG QUIETLY SLEEPING IN HIS BED ENJOYING THE SUN. two males are playing a video game and chairs Black and white photograph of a bowl of apples. Man in a black jacket snowboarding down a hill. Three horse grazing on grass near a street sign. A person in a ball cap and holding a Frisbee with a dog. A bunch of bananas on a banana tree. a big man running to hit a tennis ball a light colored bear in a grassy field A base ball game in progress behind a fenced in park. This is a picture of a kitchen that is also used as an office A man riding a skateboard through orange cones. A table covered with arts and craft supplies. A man riding a skateboard on the side of a rail. A very cute old looking fire hydrant on the curb. A stop sign is standing in front of a palm tree. A man plays a video game as a woman sleeps nearby. A group of people standing in the middle of a walkway. The zebra is walking through the short green grass. Four cows are grazing on the short green grass. a person jumping in the air with a skateboard The mounted officers ride near buildings with flags on them. A pineapple, orange, and bananas sit on a plate in a kitchen. A city street has diners eating on outside tables. A chair and a couple of pieces of furniture in a room that had been burned. THERE ARE CHRISTMAS DECROATION ALL OVER THE PLACE A mirror sits on the side of the tracks of a subway. A Japan Airlines plane waits at the gate while it is towed in. A person holding a wine glass with a dark beverage in it, in front of a television that has a cartoon on it. A steer is walking through the grass with large horns. A young man in a sweat shirt is standing on a wooden walkway. A vase with flowers on the table Two men hold a kite together outside surrounded by chairs. Several men looking at phone in one's hand. 
A piece of pizza sitting on a plate. A polar bear keeping cool in the water. a table that has all kinds of plates of food on it Two glasses vases are next to each other with flowers in them. A snowboarder is in midair preparing to land. A street sign, with two signs on it. A young child that is sitting in front of a birthday cake. A bowl filled with oranges on top of a wooden surface. A kitchen scene looking toward the living room in the background. A very pretty dog laying on a person on a couch. A white tub sitting next to a sink and a toilet. a woman is hitting a tennis ball across the tennis court A bathroom with white vanity, toilet and tub and open frosted windows. A baseball player takes a swing at a low ball. A Jeep towing a boat out of a body of water. A couple of men standing on top of a soccer field. A person holding an electric tooth brush next to a cat sleeping on a bed. A vegetable pizza on the edge of a table An older woman preparing cookies and bread at a table. a photo of a man wearing a tie with a tv monitor in front of him An umbrella is tied to a bike on a rainy day. A sign warning drivers to slow down because of the presence of children. Tired dog rests on top of a teddy bear. A bike parked in front of a red brick building. Three people walking toward a small airplane on a tarmac. Airplane with smoke coming out flying through blue skies. two women out in the snow with their skiis A tray of food in foil and a fork. Cross country skiers are engaged in a race. A basket filled with food and a cup of salsa. Group of cars parked in front of a large building. Several signs posted on a metal pole near a pharmacy. Small boy in yellow shirt holding onto a white frisbee. A person on a surfboard in the water. A large bird is flying over a beach. A black cat with crazy eyes wearing a bib. A man with a suitcase walking in the road. Three giraffes standing in a zoo enclosure with trees. A group of people on a field playing baseball. 
The side of a stainless steel vehicle with large wheels. An adult in a wetsuit surfs a small wave. A beach with people flying their kites in the sky. A zebra walks by an alligator near a watering hole. A kitchen area with a stove, sink and dishwasher. A man sitting down holding a brown dog wearing a blue tie. A suitcase sitting next to the subway rail. a man taking a nap at the end of a bench The bedroom with the bedspread is dimly lit. A woman in white shirt climbing onto an elephant. Two women in skis standing by a sign and trees. an image of a child that is playing tennis on the court A small air craft is heading in for landing. A large black bear standing next to a stone cave. A boy is sitting in front of a laptop. A woman kneeling down next to a fire hydrant with cans of paint. A father helping his child brush his teeth. A photograph of a thing in the picture. A man is standing under an umbrella next to a tent containing clothes for sale. Two small children in green shirts on a baseball field. People walk in a narrow alley way while clutching umbrellas. A horse has a harness on its face. A dog that just caught a frisbee. A cat is laying on a laptop on a coach A red fire hydrant next to the curb with parking meters in the back ground. A kid in a car hiding from a zebra that is poking it's head in the window A man swinging a baseball bat in front of a man with a glove on. The adult black bear is inside of a pool of water. The two green military vehicle are parked in the field A man sitting next to a large pile of luggage. three women stand by an elevator with their luggage The two teens are on the sand dune, racing to catch the frisbee. A road bike rests against a park bench. A living room filled with furniture and a wooden book shelf filled with books. A man jumps his skateboard over a fire hydrant Man serves tennis ball at high speed while other watches. A toilet that is on the ground near a trash bin. A poster that indicates the letter S stands for sandwich. 
An opened stick of butter sitting near some scissors a street pole with a sign on top of it A woman with a child in a carrier standing in front of a giraffe exhibit. Outside view of white horse in the window Two shots of a woman swinging at a tennis ball. A bed above a desk with a computer A half-eaten pizza sits in an open takeaway box. A colorful dish of several fruits and vegetables A sports motorcycle is parked on a gravel road by a river. A very large orange cat lying on the roof of a vehicle. A small very messy rest room with many books. A woman throws a frisbee into the goal in frisbee golf. A very cute bright red fire hydrant by some bushes. A Mack truck parked in a parking lot. Fruit, grain and vegetables have been putted in separate bows. A giraffe walking through a jungle next to a large tree. Man looks at another man that is holding a Wii controller in his hands. A bowl of vegetables containing carrots sitting on the stove. two long lines of boys paddle a canoe A lone elephant walking through the desert grasses. A woman sitting at a table cutting a princess cake. A man sitting on a high chair on a tennis court. This person is riding their horse near the water. A woman holding the head of a horse wearing a bridle. Street signs on lamp post in large city. Chefs working in a kitchen at a restaurant. A man and woman posing with tennis rackets The man talking on a cell phone has glasses on his head. The red bus is driving down the street. The pizza is on the dish and ready to be eaten. Small boy in dress clothing sitting down on a white bench. A teenage girl with black hair and black makeup wearing kandi bracelets on her hand and holding up a sandwich. People are on the beach with water fun equipment. A man with his arm around a woman in front of several skiers. A person laying on top of a bed next to a white dog. A rock wall extends out from a stone building and tower. A pair of scissors sitting on a plastic chair in an office. 
a white plate with eggs ketchup and a fork and a cup a chocolate doughnut on a saucer, coffee in cup. The young woman is selling many types of cupcakes. Three adults on the beach fly a very odd kite. A pastry is lying on a blanket on grass. A white airplane is on a asphalt lot as the sky is covered with clouds. A single giraffe looking into the camera on the plain. A view of a mountain range from an airplane. A family holding ski's posing for a picture on a mountain. A man is on the beach playing with a frisbee. there is a woman that is standing in the snow with her skies A person loading a bite of cake onto a fork. A stop sign by a cross roads on the roads. A family is in a living room playing the Wii. Two large elephants laying down in the dirt. All the items that are going to be packed for a trip. there are many lights that are on in all of the buildings Crowd of people with backpacks line up on the runway to enter the plane A herd of giraffe walking across a field. a vase with bright flowers sitting next to a man usiing a platform two hotdogs topped with a dill pickle tomatoes and tofu a man is holding up a box of doughnuts some people standing around by a table and chairs a desktop computer monitor with a keyboard and mouse The horse is in the water with a man. A close-up of the dirt in a garden with a small umbrella in the ground. A toilet seat with a picture of a dolphin on it. A horse looking over a fence on a snowy day a round window overlooking a parking lot filled with cars tree is a man holding a small red guitar A group of people playing a game of frisbee on a beach. a number of zebras near one another on a dirt ground A plate of food with mushrooms, beans, sausage and two kinds of meat on it. A woman about to enjoy a good lunch of a sub. A small park with benches and buildings in the back round. View of down town in a city and traffic driving on the opposite side of the road. A white bus driving down a street past a semi tall building. 
Hot dogs are being cooked next to bins of toppings. A herd of giraffe standing around a pile of rocks. lemons and limes in baskets in the produce section A man in a white outfit, holding a tennis racquet. Two zebras are walking in front of some trees. several multicolored scarves hanging on a display case. A beautiful woman holding a brown dog in her arms near a refrigerator. A bedroom with a bedspread and a window. a white and brown cat is laying on top of a keyboard A person helping another person fix their skis. there is a man with a pink shirt holding two surf boards A lot of cows are walking on a field. Three zebras that are standing in the grass. A young person riding a skateboard at a skate park. Collection of vintage motorcycles sitting on display at a museum. A group of cows standing on a road with a vehicle looking on. Two elephants walk along the bank of a river. A pine tree branch in a vase decorated with a dove and colorful star. A person riding their bicycle in the rain. A dog sitting in front of a open book. scones sitting on a plate at a cafe A train travel at high speed with buildings reflected in the windows. The man is ready to throw the frisbee. A man and a woman standing their surf boards next to each other at the beach. A small Christmas teddy bear is hanging on a tree. an image of a bedroom bed with a bookshelf in the background Five dessert samples, on clear glass plates, are displayed on a wood spoke wheel. A steamer filled with different types of vegetables. A fan sitting in the middle of a room next to a sink. Woman looking at cell phone while outside in the bright light. A man on a surf board riding a big wave. a person walking with a cow in a parking lot A cordless land line phone is all lit up. A tractor and a herd of cows in a farming field. A woman and two young girls are blowing out a candle. a person standing in a living room playing nintendo wii A burnt pizza covered in cheese and toppings. This woman is playing tennis on a court. 
Street signs at the intersection of Partridge Way and Pear Tree Lane. A room with chairs and a clock and a floor. Window display of a suit and sewing machine. there is a sign that has whoa on it and there as a truck behind it A brown and black cat underneath an umbrella. A man standing on a tennis court holding a racquet. Man on large open area covered with snow. A very long large train at a station. a couple of buildings surrounding a pond with boats A man holding a tennis racquet in his right hand. a sprinkled piece of cake on a pink polka dot plate there is a man on the beach flying a kite A van parked on a road side, covered in snow, ice and sleet. a woman holding a pole skying on the snow A suitcase has been re purposed into a charming bench seat. A large dog sleeps in front of a tv. A vintage image of a lady holding a baseball bat. A row of motorcycles parked next to each other. A man sitting at an office desk utilizing a computer. Three people in suits posing outside of a bus A view of bathroom with a sink, toilet, tub , and mirror. Baseball player wearing protective hat with a bat warming up before his turn. A skier in green snow pants recovers from a fall Passenger train at stop waiting for consumers to load A woman in a swimsuit with a racket in her hands on a tennis court. a dining room table that is in a room A clock tower on the side of a brick building There is a cross country skier wearing full gear An elephant,fanning his ears is standing on the ground. A plate of food with meat and other vegetables. A woman surfing a wave on her surfboard. A lady walking down the street with a red umbrella. A young boy is standing on a skateboard. Pizza, orange juice, and red wine sit on the table. A white kitten is sitting on a laptop computer. Three men are sitting on the couch, one is on the laptop. Two giraffes eating together from a feeding station. A batter standing at home plate has just swung at the ball. 
a cabinet with a coffee pot, toaster radio and microwve A lamp sitting next to a red vase filled with flowers. a skateboarder with white tennis shoes is doing a trick A chocolate style cake with candles on it by a cutting knife. A woman brushes her teeth and looks at the camera. A building with a stop sign next to it with a man on a horse. A woman and child are about to cut a cake A red stop sign sitting under two street signs. A man sitting in field next to a herd of cows. A bed with an orange headboard, a green pillow, 3 regular pillows and the bedspread turned down. There are two people watching another one play tennis. A pan with carrots, apples, meat, and potatoes. A group of cars that are parked on a beach. A sandwich sitting on top of a white plate. A person standing on a sandy beach next to the ocean. Men are standing together outside of an old train. A man is flying kite in the park. Young couple cutting white cake at indoor celebration. Giraffes walking around outside in a wildlife park. a toilet sits inside of a cramped bathroom Two people on hard ground throwing a frisbee. A passenger sign on the tracks at a station. A young man riding a skateboard through a puddle of water. A group of people enjoying a cake and pizza. A herd of zebra standing on top of a lush grass covered field. Two skiers are going cross country in opposite directions, one taking the high road and other the low road. A toy chicken standing beside a flower vase. Cat sitting on top of a chair near door. a mixture of vegetables including broccoli and squash a little kid that is standing next to a suitcase Two pizzas being placed on top of a column of plates with an employee checking the pizza on a stone stove. A man petting a cat that's sitting on a kitchen counter. Man posing in front of a pair of giraffes in background. Plated lunch with condiments and utensils on dark table. An old cellphone stand next to a mug and a statue of Jesus. 
a tie on a pole outdoors in a field of grass The meal is prepared and ready to be eaten. There is a clock on the side of a building A family posing on skis with a young child in the snow. This is a portrait of a bench next to the ocean. Two sheep standing next to each other in the snow. Three people in uniform cutting a cake with others watching. Three bikers in a busy street riding in front of a bus. A woman bundled up in the snow skiing. A person in a purple shirt standing on a couch playing wii A silver train traveling down train tracks next to two men. A chick is siting on the edge of a bathtub. The man has just thrown the frisbee in the air. a group of zebras grazing on dry grass in a large field. A man returning a tennis ball in a tennis game. jockeys riding horses in a fast horse race A seaboard soars majestically over the green-blue ocean. Two men holding hands while holding a snowboard a man standing on top of two horses a woman wearing a wig holding a tennis racket Three donuts are on paper next to a coffee cup. A man skiing is doing a rail grind. Male surfer in wet suit, just thrown off surfboard at the peak of a wave. Red Oral B toothbrush in a blue cup. A pitcher, batter, umpire, and other baseball players on the field Two menus sit atop some colorful decorations next to a green box with lights on it in front of a restaurant. three people sitting on a motorcycle in a street Little girl holding up a sheet of uncooked rolls by oven. two elephants in a encloseur at a zoo A colorful chain with a note attached is wrapped around a parking meter's post. The child is jumping on the beach above a body board. Street signs on the corner of Fillmore and Filbert Baseball players take various poses as a ball floats above the pitcher's mound A sink and toilet in a bathroom being remodeled. two ripe fruits on the floor ready to be eaten A woman is playing tennis on a hard green surface. Young boys playing soccer trying to kick the ball. 
Two giraffes stare at a crane from behind a fence. a bunch of cupcakes stacked up on trays A person reaches out to pet a pony. a close up of a cat sitting at a table A cat's head sticking out of a leather bag. A glove laying on a stuffed animal in the grass A woman is walking through the park texting on her cell phone. A table has potatoes, carrots, onions and broccoli. A group of lambs standing in a grassy field. A boxed lunch with a sandwich, veggies, fruit, pickles, and a dessert. A man sitting at a table at a diner with a basket of food in front of him. A microwave oven with a plate of nachos inside of it. Cat sitting on top of a person's computer. A women in mid swing hitting a tennis ball. Two plates of food in front of two dogs. A baseball player hitting a baseball with a bat. A cluttered desk filled with monitors and various items. A clock on an outside information board with snow all around it Four pieces of a television remote disassembled or taken apart. A couple of police officers in the middle of a street. A white paper topped with square slices of pizza. A spoon is resting in a bowl of cooked noodles and vegetables. Three guys are in the kitchen together preparing some type of meal. A device fashioned to look like a yellow car sits atop the desk blotter. a bowl sitting on a table with flowers inside of it A person riding a wave on top of a surfboard. A black cat with a conspicuous look on its face in a bag. Two hot dogs in cardboard plate one with pickle and the other with cheese. A reproduction steam train waiting at the station The man watches the little boy on the surf board. a black and white photo of children siting posing for a photo A little boy reading his book on top of a toilet. A clock sitting next to a brick sign under palm trees. A person on snow skis is pulling a rope that is attached to something heavy. A man and woman toasting with martinis with olives. 
a public transit bus in a field with a sky background At least nine giraffes live in the enclosure. Four boys with skateboard relax by an iron fence. there are many people gathered here in the snow Several people interacting in a spacious living room. A blue and aqua colored train and people on the platform. A group of brown horses standing on a snow covered ground. The clock tower stands tall and reads almost five-o-clock. A toilet, shower, and sink in a bathroom. A young man on a skateboard near a half pipe A picture of a modern looking kitchen area A row of parked jetliners sitting on top of a dirt field. A vehicle pulls up next to a building. A train on the tracks under a walkway from one building to the next A baseball player holding a bat during a game. an orange caution sign stating fresh oil in the street A lady is playing tennis game in a tennis court. Two surfers carrying their surfboards in the sand at the ocean A Macbook sitting near a clock and a lamp on a desk. A couple of men in skies on a snowy slope A little girl holding a baseball bat on a field. A dog and a cat laying on some platforms. A person holding a cellphone that is opened upright on a table. A man in a suit standing in front of bookshelves. The woman is playing tennis on the court. A fire place sitting below a brick and plaster mantel. A group of men in hats next to planes on a runway. a lady on a horse and people taking a photo A person is watching animals in the wild with a camera. A stop light is shown over a road. A train is going down the track under a bridge. Here is a compact kitchen that uses it's limited space well. an image of a group of people outside for an event A subway train is parked at the station A jet plane flying through clear blue skies. A table topped with food and a remote control. A clock tower with elaborate details decorating it. People riding motorcycles along a street with a lady riding on the back of one giving the peace signal. 
a plate of food with a banana and a sanwich A large long train on a steel track. A man hitting a tennis ball with a tennis racket at the tennis courts. Several people walk up a slope as others are coming down at an intersect. a man and woman are sitting on the back of an elephant A refrigerator sits in a temporary spot in front of a doorway. A herd of giraffes and two zebras are grazing in a field near a fence. a bus stop with a white bus picking up lots of people Something outside the window has captured the dogs attention. The woman sitting in a red chair is smiling while holding a cell phone. a person riding skis on a body of water tethered to a boat a person riding a surf board on a wave A man holding a device and a coke bottle in a clearing in a wood. A large jet sits at the gate at the airport. a close up of food on a plate on a table A person hitting a tennis ball with a racquet. Four people standing on balcony and a parking meter A white toielt with a standing rail in front of it for support Three backpacks loaded with a variety of stuff sitting on a tile floor. a bright yellow 'watch for rocks' sign in front of the blue sky. People are riding bicycles and walking across an intersection. Two men jumping in the air across sand to catch a frisbee. Scissors and material being made into small purse People in a stadium watching some men play baseball. A man has his hand up to his ear as he walks past a bridge. Three people are cutting into a yellow dinosaur cake. some fireworks in the air above a clock tower A red stop sign sitting on top of a yellow gate. A man with black suits next to a surfboard A person with dark hair throws a frisbee. a bike that is parked next to a brick wall Yellow fire hydrant in between two blue posts. A row of boats on a beach with a dog near the boats. A blue and white train pulling up to the train station. 
a woman eating out of a small bowl next to a computer A woman with her hand on a blender on a bicycle Young women playing a game of softball in the hot sun. An old photo of a group drinking in a restaurant. Some sushi rolls, apples and vegetables are in lunch containers. A boy holding a Frisbee on the beach. Two small children hiding their faces behind umbrellas. a boy wearing shorts and tennis shoes riding a skate board A man throws Frisbees in to the dark colorful umbrellas and chairs in the sand on a beach Birds sitting on wires are silhouetted against the yellow sky. A desk with a computer, office items, and CDs on it. a custom motor bike is parked on some gravel A street sign in grass with building in the background. A man in a shop working on some motorcycles. Animals walk around a grassy area together. People walking on a snowy road in a village a person sitting on a motorcycle on a city street A man in a hat and sunglasses eating a banana. A pastry of sliced banana on a white plate. A plate with a sandwich and fries on a table. A very tan man driving a wooden boat on the open water. A close up of a woman smiling while looking at her cell phone. A sign that says stop under a red light. A bay view with a city in the far distance. A herd of horses in a rocky field. A woman wearing red with a red purse while holding her cell phone. A black and white picture of a man wearing a turban walking down a street. Several vehicles providing ground transportation are shown in the photo streetcar, tourbus, classic car and family cars A white fence in front of a house next to a yellow fire hydrant. A clock tower is on the side of a building. The sign on the pole says Wall Street. Two skiers sitting on top of a snowy mountain. The bird is an owl flying low above the grass. A dog and a little girl riding a tricycle. A semitrailer truck as seen in its outer rear view mirror A skier taking a leap off a pile of snow. 
single guy on a skate board skating on a roof top An underneath view on a beach umbrella with a table to the side, and some people in rows of chairs on the beach. a laptop placed on a wooden table in a room A large crowd is watching a baseball game. A man riding a wave on top of a surfboard. Children in a room with many beds A fire hydrant sitting on the side of a road. A red and white airplane is on the runway. Two people sitting at a table across from each other. A group of people at the beach flying kites the man is holding on to a small boat craft in the water A man standing in a sport coat and looking down at his hands as a woman passes in front of him. A covered horse grazing on grass while being fenced in. People are sitting on the ground petting a cat. A train on some train tracks near trees A train passing by fields and greenery on a track. Two zebras on top of a dirt terrain. A group of people standing in line to get on a red bus in the city. a plate that has some food on it A surfer crouching in to a choppy wave A empty, set table in a modern style kitchen. a table with some food and beverages on it Two gentleman in suits smiling and posing for a picture. A young boy eating out of a can. A baseball player standing on a field holding a baseball bat. a cargo train being led by an orange and black engine Two shaggy white sheep together in a fence. A young boy dunking a basketball into a yellow hoop. A person with their pants down next to a smart phone. A couple of people sitting on top of a bench. A stop sign and several other road signs attached to metal posts. A produce section of a grocery store with a wide variety of fruits. There is a gray cat sitting on top of a gray luggage Taco salad bowls full of taco salad and a salsa container. Cat sitting on a bookcase intently watching out a window. 
A child's hands holding a fresh orange with a leaf and twig attached A could people stand around a food truck to get their dinner A cat is standing on a desk in front of a computer. Two men standing on either side of a pink inflatable object. some zebras are standing on a green hill and rocks A man holding food and smiling with a full plate of food on a table. A man standing along side of a truck trailer. A man with a superman custom under neath his clothes posing A man riding skis across a snow covered countryside. a cat laying on the keyboard of a computer A large wall clock on a white wall. Pair of kites flown on grassy area with several onlookers. A person on a racing motorcycle making a sharp right turn. A man holding a pizza above a table filled with bowls food. A man is sitting on the couch and watching TV while holding the channel selector in his hand and a black guitar is sitting in a corner. a man and woman cut into a wedding cake A man playing tennis prepares to hit the ball. Two women are dancing with video game remotes. baseball player swinging metal bat at home plate. A microwave that is wrapped in plastic and is inside of a larger piece of furniture. A couple of people in the water with surfboards. a strawberry pie with whip cream and strawberries on a green plate A couple of men on hot rod motorcycles parked in a lot. A small bathroom features a small sink, toilet and mirror. Close up of the over-used bristles of a tooth brush An open white box of assorted decorated doughnuts A little boy swinging at a pitch during a baseball game. A group of boats that are sitting in the water. Two giraffes are in the enclosure surrounded by a group of people. a dual screen computer on a desk in a room there are three giraffes embracing in the wild A man with a hat in the air with a skateboard. A table topped with breakfast food and a cup of orange juice. A hand holding a mouse next to a laptop on a table. 
Young girl with brown hair and a flowery blue hat in kitchen looking downward a man on a surfboard in the water Many pieces of luggage sitting neatly beside one another. A ripe banana, a pear, an orange and a strawberry. An art exhibit with two chairs and a blue vase. A flower pot that is sitting on top of a chair. Three older individuals with luggage, standing near a sidewalk. An old red VW van sitting on the street Two different slices of pizza on a plate. A microwave in a puddle with leaves scattered around it. Two people run for the Frisbee in a local park A man kissing a woman's forehead while laying in bed together. A dog sitting with a woman looking soulful A man riding a wave on a surfboard. Two children in blue shirts squatting under an umbrella. A monster size truck moving down a quiet city street. a young man playing tennis on a sunny day A white cat sitting on top of a woman sitting on a couch. A multi colored train riding on the tracks an image of a cat that is playing with a pair of tennis shoes Professional baseball player hold a bat and scratching his armpit. a bunch of cars drive in different directions on two sides of a street in a city Giraffes huddled next to a tree in their natural environment. A passenger jet taxiing on the tarmac of an airport. A cow inside a brick building with people looking at it through the door way. Some baseball players sitting in a dugout watching a game Skateboarder jumping off his board on a concrete course. A chapel filled with benches, a book stand, and other accessories. A cute little dog sitting on top of luggage. A young boy playing with a toy oven with a fake plastic sink. A dog inside a pin wearing a hat. Laptop computer sitting on top of a table in a personal office. A boy in a grey sweater is holding a blue kite with a whale picture on it. A small dog sitting inside a red duffle bag next to a frisbee. A woman crossing the street in the rain. Small white toilet sitting in a small corner next to a wall. 
A small family seated at a table in a pizza parlor about to enjoy a meal. A large grey elephant walking through the middle of an auditorium. A cow in a barn cage looking towards a camera. A boat with a long cabin sits in the water close to shore. A baseball player swinging at a ball with a catcher and referee behind him. A white bathroom sink with a crack and a mirror. A baby elephant walking into a pool of water. A bathroom with a sink and several towels on the counter A batter poses with a bat over his head. A grass umbrella and two chairs on a tropical beach. This bathroom has a toilet, tissue roll, bathtub, and two towel racks. a couple of men are standing on a snowy mountain Wearing a red shirt, a surfer rides a wave on a white surfboard. A baseball game in action with a man at the plate with a bat. A toilet that is next to a bathtub. Men in army shorts on skate boards near ramp. A couple play tennis on the tennis court. A brown plush teddy bear holding a heart A small boy holding up a tennis racket A picture of a vegetable that is starting to grow. an open book laid on top of a bed A person that is on his computer on a table. Orange placed in bowl next wet marsh land A man swinging a tennis racquet at a clock. An outdoor garden area with verdant plants and a tree. A purse has a cellphone located in a side pocket. A woman hitting a tennis ball on a court. Several skiers are standing on a snow covered area. a couple of men ride on some horses as they race busy city showing a big blue moving truck with graffiti on it next to a white van. A group of young boys standing on a lush green field. Black and white photograph of a busy city beach A laundry room in a dimly lit place. A woman in a yellow apron ties the top of a bag of popcorn in her concession stand. A red train engine sitting next to a tree. Four zebras stand in a meadow in the black and white photo. a close up of a person holding a hot dog An individual on a kayak riding through waves of water. 
A plant sits on top of a refrigerator in an empty room. A woman sitting next to a child on a large grey teddy bear. A elephant that is standing in the dirt. A sandwich on a white plate on a table. This is a picture of two bowls in a restaurant. an ostrich walking sneakily towards a couple of zebra a woman rides on a bike down a street A big pile of building material is placed on the floor in the wooden structure. A man in a business suit in an office building. A man that is on a bike next to a woman. A group of birds sitting on a horizontal pole. This is someones couch in their living room in their home. A baseball player is bunting the ball at a game A piece of cake with a dollop of cream filling next to it. An outdoor swimming pool has people in it. A picture of some people holding a sign. The bathroom is white with the shower curtain open A television playing on a desk in a room with colorful art on the walls A man steering cattle in a water puddle. an image of a couple that are on the couch An escalator with a guy standing a kayak next to him. A group of people sit next to each other on a bus. A child sitting at a table smiling with its eyes closed . A desk that is cluttered and has two laptop screens. A skateboarder reaches the top of a ramp. Several boats filled with goods sitting in the water. A table full go delicious meals, the closest being seasoned shrimp over broccoli. A living room with a covered couch and coffee table. Man on a snowboard going down a hill. a young boy standing on a surfboard at an amusement park Sunset scene with surfers coming out of the water A dog is staring out over a body of water. a boy is looking at his cellphone in a bathroom An adult teaching a small girl how to play tennis. A man holding tie devices in his hands while he looks at his laptop. A body of water filled with lots of boats. Many people are waiting with bags and possessions. I am unable to see the image above. A red bus is at a bus stop. 
A sheep standing on the side of a lush green grass covered hill. A bear climbing across limbs and fallen trees. Man in black uniform holding a soccer ball in front of a net. A bathroom with mirror, lights, sink and bath tub. A man standing in a kitchen holding a bottle of ketchup and a hot dog. A woman is reaching for the ball on the court. A green street sign near a palm tree in a city. A giraffe is looked at by many people on a balcony. A naked baby lays on a towel in a bathroom and chews on a toothbrush. A dog lying on a couch next to a computer. Several glazed doughnuts in a white box container. A fire hydrant that is sitting on the sidewalk. Man in black and white uniform swinging at a baseball. A living area with a television and various places to sit. there is a police man riding a motorcycle on the street A red fire hydrant sitting beside a lake. Two trains traveling along a snowy railroad track. A woman sitting at a table in front of a pizza. A bird is taking flight during the day. A fire hydrant is partially under a tree. A child on a snow board stands in the snow. People are riding on bikes on a road after it has rained. a man riding a wave with a colorful surfboard The zebras are eating grass in the field. a child practicing his bating in a batting cage A dirty bathroom stall with white toilet and papers various pieces of pottery lining the shelves in a workshop A blender on display next to some small glasses. He rides his motorcycle through a narrow alley. A black and white photo of a dormitory with several beds in rows. Person in yellow shirt playing tennis on a court A man in red jersey standing on a pitchers mound. A man riding on top of a brown horse while wearing a hat. Two giraffes are standing amongst a bunch of trees. There is a woman that is riding a bike A large metal clock hanging with chains from a roof. A man pushing a surfboard with a small boy standing on it A very plain and dull bathroom that's in someone's house. 
a man walks next to a giant bike piled high with garbage bags A man in white shirt riding a skateboard down a hill. A blue street sign sitting on the side of a road. The young catcher in black is throwing a baseball. a food dish containing red peppers, broccoli potato and chicken. thERE IS A CLOCK IN THE MIDDLE OF A LARGE TRAIN STATION A cat sleeping on top of a blue towel. a squat down toilet with a door there's a white building with gold trim and a clock A woman is raising her hands at her desk. A small child sits in front of a decorated cake. a collage of photos with a child near a cake Male and female at a party celebrating in front of balloons. A woman and a man flying a kite against a city background. A white sailboat floating across the ocean over waves. A bus is parked on the corner beside a large stone building. A train loaded with cargo crossing a bridge A photo of a bed that has been made, A flock of birds floating in the ocean next to a cement wall. A man reaches under his leg to catch a frisbee. A bus driving on a brick street a lady in a canoe with fruits and her personal items A display of apples and tomatoes in their own crate. adult and baby sheep walk across a field A train on the train tracks surrounded by greenery. A city street has a fire hydrant, trash bin, and parked vehicles. a stove with a pot cooking tomatoes and another holding a strainer A green and white bus parked in front of a small building. a person riding skis in the middle of a snowy street a group of birds sitting on back of a bench A stop sign on the side of a street Multiple white cars passing next to train at a train station. a city street with a car and traffic lights A group of people that are standing in front of a surfboard. A stern man is speaking in the center of a political rally. Some zebras are standing in the middle of a grassland. A woman with soccer ball playing with two boys next to a fence. An airplane in a very bright blue sky. 
A man getting ready to hit a ball in baseball. an image of two zebras side by side A man laying stretched out on the back of a boat. A country pasture with cows, grass and trees. The woman is laughing as she gets ready to eat the sandwich. A baseball player is in the outfield of a baseball field. Black and white photograph of a man with an umbrella. Two boys sitting next to each other holding stuffed animals. A plate topped with three donuts next to a cup of coffee. Two men are talking to each other during a presentation A woman performing in an arena with her horse. A man is playing Frisbee with a group of other people. A woman standing with a cell phone in her hand. A plate of fruit with bananas oranges and other fruits. Old black and white image of a man starting an airplane propeller. white and green street signs at an intersection next to buildings Two cows in a field are staring at a motorcycle A white faced clock with roman numerals surrounded by a painting. Two men on a dirt path in a grassy field. Several different types of apples sit in white bins. A person is leading a horse with a saddle down a beach. a small child holding a tennis racket with two hands A man surfing inside a half pipe wave. A giraffe walking in a grassy area with a tall bird. A boy and a girl play on the Wii gaming system. a person taking a photo in a bath room mirror A kid in black glasses pretends riding a red motorcycle. Skateboarders are attempting tricks in a concrete skate park. A skateboarder is in the air as he performs a stunt. TWO OF THE SAME PICTURE OF A BLACK DOG BY A WOOD CHAIR Two teddy bears sitting next to a plush hello kitty. a male is wearing a white shirt and black jacket Sculpture fashioned to look like a cat holding a pole. A hand made felt sloth with a button nose. A group of motorcycle races flying down a race track. A fridge in the kitchen of a house with blue walls a lady on her bed with a laptop smiling An old fire hydrant in the middle of the woods. 
a man on a surfboard riding the top of a wave a white counter top in a home kitchen A flowered plate of meat and vegetables on a flat surface. A brick patio with a bench and flower pots. Multi-colored miniature stuffed bears that appear to float at the ceiling. a little girl is dressed in a uniform outside A man at the beach leaping in the air to catch a frisbee. A man is paddleboarding in the ocean on a cloudy day An unkempt bed, with a pillow, a blanket, and a book on it. a plate with a sandwich on it with a side of salad and ketchup a person sitting on a couch with a cat A young boy getting ready to fly a kite with his father on the beach A slice of macaroni and cheese pizza on a plate. A table filled with several different camera's and people sitting around them. A kitchen and dining room area with a fireplace. a couple of stuffed animals sits on a street corner A variety of food dishes are shown on display. A train station with an incoming or departing train. A girl sitting on a stone wall and eating. Two women are on an advertisement on the side of a pink bus. a young woman walking on a sidewalk next to a firehydrant A dog laying in a room near a television and dresser. Baby elephant standing in the grass beside a truck. A small, green bathroom with a sink and a toilet. A person is flying off of they're skateboard a family in the living room playing with a wii video game A man sitting in a chair drinking something out of a cup. Man man setting up a network inside a business. Two men are sitting at a world economic panel. Flowers are in a vase on top of a table under some pictures. Woman in white jacket holding a snowboard in the snow. a couple of people that are playing a wii A bunch of ceramic containers that are on a shelf. A man posing for the camera on his skis A plate of food that includes broccoli and white dough balls. 
A model train countryside scene with a bridge and plants A makeshift bathroom is equipped with a foot landing and a tiny hole for eliminating. Bathroom with destroyed walls, a sink and a mirrored cabinet. Smiling woman standing with luggage in front of her car A television sitting on top of a television stand. A group of sheep are being herded by a dog as people watch. Commode with unusual bowl displayed in bathroom stall. a teddy bear wearing a red dress and shoes sitting in a chair A couple of men playing a game with remote controllers. A laptop with a phone sits on a desk. Corner kitchen with refrigerator and counter space next to table A street scene where a vendor is standing and some ladies are doing window shopping. Three double decker buses are parked outside of a building. Two fire trucks in front of the station. A group of different mopeds sitting in the street. Six men standing on stairs in front of building with large columns. A white swan standing on a lake next to small waves. Clocks on the face of a building below a steeple. A person with a toothbrush in their mouth with a baby. One man leaning on a parking meter talking to another man. A man with takeout sitting on the floor watching television. Woman sitting on the bus with her dog next to her in other seat A female professional tennis player preparing to serve the ball A couple sitting together on a bench in a park near water. A city intersection with several street signs and instructional signs. A stop light and a home built chair on a brick floor several people i the water para sailing near the beach A train is pulling into the station beside waiting passengers. a group of tennis players chatting with one another A red train traveling down a track driven by an engineer. Two signal lights displaying the 'red' stop light. A bus that is sitting on the side of the street. A person in a room with a television and a fireplace. A close up view of an open laptop in a room. 
a desk with a monitor and some remote controls Family poses in front of their house with horses next to them. A man in a suit helps a smiling boy straighten his tie. Green apples, lemons and oranges are in a sink. A view of a kitten sniffing a pair of high hill shoes. Two people sitting on the back of a horse carriage. A woman sitting at a table painting brown vases. A person biking in a roller skating lane during sunset. a little girl that has a big doughnut in hand A serving of meat covered with gravy and a side salad on plate with utensils. Two cats that are looking at a camera. A grey and white cat watches a cup of tea brew. An elephant peers through a wired fence as far as his tusks will let him. Two men working in the back of a pickup truck. A woman holding a red umbrella in the rain. A commercial airplane with the door open and people walking in. Black and red bird standing in front of a caged in area. A man with a piercing in his left ear smiling. Some cars are stopping at a stop light. A couple of women standing next to a couple of soldiers. A women sitting in front of several laptops looking at her cell phone. A tennis player poses, racket in his right hand, left arm behind him. A group of people riding horses in a line along a trail. Three horses are in a pen and they are blind folded. A red fire hydrant sitting in the middle of a green field. A hand holding a pair of scissors next to a chair. Some people that are hanging outside my car. A plate full of food with potatoes and cheese. an airplane is flying past a large city A cow standing in the grass with a tag in its ear. The legs of a person resting on a train with a backpack nearby. Three people look at paper work in a hospital room. A hot dog sitting on top of a bun in a wrapper. a close up of a plate of food with broccoli A blender with a mixture in it sitting on a counter. Several different kinds of donuts on a tray. 
A bunch of airplanes parked at the airport A person snowboards down a large snowy mountain. Two men and two brown horses pulling a cart in barn A man plays tennis on a tennis court. A lonely zebra galloping through a wildlife enclosure. A little girl puts something into her mouth while looking at the camera A small very neat kitchen near a bedroom and another room. A man paddling a surfboard on a lake. A wooden bench on the side of a trail has a backpack left on it. A man drinks wine while another man chops vegetables. A bunch of street signs sitting on the side of streets covered in snow. a train traveling on an elevated train track. A bridge and clock tower are lit at night. Someone takes a photo as they stand in a bathroom, near the mirror A train that is parked in front of a large cruise ship, with a blue crane next to it. A motorcycle stands in an exhibits beneath some roofing. Many flat bottomed boats on a swampy river. A computer station with monitor, keyboard and personal items. A bathroom with a double-sink and some mirrors. A baseball game in progress with the batter starting to run. An industrial type bathroom with an open shower. A yellow dump truck that is near a building. A boy with a kite in his hands in a grassy field. a woman standing outdoors with a cat on her shoulders A LOAF OF BREAD IS ON THE TOP COUNTER A black and white picture of dunes, two benches and a trash can a group of men play soccer in a dirt area A skier skiing down a slope wearing a dark snow suit. a male skateboarder in a white shirt doing a trick The woman is talking on her cellphone while walking down the street. A young man riding a bike past a car while talking on a cell phone. A motorcycle sitting on top of a wooden book shelf. A man and a women who are running toward a Frisbee. A black dog running across a green field with a frisbee in it's mouth. a dog that is sitting in front of a frizbee The reflection of two men in the mirrors of a public restroom. 
A cake donut sitting on a plate at a bistro. A boy pouring some drink into a cup at a counter. a group of giraffes sit inside of a caged area a man is sitting in front of some food at a table A bird flying into the side mirror of a red vehicle. A skier is posing in front of the sunset. A large sheep grazes at a countryside farm. a photo of a man over a table of food smiling at the camera A man kneeling down next to two large dogs. A landscape of some mountains with a plane flying above them. A pan filled with food sitting on a stove top. some baseball players are playing a batter and catcher Men with suitcases at an airport ticket counter. Man in the motion of running and throwing a frisbee from his hand. Man with a yellow jacket riding a scooter. Man standing holding a remote control towards a component. A plate with a brownie and vanilla ice cream. A family gathers around a table with cake and beverages on a deck at night. a number of people holding surf boards close to one another a man is standing in front of a table An old classic red truck is parked in front of bank as a man stands near the window and a woman stands in the background. Two elephants walking near a pool of water and a forest. A woman carrying a cake with lit candles towards a young boy. A man making a phone call has no shirt on. some white sheep are eating grass on a hill Two girls posing for a picture with painted on neckties. A clock and two vases sitting on a small table. A woman with a stuffed animal on a train platform a glass of wine a table with dishes of food A man grabs the back end of his snowboard as he soars off a jump. A crowd of people are standing in line. A cake cover is made to look like a wire birdcage. Back view of three men on a baseball field. A boy is hugging his stuffed animal toy A construction working holding a stop sign while standing in the street. Two ladies are riding horses on the beach. A baseball field filled with players and an umpire. 
A white bowl filled with lots of ripe bananas. A man taking a swing at a tennis ball There is a country styled kitchen with wood flooring and white walls. a plane at the airport landing and people besides it An office with a desk and chair with the door open. Three urinals in a restroom each urinal is at a different height to accommodate adults and children. a man riding a snowboard into the air. a person holds a horse that stands on some beach A man looking into a refrigerator door for ingredients. Someone holding on to a dog collar while the dog has a frisbee in his mouth a big window showing the reflection of a building across the street The men are going to ride their bikes in the dirt. A white polar bear is laying his head on his paw. Sandwich sitting on a plate next to a glass of juice. A living room with a Christmas tree beside two couches. A man in a courtyard reaches out to catch a Frisbee. A red and white stuffed animal with a tv remote in a bed. The bathroom has a toilet, sink, and mirror in it. a baseball player that is at home plate with a bat An overhead view of a man sweeping the street by a sidewalk. A black and white dog looking out a window. A black cat with white paw laying in a hanging cat bed. A woman with long blonde hair wearing a men's neck tie. At the birthday party there are plenty of snacks. A man jumps to catch a frisbee with two hands A dog is asleep on a white blanket. A group of giraffe standing next to each other in front of a building. a black and white clock on a pole a building and a flag an old and nasty bathroom with a toilet and shelf A person lying on the ground posing with a snowboard. a man and a woman along with a baby sit an watch a lap top A duck and elephant stuffed animal sitting next to each other. A cat is sitting on top of a toilet seat. A person holding a hot dog with yellow mustard and onions on it, at a sports stadium. The back of an Apple iPhone with the front on the table. 
A woman holding a container while milking a cow. A bird sits on the thin branches above colorful leaves. A herd of elephants walking along side of a river. A man wearing red is skiing down a hill. A red train with cars traveling with a mountain in the background. Some people in a very big area flying some kites. two guys play firsbe on a grass field a red double-decker bus next to a bus stop. a group of people that are posing for a picture A historic clock tower turret still keeps the time. A lounge with chairs, shelves, and a fireplace A living room couch with a display of large mirror and flowers. there is a large truck that is carrying many things on it A window in a room with different shelves nearby. A woman buys a bunch of bananas from another woman. A salad and a partially eaten sandwich on a plate. A man looks at what he is currently holding in his hand. A white cat that has yellow eyes looking straight ahead. Two zebras are battling each other on hind legs. three surfers wearing we suits are riding the same wave A city sign that is underneath a stop light. A woman in a striped shirt in the kitchen next to the fridge. A man wearing skis standing in a victory pose. There is a truck parked on the side of the road. a man reading the label on a food package A person who is standing up holding a frisbee. A couple of kids standing next to each other. a man that is standing in front of a stop sign A person reaching for a wii controller A chocolate cake sitting on a plate with ice cream A square white plate is holding a vegetable heavy entree. There is elephants both young and old on this African bush land. a baby and a bear play on a sofa A giraffe in the middle of the street blocking traffic. people with a carmel at the beach playing a plate with some pizza, salad, and some sauce on it A baby sleeping next to a brown teddy bear. Modernistic couches and chairs surrounding a big-screen television. 
There is a box that has a lot if wired inside of it Two guys are sitting at table. One is looking at a cell phone and a computer. A living room with blue seating and wooden tables and cabinets. A refrigerator that has its door closed and then opened. There is a person sitting at the tablr A woman laying on top of a bed in red shoes. three zebras standing next to each other looking into the camera A man laying on top of a sandy beach laying next to a surfboard. A snow boarder jumping off a ramp at night A small boy laying on the ground with a large stuffed animal. A BOY WITH A BLUE SHIRT AND JEAN PANTS DOING A TRICK WITH HIS SKATEBOARD A few cars are parked in a parking lot at night. An apple is being cut into slices on a cutting board. A series of little weird cars in fron of an european arch. A picture of a very green plant and red flower. A large organ van is parked next to a smaller van. A bathroom has a diaper changing table in it. A woman standing on a blue mat with two broken tv's and a bat in her hands. A white bridled horse carrying blankets in the desert. a bunch of bears that are in cases A person riding a wave on top of a surfboard. A male surfer on a white board in the water. A person lifts a slice of gooey pizza A large white polar bear walking through the snow. a close up picture of President Obama A bear that is going towards some water. A beach with flags in the ground and kites overhead in the sky. Man sitting on the floor with a case full of pamphlets. A man is poised to hit a tennis ball. The building has a large clock on the front. A group of people sitting around a table with glasses of wine. A sandwich on a plate and full wine glass are under blurry lights. A street lined with buildings and red double deck buses. A large white parked airplane and some trucks A white sink and toilet in a room. Three black bears on rocks on the side of the river. An online game player playing while two other men look on. 
A bedroom with a desk, bed and entertainment center. A group of boys playing with kites in a field. A zebra eats grass with another zebra beside them and a third zebra nearby. The man is riding his dirt bike on the street. A sandwhich in a deli tray, with a soda and a book sitting next to it. a lady driving a wagon with red spoke wheels being pulled by a horse a pedestrian traffic light with street name and pedestrian crossing signs A painting of a woman holding a Frisbee. The table has meat and donuts sitting on it. A man walking on the sidewalk with a cart that is piled with a stack of luggage. A woman standing on top of a tennis court with a racquet. A couch and a table in a room. Two giraffes lick a branch on a grassy field. A clock tower in an open space with decorative plaques under the clock. A dog is hiding half under a bed with its nose and rump sticking out. People posing with a white two door refrigerator A long train traveling across a road on train tracks. Cars are driving past two tour buses on the road A calico cat lounges in a blue chair in a home. A kitchen sink with kitchen utensils in containers. A double-decker bus is parked in a large field. Dessert on a white plate next to a silver fork. A woman reaches to hit an approaching tennis ball. Guy holding his mug why sitting in front of the computer an iron bed with a hand made quilt on it A stuffed pink teddy bear laying next to a doll in a dress. some baseball players are playing baseball on a field Two cows with big horns are on a dirt road. a woman is taking her surf board out to the sea a person on a skateboard in front of a car on the road A ferry docked at a ramp with people exiting. The two snowboarders are relaxing at the bottom of the slope. Two men sitting on the backs of horses in a field. a city bus drives down a city street Two towels arranged in a heart shape on a bed. 
a boy is holding a tennis racket outside Several cows laying down in a hilly area near a body of water A traffic light flashes green against the backdrop of a city. A man with an umbrella and other pedestrians walk down a street. Four people brushing their teeth in a bathroom. Player preparing to return volley during major tennis match. A bathroom sink with a towel rack, soap bottle and an air freshener sitting next to it. A man is riding a skateboard over a ramp while wearing a helmet. A group of guys playing basketball on a city street Many different types of toppings on multiple pizzas. a living room decorated in beautiful red, white and black oriental imagery with vases and scrolls A giraffe eating something out of a persons hand. a bunch of giraffes are in a large pin A man holding a metal cup on top of a wooden table next to a window. A cat standing in a laundry hamper looking down. A motorcycle parked near a curb with a man on a bicycle riding by. There are two men preparing their boards for a sport A little girl standing on top of a wooden chair. A man standing next to a pair of sheet while biting his clothes and holding a meat cleaver. A zebra walking across a dry grass field. a close up of two people talking on cell phones A brown bear walking through an enclosure. A cheese pizza made with mac and cheese and flat bread. An airplane monument placed beside of a road. A pile of ripe bananas sitting on top of a table under an umbrella. An elephant and a bunch of cattle at a watering hole. A man on his phone in front of his laptop at a cafe A banana and some sliced cheese are on a cutting board. A tan muscle car sits outside a home on a gravel drive. Steamed white rice and a variety of dishes for lunch Two people are riding on top of some elephants. a man standing on a surfboard inside the water a couple of men are playing tennis on a court THERE IS A HORSE THAT IS EATING GRASS Adult baseball player preparing to throw ball from infield area. 
two men in the park playing with a frisbee A very tall building that has a clock. Large elephant walking forward down a dirt road. a person riding on a horse behind a fence a close up of a young child eating something A kitchen with cabinets, wine glasses and a refrigerator in it. A small bird sits among a bunch of branches. Two bags are full of fruits on the table. A person reaches to catch an incoming Frisbee. A cat curls up on a soft and comfortable bed. a train is preparing to leave a train station A boat with equipment on it riding through a waterway. Warning signs outside a fence at a transit station The young person is jumping over the back of a blue bench. A bunch of very cute fluffy sheep in some hay. A clock that is on the side of a building. A couple of men moving a large book shelf A man at a podium with another holding an umbrella over him. A smiling woman sitting on a motorcycle in front of a building. an overhead view of many people on motorcycles A cat on flora fabric with Obama on tv behind it an old car sitting on the side of the road There is a close up picture of bread and eggs A red scooter is parked on the side of the road. a bird on a beach with a ship in the back ground A toll booth next to a highway at night. An owl sits in the grass with his eyes shut. a person holding a skateboard riding an escalator A herd of sheep with two sherds moving down a road in the mountains. A white and black motorcycle sits in a parking spot. A flock of ducks swimming across a lake. a woman clips her babies finger nails off A kite that is sitting up against a house a big brown bear with two young cubs A peacock with very large feathers walking down a street. A watermelon pound cake with icing with a slice taken out. A cat is snuggled up in a black backpack sleeping. Two motorcycles parked outside a building on a busy street. 
A girl throwing her frisbee so her dog can go catch it a bunch of cars sit parked down a side walk A plate of food next to glasses and bottles of wine. A woman standing on a beach, holding a kite. A huge bathroom with a large window overlooks the ocean below. A man in a suit drives his car. A plate is piled high with a meat and broccoli entre. Wine glasses and several items used in photography sit in a studio. A couple of people standing on a beach holding surfboards. A man para glides on the water near land. A bear sticks out its tongue while climbing. An array of vegetables including tomatoes, turnips and others. a room with drawers full of books and a screen People riding a sky lift watching others ski down the slopes. Kids plastic tools and toys on a table. A large living room filled with art pieces a bathroom with a black counter and a big mirror a living room with a big black couch in the middle of it a white horse with a white cover and some grass A woman wearing fishnet stockings sitting on a bed. A bunch of people are sitting together eating pizza and talking. A yellow finch perched on a white fence. A crossroad displaying the signs for Creek Road and Amethyst Street. A crowd of people are watching two teams of athletes perform. A woman in a living room playing a game system. Three people in the water, one of a surfboard Two green freight trucks parked on the side of the road. A brown stuffed animal dog with a black collar sitting in front of the mirror. Bowl of oranges on a wood surface with more oranges on the side. A shirtless man riding on a large motorcycle on the beach A tennis player is being watched by a crowd. The man has his hand on a rack of small yellow objects. Man flying a kite from a roof top in an urban area. A man in a black dress jacket is talking on a cell phone. A very attractive young lady using her cell phone. A dog is sitting under a bench outside A little boy that is standing in front of a counter. 
A cameraman taking a photo of a skateboarder in action. This black and white photo was taken by water. A man standing on top of a sandy beach near the ocean. A black bear sitting on a rock surface. A group of people surfing in some water. A red fire hydrant is leaking onto a side walk. A man dodging a frisbee flying at his face. Three giraffe standing next to each other at a zoo. A little girl standing in front of tall wooden doors next to a dog. A man with a tie and glasses is by a house. There are people camping and flying kites in a field. A group of people standing on top of a dirt field. A boy smiles at his friend while his kite soars high. A baseball player that is standing in the dirt. A cat is lying on the hood of a black car. A couple of birds are flying over the beach Two black cats are casually laying on a computer desk. An assortment of food and four wine glasses. a person is pulling apart a eggplant Several streamers float above people on a beach a person standing on skis on a snow covered slope. A silver colored video monitor sitting on a gray table. A stop sign and a no u-turn sign. Skiers of all ages skiing down a slope and gathering at the bottom. A boy and two girls taste testing different vegetables A metallic refrigerator freezer sitting in a kitchen. A cat that is sitting on top of a speaker. One of the giraffes is peering into the building. Large dog laying down on a blanket next to a table. A crowd watches a batter in a baseball game. A man standing in front of a pile of food under an umbrella. A breakfast of bacon, waffles, and fried banana slices A crowd of people sitting in a room on to of a wooden floor. A white toilet sitting next to a white sink in a bathroom. An umbrella and camera equipment sitting in the corner. Giraffe standing tall in open grassy field with fencing. A man on a surfboard on the waves surfing The view of a busy urban area at night. A family stands at the top of a mountain while skiing. 
Man in a tiger suit in front of another man on the phone A table with a chicken sandwich and a cellphone. A view of a modern building with skylight and a fire hydrant. A baseball team in the dugout preparing to bat. Man in boxers on couch with two laptops Three limes are next to a small bushel of bananas. Four seagulls are standing in a line on a large logs in the middle of the sea. A pitcher partly covers another baseball player during a game sponsored by Comcast. The man is skiing down the snow slop. A young boy eating a custard covered donut. A sandwich on a white plate on a table. The foreheads of two zebras standing side by side. A girl on a bench outside a salon checks her phone. A long commuter train passing by a train station. A dog running in the snow with a Frisbee. Four horses and a man with a hat sitting on one. A plate filled with meat and different kinds of vegetables. a living room area with a two-person couch and various living room furniture there are many different donuts on a yellow plate A white plane getting ready to take off on a runway. Two people in a small boat floating by some greenery. Three adult and one baby giraffe standing outside. A table topped with coffee cups and plates of food. A bathroom with a toilet, sink and bathtub. A man has laid out all of the items he plans to pack. A ski resort area with various skiers in the snow and several in line on an automatic transport belt. A bird on a table drinks from a tea cup. A man standing in a dry field, holding a Frisbee. A person on a cell phone on a street. A bird floating on top of water in the rain. A man in black jacket skiing down a hill with a kite. A green suitcase sitting on a wood floor. A bathroom area with a toilet, trashcan and tiled floor. A skateboarder coming up out of a dry pool. there is a small lap top surrounded by other things A dog and man rest on the bottom of an overturned boat sitting on the bank of a body of water. 
A picture of a man wearing a suite and tie in a picture frame. a bride and groom a purple table and a purple and white cake A group of people walk down the pa towards the beach Two sailors are shown walking in a parking lot. There is a dog wrapped up in a blanket. People ski at a ski lodge during a snowstore. Cross country skiers in a competition with number 33 in front A white bathroom with a sink and mirror next to a shower. a plate that has a table full of food A person in a purple jacket is on a snowboard on a snowy hillside. A surfer carries his board as he runs through the water. a bath room with a toilet and a sink A young baseball player is getting ready to hit. Two dogs plays together on the ground in the dirt. The passenger train drives around the curve of the tracks. A man and a young woman walking down an alley way. A man swinging a baseball bat at a ball during a game. A man riding on the back of a parked motorcycle. A birthday cake with a number one and three candle. a child sitting on a car eating a hot dog A large colored bird perched on a power line A man with a surfboard walking along a beach. A red double decker bus on street next to buildings. A plate of Mexican food with beans and tortillas. A gold and blue clock that is on a building. a person laying on a bed while reading a book The crowd of people are looking to fly their kites. A toilet stall that is white all around. a large green and yellow train on a track Cars are parked on the street next to an old fire hydrant. A close-up of a hawk with a group of people in the background. A newborn foal nursing his mother in a corral. Dump truck alone on road with buildings and bare trees and shrubs behind it. Two men in the Navy cut a cake shaped like an aircraft carrier. A woman riding a wave on top of a surfboard. there is a plane flying very high in the sky A crowd of people walk along a sidewalk near a busy road. A person is standing at the edge of the water on a beach. 
Adults and children gather near a dock on the beach. The ride attendant watches over the wave park. A residential street with large houses during sunset. A street scene with two men napping on a bench, a woman walking, and two other men looking at their own reflections in a shop window. A red stop sign targeted specifically at bicyclists. A large plate is adorned with broccoli and a rather small piece of meat. The man on a bicycle is using a cell phone. An Asian lady in a red dress petting a small elephant at a zoo. A small bird on a sandy beach near the water. a close up of a plate of food with broccoli Two people standing at a food truck placing an order. A red stop sign next to a street corner. A woman holding up a fairly large pizza. A plate of vegetable stir fry with sauce. A batter and catcher assume their stances as an umpire looks on. A dirt bike rider is racing through the dirt track. Two giraffes standing next to one another and interlocking their necks. a white bathtub in the center floor of a bathroom with a sitting chair and a window with drapes. A bus parked at a stop beside a small home there is a man that is throwing a frisbee between his legs A man eating a piece of pizza at a table. a person walking holding an open umbrella People and dogs sitting in a boat floating on water. Two flowers are allowed to grow in a beer bottle. Girls reaching for the basketball in a gym Ocean fairing ship near land seen passing markers. a couple of bowls with some food inside of it A man playing swinging at the ball during a tennis match in front of spectators. A couple of single beds with a phone and remote control by them. a small child stands in a tennis court, about to serve a tennis ball a girl in glasses is sitting at a laptop A wooden surface with three frosted doughnuts on the top. 
A tiny suit case full of girl's doll clothes a city bus parked on the side of the road a person jumping with their skateboard by some stairs An image of a city skyline taken at night. a blender with mixed fruit sitting in a container A shower stall set up with handrails and a seat. A slice of chocolate cake is on a small plate. Two photos of a living room- one without a ceiling fan, one with the fan installed. Young girl walking up steps to dog at pier area. a cat that is standing on a red chair A man standing next to a truck parked on the side of a road. a building with a large clock above an archway Birthday cake with a three candle and six other candles. A young precocious girl clutching her teddy bear. Five baseball bats on a silent auction table. Two brown cows looking at the camera. Two people laying on a green bunk beds A silver bin holding different kinds of vegetables. An old, rusting, yellow fire hydrant n weeds. A stuffed animal is standing on a table A computer that is turned on with piles of paper to the side A person riding a board through the air. A dozen doughnuts sitting in a box and ready to eat. Close-up view of skateboarders lower body performing a trick on a high wall. A table with food and a drink on it Skiers make their way down the trail through some trees. a bird sitting on a shore next to a lake. A motorcycle with a side car parked with other motorcycles. Glass enclosed shower with white tile walls,brown floor A man riding a board while hooked up to a parachute. People are standing in front of a castle type building with an eerie gray background. A small boy cutting out things from paper at a kitchen table. A dog is sleeping on the bed and having fun. A set of lights on a light blue motor vehicle. A man is standing in the water next to a boat. A small bathroom with a commode and sink, and empty corner. The refrigerator, stove and microwave are on the same side of the kitchen. 
a man on a skateboard performing a trick at a skate park A cat sitting on a motorcycle that is parked in a driveway. A dog leaps in the air to catch a Frisbee. A man in a grassy field throwing a Frisbee. Statute of a horse and rider on top of a block wall. a black cat laying on a bed with a colorful blanket Three trucks with lawn mowers in the bed and people near by are parked side-by-side. A man brushing his teeth with a tooth brush. there are two sandwiches that are on two white plates A stadium full of people are watching a baseball game. The large bathroom has two beds in it. A group of people riding skis down a snow covered slope. A man doing a trick on a skateboard in a park. A man holds an umbrella and looks over a flowery hill to the sea beyond. A man talking on a phone while standing on a corner. A girl holding a tennis racket up with both hands A woman looks over her shoulder as she pauses while cross-country skiing. The stands are full as a man in a blue and white uniform holds a bat in front of a catcher and umpire. A pug dog with a pirates hat licking a bottle. A living room area with eclectic furniture and accessories Two birds sitting on top of a rear view mirror on a car A man surfing on a green surfboard in front of mountains. The man on the grass is playing with his soccer ball. a dog on a table on a porch A bathroom with yellow walls and a picture of man over the toilet A vase filled with flowers sitting on top of a table. a blender with a bunch of food inside of it A pizza cutter slicing up a food item on a cutting board. A wooden table topped with lots of veggies and greens. A horse-drawn carriage ride stopped at the gates of a European castle with three towers. A hummingbird hovers near a bird feeder. A man in his ski gear is in the air. A small bird perched on a metal bar next to a tree They are selling a bunch of bananas at the fruit stand. A horse walks through the grass near sand. 
Dinner plate with prepared steak, broccoli and sauteed mushrooms A man in a blue jersey swinging a bat on a baseball field. Three horse drawn carriages in front of a huge house with a clock on it. A view of a small plate of food with a orange. Bananas and coconuts are sitting on an old fruit stand. Guy in a hat flies a kite on the beach while other people are in the ocean Group of kids eating some food on a table A woman sitting down next to some bananas. Men hitting ball with round discs near brick building. A man that is standing up in a grass field and holding a kite that is over his head. Flat pizza like object sitting on table with a person taking a slice A photo taken within a sleeper car on the train looking at the window. A man poses in a double-breasted coat with a fur hat. Donuts in an open box on top of a table. A bathroom with a sink and toilet next to tile wall. A large commercial jet in the air with the landing gear down. There is a woman sitting under her blankets A plastic hand reaching towards a plastic toy blender. a woman is sitting with a red guitar and bananas A man is sitting on a chair holding a sign up this is a man on a bike in the woods A pizza with basil, cheese and tomatoes displayed on a table. Young girl posing at table with cake lit with candles. A cat is sitting on a cushion on a sofa. There is a woman swinging at a tennis ball A man on a skateboard is going down a ramp. A jet airliner flying over a building with sky in background. Four zebras at the edge of a lake with a multitude of flamingos in front of them. This bathroom has a toilet and a duvet. there are two men standing and playing a video game A city block intersection with cars stopped on a corner. A large herd of sheep standing near each other A stuffed dog with a wizards hat on it's head. Herd of black cows grazing on a hillside. 
there is a skateboarder doing a trick in the air a short yellow school bus parked between two cars An man taking a picture of a sink through a mirror. a man is riding a skateboard in a bowl A giraffe with his head out of sight over a covering. a toilet attached to a wall in a bath room Hands putting motorcycle models onto a birthday cake. That is using physical motions to play the video game. a zebra standing alone in a pool of water A group of people sitting at tables with paper and laptops. A bunch of cows that are standing in the grass. Looking out from under a frayed sunshade at a beach and water view. A child brushing teeth in a blue sink. Three sheep eating grass near a water source. A boy is jumping off his skateboard a the top of a skateboard ramp. The giraffes are bending their necks down to eat from the bush. A woman with a scarf and sunglasses standing next to an human size stuffed dog that has an outfit on. A silver mirror hangs above a sink in a bathroom. A shelf containing books, stationery, and a clock. Tourists riding in a British double-decker bus that is making a stop. Man in yellow and black body suit on skateboard. Child wearing a red jacket skiing down a slope near the trees. A man in yellow shirt doing a trick on skateboard. A skateboarder doing tricks in a half pipe at a skate park. A lone woman stands posing in a large kitchen. A young girl in a chef's outfit cuts raw broccoli in a kitchen A man in a coat and tie and biker shorts carrying a backpack. Several graduates call friends and family on cell phones. A meal of french fries, salad, and meat is sitting on a table. Black and white photograph of a skateboard with its rider leaping above it An opened door to a bathroom with a counter and a tiled wall. A peeled banana sitting on a wooden fence. people standing around in the snow with some snowboards A person gets ready to release a kite. 
A tennis player in an orange shirt and black shorts holds black tennis racket on a tennis court surrounded by onlookers. Two people in ski gear standing at the top of a mountain. A tree filled with unripe apples in an apple orchard. A busy New York city street at night. there is a pair if scissors leaning on a rock and paper a couple of men are playing video games in a room An old man standing next to a forest of trees. A group of men on skateboards on a ramp. A crowd is watching horses go down the street. A cluttered kitchen with white cabinets and tiled floor. A man is who is kiteboarding on the ocean is airborne. A red traffic light sitting on the corner of a street. A photograph of papers and a computer at a desk. A red and white sign reading "Whoa" and a red a white sign reading "Caution children at play". A baseball player holding a ball and a glove. a black and white photo of a person in a suit and a person in a dress A yellow and green train traveling under signals. A reflection of a dog sticking its head out a car window Black container sitting on top of a white toilet and a bathroom. A woman standing in front of a door with a broken surf board next to her. Tennis player about to hit a ball in front of an ad. A man reaching his arm to catch a frisbee. there is a green bike parked by a red bus People lined up on a sidewalk near a bus. A bathroom featuring toilet paper hung from a chain. a street sign next to a tree lined street. A soldier wearing an Army uniform rides a regulation motorcycle. Their is a little kid using a phone A man walking a brown horse wearing a red blanket. Two giraffes inside a building near a beam. A cat sits on a wooden park bench. A orange tabby next to some black birds A horse drawn carriage going down a city street. People are flying kites on a beach near the boardwalk. A scooter with a helmet hanging off it's handlebars. a person on a skate board does a trick A man with a catcher's mitt reaches out to catch a baseball. 
A child in a living room is swinging a bat. Two small children are laying in a bed under blankets. a tennis player hitting a serve on a court A plate with meat, onions, gravy, broccoli and cheese. Refrigerator and freezer are filled with soft drinks and beer. A giraffe standing next to some tall building A little boy holds a small dog while he sits on a bench A child in a giraffe costume and a child in shorts cooking in a kitchen on chairs. A shelf filled with organic mango peach juice, bananas, oranges and eggs. A dog celebrating its birthday with a cake. a couple people on the beach flying a kite. a bowl of fruit in black and white. Some traffic signs in front of a church. A small brown monkey sitting down while holding a banana. A woman cutting a cake with a knife. A young boy holding a toothbrush and toothpaste getting ready to brush his teeth. A kitchen with a black automatic dishwasher next to a doorway. there is a farmer market with lots of fruits The white cat is sitting underneath an umbrella a young boy standing in a living room holding a wii controller a train on a track near many trees with a sky background A van is pulled up to a boat docking area while a cow stands alongside the signs. A person sits with their feet up with a boxed pizza. A boy sitting down with a shoe in his hand. A silver train traveling down train tacks near other trains. A demonic looking life like doll sitting on a bed next to pile of human skulls. Two teddy bears, one a police officer bear sitting in the lap of the other, a white bear, both of them sitting on a wooden chair. A wide view of the patrons of a large library. A picture of a trolley that is on some train tracks. Two male chefs cooking in a kitchen while another staff member uses a mobile phone. a man riding a motorcycle down a city street with luggage and a sleeping bag attached A cat that has curled up in a bowl. FOUR SHEEP IN AN ENCLOSURE WITH SNOW AROUND THEM Basil, cheese, tomatoes and bread on a plate. 
The elephant is an extremely large animal.It has a bug tusk. A couple of men that are standing near luggage. A man laying on a blue couch in a living room under mirror. A glass vase of yellow daffodils sits on a checkered table cloth. A kitten is eating cat food from its dish. A man is holding a tennis racquet and hitting the ball. An open refrigerator door with very little contents. Two small brown sheep in a fenced in pen A man rides a skate ramp on his skateboard. A sink in the bathroom next to an open toilet. A post with several street signs on it, including the name. A teddy bear and another stuffed animal next to bookshelves. The man who uses this bathroom shaved this morning A man wearing sunglasses talking on a cell phone. a man is standing and holding a controller A man looks somewhat blurry on bike as others look on. a person riding a race bike doing a trick Four zebras drinking water in a sandy field. A long desk area with a desktop computer at one end and a laptop computer and Wii video game system on the other end. An old Gothic style church with a clock in the tower. A rhododendron bush is in full bloom beside a park bench. A lot of colorful umbrellas lay out on the grass. A young man sitting on a couch using a laptop computer. a desk with many laptops a monitor and a mouse A woman standing in front of a cabin in the snow. a bed with two tables a purse and books stacked in front of There are two red and white street signs that show directions A white and green bus on road next to a car. The woman runs to hit the tennis ball coming towards her. Five delivery bicycles are parked aligned along the wall. A herd of horses in a grassy field near a hill top. An elephant statue sitting in front of a clock. A food combo has noodles, cabbage, eggs and meat. A train sits on tracks near power lines and a street sign. A group of people at a long table eating dinner together. A couple of people playing a video game with remote controllers. 
A boy is putting peanut butter on a sandwich a person in a living room with a emote control A blue and white plate with ham and vegetables on it. The kitchen counter has a cutting board with chopped vegetables on it. A bus driving through traffic in a city with skyscrapers. a bathroom with shower, toilet, and sink with shelves A bunch of men standing in a building and one of them is on a cell phone. A train in the middle of tracks with people. A group of snowboarders riding in the white snow A skier makes a jump on a very steep hill. Baseball memorabilia is displayed in glass stacked casings. A teddy bear is sitting on the rail of a wire fence. A cable car in front of a tall building. A little girl sitting at a table with lots of food. A dog sits and stares at the TV. A person that is playing in a tennis game. A young women in wet suits carrying surfboards. A man riding a snowboard down a snow covered slope. The skateboarder is about to perform a trick at the cones. Three giraffes in an outdoor setting with one giraffe drooling. A cat sitting on a couch , with a shirt covering it. Iguana eating fruit in fruit stand not intended for him. These young grey hours are playing Frisbee with their owner A man holding a container of two hotdogs. a messy kitchen counter and sink covered with dirty bowls and other cooking ingredients A compact kitchen with white appliances and shelving units for storage. A pile of luggage at a transportation hub. Herd of happy zebras in a field of grass A lot of flowers that are by a walk way. A man and a woman eating lunch at a restaurant. A glass vase with flowers resting on a grave. A quiet highway with a street sign up ahead. The room in the house needs to be picked up. Two people with their arms wrapped around each other sitting on a bench. a purple mug is next to a bowl There are some men playing a game of baseball. Two giraffes stand in their enclosure at the zoo. A large clock on a pole on a street. 
A red fishing boat floating on the water. A group of three men riding in the snow. A woman having fun with a baby elephant A woman showing her hot dog to the camera. There is a cat walking along the edge of a sink A large herd of cattle is in a field. Two men skiing across a snow covered slope. a woman leaning on a counter poses for a picture two zebra standing next to each other while one kisses the other in forest field. there is a bench under a very large tree A lady puts a frisbee in a frisbee goal. A small blue car that has been hit by a city bus A grouping of bananas and other fruits against a wall. a group of men play a game of frisbee in a park A group of people in a park flying kites. A semi truck parked at a rest stop. a close up of uncooked pizza on a surface A variety of kitchen utensils hanging from a peg board. a cat is sitting in front of a television a sink with soap a towel rack and a towel An open marina with boats on both sides A hanging traffic light at an intersection with another traffic light visible in the distance. a bedroom with a big bed, and a lamp. A toilet and sink side by side in a bathroom and a mirror. A train parked inside of a train station next to a loading platform. A linden tree overlooks a park bench on the banks of a lake. A blue counter top with lots of pairs of scissors on them. The cat is playing with the shoes on the floor. The bulldog has a mean look and is protecting his home. A couple of street signs hanging from the side of a pole. A very nice motorcycle in a drive way. a baseball player is swinging his bat at a ball Two skiers race while a crowd looks on. some people a stool a counter some lights and bottles An intersection with a crosswalk and street lights. A red frisbee stuck in a tree at a park. A black bear is surrounded by black birds on grass. A person with a lighter lighting candles on a cake. A man looks a donut hanging from a string. A women sits in bed with her white dog and she is looking at her cat. 
a cow stands in front of tall stacks of hay on a grassy field A bunch of shirtless dudes walk down a road a bathtub with bed behind it and big window. Someone looking out their window at vehicles on the street. A living area with a coffee table with food on it. A kid and an adult are flying a kite. a small brown and white bird sitting on a branch couple sitting with a dog wearing a cowboy hat Four young men sitting on a bench with four skateboards. A woman and two men on the beach with surfboards. Young lady with her legs in the air laying on a bed in a room. A person sitting in a chair watching a computer screen while playing a guitar. Kitchen with wooden cabinets and a center island. Several boats are docked along the side of a river. A brown and white dog laying on a floor. A Safeway truck that carries merchandise for the stores. Two zebras are standing in the shade of a building The people are posing for a photo out of an airplane. A colorful bird sitting on a branch full of leaves. A bathroom with a toilet next to a sink. A boat that is sitting in the water with a sail. A man riding a blue motorcycle on the road. A bedroom with a large, unmade be, a ceiling fan and other bedroom items a giraffe in its pen and two people are feeding it A woman is standing on a tennis court and holding a racket. The glass bowl holds a broccoli noodle dish. A red fire hydrant surrounded by yellow flowers and grass. A person with a ring smiling holding a object. A man and woman playing tennis on an asphalt court. A red fancy bus is parked by a standing man. Four umbrellas lying down a beach during the day. a white bathroom with a urinal and two framed pictures of clowns A lady is playing doubles tennis with a man. Group of zebras standing on a dirt field together. A salad that contains broccoli and oranges in a blue bowl. A small dog sitting on the back of a cow. An animal is covering up the keyboard with it's long tail. 
A man skiing with a dog close to him A colorful plate of vegetables, fruit and beans a grey cat sitting on top of a couple of plants A man making a vase on a pottery wheel. blue and white working truck sitting on the street A black and white image of a baseball game. A cat laying on a pink couch with a large brown hat on A man standing on skis next to a sign. A cat in a room with an assortment of luggage. A pretty young woman sitting at a desk working on a desktop computer. Person on the beach flying a black and red kite. A woman sitting at a restaurant getting ready to eat her food. The man and the dog walk near tall stacks of plastic chairs. Carrots and dressing on a plate with some yogurt. A giraffe in a pen looks down towards the ground. A plane sitting on a runway in the middle of the day. Three mountain goats on a rock with grass around it. A man wearing jeans sitting on a parked motorcycle. This is an image of three children with play phones. A person riding on the back of a white horse. a cat sitting between a window and security bar A bunch of different types of doughnuts together. A couple of air planes flying through a blue sky. A train's bathroom with a sink and a toilet. A large group of skiers waiting in a formation. a coffee maker is sitting on a marble counter top A young man tossing a Frisbee in a park. a fridge sits in a kitchen next to a door The bathroom has a sink, toilet, and a shower. a bench in a field looking at snowcapped mountains. Two roosters walking next to a fence, near a fire hydrant. Little children on a field playing soccer in a park. a woman on a train holds up her camera to take a picture of something outside the window Three boats in the green and blue water. Close up of a street sign in front of a water tower. Woman in a jersey standing next to a large elephant. A man hosing a dog off while talking on the phone. A red stop sign sitting next to a street sign. A yellow vespa parked in a lot with other cars. 
A meal from Japan or China on a tray. A man standing near the ocean with his surf board A chair and a clock attached to the side of a building. A person on a blue snowboard going sledding between trees Snow boarder riding during the night over a fence. A train moving through the station with a man on the bench. A little boy is standing on a refrigerator shelf. A woman is sitting at her jewelry display and talking on the phone. Fresh flowers and produce sitting on a counter top. A group of people sitting at different tables. A brown teddy bear holding three pizza boxes. The Big Ben clock tower towering over the city of London A woman on a court with a tennis racket. A group of sheep gathered together standing next to a donkey . A young person on skis flying high through the air. A large bed sitting inside of a bedroom next to a lamp. A keyboard, computer screen and mouse are on a table. A teddy bear in a chair dressed in clothes a cat that is standing in front of a person A living room with a Christmas tree couchs and a black dog. A couple of cops riding on the back of motorcycles. A woman in a skirt is side saddling on a horse. lots of snow on the ground and the ocean is ahead. Small boy smiling with his head tilted to the side. A heard of animals in a field approaching the water. A large ship is on the water near docked small boats. A decorated Chinese vase on a side board. A HERD OF GIRAFFES STANDING AND LYING UNDER THE TREES Many birds gather in the middle of buildings A dog is approaching a statue of a white bull. A baby wears sunglasses and plays with a pink suitcase. A group of people with drinks watching a game be played. A frisbee barely hits the surface at a lake A man and little girl are sitting on a bench in front of an airplane. A wine glass set on a counter of a kitchen area with a reflection of the kitchen in the wine glass. An airplane outside of buildings near people sitting in chairs. 
A dimly lit remote control and image on screen An adult bear and three babies cross a road A small bathroom stall has a maroon toilet rug. A man in a burgundy shirt playing Wii bowling. A dog is looking out the window of a car. Trainer shows man his elephant in tropical setting. A person riding a moto bike in the mud. A gas stove in a small simple kitchen. A person holding some some of electronic device. A cat that is sitting on a dogs back. Two cats cuddle on the chair in the living room The desert cake is frosted in two shades of pin, and topped with fancy frosting flowers. a laptop besides an alarm clock maroon in color A bedroom with two beds sitting under four framed pictures. A dog lowers its head to the ground A person holding a snow board in the mountains. A couple of cows standing next to a building. A group of zebras on a grassy plain. two dogs laying beside each other on a couch A man is getting a haircut while another man sits. Group of three players in a baseball game. A group of jet perform in the sky. A man flying through the air on top of a skateboard. This shack has a small table to the left, a stove in the back, and a counter top on the right. a man sits against a wall with punk accessories A man in a safety vest standing next to water hoses A display case in a bakery filled with lots of dessert. A tall building with a clock embedded at its top. this paper plat has the word cat and a cat drawn on it A woman has an apron and head scarf while touching carrots at a produce market. A mother and baby zebra standing in their enclosure. A person on the water flying a kite. A person with their feet on a desk with a plate of pizza and a can of soda. A fork holding a pink food item on an upside down plate. A bed is in the middle of a well lit room A woman in glasses is sitting on a butterfly bench. A guy and a boy on a motorcycle with a side car. A man is looking down at a small cake. A man on a pink and blue bicycle on a crosswalk in a city. 
A bird sits on a car's rear view mirror. A close up image of a giraffes face while eating. there is a small baby that is holding a small racket A pile of debris in front of a purple and red building. a few ladies are playing tennis at school Two cats are crouched in the refrigerator, among food. A vintage photo taken of a street sign on a dusty road. This piece of paper has three hot dogs on it. A group of people at a wine tasting with a variety of wines. two bears touching noses standing on rocks Three people sit under umbrellas at the beach. a number of sadnwiches and wine on a cloth near a body of water Table of sampled chocolate cake and ice cream on a table. A person walking over to a black and yellow kite in the park. A pan with a crust filled with raw broccoli, carrots and cheese. A table with a bunch of kids tools sitting on it and other items. Hotdogs cooking on a commercial grill with condiments nearby Bottles of infused oil and a glass vase full of glass flowers A large white cruise ship sitting in a harbor. a stop sign and a pole in a dark knight A left hand holding a partially eaten, pink, iced donut A red fire hydrant in front of a building. A woman in a bikini standing next to a man on the beach. The action during baseball as the pitcher throws Several baby and parent giraffes sitting around a cut down tree. A black and white photo of two female skiers in a mountainous landscape. A woman standing in front of a table of baked goods. A man is cutting an onion on the cutting board A man filming a women holding a microphone on a street corner. A man with a red hat, tie and white shirt A baby laying on a colorful quilt with a bib around his neck and a string in his mouth. A living room opens up into a kitchen. A large herd of sheep are grazing in a field. A young boy posing in a baseball uniform. 
There is a woman standing n a field around kites a batter holding a bat waiting for a ball to come Snowboarder performing trick on snow with trees in background A man winding up with a frisbee on a court. A women in military uniform who is giving a cow a shot. A lady is touching her lip, holding her purse, on the bench. A young boy eating breakfast in bed. The space shuttle ridding "piggy-back" on a NASA 747 airplane. Woman in a white shirt laying in bed looking at a laptop. a stop sign that has some signs on top A herd of sheep make their way down a rural path. A man plowing with oxen on a dirt road. Duck leaning forward towards a body of water from a concrete footing. A fire hydrant is covered with graffiti and spray paint as it stands in front of colorful building in the background. A large white building on the corner of a street . A view of a woman sitting on a chair with a guitar blocking her face. A stop sign has been amended with "driving" bumper sticker. a church with a tower with clocks on the top of it A dessert is sitting on a small dessert plate. Table set with black and white dishes with a scissors and dotted line motif. Picture of a plate of food and a drink. There is a lot of traffic outside because of the fire truck A picture of an airplane that is sitting at a terminal A street sign on top of a stop sign outdoors. A living room with a glass coffee table, couch and television. The clock has many different measurements on it. A kitchen with three tall bar stools next to an island. a bathroom with two toilets and a bunch of toilet paper a few people that are walking down the street with some umbrellas Three birds flying high in the overcast sky The back of a woman's head in church. A fancy bathroom with a stand up shower. A horse pokes his head over the metal railing. An empty bathroom with white tile and a large mirror A guy lying in bed with a bag of munchies and holding a game controller in his hand. Two girl holding tennis rackets on the court. 
A man in a suit a and tie with a umbrella. A white fire hydrant sitting on a street corner with a face painted on it. A man is sitting on a one wheeled bicycle next to a smoothie. A herd of cattle standing on top of a grass field. A group of people that are sitting in the grass. A group of people sitting around a table with food. A newly remodeled kitchen with stainless steel appliances. a close up of an elephant walking on a dirt ground A couple is walking by a store front windo An older man takes a pizza out of the oven. A bunch of pans that are hanging on the wall. a man is riding a board at the beach A man wearing a wetsuit in the water on a surfboard. Two elephants are locking trunks with each other. A reflection of a kitchen microwave and cabinets. A bathroom is adorned with a quilt pattern-inspired floor and walls. A person in a hooded jacket is near a transit bus. A person is holding an umbrella in a snowstorm. a person wearing gloves kneeled down in front of a toilet a wash room with toilet and wash basin are seen. A long white paddle boat with people riding on top of it. A somewhat dark image of a laptop sitting in the background of a bedroom. Two men at the beach one of which is holding a surfboard and a para sail. a ferry boat and a jet flying over head A computer and a laptop sitting next to one another. An aircraft is releasing a red substance below them. A box of pizza that is opened has tomatoes, cheese, and spinach on top. A luscious desert tray to satisfy all tastes. A train on the tracks is parked while people board. woman in long, light red dress with orange umbrella. A row of motorcycles posed on a floor next to a flag. A white bird with it's wing extended floating in the air. A group of people sitting around a living room together. A picture of a man and woman on the screen of a lap top computer. A worker in front of a kiln holding a vase. An older man drinking white wine from a glass. 
Clock tower and official buildings on the other side of the river. A white cup holding a tooth brush on top of an orange table. A lone polar bear walking across a frozen landscape. A coin meter next to a trash can on the sidewalk. many people riding horse drawn carriages with umbrellas A group of surfboards and people at a beach festival. A bunch of birds in the air flying with kites. a basket is behind a brown bicycle seat A very small plant is inside a cup. A busy street is blocked by a crane truck while a construction worker walks by. A family sits on the gravel of a beach flying a kite. Two people enjoying a picnic by a river. Three zebra standing next to each other on a lush green field. A dark room lit only by one lamp and a computer screen Guy riding his gold motorcycle giving a signal. a young girl standing above a teddy bear taped to a chair A Pacific National train is stopped at the station A black and white dog laying on top of a pink and black frisbee. a close up of a plate of food on a table Young boy knocking over his t-ball stand in the backyard A sparse room with a bed sitting in the corner. a yellow fire hydrant standing behind the tall grass Someone is trying to eat a slice of vegetable pizza with a knife and fork. A professional motorcycle rider leaning into a curve. A group of men on a field playing baseball. two shots of a man climbing stairs, then jumping down them with a skateboard a big plane flies through the blue sky Three people sit around a table eating a meal. A herd of four zebras in an open field. A desk with a computer, printer and other various items. A small blue and white gazebo sitting underneath a lush green tree. A pink double decker bus driving down a street. A dog is shown in a car rear view mirror. A skateboarder is using a ramp to jump into the air. A jockey with his horse and dog standing in a field. A metal bowl filled with oranges and tomatoes. Traditional narrow boats on a river with fruit and people. 
A bathroom with two sinks and a large mirror. there are two men playing Frisbee one is jumping in the air to get it A batter prepares to hit a ball in a professional baseball game. A large black train on a track with steam coming out. Three people holding wine glasses in a bar. A man is jumping and doing a skateboard trick. They have a variety of pizzas to choose from. A street with people in cars and bikes is shown. a bus driving down a street with people seated on the roof of the bus. a vintage photo of some people getting ready to cross a street a dog laying on a bed with a stuffed animal People are shopping at a farmers market on the street. Children standing in the grass on a field. A cat that is laying on the back of a chair and sleeping. A white sink and towels in a room. A city bus is slowly making its way down a very crowded street. A sport team is posing in a park. A man and woman are playing doubles in a tennis match. People laying in the sun on the beach on a sunny day Two hot dogs covered in toppings on a blue tray. A person on a skateboard does an air trick. A picture of a bunkbed that is very clean. A lady in a bath robe touching something near the ceiling. A couple of boats parked on top of a beach. A group of sheep eating grass on a very sunny day. A man holds a large hot dog and hamburger A couple of people standing in a room with remotes. A large group of people sitting on the ground. A bed that has been made in a small room. A woman taking a swing at a tennis ball A silver microwave oven sits near a wooden cabinet that has a silver handle. A seagull is standing on a ledge and one is flying across a river that is flowing. a woman stands in a bathroom blow drying her hair A group of boats on a body of water with clock tower in the background. A group of young women standing around in a half circle holding tennis racquet. A bird is chirping out of its nest. A steer and a baby brown cow staring into the camera. 
a bell tower with a clock face on it One woman leaps to hit a tennis ball while her teammate guards the net A bus driving down a street next to buildings. A television screen that has a video on it. A group of people standing and sitting on the sidewalk, watching a parade with horses. A really nice hotel room with a gorgeous view. A herd of zebra standing below a tall hillside. Two ladies are sitting on their laptops at the table and one of them is on their phone. A photograph of a train traveling down some tracks. Items are laying on a long table in a narrow kitchen. A desert that has some Oreo cookies crumbled on top of it. A boat with a man fishing on it on a lake. A oneworld passenger plane taking off from an airport. A group of people running and being sprayed by a fire hydrant. A couple of buses parked in front of a two story home. A woman is taking a picture of herself in a mirror. A young man with acne holds up his necktie. Two one way signs are on the same pole as a stop light. A woman sitting next to an older man holding a Nintendo Wii game controller. a vase and flowers are sitting on a table A vase full of some yellow flowers sets on top of a counter. A large Banana tree on an island near the beach. A train traveling through a jungle next to a bridge. A row of seats have closed off a stairwell. The view of a large kitchen with a breakfast bar and stools. A man flying through the air while riding a skateboard. A little blonde girl standing in front of a fridge. A cow and calf sitting on the ground. three people standing wearing umbrella hats near one another A large airplane flying through a gray cloudy sky. A picture of a building and some grass. A pizza cut into 8 pieces on a pizza pan. Two elephants walking in the dirt near water. Adult giraffe with offspring in structured zoo enclosure. A card showing the right position to ride a horse. A donut with white and brown swirled frosting. A man is doing a trick on a skateboard. 
a tray holding three plates of food including vegetables and fruit Two cameras on a pole near a stoplight. A train is on the tracks in a country area. A city street with a fire truck, school bus and taxis. a kitchen with a double sink a refrigerator and a counter top A boy cutting a piece of paper at a table. a man doing a trick with his skate board A brightly colored train and a santa clause. AN ADULT BEAR IS STANDING IN THE FIELD The view from the inside of a large clock tower with several people and bells inside. a modern looking bathroom with solid wood paneling A bus driving down the road with several other cars. A messy bed with many books on top of it Skier on top of a mountain admiring view as sun rises. A group of people in the woods holding up clocks. Two women and a pink umbrella riding a bicycle down the street. A boy is waiting by a train and train tracks A front view of a street stop sign. A group of skiers trekking a mountain in snow A man sleeping under a book bag on a floor. there are many bike riders racing in a street race An infant sitting on a table with a pink cake and pink decorations a green plate of food with a fork. a man is using a banana as a smiling mouth A dog looking around while standing in a window. A hotdog with mustard put on it by a mustard bottle hanging upside down. There are many different vegetables grouped together here. Umpire makes a signal during a baseball game. A kitchen with all white cupboards and appliances. A person in a yellow shirt is standing on a long holding a water ski. A parking meter on the side of the road is covered in snow. A couple of men wearing uniforms playing a game of baseball. A white horse and a black horse standing in a field eating grass Young boy gets ready to kick a ball. An old suitcase with several worn stickers on it. a man with a beard is holding some food and some people walking A brick oven with pizza baking inside next to fire. 
Teenagers siting on crates are gathered around a small campfire. A piece of newspaper holding bananas with drawings on them. a large bacon, spinach and cheese pizza with a large crust Male and female rams climb search for food on the side of a snowy hill. a tall clock tower with a sky background A clock that is hanging on a wall above a window. Skiers skiing down a snow covered ski slope. a white plate with some broccoli and some noodles a man playing with his kids with a kite A bright orange and yellow engine pulls this train. A large blue and white airplane on the ground. A cupcake with frosting and a star on top Two young man playing soccer together on the field. a large herd of horses standing in a field eating the grass a man on a skate board grinds on a ramp A smiling blond haired little girl is hugging a teddy bear. A diamond shaped sign is sitting in the middle of the street as cars are riding on the side. Three giraffes eating in a heavily shrubbed area. A skier in a red jacket walking along a snowy forest. The surfboard is painted in grey and pink splatters. A skier in the air over a jump. A couch on a trailer hooked to a bicycle. A small bed next do a daybed and coffee table. A meal sits on a table next to the ocean. Three young women hanging out on a bed. People standing near luggage placed on the floor. An Equestrian jumping their horse over a white jump. A tusked elephant is walking among the greenery. A giraffes head peaking over bushes and trees. A team of ultimate frisbee players jump for the frisbee. A man with beard and tie on a subway car. Male skateboarder displaying leaping ability over steps with handrails. female surfer walking carrying surfboard on her side One slice of pizza let with toppings on a pan. A man and woman posing for a picture. Many people are gathered to shop and eat. Young girl having a meal in outdoor setting. A man with blue jersey holding a baseball bat. A mouse that is sitting next to a keyboard. 
A man an woman are sitting under an umbrella on a park bench. A mauve colored toilet bowl on the sidewalk A girl sits on top of a bouncy house texting on a phone. A man taking a turkey out of the oven a locker with some books and school supplies in it A room with a couch, bookcase, and flat-screen television a toothbrush is laying on a white sink A small child sits on the floor and watches tv. Two young children are playing a video game. Two dogs looking at some fenced in white cows. A cat sits looking out of a window. An airplane with people under the wings at a field. A baseball player signing a baseball bat for a fan. Five old fashioned looking airplanes in formation in the sky. A pizza is shown on a plate with a serving knife. An elephant in a fenced off area under a shaded tent. A brown purse is sitting on a green bench. a desk with a laptop and a monitor sitting next to it A salad that has a white dressing on it. A bayside cafe with piers and boats in the water Two kids that are standing in a living room. A man walks around with two sheep on leashes. a man sitting alone on a black bench a couple of small figures of a man and a horse Ah, look at these sumptuous desserts under glass. A woman that is sitting in front of a cake. A living room with expensive furniture and a large window. A woman in a bikini showing a type of food A man holding a baseball bat in front of a catcher and umpire. a small cat and small dog looking in the opposite direction. A slice of pizza sits on top of a plate. The back of a semi truck on the freeway. A group of people that are standing in the snow. Some people are sitting and playing Wii in a family room. A small dog sitting on the ground at some ones feet. A red city bus parked on the street A woman standing near a large green pillar with a clock on it Two red street lights that are on a wire. Two plates filled with lots of hot dogs on buns. A guy's hat falls off as he plays tennis A women on ski's going through the air . 
A group of people gathered around a laptop computer. An airplane that has just taken off into the sky. a bathtub with a small shelf above it Girl standing with a Wii controller in her hand A space shuttle is parked in a museum while visitors look around. A mother carries a dish to the sink, and a young man carries a beer bottle toward a counter, as a young girl looks on. One kite flying and two stuck in a tree. A series of photographs about dinner at a skyscraper restaurant A woman holding a baby and sitting next to a dog. A tennis player's feet and shadow on a court made of clay. A red velvet cake next to an alcoholic drink. A couple of giraffe standing next to each other. a building with art work and a sidewlak with afire hydrant on it a cake that is less then half on a plate This living room is large and has a glass sliding door Two zebras and a giraffe in a dirt and rock covered area in front of a muddy pond. Men on a horseback at a polo competition. The vegetables are sitting in the white bowl. Bear behind fence of enclosure as official inspects him. A horse grazes on grass in the shadow of a mountain. Guy patiently waits on his surfboard for the best wave Adults shopping in produce section of grocery market. A young man skating boarding on a half-pipe. A small metal bowl holding an orange flower on purple sheet. A dingy with some pigeons on it in the water A sign on the side of a building on a street. A happy little girl lies in bed with a stuffed bunny. This man is holding a breadstick and a bun. A Juicerator sits on a counter and dispenses a yellow juice. A crush soda on a white back ground with orange halves. a close up of a motorcycle license plate A close-up of a table with three boxes of pizza. A cow and a bull walking down a skinny alley. Two orange and silver trains passing on a street. Five chocolate donuts and three unfrosted ones and a Canadian penny sits on a blue pokadot cloth. A pole with multiple traffic signs near trees and bushes. 
Group of men in white shirts and white hats holding tennis balls and tennis rackets. A bride and groom on their cell phones. A pizza sitting on top of a wooden table. A couple of girls sitting in a bed in a bedroom. a brown and white owl and some green bushes An upscale bathroom sunken tub with chandelier above. A man smiles as he holds a baseball bat in an historic photo. Two zebra walking past a grassy forest in the daytime. Two chairs and a glass table sitting in the middle of a well put together room. A room that is divided by pillars has two overstuffed chairs, coffee table, piano, a table with flowers in a vase. A toilet is in a small room with windows. Three elephants are on a dirt road. A white bowl filled with soup sitting on top of a counter. A woman holding a skateboard on the sidewalk. A mouse and a computer sit on top of a wooden desk. Two people are walking in the shore of the ocean. A clock is shown on the side of a sidewalk. A man stands in a tree with an umbrella, observing birds, Man ironically holding up holes of scissors to eyes a short woman helping a tall man fix his collar Five birthday cakes all in different and unique shapes for kids. Small boy holding a bat above his head on a cobblestone street. Pedestrians walking underneath a traffic light by a city road. A brown and white cow standing next to a tub. There is a plate of broccoli and vegetables A woman rides a horse quickly around barrels. a person in a living room watching a television A black and white small dog sitting on a foot stool. A man in a kitchen prepping a tray of food. A vase filled with flowers next to bottles of wine. The stripes on the zebra almost disappear on its legs. A wooden table holding a white laptop and glass of wine. A teddy bear on a table and some red jello desserts Two zookeepers feeding two giraffes in a zoo. a living room with two chairs and a tv A large boat floating on top of a large body of water. A make shift office space in a bedroom. 
A large cock sitting in the middle of a street. Three boys look on at a little league baseball game. Four people are on a bench next to a store. a large herd of sheep walking down a dirt road. A fire hydrant next to a sign of a fire hydrant. Kitchen corner with refrigeratorfreezer and microwave next to an open closet door A person in a police uniform sitting on a motorcycle. A white terrier dog on a leash with a brown spot on his eye. A skateboard is skating down the sidewalk on his skate board. A computer desk with a turned on computer in front of a book rack. A man taking a bite of a large piece of chocolate cake. A group of people watching a black cow eat from a blue pot. A small silver and red airplane sitting on the ground. A pigeon on a brick street under a park bench. A few people are getting ready to ski. A large tiled bathroom with glass sliding doors A crowd is watching a man on snow skis. Some food is about to be served for a meal. several boats docked at a marina with clear water a lady and her dog on a paddle boat he dog as a life jacket on and hey are happy THERE IS A YELLOW FIRE HYDRANT THAT IS ON THE GROUND WITH A BLUE CAP A man with a surfboard about to go surfing. A cowboy sitting on a horse at a festival. a bunch of hot dogs that are in a bowl Two guys that are sitting on horses in the dirt. Two birds are perched upon a snowy bank. A bathroom with white fixtures and tiled floor there is a piece of chocolate cake on a paper plate a little dog trying to pick up a Frisbee A skateboarder is doing a trick in the air. A horse figure is on a snowy track A person who is holding a hotdog in a napkin. A horse and a dog positions for a picture outside. a mom and her son eating at a restaurant A white outhouse toilet sitting inside of a stall. A man and a child on a ski containing a seat. A black bear crossing a road as a bus draws near. A dude in shorts playing baseball with a bat. A container that has a bratwurst in it. 
A do not enter sign sitting on the side of a road. A group of jets that are flying in the air. A Continental airplane is waiting for takeoff at the airport. A man prepares to fly a kite in a grassy area. A woman in black jacket holding skis next to trees. Three middle age men looking at a piece of machinery. I bet he will finish this entire meal in no time at all. A man power sliding on a long board Players and a referee playing on a football field. A motorcycle rally is attended by numerous riders. a collection of lemons, limes and oranges in front of books and a mug a computer desk area with electronic devices on it a white purple and red double decker bus and some buildings A group of people at a park flying kites. A close up image of a bike gear and chain. Two indians with pony tails are with some horses. Chocolate and caramel sauces are on a tray with sliced bananas and strawberries. A train driving past mural of working men while billowing smoke. A plate with a large pancake cut in half. A white pickup trucking is lacking doors, bumpers, grill and one headlight. A photo taken outside a restaurant with tables and chairs. a male in a blue shirt is playing ping pong A little red-haired boy standing in front of the refrigerator. A computer is shown with a keyboard and a mouse. A bathroom sink at a hotel with the usual amenities on the counter A bunch of people stand in front of a car and next to the nose of an airplane on the tarmac. A green and white bus on street next to dirt area. A person riding a wave on top of a surfboard. A person flying a kite in the snow An elephant standing on top of a wooden stool. Many people and a dog under an umbrella on a beach. The vase is holding the budding flowers on the table. A kitchen scene with wood floors and wood style cabinets. Blue umbrella on picnic table in front of food truck The skier is carefully descending a snowy slope. A little girl riding on top of a skateboard in the street. 
A very sexy woman laying on top of a bd wearing fish next stockings. The animals look to be walking a one direction. This is a train on the tracks that is filled with doors for houses. A cat is lying on a cushion on a couch. a man is on the court playing tennis outside A tennis player striking the tennis ball for his next shot. A man walks down the road with some cows. Crates used as tables, full of fresh produce at an outdoor market. a person in a sweater holding a cake over a paper plate a large group of children holding their kites A bus stopped at an intersection in front of a church. a red fire hydrant is between a couple of poles A woman in a pink hat looks at her phone in a crowd. Man posing for a shot wearing a suit and tie and carrying a briefcase. there is a dresser that has a mirror and many things on it a double decked bus drives down a city street Three birds on some rocks near the ocean. A photo of several bunches of yellow bananas. A grizzly bear sitting outside in the grass. a man dressed in riot gear wearing a face mask and holding a red and white umbrella A refrigerator in a corner of a room. A group of birds flying over the water looking for food An alsation dog paddling through some water in front of a building A child playing with a baseball bat and a ball. Some leafy trees are hiding a black bear. A giraffe looking over a fence on a summers day. An attractive young woman speaking on a cell phone. A silver and black computer mouse stands next to an open laptop. The man is standing by a large herd of cows. A person on skis coming up a snowy path. Two adults and a child walk on the beach in front of a cruise ship. A refrigerator covered in pictures and stickers in a kitchen. Four young men in a sitting area stand looking towards the opposite side of the room. Plates of food and two glasses of red wine are on a table. 
a man and a small boy standing on a tennis court holding tennis racketts A bird displaying its decorative plumage among some leaves. A man and woman sitting on a park bench A GIRAFFE STANDING NEAR TO TWO DEER IN A SEMI-ARID GRASSLAND. A row of chairs and some umbrella's on a beach near the water. A man carrying his surf board out of the ocean. people sitting, walking around and some are in groups Two people cooking bowls of ramen in a kitchen. A boy in a plaid shirt holding an umbrella. A frisbee will be thrown to a girl's dad in time Kites being flown by a crowd of young children on a cloudy day. A row of vespas parked next to a bunch of motorcycles A group of young ladies sitting around a table sharing a meal. Two elephants in a concrete enclosure at a zoo. people standing at counters of booths being served Looking down the length of a city street while cars pass by. A baseball player hitting a ball on the field. A person laying on a couch with a cat laying in their arms, covering part of the face. A counter filled with vases, candles, and fruit. The entrance for a subway on a city street. a large crowd of people at an airport terminal A small kitten is walking on a computer keyboard. A bowl containing meat, lo-mein noodles and broccoli. Yellow umbrella stands sitting on a beach with one chair. Two purple flowers sitting in a green vase. The bedroom is decorated for a female and includes a breakfast tray. A young man kicking around a blue soccer ball. A train is traveling down the tracks in the open field. A woman holds her hand out to feed a giraffe. A person who is all bundled up standing in the snow on skis. A fire silver and red fire hydrant is in the grass near a curb. Woman in shopping aisle with bear on her head A woman holding a video game controller is playing games. The tall tower in the middle is framed by two large buildings. A man in a yellow jacket washing an elephant. A man riding up the side of a pink ramp on a snowboard. 
A tennis player hitting the tennis ball with the racket. A small grey and white kitten stands next to a foot. An airplane is flying high in the sky after taking off. The car has two different shades on green on it Two cats are siting right next to each other. A man sits on a couch in a sitting room with coffee table with an open laptop on it. A TALL VASE OF FLOWERS IS SITTING IN A WINDOW SEAL a person holding a cell phone to a gerbil a huge brown bear standing at the edge of a small hill Open toilet, basin, and shower stall in compact bathroom. A bunch of flowers in a clear vase of some sort. A man looking at a computer game on the counter A bot stands in front of a bus, while other men look on. a group of people in a field playing frisbee An animal leaning against a bare tree relaxing. A side table with a lamp and books net to a home library. A surfer in a wet suit carrying a surfboard as he walks into the water. Man doing a skateboard trick while others casually watch. A boy is cutting slips of paper with scissors. a photo of someones living room complete with, bookshelf full of dvds, two leather chairs, a flat screen tv, fireplace, and a overly large decorative clock. A woman sitting a table holding two hotdogs. a snowboarder with a blue jacket walking up a hill Woman and child watching people row in water. A dour young man sits on a horse. A room with several types of luggage against a wall next to a mirror. This bathroom is decorated with wood and has several mirrors Asian men at a a white board talking with a Samsung sign behind them. A boy is in a courtyard on a skateboard in the air. A large yellow dump truck parked and empty. a close up of a giraffe and people holding a bucket Animals eating grass in a hill by the ocean. A banana tree filled with lots of unripe bananas. The airplane is flying high above many clouds. Three tall urinals and one short one in a restroom. This picture shows the details of a red colored skateboard. 
a close up of a vase near other vases and a plate A bus underneath a large crane at a factory A small airplane is parked on the runway. People walking in the middle of a snowy street on a campus. Two adults and one baby elephant walking in the wilderness A dog sitting on a chair next to a soccer ball. Two truck cabs facing each other on a road. A man standing on a surf board with a paddle. A man flying through the air while riding a snowboard. A woman is standing over a stove holding a cup. A flock of birds flying over a light house near the ocean. A water fountain that has a pigeon perches on it. people in a boat moving in the deepest place A person walks a dog with little shoes on. A group of sheep grazing in the field A boat on the ocean with a grouping of birds flying around. A man riding the waves on his surf board. Plastic containers and a bowl filled with lots of food. Young man in white playing tennis at a tennis club. Miniature pizzas and skewered heart shaped pretzel bites. two boats sitting on the shore close to the water a cat that is laying down on a couch A red double decker bus driving down a busy street. A crowd watches two people at a tennis match. Church cathedral with decorative arches, marble floors and high vaulted ceilings. A man with his shirt off, is flying a kite. A close up shot of a red apple beside an orange. A man on a dirt bike riding on a dirt road. A man riding a board on top of a skate park. A dog is sitting in the back of a pickup truck. Man painted in gold paint standing next to a horse. A street scene looking at a clock on a pole. A train traveling down tracks near a station. Wooden benches in the middle of a forest. A building is shown with tables in front of it. A yellow commuter train traveling through a train station. A long red train o the side of a field. Two pieces of cake are arranged on a table Two kids with joysticks and remotes seated on a couch playing a game A man in a purple shirt trying to catch a frisbee. 
some water boats bushes trees and buildings and a train A young girl walking on a road carrying an umbrella. The baseball players are about ready to take the field. A person showing a selfie of themself to the camera A couple of judges judging some sheep at a county fair. a cow in a field on a very foggy day a living room with two laptops and a tv A clock between two archways on a castle A group of men sitting around a laptop at a table. Several people on motorcycles sitting parked on the road. Four cats inside a caged in area, two yellow, two not. A group of people sitting at a table eating. It's hard to tell if these are tennis players from the thirties, forties, or fifties. baby in pajama's sitting on the bed playing with an object Horses are eating grass on a large pasture. A bowl of vegetables is set next to a blender. A long freight train crossing on a bridge over the ocean. An airplane flying with dark and light clouds in the background. A hot dog with cheese, mayo and a vegetable on it. A woman stands in front of a neatly made hotel bed. The woman sits at the table overlooking the pink and white cake with lit candle. Assortment of shells and soaps displayed on commode with dental care products. A person flies their kites above people by water. two sheep next to a wooden structure behind a fence A train bed with a blue sheet and various items on it. A pitcher winds up for a throw at a neighborhood baseball field. A family and a dog playing frisbee very near to the edge of one of the cliff of the Grand Canyon. cows resting in the shade and relaxing for a moment. Skateboarders are doing tricks as a crowd watches. two brown bears on some rocks in their pen A woman walking down a street in a dress with a bag. A couple of zebra standing on top of a grass covered field. Three men stand on a beach watching a kite fly. many different sinks near one another with mirrors A woman covering her face sitting next to a man on a log. 
A snowboarder decked on in great poses for a picture. a train moving on a snowy area and besides an ocean A lady with a dog in the snow waiting to cross the street. A police officer standing next to his motorcycle after pulling someone over. A smiley face sitting on top of a wood table made out of fruit. The bathroom contains a bathtub and shower, toilet and sink. there is a male skate boarder doing a trick inside of a parking lot A girl taking a picture of herself in the mirror. The train is on a railroad track, under a signal light. A train maintenance vehicle sits on train tracks. There is a view of a bench and houses down the hill A man a woman pose on a tennis court. A couple of pictures of a cat sleeping on a hair brush. A white toilet sitting next to a white bath tub. A man and a woman surrounded by people. There are red benches near the grassy area. A police officer rides his motorcycle next to the protesters. The lights and sights of a busy, populated city in Asia. A group of men and women rowing a boat in the middle of the sea. A young girl in pink snow gear on a snowboard. A woman being pulled on her water skis. An infant girl sitting in a shopping cart. A women who is holding an odd shaped carrot. A blue and yellow plate of food that includes rice and beans. A baby zebra hiding among the tall grass. A person with a camera taking a picture through a mirror. A clock mounted on the face of a building next to an eagle statue. A lady and a baby at a pizza parlor during the day. A man riding a skateboard in a covered skate park. A large brown and grey cat sits on top of a desk. A train rounding a corner on the tracks. Several crafting items laid out on a white linen. The red city bus is driving next to a construction truck. A man holds the string of a kite, as many kites fly in the sky. The girl is wearing a jacket with fur and has a yellow frisbee. a woman standing on a tennis court and holding a racket Blurry shot of man at the intersection of busy street. 
people holding a surfboard and walking down the beach a cat with a big fluffy tail sitting on top of a car tire Two birds are sitting in their respected area. A cat standing close to and looking at two geese. A small house stands in a small constraining carriage. A fire hydrant spraying acroos an empty street A person attempts to remove something from a large oven. a close up of a stop sign with a sky background Two elephants walking in a dirt field next to trees. An old train makes its way down the track in the country. Bunches of bananas hanging from wooden rafters by string. Giraffes walking in their enclosure at the zoo Three birds are sitting on the branches of a tree. A dog that is playing in the snow. A white SUV parked in front of a train. A green and white bus traveling down the street A street sign where there is currently construction. a child is blowing out the candles on the cake. A styrofoam plate with cats with noodles on it. A hawk perches on a tree branch in a forest. A gray cat is sitting in an empty red suitcase. A man and woman walking past a fire hydrant. A stuffed animal laying across the steering wheel. a red plate a table drinks and a sandwich A woman is swinging a tennis racket at a ball. an elephant is eating grass and a bike is nearby A bunch of surfboards that are on the ground. A cat is sitting on the seat of a blue motor-bike. Several men are standing or walking on a soccer field. A cake that is well decorated with green stuff on it. two surfers one in a white shirt and water London Olympic games statistics statue with many tourists and visitors nearby. a number of motorcycles parked near one another A produce market displaying racks of fresh fruits and vegetables. two kayakers enjoy the clear open water a dog is siting behind a large window A parking meter in a parking garage that has a lot of cars. Horse held by two leads in passageway of large stable. A plate topped with pancakes next to a cup of coffee. 
an empty truck parked next to a building A man wearing an american eagle tie in a suit. A platter of donuts sits on a wooden surface. Four men with smiles on their face, in a kitchen. this is a close up picture of a giraffes head a red orange double decker bus smoking on the road a large building with words scrolled across it The man holds a pig foot next to his mouth. Paved highway with several cars moving past an exit A small residential bathroom featuring oddly shaped furnishings. A glass vase holding a flower on a wood table. There are several animals in the grassy field. People and sheep traveling down a long country road. A man with a surfboard stands in the water. A red bus is next to a curb and trash bag. An old, small residential bathroom with blue curtains A group of friends waering skirts and dressing are walking down the street. A young girl appears to be enjoying a biscuit of some sort. An Indian man and woman in the water on the edge of a river. A dog is chasing along behind a cow in a field. Several double decker buses driving down the road. A picture of a bathroom with a large shower. Two people on a long rowboat in a river or lake. a shed with giraffes near it behind a fence A bench sitting by some very pretty assorted plants. A bath tub sitting next to a white toilet. a man in a suit glares while standing outside A metallic refrigerator freezer sitting next to a stove. A large long train on a steel track. A giraffe in the brush standing facing away from the camera. A man standing behind another man helping him with his tie. People sitting on a curb watch a parade and horses walking in the street. Two gentlemen doing a show with umbrellas and colorful suits. Two guys walking and talking in the room. A woman and a man pass food between their mouths. A table with a coffee and a salad on it. Three large kites flying in the sky near the water. A cat pawing at a television picture of some penguins. 
Two women are sitting on a bench outdoors having a conversation. There are two woman in bathing suits and a cat A herd of elephants in their natural habitat. A street is lined with people and buildings. Man checks wheel on mule drawn cart driven by girl. Boys standing in front of microphones outside in front of cameras. Two people walking in the ocean away from a boat. A pizza with olives is on a plate. A street sign that is in front of a cemetery. Two laptops next to each other are open on the desk. A sign in front of the airplane warns that tobacco is not allowed in the area. A person holding a blue umbrella in the rain. A young girl adjusts her pink sunglasses in a park. A grouping of luggage with tags and luggage trolleys. A baseball game in progress with the player running the bases. a dog sitting under a desk with a monitor Many people ride on surfboards as one man catches a wave. A big bird stands between a trail and some trees. A bamboo bench with a backpack sitting on top of it. Cellular phone displayed on display case with other phones. The horses have made this patch of ground quite bare. A HERD OF SHEEP GRAZING ALONG SIDE A HILL. odd, four street signs on a hill away from the traffic A pizza sitting on an outdoor table in the sun two legs a toilet a stall door and white tile A man holding a child on top of a skateboard. A motorcycle parked in front of a wall. A red stop sign mounted to a black pole. A teddy bear sitting on top of a red plastic basket. a large crowd of people at the park with some playing with a large kite A woman tennis player in a black army shirt and tennis skirt, swinging a tennis racket. People have set up tents near picnic tables on a beach. many people and line of parked scooters and motorcycles at night A horse stands near a colonial era stone furnace Two young women are eating hot dogs while walking down the sidewalk. 
A large neon sign at a market square A little girl is playing with a hair dryer Toothbrushes sit in holders arranged around a sink. A young man in a suit, tie and glasses is smiling Cows are trying to kiss the girls on the arm. a bathroom with a sink, toilet , and tiled floor A man wearing a tie, jacket and white shirt. A girl alone on a beach flying a kite. A large stack of trunks and luggage on a sidewalk with people behind it. A painting of a vase with a polka dot gray background. The plate of food has meat and cooked vegetables. Tennis player with the teeth of a predator. The elderly woman uses a video game remote near her companion. The sleeping child is holding onto a teddy bear. a little car sitting by a wall with a picture on it A young blond beautiful woman standing on a tennis court. A wall with lots of weird things mounted to it's side. A MAN IS HITTING A TENNIS WITH A RACKET A man that is on a snowboard that is in the snow. The cow sticks it's large tongue out of his mouth. There are street signs and a traffic light at a downtown street. there is a woman sitting on the ground making food A woman in a bikini laying under a red umbrella. A giraffe standing next to a tree that it is chewing on. Giraffes mill about in their pen at the zoo. An elephant pokes his nose in the brush. A large red bus on a city street. There are two zebras walking side by side. vintage black and white photo of old motorcycle A large white bed sitting under two framed pictures. A small orange train traveling down tracks near a station. A sandwich is on a long bun in paper wrapping. A HERD OF SHEEP GATHERED AROUND AN OLD BARN There is a birthday cake with chocolate icing on the table. The three teens are talking to each other on the sidewalk Black and white image of a woman and a man petting a horse. Three animals are standing near a body of water. A couple of plates of food on a table. All ages can have a good time using the Nintendo Wii. 
A man in a suit poses for a picture with each of his arms around a boy in a suit. A little girl on the beach playing with a frisbee. A keyboard and mouse are sitting on a desk in front of a laptop and monitor. All of the planes are flying in the same direction. A white plate topped with three donuts covered in frosting. a number of baseball players on a field A personal single engine jet, on the runway A building seen through a rain and fog covered window. A man riding a wind sail over a large body of water. Lots of colorful flower vases hang on a wall. a woman on a phone is waiting for a bus A cake with tow layer smothered in white frosting. Carefully sculpted pieces of wood in a display case. A black man opens his fridge and looks inside. An airplane during takeoff ascends into the clouds. A red light rail trains passes through a station A little girl is sitting with an umbrella a man in a suit grabs his head while screaming Four persons skiing on snow clad mountains and slopes. a small kid holds on to some balloons a guy that is skateboarding on some kind of concrete A person in skis going tightly around a flag. The desk has a desktop computer and a laptop on it. a man stands in a kitchen by a table A person skiing in an open area of snow. A number of pizzaz sitting on a wood table The group of skateboarders is headed towards the park. There are many seagulls standing on the ledge over water A picture of a man swinging a tennis racket. a parked air plane sit at a airfield An old woman getting vegetables from a heavy loaded cart. A girl looks in the mirror as she brushes her teeth A wide variety of produce is for sale including apples, pears, and onions. A commercial district street with a sign pointing where to stop for a crosswalk. The teddy bear is posed as if he was working out. A white cake with red designs and two cups next to it. A horse on its back with a man watching. a kid on a snow board stands in the snow There are two males on a vintage red train. 
A pizza pie with vegetable toppings and cheese A bathroom with the light on and a painting hanging over the toilet. Two suitcases that are sitting on a chair. A laptop, Furby toy and books on top of a desk. This is an image of a woman getting her hair styled. A refrigerator has a note pinned to it with a magnet. A small weiner dog that is cooling off in the pool. A young man wearing black does a trick on his skateboard where he is almost parallel to the street. A man standing in front of a flag holding a plaque. there are many young men on the field playing soccer A market area displaying various fruits that include plumbs and pears. Some flowers in a vase on a table A lady petting her dog and a man standing on a log a polar bear on a field near many trees A man getting ready to hit a tennis ball with a racket on a tennis court. A man is sitting in front of a desk with a coffee mug. A man helps his friend fix his tie before a photo shoot a baseball player swings his bat at a ball a dog that is on the lap of a women The two cats are looking out the high window. A small group of people on the sidewalk with a few holding umbrellas. A man standing next to a woman in font of a tray of food. A young boy is holding a baseball mitt in a grassy field. a man is riding a board in the water Two doughnuts sit on a plate with drinks surrounding. The is train cross a bridge over water. A child sitting at a table with a plate in front of him. A woman holds a birthday cake as a man lights the candles while another man looks on. A man in black shirt and apron in a kitchen. A man standing in the grass flying a kite. A person riding a skateboard up the side of a wall. A man in a brown shirt is playing a video game. A white and blue jet airliner docked at an airport. A man tossing a frisbee on a lush green field. The bed has mosquito netting hanging around it. A man running after frisbees in a wooded area. A black boat with a dog on it going down the river. 
A person flipping a skateboard with his feet in the air A multi colored dog jumping up to catch a frisbee. A very big grassy field with a bunch of bats together. clear vase filled with white and yellow flowers with water A black sign with directions stands in front of the blue sky. A dog in a mirror with a person in a room. Yellow passenger buses ride side by side down a crowded street. A black and white photo of old cars and a boat, all sitting in front of a lake. A desktop computer with a note attached to the screen. Two men have thrown their ties over their shoulders during a meal. The woman wearing a coat stands near sheep behind a fence. some pasta in a bowl sits on the table A man holds up an x-ray and looks at the camera. A man wearing a backpack pauses to talk on his cell phone. Two tennis players consult with the referee during a tennis match. A room with two pictures on the wall and a table with a computer monitor on it. A wooden floor and a table with a yellow bowl and a grey and white rug. a person cutting a small cake on a table A traffic speed limit sign sitting in the middle of a road. a bunch of cows are in a field Several zebras eating together in a fenced in area. A bunch of lumberjacks moving logs in the woods. Many pots and pans have been hung over a kitchen bar. A cell phone being held by someone is showing two women on the screen. A sign indicating the historical site that is the Nathan Hale Homestead. A completely shattered television lying on the sidewalk. A very delicious sandwich with black eyed peas on the side. Two men are sitting side by side as they are eating and smiling, they both are cutting their food with a knife. Public transportation train with blue front approaching the station Three people preparing to launch a small boat in a river. FedEx trucks parked on the side of the street while cars wait in traffic. 
a man skat boarding down a concrete pathway an elephant is standing behind a wooden fence area We are looking at a crowded city street. a man standing at a table with wine in a supermarket A sandwich with a drink and a bag of chips. the woman is holding a cat with a hat on A row of fire hydrants sitting on the edge of a road. A clock stands alongside a busy street at night. The two skiers are eager for the finish line to come. The bench is chained to the outside door handle. there is a woman sitting at a table eating Two elephants in an animal sanctuary with trees The back of a car that is pulling up to a stoplight. A produce stand with a variety of fruits and nuts on display. This public restroom has no toilet but instead a simple porcelain hole in the floor. . A white and black street sing covered in snow next to trees. A woman holding a pizza box and a paper bag. Many skiers are traveling along through the snow. A red, white and blue airplane is high in the clear blue sky. A group of people in a restaurant eating a meal. A group of softball bats leaned against each other on a field. A table with crafting supplies next to a cell phone. A bird sits on a wire over a street sign. A bird sits in a tree branch with leaves. An old Boston baseball player sits while holding his bat. A man that is inside of an elevator shaft Inside of a bathroom with a sink and mirror. A red kitchen with metallic appliances and paintings on the wall. A man holds a bat at the base. A man by a book case has a guitar. Many bananas and apples are on the kitchen counter. A cat is sitting alone in the middle of a large patio area with a historical building in the background. A man wearing a yellow and white striped vest and hat A person holds a bag while walking on train tracks. A large sandwich with meat, cheese, and vegetables. A man flying through the air on a skateboard. Fancy standing clock sitting in a nice setting. A bed with pillows where the blanket is slightly pulled back. 
A woman in a bikini with a surfboard in her hand Two cats are staring at a light spot on a floor. A snow boarder boarding down a snow covered mountain. A young man standing over a pan filled with food. A women who is eating some food and looking out a window. A man in skis holding a stuffed animal near a group of other skiers. People with their faces blurred out play Wii on a mounted TV. A collage of photos of cats and goats. A soldier riding on the back of a black horse. The man is a wet suit is catching a wave. Street sign advising to turn left for Shanks Avenue A woman holding a tennis racket swinging at a ball. Three businessmen who are crossing a city street together a brown cow standing next some other cows There is a surfboard sitting next to a car. There is a dog that is walking on the beach at sun set And elephant behind a low log fence and someone leaning on the fence, taking a picture in another direction. A little girl using a laptop on a table An employee slices a large piece of pizza, pretzels hang bear by The man on the skateboard and the dog are getting their picture taken. a large crowd of people is outside a building A tennis player stands before a net and waves while a camera man films him. A man playing a game with a remote controller. A suitcase, sitting on the floor, opened is full of clothes and a curtain is behind it. Two cats standing under a windowsill with each other. A person with an umbrella near a building. A bottle of beer sits next to a gourmet pizza pie. A guy wearing a blue shirt is skiing. A man in striped shirt looking into an open refrigerator. A red stop sign that is on top of a pole. A bathroom sink with a facet and soap dish and three mirrors that reflect three sides of the sink. A man that is standing in a kitchen near a bowl. The front of a city bus rolls down the street. The bicyclists have formed a train, and are being towed by the city bus. A small giraffe with its head down, standing next to a tree. 
A man in a black wet suit is about to stand on his surf board. A man standing in front of a clock. The person has fallen asleep while holding their skateboard. A street with people walking on it and items on the sides of the street. A bathtub sits against a wall with a sink and toilet in the foreground. An expressway with street signs in Chinese. This is a decorated red velvet cake on a red tablet cloth. A kitchen counter that has various objects on it. Two people playing a video game on a projector THERE IS A MAN THAT IS ON A SKATE BOARD IN THE STREET A little boy sitting in front of a computer keyboard. Three birds are lined in a row in a grassy area. A man wearing a helmet rides a skateboard A WOMAN CARRYING FOOD ON TOP OF HER HEAD a close up of a bowl of food with broccoli A young woman in a gray, long sleeved t-shirt sits on top of a yellow structure looking at her cell phone. A man holding a dog sitting outside looking down. A fleet of airplanes rest at their gates at the airport. Snowboarder displaying aerial tricks in populated urban setting. A dog is in a living room sitting on the back of a couch. A clock that is above a pedestrian walk way. A pair of scissors next to some pieces of paper. A ripe banana sitting on top of a wooden table. A woman holding a cat in her arms in a car. A small white bird standing on top of a dirt field. a small plate that has some food on it Sheeps and goats eat food in their pen A man standing on top of a skateboard. A man up to hit in the middle of a baseball game A glider is flying over the beach on a foggy day. Women walking down the street holding an umbrella Woman in midst of a Wii activity, holding the remote and smiling. A clock sits on an iron part with lights above it. Two gray and white cats laying around a toilet. A red and white train sitting on the train tracks. A small inverted airplane flying in the sky. Double decker bus in front of store on empty street. Two men in a small living room are playing with the Wii. 
Many people prepping large kites on a beach. A guy in a hat skateboards across a ramp. A big black bear lays down in a lush green open field The uncooked pizza has raw tomatoes and lettuce on it. A man walking on a tennis court with a racket in his hand. A horse drawn carriage riding past a city trash truck. A man leads a horse cart carrying four people including two ladies with headscarves. A car parked in the street next to a parking meter. Four men in military uniforms are smiling while holding an item next to a table as other people look on. A black and gray goose standing in the sand A boy doing a trick in the air on his skateboard The back of the garbage truck has rotten bananas on the bottom of it. A man and a woman walking past a bus with an umbrella. The meal being eaten at the table is on a blue and white plate with spoon, fork and knife. A cutting board topped with two sandwiches next to drinks.. The sink of a large modern bathroom is full of water. A pizza with two slices missing from it. a cat that is laying down on a bed A hot dog on a bun with an abundance of yellow mustard. A man beside a valley stands beneath an umbrella in the rain A woman wearing goggles skies down a large hill. A person on a snowboard in the snow. a girl flies a kite near some other people a female in a white top is playing tennis The Asian market has a large quantity of pears available as well as other produce. Looking up at a traffic sign and street light. A woman looks at her reflection in a handheld mirror. A man removes food from an oven with hot pads. Street sign light on a traffic light pole Adult men standing in living room playing video game. A white bowl of tangerine slices on a wooden surface. A pile of veggies next to meat covered in gravy. A green backpack with a computer mouse poking out. A person with a pair of scissors about to cut hair. Dog laying down on the sofa next to a cat. A cat that is standing on a bench. 
Herd of cattle laying on a beach that has people on it a bowl with liquid flavors in containers lemon orange banana and pineapple A group of skateboarders riding down a city street A white plate topped with a sandwich and chips. A man and his son eating donuts at a restaurant. A person on a skate board in mid air by a rail. Older style single engine airplane being displayed at air show. There are several people walking in a street parade. A bird perched on brick ledge with a hole in it. A beautiful blonde holding a Nintendo Wii controller with another beautiful woman holding another Nintendo Wii controller. Grey fighter jet, with pilot, on a runway. A woman is pulling on a man's tie. A city street with traffic caught in motion at night time. A man in an inter tube by a boat in a lake. a man riding the side of a wall with a skateboard A skateboarder rides on the side of a large pipe. A dog is looking out of the window of a car. Cattle are crossing the road to a beach front. A pair of scissors stabbed onto a wooden counter top. Two photos of a tennis player rushing to hit a ball. A flatbed truck carrying the remains of a crashed light airplane. A black and white dog carrying a frisbee in a field A person is laying tennis with racket in hand Three guys at a table eating a giant pizza. A bathroom being remodeled with toilet set aside A man is taking a picture of his bathroom sink. A man and young woman fighting over a frisbee. Two bowls of food next to a pack of lemonade. We are looking down on a market square. A large boat is motoring toward the shore. Two men drink wine with their eyes closed. A small gray elephant standing in an exhibit at a zoo. a bridge with a train driving over some water A woman taking a picture of the back of her top. A desktop computer on top of a wooden desk. The zebras are grazing in the open field. The young man is talking on his cel phone. A man holding up a phone and pointing to it. 
a man in red is sitting on a barrel A tennis player getting ready to hit the ball. Several people who are skiing pose for a picture. The dog is all dressed up and ready to ride. A van and car driving down a street. A clock is on a pole under a set of windows. A person jumping up into the air for a Frisbee. A young woman sitting on a rock under an umbrella A skier performs a somersault on a ski slope. Cars are parked on the street near a traffic signal. A man drinking from a wine glass in a polo shirt Tourists among taxi and double decker bus traffic A computer is on a desk in a blue room. The road sign is visible for all to see. a man typing on a desk top computer at a desk A baseball player is swinging his bat at a pitch A man making a goofy face while sitting near a cake. A man laying on top of a couch. A huge bundle of bananas is hanging from a tree. A man at the beach flies a red, white and blue kite. A great shot of a very lit up city. A close up of the edge of a table looking at a keyboard and a mouse. A black and red train traveling down tracks. A group of people sit on a dirty boat. A man is brushing his teeth while a piece of tissue sticks out of his ear. Glass and stained wood entertainment center, with decor and a flat screen television. a couple of giraffes stand next to each other A bird perched on top of a wooden power pole. A sign is shown pointing two ways with a dog. a man walking on the beach with a red surfboard A zebra standing on a dirt road next to a bunch of deer. A plate of food showing broccoli, fish, lemon and rice. the man is leaning over taking a picture of another man A sheep is standing in the grass near water. A man holding a blue, red and green frisbee in his hands. A man carries a bulky, stuffed piece of luggage. A man holding a pair of headphones in his left hand. A para sailor goes airborne over waves in the water A high mountain of snow with a cross country skier. A fighter jet flying through a blue sky with smoke behind it. 
An airplane is mounted on a stand in a park. this is a woman using her cell phone A couch is looking quite dark with the blinds down. A pig head on a plate surrounded by a bunch of apples A skateboarder spreads her arms to balance herself as she circles the rim of a bowl shaped course. A man in a kitchen concentrating on cutting an onion on a board with a knife. Two attached train cars on a track. Cross-roads sign for Jekyll and Hyde roads attached to top of stop sign A man sitting at a table eating a sandwich next to a marker board. a man hitting a baseball during a baseball game a bathroom with some knobs built into the wall A truck sitting in the middle of heavy traffic. A large bus with several people standing out side waiting to get on. Two women playing paddle ball on a sandy surface. A computer desk sits in the corner next to a dresser. a shadowy looking man jumping over a ramp A tropical bird in flight on a sunny day. A small ham and pineapple pizza on a plate next to a spicy pepper shaker. A pizza with tomatoes, corn and a pizza cutter is laying next to it. Herd of Wilde beast and zebra walk through grass by shore line A bird standing next to a partially eaten apple. A row of surfboards sitting on the beach near the ocean. Group of people watching two skiers come down a slope. Black and white photograph of people with bicycles and skateboards next to a ramp. A salad bar filled with lots of different foods. Little kid in a cap stands next to a fire hydrant many red and white stuffed bears holding hearts grouped together A boat sailing in the water near a beach and grass. A baseball bat hanging to the side of a wall near a sign. Two women sitting on a couch with remotes in their hands. A very cute green city bus on a busy street. a young boy about to take off his helmit after playing baseball. A white sink and a shower in a room. A man flying a kite stands next to a young boy. A close up of multiple vegetables including broccoli. 
A brown horse standing on top of a grass covered hillside. A row of wooden shelves with lots of glass pottery on it. The bears look like they are hugging each other. A bed has no sheets or pillow cases some white jets are lined up on a runway An outfielder watching what is going on at home plate. A bathroom that has a yellow floor mat in it. Boy in midair while skateboarding on indoor course more than one yellow public transit bus in the road A person is skate boarding on a sidewalk. A collection of apples and oranges in wooden crates. A picture of someones bed and dresser in a bedroom. Tulips about to bloom in vase in vacant room a man rides on top of a race horse A cat peeking into a room from a curtained window A small puppy chews on a dog toy shaped like a pizza slice. A table sitting inside of a room next to a window. a man rides on a horse near a blue car Asian vegetable stir fry dish with wreath of broccoli and assorted mushroom varieties. A group of people are sitting around a wooden table. Two motorcycles side-by-side parked in a grassy area. A bunch of people on holiday at the beach. A giraffe walking in grass on a sunny day A baseball game where a player is running to 3rd base. A tennis player on the tennis court in the middle of the swing. A fritata on a plate with chicken and broccoli and tomatoes A bald man plays an informal tennis game. The tall vase on the table is holding small flowers. A man and woman brushing their teeth and taking a selfie photo with a camera in a bathroom mirror. A woman is sitting on a bench looking sad A cat next to a windows behind cans and bottles. A person typing and working on a hp laptop A green bus is in a parking lot. Man feeding a costumed woman's head chocolate cake. there is a black cat laying on a desk next to a computer A boat of some sort near a harbour. A person who is working on a laptop computer. A school bus sits in a parking lot with other cars. A cat sitting next to a banana on a shelf. 
Two men who are wearing suits and hats standing next to each other. A person falling off a skateboard onto the ground. A bathroom with a large square mirror over the sink and a brown shower curtain with circle designs. A double decker bus passes a fellow motorist on the street. A woman riding on a motorcycle inside of a show room. A bagel, cream cheese and lox is served with fresh cucumber and tomato slices. A person in a wet suit is parasailing. Flowers arranged in vases on a shelf against a wall. A young person stands on a beach with a kite board. A tractor trailer is parked in a grassy field while people lean against it. A family of giraffe walking around a stone filled hillside. A train makes its way down the tracks in a wooded area. A TV has a cartoon-like screen with a keyboard sitting idly. A baby with bib on sitting on the floor putting an unidentified object in mouth. A man standing in front of an open refrigerator filled with food. Several senior citizens are at the table, posing for the camera. A dog is sleeping on a couch in a living room. The woman in the red shirt jumps up to catch a Frisbee. A woman eating a doughnut and pointing at other doughnuts in a bowl. A man with glasses is wearing a white shirt and tie. two horses at the sunset in the field feeding Four people carrying surf boards on the beach in wet suits. A group of people on land looking at a flying boat A man talks on a cell phone while holding a camera. A man with a tennis racquet stands on a court. Guys in the gym playing soccer with teams A bowl of food and a spoon on a table. A woman riding on a bike past a busy intersection. A man outside in snow gear on a snowboard. The edge of a bed and a closed window. Michael Jackson hat and glove to celebrate a birthday. A surfer is atop a wave with arms steadying from an upward position. A lady in the dark holding a remote up. A young man is waiting on a table of people at an asian restaurant. 
A woman talks on the phone while touches a yellow cup that sits on he table. Young baseball player up to bat poised to hit the ball A bunch of plates of food such as fish, pork, watermelon, pasta salad and cocktail sauce. two pieces of toast, bacon and potatoes on a table with a cup of coffee Small dog sitting on covered table with orange toy. Two plates of food and two glasses of wine placed on a table. A man in blue shirt holding two bowls full of ice cream. a street sign attached to a pole on a street. A black hair dryer sits in a tan chair. A plate with a sandwich, fries, and a pickle are sitting on the table. A person walks along the beach with some dogs A large red double decker bus driving down a city street. Two photos of a man sitting on a private jet . A girl standing and holding a sweatshirt next to a stop sign. A brown dog on wooden floor next to a window. A closeup view of a clock on a Christmas tree. two vases on a table with flowers in it A baby holding a i phone sleeping in it's mother's arms. A traffic light is shown next to a tunnel entrance. A large white clock on the side of a wall inside of a building. A photograph hangs above the tank of a toilet with a spare roll of toilet paper. A large selection of fruit of different types in baskets. A partially eaten taco pizza is in the foreground, while another type of pizza is in the background. Three giraffes pressing their mouths to each others heads. A man is laying in bed with headphones on. A dish of pie on a wooden table. A young man sitting in a car talking on a cell phone. A gray elephant walking around inside of an enclosure at a zoo. A small boy chewing on a blue and white toy. Two microwaves sitting side by side on a countertop are marked with signs printed with the symbols for man and woman. Someone using a cell phone while brushing their teeth A pepperoni pizza and a bottle of beer A dimly lit bathroom just has a toilet and dirty sink. 
A stop sign on a street corner with building, crane, and blue sky with clouds in background. A child holding up a baseball in a mitt. The white bathroom is very sleek and modern. A man who is eating a pizza and looking out a window. A surfer sits on his surfboard while waiting for a wave. People with red suitcases walk towards a large building. A bathroom is shown with a shower and a toilet. there is a stuffed animal that has a small stuffed animal inside it A man laying on top of a couch in a living room. There are two people holding glasses of orange juice. A cup of coffee sits next to a keyboard and mouse. Two women on a bus, one talking on a cell phone. Three boys are playing soccer underneath a bridge. an elephant picks up riders from a platform Woman sitting with bananas in camp with people in background Several people are on a lake with kayaks, boards, and boats. A young man sits on a bed that is made-up with lots of pillows. A person standing on top of a tennis court while wearing a white hat. Two old suitcases, a blue one and a brown one, are stacked one on top of the other. A man riding an elephant plays basketball while others watch. A woman makes a crazy face over a plate of food. A man holding a toilet seat on a square toilet in a bathroom. Old photo of man with a beer sitting in the ground with others. A bathroom sink under a mirror on top of a counter. A girl plays with a cat on the ground Two black birds sitting on the branches of a tree. A subway train that is crossing over a river by a bridge. A picture of a snowboarder jumping right into the air. A computer on a desk with a bottle of beer next to it A woman is posing next to a stop sign. A man with a tennis racket jumping on the grass a man wearing a back pack walking toward another man A white airplane with two large propellers sitting on a runway. A clock on a pole in front of a tree. A woman standing on top of a tennis court holding a racquet. 
A man crosses a street at a corner with a market on it. a couple of people riding on some big elephants a kid poses on a side walk as a baseball player A bunch of zebras that are standing in the dirt. Bathroom with glass shower door and art work hang above the toliet A boat is in the dimly lit water by the city. There is a bathroom with a toilet and a bidet. A women who is riding a skateboard in the street. A nighttime picture of Big Ben in London, England. Here is an image of an outdoor place. A couple of women are playing tennis on astro turf. A kitchen is being installed with stainless steel refrigerator and glue is on the island. A man giving a thumbs up while on a cell phone. A group of young people throw a frisbee back and forth. A brown bear in the woods under a tree. A man is frowning while standing in an empty room. Two giraffes graze on treetops in the distance. A woman cutting a portion of pizza from a tray next to a bowl of fruit A plane flying over a river in a rural area. a man and a woman sitting at a table eating food A man riding a bike on a dirt path through a forest. An elephant walking down the side of a dirt road. The hood a street motorcycle, that has the Italian color, the number 7 and ALITala on it a brown bear is laying on a rock and some trees A vase of flowers is sitting on a white table. A sausage link is strung out on a board ready to be cut. A colorful vase of flowers sitting on a glass table She makes riding the waves look fun and easy. A small restroom with a single toilet and wooden toilet seat. The kitten is enjoying the treats on the plate. A lovely cat have a cup to his face. A group of animals walking in the grass next to a road. A few men carrying some surfboards on a beach. a small plate of cake on a table A person that is doing a skateboard trick in the air. a close up of a pizza on a pan on a table A red umbrella with the ruins of a building in the background. A large gray cat laying on the floor next to a couch. 
A man standing under a blue cloudy sky. People sit around a table full of hot dogs and fries. A bus traveling on a road with other vehicles beside a large building. A sandwich with eggs and cheese on paper Skiers riding a ski lift and looking back behind them. A woman in skies is standing in the snow The young boy is walking with his glove on. A man standing on a surfboard catching a wave. Rows of Pullman bags for sale at a store. An open door on a train at the end of a platform. A cat climbing down beside a t.v. screen. A computer sitting on top of a wooden desk near a window. A large bridge spanning the width of a bridge near a tall building. A car perched on a table looking closely at the television screen. Three friends look past a bottle of wine to the end of the table. An old building with rote iron railings and landings. An extreme close up of an expensive gaming keyboard. a young boy sliding down a snowy hill on a snowboard A woman walks down an empty street next to a large street clock. Assorted food items with paper wrapper ready for consumption. A oven made of iron filled with pots and pans. That building looks like the building downtown in Atlanta. A picture of a man's face next to another picture of a person's arm holding a glass of wine and a remote control. A woman holding two pairs of scissors next to a display. Man in black shirt and jeans doing a skateboard trick. A woman and black cat together in a bed. an image of a man that is riding his bike up high Shot of bathroom with bath on far side near toilet. A dog is laying and resting on a walkway. A young persons clean and orderly bedroom and desk. a man in a blazer uses a cell phone someone having a chili cheese hot dog for lunch some glass ware is on a wood shelf A bathroom with a white toilet next to a shower. A horse is standing inside a pen next to a smaller horse. A girl has her pony by the harness. A kitchen with an oven, stove, microwave, and refrigerator. 
Image of a bathroom showing the vanity and sink area. A large bathroom with a toilet and sink. The person is deciding whether to try the skateboard trick. A photo of a horse on the back drop of an ocean A work desk cluttered with stamps and work supplies A woman walks down the street with an umbrella. A man is riding his bike down a subway area under a Clearance sign. Flowers in a vase sitting on a window seal. Black and white cats laying down in the green grass. Pastel umbrellas hang above a garden in the lobby of a fancy building. A zebra carefully walking around in a zoo pen. Stud farm with horses and trainers in a vast ground. a cat and a sheep are standing in a field THERE IS A WHITE WII CONNSOLE AND GAME ON THE TABLE A fish eye view of part of a bathroom An owl is eating the flesh of another bird. A white parrot standing next to a jungle covered hillside. A group of people standing in the middle of the street. Two women are cutting a heart shaped cake together A farm stand selling plants and apples by the pound or quart. Sidewalk under construction with safety cones by the fire hydrant. A man sitting on top of a pole next to a fire hydrant. A small tree is covered in snow from the storm. A man is dressed like a clown magician while pointing at a picture on his cell phone. A man is standing outside of the water observing the huge flock of birds. Black and white photograph of people observing sheep in a field there are many computers and lap tops on this desk Smiling indoor tennis players and their racquets with a football Some young children preparing for a baseball pitch. A kitchen area features a silver refrigerator, stove and counters on dark, wood flooring. Young woman dressed in black and white playing soccer. A flower in a pot standing on a table. a baseball player swinging a bat at a ball A road sign next to a building and tree A man dives in to catch a frisbee A giraffe in an enclosed area eats from branches up high. 
a very pretty kite is flying high in the sky. An orange cat sitting in the passenger seat of a car. An assortment of remote controls lined up on the table A skateboarder who is jumping down a flight of stairs A skateboarder performs a difficult skill in a skate park. A bedroom has pink walls and a blue bedspread. A stack of donuts sitting on a piece of paper. A white boat on water with seagulls and umbrella in the foreground. A train coming into the train station A zebra is outside enjoying the grass before him The bear is on the table in front of a glass of beer. An intersection with a pole that has signs on it. A crowd of people on the sidewalk and an airplane overhead. A bathroom with a tub, toilet, sink and a mirror with red edging. A photo taken from the ground of a person standing with their skateboard. There is a stop sign in a field. A group of people sitting around dinner tables. A senior tennis player prepares to backhand the ball. A woman standing next to a tree holding a pink frisbee. A baseball player taking a swing at a ball A woman standing next to a standing toilet. a person riding skis jumping in the air this is a woman on skis posing for a picture an elephant with its mouth open and some bushes and trees A dog sitting in a wooden rocking chair outside. Close up view of a large glass of wine. A woman talking on a phone while wearing glasses. There is a hanging clock over a set of stairs. An eagle flying past a group of green trees. A bento box with chopsticks containing strawberries, carrots, sandwiches, broccoli, lettuce, and some other foods. Interiors of a kitchen containing several household items. A man standing in a kitchen with a large pan of batter. A train traveling across a snow covered hillside. Two traffic sings sit above a Parking sign. A man riding a skateboard up the side of a ramp. 
Baseball players at the pitch playing and a crowd watch this is a man picking over bunches of bananas A Muslim man is being interviewed on TV an image of two urinals inside of a public restroom Trains sitting side by side on a train track. An empty intersection in a mountainous area. A group of people eating food at a table together. A man walking on a brick sidewalk with an umbrella. A surfer riding a large wave in the ocean. Neckties are tied together around the circumference of the pole. A man holding a tennis racket up in the air A man taking a selfie while brushing his teeth and looking in the mirror A person that is trying to get a frisbee. Commercial airliner flying near mast on cloudy day. Some pizza with toppings and some pasta on a plate. an image of a rotten fruit and burnt hot dog A young lady sitting at a table covered in food. A horse standing in a snow covered field in front of some buildings. A group of people sitting around a couple of benches. A baseball player with one leg kicked up preparing to throw a ball An empty kitchen is shown with empty counters. a white bus driving in a parking lot with a truck beside it A toilet in a bathroom that is being built. A vase and lids are sitting on a table. a small passenger plane sitting in a filed of airplanes A motorbike with blue and silver bones painted on it A baseball game with players in uniform and one player swinging the bat at home plate. An empty bed with gray sheets and a small lamp A woman carrying a pink umbrella wearing a blue scarf. group of people on bicycles waiting at a stop light a train going down a track by a platform with stairs A kitchen table that has a vase on it. A cake shaped like a bear has a sparkler and candle on it. The large cat fell asleep in the chair when no one was home. The animals look very skinny and unhealthy as they walk around. A couple of zebras graze in their zoo habitat. A woman is taking a picture of herself in the mirror with a camera. 
A grey cat is being held by a woman at a cat show. A bird flying over a small city with small buildings. Two young children playing in a living room A man with graying hair looks down at a stand full of yellow bananas. A train on the tracks under the electrical lines. This person ordered this dish at a restaurant The dog is on the couch in the room with the large TV. A large kitchen with wood floors and cabinets Soldiers on a train saying, "goodbye," to nurses. A bathroom with a toilet, sink, towel rack and paper roll. A blue car that is parked on the side of the street. A slightly knocked over stop sign next to a small empty road. A line of buses parked in a bush lot with a fence. A guy doing skateboard tricks in front of a crowd of people. Giraffe and small dog stare at each other at the zoo A group of guys playing Frisbee in a park The retro looking living room has blue couches and pictures on the wall. A pile of luggage is secured to the top of a small car. A plate holds a large salad with broccoli. Many horses are on the beach near the ocean. some people are walking around a city with umbrellas Two ladies wearing black texting on their cell phones. Two men in a park playing a game of basketball Metal street signs with street names and a stop sign. A bird is standing upright in the water and leaves. Two guys playing a game on the WII. Young girl perched on rock about to rcieve thrown frisbee. a person in a living room playing nintendo wii near a window A tennis player is playing tennis on the court. A sign on the street that lets you know where you are. A cat is using the toilet to go to the bathroom. a person on a beach holding a surf board A white and black boat traveling near the Golden Gate bridge. A boat sailing close to shore near a lighthouse. Looking up at a dirt bike rider leaping over a jump Elephants at the zoo holding each other's trunks. A man and a woman that are sitting on a couch. A very young girl brushing her blonde hair. 
an image of a man drawing pictures on the sidewalk That concrete is going to be hard on his body if he misses this skateboard trick. Boys sitting on a bench at a baseball game. A man riding on the back of a motorcycle down a road. A row of red fire hydrants sitting in the middle of green bushes. A table with a television and a picture of electrical gadgets. A person wiping out on a surfboard on a wave. A bus traveling down the street next to a bunch of cars. A plane makes a landing at an airport. A man standing on a tennis court holding a tennis raquet. A woman standing talking to her cellphone next to a man in glasses. A sink and tub with towels in a room. A soccer player runs up to kick the ball while the crowd watches. A pregnant belly with a teddy bear on top of it A small table space that is in a tiny motel room. a glass vase with some flowers inside of it Blue and green passenger train passing down the side of the small valley. A lone zebra stands under a tree branch. a large pile of teddy bears in many different designs A couple of men standing on top of a field. Tourists photographing a steam locomotive pulling into a station. An old style truck that is parked on the grass. a table that has a banana and some ice cream on it A horse is being led away by its bridle A tall building with a massive clock on it's face. Two people standing in a room playing video games A covered dish beside a sandwich and other dishes of food up to the right. One person cutting a cake while the other pulls out slices on a spatula. A tie with the picture of a deer on it sitting on a shirt. A woman is wearing sunglasses and holding a parasol. Guys in the park playing Frisbee golf on a cold day. a bathroom with cream colored walls and a broken counter on the floor A horse or zebra in the middle of some shade trees. a black and white kitty laying next to a chair leg A light that is on a table next to a laptop. A man rared back with his racquet on a tennis court. 
some stuff blended up in a blender for some serious gainz a person at a desk with a laptop and a note book A man flying through the air while riding a skateboard. a platter and assortment of different desserts and cakes A group of people sitting at a table around a pizza. Two skateboarders doing tricks at a skate park A pair of skis are placed in the snow. Cat carefully examining a skateboard on a hardwood floor. Two girls in red chasing a white soccer ball. Water spews into the air from a fire hydrant. A street corner with trees that are covered in snow. A horse is running down the dirt path. Two young ladies are sleeping side-by-side in a subway station. A picture of a man holding a remote. a person lays on the snow with their feet up A large crane sitting next to a building under construction. A red train leaving a train station with man watching. A narrow lane runs between rows of parked buses on a rainy day. A glass of wine and a smart phone sits next to a laptop computer. A little girl that is sitting on a kitchen counter. A hot dog, french fries, and a spread. THERE IS A CAKE ON THE TABLE A close up photo of a train set with a little train going by. A zebra standing in a grassy field by a woods. Near a wooden bench, a baby in blue places her rubber boot upon a skateboard. An old fashioned train is parked as workers gather around it. a refrigerator with stickers on it sits in a corner in front of a window The young boy is standing and playing the game. A family sitting at a outdoor table at a restaurant. There is a truck pulling a camper trailer The display of the Magic Bullet blender, with a price tag of 53.99. Pale shelves with bananas and other items and a black marble topped L shaped counter against a brick wall with cooktop, sink, and various kitchen items, meet, leaving a small section of inlaid wood floor. Small white bathroom with a black-and-white shower curtain. A green couch sitting in between two lamps. 
The skateboarder is performing a trick, mid jump. A light colored dog chewing up a child's toy on the carpet. Boy with legs out stretched taking a jump with a skate board. A vintage photo of hurricane damage to boats. A picture of a living room in a house. a lady with a real colorful umbrella that is standing outside Two women in white tennis outfits hold out their rackets as a crowd watches. Girl on a skateboard texting by the beach Breakfast for four with omelets, fried eggs, bacon, ham, french toast and pancakes. A man dressed all in blue playing tennis. A clear vase holding white flowers on a table. A kitchen area with many copper pots and bowls on display. Two people huddle on a bench under their belongings. A cat is sitting on a couch while leaning against the couch's arm. An upwards-looking view of a Stop Sign, an All Way sign, and a One Way sign. A person riding on top of an elephant near a tree. A man walking with a dog that has a frisbee in his mouth. Two men pushing a full cart down the road The street is lined up and down with motorcycles. a close up of a person holding food A black horse in the middle of a field with a mountain in the background. A bench on sidewalk below tree next to lamppost. A computer desk with a monitor, phone, and laptop on top of it. A couple of wine glasses next to some bottles. There is a small window in a stone building A woman in yellow raises her tennis racket. Flying a kite on a wide beach with few people. A man standing next to a hipster girl. A man is taking a picture in a rear view mirror. there is a man flattening dough on a tray A group of people standing and sitting around a table. A blue shelf filled with Chiquita bananas in a store. A man with sun glasses and wearing a hat laying on a bed. A man in bright green prepares to serve a tennis ball. Here is A tender moment among zebras this afternoon A Muslim lady holding a child that is being fed a birthday cake. A plate of food is arranged with fruit and vegetables. 
Horse and rider walking on sandy beach at ocean. Skateboarder and board in mid air at a contoured park. Cat sleeping near the sun on bed covers. a group of guys playing with the wii A plate of food containing broccoli,cauliflower, celery and other foods Puppy and full grown dog outside near some refuse A sleeping black and white dog wearing a pirate hat. A hot dog with toppings and potato salad There is a large group of skiers standing on a wide field Two girls walk along a path near a waterfront. Small brown dog laying in between a person's shoes. a beach covered with umbrellas and tourists relaxing A group of people flying kites under cloudy skies. Two old people in motion while playing a Wii. A private airplane is flying in the sky. A woman sits in a u-shaped bench with her legs elevated A street with people walking about it and a kite above. A red, double-decker bus drives through the town as dusk approaches. A metal sink filled with many lemons and apples. A man in a baseball uniform standing on a baseball field. A man smiling for a photograph and holding papers in his hand. a yellow long tailed kite being put into the air by a couple Three lit candles on a chocolate birthday cake. A baseball player pitching a ball to a batter. A train coming on the track in a train station A person is doing a trick on his skateboard. A player chases a tennis ball while the umpire watches. A person giving a thumbs up to a computer screen. Closed toilet and shower in small, bright bathroom. A zoo keeper on a scale holding a giraffe with a "me gusta face" A woman and child on a silver motorcycle. Brown and white cat sleeping on desk next to a computer. a person on a snow board does a trick over a hill A couple of guying chasing after a Frisbee. A person in a suit and tie looking unhappy. A dark colored river with several horses on the other side near the trees and brush. A bicycle with a springs mounted under the seat. Two pieces of pizza on a plate pepperoni. 
A decorated room with no one in it has a table in the middle with various items on top. Two trains driving inside of a train station. A woman in a costume inspired by the White Rabbit from "Alice in Wonderland." A baseball player in mid swing and a catcher ready with his glove. A man biting into a slice of pizza. Advertising and traffic clutter a busy city street a crowd of people by a school bus and a girl holding a big blue bowl Two people are in front of a deck, and about to go skiing. a kitchen with a sink trashcan refrigerator and a heater a building with a clock sitting near the top of it a man in a red and gray snow outfit stands on skis holding his ski poles as he stands near other skiers and snowboarders. There are different citrus fruits in the bowl. A bed and desk in a small room. A horse that is grazing around in the grass. A tie rack filled with lots of different colored ties. A couple drinking wine on a horse-drawn carriage ride through the countryside. half empty bowl of cereal with a loaf of bread, a banana, and beverage A bathroom with raised shower, sink, widow and mirror. a very large building that has a clock on top A stop sign out front of a construction site A flat screen TV mounted to a wall over a lamp. A car and a large truck on a city street. A young man swinging a baseball bat on top of a field. a big basket of bananas next to some people A chocolate cake sits half eaten on a table. a public restroom with a white toilet and toilet paper A red couch behind a brown ottoman with a cat sitting on top of it. A stove with a willet cooking banana and a moka pot. Hot dogs and buns cooking on a grill. A beer advertisement on the side of a passenger bus. a man wearing a suit and tie standing in a room. A group of surfboards on a rack on the beach A red fire hydrant sitting next to a green plant. a surfer wearing a wet suit is surfing on a sunny day A plate with with different kinds of food on it. 
A man and a boy playing Wii in a living room A man dressed in white is on a horse. SOME GOOD WAVES FOR TWO SURFERS IN THE OCEAN A smiling man that has long dread locks in his hair. The tennis player is swinging the tennis racket. A zebra drinks from a pool of water in a grassy field. A kid holds a sandwich and a big candy cookie. A boy leans on a counter next an almost empty soda bottle. a skier with a red jacket is next to some water and snow A man is smiling while talking on his cell phone. A zebra is grazing in an enclosure while an ostrich sits in the background. The single train car is painted black, yellow, and orange. A man standing in front of a fridge with a lot of magnets on it. The interior of a public bathroom with multiple sinks. A man and woman standing next to each other with the woman holding an umbrella. A parking lot filled with yellow school buses parked side by side. A kitchen with a sink, coffee pot, refrigerator and shelves. A person handling bread over an open oven. A woman with a tennis racket is running a tray covered with cheese fries, a corn dog and a hot dog A train is coming down the track near old warehouses. Two teams playing soccer with one team kicking the ball down field. Shadow from a street sign with a message written on it. Several birds overlook the skyline of a distant city. A woman poses with avocado sandwich lunch at an outdoor restaurant A living room with chairs, a table, and painted walls. Two women shaking hands at a tennis match. A police officer mounted on a horse while two children pet the horse. Lone giraffe lying in dirt area of enclosure. there are two hot dogs on a fake paper plate a black red and white double decker bus people and buildings A boy grips his skateboard as he jumps the edge of a half pipe. A couple of elephants roaming through the tall grass A close-up photo of a white and brown cow. A young girl standing under a window next to a toilet. 
A tennis player stands by her equipment bag holding two rackets. A mountain view with two birds flying overhead A boy and woman in an open area in shopping center with three park benches. A street identifier installed as part of a curb in the sidewalk. A large elephant standing in a grass field. A young man is riding a skateboard with other young men watching him. A view of the city is very colorful. The picture shows the underside of a jumping snowboarder. A black and white train on tracks next to a station. There is a cat giving itself a bath while laying on a luggage. A woman with a dog talking to two people sitting on a bench. Several children and some adults celebrating a birthday party. A black and white zebra is standing in the green grass. a orange cat sitting on a half rotted wooden bench A poppy seed muffin with orange slices on a plate. A man on snow skis traveling on some snow. The three giraffes tower over the smaller animals. Two beds in a tiled room, both with lime green bedspreads. A pitching about to throw a baseball at a game. A vase that is placed outside of a window. Pumpkins sit under a spooky lit up Halloween display. Two people roasting hot dogs outside on a stick. two people on a beach with a kite Small group of kites being flown nice day. Several people in the heavy snow on skis. A green and silver train passing by a building. A group of people walking down a street on a rainy day. Slices of pepperoni pizza on a baking tray. People standing with sheared sheep inside a fenced enclosure. a coupe of people sit on a couch while laughing A young man kissing the top of a young woman's head. Vases and figurines line a long piece of furniture next to chairs, a lamp, and a picture. A bird walking in the grass with it's beak open. A man holding a large bag of lime green luggage. A baby sitting on a bed next to a large brown and little white teddy bear. A white horse looking up for a photo at a fence side. 
A stop sign that has been tagged with graffiti. Two people in a group with one holding up a phone. a couple of people that are walking on a beach A man standing on dirt holding a pink frisbee A baby is sleeping in a swing in a room. a close up of a cat in an open luggage bag Many people on the city street with umbrellas. A man rides two brown cows across water. A meal of a sandwich and soup sits on a wooden table. a close up of a bunch of green apples a kitchen area with a stove-top oven and sink and cabinet with a dishrack A woman is blurry as she rides her bike next to shops in a city. The horde of pigeons take advantage of the crumbs left by pedestrians. A group of elephants is standing on grass. a person pointing to what they are putting on their snadwich. A man holding a snowboard standing at the bottom of steps. A man walking along the platform next to a subway car. A train covered in blue paint and graffiti. A bus displays an In Service sign, traveling down a road A woman holding a cat up tight against her. A mother and her child sitting on a couch using laptop computers. Smiling woman standing in front of refrigerator with wine bottles on top. The giraffe is posing for the picture near the wooded area. This train car features a variety of colors and carries passengers. A zebra that is outside eating some grass. A young man in black clothes holding a yellow frisbee Group of children sitting at table eating pizza off plate Museum with ancient artifacts and people looking at them. Several chocolate donuts with decorations sitting on a pink mat. Several street signs shown on a city street. A kitchen that has various types of appliances. A group of people are standing around a caged giraffe. People play in the water and fly kites at the beach. A person wearing glasses is walking away from a stop sign. Person bundled up out for a ski in the soft snow a green street sign surrounded by some trees A city street filled with tall buildings and motorcycles. 
A traffic light is displaying a green smiley face. A pile of vegetables sitting on top of a wooden table. A marching band stands in a street in front of spectators. The blue bathroom is small, sleek and efficient. a cup of coffee a laptop and a table Three women on a couch talking to each other. A locomotive train traveling across a train trestle. A street sign of NE 5th st and the back of a stop sign A water bottle with ear buds on it in front of a laptop. A cat is looking at himself in the mirror. A girl standing in a room holding a green Frisbee. THERE IS AN AIR PLANE THAT IS FLYING IN THE SKY Two planes are flying by one another and one is putting off pink smoke while another puts off blue. A room with three old tubs and peeling walls. The boy is curious about what is beyond the umbrella. The plane is flying over the parked cars. A woman and a black and white dog on the beach. Several cars at an intersection on a city street. Female tennis player in blue outfit returning volley. A bowl of a kind of vegetable stew on a table. An individual snowboarding down a snow covered hill. two elephants in tall grass with trees in the background A tennis player prepares to hit a tennis ball, while others watch. Oranges hanging from an orange tree in an orange grove. The little boy eats a slice of pizza. People standing around the stove and counter fixing plates of food Two laptops and monitor on a desk in front of another monitor. A pulley is seen in a room with lots of stuff on shelves. A fake bear that is standing in the snow. A pool surrounded with chairs and trees. Man riding a snow board down a long slick area. Several kites are flying on the beach in the blue sky. a living room with a tv a desk and another tv a black cat walking into a kitchen A group of motorcycles parked in front of a tall building. A giant neon Coca Cola sign glows in the stadium during a baseball game. A bathroom counter has purple orchids on it. An orange fruit beginning to grow on a tree. 
The city streets are busy this time of night. A skier flying high in the air over a snowy hill. People standing in a long line at a train station. A round intersection on a surburban street with one floor homes. a couple of people that are standing next to each other A large group of people standing around in red, white and blue colors A dog is sitting on a chair near a stuffed animal. a group of people that are walking down a sidewalk A man and a baby lying on the couch in a living room a giraffe eating leaves from a tree with its butt to the camera A woman holding a racquet and tennis ball on a court. A man wearing a suit directs two men riding horses through a city A young boy touching a cow through a metal fence A black and white photo of a train system going down tracks. An aerial view of a street corner with a STOP sign and a ONE WAY sign above it. Two men playing professional soccer on a field. A man riding a wave on top of a surfboard. A woman in a dark cave holding two sheep The room has two couches in front of a tv. The ingredients are on the kitchen counter next to the blender. some people at a table with a umbrella silverware and some drinks A man that is wearing a suit and a pink tie. A gray and white tiger striped cat sitting in front of a brickwall The fried rice has vegetables and meat in it. a bunch of people on a snow slope in the moutains A toilet attached to a red and white brick wall. A train traveling along side of a road. A white table topped with plates and bowls of food. a white keyboard sitting next to a white computer mouse on a mouse pad. two stage coaches traveling down a snow covered trail there is a clock on the side of the old building. A group of people are lined up skiing. A man and his son playing Frisbee in a park Female soccer player maneuvering ball on grassy field. Skiers skiing in the snow with their skis on the ski slope. A man in a purple shirt doing a trick on a skateboard. A British Airways airplane flying in the air. 
A man is doing an upside-down flip on his motorbike way up high in the clouds. A man does a jump on a skateboard. A white Ecohopper bus driving down a street. A PICTURE OF A WEATHERED YELLOW AND BLACK TRAIN A bathroom with toilet, mirror, picture and tub. A hand holds an old-style flip phone in the open position. A boy on a skateboard at the top of a rise on a skateboard ramp. A large wooden clock hangs from the ceiling in a store. A person holding two ski poles while standing in the snow. a dog that is rolling down a skateboard there are many people on the road riding motorcycles a group of baseball players standing in a field Large bird preparing to fly from beach area. A group of brown cows grazing in a field A LOT OF MEN ARE ON HORSES A child is playing in a recreational park. A street sweeper machine parked against a tree by a street. a living room with some book cases beside the fireplace a lady wearing a red sweater with an empty plate A teddy bear with multiple colors with a new tag still on it. A man laying down in the snow with skies on Passengers near a yellow and blue ski airplane. A large display of apples at a market. A red couch that has a laptop computer on it. A bedroom that is cluttered and needs organization. A MAN DRESSED AS A PIRATE AT A PARTY a young man is performing a skateboard trick A baseball field full of baseball players standing on a field. A bathroom with a sink on the left under a mirror and a toilet. To bananas sitting on two blue plastic bowls. The yellow commuter train is pulling into the station. Photograph of a public toilet as taken from above A LADY IN YELLOW ON THE COURT PLAYING TENNIS A woman is shown holding a pizza with zucchini An elegant bathroom has a light up mirror, marble counter tops and dual sinks. A dog sitting on top of a made bed. A man trying to block another man with a frisbee during a game. A yellow cat is sitting on a green blanket. The plate of food has a salad and toast on it. 
A snow skier skiing down the ski slope. a guy that is jumping on a skateboard A white kitchen with a counter in the middle. A blurry picture of a bird sitting on a wire. a motor bike parked on the side of a road across from cars A man is looking at his laptop while chatting on the phone. A man that is sitting down near a bird. a bus in a city at night time stopped A person jetskiing in the water and creating a huge wave. A TV sitting on top of a counter inside of a store. A parking meter on the side of a street A man leading a flock of sheep down a street. A bunch of stuffed bears altogether during Christmas. one sheep is standing in some tall grass a family is sitting down at a table to have cake a cappuccino and a overripe banana sit on a table A person rides horseback down a beach along the ocean. Some men and women in white shirts and bow ties standing in a row. Skiers lined up at the starting point for a race A baby sleeps sitting up while clutching a teddy bear. Multiple computers and soldering equipment on two desks. The kitchen has a stove, and a microwave in it. A baseball player takes a swing at a ball. A man making pizza in an oven on a wooden board. A woman sitting back on a couch holding a little white dog. A person standing next to a tall giraffe. The interior of a modern kitchen including an eating area A cat sitting on top of a blanket on a bed. A young man in a bathroom taking a picture of himself using the bathroom mirror. The graffiti on this Stop sign denotes a positive impact. A small pizza sitting on a decorative plate. a bird that is sitting on a pole outsid A cat climbing on top of a suitcase. Man herding sheep down a street with a child in front of the herd A bathroom with a small window and a odd toilet. A person helping a child stand on a skateboard. A baby girl on table next to cake and balloons. A woman sitting up in bead looking out the window. A blue and white fire hydrant on a lawn. A person skiing down a snowy mountain side. 
The purple city bus is noticeable against the brick buildings. A bathroom with a standup shower, toilet and sink. This person is laying in bed while reading a book. A group of people standing on top of a beach. A woman standing in front of a counter full of baked goods. A pack of elephants stand in a grassy plain. Several people are doing something with remote controls. Sheep are grazing in a field in the distance. There is a surfer holding on to a sail in the ocean A kitchen with drawers, a stove and a sink. A tall giraffe standing next to a tree on a grassy field. SKIER COMING DOWN THE SLOPES JUST OUTSIDE THE CABIN A guy and a girl are sitting in rocking chairs using laptops. A large pizza sliced in half in a box. a herd of zebras drinking water at a lake A cat laying on top of a suitcase laying on the floor. A pile of chicken, carrots, and brussel sprouts. Two buses driving over a bridge with boats in the background. The slightly overcooked pizza is inside of a pizza box. A box of cookies sits by a wedding cake decorated with berries. A laptop and an old computer display text while sitting near a window. An airport with an airplane that has a red tale a number of different doughnuts on a table The couch is directly in front of a huge television set. A home desk has a computer, lamp, and knick-knacks. The black and grey cat is facing the other way A bowl of vegetables on a wooden table. a close up of a jet flying in the air The man in the red cart held the reigns controlling a pair of obedient horses. The plate has broccoli and an egg roll on it. a bunch of computers that are on a desk there are many surf boards laying on top of each other Man in yellow shirt grinding down a railing with his skateboard. Two men surfing in water next to a dock. Three horse wearing coats walk around a large field. A man looking downward holding a teddy bear. The seat of the wooden bench is covered in snow. 
A group of friends playing a motion controlled video game Toothbrushes and toothpaste lay on the counter by the sink. THIS IS A PHOTO OF A MAN WORKING ON SOME SORT OF CRAFT PRJECT A man driving a carriage pulled by three horses. A baseball hitter swings at the pitched ball A sailboat is floating on a lake under a cloudy sky. One person flies a kite near a crowded sidewalk. Freshly shorn sheep eat grass in a mountain pasture A girl looks into the distance, while holding a clicker. Two men on a motorcycle pass through a crosswalk. A little girl eating a piece of birthday cake at a kitchen table. Horse and carriage going down the street in the city. A ship docked at an empty harbor at sunset. The cat is on the desk by the two computers. Man lying on a bed in a furniture store display. Female tennis player in a purple uniform ready to play. A couple of sheep are in the grass by a barn. A bench sits in the sun near a path and some water. TWO POLAR BEARS IN THE POOL EACH ONE HOLDING SOMETHING ORANGE Male and female intent while attending a function. A surfboard standing in the sand near trees and the water. a few small boats in a large body of water A plate that has several sandwiches on it. sheep standing next to building near a city street A bathroom with a sink, mirror, and toilet and other items The man is driving the bus full of people A cat dosing off while lying on a chair. Screen of an iPhone with German language text held in a person's hand. a baseball player swinging a bat on the field A kitchen with a pull out ironing board and refrigerator. A street sign on the side a of cement wall. some parked bicycles and two women on a bench and a book A hideous bathroom that is pink in theme. A man is playing with a frisbee on the beach. a tall clock tower near a building with a dark background A line of girls holding frisbees or plates outside There are two streets signs attached to the stop sign. Two red and white stop signs on a street. 
a decorated vase is sitting on the table top there are many people that are sitting on the benches two trains on opposite sides of a railway platform median. A girl on a surf board riding a wave in on the ocean. A man wearing a blue shirt while eating a hot dog. A parade float with people on top of it A chair and contraption between a grandfather clock and a plaque on a floor. The surfer is on the surfboard riding a wave. A large clock tower on the side of the water. A geese and several goslings in a pond A zebra walking in the grass while other animals are standing around behind him. A picture of a snowy street with a red fire hydrant. Two cars parked in the grass as a train goes by. A bird sitting on a bird feeder next to green trees. A bathroom is shown in dim orange lighting. this lady is using controllers and those men are watching a toy train on train track next to a toy railway platform. There's a computer monitor on a desk with speakers around it A photograph of Key Bank with a clock under the sign. A bicycle is parked in the narrow alleyway. A double decker bus drives down the street. A horse grazing in a pasture in a field, with mountains in the background. A girl that is standing away from the camera and has a Wii remote in her hand. A skateboarder performing tricks under the lights at night. A large red bed with a black cat laying on top of it. a black dog sitting in a white bathroom A man standing in front of a train car door. a person sitting on steps with a cell phone A man who is holding up a parachute. A man using a One Laptop Per Child computer, while another man uses a standard desktop computer. An open cell phone next to a sprouting sunflower seed. A young man jumps up to catch a Frisbee underneath his legs. a big zebra that has his mouth on top of it A man is sitting on the couch eating. Various people eating in a restaurant at a table. A man with a tennis racket at a tennis court. 
Bright yellow furniture sitting in a living room next to a lamp. A bathroom is shown with a door cracked. A man without a shirt is brushing his teeth. A man playing Frisbee on a beach on a cool morning. A woman is next to a scooter and cat. A small boat with people on the top. Three boats sit on dry land, the nearest one is called Lauren Jade. The motorcycle officer wearing a helmet drives near a crowd of people. People making a for sale sign on a car. A man riding a motorcycle down a street and surrounded by houses. A toilet bowl with a bucket and trash can by it. A brown vase full of colorful flowers in front of a mirror. A group of giraffes stand in a large open field. neatly made bed with blue sheets in a pink room two bikes parked near a clock pole on a side walk A couple of boys wearing ties giving each other a hug. a living room with a table chairs and a tv A tennis player at the match is returning a volley. The last car of a train sits on train tracks. A bathroom with vanity with sink, toilet and tub. Two men toilets, one regular toilet, and a sink in a bathroom A chair sitting next to a flat screen TV. A group of zebras are next to a patio table. A green street sign mounted to a white street light pole. A child posing on top of a mountain while they ski. A herd of zebra walking across a dry grass field. A boy in a tie poses for a picture. A small yellow room with a couch, table and lamp and wood flooring. Elephants gathered in the corner of an enclosure A long train coming down the railroad tracks. Lion statue with a large structural clock in the distance a lady that is holding a laptop sitting by a street A woman is holding a racket on a tennis court. Three dogs are following three women toward the entrance of a building. Cattle grazing in partially snow covered ground in winter. An anniversary cake on a table with a picture and glass of wine. A young oerson is raiding a small fridge in their room. A pesto and chicken pizza cut into eight slices. 
A green double decker bus sitting on top of a parking lot. An old bench on a porch of someone's house in the valleys. Some cars at a red light at an intersection stopped. A model kitchen is shown with white appliances. A large blue bus on the side of a road. A group of motorcyclists fly the Puerto Rican flag. A woman and a man look to the left while the woman points A photo dark room with the red light on. A giraffe standing near a tree by a body of water. A woman and small boy feeding some sheep Plates of food cover a table and includes vegetables and potatoes. The bird with the purple feathers is perched on the branch of the tree. You are proudly witnessing a 360 Ollie in progression A large kitchen with a table in the middle A cat is standing on top of a TV trying to look out the curtains. A pizza sitting on a table, with a spatula in the back. A woman pushing a stroller and looking at a cellphone. A cat laying in front of a computer next to a mouse. A small blue and white plate sitting on a small runway. A fire station on a street in a downtown area A large white airplane parked on a runway. A person is holding a tomato above a tray. a smiling woman standing next to a baby in a high chair there is a female tennis player serving the ball Two baseball players and an umpire standing at home base. She is going to nail that tennis ball. A view of a train station from the parking lot. Two white cattle standing in water next to some ducks. A bathroom mirror over a marble sink with the lights turned on. a lady with a knife laying down in a bed. a bedroom with some posters a blue and white bed and some pillows The restaurant platter piles french fries high with a juicy burger. Man prepares to throw a frisbee in an open park. A plate of food with a pizza on it. A view ofa bar from behind the actual bar. A group of children playing baseball out side. The commuters are busy while they wait for their plane. A few apples and a banana sit in a dark bowl. 
A person on an ocean beach flying a kite A train is traveling past a grassy area with a foot path. A couple of kids petting sheep inside of a corral The sheep and the dog are on a race. A group of people sit at a table with cake. A man riding a board on top of waves in the ocean. A surfer catches a wave on a white and green surfboard with another surfer in the water behind. Very long Coney dog on a long buffet table in a ball room There is a statue of a man's head next to a cat. A two sided pizza is being cut by someone. A jar filled with liquid sits on a wood surface. Group of people riding their bicycles on a city street. A worn and tattered pink and black bag. A young giraffe leaning over a tall bush in a dry field. A large table with a laptop and home computer. A man dressed like Darth Vader is standing in a white bathroom looking at himself in the mirror. Small dog in street next to a skateboard. A small sofa and coffee table in an apartment living room. Two females are walking down the street wearing boots. Female tennis player in the motion of hitting a ball. Two street signs show an attraction and street name A bird perched on a log with a house in the background. A food cart with trays of food on the shelves Perspective-corrected photo of a large masonry building under a clear sky. Man cross country skiing with a yellow lab. People crossing the street and walking on the sidewalk in a city. The black and white dog is lying beside a stuffed bear. A guy standing in a living room holding a controller playing a video game. A cow laying down in the sand on a beach, with the water in the background. a bowl of food next to a keyborard three guys sitting down eating sandwiches and smiling A group of people on surfboards in the ocean. A gang of bikers driving down a city street. Man standing in front of a parking meter holding a folder. People wind surfing on the water near a suspension bridge. An old picture of a twin bed and radiator. 
A television, couches, table and a remote controller. A girl is flying a kite on a clear day. A room with a tile floor containing furniture. a staircase and people. a parked van with graffiti painted all over it a young person standing on a chair in a kitchen cooking doughnuts A small white cow and a big black cow walking in an empty field. A woman in a dress and Mary Janes bends down towards a Frisbee in a fenced in yard. Three people skiing together on a path carved into a hill A person wearing a red tie pointing to it with both hands. A person with six snapshots making a call and taking a beer A bathroom is decorated with white tiles and white towels. Two woman standing in front of a mirror near a sink. A man in yellow shirt and black shorts playing frisbee. a couple standing in front of a wishing well. a close up of two people walking close together A very tall chicken standing next to the ocean. a little boy playing a game on the television Passengers are standing in a line in front of the door of bus. A herd of cows graze in a field behind a wire fence. A couple of girls with tennis rackets in a room. A person in pajamas laying on a bed reading book. Batter at baseball game waiting to hit the ball. A picture of a cat that is looking out a window. A man and woman cutting a white sheet cake. A bench right next to some tall grass at the edge of a body of water. A man is in the ktichen and the living room is painted blue. the people are watching the animal drink water Four cell phone on a wooden table with their screens on. A kitchen with a red stove top under a framed picture. chopped onions sit on a cutting board next to a glass of wine A bicycle parked next to a lake on a cement floor. There is a sheet of stickers that go on a keyboard. Children learning to make their own kites. A black bear perched on the top of a fence. 
A woman in a robe is using a mobile device while holding a cigarette in front of a garage door Two twin sized bunk beds in a room A young woman in the water wears a life vest holds a water ski. The intersection at Durham Court with forest in the background Pots being displayed at some sort of exhibit. A couple of red traffic lights next to a forty sign. Five men are around a table with food on it. A red train passing by bushes and a road. Some people with cowboy hats riding horses on a trail. A batter swinging at a pitch at a baseball game with a runner on first base. Two cows stand in a pasture eating grass. A stuffed animal is smiling while sitting on a bed. A small white cat sitting on a ledge. A group of people in boats on a river. A surfer on a surfboard riding a wave. Cat covers it's face while sleeping by the window Only one slice left of a fruit pie. The clock is on a brown stand with a wall behind it A man on skis sitting near the mountains A man riding a snowboard down a snow covered slope. An old couple is sitting down on a bench together. A skateboarder is mid-air doing a trick on their board. Clothes hanging on a rope over an unfinished patio. A plate with chicken, broccoli and mushrooms with a bit of gravy. A group of people pose for a photo at an event. A baseball player taking a swing at a ball Two kids in bunk beds reading while laying down. A sign warning of snakes in the area stands on a pole. A bathroom decor is in shades of browns. A biker has his young daughter on the bike A couple sit together for lunch on a street bench The four engine airliner sits on the tarmac on a cloudy day. A sailboat in the water with the docks in the background. A photo of bananas, mangoes, and oranges in a pile. An airplane is shown taking off into the sky. A tow truck carrying a bulldozer on a trailer. A half eaten doughnut sitting on the side of a road next to a truck. 
A lone zebra standing next to a tree in front of a fence Five wine glasses sitting on paper on a table. An amazing lunch spread with a beautiful salad, peaches, tomatoes, and sandwhiches a group of people are traveling down a paved road A group of people with umbrellas stand in the road. Child watching kite as kite is flying in the air A man riding a paddle board down a river next to a lush green forest. A school bus waiting at a traffic light. Bushel baskets full of vegetables at a market as shoppers walk by. A polar bear plays in its habitat next to a yellow traffic cone. A set of traffic lights over a busy road with cars. Two guys passing each other on a tennis court holding rackets. A person walks on a platform next to a passenger train. A pizza with lots of mushrooms is seen here. A person is laying in bed reading a book Two birds standing side by side on a branch Two men in baseball uniforms stand on the dirt. A wireless keyboard and mouse are on the table. There is a close up view of a giraffe. there is a very high mcdonalds sign on this street A group of people are standing around holding video game controllers. An apple, watermelon and bananas are setting on the table. a ship sitting out on the ocean not moving A large metal tray of rice and some vegetables. two men and a woman stand by a fence and pet a elephant There are three birds by the grass by the water. A woman sitting at a desk pretending to converse with a teddy bear. Two men skiing downhill next to each other. Several snowboards with people on them located in the snow. A Stop sign and other street sign on a road Bedroom with a bed, dresser, and small picture hanging on the wall. A person in action on a field with some people watching. A loft bed with various stuff being stored underneath it. A grey vintage truck on street next to a house. A large orange truck parked next to a woman. Two boxes that have a dragon on the lid are filled with food. A crowded street and sidewalk on a city street. 
Woman in a living room with large screen TV and cloth-draped furniture. A dog resting his head on the side of the boat looking out at the water. A large cruise ship is traveling on the ocean. A girl lying on a bed looking at the camera A man is holding a piece of food with chocolate in it A skateboarder dressed in pink and black at night. A man is snowboarding off of a hill in front of a crowd. Four sheep watching a dog peek through their fence. A show shining station with a pair of boots on it. A man standing at a train station near a pile of luggage A group of people sitting down at a table to have a meal. A city street with lots of blurry traffic on top of it. A group of people standing next to each other. A group of stuffed teddy bears sitting on top of a counter. A group of birds that are standing in the sand. A large Cathedral like church with a clock tower and people at the gate. a close up of an electric blender on a counter Digital painting of a tabby cat and large dog touching noses. Two cows behind a fence on a farm Two brown and white horses in an enclosure. a cookie being held up by a woman A group of people standing around a man with a cop in front of him . A bathroom stall with a small trash can and a chair. A Kinnaird street sign and Stop sign with the word Art in yellow painted on it and houses in the background. A bed in a room that has a window open. A group of people waling across a cement covered round. A bed in side of a room with a small white mattress. a black and white photo of a boy and girl walking a horse Passengers waiting patiently for their flight at the airport terminal A man and a cat sit on a sofa. a sheep is walking around near a tree A bunch of giraffe hanging out together as a pack in the outdoors. A bathroom showing toilet, sink, and shower THERE ARE A LOT OF PEOPLE WALKING AROUND WITH KITES A woman brushing the teeth of a toddler. A salad with lots of different greens covered in sauce. 
A skate board rider flying off a ramp in a skate park A young man playing on a skateboard at a play ground a living room with red walls a chair and a television A picture of a fire hydrant next to a plant. A white refrigerator with the door open with a small amount of food in it. A beagle is sitting in a chair with arm propped up the way a human would sit. a person riding a skate board on a city street A bathroom with four urinals and a drain on the floor. An old motorcycle rests near a rundown building. A young man is doing a trick on a skate ramp. Sign with the number "eighty" set against bright blue sky. A young boy playing whiffle ball in the grass The contents of an open suitcase scattered on a table. A large display sign outside of a ski resort. Crowd of people in a field flying kites. Green street signs sitting on the side of the road. A woman lies in bed reading a book, and petting a cat. Two large elephants walking across a shallow body of water. A vintage photo of a city bank branch. The cat is laying down while someone rubs it's head A smoking jet going straight up in the sky. A baseball player getting ready to hit the field. The table is littered with a number of typical office items. A Dominos Pizza with pineapples on the pizza on the table A young man holding a piece of food in his hands. a man with a tennis racket in his hand A white microwave sitting on the ground outside A herd of animals grazes in a field while a zebra nurses its foal. A group of two people waiting to cross the street under an umbrella. A man holding a pizza on top of a pizza pan. The man and woman are talking in the kitchen. a man in a tie and a suit is indifferent A doughnut that has several bites taken out of it. A bus parking lot area with several buses parked and one multi level bus driving. A man is swinging a tennis racket at a tennis ball. A beautiful dinner of authentic pizza with fresh bread, a plate of mozzarella and tomatoes and a lovely red wine. 
A cute little girl sitting on a bench alone. Two adult elephants interacting near a stand of trees. Little kid leans against the gate in front of train Blurry silhouettes of people and a horse against an evening sky. a polar bear pokes his head and one paw out of the water a young person wearing a shirt and tie an image of a man eating a slice of pizza Two rows of teddy bears of various colors and sizes. A man and woman wearing tiara while sitting at a table. This toilet sits in a stall in a public bathroom a skate boarder performing a trick while others look on A cat laying on top of a wooden computer desk. Two street signs atop a stop sign under a clear sky. A bathroom that has a couple of toilets, but no stall door for them. A man wearing glasses skiing during the day in the snow. A simple bathroom features standard toilet and tan sink with dark wood cabinet. Four zebras, two warthogs and a giraffe in an open field A tv sits enclosed in brick outside on the street kids watching a smiling woman milk a cow Boys play soccer in sand in front of a crowd. A kitchen includes a refrigerator, counter, and sink. ON ONE SIDE OF THE PARK BENCH IS TREE DOGS SNOOZING A bird statue sitting on a bench in a library near bookshelves. A POLICE OFFICER IS SITTIGN DOWN TALKING A suit case filled with a magazine and a pair of shoes. a man running on a tennis court with a rackett in his hand A MAN IS SWIMMING IN THE OCEAN WATER The three men are walking down the road together. a small red train is parked at the station Two people walking and holding umbrellas over their heads. A man riding a skateboard prepares to roll down a ramp. THERE IS A WOMAN SITTIGN AT THE TABLE WITH HER LAP TOP A kitchen that has white cabinets and drawers. A square in the city occupied by people. Sunlight bounces off the green wall in the den. a passenger train sitting by a platform and a fence A bathroom sink with all the usual toiletries on it and a hand towel hanging by it. 
A bathroom with a colorful rug, white towels, and a picture on the wall. A group of chefs standing in a kitchen preparing food. Some people sitting in the grass leaning against some wooden rest. A man hols a surfboard as he walks a beach alone. A person that is wearing headphones and glasses. A lot of ties are being hanged on the rack. An older large green and yellow trash truck driving down a busy street. A pack of elephants are walking through the terrain. A herd of cattle in a field covered with snow A propeller plane that is flying in the sky. A man standing next to a motorcycle on a street. blue car wrecked against bus trying to before them A white cat holding a wooden baseball bat. Colorful toys in front of a cell phone rested on its side. A hot dog on a bun with mustard Two giraffes standing by a tree with a forest in the background. A red double decker bus is parked on the street. A red, blue and silver motorcycle parked on the street. A man in a tie getting up from a meeting desk. A view of a toilet from the adjoining room. On a beach, there is a clock in the middle of the sand. A surfer is riding a medium sized wave. A pizza has red and green peppers embedded in the cheese. A zebra at a zoo stands alone looking at the ground. A little girl in a red dress with a red flower in her hair standing at a sink. A wood deck table has a glass of ice tea and a plate with BLT on a sesame sub roll and green salad on it. A plane is flying through a cloudy sky a herd of zebras walk in a caged area A man fixing a street sign on a raised up ladder. A person water skiing falls in a lake. A large group of doughnuts sitting on the table. A skier stands outside in the snow on their skis. A tabby cat sitting under the back of an old blue car Several toilets some without lids are sitting on the ground outside. An adult and young zebra standing in a field of green grass. 
The little girl whose name is Violet, is fast asleep in her bed A white and black cat sniffing a banana on couch. A man is holding a banana in front of his face. a green field that has a man with a kite Two dogs running and playing in the sun. A large metallic refrigerator freezer combination in a kitchen. Brown dog sleeping on a bed in a bedroom. A sitting room with three chairs a settee a sofa and a fire place. A person on a motor cycle in the street with blurry buildings behind them. Closeup of a corner of a metal tray containing three hotdogs. The large grey sofas have throw pillows on them. The picture shows a snow skier skiing down the hill. The large cow is wearing a blue tag around it's neck. Group of people standing outside a farm holding vegetables. A man riding a bike past another man without a shirt. A man in black jacket riding on a motorcycle. A zebra wagging its tail as it eats some grass on the ground. Man wearing glasses brushing his teeth in bathroom. Two brown horses tied up at a post. A child on a surfboard floating in the ocean. The glow from the lights are super blurry. A tall man eating and drinking next to a lady A surfer is riding a wave in the ocean. A plate with chicken,carrots and mashed potatoes with silverware. Two men cooking food outside with jars of food behind them. A child tries to catch a frisbee in a park on green grass. A sandwich is cut into triangles and served with a salad on the side. A man who is going down a hill on snow skis. A time-lapse photo of a guy doing a skateboarding trick, jumping over a curb. A baby elephant standing under an adult elephant. Lady posing with two horses standing on a street. The fire engine is ready for any emergency. A group of people riding on the backs of elephants in a river. A man on a snowboard in the snow. A red stop sign with two green street signs posted above it. He needs to rethink his choice of shoes for riding a motorcycle. A horse is standing by a wire fence. 
A bottle of water sits on a table next to fruit. A man playing tennis as people sit and watch from the stands. A selection of wooden kitchen tools on a counter. A stop sign on an empty, foggy street. A black cat relaxing in a cat bed on the floor Some animals are walking on the street and next to the car. This is an image of the inside of a home with lots of pictures on the walls. Two stop lights mounted on the same pole A man and woman stand with bikes in front of a field. A kitchen has a refrigerator and ice chest. The view of the headlights, handlebars and mirror of a motorcycle A man is riding waves with his surfboard. A group of young students eat lunch in the classroom. a very large teddy bear that is sitting on a chair An old man sitting next to a graffiti covered wall while holding a music keyboard. a pizza covered with assorted peppers on it A bus at a bus stop sports a bicycle rack. Two small black bears walking through a grassy area. A man rides his motorcycle through the water on the beach. A fire extended hose for fire hydrant in rural area Two children playing with a toy in a park. Two photos side by side of fruit in a basket, vegetables and basil. A woman rushes with a handbag through an empty train station with a large clock. A man walking with a skateboard towards a concrete ramp. A large tub is in a beige tiled room that has two windows and one window is white while the other is brown. A kitchen with a magnet-covered refrigerator and a pile of junk nearby. a clock attached to a tree in front of some buildings An empty room with a light is currently on. A girl wearing a pink cap riding her bicycle. A large white polar bear walking near a building A person doing a trick on a snowboard off a hill. A man flying a rainbow kite in a clear blue sky A lone shorebird standing on the beach as a wave rolls in. A man on a cell phone taking a picture of himself. Four jets in the sky at an air show. A man in a suit stands at the podium and speaks. 
A man walks on a snowy trail in skis. A herd of deer and a single zebra in a field. A dark brown giraffe leaning over the short fence of an enclosure a old black and white photo of a construction truck A black streamlined train pulling into the station. Four motorcycles are parked by the side of the road. The man holds the umbrella for the woman as they walk through the wilderness. A train drives passed a station as another pulls up to the platform. a couple of different pizzas on a counter top A man on a bike in the reflection of a car mirror A man on a tennis court is playing tennis in front of a crowd. Surfers bring their boards to the water on a crowded beach. A bus driving down the road near a church and traffic light. A PICTURE OF WAFFLES BACON EGGS, AND JUICE a batter, catcher, and umpire on a field during a game a close up of a person holding a call phone A fridge that is halfway open during the night. A clock sitting in the middle of the city, in front of a building. a couple of zebras stand next to some horses A group of people looking at an elephant. a couple of indian men riding down a road on elephants there are two airplanes that look old hanging and one looks spaceship like A group of people cross country skiing in forest. A picture of someones meal being served on a plate. A herd of elephants walking along a lush green field. Man holding a surfboard by the beach in his hands. A young man catches a wave on a surfboard. A wooden table with an empty pizza box and napkin. A man in a green hoodie preparing to snowboard. A woman wearing medieval clothing with a cell phone attached to her belt. two women and a man holding a big white surfboard A man that is sitting down holding a sandwich. A group of friends posing for a picture together next to a pizza. An orange truck parked next to a pink truck in a forest. A guy on a white and orange surfboard catching a wave. A red plate topped with a cut in half pizza with an egg on it. 
A giraffe inside an enclosure with families watching in the background. A woman is dressed as a man and a man is dressed as a woman. there is a man that is taking a picture of another man Three zebras running along a path in a field. Pair of zebra standing in open area of grass and trees. a small child sitting on a women's lap at a dinner table. A living room features a white couch and black loveseat. a monorail going down the track as a bus parks by the side of a road A white fire place sitting below a giant clock. three cooked dishes positioned on a wooden platter a plane flying by a red sky during the sunset Smart phone sitting in a red case being hand held by someone. A lady in a red shirt shows a man how to use a video game controller. A fancy clock face is flanked by two angel statues. A cookie is sitting on a plate next to a cup of coffee. A woman sitting down holding onto a fork. A man skiing alone in a snow-capped bush A woman wearing skiis while riding a conveyor belt outside in the snow. A GIRAFFE STANDING SURROUNDED BY TREES LOOKING TOWARDS CAMERA. a lone zebra stands just before a small body of water and looks down A couple of sheep standing on top of a lush green field. A sign on the side of a building for the business of Tomasino's Cellar Ristorante. A garbage truck travels under a stop light. A shower with a curtain stands next to a toilet with the lid open. A cat is rolled on its side while napping. This is an image of a patrol boat in the ocean. a man on a surf board rides on a big wave An anime action figure doll on a computer A person in a baseball uniform holding a baseball bat. A group of men standing next to each other. The sign in front of a French bar which indicates the location of the bar. A person is riding waves on a canal. A man in a green shirt is wearing a Christmas tie. Two plates with sandwiches on them next to a bowl of vegetables. A white horse leaned over eating something in a corral. 
many difference stuffed animals on a shelf on a wall Baseball player standing near home plate in stadium. A stop sign over a pedestrian crossing sign. Two horses that are standing in the water. Two eldery people are wnjoying the view of a lake in this park Two people in the living area of an RV. A very large bathroom has a two toilets and two sinks and a very large glass bath tub sitting next to a glass shower. A photo of a man standing with a ram. The young men are playing a baseball game. A giraffe with its head cocked walking about a sandy area. Two giraffes are standing near each other in a field. some people are sitting in front of desks A man skateboards in a parking lot while his buddies watch. A table set with plates and a cat. The fragment of the burned plane rests on the ground. A large bird in the air over a heavily forested area. A man is surfing on his board in the ocean. A family watches television in a small living room. a man holds a glowing item while in the dark This person is preparing a meal in the kitchen. Broccoli, carrots and a small amount of potatoes on a plate. A mix of beef and broccoli stew on a white plate. A flat bread pizza topped with green peppers, onions, and tomatoes. A large clock is posted above a turquoise rail. Room with a bed and a chandelier and double doors. A red fire hydrant sitting in the grass near water. A stuffed animal sitting in a Christmas tree. An unmade bed and a turned on lamp. An old man wearing a hat with a snake around it and a cellphone clipped to it. Two children with tennis rackets hold their hands up. A black dog standing on its legs and holding Frisbee in its mouth A man standing in front of a microphone. Woman sitting on floor next to commode with glass bottle on floor. A heavyset adult is outdoors and is wearing sunglasses. A baseball player preparing to hit the ball thrown by the pitcher. Items of fruit and flowers on a wooden surface. A bed topped with two red pillows and a head board. 
The cat is standing on top of the microwave that is on top of the refrigerator. A woman puts her head in an oven. Sheep gather in a grassy field in front of a lighthouse. A dog sitting on a couch under a blanket. Several species of animals grazing in grassy area. A young person wearing a jacket travels swiftly on a skateboard. A dog in a bathroom tears up a roll of toilet paper. The women was playing tennis on the court. A guy sitting at a desk with a nice monitor by a window. Two dogs are sitting a neatly made colorful bed. a plane on the air flying very high An old-fashioned safe and roll top desk in a green room A surfer is riding on a wave in the sunshine. Toothpaste,toothbrush,mouth rinse,tongue cleaner and other mouth cleaning things are kept. People talking in a kitchen with a mixer on top of refrigerator. Animals eating at the side of the road near mountains. A living room with a couch, television, and a colorful rug. A herd of elephants walking down a dirt road. A stop sign in the desert near an empty road A girls' soccer team poses with their coach for a team photo. A blue plate topped with bread and a salad. A hipster couple is giddy at a wine tasting. A cabin in snow with people around it. Many people are outside celebrating on a sunny day. Two cats that are sitting in the bathtub. A book with a train on the cover near a keyboard. a person is sitting on a park bench outside Shelves in a dorm room, with knickknacks such as a photograph, a lamp, and a lucky cat figurine. A sculpture made up of several traffic lights. The small kitchen has a black counter and wooden cabinets. Some type of wooden shower in a bathroom. A man stands by as a girl feeds an elephant A laptop sitting on a small black desk. the fully furnished basement looks clean and orderly A doll with large eyes and blonde hair holds a teddy bear. A kitchen that has a tea pot on the stove. A small house with a large tower and a walkway leading up to it's door. 
TWO GIRAFFES GRAZING IN THE TREES DURING THE DAY nine blueberry muffins in a muffing tin landscape of water with mountains on the horizon and a cloud filled sky A baby laying on its belly in front of a laptop. a goose is standing by a body of water there are three people sitting at a table holding up pizzas A family gathered around an outdoor table with drinks and menus. A dog catching a frisbee midair as his trainer prepares to toss another. A young person on skis lies in the snow A pair of scissors on top of a piece of paper on top of a rock. Jet plane flying high in sky on partly cloudy day. A person flying a kite near a basketball hoop A pink and white laptop and three computer monitors on a desk. a white bowl and a blue strainer and some bottles A small hotel bathroom has been well stocked A city bus parked by the side of the street. The parking meter is empty by the building. A man flying through the air on a skateboard. A child in a colorful airplane tie standing against a wall. A pitcher on the pitching mound in a "after pitching" position. A young girl blowing out candles on a cake. A person standing on a surfboard in the water. A man swinging a tennis racquet on a court. A young man wearing a dress shirt and a tie. A woman carrying a surfboard on top of a snow covered ground. A train pulls up to a platform with a line. Assorted flavored donuts being grabbed by multiple hands. A man in a safety suit walking along the edge of a dog where a cruise ship is docked. two benches placed on a snow covered land Some people at a table with some nice desserts. Young boy with stuffed toys lying on bed. An airplane is in the shallow blue water. Guy and his small dog out in a motor boat amongst bigger boats The moon overlooking the boats in the harbor. a dinner plate with steak, vegetables, and a baked potato There is an old fashioned blue refrigerator and ice chest in a kitchen. 
a desk some books a speaker and a video game system A hotel room with a bed, desk and chair. Some people with rackets on a tennis court. A small animal, maybe a baby sheep, is outside. A bunch of fresh produce sitting on a paper towel. Decorated living area with desk and cabinets with television. Skateboarder grinds along planter in an outdoor plaza. A group of snowboarders glide on the snow as a large snowy mountain stands in the background. A woman walking her bike on a busy sidewalk. A red traffic light sits on the street. A man is riding an elephant that appears to be playing basketball. A large grassy field filled with grazing cows A bus with three people getting out of it. A bird that is perched on some vines. A young male baseball player is about to swing for the ball. a man in a uniform standing on a pitchers mound The plate is full of pizza with chicken and vegetables on it. A group of teddy bears with princess crowns on. A young man and women in a very short skirt and heels. Four men standing next to a small airplane. A plate topped with a donut next to a cup of coffee. A refrigerator that still has its sale tags on it. A couple of men riding on the back of an elephant. An elephant standing alone in a wooded area A red stop sign near two large buildings. Multi-colored patterned pillows on top of a white in an empty bedroom. A baseball player is getting ready to hit a ball. A stop sign below a lamp post at night. A male officer and another man looks at laptops There is a full view of an outdoor area and it is nice. A man on a tennis court with a racket in his hand. A tall multi story building painted with colorful designs. A plate with a very big and tasty looking sandwich. A moving truck filled with furniture parked on the side of a road. A plate with a sandwich on it and several pieces of silverware on the table. A kid standing in the batters box, preparing to bat. 
a bedroom with a circle purple bed with a view of a tv A very odd shaped but pretty style clock. Three elephants standing on a stool with woman sitting on their necks. Man and woman standing under a red umbrella. A stop sign obscured by the brightness of the sun. A young zebra sucking its mother in the wild The stop light has various blue directional signs, a tennis player wearing a red shirt is playing tennis A man standing on the side of a court holding a microphone. an image of a man going on the ocean waves A statute built into the side of a building. Pink flowers sitting in a flower pot full of water. A furnished doll house with stairs to a second floor. a small bird stares out of a window looking at the outside A man in a reflective vest walks toward a parked airplane. A cage filled with candles sitting on a table next to a vase and another candle. A yellow banana sitting on top of a table. A young zebra is between two larger zebras. A beagle pads away from the camera across a reflective surface. Man walking on a sidewalk that is sloping downhill approaching the corner. Desktop computer setup with ergonomic keyboard and headphones. A down hill skier racing down the slopes in a blue ski suit. A little boy playing, eating and shopping while in a shopping cart. a man holding a white umbrella in a wooded area. A young man stands on a skateboard on a sidewalk. A couple of giraffe sitting on top of a lush green field. A street sign and some cars next to a building. a lady taking a picture of a red bus Several remote controls lines up next to each other. a woman reading a book with another woman standing right behind her with an umbrella A hot dog wrapped in tin foil covered in ketchup relish. Close up back and back of head of a cat in dark with two rectangles of light on ground in front. A pile of vintage suit cases in the middle of a building. A sandwich and a pickle with a bowl of food on a plate. 
Young man with crew cut and dark denim shirt taking selfie in bathroom mirror. A person with some skis posing in the snow. A room with furniture, wood accents, and a fireplace a blue tank of compressed gas near a house A man sitting on a train next to a woman. a girl with a game controller with a boy standing next o her A motorcycle is parked in front of two people. A person on a snowboard rides down the snow. A woman crossing in front of a double decker bus. A pot full of vegetables is sitting on a table. A living room filled with blue and white checkered couches. A woman sitting on the floor with a teddy bear Two men cooking and packaging food in a kitchen. A man walks next to a couple of horses loaded with supplies. A cup of coffee is sitting next to a laptop A few kids playing in the yard with a frisbee A large passenger jet with it's landing gear down. A dog wearing a collar standing next to the water. THERE IS A MAN THAT IS JUMPING A RAMP WTH HIS SKATE BOARD The view of an elephant's head through a display window. A dog leads the way for two crosscountry skiers Two hot dogs in wrappers on a table. Here is an Asian standing by a yellow fire hydrant. Two zebra standing next to each other in front of a cart full of dry hay. The woman serves the tennis ball as a child watches. A skateboard enthusiast doing a jump on a skateboard on concrete near a small tan brick building with tinted windows. The four skiers chose to wear bright colors, standing out from the snow covered white mountain. A city bus thats turning a corner with another at the intersection. A couple of elephants walking down a dirt road. A variety of fruits and vegetable on a plate. The man is having to work outside in the rain. An elephant walking draped with a colorful blanket. Small silver cellphone sitting on top of a wooden table. 
an ocean a white fence and a black thing on some rocks Young boys and their coach playing baseball in the sun A shop window with people outside on the street reflected on the suface. a toothbrush holder is sitting on top of a bathroom sink A blue and white KLM Asia plane being serviced at an airport. An old ad is showing a retro kitchen. Spacious kitchen with a center island and stainless steel appliances. a bath room wit ha sink and a bath tub A man with glasses and in a suit talking in front of a microphone. A zebra brazing on green grass next to a pile of rocks. Sheep and a woman in a field in front of a cityscape. A raw cut of meat still on the bone being seasoned. A man wearing glasses standing next to an airplane. FOLDED ROBE TIED UP LIKE A PRESENT IN A HOTEL ROOM a vintage photo of a man washing a lamb A horse running by itself through a flat area of land. A man wearing a stripe shirt and a yellow neck tie. A woman taking a picture in a garden by a polka dot umbrella. A giraffe that is standing near rocks while an ostrich stands behind it. An open refrigerator with various fruits and condiments in it. Assortment of baked pastry items displayed in case. A woman standing at a table filled with red lobsters. Long old train barreling through the mountainous countryside. A closeup action shot of a person surfing. A hand reaching out towards a standing giraffe a table with some dishes with food on it a tv near a closet and a book shelf A group of teddy bears in glass cases. Several people that are drinking beer together and talking. A large group of cows on a field. Green wooden shelves holding blackened bunches of bananas. a number of small boats near a body of water A bathroom sink next to a white toilet under a mirror a black and white photo of a person with a cell phone Carrots are being cut into pieces with a large knife. A woman laying in bed with a powder puff girl pillow. 
a small child dressed in adult clothing by a stair case A baseball player taking a swing at a ball Two men in suits shake hands outside of an airplane while others look on. three young cows in a fenced pasture with a short black dog following them A windmill placed near several cows in a grassy field. a shirtless man is skateboarding in a pool a group of kids playing frisbee chasing it A man riding a sled down a snow covered hillside. Gray cat laying with head on laptop on top of couch. An active computer monitor that is sitting on a desk. A bathroom with a toilet, sink, tub and shower curtain. A person standing outside on the beach looking at a Frisbee. A man riding a blue two seat motorcycle wearing a helmet. Two sheep are standing in a field next to a wall. A kite surfer rides the waves of the ocean. A black and white cat laying on top of a keyboard. The man in a business suit has a bag on his shoulder. A group of men with volleyball's in pink uniforms. A female jockey riding a horse spectators in the background. A baseball field showing the catcher, umpire and a person up batting. some food is laying out on some dishes Two cows that are standing in the grass. a large air plane flying in a sky A red plastic basket with two hot dogs on it. A bicycle leaning against a pole outside of a coffee shop. The unmade bed has three pillows on it. A man with a hand bag standing in a room. A computer, keyboard and framed photo on a wooden desk. A beer mug that contains water and flowers. A cow standing next to a brick building. A variety of food is displayed on a table. A baseball player extends his swing to hit a pitch. A closeup shot of the insides of a squash. A person holds an apple slice with peanut butter on it. The people has there umberellas up for the rain two ladies in a kitchen preparing some food Two plates of food with vegetables and bread. A man preparing to ski off a steep slope. A variety of food items are displayed in dishes. 
A yellow street sign warns of a hump in the road. A kitchen and dining room table and chairs sitting next to a living room with a chair and couch in it. A group of boats in a body water on a clear sky day. A bunch of craft supplies and a pair of glasses. A group of people that are standing with umbrellas. A pizza with toppings and a missing slice. tow pieces of a desert on a plate on a table a couple of people stand on some dry leaves a couple of people that are laying on a couch A silver commuter train at a train station next to luggage carts. A guy smiling while standing under a run for rights banner. Three white flowers in a vase with flower images on it. a field that has a bunch of cars in it A baseball player getting ready to catch a ball with his glove A cat sits between a window and a large birdcage. People cross the street in a busy downtown city area The Time clock is in the center of town. Two paper plates sitting on top of a table covered in pizza. a living room with several chairs and a small table Two women are sitting on a bench reading a magazine next to a bike rack. A group of men sitting around a living room in front of a tv An Air France passenger jet is parked on a tarmac. a person sitting at a table with a laptop A lone giraffe standing next to a river. A happy little boy with a banana in front of his face. a man sitting in a chair with a cup in his hand A street scene with a horse and carriage and buildings in the background. A baseball player swinging his bat in front of a crowd. Some boats in the water outside of some industrial buildings. A woman walks down the street alone late at night. A man standing near a van advertising a movie. He does have control of the motorcycle while pulling a wheelie. A clock in front of a window on a winter day. A man jumping off of a red skateboard. A pair of youths pause for a photo on a ski slope. A bunch of plates that are laying on a plate. The Big Ben clock tower towering over the city of London. 
Blue-and-white jet airplane sitting at an airport runway. A large white sink sitting under a bathroom mirror. A cooked pizza that has been placed on a table. An old photo of a man on a motorcycle and cars in the background. A teddy bear sits next to a mossy tree behind some green leaves. A young woman in a bikini surfs a small wave. Two men smiling in a grainy photo while holding a banana. a black orange and yellow train on its track and some trees a couple of bears are standing in a field A white and green fire hydrant sitting next to a light. A baseball player prepares to swing as a pitcher throws the ball. Trains parked on rail road tracks next to a tractor. An image with multiple photos combined in it. A banana with a frownie face drawn on it is by a computer. A toddler sliding down a snowy slope on skis. Two people are by a railing feeding a giraffe. A train inside a building going down the train track a woman is standing outside talking on a phone a group of people standing around in the park A crowd of people standing below the Eiffel tower. A trio of little kids in front of a birthday cake a cat laying in some blankets on top of a bed a man sits on the ground with a guitar Woman holding a small baby in front of her computer. Some art work with a man with a hat on and some fruit in a bowl. A triple decker sandwich is cut into quarters. A piece of pizza sits on a white plate that has gold accents. Two children interact with a television video game, while a third person looks away. two beds are shown as the light creeps in. A group of baseball players standing on top of a green field. A couple of boats floating on top of a river. A group of people riding skis across snow covered ground. A white toilet sitting up against a brick wall. A ceremony for military men from US and China Two women sit together as one of them dries her hair. A wooden bench leaning against a blue wooden wall. 
a plate holding a big pizza in the middle of the table three buses are parked at the buss station An elephant statue standing on top of a lush green park. a giraffe rinsk soem wate rin a nice pond A man in a harness holding a waterboard. A couple of men lying on some couches with covers on. A man jumping for joy in a field of kites a fork thrust into what looks like a pan filled with potato chips Some cows and horses are outside grazing together. A fully stocked bathroom with a vanity mirror. A man balances on one end of a skateboard. Group of people with wine glasses standing near table. a couple of kids stand with a toy A girl places a white teddy bear in a container An elephant tied up in a city park. A circus elephant using it's trunk to hold another elephant's tail. a man riding a wave with a surfboard A father holding his little child upside down. A woman and two men posing for a picture. A baby elephant walking with two adult elephants. A cat sticking its head out of a piece of luggage on the floor. A table topped with a bird and plates of food. A woman in yellow shirt and skirt with cats in grass. many giraffes standing together as a group eat from a basket A woman walking her cattle down the road. An older man is flying a kite with a small child. a man with a hat standing on a snow board A guy is returning a tennis ball that was hit to him. A person dressed in black doing skateboard stunts on a skateboard ramp. A white and red helicopter above a grassy field. Spectators enjoying a tennis game at the US Open. People shifting the concrete being poured in the forms. There is a display of trophies on the table. A cute cat sitting on top of a couch cushion. A man is kneeling in front of a large elephant. Two giraffes stand in the grass by trees. A bus that is on the side of the road. A man stands in the living room and plays Wii. A laptop with a green apple taped to its back. Two laptop computers sitting on top of a desk. 
This three people pose for a goofy photo a room that has some furniture and a table in it The two cows are fenced in the field. A tiny banana with a woman peeling another in the background. Two bulls who are walking on a street. The great wilderness with a white lonely horse grazing. A professional baseball player takes a swing in front of fans in a crowded stadium. two people in a body of water with a wake board THERE IS A VAN THAT IS DRIVING DOWN THE STREET A cay laying on top of a blue couch arm next to a wall. A clock is displaying the time on a tower. Two bees on an apple hanging from a tree. A young lady sitting on a couch in front of a laptop computer. A glass shower door in a small bathroom. One giraffe standing and another giraffe sitting in the grass. The bathroom of this house is spotless. Children dressed in snow suits standing in a crowded resort. A red and blue small train is on the tracks. a lady happy she got her tooth brush out of the holder a man on a surf board riding a small wave Trio of elephants walking past a large log A man sanding next to an orange frisbee. A man riding a skateboard on top of a road. A plate of food in a dim restaurant, ready to eat. A black and white cat sits on a red cloth that is over a television set. a man is jumping in the air with a disk A toddler wearing a ski outfit and a pair of skis in the snow. The tennis player in the pink sport dress is holding a tennis racket and ball. Two monitors with art from Akon albums on them. Tea, a tea cup, a teddy bear, and a tea brewer sit on a countertop. A couple of people in the snow on skis. A picture of different types of herbs and vegetables available from the CSA. Four people on a sailboat one is on the phone and three are sunbathing. A little boy rolls in a wheelchair pulling a suitcase. A bike attached to a car bumper with people with luggage in the background. A group of people stand outside, exchanging items. 
a bird standing on a plate of partially eaten food Baseball game with batter and referee on field with crowd A zebra stands with its head down in its enclosure. A group of people on bicycles in middle of street next to trees. an airplane with people standing under the wing A table topped with lots of fruit and vegetables. A row of table and chairs along side a street. A plane taking off in the air, on a clear day. An umbrella on its top laying on the ground in the sun A group of boats are enjoying riding on the sea. A woman is wearing a jacket and a tie. A bed in a corner of a room next to two window's. There is a horse standing by some grass. A baseball player getting ready to swing at the next pitch. A herd of giraffe walk through the tall grass on the plains. A fighter jet flying over two parked vehicles. A group of elephants moving in the middle of a river. The man is drinking a glass of wine in his kitchen. A pair of elephants standing in their natural habitat. A dog that is laying down on a table. Snowboarders walking through the snow carrying their boards Many people in business attire are sitting around tables. A boy stands among a row of red mopeds. A woman with a nose piercing is holding and looking at her cell phone. A group of motorcycles on a street next to grassy area. A photo in an airport showing a backpack and a cell phone. A man in a tie and backpack is drinking a beer. Three zebras are standing near a gate in a wall. Black and white photo of an old car on its side. A airplane parked out on a runway by itself. A man riding around on a scooter with luggage on his lap. A woman brushing a girls hair on a couch. A old photo of how things were a long time ago. A close shot of a cat staring at the camera. A woman is sitting in a garden tub while brushing her teeth for a window view. a person leaning on a bank holding a remote in his hand A boy is standing out by the water A person that is in the snow doing a trick. 
Two boys sitting,younger one is trying to read something. A one propeller airplane is in an airplane hanger. A man in a car wearing glasses and a shirt and tie. two people playing with a frisbee on a foot ball field A living area with a christmas tree in it A zebra standing around in the middle of a field. Cat lying on top of a shelf with its front leg hanging down. A pile of paper towels is on the floor next to a toilet. A man that is on a pair of ski's in the snow. A man cutting up scallions at an outdoor table A bird as it flies lonely through the sky A dark skinned child getting ready to be pushed on a swing. A hotdog is placed on a table next to some french fries. A blue double decker bus that says Garage on it. An red fire hydrant beside a grey fence. The little girl is sitting in front of the computer. Four chairs sit around a dining table with papers and shoes on it. A couple of cows with wreath decorations on their heads. A blue train stopped outside of a train station. A brown horse grazing on grass in a field. Two men and two women, all wearing flowers, are posing for a picture in formal wear. A view of a bathroom sink and porcelain tub. A Starbucks teddy bear sitting in a Starbucks. A cat laying its head against a teddy bear. A big dog is resting halfway out of the window. A baseball player throwing a baseball bat from home plate. A kitchen with white cabinets, black counter tops and a white breakfast bar. a little table covered with paperwork, books and a laptop A woman holding a yellow umbrella standing near window. A cheese pizza sitting on a white tray on a table. A group of people with wine glasses stand together. A salad with broccoli, cheese and radishes is in a bowl. a number of people sitting at a table with a cake Man in a black plaid shirt eating food while standing up. A pink flower sticks out of a narrow white vase. A couple of men working on a boat that's docked at a pier. A group of children are wearing school uniforms. 
A red and white air plane is parked on the run way. Grey dog laying down in black and white sheets. A kid in a baseball uniform holding a baseball bat. A commercial airplane is flying low to the ground. a woman holding a wil controller with a steering wheel A busy street with many people standing around and lights on. A baseball game is in action as the catcher leans for the ball. A herd of sheep grazing in an open pasture. a bowl with an apple and some bananas and some books Two men in suits and ties shaking hands. A group of people sitting at a table with stacks of books a bunch of vegetables and fruits sit on a chopping board A disembodied hand holds up a cellphone to take a picture of something on stage. Young boy on blue skateboard in parking lot. Three cows that are standing in the grass. A flip phone open to a test message Working man sharpening scissors with electric circular sharpener. A couple equipped with umbrella hats taking a break from walking their dog on a bridge on a rainy day. THERE IS A STUFF ANIMAL WITH ONE PURPLE CLOSE WINKED EYE A man standing on a tennis court holding a tennis racquet. a hand is holding a single banana to eat Pretty blue flowers sit in a vase in the sunshine. A man's handicap restroom located in an establishment. Various buisness signs and an ornate lamp post in the city. A cow that is laying down on the street. A guy on a snow board does tricks in the snow A plate that has various types of donuts on it. Surfer riding a large white top wave on the ocean. Two people standing next to a statue that is an invisible man. A person walking across a snow covered ski slope. A baby sitting at a high chair in front of a table filled with food. Three hungry boys pose with a loaf of bread. A group of people walking down a street next to buildings. Three white castle hamburgers sitting in a white castle food bag. a couple of people standing on a beach next to surfboards A large inflatable soccer ball with spikes floats up from a field. 
A male getting ready to throw a pitch at a baseball game. The yard is full of stuff such as a truck and a tug boat. A wall that has a large number of clocks on it. A man turns to smile for a photo while talking on the phone A close up image of a type of salad. two kittens sitting on a woman in a chair A woman is flying a kite in a city park. several sheep watching two sheep standing by a drinking tub. A young girl holds up a pink umbrella. A man in a baseball uniform standing with a bat. A bench next to a tree in a park. a bathroom with a corner toilet and a sink A boy with a cast is kneeling by a skateboard. A cat sits on a desk, on top of papers and in front of a computer. Woman playing in a tennis match in a tennis court. the fire hydrant has on the side of the road An "on-deck" batter watching the baseball game from the on-deck circle A man standing next to a brown piece of luggage on a floor. a railroad bridge with an old train crossing it a group of animals graze on some grass a number of baseball players on a field A selfie of a woman taken looking into a car mirror. A heron is standing on the edge of a body of water. A parking meter sits by a brick wall. a man getting ready to grab a frisbee as others watch A cat sitting on top of a car outside during the day. A little girl and goat standing in the rain while the girl holds an umbrella The boy wearing green is playing tennis on a green court. a fire hydrant in the middle of a large paved area Some people walking on the top of a snow covered hill. Two children struggle over a bat in their playroom. There is a man with glasses that is letting a spider crawl on his arm A dirty wok on top of a stove beside a dirty tea kettle. Two people that are laughing and holding a kite. A horse is trotting past a man on that walks behind him in the pasture. A man wearing a brown hat and a uniform shirt is holding a cockatoo upside down. A desk that has a drink in the middle of it. 
A man is skiing down hill using both ski poles and the snow looks powdery. A broken tv next to a brick on the street a girl is standing on her bathroom sink A wooden cutting board with a knife, plate and several different vegetables. The huge truck is carrying a construction tractor on it's bed. A couple of people sitting on a wooden bench. Toiled in a dirty bathroom with a concrete sink and tiled walls. Houses of parliament on the edge of the River Thames. A motorcycle is parked on the side of the road. An empty city bus travels down a city street. A desk that has a laptop computer on it. some people a bus and cars a street lights and buildings A man is skiing down the hill next to a sign a person is riding a motorcycle by a grassy hill People at an outdoor table eating pizza while surrounded by a crowd. A pelican strolls in the shallow water at the shore. A dog is sleeping on the step by a blue door. A man sitting on a bench next to a dog. An elephant stands in a grassy area with words written on his body. A bicycle parked next to a wooded area, with a large brown bird perched on the bike seat. an old photo of three people holding skis on a snow background An old dirty toilet and a sink in a bathroom A zebra standing on top of a lush green field. A ski instructor teaching a class of children. A herd of sheep standing below very tall buildings. A baseball player who is sliding into a base. A little girl standing on the grassy area of a beach. a person sitting on a bench near other benches A purple, red, and orange commercial airplane on a runway. A chicken or tuna club sandwich made with homemade bread. A sign on the side of a snowy road stating avalanche zone. A black and white coin meter on the side of a road. A motorcycle with a suitcase tied to the back of it A cat in a bed hiding under the cover. Three men wearing red standing on top of a ski slope. A sink and a dining table in a kitchen. 
a plate with a bunch of meat and vegetables on it A triptych depicting skateboarders who are mid air. A couple of horses standing next to each other. A VERY TALL GIRAFFE AND A COUPLE OF PEOPLE NEAR IT Closed toilet, sink, and mirror in a modern bathroom. An assortment of pens and pencils is spread before a keyboard. A white sign that reads no turns hanging from a traffic light. The small bathroom has brown tile on the shower walls and floor. a person riding a skate board at a skate park Bus backing up and being loaded onto a truck outside a conveyor belt holding some donuts after being deep fried A man is waiting for the wave he wants to ride to the shore A biker standing next to a motorcycle. near a garage. A group of elephants walking down a street with people on them. A man in a wetsuit surfing on a clear day Female tennis player touching the US Open logo banner. A young man swinging a racquet at a tennis ball. We see a girl playing a game on her Wii console. One boat on the beach with the water in the back round. The kitchen is full of various gourmet ingredients ingredients. Two young children sit in bed and play on computers. One man looks at the camera while another looks away two men in a kitchen making stuffed potatoes Three sheep next to each other at a farm There are several pumpkins being used as decorations. A long nosed train on the tracks near a station. a tennis player with a racket on a court A road side with graffiti sprayed on it to alter its message. A slow children street sign cutout is propped up next to a fire hydrant on the side of a road. A park bench next to fence and trees by grassy field. Sheep grazing in a wise open green field with clouds above A man is standing on a carriage pulled by four ponies. The tot is making a face to indicate a distatse for certain vegetables. A desk with two computers on it. Person in gray hooded jacket attempting t cross busy street. A person holding a glass of champagne in their hand. 
A man on a laptop on a coach in his living room. A black back pack on the side of a dirt road. There is a bacon, lettuce and tomato sandwich. A woman holding a cell phone to her ear. A few sheep eating and grazing in someone's yard. That cake as fresh strawberries on the top of it. pizza a knife and fork a bottle of wine and a glass A group of young skiers pose in a line on a snowy slope. A man in a wet suit crouches down as he rides a wave on his surfboard. A plane flying by a runway on a slightly cloudy day. A large long train going down a track. Humans holds dog back in a swimming pool A man riding a bike next to a bus on a street. a man holding onto a rail in the middle of an empty parking lot An empty wooden bench sits near a neatly trimmed lawn. A young boy on skateboard riding on a ramp. A man doing a jump on a skateboard Bottles of Pellegrino are stacked on refrigerated shelves. A person riding a white board surrounded by a group of people in the ocean. a close up of street signs with buildings in the background A person standing on the beach flying a kite. Pair of electronic parking meters in front of a red truck. A bathroom done all in tile that is clean. A jeep that is sitting in a field with a large fire and smoke in the background. A bus that has bags of luggage on the side of it. A Volvo bus parked on a road near a hotel. Pedestrians, a rider on a scooter and several bicyclists cross an intersection at a crosswalk. Adult women standing at open refrigerator filled with beverages. a balding man in glasses holding an umbrella and wearing a jacket with a very high collar around his face A giraffe standing next to a fence near people. A bunch of people walking down a street with open umbrellas. A girl on a boogie boards catches a wave in the ocean An old unlighted sign hangs overhead advertising "Open Kitchen Restaurant" A man is flying a cat while a cat watches A little kid skiing down a hill holding ski poles. 
A person's feet standing and balancing on a skateboard. A large tree situated next to a large body of water. Large group of stop signs in the same area. this is a group of parasails in the sky a cow stares as it stands in a muddy area A Japan Airlines passenger jet climbs skyward with its wheels still down after takeoff. A woman standing in a kitchen cutting up vegetables. A bunch of people waiting on the train platform for the train A pile of luggage, helmet, clothes and mirror. The people are walking to water to surf the waves. A horse drawn cart is driving down the road. Skateboarders at a park skating in an empty pool. Many cattle are trying to find food on the desert ground. A street is void of cars at night. A tall building surrounded by a crowd of people A train wreck near a river draws a crowd. A very large room filled with a bunch of diners. A kid that is swinging a baseball bat at a batting cage. a woman standing around a bunch of clocks a table with some plates of food and some glasses and cups A black and white cat with curious look sitting on a desk. This is three cows eating hay from their stables. Soccer paying kicking the ball while others look on. Some luggage against the wall of a hallway a person in skies is standing in the snow An asian woman with black hair and a green headband posing with a tennis racket in front of a man with white hair and a cigarette. Jet airplane in flight landing gear extended down A boy doing a skate-board trick on a ramp. There is a food truck set up under a bridge. A young child looks at a group of zebras. A traffic signal at an intersection on a city street. A man uses his cell phone to take a picture of himself. A baseball player hits the ball as the crowd watches An empty park bench in the middle of a tree covered park. A kitchen shelf holds an assortment of pots, pans, and utensils. Young man spinning green frisbee on finger along shoreline A white toilet bowl with an electronic brown seat. 
A group of Asian chefs stand by bowls of food. Some animals are outside in the dirt in the daytime. A bathroom with bidet, toilet, tub, and a checkerboard floor. Two people who are standing on a beach. a bathroom with a strange looking toilet in it The train car is stopped and it is empty. The big ben clock tower standing tall in the foreground Scissors, a marker, and two other items on a table. A guy playing the drums with a very intense look on his face. A dog attache by his leash to the side car of a motorcycle parked in a parking lot. Young girl on surfboard riding small wave in ocean. A row of parking meters in front of a stop sign. Two zebras stand in a field with tall grass. A view of the outside world through a train's window. A horse and a dog on a grass field. A piece of cake is on a white plate. A lone sheep surveys a fern and wild flower covered hillside. The clock is sitting atop the antique building. Two old fashioned black and white buses are parked next to each other. a beach with a lawn chair and umbrella positioned on A man leans down playing a game of tennis Young guy playing tennis on a clay court. A woman is sitting outside on her phone Man jumping about to serve a tennis ball. A band wearing costumes standing around talking. Kids on a bike while a man is drive a horse drawn buggy. A police motorcycle is parked next to a police car. A large two sided clock by a building a couple of sheep are standing in a clearing cook prepares dish by putting it into the oven A skier carves his way down the snowy hill. A herd of adult and baby black sheep in a fenced field. A television that is showing a news program on it. two children playing with a frisbee in a drive way A view of a giant bridge during the day. A man riding on the back of a brown horse through a lush green field. A parking meter with an hour and thirteen minutes left to go. A clock tower stands over a city landscape. A small frame building with a large sign. 
a group of pizza standing around a table eating pizza A man going down a slope near a ski lift on his snowboard. A man at a baseball game is holding his bat on the ground with is head on top it. A gang of bikers riding down a street. That baseball player looks like he may have done something good. A couple cutting their wedding cake at their reception. A picture of a boat marina full of sail boats. A MAN IS ON A MOTOR BIKE SMILING THUMBS UP a person riding skis on a snowy surface People surfing on a white water river. a couple of trays that have some food in it A large tv seems too small for an enormous surrounding cabinet. A pan of food is in the middle of a table. A photo of a surfboard with a man in the background A young boy throws a pitch at a baseball game. A pizza in an iron pan on top of a table. Black and white photograph of women walking towards an umbrella. A family sitting down for a meal and conversation. People on snow skis are by a wooden building. A small boy playing tennis while holding a racquet in his hand. The sun is setting near the clock tower that reads 945. Police person riding a blue and yellow check motor cycle. A giraffe is sleeping on bare dirt next to a dead log. The airplane is about ready to land at the airport. A large kitchen with a large center island. Many sheep grazing on grass in a field. A crowd of people lined up in front of a food truck. Piles of unripe bananas sitting next to each other sitting on a floor. The child is sleeping in the bed with his stuffed toy. A couple of men and a woman sitting next to each other at a table. Four different plates that have food on a table. An old phone shows a horse and wagon on a wide street and children are in the forefront. A black and white cow is standing on the grass. A white toilet sitting next to a white bathroom sink. A skateboarder riding down a ramp in black clothing A woman wearing a mask holding a racquet A person on a court with a tennis racket. 
A boy riding on a skateboard in the street. a room with a bunch of teddy bears in it A surfer rides a wave on his board in the ocean. A train driving past a lush tree filled forest. a fork with a plate with carrot cake a couple of kids are sitting on horses An umbrella is attached to a bicycle frame with leather straps. A group of people riding bicycles down a street. A white tusked elephant at his compound at the zoo. The clock tower appears very tall at this angle. A desert dish has powdered sugar on it. Two men that are playing a game of baseball together. Inside view of terminal building with large sunlit window and a clock. A sandwich and a cup of drink on a table. A large jetliner sitting on top of an airport runway. m mm m m m mmm mm m m mm mmm m m m A stainless steel toilet with the sat up A lot of motorcycles that are in a window. A pink house with a bunch of bananas outside A large black cat sits by the front door. a station wagon covered in foot prints and stuffed animals a red and white sign and some parking spaces A wine server holds up a wine bottle display for a man to look at. A woman with ear protection on swings a bat. A wooded table filled with apples, oranges, pomegranate, and cherry tomatoes. A man wearing glasses and a green tie this is a man riding a board down a rail A woman and man hold a kite with two children nearby. A woman is holding up bananas at a market. A red stop sign sitting above an orange not dumping sign. Back view of a female tennis player wearing orange shorts. The backside of a travel bus on the side of the road. A young boy is surfing in the ocean. An empty living room has a cluttered coffee table. a train passing on the railroad in a grassy hill A close shot of a mini fridge. A woman in a black and purple dress poses in front of some tall grass. A pile of fruit sitting on top of a wooden table. A black and whit photograph of a boy tying a tie. 
pans filled with assorted veggies, fruit and rice The old style airplane is flying on a cloudy day. A man reading a book in the park a white bus is on a city street A large brown dog laying under an open umbrella. a lady riding a horse holding the other black horse A stop sign in front of a large home A group of elephants walking in a green and rocky area with many trees surrounding them. A large white dog panting while laying down. A man in white shorts stands near a large television screen with a remote. Small multicolored airplane sitting on a landing strip. An orange train rides through the rural countryside. The electronic contents of a bag are placed on a bed. A brown horse wearing a bit standing next to a wooden fence. A waterway with many people on some small boats. A man helping a boy on a paddle board in the water. A man cutting a cake celebrating his 50th birthday. The desert is on the table ready to be eaten. a close up of a child and a dog An elderly man sitting on one of three park benches which are positioned side by side. a bunch of signs together on a line. A blue and silver train is pulled up to a platform. Two zebras are pictured but there is an elephant and other animals in the background. a person holding an uncooked doughnut near other ones A person holding a cell phone next to many others. there is a man taking a picture with his cigarette in his mouth A town square with several tall clock towers. food on a plate that matches the countertop A man riding on top of a surfboard on top of a wave. A man playing tennis in the middle of a serve. two catcher and a pitcher stand on the pitchers mound A baseball player in a blue and white uniform holding a baseball bat. people standing around a table filled with some plates of food A high clock tower is brown and has roman numerals. A woman is standing on a tennis court holding a tennis racket. A sidewalk with various signage and many cars in the street. A Polar airliner is parked on the tarmac. 
many types of vegetables in the vegetable section of a market The angle view of tower with a clock. A group of people riding skis on snow covered ground. They look like they are beginning a ski race in the snow. A yellow firehydrant on the sidewalk near a building A giant desert covered in chocolate sauce next to cups of coffee. A bear is snuggling with a bear cub. A beautiful woman laying in bed reading a book. Traffic has stopped to allow four zebras to cross a highway. A man holding a tennis racket in his hands while on the tennis court three skate boarders and one is doing a jump a tea pot is steaming on the stove top A white plate of food on a table. A herd of wild horses grazing on a green grass covered field. A person manipulating the skateboard with his feet. A person sitting in an old refrigerator on the sidewalk, drinking beer. The two signs give directions to upcoming cars. A bathroom with a toilet and bathtub and handheld shower. A man who is performing a trick on a skateboard. A nightlight is on over a kitchen sink. Will the elderly women finish the wii game? A plate of vegetables, chicken, and white rice. Watercraft in a row, floating on the calm ocean. Several stuffed animals and teddy bears laying on a bed. A white plate filled with slice oranges next to a pile of bananas. A zebra standing, its face down, grazing on dried grass. An elephant strides through brown grass and trees. A kitchen with hard wood flooring and a stove top oven. A man surfing down a flowing river rapid. A banana filled with melting chocolate on a grill A little girl sitting at a wooden table in front of two bowls of food. A woman at a desk with a computer monitor and CD case. a small girl with sunglasses is hitting a tennis ball A black cat sitting in a bathroom sink. this is a yellow train riding the rails The sheep is wearing a bell with a blue cord around its neck. A group of five zebras stand in a field. 
Elephant playing inside mud, with fences surrounding her a lady on the beach flying a kite A dog wearing sunglasses sitting in the front seat A person on snow skis in the snow. a number of people standing in a kitchen near one another two apples and one banana lying in the shape of smile on a wooden table. A group of people ride a double decker bus and hold black umbrellas. A bathroom with a vanity sink, mirror, toilet, and bathtub. Two pizzas and three cups of drinks sit atop a table. there are several woman wearing bikinis and waiting for cake A desk with several computers and laptops on top. A full wine glass means the bottle has less in it for later. A man laying on a couch holding a gaming controller. some writting on a wall by a window A woman with a bear in a photo with a sign. A red buses on a wet paved road by vendors undercover walkway, and one vender on curb with an umbrella over table. Two giraffes are standing next to each other. The farmers are working the land with their animals. A person is chasing sheep through a field. A dessert with few carrots on a plate near two candles. A group of bread slices with cheeses on them in a pan. Many people walking in the streets holding umbrellas. A bright patio umbrella stands out against the plain white building. an image of street signs being crossed in air A man is putting a pizza in a oven A small dog being carried in a backpack. A beautiful marina with many boats docked in it. A plate of food and drink on a table. A ram standing still in an empty pasture. A person is hanging in the air near a building. a public males restroom with two urinals that are based on the floor A cat sitting in an orange chair in a bedroom Guys riding motorcycles through the path in the park A red stop sign with a no left turn sign. A man flying into the air as he catches a frisbee. 
A photograph of the inside of a public men's restroom This boy is practising in a play ground A baby zebra getting a drink from mama during the day A man enjoys a quick ride down a ski slope. All way stop sign at the intersection of Prairie Street. two men are riding in a train in hats A man holding an orange frisbee on top of a green field. A man in tie standing in front of a table. Two white bears on the rocky shore of some water. People are walking along the beach and people are skiing on the water with parachutes. A bearded man riding a skateboard on pavement. Two women on snow skis on a hill A large patio area with many table and chair sets covered by large umbrellas. People watch as a couple of people are skateboarding on ramps. A group of people riding a boat on top of water. A view of some snowy mountains from an airplane. Two women stand at a store in front of a cooler containing various alcoholic beverages A little boy that has a spoon with food on it. A black stuffed cat with fangs is hanging on a rack with others. Two red trains at a train station, with forest in the background. A person is sitting in a chair on a sidewalk while a bus drives by. closeup of a white horse with someone riding it some food is sitting on a green and white wrapper A bull and a dog charge across the field. a ball player running toward home base by a bat A slice of pizza that is sitting on a table. A tiled floor bathroom with a red and black shower curtain. A display case of different types of doughnuts in it. A close up image of a bag of Broccoli florets. Two brown sheep huddle near the back of a large plastic cage. kids playing Frisbee in a park on a bright day A person riding a bicycle on a street near a building. many bananas hanging above some people in a shop A man rides a horse while driving longhorns down the street. A person flies a large and colorful kite at the beach. A black and white picture of a large house is shown. 
a guy painted yellow with blue overalls holding a banana a row of boats are lined up in the water The couple scoots around town on the motorbike. A white sink sitting next to a bath tub. A chair is made out of stuffed pandas attached to each other in a clump. A bunch of people riding in an odd looking vehicle. Tower clock designed with two western shooters for entertainment display A couple of women holding game controllers in their hands. A laptop on a plaid black and white blanket. A woman sitting on a rock holding an umbrella. A man riding a skateboard on top of pavement. A person with a tennis racket on a court. A bear peaking over a log in front of a rock wall. Variety of fruits being placed into a blender. a clock on a walk with a bike parked near by Cows on display on top of codling with people far down below A cat is sitting on a bathroom counter. A person leaving a trail of snow as he glides on his skies. A horse stands near a fence during winter time. A half eaten dessert and half empty cup. A young man standing on a beach holding a bat. A memorial bench with a can of liquid sitting on top. A clock tower with a clock against an overcast sky. A group of people getting onto a bus. A group of people standing in front of a Inn. A group of zebras stand in a field. A parking meter on an empty street at night. Young boys with alien and spider facepaint tattoos Two men in a showroom for snow skis. a bed with a red and white bedspread and pillows a group of children and adults gathered together on a snow covered bank Six well-dressed men drinking beer and eating pizza. This is a sink and mirror of a hotel bathroom. A room filled with lots of clocks on it's walls. Two dead birds covered in wires sitting inside a outdoor plant. A man on a horse near a dog and two cows. A young girl wearing a baseball cap eating a hot dog. 
Bike left outside next to the bench in front of the river there are many people in this living room playing a video game A woman standing next to a fire hydrant wearing a backpack. A woman passing a bear mask on a market tent. A task force of drug dogs monitors an airport corridor a couple of women sit on the ground next to each other three geese on some grass by a pond A boy batting during a little league baseball game. a woman is working with something over a book A man and woman that are standing next to rocks. The woman wears a hat and has flowers in the basket. a girl on a board riding along a boat in the water A guy is posing for the camera with a medal around his neck. A view from above two men working in a kitchen cutting fish. Oven light on in a kitchen with wooden countertops. A stop sign on a snowy day in the daytime. A television sits on a dresser by a window. A young man is taking practice swings on the field. Close up of a plate of broccoli and stems. A stop sign centers an upside down street image. A baseball game is in action as a batter swings. A woman is holding a sawed off bat while wearing lingerie. a small bird on a tree brand near fruits and leaves historical fighter plane on display in an air hanger A person eating lunch and using a computer in a cafe. A picture of a subway shuttle bus traveling down a city street. A very cute small child holding a big umbrella. an image of two planes that have just landed there are two stuffed bears sitting on a toy horse a bathroom toilet with a carpteted seat cover and floor rug. Three double-decker buses are parked in a lot. A street sign is near a lamppost and trees. A laptop that is sitting on a desk. A truck hauling a large load to a job site on a winding mountain road. A series of steep stairs lay next to a lake An elephant at a water hole spraying water into his mouth. A woman with something in her hand in a decorated picture. Two tool boxes sitting next to each other on a table. 
A white and orange colored cat laying on a bed with its eyes open. A person jumping in the air on a snowboard. Two people ride horse beside dogs near a meadow. A hot dog sitting on top of a bun covered in toppings. A young man has his foot placed on a pole while another looks on. The two men are riding their horses on the road. A cat is laying on cozy white sheets. THIS IS A CLOSE UP PICTURE OF A STUFFED BEAR AND MONKEY A picture of a person throwing a frisbee. A man in tuxedo posing for a photo. Closeup of a baseball glove and a black ball hat. a truck sits next to a big plane A man standing in front of a parking meter about to put money in it. Two young children laying in bed next to each other drinking from bottles. A herd of zebras where one of them is biting another. A dog sits with a frisbee at its feet. A wooden computer desk with a computer sitting on top of it. A red stop sign sitting next to a wooden electrical pole. A man is putting a pizza into an outdoor pizza oven. A cat that is sticking its head in a green bowl. A clock on top of a post shows the time The face of a cat that is sitting in a sink. Two kitties playing with toilet paper next to the toilet. A man who fell asleep with phone on face A bathroom with a sink and a bathtub Locomotive parked under a brick bridge in a secluded spot. A bunch of bananas sitting on top of a wooden table. A man in a swinging position holding a tennis raquet while on a court. A person that is laying on a bed. large semi truck with steel front end parked in grass A young boy smiling on his skis in the snow A woman in white dress playing a game of tennis. Woman in blue outfit taking a swing during tennis match. Two girls kicking a soccerball on a soccer field. A young woman holding a baby with a teddy bear on her lap. A man lying in a field flying a kite. Man in baseball uniform playing shortstop waiting between pitches. A man with a knife in his belt and a beer in his arm enjoys a sandwich. 
An older woman with white hair and glasses, seated at a dining room table and another person in the kitchen area. A table is set with a full dinner. A small kitten fits inside of a gray sneaker. Group of horses in race near canvas fence. A man pushing a girl on a swing. A green plate of food that includes rice, broccoli and meat. Three horses grazing in a pasture in front of a house. A toilet sitting outside a building in an alley. a bento box filled with different types of food Kitchen photo with window over counter and a bowl in the middle. a large vase with a big colorful boquet sitting on a table A bathroom with full vanity and wall mirror. A group of motorcycles parked in front of a white church. Groups of skiers near a ski trail in the snow. A white sink that has a necklace, a rubber ducky, toothpaste and some beauty items laying around them. A person skiing on a snow covered mountian A man is enjoying surfing in the water. A clock on an ornate metal pole in front of a shop. A train traveling over a river on a bridge. The dog has a frisbee in his mouth in the snow. A cat and a dog sit in a colorful bathroom. A bathroom area with a tub, shelves and a sink. A dining table and a lamp are beside a fireplace. A plate full of couscous with mixed vegetables A young woman with tattoos using her cell phone.. Two beer trucks are parked beside one another and unloading. A person is looking at a pair of scissors. A colorful illustration of an old train and stormy skies. a big airplane taxiing on a wet runway A man riding a bike next to another person on a bike. A couch and rocking chair are in the small living room. A couple of people sitting on a beach watching an assortment of para sail chutes. The open faced sandwich contains a meat in casing. A man wearing winter gear snowboards while several people snowboard behind him A plate contains a meal of meat, potatoes, eggs and fruit. A white couch in a living room filled with Christmas decoration. 
A silver suitcase on a wood floor with a pair of black and white shoes next to it. an oddly tied tie on a pink shirt A young child learning how to ski down a slope. many elephants are walking on a trail and some trees a cat is sitting on a couch in a room a big white bed with a dresser and lights new to it. A sign is displayed on a traffic light. Fans sit in camping chairs along a fence to watch a children's baseball game. A group of people are eating near a wooden bench surrounded by trees. Man installing an OS while giving the "devil horn". Some people playing a wii video game in front of crowd A man cuts a bowl of greens with scissors. A group poses in ski gear in front of Olympic rings. A baby in an adult's arms is gnawing on a toothbrush. two people sit parked next to each other on motorcycles A car sits on the side of a road with letters written on it. Woman sits on beach with laptop pondering what to write. A person is giving a piece of crust to a dog. The group of cows stand in a river drinking the water. A skaterboarder is doing tricks at a skate park. A person surfing on a surf board on some waves An assembly line with doughnuts moving through an automated fryer on it. A baseball player hitting a ball with a bat. A dark street with signs and buildings on the side. a older male with his mouth open wearing dusct tape. A blurry photo of people watching a bunch of horses. A white plate with a hot dog in a biscuit next to fried potatoes. A fruit cocktail with banana, oranges, and various other fruits A blue toilet is sitting in a blue bathroom. a bunch of colorful items on a black plate An old man with eyeglasses stands next to a giant screen a toilet a shower a tub a sink cabinets and a mirror A man on a snowboard rides on the snow. two people in a green field playing with a frisbee. The person is skiing at the bottom of a steep slope. A mountain view with a plume of smoke in the background A bathroom has a sink, toilet and an orange bucket in it. 
A dog jumping to catch a thrown Frisbee. A group of giraffes stands around near a watering hole A man in a suit and sunglasses drinking from a paper cup. A smiling woman eats outdoors with a group of people. A very cute little bird on a green leaf. A guy is cutting something out of a piece of paper. Guy plows the field behind two strong horses Many containers of food are on the table. A woman with an umbrella walking her dog who also has a smaller umbrella. A lighted mirror in what appears to be a bathroom. fifteen different varieties of doughnuts in a display case A mouse swimming and another climbing out of a river in a wooded area. A man standing next to a truck near a forest hillside. A person sits on a bench with the skyline in the distance. a giraffe is crossing the road in front of a car A group of people standing near a bus The man stands on the beach prepared to enter the water with the green sail. A basket full of white biscuits on a table. A bed with five pillows under a hanging print. a man on a giant bicycle rides by a tall pole in front of an empty, large field in front of some mountains A train sitting next to a train station near other tracks. Keyboard with iPod shuffle in front on desk A giant chair with a horse statue on it A cement elephant on the other side of a fence. Box with picture of a hand holding a Nintendo wii remote. The cat is sitting on top of the black suitcase. A group of people is standing in a driveway. Four people sitting at a table with a large pizza and cans of soda. A picture of a families living room with nice furniture. A tennis player who just hit the ball to their opponent. Two newspaper stands with a fence behind them a brown white and black animal and two people on a motorcycle An older lay sharing a birthday cake with some little girls. A woman standing on the top of a snow covered slope wearing skis. A simple hotel bathroom with two sinks, free mini bottles of shampoo, and a hair dryer. 
there is a man riding a bike and waving A man and a woman sitting on a couch. A bear sits on the rocks by a pool of water in a wildlife exhibit. A woman holding a colorful umbrella with writing on it. A man on a surf board rides a rough wave. Two giraffes and a zebra with several trees. a silver and black motorcycle is lying in the dirt A tiled shower, molded plastic bathtub, shelf, mirror, wooden vanity, lamp, and sink make up a beige colored bathroom. A hazy sun over chairs and an umbrella on the beach. A lawn chair sitting on top of a beach covered by an umbrella. Several monkey figures hang on a bedroom wall. A pair of scissors, a tape measure, and a spool of thread sitting on a piece of folded fabric. A flat screen TV sitting in a living room next too a shelf. White parrot sitting on a ledge eating a seed pod. A street corner with a stop sign and it's wet from rain. A woman stands holding a white controller near some chairs. A black cat is nestled among indoor plants. A tennis player hitting a tennis ball in a professional game. A fat cat laying in a bathroom sink. A woman stands in a dimly lit kitchen at a gas range. A man in grey shirt jumping on a skateboard. There are parking meters alongside of the railroad. Old worn red truck parked in a driveway near a cactus. a lot of people standing in the middle of the road with red stop lights A professional tennis player holding a tennis racket at the US Open. People on a shoreline are flying kites on a clear day. A commuter train leaving the clean subway platform A giraffe is standing tall next to a tree. an old photo of a miniture pony pulling a cart A man serving a birthday cake to a woman A kitchen has a washing machine in it. A cat sits next to a laptop on a desk a white and green street sign and a traffic light A man reading a magazine and sitting on a toilet that is outside on the street. A large building with stained glass windows and a clock. A cat laying on its back with paws up in the air. 
A man rides his motorcycle through an alley way. A flock of doves and a man sitting in a park. This is an image of a cake with a bear surfing. A man with a gun standing in formal dress. Three people riding horses together down a trail. She is serving the tennis ball pretty high. A bridge over water near several buildings in a city. a man that is skateboarding on a ramp A herd of zebras standing in some algae covered water in front of a sandy plain there is a green vase with a plant inside of it A man holding a bowl with an open oven A man who is snowboarding down a hill. A black and white cat sitting in a chair. some veggies are in a small cardboard box In the dark two street signs are glowing. A surfer sits on a beach next to some surfboards. A large bus on a open city street. A bathroom that has some open windows in it. A man standing on a beach holding a surf board. Three geese that are standing by a pond. a guy on a bicycle and a guy flying a kite A photo of a person being taken in this picture. People in a large body of a water using surfboards. Many pots of marijuana plants growing in a greenhouse. a toilet sitting in a tile covered floor in a single room A large white bed in a red room Sheep perched atop knoll on green countryside with rocks. a large group of people walking on a city street A plate and vase on display in a room. A man laying on top of a white bed between two lamps. A woman reaching for a frisbee as another defends her. A man holding a Nintendo Will controller in a living room. A tow truck at a traffic stop with vehicles behind it. A horse drawn carriage riding across a snow covered field. An older later sits and drinks from a cup. A sink sitting in the middle of a bathroom. A table that has a silver tea pot in the middle and several plates around it with desserts on the plates. A truck that is driving down the street. A boy in a red jersey throwing a baseball. A man on a horse is herding animals down a trail. 
An elephant standing next to a lake on a beach. A griaffe walking on a road with two cars approaching. A clean passenger bus driving in a city. A small black cat laying on top of a couch. A cat is trying to squeeze through a door. The transit train stretches down the track under the power lines. A man is standing near some graves, water, and a bus. A man playing a game in an RV with a remote controller. A black and white dog with a frisbee lying on the grass. The tennis player is returning a strong serve. Two young children play in the grass with a kite. Two trains, side by side, waiting at the train station One cow attempting to mate with another cow in a pasture. a living room with a shelf a coffee table and couches Trophies and cell phones are on a table. a person on a beach with a frisbee Two soldiers taking pictures of a group of soldiers arranged for a photo. There are three men with fishing poles at the beach. A BUS STANDING AT A TRAFFIC SIGNAL IN A STREET. A zebra and an ostrich up close with other animals in the background. A lone horse in the middle of a grassy field. An elephant spraying water onto his body with his trunk swinging backward. A pretty cat with both front feet in someone's shoe Close up on meal food with three items side by side chicken with barbecue sauce, broccoli with shredded white cheese bits on top, and a bean and pasta or grain mix. Three images of a brown and white dog sitting beside a doughnut. A black and white kitten stands atop a laptop computer People are flying a kite in an open area. a person skiing by a start sign above them A woman giving a man a haircut in a barber shop. A wooden table topped with the contents of a woman's purse. A brown fluffy smiling teddy bear with big paws. A man and woman wearing skis on a ski slope. The slice of pizza has tater tots, green beans, and cheese on it. 
a hamper with compartments having a cup clothes and two bears A small table has many foods and drinks on it A Photo of a man on skis gliding on flat snow . A MAN IS ON THE BIKE WITH A USA FLAG a couple of sandwiches are on a white plate Two views of bright objects floating through the blue sky. Two men on a tennis court playing a game of tennis. a giraffe standing in the foreground with an ostrich behind a white plane is being prepared to board passengers some giraffes are in a green field and some trees a large in ground swimming pool near tents A large number of teddy bears are sitting at tables with fake food. Fighter jets flying together in close formation leaving vapor trails. Group of giraffes in high brown grass looking to feed. a person with a horse and a car in the background A man that is standing on a surfboard in the water. A flock of birds are flying in formation. A huge heard of sheep are all scattered together. A man and a dog on a motorcycle. A giraffe leans over while another walks away from it in an outdoor area. A goth man sitting on top of the floor near a store. A child throwing a ball towards a batter during a ballgame on a field. The person is walking on the sidewalk alone at night Two cats lay together on a messy bed. A woman in equestrian clothing on a horse Some people by a long row of motorcycles parked together. Large long tailed kite on string above rural town. a giraffe in the distance in front of a tree. A group of chefs prepare food in a restaurant kitchen. A skier rides their skis down a snow covered hill. A blue clock spire next to buildings and cars. Several wooden cages with white cloth tops and sides. An old man is sitting on a bench. A couple of sheep laying on top of a pile of dry grass. A white pick up truck driving down a road behind a line of elephants. A dog behind the steering wheel of a car. An L-shaped couch in a living room with a coffee table. 
A urinal seperated from a toilet in a bathroom A small elephant stands next to a tree. The white and blue boat is floating on top of the water A plate of two slices of pizza and a cup of juice. A herd of zebras graze on an open grassland. a room that has a bunch of chairs in it A female tennis player is about to make a serve. an image of a plate of meat and vegetables A jumbo jet plane running along a runway. A room with a couch, chairs, television and a table. A group of people outside at a park playing softball. A photo of two plans with water and birds surrounding it , one plane in the air one one the ground. A herd of giraffes grazing on a tall tree stalk A man holds a glass of wine on a patio by a vineyard. There are two zebras standing in the desert. Shimmering lights inside a living room with a dog on soap A baseball player at the plate just after swinging at the ball this is a little girl playing in the beach a bride and groom are cutting their wedding cake 2 giraffes one of them is doing the splits A group of girls on seats in a tour boat. A man wearing a helmet on a bicycle in a street that has a guard railing on the side of the walkway. There is a small toy elephant sitting on a wall A skateboard on the walkway in an old bus. A red headed skateboarder sips on his drink. A table that is filled with hot dogs, and a hamburger. A duck can be seen in the water with high rise buildings in the background. A young woman on a surfboard getting ready to ride the wives. a horse in a field of tall grass A man appears to be giving snowboarding instructions to a woman. man brushing his teeth in blue and white tiled room Three people in green and black snow suits with ski equipment on a ski slope. The modern church has a clock on it's steeple. A cook is holding a wooden cooking utensil in his hand. A living room with hard wood floors and a tv over a fireplace. A train stopped at the station to pick up passengers. 
a woman is riding on the back of a horse two zebra standing in pen and grazing side by side A picture of some food in a bowl together. A man sitting at a table with a cake in front of him. people getting on a public bus at night Tractor passing a statue of a dairy cow wearing a lei Woman with blue streaked hair sitting cross legged on bed. A stop sign with graffiti written on it. A professional horse back rider is getting ready to take a shot while the crowd looks on. A hotel room with a made up queen sized bed. A boulevard has been pictured by someone driving by The empty bench is sitting in the nighttime street. Airline employees by an aircraft parked at the gate Some animals are standing together in a pin Small piece of cake sitting on a plate with cherries on it. A woman sitting outside her house under a fruit tree. baby in a highchair with bib and cake A red trolley train riding along the tracks near trees. An array of lights on some sort of machine A man sitting inside of a car on the street. A man is being pulled on his skateboard by two dogs. A horse sticking his head out of a doorway. there are several clocks that look like they are hanging from the ceiling A kid is doing a trick on his skateboard. a man with a suit stands in front of a brightly lit drapes An orange cat and a black and white cat both laying on a bed. A man in skies is going up hill. A platter of food that includes eggs, hot dogs, and cheese. A living room with a couch, chairs, television and a child's high chair. A white bed sitting in a bedroom in front of a TV. Roses and other flowers are siting in the vase A person sitting on top of a rock over a river near a city. two public transit buses parked near one another A man is standing in a cluttered room. A young women sitting at a picnic table eating a meal. A crowded city street filled with traffic and bicycles. A bird with a large crest standing on a branch A giraffe lying on the ground in a zoo pin. A shower and a toilet in a bathroom. 
Three grey birds in a tree with blue backdrop. two cars parked on the sidewalk on the street A woman standing next to a modern style parking meter. A chili dog, onion rings and chili fries. This rider takes a brown horse across dirt A large dark colored spoon sitting on a rack The player who is up to bat next is getting ready for his turn. A man on a beach getting ready to throw a frisbee. Two old people sitting on a bench before a wooded lake A small dog wearing a sweater and holding a Frisbee in its mouth. there are many airplanes stop at the airport. A woman checks her phone while holding her hat sitting on a bench, with a bicycle in front of her and a hedge behind her. THERE IS A PIECE OF CAKE ON A PLATE The person fell off of the horse and into the water. A large truck can be seen in this picture near a bridge. A small old street sign hanging on a building. A man watching another man on a skateboard A man sitting on a large bench talking on a cell phone. Parking meters stand in front of parking spaces in an empty lot. A man on horse back and a truck watching a herd of sheep cross a road. A man in wetsuit on surfing on white surfboard. A dog and a cat underneath a desk. Tower clock made of rock set against a cloudy sky. Two children who are standing next to a white fire hydrant. A bathroom with a white counter top and white towels Two snowboarders are standing with one foot strapped into their boards and one foot out, at the top of a mountain. View of a partially shaded city street with autumn leaves. A laptop computer sits on top of a messy desk. A pair of sinks in the middle of a kitchen counter with a wooden countertop A group of bicycles that are sitting on the road. Several people are swimming and surfing in the ocean. a herd of giraffes standing around a bare field Two young men at sunset juggling a soccer ball on a beach A dish with meat and vegetables set on a bed of rice. A brown teddy bear sitting on top of a wooden bench. 
The sheep are grazing in the grassy field. A blue train on the tracks at a train station. A cat sitting inside a toilet bowl looking alert. A woman who is standing near a clock. a man with a tennis racket plays a game of tennis Two small black bears stand near a tree. A group of people holding Nintendo wii game controllers. The people ride the bike near the water. A piece of luggage with a rainbow strap and a ticket on it. A pizza is shown being cooked in an oven. A man holding a frisbee in the field with grass A street sign is posted to watch for senior citizens. A large intersection that doesnt have much traffic People in a park trying to fly a large purple kite that looks like a fish. many different cup cakes on a grill on a table A man taking a swing at a tennis ball Snow skiing at night presents unknown dangers without the lights. A bathroom with the curtains drawn down and the lights on. Two old-fashioned bicycles parked together on a beach. bananas and apples sitting next to each other on a counter A white and blue truck driving down a mountainous dirt road. Backpacks line a boardwalk to a beach surrounded by trees. A bird that is standing on a concrete ledge. A group of sheep sitting next to a stone wall. A large pizza with tomatoes, basil and cheese. An Asian man is sitting in a cubicle with a near a computer near other people working in cubicles. A white seagull standing on a white column by a pier. A yellow and metal train traveling down train tracks. A pair of giraffes grazing through a wire fence. A board full of chopped vegetables near a computer a giraffe standing in between some brush as a bird flies by it He is intent on another bite of the sandwich. A dim runway has an airplane on it. sheep cross the road next to a white barn in the rolling hills A man sitting in front of a computer with a bottle of beer. White and blue plate of two glazed donuts by two glasses of orange juice. Flowers are in a vase on a shelf. 
A group of young people brushing their teeth. A grey cat with yellow eyes looking innocently at the camera. There are giraffes that are standing g yogeter The Asian woman is trying to sell her food on a local beach. A zebra standing next to a van door. A man riding a skateboard across a crosswalk. a group of young people playing frisbee in a field A table topped with cut in half Twinkies on top of cupcakes. The woman is on the tennis court playing a game. A picturesque view of a small town during winter. A woman riding a wave on top of the ocean. A zebra walking next to another animal across a dirt road. Two horses and a man are on the beach. A yellow plant with green leaves in a glass vase. Headphones help her to hear her cell phone. A picture of someone typing on a laptop. A cat curled up asleep in front of a laptop computer. A very nice looking room with a big bed. A man holding a baseball and a catchers mitt. People looking in the shop windows with a bicycle parked against the window. A person kicking up on their skateboard at the top of a ramp. A large bunch of green bananas hands from a tree. Man wearing a bandanna trying to catch a frisbee A woman preparing to serve a tennis ball. a couple of people are standing in the shade a telephone pole with a sign stuck on it A man talking on a cell phone while walking down a street. A boy in red shirt swing a baseball bat. A man getting ready to hi a ball in a baseball game. A pie filled with white creme next to a yellow banana. A man outdoors jumping to catch a frisbee. A person on a snowboard catching some air over a hill. A zebra laying in the dirt looking away from the camera. A double decker bus parked next to a brick building. A cheesy pizza with red peppers is in a box. Elephant holding onto the tail of another elephant with its trunk. A dog and it's owner sitting in front of a desk. A picture of a surfer as he catches a wave. A young man sitting under a tree with red leaves. 
A woman sitting on the grass with a computer outside the Brown Library. A pizza on a pizza pan with two pieces removed by a serving ladle. A blue subway train pulls into the subway station. a woman whispers into a mans ear with a suit on A shirtless man is on top of a man on a couch All the contents of a video game console have been unpacked from a box. white sailboat docked with other white sailboats A man swinging a racket on a tennis court. Stuffed toy bears on display on shelf in large room. A red stop sign sitting above a no parking sign. A living room filled with furniture and a purple couch under a window. Airplanes sit parked on the runways of an airport. a person jumping a skate board in the air A passenger train speeding down a track in the countryside. A boy playing baseball is winding up for the pitch. A dog and cat sleeping together on a dog bed. A living room with different living room furniture. A player prepares to run to first after hitting the baseball. Three adult giraffe stand at a grove of trees. A person wearing red gloves grilling a pizza. the man is riding a skateboard down a ramp Several sailboats sit in the water in front of some trees. Looking up at a building with a large face clock near the top. A large jetliner flying over a row of runway lights. A clock on a tower as seen from a roof. A stuffed animal hanging from a post in a field. Two nicely dressed men standing together next to a flag. A refrigerator filled with food and drinks next to condiments. People are riding the waves on surfboards on their stomachs. A few deer laying down in the grass near a bunch of trees A dog is sitting on a covered couch with some light. A couple of dolls are standing on a table. Two children eat fresh vegetables from a skillet. a person holding a coffee cup with a watch on his wrist. a large air plane flying in he sky a person on a skateboard is doing a jump An elephant with tusks eating food behind a fence. A man riding the waves on a jet ski. 
A person on a horse that has a decorated hat on its head and covering it's ears, with another horse next to it that has a mask covering it's eyes. Large blue metallic public transportation bus with Aubaines written across the back. Large pizza with cheese, olives and tomato sauce a couple of people that are staring into a icebox A man standing in front of a restaurant with a skateboard in front of him on the ground. A silver Sport Tourer BMW motorcycle on a sidewalk. Seen through a wire fence, is a stadium area with watchers and many vacant chairs, a dugout with a railing and many men leaning on it, and a playing area with a lunging, uniformed batter with a catcher and an umpire behind him. a woman with a cell phone and another with a large bag A white table topped with two desktop monitors. A sneaker and a paw are seen on the grass. A train traveling through a grass covered park. A red fire hydrant sitting on a brick sidewalk. A very colorful mix of grilled vegetables looks delicious. a large cat laying across a table next to a monitor. An antique motorcycle restored to like new condition. Baseball player barely delivering hit to ball during game. a zebra bending over eating grass at the zoo A person is cooking something on the stove. a black vanity top sink toilet and mirror An old fashioned bench is sitting on the sidewalk. A herd of cows grazing on the grass. A man prepares to swing at the tennis ball A white toilet in a bathroom next to a trash can. A boy that is jumping in the air with a skateboard. A couple of people by a boat in the water. A stop sign on the side of the road. a pink and yellow sign is hanging above the street lights Tennis player holding his racket looking ahead of him. a woman with eye glasses sitting at a table covered with food A skateboarder is performing a round about handstand. A child is playing with a frisbee in the park. Blue plates are stacked on a wood countertop. An old time car is parked at the curb near a stop sign. 
A trailer truck hauling with a crane hauling logs. A furry dog playing with a green apple on the carpet. a boy sits sits on top of a horse in front of a jungle forest A couple of brown horses standing next to each other. A motorcycle parked next to a stairwell behind a plaque. A cat sitting in front of blinds in a window. A young man is skating around white cones. There is a woman playing a game of tennis. An old sign with trees in the background filled with fall colored leaves. A young boy in motion while holding a remote. A fighter jet is flying through the air. The young children are playing a game of baseball. The fat grey cat is wearing a red satin tie. The large boat has nets extended on both sides of it. A man swings a racket during a game. A baby is sitting on a wooden bench. A man, woman, and two children laying together on a bed. People watch a baseball game in a large stadium. A very tall building with a massive clock tower. A man flying through the air while riding a pair of skis. A boy putting his leg back to kick a soccer ball. A stuffed bear head and paw on a laptop computer. Several dark colored cats laying together on a piece of luggage and a duffle bag. A woman laughs as a man brushes his teeth in a public location. a man shaving in a bathroom while looking in the mirror A fruit salad with cantaloupe, kiwi, and bananas. A colorful train is waiting on the tracks at the station. A woman holding a tennis racquet on a field. A bowl with a plant, a large vase, and two cups on a table. a person that has a lighter in their hand Man leaned back with his mouth open, sleeping on a bench a bunch of knobs on a large metallic stove A zebra is eating while standing next to some hay. A group of people on a field playing with a frisbee. A girl laying on a bench reading a book. A hand holding a mug of green liquid next to a pile of fruit. A pair of photographs of a dessert with a vase of flowers. Two zebras are standing outside near a tree. 
some brown and black horses a table umbrellas and a person A child in striped shirt sitting on the top of a bench. A young baby that is brushing their teeth while sitting down. Group of signs on top of each other on a pole. A man and a woman outside next to an old truck. Passenger train crossing a bridge next to a grassy field. A clock tower made of stucco with an arched window. A women holding a fork while looking at a cake. A restaurant with no one in it has several square and round tables. a crowd of people standing around and sitting watching surfboard A man in grey shirt riding on a skateboard. Street traffic light that is on blinking yellow. A man laying in a bed with tubes attached to his check and mouth. Two bird flying low across a body of water. A baseball player posed to hit a ball. The man is putting his feet up on the desk. A cat sits in a glass window by a stuffed toy. A fireplace with a mounted flat screen tv above it A person's hand on the back of a black cat that is diving into a bathtub filled with debris. Two girls playing a game of tennis on a court. The front edge of a well used skateboard. Man juggling three balls at the same time. a woman is standing in a green field playing tennis a giraffe bending down to eat grass off the ground Two tennis players on the court and waiting to play. Two children smiling and eating small personal pizzas. Old refrigerator open in an abandoned wooden building A painted postcard of the clock tower and bandshell at the Daytona Beach, Florida. A couple on a bike are riding on the sidewalk alongside a bus. Bed in room of some home with windows. a really big elephant that a man is on A person in a ski jacket next to a train a red sign is hanging on a pole outside A orange cat sitting in a piece of green luggage. This kid who is Pinoy is skateboarding over an ollie jump The man poses for a picture while holding a snowboard. A beautiful young woman talking on a phone. 
Two ladies on a road with an umbrella A gathering of people playing a video game. A small office with a desk and book shelves Several planes are flying high in the air together. A large elephant walking next to a man A pair of young men stand in a field playing with a frisbee. A kitchen area features white appliances, counters and a white floor. Cars parked on a dirt road near airplanes. A cat is looking at the side of a laptop computer. A green wall in bathroom with white and chrome fixtures. there I a motor bike that is pakrd on the street and one with something on it A couple sitting on top of a bench under an umbrella. A flat screen tv on a wooden tv stand. Two young men playing a motion controlled video game A cat sitting on the edge of a table. A woman sitting at a table eating a donut. A table contains a large square cake decorated with a flower. A set of blue bleachers sitting in the middle of a dirt field. A pair of rusted scissors stuck in a stone sculpture. Man standing in front of a television holding up a Wii controller. A man playing a game of tennis and people in the crowd watching. Three blenders with colorful tops and bases, two of them matching, stand in a row. a room showing a microwave and a cooker also an oven a man wearing a hat while riding a surfboard A man brushing his teeth in front of a mirror. Two bears in an in closed area with trees and stumps. A man holding on to a parasail over the ocean. A white toilet sitting next to a white sink in a bathroom. A blue sea anemone living on a coral reef. a long haired white dog is eating some cake on a plate A batter practicing his swing in the batters cage. A person riding a skateboard down a metal railing. A snow skier standing at the top of a snowy slope. A chocolate cake with decorations and a knife A large white jet airliner flying over trees. Flock of sheep eating grass on a mountain. A surfboard resting on the sand of a tropical island beach. 
An overhead view of a group of people sitting at several tables. a sandwich with some fruit and a drink A parking meter on the side of the road. A pile of luggage sitting on the floor. Fruit stands with bananas, pineapples, oranges, and other fruit. The three giraffes are walking together on the grasslands. The hand is reaching out in hopes of catching the flying disc. a paper plate with some pizza on top of it female skier, skiing slowly thru cold white snow There is a computer monitor with a graphics program open sitting on a wooden desk. A giraffe laying on the ground in the grass. An dinner of pinto beans, broccoli, a roll, skim milk, an apple, and something unidentifiable. Spectators watch a professional tennis player serve the ball A meal on an airplane of cereal, milk, and fruit. This is a child holding a remote to a game console. A young skier in a red jacket goes down hill Three big horn sheep are in an enclosed pasture. A group of people standing around outside with their bicycles. A toilet with jelly fish and star fish on it there are two pieces of bread on a yellow plate A large bear ornament hangs on the Christmas tree. A small bottle of liquor next to a whole orange and an orange half. A woman holding a few bread sticks and a glass of wine. A view of an airplane wing flying over a mountain range. A tennis player holds her racket during a match. The cat is angry while sitting on top of a pillow. a blue and yellow train engine and some people a group of people that is surfing on some water A man riding skis down a snow covered slope. A plate of food that includes a sandwich and shoe string fries. The dinner plate has three smaller bowls next to it. The yellow bird is waiting for its mate. Two empty stone park benches placed up against a stone wall. An airplane sits on a stand for display. a close up of a person holding an electronic device A airport tarmac filled with a jetliner and trucks. A piece of cake and for that are on a plate. 
A man skateboarding through an obstacle course with cones. a white toilet and many rolls of toilet paper a four-legged animal grazes on the side of a hill in a forest Two young ladies petting a young calf on a farm The woman in the colorful dress is holding a video game remote. A wing outside of an airplane window high above clouds. A person on a surf board surfing a wave. A small kitchenette with personal items displayed attractively. A man with glasses sitting at a wooden table with a lamp. A desk area with a computer monitor, keyboard and mouse. A girl that is sitting down with a cell phone. A great full shot of the bathroom with wooden floor. A cat lies in an empty fruit box amongst other fruit boxes. A man is outside grilling some hot dogs. The truck is traveling down the road in really bad weather conditions. A field filled with lots of white sheep next to a river. There are a lot of animal heads laying on the bed. A women standing on a bike backwards . a woman sitting on top of a horse standing on a beach A man in a blue shirt holding a piece of pizza. A bed with two pillows under a window. A white plate with slices of meat and veggies. A young man is eating a hamburger while a young girl watches and laughs. A parking meter reads .90 cents as a silver car is parked behind it. A man looks down at a dog sitting on a chair outside. A picture frame with 2 pairs of scissors dangling from top and a painting sitting in front of the frame A woman that is leaning over a pizza. There is a suitcase which appears to filled with foreign snacks. Two women in a public place playing on a Wii system. two people on a tennis court playing a game A large crowd in a grassy area with the capital building in the background. There is a plate of pasta with a fork in it. An old mattress lying amidst overgrown brush and leaning against a fence. A person on a surfboard in the water. A refrigerator sits next to a counter in a kitchen. An elephant is stretching its trunk on the ground. 
A kitchen has a vintage gas range and yellow walls. A man with a glass of wine in his hand. Red fire hydrant with blue top on downtown street. A person standing on top of a snow covered slope. A group of smiling police offers on brown horses. A media center with gaming consoles and a television A traffic light that is currently a green light. A white plate with meat and broccoli on it The yellow earth mover sits in the field in front of the pole. A man holding a surfboard in a hotel room Looking up at a traffic light next to some street signs. there are many train engines and cars Dessert pastry with apples served with an autumn theme. A red stop sign next to a road in the middle of nowhere. A kitchen with a sink, counter, cabinets and a dish rack. there is a woman riding a brown horse on gravel A noodle and vegetable dish is displayed on a plate. A snowboarder holds a snowboard for a photo. Motorcycle decorated with an American flag and reindeer A person on a skateboard on a ramp. a man with a tennis racquet serving a tennis ball A young man is in the middle of performing a skateboard trick. A young man playing a game of tennis against an opponent. A man with a pan in his hand walks by pizzas in a oven and on counter tops waiting to be baked. some kids are watching as giraffes walk around their zoo exhibit A young lady holding a bat behind her shoulders A couple of cats sitting on top of a couch. Kid sits on the edge while another jumps over riding a skateboard A group of people walking across a crosswalk. A group of three men standing with their backs against a fence. A couple stands smiling next to a sitting older couple. A large white table with chairs surrounding it. A snowboarder and her child in the snow. A male tennis player holding his racket in the air. A red and blue train on a bridge during a cloudy day. Brown horse on the sand at the ocean. poles full of signs in front of a skyscraper People reaching into a broccoli garden and picking broccoli. 
An old fashion looking bus is sitting idly. Homemade cheese and red sauce pizza on a plate with flour and dough on the wood table. A desk with a bunch of paper on It. A cat being offered water in a glass. Three loaves of bread are in an oven. A teenage couple dressed up and smiling in a aprk A kitten is held and fed with a bottle. A woman stands on a patch of dirt holding a tennis racket. A woman taking a swing at a baseball a big bear stand next to a river stream A table topped with construction contents on top of a wooden table. A young man wearing a tie and sunglasses is looking away. A baby sitting in the grass looking at the kites in the sky. A fork and knife on a plate with pizza A trio of people stand near two elephants in a covered area. A girl playing frisbee in the backyard of a house. A doorway leading to a dining room area. A child is holding on to a rod while he rides a boat. A stainless steel pan with a pizza cooked on it. Graduate talking on cellphone with people behind him. a painting of the president sitting with his hands folded in front of his face Automobiles stopped at a traffic light at night on a busy street. A street sign sitting next to parked cars and motorcycles. A train in a subway that has a few passengers. A lot of people walking in the streets and on the sidewalks. A baseball player swings a bat at a thrown ball A motorcycle parked next to a green grass covered field. Many swans in a lake are overlooked by a cow. A herd of cattle that are sitting on the grass. two giraffes and a man in a brown shirt is feeding one A smaller kitchen with a very decorated fridge. white cabinets a sink stove refrigerator and a window A woman walking down a rain soaked street with a red umbrella. a military vehicle and a smoking tow truck on a rural road A black and white picture of a child on a skateboard the street A number of grizzly bears sitting on tan rocks. 
A group of people sitting in a field eating together A man is cross country skiing through a forrest in winter. A buffalo is looking at a bird from a distant. A cardinal standing on an empty wine glass. A chocolate cake being sliced and served on plates sliced tomatoes on a plate and a bottle of wine Two people stand next to each other holding cell phones. Large pink boat on wheels parked on the side of the road. A bathroom with a unique double sink and round mirror. A polar bear walking along a snowy, rocky ridge. a man on the phone looks angrily at the man Different sized and styled teddy bears on display with pictures and information. This is a train traveling down the train tracks. Two women riding an old motorcycle with a side car Woman jumping on bed caught in mid air. A person on a surfboard high up over the water. There are two monitors and one laptop on the corner desk. Photo of kitchen being remodeled with a new stainless steel stove. A surfboard is laying flat is the sand beside a palm tree. A sign that is on top of a pole. A group of three people sitting next to each other on a bench. A table with an assortment of items such as a keyboard, phones, pens, snacks, keys, sunglasses, a water bottle, and more. A woman playing fetch outside with a dog. A man reflected in the mirror in a washroom. A man in black shirt holding a large striped flag. Two children riding a horse in front of their home. A pair of buses sit next to each other on the road. A group of skiers entering a tunnel through the snow Afternoon tea in a living room of a home in a hot climate A male ostrich runs through the grass in front of the trees. A man posing for a picture, in a kitchen. a wooden desk with two monitors and a keyboard on it. Someone is holding their tablet connected to a surge protector. A woman walking next to a train, pulling a suitcase. A couple leaving their wedding ceremony in a shower of rice. A stuffed teddy bear sitting on top of a bench. 
A huge dump truck is fenced in in front of a neighborhood. Two little girls sitting on a bench at a softball game Someone is enjoying a small slice of pie. A group of people riding horses down a sandy beach. an image of a man that is drinking wine People in uniforms playing baseball on a baseball diamond. A bathroom with a double vanity and round mirror. Light colored cat lying on woven rug next to checkered shoes. A modern sink and shower stall are visible in this photo. A passenger bus that is driving down the street. there is a broken tree log on the ground A mini keyboard attached by USB to a laptop. The raw material of meal preperation including Broccoli is kept on the table. Two apple computers are on a white desk Man sitting on a step in a run-down part of town. Two people standing in the grass under a cloudy blue sky. Two toilets sit outside on the pavement next to a yard with many decorations. A man high in the air mid trick while snowboarding. a bathroom that has a white toilet in it A fire hydrant is across the street from an Asian restaurant. People waiting at a bus stop with a bus parked. A group of men playing a game of basketball on court. a fridge filled with assorted foods and condiments a guy in a half pipe gets ready for his trick A small boy is holding two pizza muffins. A display case in a bakery with decorated cakes and cream rolls. Zebra crossing a dirt road by itself in daytime. A long yellow train traveling past a train station. A lady staring lovingly into her pizza. A brown teddy bear sitting on top of a pregnant woman's belly. An old black and white photo of a man with glasses in a suit and tie. A man on a skateboard with a woman filming orange, pear, and apple are all in a row. This is a bus with a Titans themed advertisement for Coors Light on the side. Little bird looking out from the tree it's standing in Everyone is waiting in line to purchase tickets. A small model train traveling around a small track. 
Two forks on a plate of cake and cream. A doll house living room filed with furniture and a persian rug. A man standing next to a hipster woman while holding a beer in his hand. Two tall birds stand together on a grassy spot next to a large rock wall. A group of people standing on a beach next to the ocean. Someone sauteing broccoli and onions with wooden spoon. A woman posed for a picture while eating. A woman looks through things on a desk. A train car with purple and grey graffiti covering windows A small bathroom has a sink and a storage rack over the toilet. a group of men trying to get an air blaoon working A large dim kitchen with light coming in from a window. A skateboarder is gliding along a paved walkway. A group of elephants in sandy area next to trees. A bird sitting on a fence and looking around. A person windsurfing with the sky in the background. A stop sign and three street signs attached to a pole. a black television is on a white table A picture of a red prop plane parked in a field. A woman sitting in front of the Eiffel tower near pigeons. an image of a closed mcdonalds taken in a parking lot A man is shown, with headphones around his ears. A man is waiting for a bus on the side of a city street. A kid up to bat in a baseball game. a church with a clock built into the side of it A clear vase full of purple flowers sitting on a table. Two women play singles tennis outdoor surrounded by trees. A stop sign is leaning a little bit. A cat sitting inside a piece of luggage on a vehicle. A white bathroom with pedestal sink and small cabinet and daylight window A dog picks up a Frisbee out of the grass. An open laptop computer on a wooden night stand. An old man holding a bag walks down a street. A laptop computer with pictures of giraffes on the homescreen. A person riding a skateboard on the sidewalk while holding a pole. A person laying face down and balancing himself on four yellow poles and a fire hydrant. 
A laptop computer and a desktop computer sit on a wooden desk. A woman is holding her daughter in front of a birthday cake with candles while another lady stands nearby A sheep with long horns wearing a purple bit. Pink glasses are inside a clear plastic bag with bananas. a bed that has some material items on it Sheep resting under a blue boat foundered at low tide. A bed and mattress store front with open doors Three men sitting at a table in a restaurant eating. A very clean bathroom that is made out of wood. A small airplane flying in the air near land. A man and a giraffe are greeting one another. two horses standing in the snow inside a fence A bathroom with a separate tub and shower A large machine digs up a side walk at a construction site. The yellow train is running along the tracks. A chili cheese dog in a travel box on a table. a women that has a carrot in her hand People are sitting on a cart pulled by a horse. A little girl standing in front of a pile of surfboards. A close up of a modern motorcycle on display. A person rides a snowboard in a forest setting. This is a cow on a grassy plane with a mountain in the background. A large display filled with bananas for sale. A woman is sitting outdoors at a table with a sandwich. An individual is taken in this very picture. A photo taken over a water way with a clock tower in the background. some men riding horses down a mud track A catcher throwing a ball at a baseball game. A mixer in the process of mixing foods. A busy inner city street with cars, a bus and a biker on it. A bathroom stall with two toilets and a plunger. A group of three men and one women are holding Wii controllers in a living room. The boy is on his boogie board in the ocean. Vases with flowers are setup against a pink backdrop. A women walking down the street while holding an umbrella. A couple of cows laying on top of green grass covered field. A dog lays on the bed with a remote. A woman hitting a tennis ball with a bat. 
A dog lying on the floor on some clothes and a remote A tennis ball sitting on a tennis racket. A long row of scooters stretch down the length along a sidewalk. A white and yellow plate holding three bananas. a living room with a couch and two low wooden tables with floor cushions in a log cabin. A cat watching birds flying on a Sony TV screen. A person standing by a field with a large chair. The man is standing by the table using his phone. a person at a table with many plates of food A wooden ladder stands over a toilet in a tile bathroom. A toilet with a sink and a towel dispenser in a bathroom. A photo of Thomas the Train coming down the tracks. two elephants in a field behind a fence near many trees A very tasty looking dish of food with some broccoli. Three giraffes on grassy field next to trees. A town with buildings, vehicles, and street lights. A side view of a building on a street corner are shown. The decrepit bathroom features a brand new toilet. Individuals are there commending and having a ton of fun of their life. the person is putting something into an oven. A man in a kitchen handling food, with another man in the background. A clock that is sitting on a wall. A giraffes head in front of a metal grid A young man in formal dress is standing. Custom made pizza sitting on a plate ready to be cooked. A pedestal sink and a toilet in a bathroom. A man dicing carrots with a large knife on a cutting board. Graffiti on a French street showing a man holding a red umbrella. A zebra is standing next to a fence. The plains with zebras and gazelle around a watering hole A lonely bird sitting on a white bench. A man and a small girl standing next to a glider. A coffee cup and a plate of food. a display table filled with assorted carrots and cauliflower A person on a skateboard riding on a street. A man that is on a skateboard in a concrete bowl. A crowd of people holding umbrellas walking down a sidewalk. A person's feet in the bed with socks and shoes on. 
Two toilet paper rolls sitting next to a toilet. A young woman carefully touches a giraffe's long tongue. Several people ride down a dirt road in a horse drawn carriage. A foggy street with lots of traffic driving under traffic lights. There is some food sitting in the pan. A view of many different scissors on display. A woman gives the peace sign at lunch with her friend A fighter jet flying through a blue cloudy sky. A vase filled with peacock feathers sits in front of the window. A man in a long sleeved hoodie holds a cup Red jello with fruit in container in microwave. Motor vehicle traffic on a paved city road. A school bus parked in a parking lot next to a building. Two men watching three horses running down a path. an image of two zebras in the wild a person jumping over a curb at a corner in front of a liquor store A man with piece of cake and a spoon sticking out of the top of the cake. an image of a zebra in a field A woman with a cell phone sitting on a couch surrounded by a red white and blue border. A boy in a safety vest lays in the snow as another boy on skies stands near by while a man in a red jacket kneels in the snow by the boys. A vase filled with white flowers sitting on top of a wooden table. A person sitting down a bench in front of the ocean. THERE IS A MAN THAT IS SITTING ON A BENCH READING A red truck driving past tall buildings on a paved road. A young Asian boy holding a tennis racket. A woman standing near some steps at a river's edge. Coordinated bedding pulls together a full size bed and a set of bunkbeds. A person on a court with a tennis racket. a fire hydrogen that is sticking out of the ground Old and young men sit around a table with laptops. A batter, catcher and umpire in a baseball game. A skier with a huge black pompom on his hat. A bathroom is shown with a mirror and a sink. A motorcycle is parked near a quiet river. A man is holding a partially peeled banana in his hand. 
Railway train on tracks traveling on beach next to ocean. two people sitting at a table with many wine glasses An empty refrigerator has its doors open as it stands next to a kitchen sink. A person with purple hair and and tie. Plane sits on a bridge above the water This is a man and woman on a ski slope. this is a fire hydrant sitting on the sidewalk A black and orange cat sitting on a wooden counter top. The service man is putting meat on the tray. There are giraffes standing together near the trees. A girl sits at a dining table set for three with food on the plates and cups and a candle on the table. A young woman decorating a cake with a frosting bag. a pair of scissors and other knitting supplies are on a table A fire hydrant is spraying water onto a city street. A man on a bicycle is looking at a semi truck. A dog tied to a pole, with a bike behind it. A bedroom is shown with a suitcase on a bed along with several clothes on it. A girl with curly hair and a teddy bear on a bed. A man in ski gear skis down a slope. A train pulls up to a platform at a station. A kitchen with a stove and microwave above it. Two stuffed bears sitting next to each other. The surfer wearing a wetsuit is riding the wave. A woman making pizza at an outdoor event A bowl of cereal and a glass of water are sitting on a table. A plate with some food on the top. a mom and a kid in a green kitchen A large artistic clock is posted on the side of a building. The man takes a look at the food in his hand with the door to his fridge sitting open. A man waving at a school bus from his driveway. three baby lambs laying on a pile of hay A crowd of people crossing across a street. The brown cat has big round brown eyes Bike riders travel next to a passenger train A group of turkeys feeding in a field. a young man is holding a 2000's style cell phone up in front of his face. The display of products for sale at a motorcycle shop. The man is sitting on the beach with a head and sunglasses on. 
An older couple in a boat float past ducks on an open river. A lone skier on a snow slope with some areas of dirt expolsed. A man on a surfboard riding a wave. An older man sitting at a wooden table with a plate and a drink. A mural on a city wall with a women walking down the sidewalk. Two women wearing bikinis on surfboards in front of beachfront hotels. The kids are playing tennis on the courts for physical education A snowboarder does a trick beside a Hilton hotel A large passenger jet sitting at an airport. A couple of brown horses grazing on a green grass field. The inside bow section of a narrow metal boat floating on blue-green water. A street sign has multiple street names on it. A woman with short ginger hair has a book open as she lays in bed. a white and orange cat sitting on wooden table A tennis player runs towards the ball during a match. A bathroom sink sitting next to a window covered in curtains. Fresh baked pizza being served at a restaurant. A walkway along a river that looks out at a bridge. A man hits a tennis ball during a tennis game. A sigh advertising a dancing club is present. A train pulls up to an empty platform. A person who is riding a wave on a surfboard. A double sinked bathroom has circular twig wreaths hanging above. a close up of a bird with a blue head The cheese bread appeals to a variety of people. one lady is on the computer one is digging through a backpack there's a man on the phone and another man on a computer A baseball player running down a baseball field. Man on a skate board holding himself up. Some plates and containers hold a variety of food. A person in glasses makes a funny face while eating. a white plate of meat and carrots and a side of brocolli A group of children sitting at tables working on laptops. A person in a wetsuit on a surfboard on a wave in the ocean. A man is looking inside a fridge with only four items in it . 
a bunch of umbrellas are in front of a house Two horses in an enclosed area during the day. A plate of wild bananas sitting on a patio ledge. A couple of yellow school buses driving down a street. This object has a long cord attached to it. A batch of sweets as well as oranges. A brown teddy bear sitting next to cup cakes and then sitting on a couch. A cluttered living room with a laptop computer. A person in a tennis outfit holding a racquet. A young girl playing tennis on an indoor court. A yellow fire hydrant is next to a tree. A motorcycle with cheetah print, parked on a curb A polar bear that is underneath the water. A boy is sitting in front of table filled with apples. a small dog laying and a cat laying on a sofa A sandwich with chopped vegetables sits in a cardboard container. a man riding a snowboard down the side of snow covered slope. A large horse standing next to a smaller horse. A boy running to catch a frisbee in flight. a cat sits on someones lap and looks at a plate of food An orange that has been placed next to a beer. A close up of a large, white plane with someone standing beside it. A brown cow standing next to a black horse. Many colorful kite surfers over an ocean cove A airplane that is sitting in the grass. A view from an airplane of mountains with a partial snow-cap A man and a woman sitting on a motorcycle. A bun has carrots and parsley on it as it sits on a green plate. A batter swinging at a ball with a catcher and umpire behind the plate. a man that is jumping a small skateboard A man is sitting next to a computer system with two monitors, keyboard, and mouse and a desk that has many figurines and dolls atop of it A black train is stopped on the tracks. A professional baseball player holding a ball during a game. A double decker bus driving down a street. Two men playing a game while a boy watches. A man wearing a green shirt on top of a tennis court. 
a airplane that is parked on a runway Boys on different teams running for a basketball. An empty street with a red double decker bus in the distance. Man and woman standing up while playing wii. A pile of pieces of dark green broccoli. Guy sitting at the front of the bus typing something on his laptop Man with glasses in suit leaning over to blow out candles on a cake. Two men with backpacks and skis standing on top of hill. main street of a slum with cars and people A bed with sheets, a chair and wall hangings A group of paddle boarders watch the beach. An adult smiles while skiing with small children. A street with a bunch of street signs and a building near the street Two benches and a garbage can sit on a beach. A cathedral with clocks set in four directions in the clock tower. a person jumping a skate board in the air Three big rigs parked in a row in a field. The white and black horses are grazing near mountains. An airplane with the word navy pained on the side is sitting on a runway and people are sitting inside of the plane. Two horses hold their heads near the short grass. The woman is sitting at the table in the restaurant. Man and woman exchanging words on stage with horse A person standing on a skatebord on some grass red suitcase and two black suit cases on pavement A bus traveling on a city street near pedestrians and buildings. a bunch of travel bags sit in front of a television A boy sitting on a bench looking at a cellphone A table set up with flowers is in a farm type area. A group of kids that are sitting in front of a table. A jetliner flying low as viewed between two skyscrapers. A white three tier wedding cake decorated with roses. Three dairy cows in a grassy paddocks fenced by bushes. Teddy bear tucked into bed in a bedroom. A person holding a rainbow colored umbrella near a crowd. A large sink with three silver faucets on it A cat is playing with a backpack strap. A woman holds a Wii remote in her hand while making a face of concentration. 
Motorcycles and mopeds line the street of an asian shop A young girl on skis and holding poles, posing in fake snow. A baseball player holding a bat on a baseball field. A vase filled with an orange reddish flower. A PERSON IS ON A HORSE ON THE BEACH SHORE a group of men work on a air balloon Bicyclist riding on a city street at night. An apron is flying in the air next to a tree. a motorcycle with a bag on the back of it parked in the road A young man talking on a cell phone with a stuffed animal on his stomach. A baby on a brown horse next to two people. two people in a kitchen preparing food A table of doughnuts with light showing on them. A women who is wearing snow skis and performing a jump. A man with glasses and a tie stares straight ahead. three cats a gray one a black one and a brown and black one on a bed a backpack and luggage on a car seat A man with a backpack holding a bottle of beer. There is a woman sitting in a boat drinking something. A zebra and a "part zebra" eating grass. some signs on the road showing the street and direction A nice setup of stuffed bears having a picnic. A man is surfing a small wave in the ocean. THERE ARE DOUGHNUTS THAT ARE ON A PLATE A woman and a baby walk on a grass field where kites are in the sky A black headed woman skiing in the snow. Two laptop computers sitting side by side on a wooden desk. A couple of sheep walking across a lush green field. There are a lot of sweatered teddy bears in this pile. A doughnut with sprinkled sugar and icing on it. A young smiling boy stands holding a set of Wii motes. A group of elephants is walking across a grassy field. A large breakfast omelet, english muffin and fruit A zebra standing in a stall with its mouth open showing its teeth. A small elephant walking around in its enclosure A man is talking to children about surfing on the beach. A green street sign next to a neon sign on a building. A man leans back in a chair with a beverage. Woman posing in front of two pints of beer. 
cabinets a sink dishwasher and stove and a window A woman riding on the back of a white horse. Two white swans and grey ducks in a grassy area. A plate filled with an assortment of food The toilet is broke and sitting on the grass. A blocked off street that is ready for a event to happen. a red and white firehydrant sitting in some grass with cars and trees in the background Cows are in a pasture with one glaring attentively for a photo. Two people are on a bike together traveling down the road. A couple of people walking across a beach with a surfboard. a chef slices jalapenos on a cutting board. The plate of food with a spoon on it has broccoli in it. A contemporary light-rail train seen from the front is stopped in a station. A bike and some people on a street. Baseball player on the ground at home plate while an umpire makes a call. a woman holding up a smart phone while smiling. Woman surfer in the river catches a wave a close up of a pair of scissors in a scissors pouch There is a man standing in the kitchen. The man is reading the paper on the bench. A man riding on the back of a motorcycle. A stove with the clock set at 1159. There is a spice rack on the stove. The cab of an eighteen wheeler in a parking lot surrounded by trees. A clock is standing in the middle of the grass in the middle of the afternoon. a stop sign and no right run sign in a big city. a person crossing the street in front an orange trolley while holding a garbage bag. A woman standing in a living room with a Wii remote. A man is sitting on a chair playing a guitar. A toilet sitting under a metal bar in a bathroom. A large dog laying on top of a bed in a bedroom. Two women and a man sitting at a table. an image of a cat walking in the kitchen A mini pizza with an egg in the middle. Three women play with Frisbees in a shady park. A woman and a child are hiding under the covers. A topless girls sittinng on a bed holding a bear and leaning on a suitcase. 
a man gets ready to catch a frisbe a female lunging after a tennis ball holding a tennis racket in both hands this is a green fire hydrant and brick street Plastic containers filled with food including fruits and vegetables. a couple of guys are sitting at a table A table is set with plates with pancakes and bowls of fruit and a bottle of syrup. Outdoor art piece of an elephant covered in paint being displayed for sale. A woman and her two children walking in the rain while holding umbrellas. A woman standing next to a giant refrigerator freezer. A clock tower sitting in the middle of a parking lot. There is a tea pot on the gas range. A large long train on a steel track. A green and a pink bus are next to a store. A very nice looking motorcycle parked by some trees. A sink of a bathroom with things on the counter A person on a surfboard rides a wave. a person and a child playing with a kite The stop sign is across the street from a bridge. A girl is holding a umbrella.Someone shorter than her took the picture. She isn't smiling. ball bats standing on end leaning against each other A train drives under a sky walk for pedestrians. A lady is paying the Wii at the store while a man looks on. A man is on the snowy hill in his ski gear. A very long blue and white bus pulling out of a parking lot. Four girls and two boys sitting in the back of a parked white Ford Super Duty pickup truck. A red traffic light at a street corner with vehicles near it. A woman is riding the horse while the crowd watches. A teddy bear sitting in a window holding a cell phone. Dishes with strawberries and walnuts are set on a table. a police officer riding a bright yellow motor cycle. A man and a woman riding a scooter past a church. Military float plane flying overhead on cloudy day. Assortment of sliced pizzas in yellow cardboard boxes. there is a small pizza that is on a white plate A man standing on top of a base on a field. Male tennis player standing on a court holding a racket. 
A man in the living room plays a game on a game system. A close-up of the face of the horse with a woman on the back. An open face sandwich and a pile of potato chips on a plate. The man is outside skiing in the snow. Several color fruits and vegetables, all unprepared on a concrete surface A small bathroom, with a commode, and a bathtub with bath toys in and around it. A large boat floating on top of a lake surrounded by a forest. A person carrying an umbrella walking on a path next to water. What a funny picture of one giraffe hanging on to the neck of the other. A man standing in front of a laptop computer. a male surfer in a wet suit some rocks and water A cat sitting underneath a bed in a room. A stop sign is at the bottom of a four way stop. A blue, metallic parking meter with a yellow number six. A brightly colored food item is on a white plate on a black table. Man skiing down a slope just beneath a lift. A girl holding a yoga mat riding skateboard down the street A laptop computer sitting on top of a wooden desk. a couple of treys with some food inside of it The chocolate cake on the plate is topped with strawberries. A woman is standing in front of an old train. Black and white photograph of animals and horses in field. Two zebras graze on grass by a dry creek bed. A man standing in a kitchen preparing food. A rabbit on its hind legs in front of pigs. a man sitting on a wooden box in front of a mural A partially eaten pizza sits on a tray on the table. There is pizza topped with white sauce and broccoli. White planes lined up in a parking lot. A young man sitting on top of a skateboard. Black and white photograph of a man on a skateboard. Red Regio buses parked close together in a line. A group of people waiting on a train with items balanced on their heads. an empty park bench sitting among the trees A gravestone with a vase and stuffed animal on it. A couple of people on a sidewalk holding umbrellas. A man with black sandles standing in a dress store. 
A crowd of people standing next to a vending machine. A slice of a banana on the table Monkeys eating through the peels of bananas. Three zebras are standing in the grass, while one stares at the viewer and other two stare off to the right. The boat makes a big splash in the water. Group waiting to take their turns on the ski jump. A jumbo jet flying in the air during the evening. A dog lies on a floral rug near a living room window. A baseball player swinging a bat over home plate. A person with a snow board posing for a picture. A small glass table with vases on top sits by an open window. several zebra are walking together at a zoo A woman in a hair net in a bakery holding a box. A train covered in graffiti sitting on top of train tracks. A young professional is working at his laptop while his coworker is reading material. A bed, chair, drawer and a wall hanging A kitchen counter covered in pots and pans and appliances. A giraffe eating leaves from a tree near a forest a small child has a brush in his mouth Trees and a street sign next to a street. A dog is poking its head out of a vehicle window. Police officers ride horses on a city street. Trees mark the far side of a fence that encloses a large environment space with man made rocks and two giraffes, one close up and very large, the other small, and seemingly far away. A plate contains skewed meat with a side of vegetables. A red stop light across from a brick building Table set for two with pancakes and syrup. The brown and black cat is laying on a computer laptop keyboard. A picture of a person walking down the street. Three zebras that are standing in the grass. A train traveling across a bridge over a river. A donut that with sprinkles on half sits atop a Nautilus jump rope. a transit bus parked near a building near a cart A dirty toilet in a small bathroom.with items on top. Bathroom with radiator, sink, lighting, shower curtain and decor items. 
A young man performing a trick on a skateboard, A white plate filled with pasta and broccoli. A little boy holding a Nintendo Wii game controller. A group of zebras watching in a field. A black bird with spiked hair standing on rocks A red double decker bus in front of a white one. The tattooed man is talking on a cell phone. Luggage on a tiled ground and people sitting on rows of chairs in the background. A very pretty bird perched by a tree. Farmers markets have become popular destination points in metropolitan areas. A man in a grey suit with a blue pixelated tie leaning against a wooden podium. Two women in white dresses playing a game of tennis. A blue and white bus that is parked next to another bus. Three young boys are cavorting along an old sidewalk. A beautiful blonde girl standing next to a blonder. A man and two girls sitting on a couch with a dog. A brick wall with a sign giving directions and a clock on top of it An African-American man, wearing a shirt and tie, glasses, and a cap, is looking downwards. A small grassy hill with three sheep at the top and a fence along the side. Lots of people walking on a city street with Chinese stores on both sides of the street. A white bowl is filled with broccoli garnished with crumbles of cheese. a man in glasses holds an umbrella with a brief case Parents at a chain link fence watch a Little League baseball game. A dog riding on to of a yellow surf board on a wave. A young girl sitting in a chair covers her face. A jockey on a horse jumping over a hurdle. Large open living room with black leather furniture. A red stop sign sitting in the middle of a road. A bird sitting in a shallow pool of water observing something. A old photo of a pitcher on the pitchers mound. A large quantity of banana's piled in a fruit stand. An old fire hydrant sits on the grass in a park. An old truck sits parked in an empty grassy lot. This desk has a computer paper, water bottles, and a rolodex on it. 
An airport has several planes on the runway. A man rides a surfboard down a wave. A worn down stove and oven sitting in a parking lot. A stadium full of people watching a batter hit the baseball at a game. A man with a wrench turning off a fire hydrant. A blue and white train raveling past a rusted out train. The man is covered with a net and sitting on the ground. A patio table with two dinner plates of food and two bowls of salad. An oriental style room with tatami floor coverings. A kitchen with gas stove with four burners and a sink. A pile of red and green apples sitting on top of each other. A person on a snowboard jumping a snow covered hill. A woman smiles while eating a pita sandwich. Crowd of people walking in snow in front of buildings A woman and man dance while smiling. A bird perched on a grave in a graveyard. A man with long hair and a beard smiling with his arms outstretched. A large cow laying on top of a sandy surface. A woman with green hair standing beside a brow and white horse A row of planes flying in sky with smoke coming from their tails. A boy with a tennis racket bouncing a tennis ball in the air. A skateboarder is standing, wearing a helmet and holding their board. A man on top of a ski slope on skies posing for a picture. A glasses wearing woman with a hotdog sandwich to her mouth. A child sleeping with a teddy bear on a bed. A table with plates of food that include corn and fruit. an Amtrak train with eight cards beside a field A Honda motorcycle parked next to a grassy area. a close up of two pizzas on a plate Two giraffes in a field between multiple trees. A little zebra playing around inside an enclosure at the zoo. Some black kitchen machines used for cooking food. An older man holding a plant with banana bunches. a person that is standing in a kitchen The man in the colorful shirt pulls two luggage bags behind him. there are many people awaiting a train at the station A sign next to a stone wall stating the road name. 
A giraffe walks on grass looking for something to eat. a close up of a plant of bananas this kitchen is large and has wooden cabinets and a granite island A display of teddy bears are on an outdoor blanket. A soccer player kicking soccer ball around opponent. A plate of pasta and a bowl of spaghetti a stop sign sits on a street corner Men play a soccer game on a dirt field. A pepperoni pizza is bigger than the child sitting in front of it. a group of small boats in a body of shallow water A person skis down a snowy hill while others watch. A long train going down the train track. A sink and some shelves are in this small bathroom. A small car is pulling a man on a bicycle. a person standing in front of a mirror with his reflection in a different pose A bus that is standing next to a building. three friends hanging out on a snowy hill Half eaten berry filled dessert on a white plate. A living room with several books and paper on the floor. Part of a ship sits in the shallow end of the bay next to a city. The letters of a laptop keyboard are sitting on a wooden table. A sign with many different stickers placed on it. A young boy hitting a ball in a yard with a bat. A small airplane flying through a blue sky. People in a field are looking up at a kite. A girl stands holding the string of a flying kite. Herd of sheep resting in the shade of tree in open area. cat sitting on top of a red and black motorcycle outdoors A toddler getting help from an adult to brush its teeth. A stack of orange solo cups near scissors. An abstract photograph of a moving train on its way to New York. A group of young people sitting around a piece of luggage. A man is surfing the internet on his laptop. A bunch of people in a metropolitan area with umbrellas, walking on a sidewalk next to buildings. A man is riding a horse in a fenced enclosure. This woman and man are holding a gold bag. A man riding a motorcycle with a woman on the back. There is a bear walking across the grass. 
A young boy playing video games on tv. A number of people are in a building with many colorfulful items over their heads. A large passenger airplane sitting on a runway. Two police officers riding motorcycles down a city street. A very neatly organized display of many items. A person with a red umbrella and a dog on a walking trail. Three children are in the bathroom brushing their teeth. Signs directing traffic in front of two several story buildings. A desk along a wall with book cases over head. A boy is performing skateboard tricks in parking garage area. A cabinet holding several oriental vases and lamps. A dog laying on the ground between someone's legs sitting in a chair. Small piece of cake on china, with a fork. A group of people standing by a white and green train. Paperback book about Mother Theresa on a pillow A clock and some books in a room. A woman raises her tennis racket on the court The man is holding a large bird outside. A blue sign suspended above a street with cars driving under it. People walk down a rainy avenue carrying umbrellas. Two snow skiers pose on a snowy landscape. Lake with boat in grassy fields with cows. A female tennis player readies for a hit Broccoli and chopped carrots sit next to each other. A large bed with pillows and a blanket. a small airplane that is just lifting off into the air Three cats surrounding a stuffed bear holding the sign that says help. A group of people sitting around a wooden table. The child smiles next to a stack of donuts with pink icing. An old western town miniature in the backyard of a house. A train is coming down the tracks next to a field. Sepia photograph of a stop sign next to row of mailboxes. Shirtless man in white shorts writing on top of a skateboard. two ladies riding horses there's a reflection of one of them in a mirror A man in a yellow jacket that says police is looking across the street at a crowd of people and has his hand on a wooden structure. 
A little league baseball game showing a batter, catcher and umpire. Two sheep graze in a grassy field at the edge of woods. cars parked on a city street with buildings in the background I am not sure what kind of food this is. An airplane flying over a beautiful ocean shoreline peppered with sailboats. A train pulls into a station constructed of brick, rock, and metal. A large passenger airplane stands at the gate, near cargo vehicles. A hot dog has ketchup, mustard, and mayonnaise on it. A fire hydrant is decorated to look like a dog. Three girls biting into a piece of fruit. Two blue bowls of food next to a bottle of cinnamon and sugar. A man is water skiing in the ocean. Yellow commuter train at station near industrial area. Cat overlooking keyboard as seen from above in lit room. Street sign at intersection is written in English and Arabic. A pizza sitting on top of a blue plate near a salad. A tower clock on a building in the city A person wearing gear sitting on the side of a fence. A plate of baby carrots, mashed potatoes and tuna set on a table with a cup and utensils. A man talks on his cell phone in front of toothpaste advertising. A car carrier truck with a car loaded on it driving thru a city. A street sign in front of a gated parking lot area. Two men on a sunny beach flying a kite A young man about to kick a soccer ball on a green field a young boy smashing a toilet with a little sledge hammer Round vases sit on tiny shelves against a white wall. a train that is on a train tracks that is a model A young man is standing in the surf on a surfboard. a bench next to a tall thick tree Three people in skiing gear posing with trees in the background. A wooden clocks sits above a shelf holding several books. Two elephants walk in an open savanna with dried grass. A fire place with a clock above it on a mantle. A large number of pots that are grouped together. Four pictures of people skiing and snowboarding on a snow covered slope. 
A man is playing frisbee by herself on the beach. Black and white photo of person walking with umbrella. Buses are lined up in a single line along the curb. A young man playing tennis with his hat on backward. a rainbow umbrella some bicycles a fence and some grass A group of people riding skis across a snow covered slope. a hand a cellphone a laptop and a beer coaster a little plane flying across a blue sky a couple of women sits around a counter top A train passing down a station, in the middle of the day. A city bus leaving a bus stop on a residential street. A group of people are gathered around a large pizza. A young boy posing with a baseball bat for a team photo. A cut in half sandwich sitting on top of a table next to a foam container. An air plane landing on a landing strip. Paintings are hanging on the walls in a living room. A food store is architecturally designed to include a clock. A horse is walking in the sand along the water. A cow lying down in the grass with a cowbird next to it Cows walking on grass and a dirt road A sales person showing a customer different phones. a child holds a spoon of rice while a woman offers the rice on chopsticks. A boat and lighthouse are in a wavy, stylized painting. a number of cows in a field near one anotehr A woman standing next to a man in a living room. A child in diapers standing on a bed. A city bus driving over a bridge under an overpass Protesters gather with signs on a street corner. A man in grey shirt with red tie and red baseball cap. A soldier is mounted on a horse as a small dog walks near. A small bathroom with shower, toilet and vanity. A very tasty looking sandwich and fries waiting to be eaten. A man uses both hands to swing his tennis racket A picture of someone carrying video equipment in a bag. A baby sitting on a bed holding a book and smiling at the camera. Decorative event banner at a field full of flying kites. A woman on a tennis court holds her racket as she finishes up her swing. 
Two boys who are playing soccer against each other. Two women and a child flying a red, white, and blue kite. A little boy looking at his birthday cake. A young bog eating cake with his fingers quite messily. Two busses on a street next to sidewalk and trees. a man throwing some kind of frisbee toy strangely colored luggage stands out in a line of passengers A large open room has an overhead book shelf. One of the two children is smiling as they pose next to each other. Four bicycles with baskets parked under a tree. Young girl spilling water into canisters in the park. A picture of a small bathroom taken outside the bathroom. A light brown teddy bear sitting up posing. A boat sitting on water next to a red bench. There is a cat sitting on the back of a motorcycle. Young boy carrying white Frisbee with toy stuffed monkey on back. a cat napping on the laptop while on firefox A female tennis player holding a ball and a racquet Three slices of cheese pizza and a quesadilla are on a plate. a number of people standing holding white boards a man leaning over as he plays a video game with a wii mote A desk has different peripherals, computer, and a binder beside a shelf full of books. Two females stand in a modest dorm room A miniature wooden toilet in a doll's house bathroom. A man that is standing on a surfboard in the water. a women grabbing onto a statue holding an umbrella Sheep and lambs grazing in a pasture behind a hedge. A woman is holding a baby next to an elephant. A tray filled with fresh vegetables on a wood table. a large group of snow skiers out side of a ski lodge A group of skiers has gathered at a red fence A living room with couch and fireplace in it. A pitcher wearing a red shirt and red cap throwing the baseball. The side of a motor bike and side mirror. A young child standing next to a large box. Black and white bathroom with large shower stall. 
four kids holding wii controllers in a living room Four stuffed animals, a leopard and three teddy bears, in a row sitting on a stone ledge with grass and trees behind. Two people riding on the back of a large elephant. A toilet that has an open lid with water in it. A bearded man holding a wire whip and a Wii controller. A flower shop has a wall full of differed colored vases. A baby sitting at a chair by a computer desk. A man in a grey t shirt holding a purple frisbee A stop sign painted on a wood pole. Two zebras in a open dead grassland, one is eating A glass block wall in a bathroom is shown A black and white picture of a traffic signal in a city. An old military plane on a runway with wings folded. A young man riding a surfboard with a large wave behind him. IN THE BATHROOM THERE IS A TUB TOILET AND SINK A small bird perched on the windshield of a car The train car has been vandalized on the outside. a elephant walks through a vegetation area next to some trees Some very big commercial planes all parked in a row. Young women are playing frisbee on the grass. Boat in the lake looking for spot to dock. An alpine skier leaning forward while jumping through the air. A diner with large pepsi signs on the front of it. A truck pulls a construction truck on its back. several female soccer players engaged in a soccer match Young man holding a skateboard and his helmet. A person with a snowboard, sitting in the snow. A young boy is holding a frisbee with a picture on it. Three adults sitting on a couch looking at their laptops. A tall brick clock tower with a clock on each of it's sides. A bride and groom are cutting into a cake. Box of various doughnuts on a wooden table. A pizza with different toppings sitting on a plate at a table A man and a woman are flying a kite. Four teddy bears outside sitting on chairs on a sidewalk. A young girl combs her hair with a yellow comb. 
A woman with an umbrella in front of a crowd A cluttered desk, containing a laptop, blue water bottle, and many other items There is a coffee sign below the stoplight. two brown and white cows in a forest a street sign on a light pole on a city street A parking meter sitting beside an empty street. a small child in a white shirt and a bowl of cereal A sandwich with french fries and cole slaw. A train on tracks with power lines and buildings in the background. A red bird sits in a bird feeder in a tree on a sunny day. A woman posing in front of a batch of apples. A cat chewing on a packaged pink toothbrush. The back end of a semi truck driving on a divided highway. a couple of people that are cutting a piece of cake monitors are hanging over people who are sitting down A young baseball player gets ready to field a hit. A teddy bear sitting on a ledge of a building Black and white birds walking in the grass near water. two elephants together standing on a dry plain A counter top with a plate with a fork and few scraps of food and a teddy bear lying on side with arm outstretched on plate near fork, with another plate with an apple and two bowls with produce, a canister and some metal objects. Street and stop signs direct traffic in the proper direction. A lady is holding her tennis racket for the crowd. A BOY IS PLAYING WITH A FRISBEE IN HAND Woman pushing a cart of luggage in a transportation terminal. A nice looking story on a sidewalk near some other stores A train is making a turn past a closed station. A cat is sitting on the desk by the mouse. A man sits on a crate with bananas nearby. A automobile with multiple bicycles on a roof rack. a small dog is standing on a motorcycle A cat wearing a colorful hat over it's head. Child sitting in high chair with plate of food, stuffed animal in buster chair and bottle of ketchup. Another hand holding a fork and a partially filled plate. A cluttered room contains green counters, a brown table and windows. 
A horse attached to a carriage on a street. a computer,a keyboard and a mouse and a bottle of wine on a table A large long table full of many laptops. some people and signs a bicycle two horses pulling people in a cart A dog suspended in mid air catching a frisbee. A man jumping up to catch a frisbee on the beach. A stop sign sits along a road next to a shore A pizza sitting on to of a white plate covered in cheese. A sink underneath a mirror inside of a bathroom. A bunch of yellow and orange fruit in varied sizes. A room filled with lots of toilets and sinks. A man riding a surfboard on a wave in the ocean. A woman standing on top of two pieces of luggage. A parked motorcycle on a dirt road in front of an old building. a small bird on a fallen branch near other trees a pole holding a couple of street signs beside a building a rusty brown train trackwith just one train on it Pears, cheeses, cornichons, and other delicacies are artfully displayed on a dish. Young man picture of receipt with the phone. A cat laying on top of a refrigerator. A WOMAN IS GIVING A MAN A HAIR CUT A zebra standing on a dry dirt lot. A girl sitting on a couch is adding something to her mug while other people stand nearby. A train traveling down railroad tracks next to a train station. A lot of toys that are on a table. A horse and foal galloping through the woods A person skiing down a snowy mountain side. A cutting board with fruits and vegetables that include broccoli and blueberries. A giraffe is walking through a wooded area. A large truck turning onto a road in a city. A display of coffee and sandwiches on a patio table. A pitcher in a baseball game pitching a baseball. An empty beach dotted with straw umbrellas awaits tourists The colorful bird is perched on the branch. a couple of birds swimming in a lake A hot dog that has some cheese on top of it. A red train parked on a train track. 
Young girl in sunglasses standing in a lawn, holding a frisbee A kitchen scene with yellow walls and a checkered floor pattern. Two children and an adult ride in a horse pulled cart. A woman is laying in her bed playing on her laptop. A bird flying through a cloudy sky over a body of water. A man is holding a surfing board on a beach. Two brown horses inside of a steel fenced corral. two adults dressed in ski attire and skiing in snow in an open field A laptop that has a picture of outside a window. some people are riding horses at the beach a picture of a sign post for a bikelane at the corner of Hancock ave. A man that is standing in the dirt with a baseball bat. A mother and daughter smile as they eat their meal. The old plane is now hanging up as a decoration. An elephant with tusks curling it's trunk upwards, standing behind a fence in the sand. Replica wooden sailing vessel with passengers in a harbor. a little girl sits on a swing with a stuffed animal A cat stands on a bathroom floor alone. Giraffe trying to reach some leafs on a tree. Two slices of bananas next to ice cream on a plate. A bunch of biker dudes begin led by one on a orange bike. A pizza that is not quite shaped correctly there is a small plane that is very close to the ground A large living room with a cat on the rug Four glasses of wine sitting on a bar are half filled. A man flies a kite at an event from afar. Half of an airplane jet over a snowy mountain range. A man holding a computer mouse next to a glass of water. Young child enjoys a deathly meal for dinner The peperoni pizza is served from the restaurant. A car driving on the road near a road sign and a bird. Two draft horses pulling plow, color, under cloudy skies with trees and other horses in background. A kitchen has wood cabinets and white appliances. BASEBALL GAME WITH BATTER UP, READY TO SWING The blender is full of some type of beverage. A crowd of people standing around a pole with three fire pits attached to it. 
an animal behind a fence next to a tree Four cup cakes with sprinkles on a plate. A man with a large remote controlled hobby aircraft. Major League Baseball players practice throwing on the field between innings. Arms and hands holding onto the bars of a bicycle. A man talks on the phone at the table. A pigeon is standing and eating in the street. THERE IS A PERSON THAT IS WALKING WITH A SUIT CASE Woman of African descent in mid tennis backhand. A cup cake in one photo, an empty wrapper in the next photo. A man in a suit eats a banana in his car. Three zebras stand together in a field of grass. two city buses one following the other Several sheep grazing in the grass on a sunny day. An orange RV and white mini-bus is parked in an adjacent lot from a building. Man standing in a yellow room holding some kind of remote A young man holding a foot long hot dog covered in pickles. a bunch of sheep are staanding in a field Dark haired man making a serve at a tennis match. A large number of people riding motorcycles down the road. The two people are ready to serve the variety of donuts. A green broccoli plant with lots of green leaves. A guy with a cap holding a blue surfboard. Adults and children sitting on a bench at a park. A person on a surfboard riding it in the water. A white plate with two slices of cheese and a whole banana unpealed. A woman flips a tortilla in the kitchen from a skillet. Two beige plates with thick sandwich and mustard. a aircraft flying above a snowy mountain A view of a pizza from a table, with a man behind it. Public bus travelling down road past apartment buildings. An Air Canada airplane is waiting at an airport. Animals outside a shelter grazing in a pasture. there are two yellow empty school buses a big boat that is floating in a body of water A skateboarder rides his board through a skate park. Two people engaging in water sports in the ocean on a cloudy day. 
A pink room with two urinals near a door that says Catering Staff Only Quail walking in tall green grass near a fence. Two young boys read in bed using a lamp light. A black cat on a wooden table in front of a laptop. A little dog is staring at a herd of sheep grazing in he field. Two aeroplanes with two sets of wings flying in a clear sky. A surfer holding a surfboard straight up on a beach in front of ocean waves. Man in all black doing a trick on his skateboard. A male skateboarding over steps in front of other people. A group of people who are serving a cake. Several older men sitting in front of a library. a pizza sits inside of a box on a table A suitcase is sitting in a hotel room Some people standing under an arch which has a fancy clock on it. A bed with a blanket underneath a window. 2 baseball players in the field prepare to catch the ball A family of zebras in an open landscape. a bunch of bananas hanging near a blue wall A couple of bears standing next to each other. a man holding a cell phone sitting in a car A stand on the side of the street with political tones. Two guys at a skate part having fun. A large plane with airport terminal in the background. A tray with a glass lamb next to a pot of flowers. A small, dirty bathroom has peeling yellow, walls. Two bowls of food on metal plates next to a fork and spoon. a close up of a dog with its head in a bag A couple sun bathing near their bikes on a bay Cars and people on a street traveling under a traffic light. A green walled building in the middle of a brick wall. An Apple desktop with an animated figure on the desktop A street scene of a busing coming down the road and dark clouds in the sky. The woman waves at another surfer also carrying a surfboard. A sheep with lots of fur on a fence in the field Closeup view from front underneath of a commercial airliner plane in the air with wheels down, against blue sky. A woman is taking a picture of herself. 
a close up of a plate with a doughnut near a cup A man about to hit the ball with the tennis racket. A marina full of boats nearby a seaside town A large brown dog running across a grass covered field. A silver tray on a counter serving pizza. A desk with a cell phone and two computers. A group of people sitting around each other in a room. A small yellow car with a driver sitting on the right side of the vehicle. A skateboarder, holding a skateboard in front of the camera. A bride and groom cut the cake on their wedding day. A truck driving down a road along side of train tracks. A stuffed teddy bear sitting on a green bed. The famous Suzuran Street in Tokyo during the day A street sign for Rodeo drive is seen in close up. A kitchen with a sink, mirror and window. An older man surfs in the large waves. The two young children are playing with a plastic chair. Three laptops with faces on the screens on a bed. A group of boats sitting in a water cove next to some buoys. A Dell laptop on a desk is surrounded by cords, books, and papers. A blonde girl in green shorts playing tennis. A man is sitting on top of an elephant. A white and orange train traveling down train tracks. Two trucks with workers in the extended baskets. A man wearing a snowboard is standing on his head. American airlines commercial jet sitting on a tarmac. A yellow fire hydrant sitting on the side of a road. A woman holding a plate with a pizza on top of it. A person is holding a banana that is dressed in a costume. An old black railroad car parked on the tracks A man that is standing on a tennis court with a racquet. A cupcake that has a ribbon on it. A young man standing in front of a white plane that a young woman is standing in. Many skiers are on the snow covered mountain side. The glare of the sun cuts across a wave and a wet-suited surfer coming in on the tide. A woman with a bat hitting televisions that say Comcast Doesnt Care. 
A traffic light monstrosity shaped like a tree sitting in a parking lot. A little boy flying a kite up in the sky on a beach front. A herd of zebra standing along side of a river. The tower of the building has a big decorative cross on it. Bottle of red wine and red wine in a wine goblet. A crowd watches a player pitch a ball in a baseball game. Several people waiting for the train to arrive. A man is next to a boy on a surfboard catching a small wave. A man stands in a train station as a train passes Two horses in grassy area with fence and house in background. a clock in the center of some plants and bushes Several men are all trying to catch a Frisbee. A woman and a little girl approaching a train on the tracks. A toaster oven that is heating up on a table. Many animals sit on the beach next to the ocean. A person riding skis across snow covered ground. A bus is going down the road at night. A person on the street with ear phones neara parking meter. A laptop computer sitting on top of a table. A horse that is standing in front of a carriage. Two people wearing jeans sit on a bench with their legs crossed. A sign with a button for crossing on a street corner A herd of sheep with a man standing next to them. A bunch of animals being held during a competition. The man is holding a teddy bear wearing a hat and scarf. Statues on the second floor of a building, sitting below a clock. A man his holding his cell phone overhead. A yellow traffic light hanging over a city street. Crowd of attendees among colorful display on banners. An woman across the table puts her hands over her mouth and nose A motorcycle full of gear parked on a gravel road. A boy is on a tennis court carrying a tray of balls. a kitchen next to a wood floored living area Two elephants with their trunks raised are at a log rail. A tray that has various plates, with various foods. Sandwich and greens on a plate with a glass of water. A bathroom vanity with a his and hers sink. 
A woman with a blazer on has her hand up to the side. A chair sitting at a fire hydrant near a road. A person who is wearing glasses holding food in their hands. The two people are talking about items on the computer. A home office with a cat sitting in the middle of the desk. A granite counter with a plate of food and a drink. A chocolate cake with chocolate frosting and zebra top A person that is in the water having some fun. The jet airplane is parked near a field of tall grass. There is a double decker bus that is red and beige a laptop on a desk with an extra keyboard A kitchen in industry with empty everything A group of five posing for picture on skis. Four older men sitting on a wooden bench. A picture strip and a pair of blue handled scissors. A kid with a large umbrella on a street. a group of zebras under a huge shade tree in the middle of a grassy field A farm along a river overlooks a wind turbine. A man wrings his hands while observing a tray of pizza outside A kitchen sink sitting under a kitchen window. A boy in blue striped jacket playing with a toy. There is a mirror and trash can and a mirror with two cats nearby. A bunch of goats are eating out of a box A woman smiling while holding an open umbrella. A little boy stands outdoors on a rainy day with a pink umbrella Hotel room with a pair of beds and a sliding glass door. Two giraffe's in a pin, one walking, one standing still. A Seattle Mariner's baseball player is up to bat at a baseball park. A woman standing on a bridge holding an umbrella. a close up of a keyboard on a desk THIS IS A PHOTO OF A BLUE MOTORCYCLE a close up of a person cutting a piece of cake A vintage clock from the 19th century tells the time. a kitchen with counters a door and cupboards The white head of an animal sticks out from a field of green grass. A woman in a red jacket sits astride a white horse. The people sit at the bar next to the motorcycles. The children are getting ready to enjoy a piece of cake. 
People on a motorbike near a vehicle loaded with food. a man on a pitchers mound lunging forward delivering a ball A giraffe walking near a tour vehicle in the grass A person spoons macaroni and cheese into a bowl. A very big messy bed filled with many items. A plate of food with various items on it. A refrigerator plugged into the wall of a kitchen. a brown and black acoustic guitar and an orange frisbee A zebra follows another zebra through a park. A large vase contains an assortment of flowers. A man in a wet suit is surfing in the ocean. a person holding onto a partially eaten donut hole A cat that is sitting on a motorcycle. Blue commercial airplane getting loaded at the gate. A man riding a motorcycle on a race track. A dog's face is partially showing and being blocked by something. A shot of a baseball player about to throw the ball. There is an old yellow train coming down the tracks Two vehicles cross under several street lights at night. A cat on a toilet seat in some dirty washroom. A group of people stare up at something out of the frame. Meat and mashed potatoes smothered in gravy with peas and carrots and bread A series of two pictures with a small dog wearing a fruit hat. a man goes down the street on a skate board A large elephant standing next to a pile of dry hay. A woman and a horse standing in a corral. The picture is full of many suitcases with tags. A woman walks across the street at the intersection. Image of a bedroom featuring a modern style bed and other furniture. A man wearing skis and holding a handle leans toward a sandy plain. A group of men in bathing suits next to an airplane boat in the water. A man is standing outside a store at night time. People stand by a truck near a street filled with vehicles in a city. the bench is completely covered in snow so is the tree Closeup of row of yellow hats and baseball mitts. The small refrigerator holds several different types of drinks. A wine glass with wine next to a wine bottle. 
A man riding a motorcycle down a road with a POW - MIA flag. a number of people standing flying a kite A police officer on a motorcycle patrolling a protest. some bowls of food, one with broccoli, the other with some chow mein noodles Two people, a woman wearing a hat and carrying a paddle, and a man, both hold umbrellas. A white plate topped with meat and two types of veggies. A dog lying down on the beach. Two girls holding Wii remotes and nunchucks while standing up a woman is holding a tennis racket on a court The motorcycle is sitting beside of the people. Headless statues show of clothing beneath a colored background. Public transit bus traveling past brick large building. An intersection during a cold and foggy night. A couple of people riding on the back of an elephant. A man and two dogs stand near a park bench. A woman is jumping on a hotels bedding. A golden vase filled with flowers on top of a table. A few pieces of pizza sit on a skillet. A rusty bicycle filled with mangoes and bananas. A red fire hydrant sitting on the side of a road. A motorcycle is parked on the grass while people look A very colorful old style train engine on the tracks. a toilet sitting underneath a big window The man and the girl are flying a kite at the beach. Two double Decker buses on a two way street. The street signs are clearly visible for all to see. A male chef holding up a knife in a cooking area. a small kitchen with stainless steel appliances and a large window A white toilet sitting in a bathroom stall. A cat standing on a woman's shoulder in a bathroom. A teddy bear cake with a candle and sparklers. A passenger rail train leaving the train depot. A man swinging a tennis racket at an outdoor court. A white-and-black cat sitting on top of a laptop. A snowboarder is doing a trick mid air. Four beautiful women in red posed around a motorcycle. A camper brushing his teeth standing on a stairs brushing his teeth. 
a close up of a bird flying thru the air with people in the background Plate of food with green vegetables on top of bread. Business is slow at the local bathroom sink shop a close up of a person bending down feeding a dog A large brown wooden fence near a wooded area. A skateboarder skating up the ramp at a skate park. A stainless steel stove that is in a kitchen. Two sumo wrestlers and referee with people watching. A man sits in a chair and pets a furry dog. A small clock sitting on a bedside table The dog is in the car with his head out the window. A sole person sits in the front pew of a large church. Two guys talking while standing near a parking meter, A white bowl filled with a caramel chocolate dessert. A flock of birds are flying near a body of water. A lady sitting on the bleachers looking at her cellphone. A bare kitchen has light wood cabinets and counters that appear to be granite. A person riding a skateboard and doing a trick in the air. Some food and bread on a plate on a table. A piece of cake on a plate with cream filling next to a fork. A smiling woman pressing her head against a mans head. A large white church with a bus outside some brown and white oxen laying in some dirt and cars A large living room with a kitchen in the background A table covered in fresh produce and a book called "edible San Diego." A group of three women standing around each other near surfboards. A person with feet propped on top of a desk. A bowl filled with food sitting next to two pieces of bread. A toy train set with flowers and house a desk with a cross on it and candles A mixture of random tools sit on a metal tray. A girl is hold a new white and black racket. A laptop seems to have the infamous "blue screen of death" on the desk. suitcases sit on a dressed up stage and bags on a dressed up table A modern motel room features oak storage and casual accessories. A laptop sits precariously on a desk, with a second keyboard in front of it, and windows behind it. 
a cooker and an oven well cleaned in a kitchen A small white bathroom with a colorful tile accent. People at a gathering with some hitting a beach ball into the air. A surfer prays while standing on his board. A chef is in the kitchen wearing a white apron a doll sitting by a plate with a sandwich and fries on it a male in a white shirt riding a bicycle and some signs BEAUTIFUL SCENE OF THE RIVER AND ALL THE BUILDINGS FROM THE BENCHES a church with a tower and a clock built into it A black motorcycle sits on a paved surface. A city street filled with lots of traffic. Chicken and broccoli are in a skillet on a stove burner. A cat sitting in a leather office chair. a person in uniform riding a horse The young woman is looking at her cel phone. A young soccer player is preparing for the kick. A group of young children sitting around a long table. A white plate and metal fork on a plate of food Mom cuts the birthday cake for her daugher Two giraffes standing by a pole in a grassy field. A giraffe sticking its head over a fence. A view of two computers sitting on a desk, with a man on the cell phone behind them. This is a display of teddy bears and snow globes The surfer expertly crouches to finish the ride. A small boy with blonde hair sitting in a rocking chair and holding a baseball bat. People standing in an over cast ski looking out to sea with surf boards. A smiling blue-eyed boy toddler chewing on a plastic object. A bird sitting on the branch of a tree. A cellphone with a strange rainbow screen saver A smiling shirtless man laying on a bed. Young men are gathered together while enjoying drinks. A toilet, sink, and shower are located inside this bathroom. Students in a classroom watching a lecture on television. A man sits on a bench and plays his guitar. giraffes, zebra and bulls in zoo habitat together a living room with an orange couch and green decorations A lot of people are sitting on the bench. An elephant that is putting something in its mouth. 
A street with many buildings is lit up at night. A woman on a phone with a book with peoples photos A large platter full of colorful food product. Three zebras standing next to each other with heads together. A couple of people standing in a room. Older woman and two young guys stand against the fence posing with tennis rackets Two old ladies with rackets playing tennis at the court a person skiing while holding onto some wires Ostrich in enclosed area next to a giraffe. Giraffes statue displayed in indoor room at commercial business. An elephant is standing in front of his food at a zoo. An old brick clock tower with a metal roof It is very dark in the room and there are pillows on the floor. A woman walking on a sidewalk talking on a cell phone. The building is a piece of art. A guy skateboarding on a street at night. A blue painting dominates a living room with a brown coffee table. a person riding a motorcycle on a city street A stuffed holiday bear decoration in a garden. View of the underside of a jet airplane passing overhead. Lettuce, a knife and tomato slices sit on a cutting board. A lady is sitting in a restaurant while talking on her phone. A living area with various furniture and a bicycle. Grown men playing an indoor soccer game on turf. Two vans are parked next to each other. A table has two plates of desert on it. A man and two girls sitting at a restaurant table An unmarked van with trailer in tow is pulled over. A small teddy bear with a pink bow sits of a bed A soccer goalie unsuccessfully jumping for the ball Some flowers are in a clear sealed tube A sign on the side of a building. A cat sitting on a shelf in a refrigerator. A lady is walking along side a blue train. A red bench in the middle of a city street. A black motorcycle with a gargoyle painted on it. This is a modern living room with great natural lighting. A toilet sitting in a bathroom next to a scale. A white coach travel bus sits parked on the street corner. 
Two people skiing a snowy trail lined with trees. two cow grazing in a field with a tree beside them An egg sandwich and other food on a tray. teddy bears dressed up in clothing sitting on a loveseat together Snowboarder and skiers on a bright sunny day. A man is talking to a horse which is inside a fence. A woman balances her surfboard atop her head on the beach. A man in a "nun" costume riding a skateboard in a parking lot. Three people standing at the waters edge on a beach with a blue surfboard. there is a apple and two oranges and a stuffed animal on the bed Many different fruits and vegetables are laying side by side. This is an image of a cat sleeping on a table next to houseplants. A police car next to a pickup truck at an intersection. Looking down at a cup of coffee and a piece of cake A child with an umbrella walks down a store aisle. A small white dog tucked into a persons backpac a couple of people are playing with a flying disk A close up of a woman eating a hot dog on a street. Two people are playing the video game while the others sit at the table A sign on the side of the street with religious meanings. Wooden pole in sub urban area with intersection and trees nearby. A fire hydrant is in front of a wall which says Fire Hydrant. A tennis player in action on the court. A man riding skis down the side of a snow covered slope. a baby wrapped up in a blanket laying next to a brush A couple of plates with sandwiches on them sitting next to an open can of spam. A girl is eating a piece of pizza. A child is on top of a boogey board in the water. An orange sign that says the right lane is closed ahead. A small baby with a kite and other people playing with kites. A man gestures over a microwave as he leans on a chair. pair of women standing on sidewalk at roadway pedestrian crossing area. The little league player swings a bat at the baseball. People chopping cucumbers while a third person watches. A large umbrella open wide on a pole. 
A woman standing in a room holding a Nintendo Wii game controller. there is a train that is about to go through a tunnel a truck by the water with a boat attached to the end of it A plate of sliced bananas, melon, and orange slices. A pie with a fork and knife place setting and a bottle of beer to drink. A stop sign and fire hydrant on a grassy corner A person sitting in a chair in the living room. A donut sitting on a plate next to a cup of coffee. A slice of pizza on a paper plate. A modern residential bathroom with a shower over the tub four wooden benches under the shade of a tree in the park A large bird swoops over the waves of the ocean. The plain is taking off from the airport. a public transit bus on a city street A city bus traveling down the street next to a truck. a person riding a horse close to the water Elephants are hitched at this post like horses in an old west town Crowd of people at outdoor gathering on grassy field. A woman rides on the back of a prancing horse. A horse peeking out from behind a hedge A little girl in blue shorts standing on a tennis court. The tiny bird is flying next to the flower. A group of men standing around a batting cage. A bookshelf is packed to capacity with books. A man with his hand on his skateboard as he is about to come down a ramp. A dog laying on a couch in a living room. A clock tower next to a building with a painted mural on it. A broadcast editing room with numerous video monitors and audio mixing stations. A double decker bus going down the street. A boy does a skateboarding trick next to a building. a person in a field flying a kite A picture of a sun that is over a street. a man is wearing headphones and eating food A yellow and blue motorcycle parked next to a stage. A cat stares at a television, which is turned on. One zebra lays in the dirt while another walks away. Large buses and cranes on the wet parking lot of a commercial building. Two zebras in an enclosed area during the day. 
Calico cat sprawled stealthily in the grass in an alert manner. A man and woman posing all dressed up. a close up of a cake on a plate on a table Lunch plate with grilled sandwich, carrots, cheese, bananas, and lemon. An old fire hydrant sitting outside in the grass. Two women at a long table working on some urns. The motorhome is parked outside the red brick house. A person surfing on large waves in the ocean. a couple of people that are under a umbrella Two people sitting on a ski lift, one posing for the camera while wearing a colorful hat. A man and a woman standing next to a table fulled of lettuce. A herd of three zebra standing next to each other near two giraffe. Chopped and sliced ingredients atop a cutting board next next to a bowl partially filled with grated cheese. a large building that has large clock on it A smiling woman showing off her pizza topped with olives. People are standing in a field under British flags. A red bike in front of a statue and cannons A woman in tennis whites playing tennis on a professional court. Two men in black aprons stand in a kitchen tent area. A baby sitting at a table with a plate of food. A very cute dog with his nose in a big red circle. Three elephants, one a baby appearing to be holding it's mother's tail, in wet land, but arid hill in background. A street sign gives directions to numerous major streets. A hotel room with a bed, chair, desk and an end table. a person sitting on a bed reading a book A man riding skis on a snow covered summit holding ski poles. A big sign in front of Lake Kawaguchiko. a young boy holding a baseball bat with a baseball helmet on a male tennis player in a red shirt is playing tennis A man putting his time card into the time card machine Two boys with their faces painted hold stuffed animals. Bathroom counter with lighting on over mirror and sink. A giraffe standing in a field by some zebra's passing through. An orange monoplane is tied down on the tarmac. 
Rainy camera showing a car driving down a street. An octopus vase with three roses in it A young boy is playing with a red soccer ball. People prepare to fly a kite with an image of an American President. A bus that is sitting in the street. A cellphone sitting on table with papers in the middle A man on a surfboard is riding the wave a couple of people that are standing in a field A man stands by his bicycle with long horn handles on the sidewalk of the beach. A boy holds a cellphone up to the camera. A tall elephant standing next to a man next to other elephants. A young girl throwing a softball to a team mate. a close up of a building window with a sky background A double-decker bus is going down the street. Toddler boy sits on the stairs holding a tennis racket. A black and grey dog in the passengers side of a truck. Stainless steel fridge in the kitchen of a home. A baseball player that has just hit the ball. The train looks as though it needs to be fixed and washed. A kitchen and a living room are situated next to each other. A girl sitting at a counter with a piece of pizza. Two giraffes that are together in an enclosure. A man in a suit does a dance pose near a young child. A boy on a skateboard going down a rail. A black and red locomotive sits on the tracks. The woman in the black and white dress has a colorful tattoo. A solitary man walks through a crowded parking lot with his striped umbrella. two giraffes sitting on the grass outside of a stone enclosure. A skateboarder making a big jump in a parking lot. Two trays of pizza are on the racks of an oven. there are two small bears embracing each other A man in a suit talking on a phone A small child at a table eating some food. A herd of sheep on the side of a road with trees to the side. A young man is doing a trick on a skateboard at a skate park. The cow is all alone in the brush. 
Bench sitting on sandy area with lighthouse structure in background A horse is pulling two people in a carriage on a street. A man sitting in a chair in a kitchen drinking a canned drink. A steam train parked next ot a 1950's commuter train. A picture of someone's dinner. Steak with carrots and greens on the side, on a green plate. A train that is sitting on a track. A baseball player pitches on a dirt floor. There are people that are flying kites in the air A family of elephants standing in a watering hole A bowl has fresh fruit and a toy fish. A black bird standing among blades of grass. The stop sign is near a fire hydrant on the neighborhood street. there is a large piece of food and a knife on a cutting board An office desk with keyboard, monitor, mouse and lava lamp on it. A small bathroom with a toilet that has buttons on the side. A man standing in a carriage hooked to some horses. A woman is laying on the bed with her feet in a suitcase. A very spacious and well organized kitchen witha wood floor. Bird cages with birds in them inside a pet store A view of the ceiling of a kitchen with several light bulbs. this plane has two large fans on its wings A crowd of people shopping for fruit in a farmers market. A group of people riding boats in the middle of an ocean. A group of zebras and giraffes standing by a bus. A red truck parked in a parking space. A group of people that are standing in the snow. A sign warns of a 350 fine for honking a horn. A colorful bird is perched on a branch. a field that has a bunch of people flying kites A child with a backpack underneath an umbrella. a picture of a large clock tower in a city. A spot with a few materials that is agreeable. The motorcycle riders are taking cover from the rain. Three slices of tuna lie on a plate with garnishes. a man is wearing yellow and blue in skis in the snow A zebra leaning over to eat some hay in a field. 
A giraffe standing in the grass and bushes, next to a bare tree that has one bird perched at the top of it. The clocks are on display in the room. A group of people standing on top of a sandy beach flying kites. A man wearing glasses using a laptop computer. A man is standing on a kitchen counter painting the wall. A sign shows various directions through an intersection A person in a wheelchair walking a dog looking at a horse SOMEONE HAS THERE FOOT ON THE COFFEE TABLE WHILE WATCHING TV A surfer is in the ocean riding a large wave. two brown animals and one is laying down the other is standing It is raining, a male jumping and so happy to take this picture A residential house next to some trees and a field Nicely decorated train has a red smokestack and gold trim This is an image of a pug chewing an empty water bottle. A sign for Madam's Organ Restaurant Bar hangs on the side of a building. a black and yellow bus driving down the bus with a double decker bus behind it . A young child sitting in front of a TV watching the Flintstones. a toy set of a bear sitting at a desk A man standing in front of three toilets in one bathroom. a few boats that are out in the lake Personal toilet in a portal potty in a very confined room. two men standing next to each other one on the phone . Plane seen on the horizon above the boats A person in a dry area with a sail high in the sky A couple of trucks and a car driving down a highway. An x-ray machine in a hospital next to a bed. A skateboarder is partially kneeling on his skateboard. The huge delivery jetliner has three turbine engines. A colorful lady flying a colorful kite on a sunny day. A child sitting on a horse holding a flag on a field. A plate with fruit and nuts and cookies. A woman is holding a young girl up to look at a horse behind a fence Small girl laying down on top of a board on the beach. A woman is hitting the ball at a tennis match. A woman hits a tennis ball with a racket. 
Two tall television monitors are next to chairs and desk. A collection of painted boxes stand in a courtyard. A street with a wall with graffiti and plastered paper. a toilet next to a sink in a bathroom A small herd of sheep stand still in the snow. A person is on a skateboard performing tricks off a wall. a beautiful white bathroom with one huge mirror. A party with people, some in costume, standing around something not shown. Lots of crew people in a large building working on an airplane. Two donuts are on a plate on a desk. A young man ridding a skateboard down a rural street. a close up of a cat laying on a dog laying on a bed There are airplanes parked in a lot at the airlines. A giraffe is caged inside a building at a zoo. a person in mid air on top of a snow board Blurred view of an intersection and metro area. Several cows standing in the grass near a few buildings. A sandwich, carrots and strawberries in a lunch box. Laptop computer with keyboard and mouse displayed on white surface. A man in grey baseball uniform swinging a bat. The boy is wearing a suit and a tie. A boy is skateboarding on a city street. Skiers waiting to ski on a busy mountain slope. A table topped with a pizza next to a salad. A man with a drink stands by a woman in a white hallway at a house party. A beautiful woman holding two skis while standing near a wall. A muffin on a plate with a cup of tea. a blue vase with blue flowers on a sink counter top. A person on a motor cycle poses on the road. A picture through a porthole of a bike on the boardwalk. Motorcycle riders are approaching an intersection by a bridge. a street a fence people cars and traffic lights a man departing a bus onto the street and another man standing next to the bus from the sidewalk. A man holding a motion controlled video game controller An over ripened banana and a cup of coffee. A dark alley with an umbrella in it. a orange sponge cake, with something square around bottom. 
A little boy chewing on a tooth brush that is still in the wrapper. A wall mounted grandfather clock mounted to a wall. A man and a group of kids on a field. an image of a professional baseball game being played An airplane landing strip area and apron area with several planes parked on it. A giraffe and five wildebeests roam in the Savannah. A herd of zebras is running through the grassy landscape. a long body of water lines with boats and trees. A large bed sitting inside of a bedroom next to a lamp. Small children holding up white controllers on a couch. An old sign hangs on an old building A bird sitting on a hand that has a glove on it. A plate with three doughnuts on a table. Three woman holding vegetables outside on a cloudy day. A motorcycle parked outside in a parking lot near the beach. An adult talking to child while cross-country skiing. The man is racing his horse on the race track. The dog is being fed with a banana. A cake shaped like a stuffed and roasted chicken. A man in a blue shirt with a red beard, laughing. Many people are sitting under black and white umbrellas. Two cats laying on the floor playing with toys THERE IS A DESK TOP COMPUTER ON THE TABLE A man swinging at the ball in a game of tennis. a person sitting on a curb operating a cell phone a keyboard an orange and white cat a desk and a monitor The mirror is near the view of an ocean beach. A large brown dog walking next to a wooden table. there is a male baseball player that has swung for the ball There are two street signs on the pole. A stop sign is on the side of a school bus. A bunch of scooters sitting a room with themselves. A girl in boots on a skateboard and a man teaching a boy to ride a scooter. A man is swinging a tennis racket at a ball four different pictures of men making homemade pizzas A white bathroom with sink, toilet and tub. A young boy walking through a living room towards a cat. A homemade focaccia is ready for the oven. 
A man with black hat and glasses holds a cup with drink A small child is cooking in the kitchen An enclosed shower with a window and bathtub. A row of kites in the sky and girls are walking on the road. A large commercial airplane parked on the runway A desktop and laptop computer sit side by side on a desk. Three Starwars action figures playing in a blender. There is a little dog next to the driver. A man is turning on a fire hydrant. a yellow and blue train riding a track by some trees An elephant with its calf standing inside an enclosed area A herd of elephants are by the water. a row of skiers skiing on a course A truck carrying a golf cart follows behind a motor home. A group of people out enjoying a trail ride on horseback. A shiny metal train is traveling down the track in front of a sport's stadium. a person on a small boat in a river A man wearing a toothbrush for a moustache. Several people sitting at a table working on their laptops. A small table set with fruit and drinks in front of a wide window with brown chairs. An old fashion with a red truck with someone walking towards the front. A bear stands in front of a large fallen tree. A man taking a swing at a tennis ball Two dogs and a cat laying in a big bed. A man in the middle of a busy city street displays nearly the same colors as an approaching Volkswagon bus. The building has a large clock displayed on the side. Two people ski down a large snowy hill. A close up of two doughnuts on a plate. A group of people ski down a hill A large giraffe standing in a grass field. A young boy wearing a powder-blue baseball uniform poses for a picture of him holding a bat. A woman is watching a girl ride a horse. A white plate topped with mint angel food cake. A plastic cup filled with two tooth brushes and a tube of toothpaste. Four skaters in speed suits are racing down a curved street. A bathroom with a white toilet next to a shower. A black and white image of a bird flying over the lake. 
A kitchen is well lit by three hanging pendant lights. A back of a truck with doors and two windows. A spinach pizza sits on a plate next to a class of wine on a table. People mill and gather about a vintage military airplane. A group of children with frisbees are standing in a field. Skier skiing down a hill near a guard rail There is a little boy standing in a base ball uniform The skateboarder has fallen off is the board. A silver BMW motorcycle being posed for a picture. A man puts his feet on a desk with a laptop, a PC, books, and work papers. Someone flying a kite while on the beach. Man holding a tennis racket and ball on the tennis court. Small boat moving along water with orange objects hanging off end A dump truck that is driving on a dirt lot. Two giraffes standing next to each other under a group of trees. The bananas on the tree are not ready to be picked. Two horse pulling a wagon with a load of hay with children on top. A toilet is sitting in the grass by the trees. This is a picture of three buses parked together. A woman standing at a bus stop with an umbrella A young lady holding a black umbrella in front of green bushes and trees. a couple of giraffes walk next to some trees A very steep snowy hill filled with skiers and a lift. A group of people flying kites in a blue sky. Two children sitting on a couch eating food off of plates. A RED NOSE PIT BULL PUPPY SHOWING HIS TONGUE. A few people are off their surf boards in the water. A shopping center sign right by a road and a big red building. A plate with a cupcake on top of it next to an orange. A kite with happy pictures on it is flown on the beach. a person in a field with a plane shaped kite a person on a surfboard riding a wave A waffle iron, and the ingredients for waffles are displayed. A woman riding on the back of a motorcycle with a child. Floor level view of woman with dark stockings and high heeled boots in crowd. A cardboard garage sale sign stapled to a post. 
An elephant performing tricks on a stool in a circus. A cat standing near a dead bird with some words on the picture A group of men playing a game of soccer. The windmill is sitting in an open field. A man in a suit talks on a cell phone. Series of clocks with lights in them on a city street. A woman and a man are cooking food in a kitchen. A man is standing on the sidewalk talking on a cellphone. A chicken sandwich with tomato and lettuce with onion on the side. a bath room with a mirror and a sink Several people standing outside in the evening, some carrying umbrellas. two street meters attached to the same pole on the road Five adult sized giraffes grazing in a field. A boy on a body board with a surfer standing in the water behind. An old teddy bear stuffed into a iron railing on a balcony. A man with a large bear wearing a brown hat. Two pizza rolls on a tray with a sign up A man holds up a small banana in his hand. a hat on a table near a cake A motorcycle rider is near a crowd on the sidewalk. A green city bus pulling out into the street. a group of zebras graze on some grass next to an antelope A view of a bunch of birds flying around purple flowers. An individual is in the open view in the picture. A group of zebras standing close together . A photo of a bedroom with two beds. A man in riding armor poses in front of a motorcycle. two small children playing next to a fired hydrant and holding a balloon A smiling man in a striped shirt playing a video game. a line of skate boards sit in front of a wood plank The bathroom has a wall sink, medicine cabinet, toothbrush holder, and bare walls. An instructor is teaching the little girl how to surf. A crowd of people walking down a street next to tall buildings. A giraffe standing in a dirt filled area. The train is crossing the bridge by the water. A suitcase sitting next to a bottle of champagne. A lady is observing three other people in the background. 
a kitchen that is empty with just a sink and some wine bottles. A pizza with no meat overflowing from a plate. Horses, a pony and sheep all grazing in a green field Large collection of cakes shaped like hearts on a display. A toddler stands next to a No Trespassing sign. A white plate topped with meat and veggies. A Frito Lay delivery van parked outside in a parking lot. An assortment of donuts on a plate. A person is standing with their foot on a skateboard. A little girl sitting by a bunch of bananas A dalmation dog sitting in the drivers seat of a bus a tall and old brick building with many windows A soccer team in purple is watched by a crowd. a woman and a little girl with an orange shirt standing on a skateboard A woman is cooking food at a restaurant. A cat is sitting on a pink chair near a computer. an image of two people that are each holding kites A cook standing in a kitchen in front of two bowls of food. Two horses roaming the fields during the day. A young man riding a skateboard down the side of a ramp. A man sits down around the bunches of bananas Two giraffes in the trees, one standing up. A red fire hydrant with a motor scooter in the background. Fresh vegetables and smoked sausage on a bread tortilla. Man in black business suit on street corner. a bathroom with red walls a shower a sink mirror and toilet Two people wearing life jackets on a watercraft. a cat playing halfway under a straw hat There is a huge crowd of people in an area sitting on the grass and watching. Large clock on post displayed near overhead display of commercial enterprise. Three cats are relaxing on a tile floor. A flock of birds flying over water and sand with a volley ball net on the sand. A white sink sitting next to a toilet. On a wide street are people walking, on bikes, or in trucks. Two older men that are preparing a table full of great eats. A number of people moving about on a snowy ski slope. A hitter, catcher, and umpire playing a baseball game. 
A white jet sitting inside of a hangar next to other aircraft. A trio of elephants stand in front of a watering hole. A man in a t-shirt flying a box kite A man in a blue jacket is traveling on snowshoes through snowy woods. A white bowl filled with rice and vegetables. a child and another person a refrigerator and a silver cup a yellow pink white and green vase and two other vases A man taking a selfie with his smart phone. a man with a bat swings at a baseball A woman standing between a motor bike and a striped wall over a river. A large clock mounted to the side of a building. A small child is lying in bed with a baby. A man is on a laptop at a table The pitcher is starting to deliver a pitch on the mound. People lined up on the sidewalk with pizza boxes laying in the snow. a bride and groom are cutting their wedding cake a bunch of bananas are on a table Short rain as view from above either from over view mountain or air craft. Seven doughnuts on a wooden plate over a doughnut pan. two people riding horses on a city street A man this is putting a bowl inside of a microwave. A cat is lying on top of several shoes. A few friends are gathering for dinner in a restaurant. This is a nasty bathroom located in an undisclosed area. a group of people sitting close to each other all using cell phones A junk pile of broken porcelain toilets in front of a wall with graffiti on it. Seven suitcases, stacked on top of one another, in front of a booth. A large donkey standing in the middle of a grassy field. A red bus diving past a fountain in a city square. Bald man in black and red shirt playing baseball. An analog clock set in a class case. A LARGE AIRPLANE THAT HAS LANDED AT THE AIRPORT A sign attached to a light pole on a street. A man talks to a plane full of smiling people. Small child signing a document next to two men. A small child holding a piece of broccoli up to their face. an iced birthday cake with a number candle on a table with a pink tablecloth. 
The kitchen with green oven atop white tiled floor. A airplane coming in for a landing with a full moon above it. three men sitting in a row eating a sandwich A cute child is dressed up standing by a door. A table with plates containing an assortment of cold cuts, cheeses, and vegetables. A man flying a kite in a parking lot by a lake. A boy riding a skate board down a stair rail. A group of horses grazing in a green field. Pens, scissors, markers and other assorted clerical tools. A man taking of photo of himself in a mirror with a cell phone. The white devil slavemaster puts a bat in the young black girl's hands and trains her to attack Mexicans on sight. A train rolls down the track through rural tree lined scenery. A man dressed in red riding a horse through town a plate of bread and a bowl of fruit Two black and white horse standing next to each other with gears. A man holding one frisbee and throwing another. A small breed dog looks up while laying on a couch. A man standing in front of an elephant. some people on skis go through the snow as people watch Several empty boats floating on the river on a cloudy day. The view of a clock in the distance of a building. A motorcycle parked in an intersection with cops on motorcycles going past it. Various foods are sitting on the large and small plate A person pouring batter into a donut maker. A row of horses tied up on a rope rail. A man kneeling over a laptop computer on a table. An old fashioned styled kitchen has a microwave. A woman covers her face, as a kitchen, flowers, and a laptop computer are also visible. A gentleman in a suit is standing near a wall. A car is driving down the road near some road signs. A man in uniform standing next to another man wearing a suit. A dark room with a tv playing spongebob squarepants A bus is on its way to the station. a person is standing on a skateboard outside A double decker red bus is driving down the snowy street with the headlights on. 
A person's hands are opening a laptop beside another person a close up of a clock on a pole on a city street The yellow fire hydrant is at the side of the road. Plate of food that includes chicken, beans and a pickle. A couple of people riding horses with Saint Patrick's attire on. A man is in the middle of swinging his bat a boy is looking at a train made of candy Two buses driving by people in a city. A street with vehicles, pedestrians and detour equipment. a black and white photo of a building clock and people and trees Simple bed in room with pair of nightstands and lighting. A black dog standing in front of a door. This is the outside of a building with chairs and benches present. A KITCHEN WITH A STOVE AND LAP TOP An old style cook oven with multiple pull out compartment Man skateboarding on rail in front of a building. a picture of some vegetable meal and a plate of what looks like chicken and a side bowl of rice and curry. Man with a courier bag on a mobile phone on a crowded street. Calico kitten lying on a backpack on a wood floor. A table set with pizza and a bottle of coke. A zebra that is standing in the grass. Woman with dark hair in a multicolored bathing suit is flying a kite. A small bathroom has a sloping roof with a window. A set of five train tracks in front of a graffiti covered wall. A herd of elephants walking across a grass covered field. A girl by the side of the road selling flowers. Man and woman in airport lobby saying goodbye. A decorative church has several rows of pews. Diners at a cafe overlooking a sandy beach. A crowd of people standing under a clock ina train station. A big cute black dog in the air with a disc. A group of friends siting at a table enjoying pizza. Four cows eating grass on a sunny day. A woman getting food from a tray with fruit, cereal and juices. Two people on horses ride through a field. One large sheep and small sheep next to it in a dirt ground area with a stone wall structure next to them. 
A child with a backpack looking at a polar bear. A stuffed bear that is in a backpack. A red double decker bus is riding down the road. A man in a wet suit walks across a crowded beach on a sunny day. A table filled with a big bunch of assorted veggies. A barber with a big mustache trims a man's hair. A plant in a glass vase sitting on a window sill. there are old cabinets in this kitchen along with a microwave Fruit flavored donuts lined up in a glass fronted cabinet A sandwich ,pickles and cookies are for lunch A person on a snowboard in the snow. A boy in yellow shirt playing a game with a Nintendo Wii controller. Two goats on the road surrounded by trees. A man holding the reigns while riding a horse. A toilet sitting in a grass yard out side. Several giraffes wander around their enclosure at the zoo. A herd of sheep stand in a snowy field with a cloudy sky in the background. A view out a bus window of people riding bicycles. Young girl acting silly in the waiting room. A table full of food and chair with no one there. A plate of ries and a drink is sitting neatly on the table. Some goats are looking up at the camera. The skier is competing in the winter Olympics. the ball is coming toward the batter and the catcher is ready a close up of an animal with something over its head A basket filled with donuts covered in powdered sugar. A young boy in a red shirt flies a kit high in the sky while a girl in a t-shirt watches. A policeman roller boarding in the street with another man. A skiier is preparing to ski on a snowy hill. A young male laying on top of surf board. A woman making some food inside her kitchen. Lots of luggage is lined up on the sidewalk of a busy city. The kitchen is clean and ready to be used. A metal sculpture of two birds and two poppy seedpods. A male skateboarder skateboards on a wall in an enclosed area a pizza with a bunch of tomatoes on it. a woman getting ready to hit a tennis ball with her racket The dog lays down to scratch his itch. 
A giant Amoco sign sitting above a gas station. A man in a fuzzy hat is talking on his cell phone. an image of a bowl of tomatoes and a flower A dog is standing on the sandy area. A train track scene with one train on the tracks. Some fruits and vegetables and a ghost are in an orange container. A young girl is sitting on her bed, talking on the phone, with a laptop on her knees. Man in a uniform talking on a phone at a work desk. Small stuffed toy rests on leg of teddy bear. A woman standing next to a red and white truck. the toilet is white and the cabinets are brown in this bathroom A person holding a phone to their ear and working on a computer. A man holding a tennis racquet on a tennis court. A herd of sheep grazing on a hill next to the ocean. A pizza with meat, cheese and tomato sauce. A train traveling down tracks next to a rural country side. A pitchers mit with a ball inside laying on some bleachers. A group of young men sitting on steps in front of the ocean. Airport security drives past airplane on the runway An umbrella laying on the ground next to benches. A white counter top topped with a ripe banana and three coasters. an edited picture of the same boy doing several different tricks on a skateboard A large clock with a red second hand is attached to a modern building. Truck on an urban road hauling a lot of corn. A boy and girl play paddle ball in the grass An room that has been broken into smaller work areas by a divider. A modern jet liner taking off at the airport A laptop computer on a shelf above a stove. A young child riding on the back of a brown horse. A train engine on the tracks with a side rail beside it. Looking up at a stone and brick clock tower A woman is taking a picture of herself in a bathroom mirror. This decorated cake has a horse with a fence on the top. A tennis player is making an effort during a match. Some small boys standing near a floor drain on pink tiles. A boy riding a skateboard in the street. 
A traffic light is red for people on horses. A baseball player slides his body into home base. A taxi van in the street with pedestrians, by the corner of a building. A man partaking in a water sport in the ocean. One surfer riding with the wave in the ocean, and another surfer on his stomach riding into the wave. A young girl is playing Wii boxing by herselg. A person wearing combat boots sitting on a kitchen counter. Two women with open umbrellas walking down a street. Luggage including a trunk and a guitar stacked up by a wall Group of zebras standing in a fenced in area with shade. The man wearing the animal puppet makes it cut the boy's birthday cake. A bathroom single sink vanity with a large mirror. This is a cityscape of a skyscraper in front of a large mountain. Group of skiers posing for photo on foggy day. Large green truck parked at the outside of stadium with group of people walking past The cutting area of a sewing room containing scraps of fabric Many cats lounging on a couch in front of a window. The truck sporting graffiti is parked on the street. a blue and yellow bird is sitting on a branch A bird sitting on a branch next to some berries. A beer can and mug are shown with a rib plate. A man stands on his skis on a flat patch of snow near a fence. A stop sign with grey paint over top of it. A bus being loaded with bags of luggage parked in front of a building. A person that is in the grass with a kite. People going up a snowy hill on skis. A train is blowing steam as it stops at a train station. A pan on top of a stove with pizza dough and tomato sauce. A concrete bench is in front of the water. A person on a snow board performing a trick on a ledge. A skier has fallen down in the very deep snow. two street signs with one pointing towards the right next to a building. A man guides a dog to herd sheep. A fire hydrant in front of bushes with a glass face on top of it. Red bus coming down a street next to a red cab. 
A women wearing a tennis outfit, swinging at a tennis ball. A painted fruit bowl with different fruits in it Two giraffes are eating leaves from tree branches. A man riding skis down the side of a snow covered ski slope. Fruit and vegetables are cut up and placed in small containers. A computer desk with a computer and three monitors and a black chair sits in between them. A man is swinging a baseball bat at a game two zebras standing next to a tree a woman wearing a helmet and holding onto a baseball bat four poster bed and bedroom furniture in a bedroom two big chairs sitting close to a fireplace in a living room A collared dog standing between two potted plants A cook dishes a stew from a pan onto a plate. A couple of guys that are standing in front of a plane. A hipster emo woman sitting on luggage in the middle of a road. A person standing in the snow with a snowboard. An all glass building showing the reflection of another building. A meal containing soda, salad pizza and rice on a table. Display of about 100 vintage wall clocks. A beautiful young lady hitting a tennis ball with a racquet. A bathroom area with three sinks and a towel dispenser. A old picture of a building with many people out front A microwave, bread and rice are on this counter Two buses wait at a red light along a city street. A refrigerator with its door open and contents showing The table is set with 4 boxes of different, delectable donuts The cat leaves paw prints as he seat on the car. A woman twirling a floral print parasol umbrella. Flowers in a vase placed on a table. A woman in grey shirt standing in room next to a dresser. A room filled with dining tables and chairs. A man on skis heading down the slope View of bushes next to traffic lights and moving cars. A table full of different types of donuts. Two dogs sitting in the front of a car. A man sitting at a desk with a cat on his lap. Two people sitting on a bench in front of a statue. 
a living room with a tv a book shelf and plants Two guys on a mechanical lift next to a building . A large bird perches on the seat of a bicycle. A stuffed animal that is laying on a carpet. the train has lots of cars on top of it A Twins baseball player holding his glove walking on the field. trees in fall colors and a stop sign to the right. A group of people on a side street with umbrellas and awnings. A train door opened with passengers sitting inside. Two elephants outside, one being fed, one standing. Window display a different pastries on a city street. A composite image of an office desk, cars and buildings. A family of four is posing for the camera near some flying kites. Kids sitting at a table eating food. a white plate holding onto a sandwich and a salad A man in a vegetable shop holding a green vegetable. Several grassy tennis courts with five tennis players. A woman holds her tennis racket ready to hit the ball. Two children sitting at a table that has two cakes on it. A man wearing a tie holds his chin as he reads a document. service man in uniform throwing a ball on a baseball field A white trash can on a beach under two palm trees. INFRARED PICTURE DEPICTING THE SHAPE OF A HUMAN BEING Young skateboarder on pavement in rural populated setting. A little girl wearing glasses taking a selfie. A clown face made of yellow squash for the eyebrows, cucumber slices for eyes, a cherry tomato nose and a carrot smile. The airplane is about ready to take off on the runway. A man riding on a horse drawn carriage next to a red brick walkway. A red car parked on the street in front of a parking meter. A man polishing a horses' horse shoe while another man holds the horse. A woman holds a tennis raquet during a match. Pizza on a metal plate sitting on table near phone. A girl holds her arms out to a Frisbee while a boy kicks his leg. A man wearing a helmet on a blue motorcycle. A very dimly lit kitchen with a nice window. 
A motorcycle is parked in front of two cars. A man wearing a black suit on talking into a microphone. Two boats floating in the ocean one has a crane on top of it. A picture of a lot of people in the snow. meat with onions and sauce on a plate next to potatoes and broccoli Two children plays with a kite in the field a piece of bread with some vegetables and met on top of it A man and woman getting married on the beach. A guy that is using his cell phone while in a park. A knife and fork sit on a plate with vegetable pizza. The woman in red and black is skiing down the slope. A variety of sheep and goats drinking from a pond and eating. some baseball players playing baseball and people watching two cup like things with a bird and a wolf painted on them A young woman sits at a picnic table with her laptop. A herd of elephants are walking among the desert. A baseball player is batting with a catcher and umpire behind him. See picture of a lot of bicycles in the street. An old suitcase on the sidewalk next to the road. A woman in a blue dress with no shoes, seated with her legs crossed on a chair in the middle of a room. A young girl with a nice booty standing in a living room. A boy in a jacket and tie looks at the camera. An older couple with helmets preparing to go on a motorcycle ride. Boy wearing a helmet riding a skateboard down a street. A young adult looks at a computer screen while doing homework. An adult and a baby giraffe stand gazing over a grassland. Stuffed animals sitting on a counter with cups in front of them. Three square slices of food and sauce at an oriental restaurant A flock of birds looking for food in a field. An airplane is parked at a terminal in an airport while luggage trucks unload the aircraft. A delightful pink frosted doughnut and a cup of coffee. The purple flower with a yellow center is near a car air condition vent. An orange and gray bus parked next to a sidewalk. 
A group of adults standing by a table with wine glasses on it people with their head covered on a motorbike A batter, catcher, and umpire are poised for a baseball. A birthday cake is shaped like a teddy bear. A young child smiling while sitting in the grass. a bird in the branches of a tree A gang of bikers riding motorcycles down a road. A tennis player swinging his racket with both hands to return the ball. Plate covered with french fries and opened hot dog sandwich Sheep are locked up at a farm and feeding people in the ocean standing on water boards and wind surfing A bilingual directional sign to the Hyatt on the Bund. Edible food items displayed on table with receipts. Two cows grazing in a pasture by a stream. A line of baggage in a lobby with several people. A group of men playing a game with Nintendo Wii controllers. 2 towers stand connected, a large clock in between them. A man posing for the camera with a red tie on. A man riding a horse over a red and white striped pole. The zoo visitor is looking at the giraffes. A woman that is standing in the rain with an umbrella. A woman seated looking at her lap top Three different types of clocks propped against a wall. A bookshelf with books and other knick knacks A white plate topped with two slices of pizza. Three giraffes stand in the grass by a dirt pile. A young man is holding a giant sandwich in one hand. two zebras eating grass in a very big field. A small group of Zebras drink water from a pond. an image of a snow piled on the ski slope A fighter jet with missiles flies through the air. A man leaning on a building talking on a cell phone. Four individuals on skis headed in the same direction. The baked potato has sour cream and lots of other condiments on it. A man flipping a skateboard on top of asphalt. A man holding a racquet hits a tennis ball. A close up of a toy squid riding a small bicycle. Four players posing for a picture on a tennis court. A bike sitting on a sidewalk in front of a bus. 
A public restroom with a urinal installed in the floor. a person stretching to hit a tennis ball A street sign that says C have you paid? The zebra stands underneath the branches of a tree. Two people riding on the back of an elephant through a lake. Woman with life jacket and dog in rowboat near shoreline. A living room with wood flooring and furniture. Two Dell mouses that go with a computer. A person standing on a beach and flying a kite. The woman is learning how to use her new ski skates. A man and woman sitting a a table with pizza in boxes, in a room with a piano. A yellow table sitting on top of a hardwood floor with boxes on it. a herd of cows walks down a city street a person riding a snow board on a snowy slope Man with large orange and black kite in park area. A young boy standing on the beach with a colorful kite. A picture of a naked women who is using a laptop. A table is set colorfully with a pepperoni pizza. A black parking meter, that is next to a bunch of cars. Man standing on a tennis court holding a racket. Two people in an open field are playing with a frisbee. Two people eating slices of pizza while riding bicycles on a city sidewalk. A woman using a smart phone while standing next to a building. A jet in the air flying in a dark sky. Two people sitting on the couch with a guitar in front of them. some black and white cows in a green and yellow field A man holding two cell phones in his hands. Passengers getting ready to board a small aircraft. A guy riding his skateboard in a small town street on a chilly day. a man and a woman walking across the street A rusty bench is near the steps outside. Someone is skiing in the cold white snow. A red light that is on a pole. An old elephant with a long trunk at the zoo A man plays in the water at the beach. A skier going downhill with snow flying up. a zebra has its head down in a field Yellow lounge chairs and an umbrella are reflected in a pool. 
A woman holds a tennis racket in one hand and a tennis ball in the other. a public transit bus in a city street a train depot with several trains stationed in it An all way stop sign at the intersection of two streets. A view of some alcohol with a glass filled. A great shot of a mountain near the ocean. A man sitting in an office chair looking at his cell phone. A BIG BATH RUBE IS IN A CLEAN SPACE Men in suits with umbrellas walking through open area. A bird with food on its beak is sitting on a branch that holds a bitten on apple. A traffic light and street sign in a large city. A bike is inside leaning on a white shelf. A graduate wearing a blue cap and gown holding a cell phone and papers. An umpire is catching a baseball that was missed by the batter. A large nicely set dining table displaying a cake and other pastries. Three people posing for a picture inside of a grocery store. The contents of a pantry in a house. Two women are in a kitchen baking together. A city street that has police walking along with people, and some are carrying umbrellas. a bird sits on a wheel next to some plants A messy kitchen that has the drawers open. Giraffe leaning over to nibble buds off a green bush. a boy following a man holding a surfboard in the water A red baseball player sliding into a plate. A man and a woman cutting up a big sheet cake. Traffic light and street light for Belmont Avenue A person doing ski tricks on the slopes at night Two giraffes leaning heads down, one with head in feeding trough a man riding a bike with a cart attached to the front of it A little girl laying down holding a bear and a kitten. an image of a cat sitting on top of the desk area The dog is in the kitchen sink and pizza is on the counter. A close up view of two men in a large assembly hall. A person jumping in the air on a skateboard. A teddy bear wearing a blue sweatshirt sitting on a bed. An outdoor bench sits empty and covered in water. Three cows grazing on a hill overlooking a harbor. 
An adorable cat laying back on a chair while it sleeps. A man sanding on a walkway covered in a long green jacket. A bowl of cherries are shown with a bowl of oranges. A man near a baby elephant by the water. A building with a clock that is on top of it. A white toilet and white pedestal sink sit in the bathroom with newly laid tile. A person spraying water from a hose, onto an umbrella being held by a child. An egg, cheese and sausage biscuit sandwich on a plate A full plate full of delicious food sets on top of the table. A woman holding an umbrella while standing on top of a wooden deck. A girl holding a tennis racket in front of her face A couple of skiers that are at the end of the run. Three people sitting on their motorcycles near a building. A decorative propeller plane flying in front of a wooded area. A small Coast Guard boat meeting a personal boat on the water. A person swings a bat with a helmet on. A man on a trailer by trees with a dog. A man playing a game of frisbee with another man as they gaze into each others eyes with man lust. The plane is ready to board passengers for their flight. A view into a living room containing several pieces of furniture. A guy riding his skateboard down a paved path. a group of people that are sitting in some chairs On a snowy area, a man is holding a young child with skis near several people, sleds, and mountains. A man in blue jeans, has stepped on a banana peal. Two zebra grazing on an open ground full of grass and trees. A airplane that is flying in the sky. A desktop computer sitting on a wooden desk. two man sit at a table in a restaurant A group of people are outdoors playing with Frisbees.. There is someone at a table cutting ie ed of paper A cat plays on a laptop while watching a video. A teddy bear is seen looking out the window. A cat sits behind a person on a green revolving chair. 
a person flying a kite on a beach with a person near by A view of a bunch of seagulls flying around the beach, a street sign sitting between two benches sitting by a sidewalk A man on a motorcycle drives down the street A cat standing on top of a car trunk next to a parked motorcycle. a person taking a photo in a mirror Chocolate cupcake with a monsters face frosted on top. A white Nintendo Wii game controller sitting on top of a table. A herd of cattle and zebra standing next to each other on a field. There is a woman riding on top of the elephant. An empty bed in a bedroom in front of a small TV. A woman is eating a personal pizza with a friend. A girl flying a kite in the sky with her hands. Three stuffed teddy bears dressed in period clothing. A school bus by a crane and truck with a mountain view in the background. A stone oven with many kettle pots, baskets and bowls. A microwave has a container of food by it. The dog is wearing a red scarf and is being petted by the woman with red shoes. Some people on snow boards high up in the air. A large airplane is sitting on the runway. Pigeons gather atop the rails on the lighthouse. An empty park with mature trees and a backless bench. A person is in the air skiing in the snow. A dressed up teddy bear is sitting in a corner. A young man riding on the back of a black motor scooter. A bowl contains a variety of chopped vegetables. Orange cat sleeping on a small laptop computer. One horse trails behind another during a race. A cat is laying on a lap top on a desk. A man and woman sitting on a train using laptops. The girl stands behind the line and waits for the ball. The skateboarder is checking his technique in the mirror. Three teaspoons of instant coffee poised over a mug. A woman pinning a flower to a man's suit. A motorcycle parked in the middle of a crosswalk on a busy street. A display of vegetables is set up in front of a pickup. An airplane at an airport at a jetway. 
A person standing next to a box on the ground A person about to throw a Frisbee in the park. a small plane flying by on a cloudy day Bare feet atop a skateboard on a concrete surface. A surfer is riding a wave in the ocean. an image of a man that is by a bench on the phone A zebra laying down in the grass resting for a while. An adult giraffe places its head on a young giraffe. A man on a skateboard passing a bus while posing for the camera. A baseball player swing the bat at a baseball. Graffiti has become and famous part of the art industry A woman in a bus with cars ahead A cat hangs out in a bathroom sink by a bottle of Method soap. A man in white and green jersey looking at a cellphone. Zebras standing in the shade of a fenced off enclosure. A man jumping a grey horse over three rails. Overly ripened bananas are being skinned into a pot. Two cars are parked across the street from a sidewalk bench. A beach with many empty blue and white chairs with umbrellas The man is enjoying a snack at the park. an image of a man in the water waves with a paddle A birthday party with a cake is being held for a dog. A batter hits a baseball with his baseball bat. A woman stands beside a baby in a high chair a table is set with a birthday cake and champagne. A woman sitting in a chair laughing while another person holds a cellphone up from behind an overturned table. A large bed on a wooden frame in a bedroom. a couple sitting outdoors with some wine glasses A person with an umbrella is walking down a city street. A woman swinging a racket at a tennis ball. A counter is full of platters with different pizzas. A CITY BUS IS ON THE STREET COMING THRU Tow cakes resembling the engine of a train. A dog is in the air catching a frisbee. A woman having a meal in a restaurant and using a cell phone. A long white plane resting on a run way. A down the counter view of a very messy kitchen area A baseball player holding a bat near a ball. 
a clock on a white tower in front of a clear sky A man wearing a neck tie with a golden clock on it. Competitors on skis are racing around the course. Aerial view of a group of people flying heart shaped kites a box with some big doughnuts inside of it Two adult elephants interacting behind some trees and bushes. A food processor with a chopped mixture in a plastic bowl. Hot dogs lay on an orange plate while hot dog buns are on a grill. People are riding down a street on skateboards. Sailing boat tied up to a deck chair on the beach Two large trucks parked next two each other next to the building A young man playing with video game controllers. A Boston Red Sox pitcher stands, holding the ball in his glove at his waist, prepares to pitch to an Oakland A's batter. A baseball player swings at a pitch during a game. A young man walks the beach with a surfboard under his arm. People assembling teddy bears on a table A sandwich on a plate in front of condiments. a person in a field with a dog a green military truck sitting in a warehouse A number of rose flower sticks in a bundle There is a old tower with a clock in the center A luxurious living room with chandelier, bar, and couches A large elephant is shown walking through the terrain. this is a man holding a kite in the air A person standing under an umbrella with other people and lights in the background. a peacock on a wooden table looking for scraps A refrigerator that has items on the outside. A group of elephants on grassy area next to rock and trees. A man stands on a surf board and rides a wave. a small boat on a small body of water Many sheep stand in a large grassy field A man and a woman are standing by the street A female tennis player raises her racket to hit the ball on a tennis court. there is a woman with glasses eating a donut a baby cow standing by its self in the grass A man riding a snowboard down a snow covered slope. A food bowl with vegetable and chicken salad. 
A lunch of salad, fries, a sandwich and a drink. An elephant in an enclosure approaching a body of water. A bunch of birds sitting in a bread basket. Wine and desserts are served on a table. The large kitchen has an island in the middle of it. A tall giraffe standing next to a tree filled forest. a picture of a bunch of train cars colored red. A man is standing partially inside an open refrigerator. The dog is at the dog wanting to get into the house. White dog playing in grassy field with red disc. A pile of fruit sits ina clean bowl a laptop computer sitting on top of a homemade machine with wheels A black cat sits on a bench beside a wooden letter K. Several boats at a pier in a bay ringed by mountains. Three giraffe standing near trees in a grass field that appears to be a zoo. The happy couple cutting their wedding cake together A woman is holding a tennis racket on a court Two men sitting on steps and selling goods in the fog. the small cat is sitting inside a suitcase The little boy is holding an umbrella over his head. A Skyteam airplane taxiing on a snowy runway. A small aircraft is beginning to lift up off of the tarmac. A herd of deer in a field down a hill from a house. Twins are smiling with the same attire on. bedroom with pink patterned headboard and matching curtains A road is winding in the distance in-between trees. A boy in a red hat playing with tee ball set. A motorcycle is parked in front of a cafe. Skateboarder riding in a concrete with a large cross in the middle. A gray teddy bear sits on a doily near a card. A group of people on small bikes on a street. A girl shows a banana to the camera. A table set with wine glasses and plates. Tagged cows are standing in an open field A bird stretches his wings at the beach. Little boy with toothy grin talking on a cell phone Cacti can be seen in a large clay pot. Black and white horses are standing next to each other. Boys are playing Frisbee in a yard. 
Someone's living room contains a bookshelf with lots of books. Some young boys are playing with video games. A tennis player reaching up to hit a tennis ball. A group of people in a room with remotes. A man sharing a hot dog with a black and white dog. a fence that has a bunch of surfboards on it A man and woman that are standing on ski's in the snow. A bird sits on a branch in a tree. There are some vegetables, herbs, and other seasonings and a knife on a wooden cutting board. A stone tower is has a clock on the side. A photo taken from an airplane looking down at the mountains. A man surfing waves on his surf board A young zebra is nursing from it's mother on a grassy plot near some shrubbery and a mountain in the distance. A man shows the screen of his phone to the camera The stuffed bear is next to a toy doll. Several horses grazing in the grass near some hills. Baseball player getting ready to catch ball as many fans enthusiastically watch. A bathroom that is done in checkered walls and flooring. A group of people that are standing under umbrellas. a close up of a cat laying on a laptop An air plane is flying over the roller coaster. a young black man lying down on a bench outside resting a close up ofa clock on top of a shelf people holding a skating pole on the snow Two women make faces as they stand at bathroom sinks. A man lying in bed with a cat next to him. A guy in a blue shirt is surfing. a person riding a surf board with a parachute A LOT OF PEOPLE ARE ON THE BOARD WALK An antique car is parked on a city street next to two others. A group of people loading the back of a pickup truck. Two men in business suits shake hands. A brown horse standing in the middle of a flower filled field. That seems like a very small sink for this kitchen. A group of people skiing down a snow covered slope. A laptop computer sitting on top of a wooden chest. A dog and a sheep separated by by a fence. Several different kinds of vegetables on a counter. 
a woman is sitting outside with a blue umbrella The fridge is full of food and goodies A man playing Wii while others watch An older man stands behind a younger woman sitting on a park bench. A pair of scissors with orange string on a spool leading to the scissors. Two guys on laying on surfboards riding a wave. Large Elephants and small Elephants are walking in a line. Plates assembled near each other with silverware on right. a clock on the wall saying it is 241 in the afternoon The leg of a pair of glasses is stuck inside a clear vase. a white woman in a white tennis outfit playing tennis A person with skis down a mountain in blue pants and black jacket A salad with side vegetables and dressing are positioned on a wooden tray. A man is on the beach with a brown horse. A flat-bread pizza with melted cheese, and a few vegetables sits on a black tray on a wooden table. A bathroom has blue walls and a large mirror. There are many chefs here in this kitchen cooking A man is wiping down the elephant in the water a tennis court that has a man on it Two men with tennis rackets with one racket holding balls. A cat lies asleep in the middle of a mattress. A collection of yellow fire hydrants on the street. two male baseball players in uniform with long hair A pair of scissors, a crochet hook and a sewing needle are ready to craft. People are walking through a subway terminal. A baby girl brushing her teeth with a pink tooth brush. The person is holding a pastry in their hand some big and little bears walking across the street A young boy wearing camouflage sitting in a doorway. A warning sign for high water is on the side of the road. A bus with two levels and a hostess ad is traveling on a street. A grey cat with green eyes and a pensive look on its face. A broken cell phone laying on carpeted ground. A lot of horses grouped together walking down a road . A home with rooms under construction of them A zebra standing next to a tree in a field. 
Four photographs of a woman in denim shirt next to white plate of food. animals grazing on a straw field bordered by water and mountains. a baseball player throwing a ball with a glove A man wearing a hat riding his skateboard in a skate park. Various signs written in either Chinese or Japanese and also a sign of a man walking across a street. A man standing over a griddle in a park. A galley kitchen with white cabinets and fridge and a wooden island feature. A man standing under a ball on top of a grass covered field. A young man with a surfboard is surfing in the water. A person carrying a surf board on the beach. Lots of toasters sit in the floor near an oven. Boats on the water with mountains in the background. A man riding a surfboard on a wave in the ocean. An old, dirty toilet in a small bathroom that is falling down. A fat orange kitty sitting on a black chair a man standing in front of some tall trees a close up of a motorcycle with parts missing A close up of a bicycle parked on a train platform. A sandwich on a toasted roll sits atop a green leafy salad with tomatoes. Dog in the air to catch a frisbee while a man lays on the ground. A giraffe is walking along a paved walkway. An elephant is standing on a cloudy day. A man with a bicycle in a train station walks past it as a train approaches A bicycle leaning against a street pole in the snow A little boy holding a bat over his shoulder Two beautiful women riding horses in the ocean in bikinis. A bathroom sink with a large walk in shower. Kitchen table ready for party with beverage cups, citrus fruit, and alcohol bottles. A child takes berries from a table full of fresh garden produce. A bathroom with blue walls and a pink tub, toilet, and sink. Soup and a sandwich on a metallic plate. A group of people hanging around holding umbrellas. A desk with a laptop and jars and candles. Onlookers watch an elephant stop for a drink of water A train car moving down the track at a crossing. 
A blue city bus putting over at a bus stop. Two individuals posing with funny faces, one holding up a wine glass. A vase with an elephant head holds a bouquet of flowers. an image of two benches in the park Several purple flowers are shown growing with bamboo in the pot. A cat looks down from on top of a dresser. A baseball game in progress with the picture about throw the ball. An old airplane flying above a large city. Cooked broccoli in serving dish sitting on cloth hot pad. A person that is laying on a bed with a bag over his head. A bathroom with a phone mounted next to a toilet. A train travelling above ground near bushes and trees. A clock that is on the side of a wall. Clowns ride an antique firetruck down the road in a parade. A woman stands next to a parked city bus. Man in business suit skiing in the snow Six snowboards are leaning against a red wall. a baseball player getting ready to swing a baseball bat A woman on a surfboard surfing a wave on a beach. Small child in white shirt holding a white controller. A red and white plane in on display in a field. two woman playing tennis on a court in front of a crowd of people banana slices sit on top of toast on a white plate A man is sitting in a boat on a river and drinking a bottle of water. a group of sheep standing around while eating some grass A full view of a nice kitchen and counters. A man is standing among pink and zebra feathers and a zebra. Remains of various deserts are situated on a table. A red fire hydrant standing across the street from two silver vehicles. a man doing a skateboard trick on top of pool A boy in white shirt flying a kite on beach. A man is eating food with a pair of chopsticks. This is a dirty urinal in a bathroom. Carrots are laying on a cutting board with a knife. A woman that is next to a surfboard with a dog. a photo on mountains skating wearing very warm clothes A large bedroom with big windows and a patio. 
there is a black and white dog standing in the bath tub Two little girls playing with a kitchen set. A young man riding a skateboard down a street. A man wearing a white shirt, plaid tie, a grey hat and glasses smiling with his eyes closed. A man leaning up against a boat that is almost finished being built. A group of professionals at a business meeting. Three giraffes eating leaves off cut tree tops. A pizza covered in cheese and toppings on a plate. A tall clock tower flanked by two trees A young girl tasting food from her bowl Sunlight streams into the living room through two windows. A cluster of small boats in shallow water. A crowd watches a baseball game being played. A dog that is sitting down in a backseat. A pair of blue scissors sitting on top of a paper and a container of note cards. A group of planes are flying through the air with smoke coming from their tails. A man eating a slice of pizza next to food stands. A fan is featured in a yellow room. A fenced in area off a sidewalk with posted signs. A white plate topped with salad and onions. A plate that has food on it with a glass next to it. A woman holding a blue frisbee over the top of her head. A man cutting a cake with a knife. View pointing upward of a skyline in a city There are military people serving others hot food White bowl with tomatoes and greens on counter top. A bus with a few bikes on the front A green traffic light and telephone wires This is the sign for the Bart ba building. A bunch of birds flying over some waves. Handmade vases, all the same size but all different colors. A woman reaches down to pick up a video game control. Dog laying on a green sofa in a living room of an apartment. There is a man swinging a tennis racket. a man with a camera is filming some baseball players The little kid is flying a kite on the beach. I am unable to see the image above. A man in a suit and tie standing with a cellphone to his ear. A black keyboard is hook to a cell phone on a table. 
Black and white of two adult zebras from shoulders up playing. A man hitting the ball during a tennis match A man standing in a park looking at trees. an old black and white photo of a man near a plane An intersection with traffic lights and lots of traffic. Small floor model refrigerator, so new it still has its manufacturer's sticker. A crumby chocolate dessert on a plate with a large knife. A dog on standing on a surfboard in the back of a truck. A delicious plate of churro with chocolate sauce. a toilet with a black lid and the tank in the air A herd of sheep grazing on a lush green hillside. A black train engine on tracks next to buildings. a person riding a skate board on a street Two zebras are staying away from the sun as long as they can A stop sign with graffiti about the Red Sox a man handing an elephant a stick in an enclosure at a zoo a close up of a cat on a window sil A man sits on a boat cleaning a fish. A group of zebras and other animals grazing in a field with a rainbow in the background. A plate of green salad and pieces of tomato. A couple of cats laying on top of a brown chair. Little girl walking down a road holding an umbrella. A wooden desk with a laptop sitting on it. A scooter parked in front of the door of a stone building. A giraffe is coming up close to people there is a an standing on top of a mountain A kitchen with and island and several counters in it. A person walks on a bridge with a kite. Three skiers pose in the snow in front of barren trees. A table topped with two bowls filled with fruits. A very large pizza covered in cheese and toppings. A woman and man riding on the back of an elephant along a river. A motorcycle racer leans into a turn during a race. a zebra is standing in its pen and some green plants and grass a white black and brown cat on a table a big bathroom with a sink, toilet and bath tub in it The man in a suit stands next to a woman in a pink dress. Wooly goat stands near gate with others on the other side. 
A book shelf with a large clock on top of it. A man walking his dog in the park. a person holding an open umbrella in some bushes A little girl gets help brushing her teeth. A motor cycle procession down a wet street. Snow boarder sliding down the hill after falling in the snow There is chicken, couscous and vegetables on the plate. A man holding his tennis racquet on a tennis court A zebra and smaller brown animal are running in the grass. A dog crossing a pavement path near motorcycles. The man in the yellow checkered hat is flying a kite. A couple of men riding horses down a street with tall buildings. Boy attempts to hit a baseball with his bat. A BLACK AND WHITE PICTURE OF A MAN SITTING LOOK A person in the water being pulled by a kite. two very tall and white storage towers in a room Looking past a snowboard in the snow to a city beyond An Alaska airplane is reaching up to a greater height. A man swinging a bat as he plays in a baseball game. A child wearing a red helmet holding a skateboard. A man having fun in the rolling ocean waves. a person that is on some dirt on a baseball field A plate several cookies and a small sign on it. A partial view of a formal living room. a big window that has some birds out front A little girl is eating a hot dog and riding in a shopping cart. A grocery store filled with lots of fresh produce. A guy blowing on a hot piece of pizza. Two children reading while lying in their bed A horse and buddy come down the side of a road. A photo of someone's meal at a restaurant. there is one orange laying among five bananas A woman putting icing on a homemade cake. A couple of people at a counter near plates of food. A coupe of road signs near a downtown area or highway. A boy walks along the beach carrying his surfboard. People and buses are sitting still on a city street. A young man holding a basketball on top of a court. a couple of beds that are in one room A dog sleeping on the floor in the corner, a man looking down at him. 
A large group of men are dressed like Santa. A couple of people standing on top of a beach with surfboards. Giraffes and babies are in their habitat in the grass. A blurry image seen through a rainy window of a person holding a light blue umbrella. A toilet and a trash can in a room. Two birds sit on the back of a bench made of logs. A very large bear sauntering in a zoo type environment A teddy bear that has been buried in the sand. A blue and cream tiled bathroom with a stand up shower This table has three kinds of donuts on it. a horse pulling a carriage down the road a man wearing a striped tie holding a microphone A car driving down the street, some people are watching it. A man standing on a tennis court holding a racquet. The man with the umbrella is looking up. A door is opened to the inside of a bathroom. A person on a surfboard rides a wave. An orange and white cat is sitting in an easy chair. A man in chain mail checking his cell phone. a person is skiing down a snowy hill A refrigerator with a microwave on top of it. A green and blue fire hydrant sitting on bricks on the side of the road. A man stands and airs up his bike. a plate with a small dessert and some fruit A table topped with vegetables and a pitcher. A locomotive train on a set of railroad tracks, with tanker cars attached behind it. The woman in the kitchen is tending to her food. a table that has some glasses on it A white basket filled with ripe and unripe bananas. A fireplace mantle has an ornate clock sitting on it in front of a large mirror near a teddy bear. A small round clock atop an ornate old building. A pile of luggage, boxes, towels and other items on a carpeted floor Small airplanes are parked on a grassy field. Two bear cubs are playing together in water A bunch of luggage is on a car in a bathroom stall. a hot dog covered with some chili, mustard, adn ketchup two pans of dinner rolls baking in a large oven Six sheep standing in the grass beside a house. 
A calendar with some apples and oranges and pears in it. a group of people that are getting out of a boat Some very big commercial planes over the water. The man is riding a bicycle next to a train. A brown and white horse standing in front of a red wall. A field of wooden structures in front of a mountain. Horses peek through the windows of a small utilitarian horse barn. An airplane jet flying through the air against a blue sky. On this table there is bowl containing a bottle and glass vase containing rocks and leaves. A dug out filled with baseball players next to baseball equipment. A man taking a photo of an elephant as the elephant stands inside an enclosure. A dome shaped cake that has lit letter shaped candles on it, and people in the background. A young child brushing his teeth in the bathroom. Man playing Wii video game with group in background on couch A man taking a close up picture of a motorcycle. A baseball game is being played in a city park. A small water landing plane is on a lake near a neighborhood A beautiful young woman holding a tennis racquet on a tennis court. Two kid touching food that is on a kitchen counter. A woman holding a tennis racquet next to a tree. A traffic light with two street lights hanging from it's side. A very nice looking dining table by a bright window. A picture of a person in the air on a skateboard. A man walking next to two horses on a dusty road. A whit plate topped with chicken and vegetables. A black and white picture of a lady getting off of a escalator holding an umbrella walking into the city. Several balls of yarn are sitting on an oven top. A bunt cake sitting on a red plate covered in icing. A statue in the middle of a park near trees. A Stop sign is slightly covered up by a tree. A group of people and an official player soccer. A couple of benches next to a street. A kitchen with a sink, dishwasher, microwave and refrigerator. A blue and silver fire hydrant on a sidewalk. 
a herd of big cows on a wide farm The hand is holding an open cel phone. A row of motorcycles parked on the side of a busy street. A computer monitor, a laptop and some other electronics sit on a tan, wooden desk. A little girl is jumping on a hotel room bed. A blue and white bus parked in front of a motorcycle. Person stands and poses with skis next to a ski lift. Take-out food in a basket on a wooden table. A group of people are on the grass playing Frisbee a big building with a clock built inside the top of it Some books that have been piled on top of each other. a person sitting at a bench near a bush People sitting at a table with plates of food and beverages in front of them. A man sitting in an overstuffed chair in a living room. A long passenger train that is going quickly down the track. THERE IS A DESIGN OF AN ELEPHANT ON THE SHELF A pair of scissors with white handles sits on a white piece of paper near several sheets of flannel. Two yellow trains are entering a train station. some female hands holding a sandwich in a car This person is about to eat a banana. A table set for tea reveals finger sandwiches, tea cups and a cream pitcher all on a red and white table cloth. The snowboarder is performing a jump at the top of the slope. A white bath tub sitting next to a white toilet. A banana with a sticker on it, with a person holding it. A cat sitting on top of a bag of luggage next to a TV that is showing a store about Giant Rats. Snowboarder rounding top of sloped edge in ski area. A large type pizza with cheese, spinach, and sauce is on a silver plate. A group of large red birds that are perched in a tree. A vase that has flowers inside of it. A group of people on a field with a Frisbee. Man and woman standing close together smiling into the camera. Large polished black truck sitting in a parking space. 
Densely growing trees and a low fence frame the top part of a shot showing a tight huddle of grazing sheep on a section of sloping terrain with cropped grass and a cat at some distance behind them. a couple of phones that are next to each other A pile of different fruits sitting next to each other in a bowl. A couple of motor bikes parked on a beach. A white beat up bus going down the street . A bird flies over an island area of a river. A yellow fire hydrant is shown on this street. Small child standing in the center of a crowd smiling. A person sitting at a table eating a doughnut. The frame of a bench is metal and the seat of the bench is wood. A large bear walks in front of a rocky formation. A wooden cutting board with several vegetables sits on a counter. A woman is playing Wii with sunglasses on. A red, white, and blue plane is in the sky. A person holding a surfboard while wearing a wet suit near the water. A zebra stands between several small trees in tall grass. PERSON ON SNOWBOARD UP IN THE AIR OVERLOOKING NEARBY TOWN A view of a street corner in the middle of a city. A grey black and white cat laying in a chair. A cute dog lazily sleeps on top of a pile of clothes. Lots of donuts being processed through a machine. a woman and child are looking at an elephant in its pen A group of people in the snow with skis. A flock of birds standing on top of a wet beach. The little girl is blowing out her birthday candles. a kitchen with a small window in it A bus is stopped on a street surrounded by trees. A wooden park bench under a tree with long spiky leaves. Young girl with racket with dog on lap A black pan filled with mushrooms and vegetables. The man is holding an extremely large pizza with a lot of stuff on it. a large window with a city in the reflection A living room has three televisions set up. A couple of men walking along a snow covered hill side. A yellow doorway with a clock above it. 
A bedroom with a picture on the wall and a lamp on the side Many people are sitting at round tables with dinner plates on them. An elephant swinging its trunk inside of a pen. A woman wearing white playing tennis, about to serve. a pro baseball player is swinging a bat a man on a bus and a man looking over his shoulder both smiling A gang of bikers riding motorcycles down a street. Four bowls containing fruits and vegetables arranged decoratively A person on some skis in the snow. a girl wearing a fuzzy vest and a girl wearing a flowered top a toilet and a urinal in a marble tiled bathroom Two people on skies posing for the camera a man that is jumping his skateboard on some bricks A dish features breaded meat, lemon, and broccoli. An intersection of a regulated entrance showing the stop sign A group of four people are riding a ski lift as they ride over the snowy mountain. A living area with a television, coffee table, couch and other items. Two plates with small, rustic looking pizzas on them Five snowboarders doing tricks on the snowboard course. a microwave is sitting on a wooden shelf A mom duck with a big bunch of ducklings swimming down a river. A dog sitting on a couch in front of a table with a laptop remote controls and glass on top. woman taking picture of herself in the mirror A lady in a winter coat talking on a cell phone. Zebra, antelope and other wild animals at a African National Park. Horse drawn chuck wagon followed by Jeep and cattle. Two boys who are playing soccer against each other. a horse is standing near a large lake Teddy bears are dressed in clothing and stand in a window sill A yellow fire hydrant surrounded by pebbles near a fence. A snowboarder about to move down the slope. An elephant sticking it's trunk up another elephants rear end. A LOT OF PEOPLE ARE ON BOATS IN THE WATER Two tennis players sitting on a chair holding racket. 
a blue frisbee sitting on the beach with dog paws next to it A semi oval looking bathroom that is in someone's house. A group of people fly kites over a sand covered shore. Hundreds of sheep walking in the water and a ranch. A person is cutting up some fruit on a cutting board Two young children playing with each other on a bed. A cutting board with slices of peeled apple and a knife next to an apple and apple peels. A busy street with busses and cars merging together. a black and white sign is by the road The man sets up the ball to serve it. Three giraffes standing together inside a fenced area by white buildings. A man and a couple of women sitting on a colorful seat. a person walking across an odd looking pavement carrying an umbrella A yellow train parked next to a train station near a loading platform. A street is blocked of for a festival. A man sitting at a desk in front of a laptop computer. A woman on a beach on a cell phone. A street sign showing the intersection of Beacon Ave and Stevens St. A man with his arms crossed is sitting in front of green couch with remote on it. a lone black and white cow standing on a large field of grass A double decker bus is shown driving on street. A man wearing sunglasses wearing a green shirt. this bathroom has two pictures of dogs in it A piped canopy bed with a wood headboard is dressed in neutral bedding. A kitchen with counter tops filled with lots of clutter. Commercial jets lined up at an airport terminal. The view from the commercial airplane includes the wing and mountains and water. A desk with two computer monitors and a laptop. A pair of adults escorting children skiers up a hill. a fire hydrant on a city street near a pole Several suitcases sitting next to a chair outside A crowd of people sitting around a dinner table. A large teddy bear is wearing a dress. a woman with a nice little suit case A snowboarder soaring above a slope looking out on a mountain range. Two pieces of pepperoni pizza are on a plate. 
A young girl is eating cake with her fingers. A giraffe amongst tall, slender trees in an enclosure A computer keyboard is shown on a desk. A train that is riding on rail road tracks. A man in a shamrock hat is playing a video game. A kitchen scene looking at all the pans of hot dogs and sausage. The bench at the tree offers a respite and a scenic autumnal view of a grand valley A row of elephants standing next to each other. A bathroom with a black and white pattern on the wall. Two young woman walking by a fire hydrant, one talking on cell. a person riding skis on a snowy surface A young child is jumping high in the air. A person on a surfboard, riding a wave and leaning to one side with one hand up in the air. A full view of an airplane taking a shower. A close shot of a BBQ pulled pork sandwich. The skateboarders seem very relaxed as they wait for their turn to ride. Man with broken surfboard standing in waves in ocean. Brown leather couch in wood floored living area. The man and woman are holding tennis rackets. A man wearing a neck tie and a white shirt. Group of young adults eating pizza and drinking beer at a restaurant. A man with a yellow tie and white shirt holding a yellow sweater round his neck. A man standing in the street on a cellphone. A red bus is leaving and some people in the background. Two students are playing games at a party A soccer player kicks the ball in a soccer field a cloudy sky during a day with some overcast A plane with drawings on the side waiting for people to board. A young boy holding an umbrella on a deck A sheep is minding its business near a body of water. Long-haired male downhill skier flying down the slope, negotiating a turn. Several people street skatingstreet luging on a road. A female snowboarder riding down the mountain slope A man with a baseball uniform on with a baseball and catcher's mitt. Two cows are standing on a sloped green hill. A man in suit and tie has a cane and cigar. 
there are six jets flying in formation A group of elephants in grassy field with mountains in background. A bed sitting in a bedroom between two lamps. A person sitting in a chair with the ocean in front of them. A man surf sailing out on the ocean. Simple silver remote being held out in front of a television. a brown piece of cake is sliced and on a brown table A woman is holding a tennis racquet preparing to serve the ball. an air force jet flying with a sign attached to the back of it a group of people ride atop of an elephant A dog sitting on a rug watching television. A tall giraffe standing in the middle of a green field. A pair of men sitting at a table in a diner. A woman sitting on a bench with a bunch of suitcases A baseball game with the pitcher in his follow-through and the batter preparing to swing. A brick building with a clock on the outside. A dog lies in the grass next to a Frisbee. A kit has markers, a scissors, and other plastic objects. A stop sign topped by two green street signs. Numerous parking meters along the side of a street. A bowl of pasta salad with onions and olives. THERE IS A WHITE PICK UP TRUCK DRIVING DOWN THE HIGHWAY A flock of sheep grazing in a big grassy field. An old pickup truck sits outside among other classic cars. a bunch of cows eating out of a food trough This is an image of two bikes on a beach. A large bus on a open city street. air force members consulting near airplanes, while a man is near the planes. A living room with a lighted floor lamp, sofa, wooden coffee table and end table. A fire-hydrant on a street and near a van. A truck with trailer for hauling rolls down the road. A group of people standing in a field flying a kite. Bananas and other fruit on a white plate. Man with teenagers at outdoor setting enjoying food and drink. A very cramped room with a couch and a desk. several old fashion planes stilling in a field. A cat sitting on top of a hard wood floor. A man seated in front of a pizza. 
A man riding skis down a snow covered slope. Carrots, quash, green onions, and parsley all on one piece of paper. A baby boy holding a stuffed bear animal in his hands People sit at a table for a party. A man that is sitting in a train. Some apples and other fruits at a store A man flying through the air while riding skis. Two youngsters in orange tops have catchers gloves and are playing. young boys in uniform playing baseball in a packed baseball field A public restroom with several urinals, a black floor and red and yellow walls A group of farm animals standing in the shade under a tree. A glorious sunny day at the beach and a man sitting on a bench taking it all in. A living room with a fireplace and an artificial tree. A man is surfing in the water in a really big wave. A large long train on a steel track. A large number of people outside near some flowers and a road. A woman is looking at a fire hydrant. Several people with backpacks waiting to get on a bus. A bathroom with a large mirror and walk in shower a small cat watches a cheetah run on television there is a female surfer riding in the water A woman is reflected in a mirror as she works on her laptop computer. A large passenger jet sitting on top of a runway. two tennis players on a tennis court with a sky background A group of people standing outside while some hold posters. A young girl standing on top of a grass covered field. a yellow and brown fire hydrant on the side of the road A plate of food containing a sandwich and a salad. A little girl eating a donut in her left hand. A small private plane that is coming in for a landing. A cat sitting on the awning above a stove A person traveling on a crosswalk on a bike. A gathering of people around a large table eating. A group of people sitting in the snow while attached to snowboards. A large clock sitting in front of a building beneath a tower. A dog sleeps on the lap of his owner. 
some guy standing on a beach with a surf board a couple of street signs that are by some bushes A man with a shaved head lights a cigarette. A man in a pizzeria putting the toppings on a pizza. A girl with a cast on her arm stands in a bathroom. A person holding a red bowl filled with cake. a person riding skis on a snowy slope THIS IS A BEAUTIFUL PICTURE OF FRESH VEGETABLES A cat taking nap on top of a pair of shoes. Two photos are presented white people talking on their phone. The sheep are grazing on the hill side. A fire hydrant is alongside an empty road. A child playing on his skate board at a park. AN ELEGANT ENTRY WAY WITH ARCHED DOORWAYS AND GLASS AND A CLOCK A bowl of mixed fruit on a decorated mat. Red double-decker bus parked on a city street. a few people on horses are riding down the dirt. A slanted picture of a woman waiting to cross the street. a vet is trying to check a dog's teeth A man holding the strings of a kite on the ground A woman holding a baby while she has something in her mouth. A plate that has a glass and food on it. A bookshelf full of cookbooks, bottles, and magazines next to a microwave. A child is leaning out of his bed to touch a gadget. A laughing man is holding a baby with a plate. The snowboarders are taking a break in the snow. a bathroom vanity and shower door with towels hanging on a towel rack A clock clamped inside of a rusty vice. a living room with couch, fireplace, tv, chair, and window Three different vases containing several red tulip blooms. A small bird is perched on top of the branch A baseball player holds up his bat while a catch squats. A young woman walks along the beach near the water. a child and an adult pose for a photo Roses and other flowers arranged nicely in old-timey vases by a shop window. A man skateboarding in a skateboard park while another waits their turn. A man spray painting a fire hydrant on a street corner. Closeup of the head of a white cow on road. A woman who is holding a tennis racket. 
a baseball game with the batter catcher and umpire a person in glasses is using a laptop A cherry pie sitting on top of a piece of tin foil. A dog leans out of the window of a car. A shelf with pileed hats next to a teddy bear. A woman on a cell phone near a man. A Fiji Air Pacific plane is flying through the sky. People are walking around a plaza that has a sign that reads "Spring in the City". Two men trying to get to a soccer ball in a soccer game A man plays video games in a cluttered living room. there is only one horse standing on a large empty field A young lady laughing in a kitchen with a cake in front of her on the counter. The girl is surfing a small wave in the water. A laptop computer and mouse sitting on a table. A group of women cooking and preparing food in a kitchen. A cat next to a box full of lots of trinkets. A person on a yellow motorcycle is turning around a street corner People dresses as zombies boarding a bus at a bus stop. A man and woman holding up cellphones near each other. The brown bench is in the woods The woman is posing for a picture while skiing. A male surfer carrying a white board exiting the ocean. the side of a passenger train at a train station A man riding a surfboard in a wet suit in the ocean. A dog is in a living room lying on the couch. A man wearing a cap, walking alongside a bicycle. A kitchen filled with black appliances and a table. A man playing tug o war with a dog over a white frisbee. A plate of food sitting next to a glass of orange juice. An Olympic competitive skier furiously rounds the corner. A man adjusts his tie as the subject of a graphic. A plate of food including, grilled meat, baked potato, carrots and lima beans. There are some bananas on a dinner table A plate filled with broccoli chicken and fried rice. The view of a crowd of shoppers and vendors at a market. A man that is standing in the dirt with a bat. A person on a field flying a kite. A pretty young lady kneeling down to pet a cat. 
A little girl in a red shirt and blue dress standing on a road. A few skateboarders performing tricks at a skate park. A type of bread is on a plate next to a variety of sauces. The living room has an old style fire place in the corner. A woman standing in the living room with a coach and t.v. A spindled bed sits inside of a wall papered bedroom A baseball player pitching a baseball on a field. A man on skis with ski poles has just descended the mountain. A man riding a wave on top of a surfboard. Giraffe relaxes in the shade in the park A sign cautioning the likelihood of cattle crossing. A group of giraffes is standing next to a fence. A harbor in a city is full of boats. A street filled with blurry traffic and traffic signals. A tennis player is about to hit a ball in front of a crowd. a small airplane sits empty on a runway in the mountains A wooden table topped with four white bowls. Trays of pastries and sandwiches beside a bowl of soup. some people some snow and some trees and one person is taking a picture A messy bedroom with items covering the floor. Two large bags of luggage in a hallway. The train is going down the railroad tracks. A women who is in a field of dirt flying a kite. Three women posing for a picture in a dinning hall. A bench sits between two trees in a flooded area. Two people with bicycles standing in front of a field of flowers. A giraffe walks near the gate as people look on. A group of wine bottles sit next to a glass. A vase filled with lots of different colored flowers. An orange has been sliced in half and placed in a red bowl. A man with a suitcase walking through a crowd A couple of zebra standing and laying on a dirt field. The cars are parked on the side of the street. The steak and broccoli is next to a bowl of soup. A bedroom with windows with bright lights flowing through. A man and a child who are in the snow. Three male skiers standing on a ski slope A man on skis hovers over a series of small hills covered with snow. 
a bird that is sitting on a branch The person is riding on the back of the multi-colored truck. a person wind surfing on a large body of water A small black and white dog sitting on a yellow davenport. a young horse and its mother graze in a field A large group of people are sitting at a long dining table set with plates and wine. This is a long red bus behind another one just like it. a double decker bus stopping to pick up a passenger Two uniformed men posing while holding pastry items. The side of a train showing the entrance and two doors. Three giraffes lounging around in a grassy zoo enclosure. Several people are sitting around a lit birthday cake that is under construction. Small child playing the Nintendo Wii on carpet A herd of cattle grazing on a grass covered hillside. People watching a big blue kite on a cloudy day. there are several bullet trains on the track A woman poses for a picture while eating A water-stained cathedralclock-tower enveloped by various green vines. A large balloon on a beach with a black and white dog looking at it. A small set of silver scissors used with electronics. A cat sleeping in a sink next to a faucet. Three zebras are huddled together in an enclosure. A cake says Happy Birthday with an image of a horse. A large vase sitting on top of a wooden table filled with flowers. A clock sitting on top of a street sign. A baby that is laying down wearing a tie. Several people are getting ready to enter the water for surfing. Sea beach with a bench.Four ships are seen in the sea. A person in snow gear skiing down a snowy hill. A baseball player holds a bat across his chest A desk with art work and photos displayed on it A woman that has a racquet on a tennis court. Some dogs stick their heads out the car window. a large living room filled with a lot of furniture A large, white cow walking through the streets of a small town A toilet connected to a wire, next to a speaker. A yellow bus is driving alongside a small white car. 
a baseball player with a bat on the field A man is skateboarding on equipment specially made for it. A man holding a tennis racket about to hit a ball. A man who is playing video games by himself. A big pretty rainbow over a long empty road. doughnuts stacked on top of each other in a bowl Two microwaves stacked on top of each other in a kitchen on a counter. there is a very tall giraffe standing under a pole some snow skiers are posing for a picture A computer monitor that is in front of a keyboard. A gold clock that is on the table. a woman sits on a bench and talks on her cell phone that is waited down with key rings A snowboarder goes airborne with a mountain in the background. A truck that is sitting in the street. a young boy holding onto a harness for a cow A table topped with a bowl of soup and a plate with a corned beef sandwich. The traditional white sink features two faucets below the mirror.. A man being assisted with a tie by a lady. People are in a field playing with a frisbee. A white laptop computer lays on a carpeted floor and a gray and black with white footed cat is on it. Cattle with horns and red hair standing against a fence. A jumbo jet on the runway waiting to take off. There is a table covered with various displays of cupcakes A living room has guitars, shelves, and a painting. Someone has drawn a face on the yellow fire hydrant. a large black giraffe that is out side by some kites A metal rusted bed frame in a dilapidated room A pile of apples lying underneath a tree on the ground. A white bowl filled with meat and green broccoli. A two floor bus picking up some passengers at a bus stop Racer riding a dirt bike on a race course. A pizza on a pan with a spatula. Many sheep graze in a grassy pasture in a valley. A man standing on the railing of a boat near the shore. Group of people in for a group training session A man is sitting on a bench next a statue of man with dog licking his face. 
Steam rising from a manhole cover in the middle of a street with a yellow fire hydrant in the background. A teddy bear sitting on a bench in the shade An eighteen wheeler with a patriotic paint job sits in a parking lot. A very tall clock tower with weird arches hanging off of it's sides. A todller, a girl, and a man pull a ribbon in the grass. Horses walking through the yard toward a barn. The skateboarders are practicing their tricks on the stairs. A man on the beach kicking the sand. A kitchen with a standard stove top and wooden cabinets A couple of women standing on either side of a man wearing glasses. Two men retrieving their Frisbee from the creek. A sign that warns of speed bumps ahead. Cat relaxing on blanked, appears to be stretching a mobile phone, tv remote, game controller and chips on a blue table cloth Two giraffes are eating grass in the plains. A man that is jumping in the air with a racquet. Two dogs on a bed in an RV. A snowboarder soaring through the air on a sunny day. Two female cows looking forward outside in the grass. Orange seats on a train with Yellow doors and lime green floors. An intricately decorated bathroom with a peacock light lit. there is a blue and silver train that is stopped on the tracks A black and white photo of two birds standing on seaweed. A commuter train sitting at a station while passengers stand on the platform. A person lying on the ground with a suit case on top of them. A fruit that is still hanging from a twig. Two pieces of french toast with syrup on a plate. Several umbrella's and chairs sitting on a beach. A hotel room showing a bed, desk, television bathroom. Girl moving while holding a Wii remote in a living room. A man in a suit and a tie with a cell phone. A kitchen refrigerator covered in various colorful stickers A wide angle view of this hotel suite a bunch of people are standing near a bus Four people are in a room using four laptops. An elephant standing in the middle of a rocky environment. 
A group of children sitting on a bed together A close shot of a plane flying in the air. A blender full of smoothies and two glasses on a kitchen counter. The decor in the house is very elegant looking. A view of a bathroom showing vanity, toilet and shower. A baseball player is hitting the ball on the plate. A bed in a purple bedroom with a wooden dresser topped with a mirror. Some very fancy looking cocktails with fruit and veggies. A round clock on a colorful tower near a harbor. A dog is wearing a Santa hat for a portrait. A yellow freight train is traveling on a track A man and boy sit in chairs and enjoy breakfast. a close up of a street sign with trees in the background A young man with a skate board standing in a graffiti covered area. Home library area, bookshelf in background with several laptops, notebook PC and two VDUmonitor and keyboard for desktop in foreground. A man sitting by produce while another man points to it There is a plane sitting at the airport. a covered table with fish on a table A woman sitting in a bathtub wearing a bikini. A young child in a snow outfit and goggles with skis on in the snow. A bear climbs through some plants and onto some rocks. A sheep standing on a green grass covered pasture. A pile of books sitting on a table underneath a clock. Four men carrying a long board that narrows at the ends A stainless steel microwave with something in it A red fire hydrant on a city street. An older photo of a woman on a tennis court posing with her raquet. Japanese food of meat and vegetable are on a plate. People at a table with food and wine. A young girl with a helment stands on a skateboard. there is a red stop sign and a white truck behind it Young girl on large grassy field attempting to fly kite. A group of people playing with a green disc in a grassy field. A cat laying down in a bathroom sink. Two animals are standing on a mound of dirt. Group of seagulls flying around a fishing boat. 
Man and woman walking over a bridge in the rain and high wind. A couple of vehicles that are sitting in the street. A train driving past a building pouring out black smoke. A birthday car with a picture of a black bird on it. A young man running along a beach next to the ocean. two little kids playing soccer battle over the ball A bride and groom cutting into their wedding cake together, Towels on a towel rack of a bathroom and a towel mat on the floor. A close up of two time expired parking meters A bedroom with a bed, nightstand, windows, and dresser with a television atop it. MAN STANDING IN GRASS WITH LOTS OF MOUNDS AROUND AND A FRISBEE COMING TOWARD HIM A runway that has a jet plane and a truck on it. Zebra walking on road and other animals on grass. A light post with a no parking sign posted on it. A man in a tie and shorts standing outside of a house window A surfer stands on their board as another surfer watches. A dog laying next to a large brown teddy bear on a wooden floor. A pizza that is cooking in an oven. Firefighters gather around a badly burned moving truck. Two adults and one dog standing on a snow covered road. A cat that is curled up on a laptop a airplane that is parked on a runway A man wearing pink underwear is sitting on top of a stove door looking surprised. A tower with a clock on it's face stands in the sky. A table topped with broccoli, apples and other produce. A public restroom with focus on three urinals. A pile of tiny sandwiches without crusts sits beside a pile of crusts and various sandwich fillers. A train approaching a station where people are waiting to board. A bus sits next to a tree and sidewalk. There is a pizza that is on the table in the room A man taking a swing at a tennis ball A group of people watching a boy skateboard. A man on a cellphone using a water hose. The young woman is making a face at the horse. Four giraffes are in a grassy area with several trees. One boat sailing next to one canoe in a body of water. 
Girls competitively playing Frisbee in a green field Doofy young man shares his umbrella with an Asian woman. hounds running in front of a horse must be a fox hunt A man flying a kite on the beach next to other people. A bathroom sink under a mirror and lights. a train on a track above a body of water A person looks at the camera while holding a black cat. a stack of suitcases out on the street Boats are moored near a city that borders a large body of water. Kids playing tennis on a clay tennis court. animals in a field of tall grain near a tree Tennis player returning volley during match play on grass court. Giant dolls sitting in giant beds next to a man wearing an orange safety vest. A person covered up in warm clothing sitting on a bench, with two bags next to them. A bathroom sink designed as a bowl next to its reflection in a mirror. Woman walking on train platform as train filled with passengers prepares to leave. A commuter train going through a tourist area. A dog with a pink object in its mouth. some zebras standing on a hill while eating some grass a car with a cargo full of steer and symbols painted on the side. very many teedy bear with their price label Hamburger on a bun with ketchup and onion. A guy riding a surfboard on the water. A person holding a toothbrush to their face in a crowded room. A person on a skateboard riding next to a road. there is a small bird that is standing on the branch A couple walking in the snow while under a purple umbrella. A male on a snowboard on a rail in the snow as five time-lapsed stills in single image. The fire hydrant has been made into a fountain. A green candle and a vase on a table with one chair A giraffe is standing near a fallen log. A cat sitting underneath a wooden stool next to shoes. a track moving on the road with two people A plate of pasta and bread sit next to a beer bottle on a table. A dog is sniffing a chew toy on the floor. A living room filled with furniture and an old fashioned TV. 
A lady in a blue life jacket skiing. A couple of large gray elephants standing next to each other. A bird sitting on snow covered ground next to a statue. A yellow hazard sign sitting on the side of a road. A grilled hotdog with mustard and relish is sitting on a white plate. A cat staring at a camera laying on a floor next to a shoe. A small kitten is playing with the tv set A large cabinet in a corner next a picture. A living room with an chair and large couch sits in front large bookshelves with computers on top. A road sign that says reduce speed for motorcycles. A man riding a skateboard over a stone block. The man is holding a pink iced doughnut. A fire hydrant covered in leaves sitting in front of a tree. A black dog sleeping on a yellow and white striped comforter on a bed. A view of a kitchen with a very elegant look to it. Three people riding skateboards down a hill next to grass. Local fresh fruits and vegetables displayed for sale in a market a man and woman stand in front of a cake A bathroom sink and shower separated by an open doorway. chopsticks holding broccoli and noodles in a white dish Women are selling bagged and fresh bananas under a colorful umbrella on a street corner. A giraffe that is standing on all fours on a dirt surface, in a fenced in area. there is a bird that is sitting on a branch A man sitting on a horse while rubbing him and kids are rubbing him also. A plate topped with rice, broccoli and meat. There is a dog sleeping on a couch in a cluttered room. A baby holding a busted up umbrella whle sitting on the ground next to a pile of garbage. a woman sitting on the back of a pink scooter in the road Two sheep stand in a field with mountains in the background. A small room with a television screen monitor. A cute little girl eating a hotdog almost as big as she is A Wii remote and nunchuk that someone's hand is holding on to. A woman posing for a picture in a kitchen. A large pizza sitting on a counter next to a glass of beer. 
There is a man smiling with a banana in front of his mouth A boy in a chair with a teddy bear dressed in a railroad outfit. A large group of people protesting outside in a parking lot on a sunny day. two people walking in an open field with a sky background A desk with two computers on top of it a man is at a snow slope jumping with a snowboard A red and clear small glass filled with candy on a desk next to a green plant. A man and woman are preparing pizzas on a table. The girl is running through the grass in a costume. A market with a variety of fruits and vegetables. A young person ridding the waves on a surf board. a truck is parked with some rafts by the water A white cat sticking his head out through something. Two women with loads of green bananas on dirt ground. A wooden swing hangs above plush, green, grass. a female in a black jacket is riding a brown and white horse The train is travelling down the tracks of the road. People are using a boat to travel through a flooded town. A plate of food with pasta, mashed potatoes and broccoli. Two girls involved in some sort of a game. a group of zebras standing around by a fence A large tall tower with a clock on top. A traffic light sitting on the side of a road. people sitting in the grass with some of them chekcing on their cell phones A woman is about to hit a tennis ball. A man and his child holding onto their skis. A man in a shop that sells bottled liquid tapes up a paper bag A CARGO AIR PLANE IS PARKED ON THE RUNWAY A man looking back while standing in a market below a clock. A boy stands on artificial grass holding a Frisbee. A female professional tennis player dressed in white. A large white stove sitting against a wall in a kitchen. The baby girl is sitting in a high chair playing with broccoli. Three pizzas with nontraditional toppings, a statue and a bottle of wine. The young boy is sitting on the couch playing a video game. A subway sign at night beside Big Ben. A woman holds a Weiner in each hand. 
An old photo of four men in a boat with a bicycle two people using clear umbrellas that have fringe on them A man standing in front of a kitchen counter using a laptop. A zebra chews a flower in a fenced in field. A painting of a horse drawn carriage traveling through the country. This wall oven has just cooked a homemade pizza. Crowded market street filled with pedestrians holding umbrellas in the rain A beaver sitting on top of a tree stump. Two skateboarders practicing their flips on a wooden ramp. Two people in the water on surf boars on a wave. A glass plate holds crackers, cheese, and vegetables. A man pointing at something in front of a bus. A woman sits in the grass talking on her cell phone. A girl smiles at the camera while making candy. A cat standing in front of a television screen with a picture of a fish. A toy model of a kitchen that has a refigerator, stove, oven and baby play pin. Tennis player prepares to play with racket in his hands. A man smiles as he sips wine in an outdoor restaurant. woman wearing a black coat and boots sitting on wooden bench A man cooking on a grill with a fire. Three Zebras and a Giraffe in an enclosed area. The eight lane street is packed with cars in traffic. A women serving a bundt cake with candles to a child. A person in black shirt walking on sidewalk with an umbrella. Commercial plane taking off from a runway with water in the background There are plates of cheese, crackers, and sandwiches on a table. A group of men are waiting for their bags to be unloaded. A zebra eating grass by the barn gate. A person holds a cell phone inside a car. Three birds are looking around while on the ground. A puppy is laying on a blanket with toys. a family sits down to eat at a lighted dining room table This suitcase is full of CD's and apparently they are for sale. Large flowers are sitting inside of the vase. a zebra drinks out of stainless steel tub A guy and woman dressed up for Halloween. A man is playing a game of tennis. 
There is a bathroom with green walls and a white sink and toilet A pair of shoes with a baby kitten inside one of the shoes. A person on a snowboard jumping in the air. A bunch of hot dogs in a bowl with beer being poured on them A giraffe surrounded by a group of zebra in the grass. A monitor and keyboard sitting on a desk. two cows eating the grass on a urban area Some trash sits at the side of the road at an intersection. A group of people watching a woman jump a horse over obstacles. a steam engine train driven down a rural area A man looking out of an airport window at planes. Empty benches in the park after a storm. A group of young children sitting next to each other. A traffic light is sitting next to a pole A very cute little cat standing on a desk. There is a plate with one slice of bacon a half of orange and bread An empty bus is parked on a street. A woman preparing a young boys lunch in front of him. Three well dressed people are standing and laughing together. A group of people looking at stuffed animals lined up in a street. There is some trash is a kitchen sink. a cupcake with a blue umbrella in it An outside table and chairs with a pink lamp. People are standing in a field flying and watching kites. An empty kitchen with wood-paneled cabinets and black appliances. A man is flying a large kite in a field. A man that is standing up holding a surfboard. a girl is sitting on a horse outside A woman sitting in a chair while holding a purse. A giant inflatable shaped like a spiked ball placed on a field. A group of people carrying ski equipment while walking on snow covered ground. Tennis players in action on a court with shadows. a person holding a doughnut up to their mouth A marble table with plates of food and utensils. many people are trying to avoid the sun by holding umbrellas An Emurates airplane flying through the sky A bird that is standing on a keyboard. A white plate topped with two pieces of stead and a salad. 
A group of people watching something with one man looking off into the distance. A woman crossing a city street while carrying groceries. A sign post with signs that read "Maciel Ln" and "Wonder Stump Rd". A bike with a box on it's back wheel is parked A lady in a blue dress is posing for the camera in front of her plate of food. a couple of men that are sitting at a table a baseball player swinging a bat on the field A family watches two boys singing into microphones. Grey and white cat sleeping on a pillow and a sweater A man holding a tennis racket raising his arms up in the air while two women clap. A puppy rubbing its face on a pair of shoes. People that are sitting on the grass eating food. A woman walking down a street at night holding a red sheep umbrella. Two men in suits taking a picture together a close up of many fruits on a table A couple of giraffes that are blocking the path of the safari. A pair of giraffes stand under a canopy together. There is a full view of an outdoor area and it is nice. A small dog standing next to a table with a white plate on top of it containing two chocolate donuts. A group of people are standing or sitting around a table taking pictures and looking at a phone. Motorbikes and other vehicles move along a one-way city street. The little girl is petting the horse in the barn. a blue and white plane is on a runway A cat laying on a desk with a book and laptop. looking up to a clock on the side of a building A person standing on a white square playing a video game. Two women on cellphones laughing with trees in the background. A baseball cap with sunglasses sitting on top of a baseball glove. We are looking at a propeller plane flying in a cloudy sky. A group of sail boats on a small pond. A toilet with many buttons is sitting with the lid up. Child standing in front of a stop sign on a suburban street corner. A metal bicycle on the top of a wooden bookshelf. A lady is standing outside in front of a bus station. 
The plate is piled with rice next to a whole apple. Man stands inside a building talking on his cellphone. A rather large heard of elephants, including a baby. Two wine glasses and wine bottles sitting on a wooden table. A peep hole view of a a man biting a sandwich. This tennis player is watching the ball after hitting it A young woman that is sitting on a couch with her leg resting on some pillows. a yellow and white bird is closing its eyes A local bar has appetizers and tapas to enjoy the game in the background Multiple boats are docked on the water by a pier. A snowboarder lies face down in the snow. A tall stack of suitcases arranged largest on bottom to smallest on top. A child reading as his mother and dog look on. A man wearing a cowboy hat, riding a horse in a parade. A boy laying on a small wooden bench with and umbrella held up over him. A variety of luggage is stacked in a compartment. A group of zebras standing in tall grass A giraffe walking in the grass past trees A giraffe bending down to drink water from a pond. a man helping a young girl walk on snow and ice in snow shoes A lamb that is around a group of people. A girl is posing by something that was just taken out of the box. A jet and a small unknown aircraft are flying in the sky. A spoon and a blender on a counter. A bowl of bananas being placed in the middle of a table. A large black cat sits on a desk near a laptop. Noodle bar near cookie on plate near glass of milk. A close up of a not so happy white kitty. Two slices of pizza sitting on a ceramic plate beside a box of cheesesticks. there is a small glass vase with white flowers in it A person skiing on a mountain, in the snow. Woman laying down across her personal bathroom with her feet hanging over the tub. a cat looking out a window as one sits by the laptop and looks at the camera Trees in a park are in front of some parked buses. A man and woman traveling on the subway with surfboards. 
A girl smiles from the backseat of a car on the phone A young girl standing on a surfboard ride. Two young ladies that are dancing in the room together. A womab looking at herself while brushing her teeth. there is a man wearing a red and white uniform that is at bat A women who is sitting on a horse. This kitchen has a black stove, stainless steel refrigerator and white cupboards. A person riding skis down a snow covered slope. A man standing next to another man wearing headphones. A clock is built high into the side of a tower. There is a man talking on a cell phone. An old man sells a variety of kite string spools. A little boy sitting on a suitcase on the floor. A black and white picture of an overturned truck in the middle of a street. A large billboard that has words in a foreign language. Kids running in the grass after a soccer ball during a game. The back end view of two zebras standing at a fence. A woman is on the tennis court holding a racquet. a large pizza is in a black pan No image is being shown on the page right now. High speed train stopped at a station underground. A boy performs a trick on a skateboard A batter holding a bat at the home plate. A herd of sheep standing on top of a lush green field. An Indian man straddles a horse beside a stone building. There's a sideways traffic light next to a building. A group of fluffy sheep in a big grassy field. A stir-fry wok is filled with cooking vegetables. A baseball player standing on home plate with a bat. A woman leading a horse inside of a building. A parked white, green and red double-decker bus. A kitchen with brown cabinets and plenty of space. Baked pizza with meats and vegetables displayed on table. Steam rises from a bowl of colorful food, while a glass of juice sits on the sill in the background. people on a table at the beach eating a black seat on a white toilet in a restroom Skateboarder jumping down concrete steps outside on his board. A clock tower is seen in front of a tall building. 
A colorful glass vase sitting on a table. A snow covered area with a car with it's brake lights on in the distance. A giraffe in front of the doorway of a building looks around the corner, casting a shadow on the building on a sunny day. A yellow motorcycle is parked on a road with many bystanders Many zebra and one wildebeest on a savanna Man placing a white container into an oven. A stuffed animal sitting in a pizza box with some slices of olive and cheese pizza. A blue and white vase filled with flowers. People are getting ready to fly kites in a park A crooked one way sign pointing into the ground A skier is seen riding down a hill. top shot of a boy sitting on the floor eating pizza A kitchen with all the appliances such as a fridge, microwave and stove. Three riders race around a track on dirt bikes. Two rows of bicycles parked side by side on a sidewalk in front of a building. A tennis player is on the court preparing to swing. A tennis player holding her racket in the air. A bench is on a deck overlooking the water there are two brown bears that are playing together in the water A large clock below a flagpole with a flag on it. a united jet liner loading passengers before take off Vase with water holds a bunch of flowers in front of window A typical living room with all the furnishings. a sandwich laying on paper and on a table A person takes photos of sheep laying down. a white plate topped with a piece of chocolate covered cake. A family of elephants is walking along a dirt road. a person riding a surf board on a wave A bathroom that is all off-white with a mat in front of the shower. A bathroom has a custom bathtub with no curtain A computer desk with an old pc and lots of clutter. 3 adult elephants stand with a baby elephant behind a fence. Two partial pizzas with cheese, olives, green peppers and tomatoes. A couple of zebra standing next to each other. 
A tennis player with both feet off the ground leaping for the ball A kid standing at a table eating some food. A zebra that is putting its head on top of another zebra. An old man is getting ready to blow out some candles. Several buses are lining up on the street. a small dog tied to a bench on a leash a woman is sitting down talking on a cellphone A woman sitting at a table with a glass of wine. A girl in a white dress at the beach with two surfboards. A light brown bear sitting down near large logs. The taco pizza have a lot of olives on it. A table with four bowls of food on the top. A man sits at an outdoor restaurant table eating a soup with chopsticks. A plated filled with a fish, potatoes and broccoli. A fluffy cat laying on top of a white laptop computer. People riding their bikes down the middle of the road. Woman walking with a horse near a standing man. A couple of black bears standing on top of a rock area. A horse in a stall with three people. Three very different giraffes at a big zoo A cat curled up in a shelter made of printer boxes. RED DOUBLE DECKER BUS WITH CARS IN BACKGROUND A steam powered train pulls out of a busy station. A cow is standing in a field in front of a building. An old looking two level bus in a parking lot. A slice of cheesecake sits beside a fork on a plate. a bunch of people watching as two people play video games A man on a scooter sits beside a stop sign. There is a woman holding and playing with a baby. A boy is playing frisbee golf in a park field. White decorated porcelain vase in front of others. A woman at a product show holding a cell phone a large kitchen that has a stove and a dishwasher A couple of cows walking submerged in some water. Young, tagged calf looking through a barbwire fence. A little girl in a pink snow suit on her skis. A train is pulling into the station. A baseball game in progress with a full crowd. Sheep stand outside of a wooden building on a snowy day. 
A makeshift tent is constructed at a camp site. A woman posing with a stuffed bear in uniform. A bus stop with a slightly damaged bench. The young woman is sitting on the bed fixing her hair. a close up of a young baseball with a glove A woman walking her bicycle with dog walking beside her. a shower door a sink a mirror and an outlet sliced orange and a knife resting on the cutting board A large bus that is sitting in the road. Person sitting on elephant walking in a muddy river. A mother bear following her cub across a meadow a person sticking a knife into a cake on the table A man standing with his arms folded while smiling. A woman holds up an electronic cigarette underneath an umbrella. A model of a kitchen with a sink dishwasher stove refrigerator. two small birds on a bench with a blurry background A man on a snowboard caught in each phase of his trick. A man, a lady, and a youth together and enjoying a pizza. A person holds a skateboard and stands on the sidewalk. a group of men holding a long surfboard on the beach A wooden street sign in a residential neighborhood. A skate boarder rises on the crest of a concrete wall Two gentleman in formal suits, one of them is adjusting the collar of the other. A shower has a removable shower head and a glass door. a woman standing at an outdoor display of assorted fruit a close up of a dog on the ground Animals grazing on a lush green hillside covered in grass. a close up of a persons hand holding a large knife A pizza in a box in a drawer of a motel desk where a TV displays "Inspired By A True Story" A red and green bird on a perch eating. An office area set up with multiple monitors Surf boarder finding waves in a river designed for surfing. A glass vase that has dried flowers in it. A man is standing in a river with a cow. A garden with vegetables planted in it Banana bunches hanging at an open air market. A large blue bus parked at a bus stop in a city. 
A pizza with an assortment of toppings such as lettuce and radicchio. A empty bench on a snow covered beach. This man is standing in a kitchen eating food. A modern sink is on top of a bathroom counter top. Little girl with black hat sitting on a pony with two girls beside. A group of people sitting around a dinner table. a cat walking on a floor next to a contruction area Food is shown in a display case at a deli. young child is eating a powdered doughnut at the kitchen table A counter to an office area with an orchid in a flower pot next to a balloon sculpture of flowers. Cars and a motorcycle waiting at an intersection. An empty street and stop sign at night. A clay rendition of roses in a pot are displayed. a person standing on grass holding a large box of pizza A zebra and foal are standing on the ground. There are two boys preparing food on a table. An old picture of a building and trucks outside the building. A gray cat standing on top of a black car. A plate of food and drink on a table. A young woman holding a green baseball bat on a field. A small backyard garden with freshly grown vegetables. a plate filled with pepperoni and mushroom pizza A man is racing a black motorbike around a race track. Man in black and white outfit swinging at a tennis ball on a court. An airplane flying in the blue sky with some clouds. A woman is approaching a tennis ball with her racket. A man flying through the air while riding a skateboard. a short tree in front of a pink wall A group of people at a busy restaurant and a close up of a restaurant dish on a white plate. Some young children are looking at the black device. a street corner in a town all bright from lights A man standing on a tennis court holding a racquet. The hotel room is clean and ready for guests to use. A large bowl of food is sitting on the table. A girl is holding her phone and looking at it. A large body of water next to a shore filled with clutter. Several zebras are walking across a dirt covered area. 
A horses head handing over a iron fence. A close of on an entree containing meat and vegetables. A bathroom has two sinks and a bathtub in the middle of the room. A group of people on the green grass about to catch phrase. A tennis player gets ready ready to hit the ball. Several elephants are walking up a dirt hill. A wet window blurs the image of an apartment building beyond. A group of children playing ball in a field young man catching frisbee right arm under left knee. a jetblue plane sits on the tarmac at an airport A picture of a street during the night. A stuffed animal that is next to some hot dogs. A motor bike parked on a city street. The rhinoceros lays down next to the zebra in the safari. A person sitting at a picnic, eating some food. A man on a skateboard who is performing a trick. A polar bear walking along on icy ground A pizza with fresh mozzarella and basil on top some one with a glove on holding a sparkling drink in the cold A plastic horse standing on top of a chair. A woman unpacks a picnic basket with her teddy bear. there is a cat that is laying inside of a sink A banana and a yellow apple in a woven basket A bicycle parked next to a motorized sitting scooter. Cat laying on top of someones arm while using the computer. A city train stopped at a boarding station. Two purple teddy bears one with pink bows sitting in a shopping cart. An open white toilet next to toilet paper in a bathroom A train parked inside of a train station next to a person. A golden colored Shar Pei dog and a dog of indescribable heritage sitting on dog bed. Teenage kids playing a Wii Game, while others watch. A teddy bear lying face down on a bed on a pillow. A pizza covered with vegetables is on a tray near plates. A dog sits on a couch with a book. A very large elephant standing near two younger ones. a baseball player swinging a bat on a field a bunch of cows lay down on some grass A child eating a sandwich with relish on it. 
a stadium of people watching a tennis game A man is playing Wii in his office. A bedroom with a bed and small tables on each side Two sheep standing in a field against the sky. A man is performing stunts on a skateboard in a parking lot. Fans watching a baseball game on the field outside. people standing around with some holding onto to what look like drums A snack truck in the street in front of a building. A group of men on top of horses playing a game of polo. A person cuts grass in a yard using a small pair of scissors. A red flat bed truck with a load of lumber on the back. People are watching a man cut a birthday cake. A notebook computer by a window with an image of the same window on the screen. two people on a tennis court at night there are two pictures of a small black and brown dog A group of people skiing around a snow covered slope. a man riding skateboard down the side of a hand rail. A kitchen with an automatic dishwasher and window. A snowboarder is seen from below while jumping. Three giraffes resting under a shaded area at a camp. A white toilet sitting in the corner of a bathroom. Home plate at a professional baseball game, batter not quite ready. Some people sitting around at various tables, with a railing dividing them A pizza in a pizza box cut into eight slices. Two men stand besides an elephant and gesture toward a crowd. An old blue iMac with a sad Finder face wallpaper A woman holding a smart phone at a table. A polar bear in water puts his paw on a cage. A woman with a cup of coffee and a donut smiling. a big train that is on a rail way A bird perched on a power line looking at a house A white and brown horse pokes his head out of a stall. A train moves along train tracks in a grassy landscape. A man holds and gestures toward a sandwich a skate board being picked up off the ground by a person A person has a stuffed bear on their wrist. A puppy sitting on top of a sneaker. 
a uellow and blue bus is driving down the street Two zebras are on a grassy brown field. A young man sitting next to a young woman both of which are holding Nintendo Wii controllers. A blue and yellow train is sitting on some railroad tracks. A train bears the numbers 4790 painted on the side. A group of three men standing next to each other without shirts. a couple of people are flying kites on the beach A plate with chips, salsa and a burger, on a table with a glass of beer. A palm tree is on one side with an evergreen tree on the other side and snow capped mountains are in the distance. A small herd of zebras walking past the camera man. A collection of teddy bears bearing Swiss flags a man putting some cheese on top of his pizza slice Chickens are feeding on the ground while horses hover above. Black and white photograph of a man using a cell phone on the street a modern flush toilet in a bathroom with tile. A photograph portrait of a male teen in coat and tie. a motor bike is parked outside on a road A bus driving down a street with a bears face on it's front. MAN ON SKIS STANDING STILL POSING FOR A PICTURE a dog poking its head out of a car window reflected in another car's rear view mirror Two elephants are chained to the outside shed. A toddler has a baseball and a mitt and going to throw the baseball. A wooden object placed next to a tree on the side of the road A lot of building on each side of the road, with a very curvy road in the middle. an image of a boy that is lying under the bed A large yellow double decker bus driving past a guy riding a bicycle. a bathroom with a toilet a stool and a toilet bowl cleaner A surfer wearing flippers skims along a wave Red and yellow train cars hold gravel on a train track. A mattress top on a bed in a small space. A bare loaf of chocolate cake sitting on a counter. An arched doorway leads to a furnished living area. 
a close up of a person talking on a very old cell phone a man is sitting down as a child pretends to be cutting his hair with fake scissors A group of tourists riding a tour boat down a river. several sheep graze on grass near a tree with a protector around it LARGE DIMLY LIT BATH ROOM WITH A DOMED CEILING A picture of the president standing at a podium a close up of a person with a large sandwich A replica sculpture of a baseball player holding a bat ready to swing. A man stands in a screened in area with a cell phone to his ear. A batter prepares to swing at a pitch during a game. Small bathroom with lights on above the sink. A person that is standing with objects in their hands at the beach. Some giraffes and ostriches in a grass field with trees. close up of a thin crust pizza with tomatoes A giraffe's head is framed by the posts of its enclosure. a purse a pair of shoes and a horse behind a display glass a man sits at the table an leans over to blow out the two candles on a cake A man snowboarding near a frozen pond and a tree. a cat laying on the couch next to a remote and a pillow A plate of food including broccoli, sweet potato, and pork. plat bread pizza with BBQ chicken on it A zebra standing on dirt area with fence and bushes in background. A fighter jet is flying through a clear sky. A young boy with his tennis racket in hand is waiting for the ball. A large teddybear float is on snow skis. A picture of a person sitting down under an umbrella. A collection of trunks are piled against a wall. A plateful of meat, fruit, vegetables and bagels Two men and a woman are standing in an elevator. Several people going down a snowy street in skis. A man is taking a big bite of a folded pizza in a cafe. A man is playing catch with two children and a dog. A man standing next to a large brown horse. A towel hanging on the bar in the bathroom. A kitchen has black appliances, wood cabinets, and a large window. A woman stands in a room that has two small beds in it. 
A chocolate dessert slice sitting on a clear plate accompanied by a fork. Horse drawn carriages lined up in the street. A train parked at a train station next to a loading platform. A bike is shown hooked up to a rack. A man with no shirt on a skate board. Elephant standing in an exhibit behind a fence with a park keeper. Two Pug dogs setting on a green park bench wearing harnesses. Two zebra standing on a lush green grass covered field. A happy sun is painted on the building behind the bench. A snowboarder jumps very high above the snow. This is the side of an intersection with a red sign An empty bench looking out over a bay with numerous boats on it. a young child shows off his smile after brushing his teeth A woman riding a horse with lots of purple flowers. A train soaked street lined with lots of street lights. A close-up of two ducks swimming with fish. A computer desk containing a laptop and computer monitor with a printer located on the left side. A red train is traveling undergroud on the tracks. He loves the thrill of snowboarding down the slope. A man and a dog standing on a dirt path in the woods. This looks like a McDonald's in a Chinese or Japanese community. A picture of a plane that is in the air. Two adults and two children sitting on a couch. A green light is shown on this busy multi-lane street. A group of girls celebrating as they leave the field A couch and a television in a room. A couple of boats on the open water. A half full glass of red wine with food arranged on a table behind it. A red umbrella hangs from an ornate stair rail. The orange and white fire hydrant sits on the edge of the street. A man sitting at a wooden desk using a laptop computer. A group of men sitting by tables working on laptops Two people stand in front of a bunch of elephants Two women sitting next to each other on a boat. A cat that is laying down on a bed. A young baseball team sitting on benches together A bowl filled with fruit on top of a green table. 
A little girl standing in a forest holding a black umbrella. A model of a beach front scene showing the parking lot, beach and the sea. Several skiers ski by a direction sign and a fence. two men looking a little boy beside a table A bedroom with a lamp, bed, and dresser. A truck parked next to another truck near a building. Two buses in a downtown area,, near a boat dock a man that is surfing on some water A woman decorating a fancy cake in her kitchen a table with a shake and some fruit on it Skiers at the base of a mountain, one is fixing bindings. A tennis player readies to swing as they await the ball. A man with a hat getting food from the refrigerator The man is on his surfboard in the water riding it. A girl is pulling back a sling shot on her fingers A group of boats sit on the shore line. a person using a laptop in front of a television A group of Zebras grazing in a field. There is a toilet in a bus or plane stall A kid is touching an elephant's trunk near a fence. A photo looking down at a parking area with garbage and old vehicles. A crowd of people standing outside of a bus. A man is holding the waist of a woman as they both stand and smile together and look straight ahead. A herd of animals walking across a grass covered field. A rectangular vase is displayed, surrounded by flowers. A bear sleeping in a tree, with the branches hiding its face. A meal with meats, salad and eggs on a plate, a cup with soup, and a dish with something in it. A display of a variety of fruits and greenery. Two giraffes standing on bare ground in a zoo. A small bathroom with a patterned tile wall. A group of people in a small boat in the water. A group of people is sitting in the living area of a loft apartment. A horned cow standing in a green grass field. The interior of a bathroom with a toilet and soiled floor. A child standing in the snow with pine trees surrounding him. A box that contains a cooked pizza in it. A gray and white cat near a black goat outside of a barn. 
A person playing baseball with foot up in the air. A giraffe head sitting next to a branch. A nigh time elephant parade or show in a street A group of people on a court with a tennis racket. a person skiing down a snowy slope A black and white image of some electronics near a pen and cup of coffee. A little dog balancing itself on a surfboard. The plate has two sausages, noodles, and broccoli. A mother bird sitting with her baby birds. A woman is showing a white teddy bear to a man. A man and baby are holding their arms up while at a dinner table. Two giraffe standing next to each other near a stone mountain. A man riding on the side of a wind sail. a woman riding down part of a snowy hill with a snowboard a small bathroom features a tub, large vanity and mirror. Serving dishes of fruit and cheese sit on a table two large air planes on a run way A traffic light suspended over a snow covered road. a cat is sleeping next to a laptop Man looking at cell phone while on another at a game. a computer PC monitor and a keyboard and mouse there is a man standing in the field with two cows Cute teddy bears with flowers lying around together There are people on an outside platform waiting for the approaching train. A kitchen with a stove, refrigerator and dishwasher. A cat laying in a wooden chair with a patterned cushion. a woman on the tennis court playing tennis ball A man holding a white and yellow frisbee. A person feeding a kitten from a bottle. A young man is on his skateboard going down the road. three baseball players and one is hitting the ball Sheep on a grassy hillside overlooking a river. The guy is skateboarding while walking his dog. A teddy bear with a red hat sitting on a bed with fluffy pillows. A stop sign has street signs crossed on top of it. there are many people that are standing around this building A hot dog sitting on top of a plate with a salad. 
Red train giving tour crosses a beautiful bridge A couple of birds sitting on a tree, with a blurry background. Bundt cake with icing sitting next to another decorated cake. A batter has just hit the baseball in this small-town baseball game. Sandwiches Displayed for sale at a shop by keeper A man in a suit carrying an umbrella walks across a tight rope while a woman in a gown waits for him on the other side. A person takes a picture of people holding different pink umbrellas. The cat is looking inside of the open backpack. Men and women are playing a softball game. A man skateboards up the side of a wall. Four boxes that have pizza on them in a row. The small bathroom has a metal toilet and railing. A colorful public restroom focused on the sinks. Decorative clock with three owls for the framing hangs on wall next to mirror A plane flying low over a snow capped plain A train is traveling down a road with buildings. A clock tower with the American flag on top. A ripe banana is sitting on a table with a cat key chain on top. Happy people sledding down a snowy slope together A wet rain soaked street surrounded by buildings and trees A woman presenting on a computer to a large group Two cats resting comfortably on a double bed. A busy street has many cars parked on the side. A black cup with a spoon sticking out next to a folded pair of glasses. An airport filled with jets next to a parking lot filled with cars. The manager is having a conference with his pitcher and his catcher. Home base of baseball field with an umpire and catcher squatting down, and a hitter bent legged, holding a bat against shoulder. A pizza clock mounted to the side of a wall between two windows. Two kids using an electric toothbrush at Christmas time three zebras in the foreground and wildebeests look around Woman holding a banana over her face in the guise of a smile. A computer monitor sitting on top of a desk. A living room filled with furniture and a window. 
A room filled with luggage sitting next to furniture. Two men skiing on snow in the woods A toddler on counter top eating a banana near the electric stove View down a city walkway and street, with grass, pedestrians, trees, cars on street and parked on side of street, a bench, and some buildings in distance. Small toilet with tiled wall and patterned flooring next to it. A stuffed toy is packed in a bag. a counter with cleaning supplies ice cube trays and racks from a fridge and a drawer missing A man is seated at his computer desk and looks at the camera. Two people on a beach throw a frisbee. Seating area with many benches outside a building. A man in a black cap is purchasing a bottle of Aquafina water at a grocery store. two buses and a streetcar on a busy street Two large horses stand nose to nose in an open field. A modern kitchen with recessed lighting, appliances and an island with a marble countertop. A kitchen with a large stove and hanging pots. A warthog and a zebra running in a grassland. a person doing a trick on a skateboard A bare bathroom with a sink and toilet. A woman trying to eat a donut tied on a string Kitten laying in a brown loafer stretched out A group of planes near a large wall of windows. An apple and orange resting on a table. A red pick up truck parked on a field next to another truck. Several commercial jets lined up at the gates at an airport. The clocks are built onto two sides of the building. a person that is standing on his head with a skateboard there is a red bus that is parked outside The cows are standing on the hay in a meadow. A marina filled with lots of small ships. A skate boarder takes flight on a high jump. A small dog carries a frisbee in its mouth A toilet with a full roll of paper and plunger. a bath room with a toilet a sink and a mirror An umbrella obscures a person sitting outside a store. Two giraffes walking through a fenced in enclosure. Two very large pizzas sitting on top of wooden cutting boards. 
A young girl squishing her body into a suitcase an image of a woman sitting on the bench People sit around low tables eating pastries, drinking juice and coffee. Burgandy colored train coming around the tracks in wooded area A herd of sheep grazing in a grassy field A young girl brushes her teeth with an electric toothbrush. A half eaten bunt cake sits on a white plate. A group of people standing on the side of a ramp. A man is holding a skateboad and a pepsi. a blue and white plate with a sandwich on a wooden table A fluffy cake is on a metal cooling rack. An asian girl taking a photo cuddling with a teddy bear. A fire hydrant painted to look like a soldier. A couple of people with ties in a room. a woman in a black top on a couch with a brown black and white dog A decorated garden with a sheep standing in it. A gold plated Chopper Motorcycle on display at a convention. A fishing troller boat docked next to a lighthouse. On main street is the Wisconsin state fair presented by U.S. Cellular. A boy with his baseball mitt and ball. A street plaza with horse riders and onlookers. A baby is sitting on a potty chair. A gray cat is laying on top of a suitcase. A motorcycle parked in front of a brick building. a person holding a paper sheep beside a busy subway car A bunch of bananas hanging on the tree A hotdog with relish in a basket with a receipt. A kitchen with wooden cabinets and a gas range. Two birds are standing on a very tiny rock island. A Tennis player getting ready to hit the Tennis ball. A man is walking his poodle as the poodle stops to rest against a bench. A bedroom with the drapes open, and a television on. A plate holds french fries and a sandwich. A BLACK CAT LAYING ON A WOOD BENCH a kid in pink is holding a stuffed animal A fire hydrant in a garden on a suburban street A parking meter on the side of a city street. The man in the tuxedo is also bald headed. A man that is standing up with a cellphone. 
A white cat with a brown head sits in the window sill of a brick house. A small dog lies on a pillow near a toy banana. A white, black and green plane cake that is decorated. This is a babes room that has a crib and a small couch and a dresser A bicycler is stopped at an intersection waiting to go. A woman pouring some wine into a glass. a polar bear siting on rocks near a body of water A few men ride on top of elephants while they carry large pieces of wood. A giraffe showing his head to the camera from an enclosed area. A Zebra and a horse are together in the wild. Plums and bananas are in a glass bowl. Two people sitting in chairs under an umbrella in the water A black and white picture of people in a park, flying kites. A water hydrant on the sidewalk with plants nearby an image of the wilderness with a brook A dog lays in a bed and looks a little sad. An adult elephant and baby elephant loving on one another. Afternoon at a dock with seagulls flying overhead. Several white chairs lay on a grassy field while cows mill about them. A yard and cars on the street covered with snow. A small red couch in a living room with a coffee table topped with a flat screen tv A passenger bus that is parked in a parking lot. A woman with her pants pulled down on the toilet. There is a large window over the kitchen sink. A man riding a skateboard down a sidewalk. Two giraffes standing in a wide open area. A person leaning back holding a tether while water skiing. a tall tower with a clock on top with a sky background A little girl in fashionable rain wear is walking under an umbrella. A boy asleep in bed with his Christmas teddy bear A table with crusty bread and cheese platter on it Several horses standing on a hill while grazing. A skateboarder doing a kickflip in a skatepark. A couple of beds sitting next to each other. A giraffe is drinking water from a pond. Several people standing together with a red stoplight behind them. 
Giraffes are standing in an enclosure peering over a fence. A computer sits on a desk with a red chair in a bedroom, Some cute small kids sitting and playing a video game. A young man holding a doughnut sitting at a table. A group of men holding up a bunch of bananas. A man in a hat riding down the street on a skateboard Some cattle are walking on a dirt trail A man in a carnival outfit posing for a picture. A calico cat taking sun bath in a window. A woman holding a plastic utensil passing out a piece of cake. A group of five zebras walking in a grassy area next to a rhino. A motorcyclist stops on the road to allow a pedestrian to cross. Recyclable material in garbage bags are left outside. Toilet with blue rug and blue rug cover saying please do not use. Sorry! A skateboarder plants his board at the end of a bowl. Little boy swinging a plastic bat at a ball in yard. A man holding a racquet toward a tennis ball. a dog going for a frisby with a house and vehicles in the background two men in womens pajamas playing on the wii game a grizzly bear is standing in some grass and brush An old city with canals filled with water. The airplane is flying really close to the tower. a close up of a toilet with a device over it A microwave mounted in a shelf with the microwave door open a man with a tie and headphones sitting at a table A white toilet seat in some lavatory somewhere. A dog sitting on top of a bed under a window. A couple of people in a room with remotes. A cloudy day with two airplanes getting ready for take off. A few people are doing something at this point that is darting. A brown long horn cow standing on top of a field. these are three giraffes on the grass outside A male tennis player jumping and swinging a tennis racket. An orange cake with whipped cream frosting sits on a plate beside a book on the table. A black and white photo shows a man hanging out of a plane. 
the double Decker bus is not in service a couple of skiers on top of a snowy mountain A baby in a high chair at a table. A pizza oven with a baking pizza inside it. Three giraffes walk together across a field with trees behind them. a close up of a bike at a train station The woman is sitting at the table and eating pizza. Man with glasses and a mustache standing in front of a door. This bathroom has a handrail in the shower. PEOPLE WAITING IN LINE TO GET FOOD FROM A FOOD TRUCK A hand holding up a cell phone that is taking a picture. Several motorcycles are parked outdoors facing each other. THERE IS A TOILET IN THE CORNER OF THE ROOM A man and a woman are standing besides a parking meter on an urban and colorful city street. A man and a woman pose for a picture at a party A panda bear rolls around looking ridiculous. A cat has made itself comfortable on the chair. A bathroom that has different posters on the wall. A sign for a pizza place rests on the ground. a couple of birds that are on a branch a yellow and white concrete truck next to a bus A man doing tricks on a skateboard on the street. A large bus and some people on a road. a close up of a cow near a wooden bench in a field an airplane is taking off from the runway a large pizza is laying on a table A girl holding on to a large, white teddy bear. A skiier posing in front of a mountain range this is a park with people flying kites A green passenger bus is boarding passengers near some water. many people sitting at desks near one another a stop sign with the red color looking all cracked A living area with couch, cabinet and many windows. A ham and chili sandwich is close up. A white bowl filled with vegetables on top of a wooden table. a black and white god with blue frisbey Cropped up carrots, onions, other vegetables on a on a cutting board Is this a Honey Dew donut or a bagel? An airplane beneath a cloudy sky flying over a bridge. 
a person cooking meat on a grill Very large TWA plane sitting on the runway with passengers milling about a tall giraffe standing on top of a dirt field. a close up of two people holding a video camera A large elephant walking towards a watering hole. A man is sitting down at a table, eating his stew and tortillas A tall clear glass with a very pretty flower in it. A man riding a wave on a surfboard. A trainer picks up his horse's lead rope. These two cats are playing in a room that has a large TV and a laptop computer. an orange bathroom with a sink toilet and mirror A man is balancing on a skateboard while others ride and stand. an area with snow and lots of skiers and orange cones A man sitting at a table about to enjoy a healthy meal. A man that is sitting in a chair by a skateboard. A group of men and emergency responders surrounding a table. A man wearing blue jeans and a white shirt is on a skateboard in a skate park. A bird sitting on top of a log in a lake. A passenger plane sits on the tarmac awaiting passengers. a bunch of bananas and apples for sale A small black dog sitting inside of a car. A passenger bus that is driving down the street. An empty living room with a charred fire place. A person laying under the sheets watching television. A big bear sits on the ground and grabs on to a guy's leg A view of a airport with people towing luggage. A white refrigerator next to a counter with an orange box. A modern bathtub, with a water hose next to it. a cat sitting by a person using a laptop. An adolescent giraffe near the fence in its enclosure. a close up of a person laying in bed next to a book A man riding a surfboard on top of a wave. A glass shower door near a sink counter. The food is ready to be eaten on the table. A bathroom with a toilet, sink and red tile flooring. People at a ski lift, with people off to the side one leaning down in the snow. A flat bed is on the floor with blue blankets. A herd of zebras drinking from a watering hole. 
A young bear and a mother bear foraging for food. A man holding a Wii controller in his hand. Motorcycle parked on road waiting for train to pass. A brown table holding a vase and three flowers. A cat walking past a bicycle on a rock path. A large group of people are having a pizza party A hot dog covered in toppings on top of a container. Red and white flowers in a vase on a table Man directs two horses on an open field. A kid buying ice cream at a truck An outdoor patio with chairs and tables made of wood. People are typing on their laptops in a room. Someone who is cutting a cooked pizza with a pizza cutter. A little girl hits a tennis ball over a net while a man stands on the other side of the net. Man wearing a blue shirt and pink tie posing for a picture sitting by a window. Woman flying a kite on walkway next to water. A kitchen with lots of black counter top space. A man in an arena rides a bucking horse. A city view with buildings, bikers and walkers. A beautiful white horse pulling a green carriage. A small girl sitting at a table with several foods. A mix of broccoli and other items in a pan. A living room with a brown sofa, chair and coffee table. a living room filled with furniture and a dog A white plate with a small piece of cake and a cup of coffee. A white toilet and hanging towels in a small bathroom. A stuffed teddy bear is sitting on the sidewalk next to a street. The man is riding his horse on the land. People swimming in the ocean on a clear day. A clock tower with a blue sky in the background. A group of people in a wine cellar. An elephant is standing in a grassy field. A train is coming down the tracks near a building. A bus driving on a rain covered street Two laptops sit next to a tv on a tv stand. A desk with three computer screens and a desk chair. A vase with flower on top of a table A man in a vest is eating a banana. A small horse is standing in the grass next to a larger horse. 
A white vase with some cherry blossoms in it A vase full of roses on an office desk very many trains at the railway station to their directions a old train that is on a train track Two elephants stand face to face as if conversing. A man sitting on a park bench holding paper A cake on a plate next to some oranges. One slice of simple cheese pizza on a paper plate. An electronic device that is available for free. A large truck next to people on a scooter. Table top with two sharpie markers and pair of scissors. a close up of a dog laying on a couch An old building with clocks at the tower. Lots of silver and black remotes sit stacked on top of each other. A woman is talking to a man and holding a plate with a piece of cake. Several plates with snacks and sandwiches in a display. three people sitting on a bench holding plates of food Two guys are playing with the wii together a bird is standing in a patch of dirt A white bear is laying out on the rocks A large herd of animals drinking at the water. A bus is parked on the road next to a building. A baseball player slides into the base, as the opposing team waits for the ball. A small kitchen with microwave and fridge. A man standing in front of a shelf filled with supplies. The man grins in a restaurant holding a glass of wine. A black and white photo of a person swinging his tennis racket towards the ball. A snow field outside of a ski resort. A man standing in front of a motorcycle on a driveway. A small yellow bird sits atop a hanging water supply. a bathroom with a toilet a sink and a mirror A cup of liquid with a fancy design on top of it. This is a photo of a building with a large clock in the front of it. a man standing next to a big red truck A baseball player looks up and drops his bat. A man on a tennis court holding a racquet. The lofted ceiling features two white ceiling fans. Two people holding remotes in their hands standing near a couch. 
A computer monitor, keyboard, and tower with peripherals and plugs sit on a desk. The bears are at the water, along with a seagull and another sea bird. A white bus that is sitting in front of a crosswalk. A white bowl with shrimp, broccoli and rice. Street signs with trees and rocks in the background. A photograph of a tiny bird on top of a tree branch. Three couches are in a living room arrangement. A man wearing a hat, standing on a snowboard in the snow. A toilet near a wooden stool with a container on top. A sloppy joe being displayed on a plate. An elephant statue painted black, blue, white, red, green and yellow Two adorable dogs enjoy a nap on a bed together. A narrow room with various luggage and two men. a person holding a carrot with a bike in the back ground golden delicious apples, coffee beans, and blueberries are in the foreground of this photograph, in the midground is a banana, and in the background are varieties of cookies. a baby zebra nursing from an adult zebra The multi-colored cat is standing on the roof of a car. A dog sitting on the floor in a room. Small children in red and blue uniforms, kicking a red soccer ball. A cat is sleeping on a wooden chair A man flying through the air while riding a snowboard. A couple of giraffe standing next to each other. A man in leather and a dog with a hat and sunglasses on a motorcycle with people walking around them. A giraffe is standing erect on a dirt path and grass and trees are in the background. Many cars are parked at the curb or are traveling down the street. a small child in a black top a kite and some grass An empty chair at a desk with a computer A antique style bedroom with hardwood floors and accessories. A man riding a wave on top of a surfboard. some girls playing a softball game with some people watching them P.O.V. of laptop with people walking by on the path Two horses standing on a grass covered hill. 
The man is driving the horse fast A bus parked at a bus stop letting passengers get on. Two people are attempting to catch a Frisbee. Lighted urban street at night with cars and buildings. People milling around a row of two story busses. A plate with a sandwich and a salad next to a pickle. A group of people flying kites in the national mall in Washington, D.C. A plate of food with carrots, green beans, brussel sprouts and sauce. An old white boat sits in the port. A stoplight with street signs on it A cat eating food off of a wooden floor. Tour bus parked on an empty street in a tropical city. A skier is carrying their skis and poles in the snow. A woman with yellow gloves on looking at herself in a mirror and covered in blood. A model set has boat in water going under a drawbridge A plate with two grilled hotdogs noodles, macaroni and cheese and corn. A blue dish filled with steamed carrots and broccoli. A person on a surfboard on the water. a well made bed in a hotel room with a window A woman poses next to a statue of a giant piece of luggage. a mushroom and broccoli stir fry on a bed of rice A desk topped with snacks and electronics with office supplies. A man in white shirt doing a trick on a skateboard. A woman in shorts waving to a teddy bear mascot. A woman sitting on a bench outside holding two donuts. A group of people in a kayak rowing together A woman that is standing holding a remote. A horse is on a brushy hillside on the gravel. A man cross country skiing in the snow under dark clouds. A dog standing in the grass as a frisbee flies pass him towards the bushes. the men are in the middle of a tennis match THREE MEN STANDING NEAR A PARKING METER, ONE OF THEM PUTTING IN THE MONEY A white dog running across a field with a frisbee in it's mouth. a school bus having a colorful shirt sale Some very cute small boys at a table with food. a woman staring and some do nuts in a plate A man on a cell phone sitting at a booth table with books. 
A toilet that was set outside and a small part of it was broken. A woman brushing the teeth of a baby in a bathroom. Two giraffes in the savannah with buses in the background. A bus pulling into a bus stop in the city. a gray fire hydrant with eyes and a girl with a backpack A woman in a white tennis dress playing a game of tennis. A cat is looking at two pigeons perched on a ledge. A group of people walk through the middle of the street. A man is using his large laptop in the living room. A young man skateboarding on the rim of a crater some kind of cabinet that is in a building a number of people in a field with many kites flying above A TV showing two men in hats and women. A sandy area with an elephant made from sand. A small red and white airplane sitting in an airfield with a wooded area and a mountain in the background. A person holding a hot pizza on a pan. A baseball game in progress with the batter at the end of the swing. A bathroom with a glass shower door, toilet and a rug. a wooden desk with a computer keyboard on it A boat in still water at a harbor at dusk. A horse tied to a post next to a tree A man with a helmet that is sitting next to bananas. A plate with a sandwich and chips. A zebra grazing in some very brown grass. A plate is full of a vegetable medley with a spoon in it. Man in full red winter gear on skis in the middle of snow. Two orange sheeted beds in small room with desks. A triangle sign with an English and foreign warning three zeebras all walking together in a row. A man looking at the bed in this lamp-lit bedroom There is a man walking through the snow A man preparing to hit a tennis ball on a court. There are several skaters at a skate park skating around. A feathered bird is sitting on a tree branch. A white table topped with tubes of tooth paste and tooth brushes. A counter with carrots, onions, peppers and other assorted vegetables on it. Men with horse and buggies pose in front of a train. A zebra that is looking at the ground. 
A black cat is sitting near a mirror and a picture. A kitten on a laptop sitting on a desk. Two mean are playing tennis and both are wearing sunglasses. A sheepdog at work herding some sheep in an enclosure. A bus that is sitting in the street. an image of a truck driving down a dirt road a man sits on a bench next to a dog A plate wit some very tasty looking treats. A woman glides over the water while standing on a glider. a person riding a skate board on a city street A kitchen with dim light in the evening. a small boat on the ground tethered by a rope A person in a mirror in a very small rest room. A large jetliner flying over a forest in a blue sky. A man and woman beside a red motorcycle. A group of people walking and cross country skiing in the snow in the middle of the city. A very pretty lady touching a cute bow tie. A bathroom with a toilet, sink, tub, towel rack and a window. Two empty motor boats floating in the water. A women wearing a top hat who is riding a horse. A group of people at some sort of function A pizza sits on a plate on a table cloth. A couple of cows and some people near a bike. There is a baby sleeping on the bed. A bleeding cut on a thumb near the nail. this is a dog looking through a arear view mirror a group of birds eating on some pizza A hand holding a donut with a grassy field in background. A room with a bed and some furniture. People are skateboarding by the ocean at sunset. A train drives past a station during the day. A small girl is smiling next to a large pizza. A closeup of this Giraffe shows his interesting head. A white plate topped with different types of cake. A living room with a large decorated Christmas tree. Two hands that are holding a rose next to a tie. Three women sitting at a table with drinks. a couple of boats sit parked on a beach A train stopped at a train station with passengers standing next to it. Bottles and other items on a counter top. Several different kinds of julienned vegetables in a bowl. 
Two men playing tennis with one man preparing to hit the ball. A yellow taxi cab that is parked illegaly parked in front of a fire hydrant. There is a little toy hanging on the key chain. A teddy bear sits on top of a sandwich board with writing on it in front of a cafe with outdoor seating. A tasty looking slice of pizza with some toppings. A freeze-frame series of a baseball player making a pitch. A living room with a couch that has blankets on it. a giraffe walking on a green field next to trees. A cat that is laying down on a couch. A table with a lamp next to apples sitting on top of each other. A lot of different size trees in the woods. Mangos, strawberries, and other fruits being prepared in a kitchen. There is a road filled with busy traffic including odd buses. A red and white, beige and pink moped parked on the street. a man in motorcycle gear standing next to a motorcycle parked near a tent A pile of band aids and medical supplies. A small elephant standing underneath a wooden structure. A man asleep sitting up on a metal bench. a couple of women pose for a picture A bird bath with three birds amongst some greenery. A man and a woman sitting on a bench with laptops. A man standing next to an older man near a plane. A zebra is standing in the middle of a field. A small boat sailing along in the open ocean A man is trying to ski down a small hill. A woman holding a cell phone is standing near a person on a bike. A city street filled with heavy traffic flow. a drain on the floor next to a trash can A tan dog rests on a public bench in a city at night. A couple of giraffe standing in the middle of a forest. A dog sits in a car, looking out the window. A street sign in the city giving directions to several intercity areas. Blue bench in front of a large sandy beach. A herd of cattle standing on a lush green hillside. A large truck on the side of the road. Young men playing with Frisbee in sports like competition. 
A computer, coffee cup, and books sitting on a table. A group of people on a grass field with kites. A plate of chicken, green beans and mashed potatoes. A giraffe and another animal are standing on the grass. two males are on some grass playing frisbee Two large blue bird with red heads walk on a grassy area near a body of water. Two brown bears are wrestling in the water. two cats laying on both ends of a bed a table with fresh vegetables and some dressing A group of people waiting by a large clock. A picture of a street sign on the street. A keyboard and monitor are sitting on a desk. A herd of elephant gathered at the edge of the water a bed room with a bed and a window Three men with skateboards standing above a ramp. A rooster and a hen are standing on a bed of hay. A damaged, leather suit case sitting on a dirty sidewalk. There is a clean bathroom with a blue floor. There is a veritable banquet of fresh fruits on the long buffet table. An area in front of a building has fountains, trees, benches, and people. a young female standing in front of a large cupcake sculpture a beautiful horse and a lady is standing by it. A happy woman sits on the couch while holding a glass of wine A woman with a tennis racket on a tennis court. A fire hydrant that has been colored red, white, and blue. The cutting board has apple slices on it. A group of zebras walking in a grassy savanna. THERE IS A WOMAN THAT IS PLAYING WITH WII A zebra standing next to a parked car. Three different dishes of food on a wooden table. there are a few ducks that are sitting in the river Sandwich on a bun in a white plate with a blue rim. an old photo of an elephant near a body of water Several people watching a snowboarder grind on a rail at night. A woman standing outside with her umbrella open. a black and gray pigeon some windows and a building A group rides horseback down the beach. 
two giraffe standing side by side next to a group of trees A fence separates three people seen inside the dugout. One young boy is in the batters box with his bat ready and another one is standing behind him and a man has his hand up to block the sun and is looking off to the distance. A public restroom that is kept clean for it's customers. Some elephants walk together through the grass. A person holding an opened umbrella walks down a wet street. There are four cows in the field together. One man playing the wii by his lonesome. Baskets of oranges are lined up on a table at a market. Several woman are at a table while one of them slices a cake. The kitchen needs to be cleaned before we can use it. A wine glass sitting on top of a glass sculpture. a close up of a stop sign and a street sign Someone placed bananas, strawberries and oranges in a blender to make a smoothie Woman walking a small white dog behind her. Two riders on horseback cross a desert landscape. A person is riding down a hill in the snow on skis. Party of people in canoes going down a river while site seeing. A man on a snowboard is performing a trick. A corner of a bathroom showing the sink, medicine cabinet and small window. Picture of what might be a TV remote control and a distorted picture in the front. This double-decker bus is headed for White City. A kitchen with green walls, white trim, and a refrigerator. A Captain Jack Sparrow look alike plays tennis with school children. A person is carrying some luggage near the train. A pita and some fries sit on a large, white plate. A giraffe is stretching its neck above trees. A small dog sleeps in a basket on a computer desk. A kitchen filled with furniture and a painting on a wall. A street sign warning cars to keep clear of a driveway. A bathroom with gray walls has a fan in it. A man and a woman are throwing a Frisbee. a person eating a hot dog, with a basket of toppings. 
A street pole with multiple street signs pointing in different directions. A Vietnamese woman stands on a boat laden with produce. A display of ceramic items on a street. A group of people wait in a red reception area. A dog tied to a sign next to a man on a bike. A white cat and a wooden bench by a building. A man sitting down with a brown teddy bear on his shoulders. white plate with a variety of vegtables on a scure two kids are eating pizza at a small red table A man is smoking next to outdoor tables. A garbage truck is going down a well lit street with buildings all around. A beautiful colorful angel portrait in on the back of a vehicle outside with nobody around. A line of buses stopped at a crosswalk as someone crosses the street. A rusted-out farm truck in a mountain field beside trees. A women holding a wooden board that has a desert on it. THERE IS A BATHROOM WITH A SINK AND A MIRROR a baseball bat and a brown glove on some grass A bench next two a table holding several pamphlets. Two cats are laying on a bed in a bit of sunlight. a man leaned over a toilet inside a bathroom some skies are on a stand outside in the snow A menu board at a fast food restaurant. A bench sitting on top of a sandy beach next to the ocean. A kitchen with wood cabinets, stainless steel oven, stainless steel microwave and a refrigerator and a hole where the cooktop goes. A goalie is guarding his end at a soccer game. A clock on a boardwalk near a beach. There is no image here to provide a caption for. A black dog running on the sand with waves in the background. There are several stuffed animals standing near a brick building. A laptop computer sitting on a table with a glass of beer. A table topped with lots of different foods and sandwiches. A group of elephants sitting in the middle of the forest. The young man is playing a game of Frisbee toss. Two vases and a potted plant sit atop a worn dresser. 
A wire fence divides a background of backyards and houses from a yard with a child kicking a large ball. Three friends are getting ready to ski on a warm, sunny day. different colored ribbons a basket and a pair of scissors An adult and baby giraffe in an enclosure. A kite blows in the wind above a large sandy beach. A television monitor mounted on the ceiling of a plane or bus. A boy stands in the grass with his mitt open. a man with a cell phone attached to his hat sitting on a bus A kettle sits on a kitchen stove beneath a shelf storing a blender, canisters and other items. A plate with a couple of scones and a kettle that may be tea. a bunch of kites fly through the air A pizza sitting on top of a box on top of a table. Park benches are lined up in a room in the grass. Little girl holding a ball over a red and white fire hydrant A blond toddler in a pink shirt brushing her teeth. A swan is floating down the river by the boat. A woman plays with a Nintendo Wii in her living room. Contemporary living room setting in urban residential building. a toilet and a sink that is in the bathroom A pair of individual using sails to surf. A man standing in the grass with hid dog. A man with a pizza playing on the computer A man holding a ball on a mound of a baseball field. Man in black vest with orange tie looking at the camera. A person walking with a kite in the air. a number of people standing near by parked motorcycles a couple of people climbing a hill of snow The outside of a house that has a clock in front of it. A slice of cheesecake with a red sauce and berries for topping. An empty bed with a teddy bear laying on it. A bear is seen walking in a forest in a blurry photo. Children sitting down to eat lunch at school. Several skiers wearing colorful attire ski slowly across a snowy mountain A few cows graze in a big wide open prarie A man standing next to a baseball player laying on the ground. A white dish filled with vegetables on a white table. 
a bathroom that has a sink in it A pizza slice,with tomato on it and cheese An open air market with a lot of fruits in a bowl A man in a wet suit riding a wave on a surfboard. A train is on going down the track while people watch. A woman smiles as she holds a parasol. A pole pointing where different things are at. A dog on a leash is standing in a grassy field. A large tall tower with a clock on top. Several people are sitting around a table with food on it. There is nothing but beer bottles in the fridge. A young man with ear phones holding something. A black and white photo of a martin Luther King next to a Lincoln statue. A dog stretched out on the grass with his tongue hanging out. A train sits parked on the tracks in front of a billboard. A young blonde girl holding up 2 cell phones. a baby giraffe has its head under its mother The animal has very large horns on its head. A shelf with hygiene products in a bathroom. A clock mounted to the side of a brick building. A woman dressed in black is playing tennis on the court. A man eating a hot dog at a sporting event. there is a woman holding a little girl taking a picture The pay station for a parking lot is in a location that has recently had snow. this bathroom is big but has a small tub a toilet and a sink a white and red tow truck some trees and a building A large group of girls enjoying a pizza party with pizza and soda. a man that is riding on a bike Male tennis player rushing hard to hit a ball. A half eaten piece of pie is all that is left. A man sitting in front of a table with a box of cupcakes. the boat is sitting on the shore line away from the water A couple of airplanes flying through a blue sky. three large dogs sitting outside near a forested area The young soccer player is kicking the ball. A car and a motorcycle parked side by side. an image of a man on top of a snow mountain A batter waiting for a pitch with the umpire and catcher behind him. A zebra stands in snow in front of a wall. 
A stuffed monkey on a computer desk with two computers. A table full of asian styled dishes and soup A two stories bus is parked on the side of the street. A woman riding a bike near a bus and other people. a man performs a trick on a skate board A group of people on motorcycles at an intersection. A woman that is standing next to a dog. A dog is in a yard with a Frisbee in its mouth. A group on people standing on steps and posing for a picture. a couple of people that are sitting on the porch Rockers with crazy hair holding out a racket and smoking a cigarette. A woman teaching a little girl how to ski. The car us upside down on the road way. A child washes grapes in a stainless steel sink. A man dressed in a suit is eating carrots. A stuffed elephant standing in a museum window. Two keyboards and a computer mouse on a table. A crosswalk signal at an intersection with a car and a bus. An elephant walking across a dirt road in front of cars. a small bed with blue comforter and sheets An asian market has hanging bananas by the roof. These people are posing in front of the trees. There is a bathroom and a shower in a bathroom. A giraffe standing next to some dead brush near a bird. A pile of locks of hair next to a pair of scissors. a green plant is in a glass vase A man that is holding up a camera. A man is eatinga beignet covered in powdered sugar. Two pieces of bruit are set bside a keyboard. a bathroom that has a toilet and some nasty stuff all over Shower with a removable shower head and a soap dispenser. A man in a wetsuit is holding his surfboard on the pier. A herd of sheep in a grass field. Mountains with steam coming from them with horses on the lowland. Two pizzas are on a pile of white plates. A toilet in a bathroom with large signs on the wall. The horses are grazing in the grass along with another animal. This is a woman getting on a motorcycle posing for a picture. A man prepares to swing while playing a video game. 
A young blonde boy leaning on a toilet. Two glasses of wine, two hot dogs and some tater tots. a close up of a plate of food with salad on a table The silver cover of an Apple laptop computer a woman is holding a baby in her arms Two people are walking down a snowy path with an umbrella. An airplane is descending in the air to land. A very drab looking room with a mattress on the floor. A bin is piled high with many apples. A young child is standing in the grass with a frisbee people are sitting on a bench together outside There is a burger in between two glazed donuts a man surfing on an ocean wave headed to a beach The head of a Giraffe with its mouth on a tree branch. a little girl is holding a video game controller A bicycle is chained up and locked to a sign post on the sidewalk. A pile of Chinese noodles with broccoli mixed in. A black and white photo of people flying kites A group of people next to a person on a surfboard. Closeup of a hand holding a Wii controller. A glass vase with flowers in it next to a pair of computer speakers. A store window with the reflection of a parking lot with a stop sign. Two young men playing a game involving a disc on grass. A large yellow train on a steel track. A person is in the distance while a brown dog is in midair and is running after a frisbee. A giraffe looking down while in a zoo pen. A group of people are standing on the beach flying a large kite. A slice of slightly eaten homemade pizza on a plate. Stacked kites with long streamers being flown in grassy field. A tray filled with plates and dishes full of food. a plate with bunch of diffent foods mixed together The dog and cat is laying in the bed with the man. Family sitting at the table together enjoying dinner a little girl sitting on a small kids toilet A clock is affixed to the wall of a religious institution. A large clock that is on the top half of a building. A bathroom has blue guard rails by the toilet. 
a man in a blue jacket and helmet on a black horse A man sitting in a chair sitting inside of a living room. a snow skier wearing black shorts and a blue jacket Three park benches at the edge of the water a couple of baseball player high five A large dirty airplane is sitting in a dirt field. A fancy steak sandwich served with fries and dipping sauce. A beautiful living room view with a vase sitting on the table. a bathroom with red walls and a tiled floor A cow standing in the alley near a building. A girl on skis is grabbing a man's head for support while several people watch. A couple of giraffe standing on top of a green forest. Children are in the living room playing a video game a black computer keyboard on a wooden desk A very narrow bathroom with a walk in style shower. A light airplane flying in a cloudy sky An older man watches a kite fly from across a body of water. A desktop computer sits on an old and scarred wooden desk. a couple of men that are standing up A person that is kneeling in the sand near a bike. a bunch of people that are skiing around in the snow A person is sitting in a chair and a bird is on the ground. An old fire hydrant on the edge of a city street The painting shows a naked woman using her laptop. Several people walking across the street in the rain with umbrellas. A clock tower sitting in the lobby of an airport lobby. Bar stools at a bar separating a dining area from a kitchen. A guy sitting at a dining table with some tasty looking food. A kitchen with white cabinets and a cool tile design. Pictures of a bathroom taken at different angles. a person standing under a colorful umbrella and wearing a big hat and sunglasses A boy on a skateboard in a skate park performing a trick. Cafe tables with table cloths and orange umbrellas over them. A dog standing with his head outside a caged area. The interior of a kitchen with wood floors and large appliances. A broken black umbrella laying in the street. 
A black and white cat beside a wood carving. A small group of zebras is standing beside a water hole. A big, fat bird has some crazy hair A woman watches a dog watching a man eating a sandwich. A person wearing an orange back pack standing in front of park benches. A king size bed in a hotel room. people walking around with a bus and car on the street behind them a group of people with umbrellas walk on a side walk A small bedroom with a bed and a desk a stove on the front lawn near a side walk Two elephants in a herd playing with each other. A tour bus stopped in traffic on a busy street A large pizza sits on a large white plate. a living room with two big couches and green chairs a teddy bear sitting on wooden steps leaning on a pole there is a small pizza and broccoli on a plate A plane sitting on the tarmac at an airport Two adorable chubby dogs sleeping next to each other. A woman in shorts giving a thumbs down signal A Siamese cat staring at a laptop computer screen. a big plane flying through the blue sky a clock tower near many buildings wit ha sky background A cat lounges on the arm of a sofa near a window. Three people with a video game remote in their hands. A vase with a white long stem flower in it. A parking meter with no time left in front of it. A four faced clock a top a stone column in a parking area. A group of people standing outside of a blue ice cream trucks. A cluttered desk with books, bag and electronics this is a baby and a blue chair A bathroom sink with travel size soap and shampoo. The skier with the animal cap is standing on the mountain. A baby observing a calf eating hay outside. a little kid holding a toothbrush standing in a doorway for a bedroom A red parrot eating a piece of fruit from the palm of a hand A sports announcer talking on a cell phone while on a ball field A very small bathroom stall with a toilet and several rolls of toilet paper. Several sheep standing around in the grass. 
People standing around talking and doing different things three people dressed similarly playing frisbee on a tiled floor A citrus fruit sliced in half on a plate. Two elephants are walking through the tall grass. the person is standing next to the animals in the water A woman looking at a tablet while standing outside a train car. A cat with a irritated look sitting on a bed. a man sitting on a green bench in a park A wooden cutting board next to a window topped with fruit. A cat is sitting behind the keyboard of a cluttered computer desk. A red double decker bus is seen in London. A fire hydrant is painted red, white and blue and sits on a sidewalk in front of a brick wall that shows graffiti. A pack of elephants are trampling in the sand. A person kneels as they ride a wave. A group of elephants walking across a large river. A little girl tossing a red Frisbee in a driveway. A group of zebras grazing in their enclosure A white plate with a hot dog topped with mac and cheese. A giant sheep with a lot of fur eats outside A black and white picture shows a tree covered hill. A couple of elephants washing a baby elephant in a river. A young man that is wearing a nice suite coat with a skirt and a purse. A big elephant playing in a puddle of water Four People riding two elephants across the water. a person riding a snow board on a snowy surface A red train traveling down train tracks through a rural countryside. A toilet, sink, mirror, and tub in a bathroom. A truck made to look like a train parked on the side of the road. many different vegetables are sitting on a white counter a woman walking outside with oranges on a stick Airliner being moved by tow vehicle near airport terminal. a river that has a bridge with a train on it A cat rubbing its head on a laptop. A plate of toast and other breakfast items. A light blue sky filled with colorful kites. A bear reaching up towards a tree on a rocky hillside. 
a bunch of guys in front of a table with cake frosting on their faces. A truck waiting in front of the warehouse. Two lamps by a window looking out at a forest. A bathroom with a toilet and a sink. A baby sitting in the grass watching kites fly in the sky. Some motorcycles are parked on a brick area The fire hydrant is painted all completely yellow. A table with a bowl of food and some mugs. The layer cake is on the flowered plate along with a fork. A woman with good posture sits at a wooden desk with an open laptop. a horse drawn carriage on a city street A little boy is smiling at the camera in front of a brown chair. The antique furniture and mirrors are next to the wall. A woman running through a city while carrying a Frisbee. A boy in camo shorts stands before an overturned skateboard. a day of the dead offering with fruit Some people are hanging out and playing the nintendo Wii. A car in flood waters in front of a camping area with camping trailers that is flooded. A very cute curly haired dog with a toy. A man and woman sitting on a vintage motorcycle. A street at night time with many different lights. a person at the zoo feeding a giraffe Someone holding a sandwich like food object with a few bites taken out of it. A cross country skier walking in snow during the day. A blue two layer cake sitting on top of a counter. Two teddy bears one dressed as a female and one male A black stuffed animal sitting on top of a toilet in a bathroom stall with blue floor tile. A woman leaping into the air while holding a tennis racquet. A carrot is being sliced as well as an onion Several dogs on a yellow school bus with a stop sign below the window. A woman sitting down with a large cell phone holder on her pants. A seated angel figure next to a clock dial. Two beds sitting next to each other in a bedroom. a nun rides around on a motor cycle around on the street A horse standing around in teh middle of a farm. 
A family of four sitting on an outdoor sofa Fine food served with sauce on a white plate A kitchen with a microwave oven next to a stove top oven. A small-furry dog on a red seat in a living room. A wet woman with two horses wading through a river Some elephants that are together in an enclosure. A couple of people with many bikes on a street. A couple of men standing on a tennis court holding racquets. Two halves of a sandwich sit on a white plate on a table. A peeled orange sitting on a white table next to the peelings. A man in an orange outfit is directing traffic to drive slowly. A woman in tennis attire swinging a tennis racket. An elephant standing eating hay in an enclosure. A blue and white plate with a chocolate dessert on the plate and powdered sugar on top. Woman holding red cased cellular phone in room. The man is showing the mess in the fridge to the ladies. two tennis players with rackets and balls on a court Picture of arctitecture probably a church or university. a dog on a skateboard in a shirt A group of giraffes and zebras in an enclosure There are boxes which haven't been unpacked but the television is already up on the wall. A fire hydrant painted in the American patriotic colors A green, grassy field with grazing animals on it. A mid sized commercial airline flying in the air A man taking a bite out of a doughnut. People are standing in the grass playing with a frisbee. a man about to throw a green frisby A woman with her head out of the photo is standing barefoot in a simple dress holding a suitcase. A cat is on a table with stuffed animals. A highway with several cars on a cloudy day. A streetsign with one side pointing to Maciel and the other pointing towards Wonderstump. A room that has two people sleeping in a bed together and another bed on the other side of the room and a person at a desk and computer. A polar bear in a polar bear enclosure at a zoo looking up. A man sitting on a couch and a man on a chair. 
A woman is hanging up post it notes in a kitchen. Three giraffes under the shade of the trees. A woman sitting on a yellow surf board on the beach. A boy is running while holding on to a kite. A meal is being prepared on the stove in a kitchen A room of bookshelves with books, suitcase, area rug and tv a small copper vase with some flowers in it They are holding a frisbee together while hugging each other. a little girl that is outside with a umbrella A toilet in a white bathroom is seen in this image. A man dressed in a suit and tie posing for a photo. A man actively plays wii in front of a television screen. A bear looks around in a rocky enclosure. A commuter train passing through a small town A couple of computer monitors on top of a desk. A four sided clock on a raised pole. an image of a slice of pizza on a white plate A low to the ground stop sign on the corner of a suburban street a kid sitting down eating a slice of pizza a body of water with buildings near by there are many people that are sitting on this bus a person sitting on a toilet while operating a computer Young woman with long brown hair in very dark grey jump top holding electronic instrument like a remote control. a woman with glasses is eating a hot dog Two small teddy bears sit by the vase with flowers A man pitching a baseball on a baseball field. A cow with a tag in its ear looking observantly. a toddler playing the piano with a stuffed animal A meal is being displayed in a tray with separate compartments. A toddler is brushing her teeth in a bathroom. this is a close up picture go two broccolis A collection of differently colored trucks in a field. a desk with a computer a laptop and monitor a man stands on a beach with a bunch of surf boards Several bunches of carrots on a cutting board next to a squash on a counter. Altered photograph or painting of a necktie creatively knotted A man with a bucket hat riding a hose on a beach. a little boy batting a ball while his family looks on. 
A group of bikers passing through a crosswalk. A monkey holding a strawberry and a banana. A boat that is floating in the water. A MAN IS ON HIS SKATE BOARD ON THE STREET A couple of men standing near a sailboat. The cat is sitting on the ground near the bench. Motor and photon boats moored in the water. The bus is parked next to the curb. a room filled with white furniture and books on the ground. He is writing to his destination on a skateboard. A green bus of some sort moving along a road. A elephant walking the edge of its raised enclosure at a zoo. a cow that is laying down on some hay A photo looking out of the side of a plane at another commercial plane. A large cut pizza on a wooden surface. A street sign that says Pee Wee Reese Street. A baby zebra rubbing up against it's mother while she eats grass. A bus leads traffic down a city street. A picture of a person standing by a bicycle. Several colorful foods are sitting on a large plate. A close shot of a green bathtub and a toilet. Two male tennis players posing at center red clay court. An egg-topped hamburger and arugula salad with broccoli A fish tank is inside a underwater themed bed room. A bred and silver plane resting on stands outside. A young man standing in forest filled with trees. a home made breakfast that looks super awful Red and white bus parked next to a glass building. A large polar bear walking near some rocks. a woman is jumping up in the air by another girl Man in blue pants and white short on a stage Old model Harley Davidson motorcycle and old cars parked. A bird swimming in wavy water, with a island in the background. A zebra is trying to stand in the shade. A woman cutting cake while another woman is holding a plate. An orderly bathroom with two sinks and a large mirror. A bath tub sitting in a kitchen next to a brick floor. A man in an apron arranging a stack of oranges. A group of people riding an elephant through the jungle. A person climbing up a snow covered mountain. 
a woman sits on a chair with a laptop on her lap Two guys sitting on a couch conversing while another guy looks at his camera. A distorted black and white picture with clocks. A girl holding a racket and touching her head A man does a skateboard trick up a ramp Two people sitting at a table with laptops in a bookstore. A foreign candy sitting next to it's open wrapper. A stop sign has been tagged to include the hammer time song. a person walking beside a boat sitting next to a fence a train going down the track all by itself a traffic sign two people walking and a van in front of a large building Several women working with some type of production equipment Business people having a discussion during a luncheon A giraffe caged in while grass falling from his mouth. women riding on the backs of elephants at the circus Two snow patrol people at the bottom of a snow hill. A large bear standing in front of a bunch of leaf filled trees. THERE ARE A BUNCH OF SHEEPS THAT ARE ON THE GRASS A dessert is sitting on a plate by a teapot. A snowboard sticking out of snow covered ground. three giraffes walking outside near a wood gate A cow rests in a pen with a turkey, chicken, and duck. A family of people hanging out on a beach. Several pedestrians crossing an intersection at a bridge. A little boy that has birds on his arms. A large jetliner flying through a sky filled with clouds. Two cats eating out of one food bowl. A blurry man standing next to another man laying on a bench. A single file row of dark colored luggage backs. A man is laying on the couch with a large cat. A woman with a concerned look talking on a cell phone. A plate topped with two sliced of pizza. The traffic lights glow green in the night sky. There is a man sitting on a bench listening to music A red fire hydrant sitting in the middle of a sidewalk. a sign in front of an old house in the city The medium sized zebra is looking into the camera. A young boy holding a remote control standing in front of a TV. 
A teddy bear is on the hand rail of a train door. A woman sitting in a car smiling while sitting beside a bunch of suitcases. A small group of cows standing in front of the camera. A large bunch of broccoli growing with the leaves around it. Several street signs displaying street names, addresses and driving option. A metal sink with a cupboard of knives sitting on it. a fridge is shown with some pictures on it Two blue and white vases are sitting on a table. small bathroom with tiles on the floor, sink, toilet and a window A group of men sitting on a lush green field. four jet plans are flying across the blue sky A group of men waiting for a bus at a bus stop. two people riding down the middle of the road on a moped bike A woman holding down a dog with a swab in her hand. There is someone standing in water holding a board. A man with a bandanna on serving himself food. A man and woman that are standing near a table. A woman that is holding a camera taking a picture in the mirror. Washed clothing is hung out on a clothesline in a cattle enclosure. A group of people standing near a number of blenders The men are playing a game of baseball. A young man is body surfing and paddling in the water. A young girl sitting in front of a bunch of bananas and grapes. A double decker bus driving past a tall building. a couple of people are holding tennis racket on a court This man is skiing down a snowy slope There is a surfer riding a wave in the ocean A bus parked along the side of a busy street. a man cutting up carrots in long strips A man sitting down eating a pizza at a restaurant. A body of water containing boats, kayaks and people. A woman is preparing to make dinner at her kitchen counter with the cabinets open An arrangement of doughnuts grouped in front of a store window. A fork sits next to a piece of white cake. A ram sitting on top of a hill in the day. A man on a surfboard riding a wave. A black girl removing her denim jacket top. 
A male elephant stands beside a shady bush. a young baseball player starts running to first base A small white dog lays in front of the fireplace. There was a lot of organizational effort put into planning this kitchen. A bull is next to a large group of people outside a train. A skier is shown kneeling while on a flat patch of snow. Two horses in grassy field below power lines. A group of people outdoors next to a large white building. A slice of cheesecake sitting on top of a white plate. A blurry image of a gauge on a pipe. a couple of hot dogs that are on agrill a newly married couple cutting up a colorful wedding cake. I sign in a video game warning dog owners to pick up after their dogs. there is a baby sheep that is laying on the ground A large bathroom features tiled walls, two mirrors and two sinks. A bird perched on top of a branch in a tree. a small boat on a large body of water An elephant and a handler in an enclosure down below. A manual or book about ten-speed bicycles A skateboarder doing a trick at an event. A stop sign between two traffic cones in the middle of the dessert. A brown dog carrying a frisbee in a grassy area. A picture of two smart phone display screens. A United plane flying close to the runway. A large airliner with a kangaroo on the tail wing. Passengers waiting for their bags at a luggage carousel. A rural train station is loading and unloading passengers A group of soldiers sitting at a table with a woman. A group of ties hang off a pole A bear is sitting on a rock in the sun. A person surfing on a continuous wave ride in a city. Lady laying across a bed with a dog. an image of a small airplane flying in the sky A woman taking a swing at a tennis ball A family is posing with their luggage at the airport. A white bowl filled with rice and broccoli beef. A group of passengers with a lot of luggage. A boy sits in a living room using a laptop computer. Three carrots being cut by a large metal knife. 
a cat sitting underneath a vehicle on the cement ground A man wearing a red striped tie is seen talking A man is sleeping with the covers pulled up high. A puppy is learning to retrieve a frisbee. A double bed with white sheets and floral pillows and blue trimmings. A lucky bamboo plant in the window of a small bathroom. Two people in suites posing behind some serving bowls A bowl of salad is sitting next to a dessert on a plate. A group of men standing next to each other. a number of motorcycles parked near each other A snow covered street with a person walking down it. a couple of sheep stand in front of a rock A black and white dog sitting on a bench. a man eats a sandwich and drinks a cup of coffee A close up of the side of an orange train. A man holding a large soup pot in a kitchen. A dog standing on blocks outside near deck furniture. A man looks down at his loose necktie with disdain. Three people cross country skiing in a wooded area. Two zebra eating hay outside in a zoo. A airplane that is sitting on a runway. Batter, catcher and umpire during as baseball game A big elephant standing beside a small elephant in tall grass field with other animals obscured in back. An airplane flying through a cloudy blue sky. a girl in a white jacket and orange sun visor playing tennis Four pieces of toast with olives, cheese, and other toppings. A zebra that is close by is grazing on some hay. The large winged bird is looking for some prey. The cook is slicing lengthwise several bananas on the cutting board. Train on the tracks at a station with people sitting on a bench. A cat with a cone on sitting behind a man while he is sleeping. A fairly normal looking bathroom that's in someone's house. A man with a knife and chopping board cutting apples A blond girl carries a tennis ball on top of her racket. 
a baby with a pacifier sleeping in bed a dog wears a baseball hat on his head there is a very tall giraffe in a zoo three people closing their eyes standing in a line together A large number of identical wooden boats float close to each other on the water. An emo girl laying on top of a bed on her back. A train trolley with a car in front of it. A view of kitchen missing everything except the microwave and top cabinets. Woman walking down a icy walk way next to a stop sign. The salad is inside of a clear bowl on the table. People observing a display of a concept motorcycle. A background of blurred shapes is fronted by bunches of green bananas of which one's been ripped off. a couple of cars are parked outside a church A cat lies up against the arm rest of a couch. A crowd of people in a metropolitan area at dusk. A little girl at the picnic table eating a cake. Plate full of cooked carrots, potatoes, and other vegetables. A dish which consists of roast beef, broccoli and potatoes. Laptop computer next to monitor on wooden desk. a white plate with some food on it some phones on a wooden table and a laptop a red and yellow train is going past some red lightstrain signals The giraffes walk next to each other down the wilderness trail. Three glass vases with a single yellow flower in each. the truck has been painted red white and blue A picture of a scene in a baseball game. A giraffe reaching for a tree branch on a sandy zoo lot. there is a owl that is sitting in trees and bushes These people are riding horses through the mountains A pizza with veggies and eggs on it. a cat laying down on top of a cardboard box many fruits arranged in large containers indoors near a weall A young boy in glasses paying video games a slice of orange sitting next to a sliced cake A green street sign sitting on top of a metal pole. A pair of gray shoes are sitting on a bed. A cake that is made to look like a pink castle. A tan cat wearing an old bowl as a hat. 
A bathroom with a toilet, sink, and other bathroom items. A small cow stands near a market display of soda bottles. a train on a train track with trees in the background A white toilet and a dark cherry paneled wall. A brown and white animal standing next to a marina. Two young men standing next to two dogs. a living room that has a coffee table in it Two children stand beneath the tail of an airliner near many others. A zebra standing on a lush green field. A black train parked next to a red train in a train station. a clock that is sitting on top of a table A messy desk with a computer, cups, glasses, bottles, books on the desk and the floor. A MAN SITTING AT A TABLE WITH NICE DINNER GLASSES A man on a surfboard kneels down as a wave breaks. Rice with ground beef and asparagus in a bowl. Boat sitting by the dock at the river A commercial stainless steel kitchen with white dishes a male in a red shirt cooking pizzas in an black oven A large elephant standing in a grassy field. View of one of the clocks surrounding this tower top A person with a red bike jacket is riding a red bike a bunch of toilet seats in a building that is being renovated. a small child is holding up a bottle A drawers of various supplies in different sections. People walking through a multi level shopping mall. A giraffe standing behind a wire fence on a grass covered field. A view of a small room with a bed, and small kitchenette. A woman holding a plate with a pizza on it A chef playing salad in bowls in a kitchen. A variety of donuts in a glass case. Two girls walking with umbrellas on the sidewalk. A skier in a panda hat poses for the camera. a small child with an open umbrella on the ground A skier on the snow with gear and ear muffs. Antique military biplane at waters edge at beach. Fighter jet on a airstrip with low hanging clouds. A baby lies on blue and green bedding next to a teddy bear. 
Old single engine plane on display in open building a man is holding something next to a motorcycle a guy dressed in leather sitting on a motorcycle next to a bus A white sink in the corner of a grey tiled bathroom. A fenced in area with a giraffe reaching it's neck and head over a fence that separates it from people. A man flying through the air on top of a skateboard. A group of people sitting around a table with food. Man walking up mountain using ski poles with backpack on a cow sitting on top of a hill eating in the rocks Two pizzas sitting on pizza pans on a oven. a child and adult in ski gear walking in the snow. A woman in bed beneath red linens having a conversation with a man. an image of two people playing outside with cups Two motorcycles side by side in a building. A boy skating on his skate board at a skate ramp. A man riding a surfboard on top of a river. A woman talking on a cell phone standing next to a parking sign. A woman under a sheet in the bed with her head on a pillow. A elephant standing close to a fence in front of trees. Cattle in a fenced area resting and eating next to a lush green field. a little green bird sitting in a tree next to a house A black and white cat stands on a bathroom sink. The young woman smiles shyly while washing dishes. A bus that is driving in the street. Three zebras hurry across the road in front of car A hand with a glove over it above a toilet. Boy holding an umbrella at the edge of a cliff. a boy looking over a gate at a cow. Hazy image of a surfer riding a wave on the ocean. a bathroom in an outhose with a wooden window on the side of it A plate that has food on a table. A person holding a purple stuffed teddy bear. a line of buses that are parked in the road robot dogs playing soccer in front of people Bathroom vanity show featuring the sinks and a stool. Yellow fired hydrant on the side of a city road. An oversized picture of a train has a conductor standing by it. 
A green and white bathroom with folded towels A red double decker bus traveling down a road in the snow. An old man riding a skateboard down a street. there is a woman standing by a trains window a close up of a man taking a bite out of a chocolate glazed sprinkled donut The desk has multiple computers screens and mouses on it. Many cows grazing outside on hills in the grass. A bed with grey sheet and two red pillows. An grown elephant standing beside its two babies. a tennis player on a court with a racket A kitchen is all white with gray counter tops. A man's legs standing on a skateboard on a road A man is surfing a wave in the ocean. A man with long hair is about to hit a tennis ball. An open kitchen with dark wood cabinets opens to a seating area which is vacant. Two women on a balcony cooking on the grill. A look at a sign signaling no skateboarding. a black kitty laying on a bench licking its paw A High flying skier is doing a mid air flip. A man sitting on the sidewalk under an umbrella A sandwich on a green plate on a kitchen counter Three cupcakes with blue icing are on the table and the middle one is split in half at the top. A large truck driving down a road next to a car. A Christmas display featuring stuffed bears and rabbits. People looking through the tents at the book festival a male skateboarder in a black shirt is doing a trick A group of men standing around a giant sheet cake. A kitten toy is on a desk with a computer. A herd of long horned cows laying on the grass. a city street with some cars driving down it A man swings at a pitch during a baseball game. two dogs laying down in a pillow on a wooden floor Workers in a restaurant kitchen preparing meals passenger train in front of a depot on a late afternoon Two giraffes feeding while standing behind a fence. a horse in a field of grass A white and brown dog is covered in a blanket. A young female wearing black is holding a purse and a cell phone. A giraffe walking away in a zoo exhibit. 
Pen and paper on desktop with computer equipment. A women reaches out to catch a softball A large clock sits in the middle of a flower bed on a street. The surfer sizes up the waves as he holds his pink surfboard A man working on a laptop looking at the camera. two parked motorcycles umbrellas shops and people and a tree A huge double couch in a living with a TV against the wall. An area with blankets and food containers laid out with people holding umbrellas sitting on the ground. A kitchen in a dollhouse with various dolls in it. Baseball batter gets ready for the pitch during the game. The cat is laying down in the window resting. A red fire hydrant pouring water onto a sidewalk. There is a vase filled with water that has rocks and a plant in it there are many people walking in the rain with umbrellas A horse wearing a saddle standing in the sand. A statue of a baseball player extending his arms to catch a ball. a piece pizza on a white plate with tomato Two red trains are on one track as a yellow train rides down another. The man and woman stand next to each other holding video game remotes. A group of people socializing at a dinner table in a restaurant. A man sitting on a bench in front of a bunch of pigeons. a collection of animal kites flying into the air A bus headed to Manchester is on a street. A view of a total gym exercise piece laying against the wall. A headboard attached to a bed mattress in a room. An old truck, painted over blue in the desert Two suitcases that are sitting near each other. Open door going into a bathroom with black and white tile floor. A man riding a wave on top of a surfboard. A woman catching a red Frisbee while standing on a dirt road. A dog is posted by the window with his reflection in a mirror. A picture is taped to the bottom of a stop sign. A lot of food that is on top of a table. Five unknown objects displayed on a beige counter. A couple of of surfers talking on the beach, with other surfers in the background. 
A woman putting a pot into the oven. Stairs lead down towards a fire between benches in a garden. A man standing on a very busy sidewalk in a city An elephant standing next to a green plant with purple flowers. A girl brushing her hair by a bed n a room. People on skateboard and with bikes on a ramp in a parking lot. there are two elephants that are walking on the road a big pink house with some chairs out front A MAN HIS HOLDING A SURF BOARD WALKING ON THE BEACH Slice of dessert items served on plate with fork. A living room with big windows looking at the ocean. A train traveling down a track during the day. Several street signs hand on a pole as a brick building stands in the back ground near some trees. A tall giraffe standing on a lush green field. man on blue tennis court preparing to make serve a man talking to a pretty girl under an umbrella A pair of zebras standing in pen, in the grass. A vase containing water as well as a flower in it. A blue water hydrant on a roadside in a city An adult elephant standing next to a baby elephant. a rest room and a bench inside the dugout A brown and white cat laying on a tan sofa. A person in a room with a remote. A picture of a black cat sitting on a young man. A Bathroom with a toilet, sink, and records on the wall. A young man eating food on a kitchen counter. A large refrigerator and freezer sits in the middle of a kitchen. two kids playing in a park with their kite A chestnut horse stands in the surf on a beach. A couple of buses parked across the street from each other. Two people next to a metal bench stare into a river. A trash can sitting next to a bench outside with a trash bag next to it . A group of teddy bear on a shopping trolley One man with soccer ball touching his head while another stands near. A narrow bathroom with a thin door is shown. A collection of items for an advertisement are arranged on a table. This is a bathroom that is in someones home. A dog in mid air catching a frisbee on a field. 
A woman sitting on top of a brown horse. A group of cute stuffed animals in a bed. some people and two laptops on a yellow table A man is taking a slice of thin pizza a chair made out of skis with people playing on the grass A woman walking a path by snow with her dog. a couple of horse that are pulling a wagon there is only one boat on the sand at the beach Maintenance city man inspecting fire hydrant on street. A roller skier pushes off down a street. A man leaning on another man both in suits and ties. A modern kitchen with a glass of wine on the counter. An old cement wall in a home is decorated with garland. a ball game being played on the field in front of an audience Three people at a intersection are waiting for the light to change. A group of passengers on a public transportation bus. A cat sits by four matching luggage bags. A long haired cat is sitting in an open suitcase. A young boy and man eat food at a cafe A table topped with oranges and a bowl of salad. Cows lay down resting in the foreground while a flank of trees highlights the background. An adult walking beside a child in a field. A child stands with his bat ready to hit a ball. A baseball player is going to hit the ball panda bear sitting between two trees in forest The sandwich dominates the plate and comes with soup. a person reaching up for an open umbrella An airliner is descending over the water to an airport. A boy is holding a dog that is wearing a hat. A woman hitting a tennis ball with a tennis racket. A white tub sitting next to a window and shower. a woman sitting in a chair at a dining table in a restaurant A bald man in a suit on a television. There is a half eaten piece of pie on the plate. Three zebras standing near each other in an enclosure. There is a woman that is sitting down playing wii A brown leather piece of luggage sitting on a luggage stand. 
a Shetland pony with tennis shoes on THERE IS A CAT THAT IS ON THE BACK OF A DOG a group of people skiing down a snowy slope An open Swiss Army knife rests on a table. A variety of kites flying over the beach and ocean. People at a bus stop getting a a bus. two cooks in a kitchen sampling their food A lady looking into the sun standing on a hill wearing skis. Several cake doughnuts cooking in large fryer full of oil. a guy sitting on his motor bike under some palm trees a living room with a bright red couch next to a yellow wall Two cats find room to stretch out and rest themselves end to end, even on a cluttered desk. A man in a red snow jacket is standing on skis. A painted fire hydrant next to an old tv. A close up of a pole with several street name signs. A bathroom with a toilet, and sink with the lights on. A man is presenting someone with a chocolate cake. A close up of a television remote being pointed at a TV The young woman is licking the bread of a sandwich. A cat looking at something on the floor a room showing a fridge well cleaned and a microwave Two Zebras eat grass in a dusty area. A silver sports car is parked beside horse droppings left by a group of horses. A china cabinet filled with fine blue and pink china. A bathroom with a sink, toilet, mirror and toilet roll stand. A man is standing on his skis in the snow. Several boats out off the shore of a lake. Man displaying bunches of fruit in arid area. a male in an orange shirt in a black suitcase People sit in a hot tub that is surrounded by snow. A baseball player has just swung his bat. A large white clock tower sitting in the middle of a city. A group of four people standing next to each other in the snow. A heard of sheep are roaming in the pasture. A dog sitting in a chair next to a table. A person they sitting down in a chair. A room with low ceilings and old furniture. A nice big living room with a big fireplace. Couple of goose standing at the water's edge while ducks swin in it. 
A couple of women sitting at a table next to drinks. A mass transit train moving across a small bridge. A piece of cake is sitting on a blue, green and white decorative plate. A fighter jet flying through the air above the clouds A pole is holding up street signs in the city. A photo of two people sitting on a couch, one playing the Wii. Adult and juvenile cows roaming in a grassy field A very cute dog laying down in a child's bed. A picture of a person touching a cupcake. A person wearing a helmet is holding bunches of bananas. A woman sitting on a bench that is facing the ocean. A man standing on top of a beach near a surfboard. Small Prop plane drives along the runway in the day. A woman hitting a tennis ball on a professional court. A broken flip phone sits, in two pieces, on the counter. A produce stall at a farmers' market displaying baskets of carrots and cauliflower. A bathroom scene with two bathtubs and a toilet. A man holding a new sign under a stop sign. A small bathroom with a stand-up shower. a cat is sitting on a wooden bench outside Two guys getting ready to jump of a ramp with their snowboard. A zebra stands in the dirt in its enclosure. Several men in suits and military gear standing near a table. A smiling boy wearing a white shirt and red tie. A blue single engine airplane in the air above a landing strip. A group of stuffed animals sitting on a bed A bathroom with a sink and a tub and a minimal, modern style design. A goup of people at a wine tasting. A woman in black shirt resting on a luggage carrying cart. A young boy riding a skate board on the walkway of a park. A kitchen with white walls and wooden cabinets. A group of open umbrellas piled on top of each other. An assortment of fruit for sale at a market. A single engine aircraft parked in a grassy field with other planes. Group of people all showing off their cellphones in a group seating. A bird soaring through a foggy sky over a snow covered mountain. 
a bird flying just above a body of water. some giraffes standing next to each other in their pen Several herd animals are on the grass by a mountain. a close up of a plate of food on a table Vehicles on the side of the road and a herd of sheep. A young boy standing on top of a green field holding a baseball bat. A wet dog running on a beach with a neon green Frisbee in it's mouth. A man in an empty parking lot trying to pull something A man is in the picture above a plate of food. The thin woman is standing between a man and an eating dog. An apple being held by a hand with a knife tip presses against it. A bathroom with two white toilets and a large bathtub. This group of steer are laying in the grass A zebra standing next to a zebra sitting on the ground. A man camping with two dogs eating a meal. Two people are flying a kite on the beach. A plate with a hot dog and fresh pickles. A group of animals standing in a grass field. A woman sitting on a bench looking at her cell phone Several paraskiers engaged beneath a cloudy winter sky. Young girl dressed in blue and pink skiing down a hill. A smiling grey teddy bear with a plaid bow lies on a green carpet. A tennis player holds his racket in the air after hitting the ball. Couch and chairs in living area with television. A cup of coffee next to a laptop of some sort. A cat cleaning itself on the top of a suitcase There is a bowl of fruit with apples, pears, and oranges in it. A person is skiing down a mountain next to a blue line in the snow. A young girl standing in front of a plate of food. A cake is the table along with some fruit. A group of celebrating fans in a city street. Someone laying on a wood floor with a dog A group of giraffes stand together in the field. A giant clock on the side of of a neon sign. Three Red Sox baseball players stand smiling in a dugout. THERE IS A CAT IN THE MIDDLE OF A BUNCH OF KNIVES A boy biting into a piece of broccoli. 
Two garbage collectors standing behind a garbage truck gathering up bags. People on a safari look at an elephant in the road. A choice of poached eggs and bacon on a bagel or donuts. a mirrored door showing the reflection of a couple there are train tracks that lead in to a train station Kids out on a sunny day while skate boarding. A bathroom with a tiled backsplash over a sink and bathtub. A street sign is pointing towards 8th avenue and the other is pointing towards 22 34 street in the middle of the forest. a man that is outside with a kite in hand A red and yellow sign for the life guards and an umbrella on a beach near the ocean. Two men and two women are hanging out at a skate park. An orange container filled with office supplies sitting on the ground. A plate with food on it next to a bowl with salad. A large airplane flying through a sky above a city. Boats tied up in a harbor with cranes in the background. Playing on a small laptop and a phone at the same time is not recommended A man wearing a suit has a boutonniere pinned on his chest. there are two men on a field playing with a frisbee Four people are skiing down a snowy hill. A very cute small child touching a fire hydrant. A table with bins of food that include pizza, fruit and salads. a cow eating garbage on the side of a road A man in a short sleeve shirt with a tennis racket A large multiple layer cake with yellow frosting flowers. A large jetliner sitting on top of an airport tarmac. a boat partly submerged in a body of water An animal that is looking at something on the ground. A mission style bed is dressed with bright white sheets and a striped folded quilt sitting in between two matching nightstands and lamps. Two plates have a meal prepared on each of them. A person on a skateboard does an air trick. Two side-by-side photos of different living room settings. A large kitchen generously adorned with shiny metal surfaces. A man in front of a horse working on its hoof. 
The woman stands on the cart behind a man driving it. A man holds a laptop that has a message about Barack Obama written on its screen. A motorcycle is parked in a lot by a store. Various trains at a train station next to people on loading dock. A guy rail grinding a skateboard on a ramp. A woman sitting on a bus next to a dog. a giraffe is eating a piece of food A booth with salesman trying to track down THERE IS A WOMAN THAT IS PLAYING TENNIS ON THE COURT A woman texting on her phone while on her laptop. A brown donut on a thin piece of white paper. A bunch of sheep grazing in an open field. A baseball player is up to bat during a game. A stand with various tv and game equipment on it. A closeup of a train at the station for people to board. A small pizza sitting on a wooden table next to a bread maker. A baseball player is preparing to swing his bat. Two kids sitting at a table eating a meal. a young man is performing a skateboarding trick A man holding a tennis racquet on a tennis court. An image of some baseball players in front of some money. Some food that is on a glass plate. The owl is looking at the camera in an intense fashion. Cat sitting on cabinet in front of large screen television. A brown clock tower with a gold, black and white clock. A table with cut up vegetables and cheese with it's rind cut off. A group of people sitting in a chair, working in computers. Two road bicycles are locked to a pole in front of a man talking on his phone. A teen boy and teen girl standing on skateboards in front of a stone brick wall. The man is riding up a hill on a motorcycle. A person wearing sandels standing in front of a cat. A toilet filled with Hershey squirts with a blue lid. A man in grey shirt doing a trick on a skateboard. Two hipsters sitting down at a table cutting up a chocolate cake. THREE BASEBALL PLAYERS STANDING ON A BASEBALL FIELD PLAYING A GAME A person is driving a speedboat quickly through the water. 
a teen holding onto a brown teddy bear A man rides a motorcycle that is decorated with three teddy bears. A Virgin Mobile train driving in the middle of a city. A restroom has a toilet and a decorative sun wall plaque. The man is carting his suitcase around the city. A toilet with the lid open and a phone on the wall beside it. A little boy is in a batting cage with his dad, who is serving as catcher. A kitchen area with a large pot, dove and a wooden cabinet. a person behind a stand selling fruit with a person near by This small kitchen has pots, pans and spices on display a man getting ready to serve tennis ball A plate of vegetables and meet on a table A public bathroom that is dimly lit by a window. A person jumping on a rail on a skateboard. two white sheep, a black goat and a white goat in a field a man holding a surfboard on his back A bench is sitting near a wooded area. Two guys wearing nice clothes are standing outside. A door opens to a view of a toilet. Two girls in cowboy hats riding horses waving. A giraffes face and neck while he eats leaves from a tree some baseball players are playing baseball on a field A pole stands in the dirt with a biker in the back ground Two men and two women enjoying an outdoor meal. Two men walking a dog and watching an airplane about to take off. An elephant putting its trunk in another elephants ear. A clock is shown on the top of a tower. Three laptop computers sitting next to each other on a kitchen counter. A person is looking down with ski boots on and skis next to them. A white toilet, sink and shower stand in a bathroom. A car driving in an intersection, past a furniture shop. A smiling woman perched on a chaise long under an umbrella A couple of cars that are parked in the street. A man playing a game of tennis on a brown tennis court. A plate with chicken and broccoli on it. Two young boys laying on a carpeted floor playing on laptops. The hotdog is next to a bucket of popcorn and a soda. 
a plate with a cheese shrimp and scallion pizza A man lays on a bed wrapped in a white blanket. Many plates of food with their silverware. A giraffe leaning down drinking from some water Two jets high in the sky with white trails. A small bathroom has toilet, medicine cabinet, and small sink. An industrial sized blender filling a jar Car and motorcycle traffic in a large city A living room filled with living room furniture and decor. this is a dog running near some water A classic military motorcycle is parked in front of a crowd. A computer desk with two desk chairs at it. A group of people with kids sitting in a living room. A man on a skateboard standing on asphalt. A zebra foraging for grass among dead branches. A refrigerator is filled with a lot of food and beverages. A small elephant playing with a toy suspended from a wire. A plate with sliced pizza and a bottle of beer. Several people standing next to two people in cell phone costumes. A plate of food with a sandwich and a salad. a toilet some white brick walls and toilet paper A boy riding a skateboard on the sidewalk. A man goes to strike a tennis ball. two men looking angry at each other. A person has a sandwich on a plate. The old vase is on display on the table. A silver car next to a parking meter. Three zebras are standing in a filed under the clouds. A man walking down the sidewalk, and a blue briefcase in front of a post. A cat laying on a handbag on a bed. A couple of coaches in a large room A baseball player swinging a bat on top of a field. This sewing room space is small but well stocked. White kitchen cupboards with grown counter top and black stove. A church stands in a country field, underneath blue sky. A sumptuous table setting in a royal dining car. Several bottles of wine on a display table. A guy with a white shirt and jeans riding a skateboard. She is checking her messages before finding a good spot to enjoy the concert. A smiling young woman holds up a bottle. 
A man wearing a patterned shirt and tie and glasses. A woman is dressed as Merida from Brave. A couple of guys at a picnic of some sort contemplate sweets arranged on a paper plate. A group of cows mill about on a grassy pasture. A kitchen with green cabinets and tile back splash a bunch of food is sitting out on a table A very thin cow standing near a herd of elephants. An industrial kitchen with a strainer on the counter. A man sitting on top of a cement ledge. Young adults in tennis clothes are playing Wii. A living room filled with furniture and a fire place. Two officers are riding horses near a crowd on the sidewalk. Several people are skiing in the snow by a tree. a train approaching a station with people waiting to board A skateboarder is crouching and arms fixed as if to run into something. A man releasing a baseball at the end of a pitch. A boy is blowing his candles on his ninth birthday. A vase full of flowers sits on a counter. A bathroom with a pink sink and blue tiles. A man in red shirt kissing a woman's forehead at a table outdoors. A woman pushing a stroller and looking at her cellphone walking down the street with people walking or riding bicycles behind her. very types many ripe fruits in a basket People are standing on a sidewalk in London. People pulling their luggage as they walk skiers riding on a ski lift to the top of a mountain A chair at a desk in a room. A hot dog in a paper boat sitting on a person's jeans clad legs. Skate boarder performing a stunt in a vacant area A man carrying a white surfboard across a beach. A group of students posing for a photo. A open laptop is on the table next to a box. A man standing on top of a boat on a large body of water. The garden vegetables are blooming outside and are ready to be picked. A plate of food sits next to its dessert Some smiling guys in a very big crowd of people. A street intersection with street lights in a small town. A baby sitting in front of a stuffed teddy bear. 
Looking down on a very winding twisting road a bunch of chairs and umbrellas on a beach. A woman sitting in front of a desktop computer. A bike parking white tent cover is set up. Five planes are flying in formation in the sky. A black and white picture of accessories in a store. A sandwich and condiments sit on a white plate next to a drink. a person in fancy clothes rides on a horse Two people jumping in the air to fight over a frisbee. a skateboarder skating on a stone skate ramp. A stove top topped with three pans filled with food. The rider and horse canter onto the field to compete. corner cabinet and sink area of a green kitchen A group of elephants that are in the grass. The couple are dressed up and posing for photos. A snow skier is off the ground in the snow. A man holds a bat awaiting his turn in the batting cages. A siamese cat lays on a wooden desk The snowboarder is grabbing the board while jumping up. Door leading into a compartment on a train. Cows grazing on the grass in a green pasture. A bucket full of toothbrushes rests on a rock outside. A hand holding a spraying hose to a toilet bowl in a small toilet stall. A couple of guy sitting at a table with a couple plates of food in front of them. two elephants are walking together down the street a large stack of old and antique multicolored suitcases. A man with a backpack sits on a non-functioning toilet outside. Two green metal street signs with Spanish words on it. A girl is sitting and eating a biscuit. A toilet in a bathroom next to a plaque on a wall. A group of guys in a field playing soccer together A picture of a orange cat in a bowl. Two young kids play soccer against each other. A group of oranges are sitting in the bowl A glass sitting on a table next to an oven. Food prepared on a bun and set in a basket An old double decker green bus says London Transport. A mother and daughter are cooking together in a kitchen. A living room with wooden walls and a tv. 
This little league player is catching a ball during a play A pair of parking meters sitting behind a row of parked vehicles. A child looking at an elephant that is standing in an enclosure. a person on a skate board comes off a ramp A girl with a black eye and pig tails sits in a suitcase. A group of snowboarders gliding down a snowy mountainside. A man in a jersey swinging a baseball bat at a ball. a small bathroom with a toilet and a sink A plane flies over, painted in right colors. three people sitting in chairs and a teddy bear an image of baby sleeping next to a woman Many cows in a pasture with trees eating grass. An old woman attempting to play a video game. An airplane sitting on top of an airport tarmac. A blue vase with a bird painted on it with flowers in it. A young man is standing at the bottom of a staircase. a number of people walking on a side walk near a building A crowd watches a large giraffe through a wire fence. Several people on skies in the snow Two ducks by the water one is spreading its wings. A passenger train that is pulling into a station. A man and a child standing on top of a beach. A bike sitting in front of a beach in the evening. a couple of baseball players are out on the field an old steam powered locamotive at a station filled with passengers A half dozen assorted doughnuts in an open box People standing outside of a hut with several bunches of bananas and other fruit outside of it. A blurry screenshot of a green street sign. The stop sign below the street signs has writing on it. three surfers are walking on the sand at the beach a man has a refrigerator on his three wheel bicycle a bunch of people stand next to some suit cases a sheep and baby sheep standing in a field A smiling man is holding a skate board near a street. A man on a surfboard riding a wave. A young girl who is looking in the refrigerator. A baseball bat leaning agains a wall beside a yellow box. 
A larger standing horse is standing protectively over a smaller resting horse in some tall grass. A man catching a white frisbee with his hand. A cake with a couple of birds and other animals on it. A tall clock tower and a tree against a blue sky. a female in a white top is playing tennis A yellow fire hydrant gushes water onto the street. Two giraffes are walking next to one another. Model train locomotive on track in small village display. A boat with a wooden hull is on a beach. A girl sticking her hand in a large bowl. A vase filled with lots of colorful flowers. An old refrigerator displays its open door and contents. A picture wedged in between a bunch of bananas Modern kitchen with counter and cabinet and hardwood floors. A little boy watching two elephants in an enclosure. The motorcycle rider is on the road with all his gear. Two large slices of pepperoni pizza on a table. a red and white stop sign and a street sign Two women are at a table with laptops. A young man holding a tennis racquet on a tennis court. A man playing with a soccer ball in a field A Blue dish full of green broccoli heads and asparagus. A bowl has onions, shredded carrots and other ingredients in it. A red stop sign with a car parked behind it. A large church clock tower towering over a city. tourists riding and petting an elephant at a tourist attraction three people are sitting on a bench watching a train go by The men are playing doubles tennis on the court. A man and a woman carrying a surfboard down the road. a small bird in a field of green grass a close up of a vase with art behind a display glass The wooden bench has spray paint on the back. A pitcher winding up to throw a ball on a baseball field. A table topped with a plate with a pizza on it. A male competitive speed skier coming around a curve. traffic cones in a bathroom that's under construction A male getting an object out of a tree. 
THERE ARE PEOPLE THAT ARE SITTING AT THE TABLE A men's tennis couple watching a ball hit the net. Group of people enjoying food at a market. A full view of a building that has a huge roof on top. a male with short hair is looking out of a trains window A street sign surrounded by orange and red leaves A woman feeding a white dog a small carrot. Three individuals flying a geometric kite on the beach. A 4 way stop sign on the corner of a city street A girl took a selfie of her taking a selfie on her cell phone. a wooden piece of art consisting of two birds standing at opposite ends of a log with a cone shaped vase in the center with a group of red berries sticking out of the cone. It's easy to imagine a dinosaur as an ancestor of the giraffe. A young man holding a tennis racquet on a court. White police car passing through a stop sign in front of the building. a vegetable sandwich with cucumber pickle and tomato A hand wearing a ring reaching for a pair of scissors. a man is riding on a ramp on a skateboard A woman sits on a bus, presumably waiting for a bench. This bathroom has a wood floor and wood on the wall. Two guys stand by bottom of stairs playing the Wii A group of zebras crossing the dirt road . A chef standing in a kitchen preparing food. A small bird sits on a corn plant. Several park benches lined up under a row of trees. A red Two level bus stopped to pick up passengers A person standing on a sidewalk with a black umbrella A table with pots of sliced carrots, green vegetables and baked bread. A living area with two chairs, stool and a television. A man is next to a horse in a window. A white woman and an indian man shaking hands A child's lunch, of soup, fruit, and veggie, sits on an A B C place-mat. A very sophisticated bathroom with a white theme An old man in religious clothes reaches to catch a frisbee. Some zebras eating together outside in a grassy area. 
a double decker bus rides through london happily Kiwi fruit, banana, apples and an avocado in a dish A table with two bottles of wine on it. a plane sitting on a runway with a ladder sitting there for it a jet on three pillars in front of a building a kitchen with a stove a sink and some cupboards A man is cutting small hot dogs and adding toothpicks to them. this is a man hitting a ball in a game A flock of ducks are swimming in the water. A table with a tin of hotdogs and a plate with bun. a grill that has some pans on it A light pole and street sign in front of a store front. A city bus that is sitting on the road. A man bounces a tennis ball as he prepares to serve during a match. a lot of people are on a tennis court. A collection of knives and a pair of scissors in a wooden block. A cup of coffee sitting next to a sandwich chips. A pair of scissors near a stick of butter. A surfer is riding a wave in this aerial photograph. A man riding a skateboard down the side of a hand rail. Train coming down a rusty train track with scrub grass. A single seagull standing on the coast with waves in the background. A white plate with a cut in half sandwich on top of it. A rectangular toilet bowl in a tiled bathroom Surfboader riding the crest of a ocean wave a person riding a horse puling a lot of hay The hotel bed is designed for the business traveler. A yellow fire hydrant is standing alone in a parking garage. A motorcycle and car are parked in a garage. a white chicken with a black tail and a red head An adult cow walking along side the river bank A yak needs long hair to survive in these mountains. a man sitting on a bench and laying down Man posing with a tennis racket for a shot. A passenger train that is pulling into the station. a person riding skis jumping in the air A desktop computer sitting on a a desk. The woman's on the horse giving presentation with flags A white beach chair with a red, white and blue striped towel under a yellow umbrella. 
A person on a snowboard anticipates a jump. a vase having a bunch of flowers inside of it A baseball pitcher throwing a baseball on a field. Two people and a dog are sitting under a sheet-tent. A group of people standing at a table with bottles of wine. The train rides on the track past the station during snowy weather. Black and white photograph of a modern commuter train a dog with its frisbe in its mouth walking in a water way The woman in white outfit swings the racquet at the tennis match. A close up view of a pizza sitting on a table with a soda in the back. A child snowboarding down a hill in the snow. A cat is lying on a table, watching a television. A cat sits with hisher toy on a blanket. A man on a skateboard standing on a ramp. little kids sleeping all over a big bed A 24 hour recovery truck traveling down the road. A snowboarder sitting on a ramp in the snow A kitchen table, refrigerator, garbage can, chandelier and window. a clock with big numbers at the end of a table A person is standing in the snow on skis. A canopy bed in a white and brown room A towels hanging from a towels rack outside a shower. An unfurled sailboat in the water under a pink sky. A blue and yellow meal pole with street lamp lighting A toilet is made of wood with accents on the back of it. bacon, lettuce and tomato on toast with slaw and a pickle. A family of giraffes standing by a puddle. Five giraffes are standing in tall grass, in their habitat. A girl playing some video games with her family. Parking meter and flower in vase displayed in window. Students are sitting at tables with books and laptops. A plate displaying vegetables, meat and bread on a table. an image of a stop sign that is posted in the three way zone A laptop computer and mouse is on a sofa. A man with an old-fashioned hat is looking at the camera. Two gray elephants fighting each other bumping heads. a group of horses graze on some grass A bowl full of oranges and leaves on the table. 
A man doing a trick on a skateboard over a ledge. Three people ride an elephant while a man on the ground directs him. some bananas peaches apricots and and apple A woman standing next to a motorcycle and some health aid trucks. Player at bat and umpire holding a ball . A large plane sitting inside of a hangar. A row of red stop signs sitting next to a lush green field. A view of a mountain range is seen from an airplane. A woman talking on a cell phone while walking down a street. A woman sitting on a trunk wearing a polka dot dress with a red belt. A couple of people at the beach during the day. A blue street sign in front of a building with many windows. A woman in a blouse wearing a striped tie. Someone riding on an elephant as it stretches it's trunk out a bunch of motor cycles all parked together Two zebras stand next to each other on a field. An altar with purple cloth, vase and two candles. Man poses sideways wearing a plaid shirt and a tie. A small plane is parked on the tarmac. A van follows behind a bus on a rural road. a little kid starts to learn how to ski a little boy bending down taking a bit of a hotdog The laundry is hanging in the tilted room. A young boy riding a skateboard down the side of a ramp. A couple of cruise ships in port with a large building in the background. This is a picture of a woman playing tennis. Two giraffes stare out of their enclosure at a zoo. A small naked boy holding a tennis racquet on a beach. A homemade cheese pizza is made and ready for the oven. There's enough wind to fly a large box kite. A clock that is siting above a sign. The man reclines in his seat from the table with doughnut in his mouth. A white chair with two glass birds on top of it. The blonde lady answered her cell phone because she was waiting for an important call. The street light, the electrical box, and the sidewalk are littered with bird poop. A bunch of people lounging at a beach near an ocean. Two men hold hands around a dining table. 
A large train gains speed on the railroad tracks. A couple of cows grazing in an open meadow. Two young boys sitting on a bed with three teddy bears and a sign with the number twenty crossed out. a truck with two off-road vehicles in its back compartment A cat is sitting on a motor scooter. A group of people walking around a train station. Young boy posing in front of a flying kite in the park A stop sign has collaborate and listen on it. A compact kitchen set-up with shelves for storage and a small stove. A herd of elephants walking across a ground near a river. There are two cows walking on the sand. Apples and oranges pile in well lit color photo. Fresh fruit and vegetables on a kitchen counter A group of young women getting food from a table. A white fireplace that has pink candles lit on its mantle. Two boys with an umbrella and chair on the beach. A man and boy are sitting on a couch. A woman and some children near a zebra behind a fence. A jockey is on a brown horse with a crowd watching. A skateboarder doing a trick in the air at night. A topdown view of floor with sheets, shoes and a desk on it. A herd of cattle grazing in a lush green field. A woman in a suit and tie standing with her hands in her pockets. A person taking a picture of a stoplight on the side of the street. There is an old-fashioned clock tower in front of a building. Two boys are standing in front of a train with backpacks. Two officers are riding horses near the ocean. A cow looking at the camera from inside its fenced in pasture. two geraffes in a feild next to a tree. Snowboarder impaled on a tree during dusk with fire. two people feeding each other cake at a wediing a dog outside playing with a ball in the grass A two level bus with a large advertisement on the side two little kids sleeping on a pink bed A fleet of small air crafts are flying over sea. A royal, horse-drawn carriage moves along the road. two giraffes are laying down in a park like setting. 
a woman stands with some luggage by some chairs This room is caught in a design time warp. Two blue and black parking meters sitting on a sidewalk. The man is cutting into a large cake as others sit around the table. The girl in shorts is attempting to hit the tennis ball. A group of people are riding horses near a train. A cluttered countertop with a celebratory pink and white cake and opened containers A man on a surfboard riding a wave. A rectangular pizza served on a wooden cutting board A cat and dog standing buy their human in the kitchen. A pregnant woman is in bed reading a large book. A person that is holding a dog and a bowl. Several pieces of wood lined up near a lot with several axes around. Three children on a sofa by window eating bread and pasta. a magnetic knife holder on the wall above a kitchen counter Three giraffes stand near each other in a field. A sign at the corner of St. Clair Street and South Main with flowers above. Many people have come to tour an authentic military aircraft. A bus moving past a street sign opposite a building Staring into the camera to take a picture. Bottom view of an airliner flying directly over head. There is a man sitting on a wall talking on a phone a living room with a couch and a tv A small kid on a field with a bat. A blue sign posted on an overpass that people walk across a number of large kites on a beach A man standing in front of a car with it's hood open and a dog standing in front of the car. Men laughing and playing tennis on Wii Sports A big building in front of a tall clock tower. A small cats eats out of a food bowl while standing in another bowl. The snowboarder is jumping high above the ramp. A plastic container of food with rice and vegetables. A small showcase of an assortment of funny and cute items. A row of motorcycles filling all the spots in a parking area. A giraffe and other animals in a field a black and white image of a man on the phone A large kitchen has a stainless steel counter. 
A woman sitting at a table eating a plate of food. A double decker bus stopped at an intersection. a girl that has a racket in her hand a man standing on a tennis court holding a ball and a tennis racket A man prepares to hit a ball during a tennis match. A man standing next to a produce stand with tomatoes and other vegetables. A man is in a hospital bed has a teddy bear. A cat laying on many shoes on a brown rug. A kitchen counter with a candle display on it A little girl that is standing on a surfboard in the water. A small orange vase is on a table with a small branch in it. A group of cyclists are riding across an intersection. Two people are on a motorcycle driving down the street. A herd of different colored sheep walking near rocks. A table with two red vase type items A number of people flying kites on a clear day. A bath and sink with a woman in a room. A parking meter sitting on the side of a road. A cat drinking out of a sink faucet. A grey tabby cat stretches out on some clothes A kitchen with wooden flooring and white wooden cabinets. Dog sleeping in his bed next to rocking chair Man on a surfboard under a large wave A roadwork crew constructing a guard rail along a mountain road two woman stand in the snow and pose for a picture Two men having beers in a dimly lit room. Long woman sitting on a raised log looking at the mountains. A lone horse standing next to a fence. A trainer feeding two giraffes from his hands Old picture of a sumo wrestler playing baseball. A child holds a game controller for Wii. a large dog is looking oof to the left Living area with small desk and leather couches. A train is pulling into the train station. A large chair statue with a large horse statue on top of it. A dog running in the sand near the water. The man in stripes holds onto the plate of food as he poses for a picture. Two people standing in the grass playing with a soccer ball. A small toilet and trashcan across from a dirty sink in a very small, dirty bathroom. 
A large white sign above a brick wall with a yellow vehicle to the left and a parking sign to the right. A black bird is perched on a tree limb. Cowgirl at rodeo riding house with a Texas flag. A man with many bags walking on street next to fence. A skier is being towed over the snow. A small pizza on a plate that is sitting on a checkered table cloth. A picture of a large apple and walnut pie on a plate. A woman petting a horse in an open field. a couple of men are eating at a table A few pieces of luggage sitting on top of a wooden floor. A herd of goats walks by a car and its driver. A large group of competitive cross country skiers. A woman smiling while sitting on a bed. A slender high rise building is fashioned behind a pole clock. A man dressed as Elvis sitting on top of a bull statue. The city buses are parked together in the parking lot. A woman is in water catching a frisbee near a boat A man takes a bite of his food at an event. A red airliner is parked on the tarmac at the airport. Dock area with urban area on cloudy day. A bus is seen coming up to a bus stop. A long tunnel with a long table with lots of seats and candles next to wine glasses. A man and woman are walking in the rain with an umbrella A woman tennis player serving a tennis ball. A red double decker bus parked near a red telephone booth Surfers on surfboards ride in a row on the ocean waves. A person stands next to a train parked on tracks A group of boys stand around a museum exhibit. Candles, flowers, and stuffed bears are set in a corner near a poster. A mother elephant and baby standing near the water some people sitting at tables eating pizza and drinks a room that has a bunch of tables in it Four bowls of snacks crackers, broccoli and carrots, nuts and dip A woman riding a carriage pulled by a brown pony in a race. People stand near a desk with laptops on it. A small child holds onto a fire hydrant to stand up. 
A man holding an umbrella for another man in the rain A sail boat with a large Colgate Clock in the distance. Skateboarder with an elongated shadow at an outdoor skate park. A group of people riding skis on a snow covered summit. Two people standing on a tennis court with ball in the air. The men are playing a game of baseball in the field. Spectators watch the players at a baseball game. a road that is next to some trees A small farm animal steps through the short grass of a green field. A Dairy Queen sign on a major road advertising it's special. Horses pull carriages on a dusty dirt road. A young boy in a green outfit holds baseball mitt. A city bus going down a city street. close up of fingers holding a slice of pizza A man with a tennis racket stands on a court. A picture of a street with parking meters. THERE IS A BED WITH A SKY BACKGROUND A woman with her laptop on a bed in a dark room. A man on a horse going down a track. A young couple poses with a cake decorated like a keyboard. Two cats sit on top of a towel on a counter. A plate that has an apple and sliced kiwi on it. Grizzly bear grazing in grassy field in daylight A computer is set up with gaming equipment. An elephant walking through a brushy field A brown and black bird standing on a tree branch. a number of elephants near a body of water Three stuffed animals next to a radiator and below a rocking chair. The man is focusing on something in his hand while holding up his bike with his leg. A tennis player holding a racket on the tennis court. A stop sign and people on the street in front of a double decker bus. a kitchen wit ha stove some cupbaords and drawers Black and white photograph of houses and a clock tower. a cat licking its lips while holding onto a toy in the shape of an elephant The young man is jumping on his skateboard. A batter swings the bat as the crowd watches attentively. Three people with surfboards standing near the waves. 
a group of people that are eating some food Two men in suits and ties with woman behind one of the men two college graduates pack up after a long day two yaks are out in a grassy meadow A baseball player swings at a pitched ball. The man is holding a tennis racket in his hand. A group of women that are in a kitchen. People at the picnic while an elderly woman shows a pizza. A statue of a man and woman with luggage in a city. A pizza sitting on top of a cardboard box on a table. a large plane is parked on the run way The building has several umbrellas suspended in mid-air for decorative purposes. There is a window with a cake and other baked goods showing. An antique fire truck parked on the side of the road. four traffic lights over a city street A woman sitting on a pier near boxes of fruit. Apples, oranges and bananas all mixed in a bowl. A sign with a large hand with five dollars written on it. two elephants giving people rides down a street A man eating something from a paper bag A couple of people that are watching a baseball game. a very tall clock tower sticking out of a building. A chrome colored microwave oven in a custom cabinet space. A group of people sitting down at a dinner table. A lady reaching for a huge wine glass A man riding a skateboard down a road. Three skiers in bright outfits start down a slope. The stove top and oven is separated in this kitchen Two rams are staring at each other in the woods. Two people dressed up entertain a little girl. Two people are riding a sports motorcycle down the street. A view of a bright hallway and a room with a wood burning stove . A person holding a toothbrush in his hand. some people standing around one man is wearing a tie A picture of someone riding a snowboard doing tricks A woman is leaning on a car talking on her cellphone. A plate of food with a bite taken out of the hamburger. A woman on a motorcycle is next to a man walking a dog along with other people going down a dirt road. 
The entire baseball team has gathered on the field for a celebration. A giraffe that is sticking out its tongue. A woman in dress leaning against stack of concrete blocks. A black and white scene with a lady answering a phone this is a horse and a dog by the water A white and gray bird perched on a human's hand. People riding on the backs of lavishly decorated elephants A large black horse standing on a field filled with green and brown grass. a van driving down a cracked street The dirt bike has seen many hill climbs in its history. A cat and dog napping together on the couch. A visitors desk with a vase with sunflowers in it. A skateboarder in mid air after a jump a hotdog with toppings in a paper tray A group of people sit in a open living room and kitchen area. A group of four zebras standing in different positions. A simple wooden bench is in the woods. A man in a red uniform and shorts throws a ball while wearing a baseball glove. Multi colored cat laying on the floor next to door and liquor bottles The person is holding his cell phone while on his laptop. A man standing in a kitchen using a blender. Two men stand by a trunk of a car next to which are both a surf board and some folding chairs. A variety of shots of a man doing skateboarding tricks a couch in a living room with three pillows A marble bathroom with automatic toilet and bidet. A white and grey airplane sits at a gate at an airport. Horse grazing in a field in front of a cascade of mountains. A white dog has curly matted hair in it's eyes. A group of zebra crossing a river together a living room with couches and a table A woman looking at the camera while holding a cell phone The person on the motorcycle had a big helmet on. A young lady reading a paperback book on her bed at night. light brown cocker spaniel dog howling in street A cat licking a bowl clean on the counter. a bathroom with a chipped sink and holes in the walls This looks like a bunch of burned food on top of burned bread. 
A baseball player has just launched a ball. a living room with a chair near a tv Person high in the sky after jumping snow ramp with snowboard People are standing behind the bakery counter. Two cows in a large green grassy field. These people are sitting on a street bench. A row of cars and trucks parallel parked at parking meters. Group of people riding bicycles on a busy city street. a man sits next to a monument on a bench a caddy holding one cell phone and another cell phone in a holder A soccer team is praying on the field. two black and white cows on a green hill a hot pizza topped with cheese and olives to be eaten a close up of a cat at a table with a plate in front of it A women who is picking up a large sandwich. A very large cat sitting in front of a television set. A store on a street corner called "James Smith Sons". The old metal bed with dirty linens is the only furnishing in the abandoned room A plane flying with a dark, cloudy sky in the background. A boy is sitting down and eating a donut. The plane is taking off from the runway. White and grey cat laying down on a white sheet. A gray tiger cat sleeping on a bed under a blanket. A photo-shopped image a cube drawn around a lego in the kitchen. A young man catching a yellow frisbee on a green lush grass covered field. a square white appliance with a blue thing on top three people riding horses on a beach near a body of water The laptop has an attractive image on the screen, and there are welcoming flowers and munchies A man playing tennis on the tennis court while his coaches watch. a street sign on a pole with buildings in the background Street sign edited to look like a man is holding the white bar A businessman wearing a suit and close up picture of suit. Boy sitting at table with food and a cellphone. A man holding a woman's hand and cutting a wedding cake together. A beautiful woman sitting on the back of a moving truck while clutching her dog. 
A bathroom cubicle showing a toilet, sink and waste can. A wedding cake with flowers descending down it to the plate. A bathroom area with three different sized and shaped urials on a mosaic wall. A plate of cakes with frosting and topped with berries. A large truck next to some trees outside. People can be seen boarding a ship through the windshield. A tower of brick holds clocks and a bell in a courtyard. People are playing in a field, flying a kite. A young giraffe stands near some trees in a wooded area. Young baseball player running in open grassy field. A few skiers are enjoying the calm snow-trodden mountain tops. Mexican food is layed out on two trays. a small boy and a big chocolate donut cake A brown horse standing next to a metal fence. two men standing in front of small car a boat a larger ship a buoy and water Shot of a nice quaint living room with an ascending staircase on the side. the skateboarders are taking turns using the ramp. A giraffe extends its tongue to drink water A park bench is in the white fluffy snow. A person is standing with their face near a toilet. A couple of elephants are dawdling in an enclosure. A giraffe standing near a tree in a field. An African American man wearing a bow tie is taking a selfie. Four men are dressed up with a tie. People having a drink in a basement bar. An underground Asian subway train, on the tracks and in transit. The strawberries are supposed to make this dessert look less fattening. Men playing Frisbee on the lawn at a get-together. The man is carrying bunches of bananas. A young child that is looking at a birthday cake. A person doing a trick on a skateboard caught in motion. Two giraffes and two zebras are standing in a grassy field. vegetables laying in the soil next to a trowel An adult elephant standing over a very small baby elephant. A bunch of vegetables that are stacked together. An old photo of railroad tracks passing through a western town. 
Two giraffe standing next to each other.on a lush green field. A man holding a tennis racket getting ready to hit a tennis ball. Batter, Catcher and Umpire wait at home plate for ball to be thrown. a formation of fighter jets flying by in the air A group of people standing around a nearly empty field. A young lady that is smiling and holding up a box with a blue tie in it. There are people watching a game of baseball. a brown and white dog lying on a bed and brown pillows A couple of trains parked in front of a tree. A train is stopped at a train station. The skateboarder is jumping down the stairs on his skateboard. A sheep standing with his behind to a fence in the snow. A round plate that has a white and red pie type dessert on it and a light green pitcher behind it. Pedestrians on narrow alleyway with archway between buildings. A blue and yellow fire hydrant sitting in a field. A soldier is cutting a large decorated cake. a small girl in a field with a blue yellow and red kite A yellow fire hydrant in the middle of a plaza. Four children eating pizza in a booth at a restaurant. A train traveling through a tree covered wilderness. a lady holding a game controller and a man giving the rock on sign An antique gold clock with a man and an eagle. A woman eats a hotdog while holding another. A man riding water skis on top of water. A bundle of six apples are hanging from a tree. a bathroom with a white toilet next to a tub. Several elephants standing in a lake near trees. a grassy medium between a two way street in a city A macro image of an apple keyboard. A single person is working in the cluttered kitchen. A young boy holds a baseball bat above his head. A donkey draws a carriage carrying two people Two men reach up for a Frisbee at a park. Fire hoses are attached to a fire hydrant. This blue and yellow transit bus provides information about the service it self rather than advertisements. A group of people standing around a table together. 
An apple sliced into four, fork and knife A very elaborate cake decorated to look like a bear's forest dinner table. A giraffe standing by a brick building with a ladder. A couple of women preparing food inside of a kitchen. A bed containing two small boxes and an electronic item A brown and white cow standing on top of a lush green field. A motorcycle is parked next to a blue tent two people are sitting on two different elephants A very cute toddler playing with a laptop that is fully open A black and white cat standing on a table next to a pizza. A plate with fries and a napkin with eating utensils. Tourist train with several cars driving on street. A group of young boys playing a soccer game in progress. A train passing on bridge over a busy city street. A bicycle parked on the side of the road beside some doors. Lego clock and wall setup for a interior Lego house. A couple of hot cars in a packet on the table A small plane is taking off from a grassy field. A look at a hotel room with two beds in it. A man standing on a tennis court hitting a tennis ball. A guy with a motorcycle helmet stands behind a motorcycle. A dark kitchen with many cabinets with a small light on above the stove. A bus drives down a city street featuring larger brick buildings. Open textbook near a computer keyboard and mouse on a mouse pad. A desk with a computer a keyboard and a mouse A basket with a sandwich, coleslaw, and onion rings is sitting on the table. A bed sitting in a room under four pictures. A tray of assorted food including fruits and vegetables. A box with six divisions, each with its own variety of donut. Person flying a red kite in a grassy area of a park. A photo of an outdoor with many things in the scene. A train is moving or resting on railroad tracks. a fire hydron that is next to a concrete road Two streets cross and the signs prove it A bedroom with a large blanket covered bed in front of a flat screen TV. A man holding up a hot dog on a stick. 
A toy wagon holds many stuffed teddy bears. two zebras are standing together in the woods A group of people on motorcycles driving down a street. Two parking meters sit on the side of the street. A vehicle decorate like a pink elephant with passengers on its back. Black and white photograph of a woman in an old kitchen Animal in shadows of woods surrounded by foliage. The domed, shiny surface reflects a man falling off a skateboard. Closeup of a brown bear sitting in a grassy area. This is an old town from the 1950 's. A man skateboarding on the grass in his yard. Two people stand in a field as one of them flies a kite. A baseball player is sitting on the bench at a baseball game. A white bus is driving on the road. Contrails can be seen from a descending jet. A grey and blue train passing over a city area. A young girl is smoking in a kitchen. A city bus with bikes on the front of it Four people playing a game system in someones living room. a group of baseball players that are on a field a baseball hat cake made with fondant that says happy birthday A bird flying over a beach with a few people in the background. a male is looking at a sausage pizza A IMAGE OF A CAKE TEA POT AND BIRDS A group of large trains on a steel track. the people are walking down the street with luggage A bedroom with a desk in the corner. A picture of a girl that is posing on the ground. A large green freight boat is seen at sea a microwave is open with some food in it A man cutting a cake on a table with cards on it. a man looking up as he rides a surfboard an image of a living room setting with tables A young person holding a surfboard next to a man. A chocolate cake with a pile of strawberries on top. A giraffe laying down and another giraffe standing up next to trees. A man riding a bike while holding an orange and black umbrella. 
a close up of a sandwich on a plate next to rice and beans a building with clock tower in a town square a clock is sitting on the outside of the building a pizza topped with different toppings is brought to a table A man rides a horse while other people look on. A warning sign is at the edge of a body of water next to a fire hydrant. The sign on the side of the road is telling motorcycles to use caution. Their is a skyview of the city from a small aircraft. There is a orange tabby cat sitting on a mat A large passenger jet sitting on top of a runway. an adult female standing on a beach holding a colorful kite A clock is shown in a package on a shelf. A woman smiles on a street while holding an umbrella. A young boy swings a baseball bat as another boy waits to catch the ball. A group of people sit in a room while one plays a video game a toilet and a bidet in a bathroom Tall, fresh, colorful flowers in a clear vase three people are skiing down a huge mountain slope Four fine zebras cruise through grains very alert. a tall clock tower near other buildings The large twin engine airliner has a red stripe on the sides. A city scene has a tall red double building. One single biker seems to be leading the group down the road. A man in light blue jacket riding on a skateboard. Street construction being separated by orange barriers. A hotel room with tv, desk, bed and arm chair a close up of a person eating food at a table A laptop that is sitting on a desk. a flooded city street with a stop sign coming out of it A squirrel is eating a piece of food on the ground. A park bench is next to a colorful fence. A group of people on motorcycles sitting in the road. A blue and white bicycle parking in bike track next to building. An older woman sitting at a table cutting up donuts. A man wearing a lei is waiting in a parking lot with his luggage. Man with young boy carrying surfboards at beach. 
a person standing up and holding two remotes There are rambutan, bananas, and papayas in separate crates. Cement ledge with orange in bowl and red plastic bag below. A brown and white sink sitting in a bathroom. A small vase of flowers with petals on the table a man is standing on top of a surf board at sea A table topped with steak, potatoes and carrots. A lavish hotel room with a comfy bed. A couple of glasses of wine on a table. The wooden bench is near a busy stream. An big airplane flying through the sky Adults with looking at watercraft on waterway near park. A kitchen filled with an empty refrigerator and microwave. A driver's view of an intersection on a sunny day. a herd of zebras walking through the grassland A green bus parked in a parking lot next to others buses. A dog curled up by a pair of boots on the floor. a man is skateboarding on the edge of a building Two men play tennis in a fenced courtyard Snowboarding elderly man on side of mountain posing for picture. A view of a single bathtub in an otherwise empty bathroom a man holding a child next to a double decker bus. A bird is wading in shallow water by a boat. A dog and some humans in a garage of some sort. A sink in a kitchen with an overhead light on one dog laying down and another dog standing over it A woman on the beach is flying a kite. A street in an Asian country is littered with signs and advertisements. A group of people standing on top of a grass covered field. There are two people standing on the side of a street. a trolly train on a city street at night A young and a woman sitting down outside with a laptop between them. Seven cows are lined up while being milked. A woman riding a horse in a pasture with great caution. A large jetliner flying through a cloudy sky. A young woman drinking from a wine glass A woman standing alone holding a large, white umbrella. Several elephants eat grass and plants by the water. A computer geek's setup of his computer, laptop and various games. 
a cat in a luggage bag in a closet Two zebras, one facing forward, one looking at its mate. A tall church tower under a blue sky filled with white fluffy clouds. a person standing on a beach wit ha dog Old rusted train left out on the train tracks A teddy bear drinking from a pink cup. A pizza with an egg in the middle is on a plate. A country road covered in rain next to a river. A boy holding a kite while standing on a sidewalk. A young woman holding a teddy bear in a room. a person is standing next to a surfboard A clock on the side of a window in a room. Three skiers posing for photo in front of sign. The fire hydrant is in a field near a covered, wooden bridge. A group of young children sitting next to each other. The catcher in a baseball game picks up the ball with his glove. A black dog laying underneath a car in the shade a brown and yellow bathroom with a toilet tub a mirror and a sink The modern bathroom has a glass shower door and cream and brown color scheme. A group of zebras standing beside each other in a grassy area. Large special saddles are used while riding elephants. A living room with a brown wicker couch and ottoman. Three cats sitting on a leopard print bed. A table topped with a tray full of cookies and a vase filled with flowers. a street with cars parked on the side A boy in a red jacket at a bus station. A shower and sink in a small room. Numerous water fowl either taking off or landing in the water A large group of motorcycles stretching into the distance on a highway. Old lady takes a rest from her walker on a sea side bench A hotdog sandwich with sauerkraut, cheese, and mustard. A couple of people with snowboards in the snow. Three people standing on a mountain taking a picture as they ski. b baby doll holding a very big samsung phone A street sign sitting next to a tree. Four women with two children cross the street in a crosswalk. A professional kitchen with metal counter tops with good lighting. 
A gray truck driving past an ATM machine. A couple of horses standing near a road. A person at a table outdoors with a laptop. A pair of hot dogs with toppings next to a drink. Sliced apple in a bowl covered in cinnamon. zebras are walking in a pack on the grass A man in a tan shirt and glasses in a car A large train resting inside a railway station. The hotel room has two large beds, a desk, a flat screen tv and a lot of space. A kitchen area with a dishwasher, stove and microwave. there is a large blue vase that is empty A clock tower on top of a building next to the ocean. Skateboarder doing a high jump down stairs at a competition. A person riding a snowboard on the snow on a sunny day. An indoor bathroom with reflective marble counter tops A bunch of cows enjoying the grass and sunshine. A man flying a kite over a sandy beach. There are benches on the landing where one can sit and enjoy the view of the wooded surroundings A parking lot filled with parked cars in a shopping center.. a close up of a young person wearing a suit and tie A large commercial airplane parked on the runway A man riding a skateboard through the air above a skate park. Several skiers ride down a steep, snowy slope. a toothbrush that is on down on the counter A man is standing under an umbrella in the rain. some people standing around a bright lit up party bus an image of two little kids playing baseball A small red bicycle sits on a hardwood floor. A white plate topped with meat and broccoli. A woman returning a shot at a tennis match. Two surfers are riding two large ocean waves. A large bus parked on a handicapped parking space. A person stopped wearing a yellow jacket riding a motorcycle. Stuffed bear posed "reading" open computer reference book. A dog laying in bed all covered up with the blanket . Old canopied single bed with luxurious linens and curtains A cat sitting on the floor beside a pair of shoes. A bench falls into a crack in the asphalt. 
A man holding a luggage cart in front of an airport. A pile of carrots and other vegetables on a tray. A woman sitting on the couch with her baby. Some very tall giraffes in a big green area eating. A couple who is cutting their wedding cake together. A black cat sitting in a tub licking the faucet. A black and white clock is mounted to a building Computer screen displaying a page of small print. The woman is sitting on the couch watching TV. There is a little girl standing next to a very large pizza A grey and white cat laying in a black wire basket. A child playing with toys in a backyard pool. Two skateboarders performing a trick on a ramp. The man is about ready to cut the cake to share. A vintage bicycle is parked outside a storefront beside a state of the art apparatus. A beautiful young woman holding a tennis racquet on a tennis court. A large bathroom shower has flowers by it. A refrigerator in a basic kitchen with bottles on counter. The man is in the air after jumping on a snowboard. A person stands on skis on a snowy mountain. A green bowl filled with oranges on top of a blue striped table. Teddy bears modeling on a runway with other teddy bears watching. the man is holding on to a firsbee A bicycle is locked up to a post A long city bus pulls away from the curb into traffic. A crowd of people stand in the water on the beach. a woman in a skirt holds a tennis racket an ball the elephants are moving people across the river three people skiing together in a line down a hill A baseball player is holding a baseball bat. A little kid that is touching a fridge. A woman carrying a surfboard into the ocean. A giraffe standing in the grass near trees. a yellow fire hydrogen next to some weeds A black cow eating some grass in a field. a little league player getting ready to throw a ball A person is snowboarding down a snowy slope. A girl plays with a Wii remote in her hand. A female tennis player getting ready to serve. A large bus driving down a city street. 
A white plate topped with meat, veggies and bread. A colorful breakfast omlet with toast on a green plate. Paper umbrellas hangs from the trees for art A man is turning a pizza with a spatula. Three sheep are grazing on the city sidewalk. A small child in a pink dress sitting at a table having cake. a couple of men in cowboy hats near a sheep A chocolate cupcake with a smiling giraffe face. A man in the air on his skateboard doing a trick. a street sign that reads "do not enter" on a quiet street a baseball player holding a baseball bat on a field a street sign showing a one way street A cake with a knife on the table A black kitchen sink with potted plants, toaster oven and knife rack behind it. This black dog is sleeping on a bed with white sheets A man with a brown sweater is playing a WII game. A man standing on a park holding a white frisbee. a group of teens playing at a skateboard park with one doing a jump Two zebras and a giraffe are walking in a park. A large crowd of people and flying kites. Two beach chairs next to an umbrella on the beach. A cat peeking over a tub that it is inside of. A red and silver airplane sitting outside at the airport. Overhead view of a table with a log and food on it. Black capped cranes standing in a zoo enclosure Small child on a skateboard watches another skateboarder. A tennis player holding a racket looking up at the spectators. A man holding an umbrella on top of a bridge. Two plates that have food on a counter. An old green steam engine is on the tracks. The reflection of a red truck in a buildings windows Two well dressed hot dogs are sitting next to fries. A person on a court with a tennis racket. A sink sitting under a mirror and near some cupboards. A person riding a horse and another person petting the horse. A small engine plane sitting on a runway. person walking their dog on sidewalk past cars A bath and a sink in a small room. A skateboarder doing tricks in the air at a park. 
A herd of elephants walking through a grass covered field. Blurry photograph of a cat jumping up from a chair A person that is about to throw a frisbee. some sheep standing in the snow with one looking for food The man squats down while surfing through a wave. A tidy hotel room that has two beds and a flat screen two females dressed in ski attire standing side by side in the snow A body of water with boats floating on top of it. The pizza is next to a bowl of salad. A couple of boats parked at the wharf during the day. Guy in glasses using scissors to cut something in the room. A group of vehicles driving down a city street. A group of athletes are waiting to compete on a playing field. Cars backed up several blocks in traffic on a city street. A small girl is sitting inside an open suitcase. A horse and buggy parked along a sidewalk near a wharf. A little boy wearing a baseball hat holding a baseball bat. a person wearing glasses with a cellphone in their hand. A number of mannequins in a clothing store A couple is on the beach with a small child. a white dog that is looking at a frizbee A very pretty blue city street sign near some trees. A group of men standing around a UK surfboard. a cluster of blue flowers inside an orange peel A person riding skis down a snow covered slope. Two parking meters on roadside and a road sign A sandwich with lots of french fries in a foam container next to a cup of dipping sauce. A dragon boat race with a bunch of people in the boat a boy in a baseball uniform poses for the camera with his bat A group of soccer players on a field Zebras are socializing in a pattern of three by three by one. a kitchen with a table and chairs and a stove A person skateboarding in front of two statues of reclining women. Two women play together in a tennis match. A toy sits at a desk with a beer. The fire hydrant is across the street from a large building. 
two people playing tennis in front of a crowd A skier at the bottom of a slope, among coniferous trees. a giraffe leaning over so it can eat some leaves A man sitting in a chair holding a glass in his hand. very well made meal placed in a bowl Two sandwiches in plastic wrap sitting on a counter. A teddy bear suspended in mid air as it rains down water on it. Two hot dogs on a plate with a cup of coffee. a man goes to hit a tennis ball with a racket World War II vintage fighter plane parked in a museum. Bulldog riding a wakeboard on a body of water. A man flying through the air while riding skis. College dorm room with stack of newspapers, backpack and suitcase near bed. A bathtub sits next a window showing a ferris wheel. A couple relax and watch a wide screen television on the far end of a messy living area. people standing around the table with 3 laptops on it A falcon sitting on a back yard BBQ grill lid A large cow standing in a grass field. a tennis player is doing an overhead serve A giraffe stands by a tree in its habitat at a zoo. A horse stand behind a fence and in front of an old building on a snowy day. Living room of residence with green couches and large bookshelf area. A person on a orange motorbike is on a track. THERE IS A DISPLAY OF DIFFERNT DOUGHNUTS ON THE TABLE A large clock on a cloudy day in the city. A table topped with small and large metal bowls filled with veggies. An airplane is flying high in the cloudy sky Bowls of chopped carrots, onion and lettuce on a turquoise mat. Green traffic light shown against a tall sky scraper at night. Three sheared sheep on grass facing different directions. A dining room and kitchen area with a glass table and gray chairs. A train that is sitting on the tracks. The black and white photo shows a toilet and a bathroom sink. A round bed with lots of pillows next to a cat bed. The three zebra are walking down the road. A plate with two slices of pizza on it with toppings. 
Quick Stop Groceries has many things besides groceries THERE IS A RED STOP SIGN ON THE DOOR A mom giraffe escorting her newborn around a fenced in area. A guy is eating a huge slice of cheese pizza. A bird walking past a white car in a lot A man feeding a brown spotted giraffe over a fence. A dark colored beverage in a tall glass and a small bow of food on a table. A hipster standing between two surfboards while wearing sunglasses.. A toaster oven and dish drying rack sit on the kitchen sink counter. Boats docked by a couple of city buildings. The hula girl doll sits on top of the car dash. Snow piled up on and around a fire hydrant by a fence. A man and woman set a formal dining table. two trains on train tracks at a train station baseball players in motion playing in a stadium A man is kneeling down in front of five surfboards. The little boy is petting the giraffe whose head come over the zoo enclosure. A hamburger sitting on top of a tray on tissue paper. Cars riding on the street across a train on the tracks A empty living room that has a table in the center. a work desk with display with graphs, notebooks, and keyboard A little boy and a little girl laying on a cat shaped beany bed. An airplane with four engines is on a runway. A brown curly haired dog chasing after a red frisbee A woman is swinging at a tennis ball on a court. the baseball pitcher getting ready to pitch the ball A school safety sign lies against a piano. two people in a living area playing with a dog wearing a cowboy hat. A boy enjoying some sandwich or donut during the day. A residential bathroom with sink, toilet and curtained tub a living room with a couch a window and a lamp A close up photo of a baby giraffe standing in the hay. Some hooks that are holding hot pads, a ladle, and a pair of scissors. A man is leaning out of a train. A group of people enjoying a meal at a table. A bathroom with a toilet, a sink, and a bathtub. 
people walking on the sand of a beach shoreline beneath flying kites. a woman talks on a phone in front of a slide out glass A small framed picture is hanging above the toilet Two military men riding horse in the water along the shoreline of the beach A woman walking down the street with an umbrella Some highly cultural objects on display in this well lit room. A man plays an organ in an historic photo. A little girl with a snack laughing on a bench. A man hitching a ride on an elephant. A variety of old motorcycles on display in a shop An open toilet seat next to a urinal. A close-up of pink and red flowers in a clear vase. A black and white photo of men shoveling rocks. A group of people on top of some horses. Three giraffes in a field with a fence A traffic sign is displayed on a street. A man using one hand to hold a skate board while performing a handstand Two female tennis players on a grass court. A broken park bench in the middle of a grassy lawn. Sugar donuts sitting in a white paper bag. A man sitting on a park bench next to a person laying on it with a dog. A row of auto-flush urinals lines the wall in this public restroom. A baseball player holds the bat while the catcher and the umpire stand behind him on a baseball diamond. A gray and black cell phone resting in a man's left hand. A lonely sheep standing in a field in front of a rock wall. A man and a woman riding on a motorcycle are getting ready to hit the road. Two horses gaze out from among the trees. A man sitting on a couch holding a small white object. A bus going down the area next to the ocean. Guy with shades on taking picture of his hot dog The skier is skier down the snow covered hill. A large bathroom with a large bathtub in front of curtained windows. A bunch of people walking around in a street a guy riding his skateboard near the edge of a pool A mascot entertains fans as baseball players leave the field. Several motor scooters are jammed into a small market street. 
A microwave sits above a stove built into the cabinets. A bus going to Oakland in an empty lot. People watch a women's softball game from behind a chain link fence. two girls standing outside a building next to a large toothbrush statue A young man sitting on a toilet in a white bathroom. There are two snowboarders in the air completing stunts. The image shows a book digitally modified onto a tennis racket. A zebra eating hay scattered on the ground while another zebra lays in the shade. A man in an orange t-shirt rides a wave on his surfboard. A woman in a window either taking a picture or video taping something outside. A skier holds a ski pole in each hand. A large commercial plane sitting on a tarmac. a man that is on a tennis court with a racket A person and some animals that are by some plants. A baby pulling themselves up to look at a laptop. Trays of snacks and a bottle of wine. Four guys are sitting around a table eating and drinking. A slice of cheese pizza on a plate with parmasean cheese on the crust. A cupcake covered in lots of white frosting. a picture that's been sped up to show streaks of headlights and taillights A young black man sitting on a skateboard on a basketball court. A train is on rails over the ocean by a pier. A LARGE TRASH CAN IN THE SHAPE OF A SOUP CAN IS ON A STREET THERE IS A PLATE WITH SWEET DESSERTS ON THE PLATE a woman in a pink top holding a cellphone and a few other people A blue Hospital sign with an arrow pointing towards the Hospital. A man riding a motorcycle down a curvy mountainous road. A giraffe standing in a valley of two small hills young boy with surf board in hand walking out to the water an image of a man with a tennis racket in hand The meals are ready in their individual containers. three yellow buses line up on the street A surfer rides a medium sized wave on the beach. Boy doing skateboard trick in air at a skate park. A person riding on the back of a horse drawn carriage on a beach. 
A large smart phone made the the NOKIA company. an image on the table with apples and oatmeal A cat is looking at itself in the mirror on the floor. A Frisbee team on a field being happy. A refrigerator with a variation of different magnets and photographs on the doors. A pond with lilypads and a frisbee floating in it. military jets being prepared for a mission A picture of some people posing for a picture. two people sitting at a table with laptops A group of young men standing around playing games on the Nintendo Wii. an image of a stop sign and yield sign A red trolley train is going down the tracks. a small boy is playing with a remote control The kitchen is has a stainless steal refrigerator. a vintage photo of a bike parked next to a store A giraffe about to eat leaves from a tree A kitchen with a stove and a microwave. A young woman in a red skirt is waiting on a train platform with her suitcase. Several horses that are grazing in a field. A bathroom with a marble bathtub and a large sink. A parking meter with two cars parked beside it. Giraffe stretching its neck out to reach green leaves on a pole. The large red city bus drives on a brick street. A man riding a skateboard up the side of a ramp. a person riding a snow board on a snowy surface A street sign that is on the side of the road. a woman holding onto a container as she eats a donut A kitten peeking out from of a pile of white blankets. Three zebras standing in grass with bushes and trees. Two young girls in uniforms sitting closely together. A small boat is in the water and a red bench in on the dock next to it. some giraffes standing in front of a white building THERE IS A PASTRY THAT IS SITTING ON A PLATE A cat drinking coffee from a cup on top of a table. A plate covered with a meat and vegetable dish a man performs a flip trick on a skateboard A sandwich and french fries are on a plate. A living room with wooden walls and furniture. 
four giraffes basking in sunlight of enclosed area a close up of a pot cooking broccoli A herd of cows standing in a field grazing a group of people standing around a table covered with different containers of food A boy is cutting a string with scissors. A close up of an "all traffic" sign on the freeway. A large jetliner sitting on top of an airport tarmac. A man posing with a mouse and keyboard an image of kids playing on skateboards in the street A woman attempts to fly a butterfly kite. Two boys and their mother playing with a kite. A man on a couch playing a video game. a close up of a white toilet and trash can Two teddy bears with a price tag on ear. A pan of food on the stove consisting of sliced carrots this train is leaving the station on rails A jet airliner sits in front of the runway. Two men play a game together using the Nintendo Wii gaming system. A couple of kids in skinny jeans with skateboards. An open, lit-up, fully stocked refrigerator and freezer. People are getting off of a large bus onto a commercial airplane. A photograph of a white range and oven. A beach with people, beach chairs and umbrellas. A woman plays Wii while a man holds a martini glass beside her. A red firetruck is on a street near a brick building. A group of children with two of them brushing their teeth. A MUNI bus in San Francisco, parked next to a fountain. some people and a red and black train engine A surfer rides a wave in this Michael L. Baird photo. a tennis player swinging a racket at a ball A man wearing a medieval style helmet sits atop his motorcycle. A person is holding a flag in a gathering Many people near a river gathering around in a circle A couple of young guys at a skate ramp with their boards. A pair of kites fly above a statue. A person balances a large scale full of goods. A blurry picture of a man with black hair wearing a suit and tie. A man is walking out of a pizza restaurant with his pizza A town square with many pedestrians walking about. 
a bathroom with a blue dustpan and broom on the floor A cat sits under an umbrella while indoors. Two skewers of vegetables and broccoli on a plate A large inflatable whale sitting on top of a beach. a full view of view of a zebra and a head shot of another zebra A vase with flowers is displayed next to a handmade object. A woman serves a ball with a clock in the background. A man and women at a table eating, there is a baby in a stroller behind them. A man adjusts his tie while getting ready to go. an apple sticking out ot the side of an apple a kitchen with a refrigerator and a stove A young woman is brushing her teeth at the sink. A man pitching a baseball from a mound on a field. a cat is on the coach staring at a remote control a cat sitting on the toilet looking at one on the floor A bunch of pedestrians walk down the street in the rain Keepers looking after a family of elephants at the zoo A red truck has a black dog in the drivers chair. Three people sitting on bench watching a train go by a white train is coming down some tracks Four people on a ski slope preparing to ski. A male tennis player gets ready to serve. A road sign by a stone wall and dirt path. A sea lion on the rocks with an elephant's head photoshopped on it. Two men sitting at a table with a pizza in front of them. Sun shining through a window into a bathroom. A person on a motorcycle riding down a street. Two dogs sitting at a dinner table enjoy food in bowls. A sign, on a sidewalk, containing directions to nearby locations. Several different donuts are placed in a tall bowl a person on a skate board tries to do a trick White commuter airplane with blue tail on an airport runway. A train is parked by the sidewalk in the city. A man doing tricks on a skateboard with onlookers watching A person surfing on a wave in the water on a surf board. A woman in a towel combs her wet hair. A bench in a park covered in snow. The vase is decorated with a colorful design. 
Lady teach a class and uses her laptop A woman is playing a game of tennis. A baseball player swinging a baseball bat at a baseball. The fireplug is the dominant element with the architecturally interesting building in the background. A policeman on a horse is standing across the street from a building. A bathroom has several items next to a sink and on top of a medicine cabinet with a door opened up against a glass-walled shower. There are various utensils on the counter of a large kitchen. a plate containing a bag and several pastries A stainless steel kitchen sink on a black granite countertop. A paper plate with a very large sandwich with a lot of condiments on it. a cat rests its head and paw on a pair of womens shoes A cup is on a desk with a dog figurine. Baby at her first birthday party feeding dad cake. A living area with several chairs and a lot of color Two children's miniature trains with conductor and mother and child. A horse is shown behind fences in a field. A white refrigerator freezer sitting inside of a kitchen. A man in black shirt holding a yellow frisbee by rocks. A man in blue shirt touching a cake with a utensil. The woman in a pink shirt looks at the kite in the sky. A cake with thick icing partially eaten with a knife. Up close view of a plate with two well cooked hot dogs on it. Clock and sign on church tower made of bricks Two lanes of cars waiting at a traffic light. A herd of cows grazing on a hill by the road. A plate with food on it and an orange with a fork on the plate. A small train with writing all over it passes through an intersection. A yellow truck next to various cars in a warehouse. A miniature bathroom set is shown for a model A batter is throwing a bat and getting ready to run. A hot dog on a bun covered in lots of pastrami and a pickle. A group of people by a bunch of bananas. a boy and a woman in a competition with a motor bike A baby chews on a toy with a cat pinned under his leg. 
The man is preparing to pitch the ball. A man preparing food on top of a large metal pan. A bunch of items that are on a table. A skier goes cross country consulting a sign A sunset scene with water, elephants and grass. Two small sheep, one standing and one sitting, in a grassy field gray and white cat hiding underneath a toilet a close up of a number of zebras behind a fence A baseball batter, catcher, and umpire await a pitch at home plate. A young man carries a black backpack and a blue suitcase. A cat lies on its back on top of a table with pink roses. A living room with a couch, a chair and a piano. An open oven has lots dishes on the racks. A statue of a jalapeno on a fire hydrant. A bus is passing through a city intersection. Two elephants standing near a small pool of water. A brown cloth covered table filled with stuffed animals. A railing in front of the beach with surfboards leaning on it. Two zebras looking for food near a tree. Two guys are playing some sort of video games. Many televisions are showing the same sunset picture. There is a person looking at the contents of a refrigerator. A woman cutting a mans hair in a barbers chair. A boy operating a mouse and viewing a laptop. An apple laptop with pens, headphones, books and various small items. A skier carves a path as they descend a snowy slope Children playing in a soccer competition on a grass field. Small girl eating pizza off a colorful plate on a blue table. A toilet in a restroom with a wooden toilet seat. An orange on a counter next to a bottle of alcohol. Bed and nightstand with blinds closed and doll sitting on pillow. a kitchen with a brown dining table set and a potted plant on the counter Living room with half circle window and furniture. a table with two glasses and a plate with a chocolate dessert and a spoon A view of a gourmet style banana split. a couple of zebras are standing in a gassy field A desk has picutres, cds, cups, and a dog figurine. 
A man jumping in the air with a skateboard A large outdoor clock with two faces and various designs and numbers on the faces. A little girl and woman standing near a birthday with lighted candles. a train on a train track near a small river A man with a bright green tie with his arms around to boys. a person snows boarding on top of a small hill A vase with flowers, cup, pitcher and mug sitting on a table a woman holds out a stuffed bear to a man in a suit people standing and windsurfing on boards in the water with trees in the background A group of people flying lots of kites in a large grassy park area. A colorful bus stops at a bus stop. A man riding a wave on top of a surfboard. A giraffe picture on box with some pizza. A couple of multi-colored lawn chairs sitting on a beach. A group of baseball players playing a game of baseball. A man posing with a horse in the shade two elephants are in a field together eating We are looking at a delicious plate of banana walnut pancakes. a public transit bus on a city street A bartender filling a long row of champagne flutes Several stacks of disposable cups sit in a kitchen. Assortment of toothbrushes in ceramic container in corner of counter in bathroom. A machine that dispenses tickets for some mode of transportation. A baseball player holding a bat standing next to a base. A group of sheep surrounded by three dogs. Two people touch feet while sitting in chairs. A cat sitting on top of a book shelf filled with books. A person holding an open mobile phone and a camera. 2 people outside on a snowy area snow boarding A man sitting on a ledge reading a book. A cat is standing on the back of a huge dog. Warning sign displayed in wooded lane on sunny day. Fresh fruits, vegetables, and other foods are spread out on the table. A meter with a sign on it stating that the meter remains as a courtesy to cyclists A city bus moving down a city street on the sidewalk nearby Two cows standing in a penned pasture near a log. 
A person does a trick on a skateboard in black and white. A zebra standing amongst tall, dry grass during the day A plate of food that is on a table. Gloves, cell phones, brushes, ties, and ear buds are placed on the floor. A fork perched into shredded meat on the bread on the table. Brick fireplace in a white and brown living room. A person riding a snowboard on a snow covered slope. Four pieces of luggage sits on the floor. A man flying through the air while riding a snowboard. Black and white photograph looking past traffic lights at an old building A girl standing in a boat resting her arm on an elephant who is passing by. A statue of a dinosaur, next to a bunch of flying kites. A baseball player tries to avoid a tag out play. A man walking down a road holding a black umbrella. a close up of a small dog near a pair of shoes Guy in hoodie peeing in a bathroom toilet A parking lot with cars and motorcycles at walmart. A table topped different plates and bowls of foods. The goose is curious about whats in the bucket. A very tall tower sticking out of the side of a building. A person that threw a frisbee in the air. a man with white and blue on playing tennis A dog is looking out a large window. a woman trying to fly a kite with no wind A bathroom with bathroom supplies is pictured in this image. The man in black is moving towards a refrigerator. An arrangement of food is displayed on a table. a couple of cows are standing in a field knife cuts into a medium sized pizza on a plate A man sitting on the hood of a car talking on a cell phone. a hotel room with a bed, chair and a window there are many benches that line this park A commercial airplane being pulled across the runway by a truck. Two baseball teams of young children playing baseball on a dirt field. a woman and child checking out a display of food on an outdoor table A truck in the street near a person on the side of the road Toilet design outside of the US with accompanying trash can. 
A person and a dog with frisbees in a park. A young boy is flying a kite in a park. Multiple trains sit on tracks that run through the city. A wooden carved clock tower with posts holding it up. some people and the male is holding a baseball bat A small kitchen with dark wood cabinets and white appliances. A pick-up truck with a Christmas wreath attached to the grill. A refrigerator door left open showing the contents inside. A red fire hydrant between two flower boxes Several square pizzas are sitting on round plates. A plate filled with fruit salad and a melted cheese sandwich. A baseball player prepares to swing at a pitch. A woman sitting on top of a purple motorcycle. A coffee cup sits next to an open computer. Rusted fire hydrant covered with bees in grass near road. A workspace with a laptop computer and desktop computer. A baseball player is swinging at a pitched ball. A plate of colorful vegetables and a cut of meat. An orange fire hydrant sitting below a tall building. A chef is cooking food in the kitchen. The zebras are grazing on the grass in the field. A person is holding a fork with pancakes on a plate. A man and boy in dirt field playing a game with frisbee. A man poses for a picture in a suit and tie. A cat drinking water from a bathroom sink. A train pumps out steam while going down a track on a cloudy day. A pitcher, a catcher, and a man up to bat. a photo of a kitchen with a fridge, an oven and a sink a flooded street with a street pole five giraffes drinking water with a field behind them A woman holding a tennis ball and racquet on a court. a person holding onto a banana with brown spots a sign attached to a metal pole sitting in the grass A couple of trains that are riding in the rails. People taking photos of a public speaker with their telephones A snowboarder performing a stunt on a snowy mountain. Three women are standing in a kitchen cooking. A jet flying in the sky surrounded by smoke. 
A living room with hard wood floors covered in furniture. A close up of a pizza with spinach and parmesan topping. A girl in a green shirt and denim skirt cutting a cake. This is a staggering picture show of people having a remarkable time. A close-up of a green apple next to other fruits. Several cows in a field with a train passing in the background. A street sweeper driving down a city street A young giraffe and an old giraffe outside of a building. Five uniformed players are on a baseball field near a crowd. A cat in a bow tie laying under a car. A desk full of desktop and laptop computers. A group of people on a horse carriage ride going down a street. A very close up look at a plate with some food on it. A plate of food that includes beef, broccoli and sauces. A large bird flying next to a tall building. A herd of cattle grazing on a lush green field. A frisbee in mid air with a someone below jumping. a giraffe outside near a forested area and a lot of trees an image of several giraffes in a zoo four planes flying in the sky in a formation A bathroom with a toilet and a counter next to a door. Cat laying on the floor near some books A man in a wetsuit riding on a surfboard Boxes filled with donuts sitting on top of a table. A small and large giraffe are by a tall fence. Young men playing frisbee in a grassy field. An older black desktop computer running Windows operating system. A man is catching a white Frisbee on the beach. Several images of a surfer in various phases of going out for a wave. A vase of flowers are placed on a long table. a person in snow gear walking through some deep snow The sign for Spring St. and 6th Ave. is in front of a brick building. A couple of zebras are in a brushy field. a male in a light blue shirt and a white frisbee some people on a bank flying kites and water A boat is coming down the water near the shore. a couple of pelicans sitting on some rocks A family is playing with the Wii together in the living room. 
Three giraffes are standing in the field spread apart. A couple of women chasing after a frisbee on a field. A plate of cheese bread next to bread sticks and wine. A street that has a bunch of cars and trucks. A pizza that is setting down on a table. Someone's hand on top of a computer's keyboard. A man in the progress of getting ready for a wedding. A black horse standing inside a fenced enclosure. Hands typing on light colored electronic computer keyboard. Two beds with a nightstand in between them. A man wearing a tie holding his suit jacket over his shoulder TV in a cabinet with other furnishings around it Man sitting on the side of a van playing the guitar. some elephants in their pen and in some water A wooden table topped with plates of food and fruit. A brown bear is walking in the woods by some bushes and trees. A kitchen with a refrigerator and some cabinets a man and woman are sitting at a table with their food A man in wetsuit surgin on surfboard next to wave. A white bathroom sink sitting under a mirror. A grill holds meat and a wide assortment of vegetables. A group of women eating at a dinner table and conversating. A palm tree in front of a poster. In a park, a man in a dress shirt sits on a rock. The boy is skateboarding up the ramp during the day. A laptop computer sitting on a desk in front of a window. Commode scene, probably commercial establishment, outside of USA. Girl with cake in hand looking at lit candles on it A living room with gold walls has a playpen and mounted television. A large cut pizza on a table with a laptop. A cat sitting on top of a television looking down. The toilet has special buttons that help the handicapped. Several people film and observe children as they use iPads at school. A woman swinging at a tennis ball on the court a guy jumping with a skateboard on a sunny day A mix of beef and broccoli covers rice. Four powder covered donuts on a blue plate. Sheep grazing in an open grassy field. 
A woman sticking her tongue out and doing the "shocker" hand sign. three muffins sitting on a chair with a bite out on one of them A tennis player with racket serving the ball a couple of kids that are playing on the ground A tray covered in chocolate donuts on top of a table. Several cops on motorcycles parked next to a large group of people. A close up of a clear vase with flowers. A group of people throw a frisbee in a circle. A wooden bench sitting on top of a dirt field. An alarm clock next to two people sleeping and a pillow. A laptop on a pedestal near a hedge. A taco salad sits on white paper near a table with a lap top. Teddy bears with barcode tags in a pile. A cramped bathroom with a yellow bowl on the back of the tank. a very large collection of remote controls spread out A floor with lots of different items and a bag. The motorcycle is parked on the side of a road near snowy mountains. A line of hawks wearing hoods on a wooden beam. a hot dog with onions and cheese next to some french fries A low angle shot of Big Ben in the daytime. a woman looking up at a banana tree. A motorcycle police officer leads a parade on a sunny day. A counter cluttered with many items, including a tea kettle, a pot, a food scale and more. The city bus is driving through a street intersection. Two woman sitting at a table eating food A wild trail with elephants and jeeps driving down a path. The boy eats his large breakfast at the table A market has an array of fruits displayed in boxes. a pair of scissors and eggs laying on a table a close up of a person pulling food out of an oven Two women in front of a television playing a video game. A green, red and blue bus parked on a street in a foreign country. He is eating a banana while taking a selfie. A couple of benches sitting next to each other. a busy street that has a lot of cars in it a person taking a photo in a mirror A few people are laying on a pull out sofa bed. 
Vintage motorcycles sit on a tiled floor way in a shop A baseball player is getting ready to hit a ball. A piece of cake sits atop a piece of foil. A woman prepares to hit a tennis ball on a tennis court. A snow boarder taking flight while skiing down a slope A baseball pitcher in motion with the ball right out of his fingers. A big screen TV and a Wii gaming console on a rooftop. A large truck and a bus on a road. Police officer on horse moving through city street. a couple of horses that are tied up a clock on a tower next to a building an old photo of some people in fancy clothes sitting on a boat A woman riding a aqua blue wave on a surfboard. a red city bus coming through an intersection A bowl has a salad with carrots, red cabbage, and broccoli in it. A man soaked walking out of the water holding a surfboard. A glass bottle on a red surface with a red backdrop A bowl filled with mixed cooked green vegetables. an extremely long hot dog covered with ketchup and mustard sitting on a table A man with a dog is preparing to board a train with others. The people in the homemade boat have a bicycle and a big green umbrella. A donut sitting in front of a laptop with black and orange sprinkles a close up of a hot dog next to a drink on a table A man standing outside beside a bunch of fruit. two apples and a banana laid out to look like a happy face An asian woman smiling while holding a cell phone. a young broccoli plant in a garden bed a train moving on the tracks next to a building on a hill Various sizes and colors of tagged and bundled luggage. A bride and groom cut the cake at their reception. a lady on the mountains in very warm clothing A guy leaning on the front of a food truck Odd plant in a vase on a tray with cookies. A view of home plate and to left field during a baseball game. An adult elephant walks near two smaller elephants. The floor of the bathroom is strewn with toilet paper. Person of a surfboard riding a wave in the ocean. 
Two bikes are sitting in the sand on the beach. A cat perched on a toilet using the bathroom. Two people in cowboy hats riding bicycles in an RV park A jockey rides a horse through a course. a blue and gray bus and a woman and buildings A person holding a tennis racket and ball getting ready to serve. a large bus riding in the street outside a building two little teddy bears with peoples names in tags a black bird flying above the water of the ocean A person riding a racing bike on a track with spectators. A surfboarder falling off his board as a wave hits A batter, catcher and umpire in a baseball game. A herd of sheep standing on top of a grass covered field. A group of kids at a table with a cake. A teenager doing a skateboard trick in front of a crowd. A small red bird perched on a branch. A man swinging a tennis racquet on top of a court. A woman sits on a bike holding a small gun as a man lies in front of her. many kites flying in the sky with a street light A red food truck has a crowd of people by it. An old model motorcycle parked outside a house. A bathroom with a glass shower door, toilet, bidet and sink, with a set of shelves A bed and a mirror in a small room. some children are in a yard and one has a dog on a leash Several horses running down the track near a fence. A person is displaying the hot dog they are eating. Two bears playfully fight and nip at each other. A busy city street has many red double decked buses on it. Skate boarder performing aerial trick on sidewalk with car nearby on roadway. A passenger jet that is on the runway. someone is holding in their hands a very old mp3 player A group of men on a field playing baseball. A cow in a fenced in grass area. This truck has two yellow ribbons and says Freedom isn 't Free. two people playing on the ocean with a frisbee A man on a court with a tennis racket. The zebra is drinking water from the pond near the grass. A white plate with two crab cakes and fries. 
A woman's feet who is wearing a pair of red heels. The painting shows a parrot sitting on a branch over a river. A small child poses in his baseball uniform A fire hydrant in a weedy lot next to a street. A woman is riding a moped on the road. A stack of folded shirts sits in a darkened room. A small group of cows are grazing out in the pasture. Black and white photograph of a train at the station People are standing in a street car covered in oranges. A brown bear licking the ear of another brown bear. A man wearing a beret while using a laptop computer. A little girl packs her luggage with toys. a bathroom with a sink and a mirror in it Woman placing a dog on a white and yellow surfboard. a small brown and white bird eating off paper plates on a table A couch and a coffee table is in a living room with a wooden floor. A teenage male is falling off of a skateboard. A couple of buses parked in a parking lot. a desk has a laptop computer and monitor on it A computer is sitting on a messy desk with flowers. A large living room has a mini kitchen in the corner. A building with a clock built into it. A dessert consists of donuts and custard cream. A slice of cake and strawberries on white plate. there is a cow along with baby cows behind a gate A pizza on a rack and a plate with noodles. Pair of giraffes foraging in natural outdoor setting. A glass filled with pens and scissors and pencils. A black cat laying inside a bathroom sink. a white van is on the back of a truck A wooden chair sitting on a sidewalk next to a tree. A slice of pizza on a white ceramic plate. Orchids are arranged in a glass bowl with table accents around. a man that is standing under a tree A painting of waves upon an ocean with tall grass and gold flowers Luggage at an airport under a blue net a very black dog lying on a courch A girl with pale skin wearing a hoodie holds up a toothbrush. A close of a fire hydrant painted red white and green A giraffe and several zebras out on the plains. 
A large tower with a clock stands in front of the cloudy sky. a number of people in an open field with kites flying above Skier in the air on fresh powder snow. Elephants are bathing in a river with three men. two large air planes on a run way a little boy is holding up a cell phone A faded yellow and red train passes through the trees. A street scene with many cars and a bus. Three surfers stand in front of a wall facing the ocean. A young man standing on top of a field holding a baseball bat. A man sitting at a table with a laptop and looking off to the side. A cute little animal made out of oranges that is on a plate. There are tombstones in the cemetery next to an old church. Two stuffed animals sitting beside each other on a chair. A large truck is shown in a rear view mirror. a fire hydrant near a tree in a field A dozen of glazed donuts in a white box. a zebra grazes on some vegetation next to a fence A car is driving down a city street. BOY TAKING A GIANT LEAP ON A SKATEBOARD IN FRONT OF ONLOOKERS A man and woman on beach with three surfboards. A man surfing, with a vegetated coast in the background. A large circular clock near a body of water. A tall stop sign next to the road near a red fire hydrant. A young girl is holding the reins to a small horse. A man on a skateboard with his friend talking to people. A bear eating a piece of food in rocky area with hay. There were a flock of sheep walking down the road together. A wedding cake and cupcakes on a table with knife. A table has some old fashioned computer type equipment on it. A desk with a laptop and desktop computer. A man with a baseball bat that is standing in the dirt. A hotel room with a neatly made bad and lamps on the bed stands. A modernly styled hotel room has a bed that appears to float off the floor. A couple of bannanas and cards for sale A herd of horses standing in a dirt horse coral. A Not A Thru Street signed hung up on a tree. 
a blue chair is in front of a desk A woman preparing to serve a ball thrown high in the air. A toilet and a urinal with male and female signs. a person wearing pants surfing on a white board A grassy field with different colored umbrellas on the grass. A outdoor cafe with many people chatting and eating. a group of young people getting ready to go ski A large group of people are on a field flying kites. a dog sitting in a truck with its head out the window a cat laying on top or s shelf in front of a window A bunch of wooden desks sitting inside a classroom A man and woman stand holding tennis rackets with a young boy. a large air plane flying in the sky A kitchen area with dining table, refrigerator and sink. A wood plate with several yellow rolls on it. Elephants are drinking water from a small pond. a close up of a dog laying under a table A ginger cat lounges comfortably on a bed. A man standing behind a camera on a grass covered field. A couple of men kayaking in a flooded park area through a gate. A woman scratching flakes of fecal matter off of her buttocks. A number of wine glasses and a cup on a tray A bicyclist speaks to two police officers on horseback. A construction worker standing on dirt near a fire hydrant. A young child in a field of grass holding a baseball bat. Weightplate with me investable sitting on top of the table. Three colored toothbrushes standing in a glass holder. Cuff links on the sleeve of a man wearing a business suit Two giraffes are standing under a tree back to back. Fresh fruits are stacked and arranged in colorful rows. A table with plates, silverware and an electric grill. The ski jumper is concentrating intensely on his target. A man is walking while using his cellphone. A man riding a surfboard in the water A person on a snowboard rides on the hill. This is some fine dining courses on nice plates. A person inside of a house using a computer Horses and goats are grazing on the open terrain. 
Two boys in jackets and hats ride horses together. A person has fallen off a surfboard near a large wave The young boy is playing in a baseball game, Man on skis on a downhill course after a fall. A rainbow siting below a lot of clouds near a field. A person holding a controller pets his cat. A white toilet and broken mirror in a side yard. A man riding a wave on a surfboard in the ocean. Zebra grazing on grass in outdoor enclosed area. A sea plane taxis across the water in a large lake. A microwave oven sitting on top of a counter. Two pillows on a bed next to a window. Stuffed toy bears on display through window setting. A white toilet sitting in a restroom with a open lid. A dog is laying on the ground with a frisbie. Man with a backpack using a urinal with against a tiled wall. A trio of images of food including bell peppers, watermelon, milk, and chopped meat A close of up oranges with people standing around fruit stands. there is a surfer that can be seen in the water This is a cut up potato on a cutting board with a knife on top of it. A baseball player swings his bat after a hit. A baseball game in progress with the umpire calling a play. A man standing in a room with something in his hand An adult goat standing beside its baby goats in a grassy area. a teddy bear sitting on a wall next to an old stone house. Several skiers congregate around a slope at a ski resort. Little boy looking out over a calm body of water A young child is asleep next to her mother. a hummingbird eating from a little bird feeder Quesadilla for breakfast with a friend at a restaurant A bird is posted on a rock by a lake. An empty roadway between two rows of buildings. Two children stand on a porch with toy tennis rackets. A cake depicts a laptop, mouse, and latte. A blue bus waiting for passengers at a stop. A kitchen with multiple counters and various appliances. A brown bear pup running across a grassy area. The men are celebrating at a formal dinner with one wearing a paper crown. 
A smoothie is pictured next to several fruits and vegetables. a group of navy jets slying together in a line The huge airliner is flying next to the clouds. A blurry image of a man in a room full of pots on tables. A bunch of people with some wearing headscarves are flying kites and pulling a panda bear balloon. A kite laying on the ground surrounded by people. A baseball player in a white uniform holds a bat over his shoulder as he stands near an umpire and a catcher. A desktop computer sitting on top of a wooden desk. A young girl looks through the eye holes in a pizza. A mirrored bathroom with a good hair dryer. Steak and crab cakes served with grilled peaches. A professional skateboarder leaps over a bunch of over skateboards while a crowd watches. A variety of Asian foods sit on a table. A girl standing under a white and black umbrella. A man is riding a surfboard in the ocean. A surfboard stored on a rack at the beach with people in the background. A man leaning on a pole on a sidewalk in front of a store. A street sign sits at an intersection near a store. A red train traveling past a three story building. a bathroom filled with a sink, toilet and hardwood floors A few trucks at night with their headlights on A skateboarder takes a leisurely run down a city sidewalk. A boy riding a skateboard on a sidewalk in an open courtyard. A cat playing with a cup that is on the floor A man is covered with four cats in bed. a clown, teddy bear and troll doll for sale in a store. someone rolls a pizza cutter over a small pizza A wireless computer mouse with a computer in the background. A child reading a book next to a dog that's lying on the ground. There is a man sitting on the couch next to a woman but he has three neck ties on. Three lamb in a pen, some of which have been sheered. two young people playing in a house one is posing with a stick. Man sitting at a picnic table near the beach with his lunch. 
A cat on a leather chair next to remotes A camera and tripod is shown with a laptop. Hundreds of birds soaring through a cloudy sky. A jumbo jet is just taking off form the runway. A woman puts something into a stone oven. A airplane that is flying over a runway. A child flying a kite on a sunny day. A counter and refrigerator in a small kitchen. Surf boaders preparing to head into the ocean. An airplane is on a runway near a passenger ramp. A food entree is served on a plate. There are different types of Italian food in the picture. A man leaping to hit a tennis ball with a racket. A beautiful woman inspecting a small brown dog. A fancy bathroom with clear shower, toilet, and mirror Two boats floating on top of a river next to a rock mountain Red pickup truck carrying a sign it its truck bed. A set of windows with a red farm house in the view and green grass on the ground. A woman sitting at a table with a plate. A soldier dressed in white on top of skis. A young elephant by a pool of water in a zoo enclosure. a split picture of two tennis players swinging at the ball A police officer and police horse directing traffic. a black keyboard and a power strip and cords A young woman looks over her shoulder as a sky lift takes her down the mountain. a person riding a wave on top of a surfboard. Four men playing with remote controlled dog toys. A man teaching his child how to ride a skate board Baseball player at the plate in the process of swinging at a ball. A t-shirt has been put onto a stuffed bear Small cat sitting on top of a table looking at a television. A cat with a collar sitting on a laptop keyboard. Two square pizzas sitting a grill with cheese. A baseball player is starting to run to first base. Four people who are all wearing snow skis. Two kids laying down propped up on pillows. this giraffe is going for a walk in the grass A man is surfing on a surfboard, catching a big wave. The purple and pink flowers are in a vase. 
Two baseball players walk near another player from the opposing team. A person performs a jump in the air on a snowboard. a close up of a plate of pancakes on a table A giraffe standing next to a wooden pike fence A woman is posing in front of a giraffe. A woman sitting on a couch near a dog A very sleek, clean and dark modern kitchen. a baseball player is running down a field A person on a snowboard in the snow. A plate on a table is filled with carrots and beans. a toilet on the ground outdoors near a bath tub The contents of a purse are on a table. A vodafone sitting on a table next to a Mac laptop. A mismatched bathroom includes a center shower pan. A white toilet sitting next to a window and a sink. Three giraffe standing next to a fence under a lot of trees. There is no image here to provide a caption for. A table holding a group of fruits and vegetables in bags and crates. A tour bus with advertisement on the side of it People sitting at a table with multiple servings of food. a woman in a black top on a motorcycle and a male on a bicycle a red plate that has a piece of chicken with some veggies on it birds standing on the edge of the ledge by the water a large clock on the wall above a radiator many people riding skis on a snowy slope A hat that is on top of a shelf. A keyboard and mouse on the ground in a room. A woman holding a suitcase on a dirt road. A couple present a birthday cake with three candles. 5 very people posing for the camera over some drinks. a bunch of motorcycles are parked tightly together Two monks with umbrella standing on a pavement A man carrying a surf board out of the ocean. A nurse administering medicine to a patient in a hospital. A white plate that has various types of vegetables, meat and food items on it. A set of cutlery and personal items lined up on a table. An orange kitten on the green couch by itself. 
Skier in red jacket stands on top of a large mountain placemats are on top of a counter in this kitchen there is a man sitting in his truck next to a surf board two trains on a train track at a train station A elephant standing next to two men near a stand. The bathroom is clean and ready to be used. A railroad train letting off a big black smokecloud Two birds stand beside each other outside a green door. A person standing in the snow near a snow board. A single engine plane out front of buildings close up of a red vase holding sticks A old picture of workers building the railroad. A large leafy green salad in a silver bowl. A pre-made cold sandwich is in a cooler with drinks. A man is on a saddled horse with reins. A woman sitting on top of a bench with large breast. A couple of street signs that got wrecked from a car accident. A horse with a cover over it being carried along by a woman. A woman that is sitting outside on a bench in the snow. Two giraffes standing on a grassy plain with mountains in the background. Group of folks playing bowling on Wii sports A person goes down the slope covered with snow. A PICTURE OF A BASEBALL PLAYER PLAYING BASEBALL A table topped with vases filled with flowers. Three sheep are grazing on grass by trees. A man standing in falling snow at night holding on to a snowboard. A man running on beach with a surfboard and mountain in the background. A woman sitting on a white bench with her dog. A wooden table topped with plates of food and drinks. A close-up photo of a propeller plane in flight. A baseball player is getting ready to swing the bat on home base. An accordian sitting on a toilet in a bathroom. A child holding a teddy bear while outside. A train that is driving by in the day. A baseball player has just hit a ball. The train has stopped on the railroad tracks. A pair of shoes that are under a bench. many horses st a horse stable with people walking by A very nice looking pizza with assorted toppings. 
A woman in a dress carrying an umbrella The vase is filled with multiple pink flowers. A photo of a horse race from inside the stands a number of boats in a large body of water Three urinals are hanging from a marble tile wall some kids in a bedroom with a lot of beds in it A surfer dressed as Abraham Lincoln rides a wave into the beach A clock sitting next to a large tree near a building. Young girls with backpacks are standing near stairs that look to go to the subway. a young lady in her room looking out the window A couple of people playing a game of tennis. A frame has six pictures, two with a horse. A living room area with tile flooring and a man sitting int he middle of the room on a chair with a remote control in his hand, while looking at a television. Yellow box truck parked on busy street in city. A man is cutting a sub sandwiches while a lady put a vegetable in the bag. A dog chasing after a Frisbee with green grass in the background. People standing around two cakes and plates on a conference room table. A woman laying in a bed next to a cat. a sign on a short pole nest to some little trees A gray and white cat sleeping inside of a luggage bag next to clothes. a person in the air with a skateboard at a skate park A person holds an umbrella in their hands. A woman is making homemade pizza at a table. A table topped with boxes of cupcakes and a sign. A woman in a white shirt cutting into a cake in front of a television. Man surfing on a surf board on water. THERE ARE A LOT OF VEGETABLES ON TEH STAND three elephants in a green field and some clouds Motorcyclists gather at an event with their bikes. A book and a pillow with a face lie on a blanket. The group is gathered around the table to eat their meal. Two breakfast meals on a table at IHOP. A train on a track pulling into a station. A dog enjoys chewing on a carrot in the living room. A black and white themed bathroom with two toilets. A food tray with french fries and a sandwich. 
A young man prepares to hit a ball with a plastic bat. A train going along a track near apartments. A large glass table topped with different types of plants. Two men in the park playing with a frisbee. Bathroom stalls with trash on floor in commercial business. A woman holding a tray with a chocolate covered pastry. A toilet with buttons or a remote control. Screens and small stuffed animals on a computer desk. A pantry area next to a large white fridge. Pair of skiers on snowy slope at sunset. A garbage truck is emptying a plastic garbage can. A man is bicycling down a street with a passenger standing on the back. A group of people are skiing down the mountain. A game controller in a persons hand over a couch. Two horses are pulling the covered wagon through the snow. A man on skis comes down the slope a person standing over a squat down toilet A group of four people playing croquet on a lawn. A small airplane is flying against the blue sky. A bouquet of flowers sits in a vase on a desk. A person is in the rear view mirror of a motorcycle. The man smiles while walking with skis down a grass slope. A walk in shower in a dilapidated bathroom. A young girl holding a tennis racket upright A commuter train pulling out of a suburban station. A person using a pair of scissors to work on a garment. A man taking a big bite of a hot dog. a bus showing domestic animals moving along the street A couple of cows that are penned up for safe keeping. A close-up of a black cow in front of a metal fence. Cheerleaders are riding atop a trash truck turned float. A bunk bed sits next to an open window. A very big pretty bird in the water. a fire hydrant with the word hydrant written above it A man holding a remote in front of a garage. two boats are idly floating on a lake. People looking at a group of giraffes in zoo. A baseball player crosses home plate as his teammate waits. Four airplanes are flying high over telephone wires into cloudy skies. 
a man bent over a sink while brushing his teeth Three wine glasses and a glass bowl are on the top of the refrigerator. The woman is walking with a pick umbrella. A woman on a transport bike waiting for customers. Two double decker buses sitting on top of a parking lot. The Ansett-ANA airplane is parked on the lawn. A clean bathroom with a toilet and shower. A person holding up a smart phone to take a picture. a girl sitting at a table with several plates of food around her. A little girl trying to feed two giraffes through a netted barrier. A woman is playing with a Frisbee on the grass. A passenger train leaving the train station that is now empty.. People walk near the many parked tour buses. A teddy bear sitting on a table in front of a computer. a number of people sitting and standing near a building a man is eating fruit from a bowl Two cows roam and graze among trees and shrub along a mud path. A table with three each of three different kinds of pizza. There is a taxing rolling through a wet street A train covered in snow on top of train tracks. two elephants in tall bushes and trees in the background Construction is being done on a street near businesses. A man looking in a toilet under a sink. A person is standing in the middle of fruit. A chili cheese dog on a plate with a bag of corn chips next to it. A woman standing on top of a sandy beach flying a kite. The man and woman are decorating the vases together. The men are enjoying a meal together by the window. A street corner with a sign and a person riding by on a bike. A television in a living room with a doughnut logo on it. A kitchen has old white cabinets, and rice on the counter. A yellow and black fire hydrant on sidewalk next to building. A window in a kitchen with a red shade is shown. a nice stove that is inside of a kitchen A bullet train on rail tracks in the open country. A yellow bus on street next to a building. A man flying through the air riding a skateboard. 
A donut on a plate in a microwave oven. A display case in a bakery filled with lots of donuts. A snow boarder performs a jump on a ski slope. a giraffe standing next to a tree with one with it's leg in the air A person with a baseball bat on a field. A white toilet in a black bathroom with a phone on the wall. Three children dressed in "Sunday school" clothes posting for a picture. Fans pose with stuffed animals at an ice rink. A tennis player is lunging forward after hitting the ball. The toilet is clean and ready to be used. A rocking chair sitting near a fire place. Four men stand behind a couch playing a video game. There is a mirror with a reflection of a train in it. A her of zebras in the watering hole with a giraffe in the background. A boy performing a trick with his skate board. A refrigerator that has a plant on top of it. This is a broccoli carrot soup with a lot of broth. A man wearing a black shirt and a purple tie. A man riding a wave on top of a surfboard as he flies through the air. Oriental umbrellas at a food court in a mall. A picture of some food in a plate. A person with an umbrella stands in front of a bench. A little baby laying on a fluffy blanket. A dog with a frisbee in his mouth in the back yard. A surfboard is decorated and sitting in the sand. A group of CGI people standing on a hillside flying kites. An orange and white cat sitting on a wood seat by a bed. A white plate topped with a pile of food. Two women prepare various vegetable dishes in a kitchen. A women riding a bike with an umbrella. a piece of cake sits on top of a plate A group of girls sharing a pie each with a fork. A cat sitting on the back of a motor bike. A man placing some flowers inside a vase. Woman riding a horse on an asphalt road. Two pug dogs dressed in green bow ties and green top hats to celebrate St. Patrick's day. A young child is standing in a room with toys on the floor. A white van is covered with graffiti as it's parked near a curb. 
A man is riding his 103 labeled bike on the road. A young boy holding two skis poles on top of a snow covered slope. Two people on bicycles and a dog crossing by barrier on a street. Three different trains stopping at a train station. A plate of vegetables is set next to some sauce. A motorcycle cop on a city street tries to look cool. A man and a woman enjoying a meal of sandwiches. A man exits the huge boat parked by the beach. The dog is laying down on the grass outside. Man flying kite in open field near RV park. Boy skate boarding on cement ramp at night. a bathroom with a tub, counter, mirror and small mosaic tile Three people skiing in single file in the snow. there is a fried crab inside of a small bowl A group of animals standing next to each other. Several Southwest airlines planes sitting on the runways. Two dogs on a beach surrounded by grass. a couple of kids sit at a table with some cake Orange puffy dog standing in the light on a tile floor. A herd of sheep grazing in an open grassy field. A stir fry consisting of rice, broccoli, and other vegetables. a man is taking a selfie in the mirror People are skiing down a snowy hill. A bus is headed under a pass way on a foggy day. Five girls in the frame playing soccer, one has the ball. A colorful train winds through the valley of a mountain. A statue of a man not far from a large clock. A young boy in a cluttered rec room playing Wii in his pajamas. A person riding a skateboard down a handrail Horses cows and sheep are led down a dirt parking lot. a bear that is sitting on a very large rock Two baby horses playing together in a field A male tennis player about to return a tennis ball. A kitchen that has a stove, refrigerator, and table in it. A smiling man with a box of donuts is handing a donut to a girl as two other young children look on. a person riding a surf board on a body of water a person is drinking a beer and eating food A white toilet with a clear toilet seat. 
a bath room with a toilet and a towel rack A group of snowboarders in the snowy conditions Three fire hydrants in front of a huge building. Sports team playing baseball on a ball field. A couple of women holding up smart phones in their hands. A night scene with a lit street sign, "Fremont St. Experience." Three cows stand at the top of a grass-covered hill. The front and back cover of a book. A busy street with people walking by a train station. A man in a top hat and a woman with glasses. a bath room with a toilet and a sink A pair of scissors and crumpled paper sitting on a table. A plate with a piece of food next to a pile of cheese broccoli. A cow and a calf are standing in a pen. A grown and a baby elephant are in a sandy area a nice fast green motor cycle in the sun A ELEPHANT IS IN THE WATER RIGHT NOW Young professional looking man with a tie and cardigan A red train with a bike painted on the side. A man ordering something from a milk truck. A man riding a skateboard through the air over a ramp. A red double bus sitting on top of a dry grass field. A police officer rides a motorcycle with a side car. A tennis player lunges to hit the ball. a delicious looking sandwich on a plate with a knife The cowboy at the rodeo is trying to rope the calf. a street sign for Peepee Falls street above a stop sign A dog is sitting on a piece of wood. A women in a sunhat and sunglasses posing beside a bilingual English-Arabic stop sign. A girl stands while talking on a cell phone. Living room with a table, couch, and a lounge chair. A young boy is in the park holding a kite. A green and white van full of signs written in spanish A child laying in crib with teddy bear. A man sitting in front of his birthday cake smiling. Some apples and strawberries are on the plate. a fire hydrant sitting undeneath trees covered with toilet paper A punch of different shots of a man in the air. This is an image of a laptop computer An animal eating from the ground near a beach. 
A man standing up holding wii controllers in his hands A group of people holding while glasses posing for a picture. A store is on a city street near a traffic light. There is a military plane that is parked on the tarp a black and white cat looks out the window Some type of cheese casserole enclosed in parchment paper in the oven. A man leans down and picks up a flying disc. A large kitchen with many brown cabinets and brown flooring Feet wearing red tennis shoes stands next to a white toilet on a tile floor. A bed in a bedroom next to a table with a lamp. A laptop sits on a pad on a desk. A tennis player bounces a ball before a serve. A young boy tosses a tennis ball into the air in preparations to hit it. a couple of people walking on a highway A man cooking vegetables and sausages on a grill An assortment of miscellaneous gadgets spread out on a table. A white dog is lying down under a chair in sand. An cat sits on the sill of a dilapidated window. A happy woman engaging interaction with her laptop A clock tower made of bricks outside when it is not so bright. Three giraffe standing next to a brown stone building. The corner of a kitchen showing a dishwasher, sink and household items. Large assortment of decorated vases on shelf on display. A surfer rides in on her stomach and a gentle wave a couple of elephants are standing in a field a group of weird looking vegetables sitting on a table There are people with a man holding a Frisbee on the grass. Pizza with side salad and glass of wine on display on table. A man is talking a picture of a man on a skateboard. A black boy playing tennis at a tennis court. there is a man on a skateboard doing a trick Brown, black and tan cows grazing on grass in an open field. A man sitting with two ties on. A large Italian dish on a wood block A woman wearing a pair of glasses on top of her head. 
a man is holding a tennis racket and a ball A horse drawn carriage coming down a city street Men talking to monks sitting down at an airport. A baseball batter striking a ball at a baseball game. A person sail boards in a lake with hills in the background. Two people sit on a city train while checking their personal items. a long narrow bathroom with a dirty tub and blue and white walls. A stop sign in front of the water on a bridge. Two beers sit on a table between bunk beds. A pan on a table with lots of pizza. Bowl of pasta with chicken and broccoli with bread and cheese. A dirty show floor in a very small bathroom. A yellow school bus negotiates and intersection in a city. A large room with much seating available. A young boy riding a pedal boat at an amusement park. A man talks to a young boy who is wearing skis. A little boy plays outside with his ball. A table with a lamp on top of it next to a couch. Young man in orange jersey swinging a baseball bat. A man is sitting next to a Christmas teddy bear. Two zebra standing next to each other on a dry grass field. A person wearing a tie posing for a photo. Smoke billows from the back of a yellow and blue fighter jet. A dog is standing in the middle of a rug wearing a green tie. A couple of people are walking their horses. A cat that is laying underneath an umbrella. three baseball players holding up bats on a baseball diamond A man on a scooter doing a trick in the air A vase and glass with decorative paintings on them. Three friends pose for a picture while dining. A woman plays a video game in a living room. A man installs wood cabinets in a kitchen. A picture of a old water pic machine. A man lugging a red bag of luggage down a sidewalk. A man and a woman seated on a motorcycle, leading a line of others, also riding on motorcycles. A man stretching out yelling while catching a Frisbee. 
Asian man in glasses holding two colorful mobile phone cases a number of people standing holding umbrellas near a building A couple of lawn chairs sitting under a white umbrella. A person is holding up a carrot in a kitchen. A close up of a blue vase with flowers on a table. A cow eating grass by a house next to the ocean. a person who's going down a snowy slope. A kitchen filled with metallic appliances sitting next to a stove. some sliced up orange peels sitting on a counter and bowl A wood room with some tools on shelves A little boy seems fascinated by this silver fire hydrant. A bathroom with a white toilet next to a sink and tub. A bicycle chained to a beached boat on a beach. A man skiing on a slope while people watch. An old man in the middle of his kitchen. People walk on the beach, with a hut in the distance. A crowd gathers outside of an outdoor bar. A man crossing a busy intersection near train tracks. The man is fixing his two skies so the shoe will fit. A crowd of people with umbrellas standing near a train. A dog in a field looking up while wearing a hat. A black and white cat relaxing inside a laptop. A kitchen is shown with an oven and stove. Two men ride a bicycle contraption with a big load of bananas. A large motorcycle is on display at a gathering of people a baseball pitcher ready to throw the ball A coffee cup sitting on a counter in front of a TV with the show 24 playing. A bathroom with a shower sink and windows. People carrying surfboards walking down a sidewalk during the day. A bus sits parked at the curb on an empty street this man is riding a wave on a board A baseball player stands in front of advertising signs. Fruits, vegetables and a carton of eggs sitting on a table. Three giraffes standing idly in a dry field A man in a suit is holding a glass of Champagne. Streetlights in front of a brick building in some downtown The working and kitchen area of a dorm room Man in a green field standing behind a red Frisbee in the grass. 
A surfer in a wetsuit catching a breaking wave. two dogs standing on a checker board printed floor a close up of a small dog near a car Two cows stand next to each other inside a corral. A young child lays in bed with a bunch of different books. Two birds walk in the surf along the beach. a black and white picture of a blue fire hydrant. The small bathroom has a beige toilet in it. The woman is showing the child how to feed the giraffe. Two brown cows standing in some tall grass. A young boy touching a small frog that is sitting on an orange frisbee. A herd of cattle walking down a country road. Three motorcyclists riding down the road on a curve People are waiting on the station platform for the train to stop. Several zebras standing in grass during the day. A man riding a motorcycle while talking on a cell phone. A tennis player has just hit the ball. The toilet is sitting in the brown colored bathroom. A print ad for the Pizzeria La Crescia. A desk with a laptop, monitor, keyboard and mouse. Cute picture of white cat snuggled near older dog. a close up of a remote control pointed at a tv A hot dog on a plate with lettuce. people skiing down a hill with no poles Horse-drawn carriage moves along street carrying two passengers a man that is throwing a frizbee in the woods A cat is perched on top of a parked car. A couple of street signs sitting next to tall buildings. A young woman with an oar paddling on a surf board. There is a bowl of fresh fruit on the table. A young boy is sitting in front of the oven. A wall mounted black oven next to a counter top. A man trying to manoeuvre through violent waves as he surfs. A woman sitting on a bench holding a kite of a bat. A train traveling down train tracks next to a small building. A boat sailing on a beautiful lake during the day. Olympic skiers are competing in a cross country event. a train traveling along tracks near a lush green forest. 
A man using a snow board holding a giant fake axe A tennis player swings his racket to return a ball. A plate that has a sandwich and french fries on it. A elephant and a brown elk in a field. A neatly organized room with a bed and stuffed bear on it. A pair of hands preparing a sausage dog on grill. a group of guys on the soccer field playing in front of a crowd A group of people skiing in a ski race on snow covered ground. An electric commuter train at a well maintained station A motorcycle sits on a sidewalk near a city street. A dell lap top and an apple laptop side by side on a counter A man driving a two horse wagon team. A kitchen with a stove top oven next to a kitchen counter. A man in a shirt and tie motioning with his hand. a table that has a bunch of stuff on it A large airplane flying in the blue sky. Two pieces of flat round bread laying next to each other. A boy sits on a brick wall while holding his skateboard. A small room features a microwave and a mirror. A man wearing a fedora talking on a cell phone. A man holding a plate of fresh pickles up. A group of zebras gathered and a wooden shelter to get out of the sun. A man cleans his surfboard with a cloth There a man and woman standing on the beach. A laptop sitting on a desk near a cellphone, mouse, keyboard and monitor. A blue chrome motorcycle with a dark blue seat. The tennis player in the green Nike shirt has a pained facial expression. A restaurant called the library bar and grill This bathroom has wall paper on most of the wall and wall paper on the bath tub. a child in a wagon with many green apples A couple of people that are playing a game. a traffic light next to a street sign Two buses, one blue and one red and white, are going to different destinations. A man climbing up the side of a black pole in a park. A red and green plate holding a pink cake with frosting. A person on some skis in the snow. 
a image of blue and yellow trains on train tracks Two people walking along the beach while someone flies a kite in the surf. A woman and two men inspect cars at a show. A GROUP OF ZEBRAS CLOSELY GATHERING TOGETHER IN OPEN AREA. A breakfast sandwich made from biscuits contains egg and sausage. this is a group of elephants in the water near rocks Stop sign with street signs at a parkway intersection. a pair of animals on the side of a rocky hill A man standing on a snow covered slope holding a board. a grey cat standing at the sink with its eyes wide open Four bowls with food in them on a table A laptop and a mouse sit on a wooden table. A bed that has padding with a blue picture frame hanging above it. A young boy about to throw a baseball during a game. The desk is full with computers and other hardware. Two men sitting on a couch holding pool sticks, one between his legs. A couple of people walking out of the ocean with surfboards. Black car sitting at a red light intersection. Variety of different deli products sitting in a glass case next to each other. A police officer on a motorcycle with others following. The dark green double decker bus travels down an empty road. There are lots of seagulls flying near a boat. There is a freshly made pie on top of the stove A rainy picture of three red double deck buses on a street. Various black and white street signs with a pigeon on them. A man and a woman cooking hot dogs on an outdoor grill. A person and a kid on a couch in a room. Parents laying on bed in opposite direction of their daughters A zebra standing on top of a grass covered field The table has several plates of pizza on it. Two sandwiches and a bowl of fries sitting on a plate next to a cup. A skateboarder jumps over a limbo bar during a competition The view from the airplane shows a mountain range. A man throwing a baseball at a baseball game The people are playing a game in the living room. Child laying down with arms extended in the air. 
A small bathroom with a lot of white tile. Two women in a kitchen preparing a meal two cats laying in a messy bed near a wall The woman is posing for a photo near the bikes. Men in SWAT gear running with guns drawn. a pair of colorful vases holding white daisies A small bird perched on top of a tree branch. An old building with a clock tower in it. Cupcakes with candy and marsh mellow toppings sit in a white box. A man performs a trick on a skateboard in front of three other men. A train on a railroad track adjacent to 5 other railroad tracks. Man in blue shirt feeding birds from cup. An elephant with a medium sized bird on his back is eating brush. Two men are standing in front of flags and shaking hands. a big white bridge is going across a lake A person sits at an outdoor bar with a piece of paper. a stuffed animal dog sits inside of a toilet A bird is perched in front of a window with bars on it. A black bear that is sitting in a grassy spot in a garden. A little girl wearing green shoes riding a skateboard in the street. Two men pictured next to a light aircraft with another one in the background A stained-glass window is seen in front of a unique background. A man is standing in a field and flying a kite. A table filled with fresh vegetables being prepared to eat. Catering truck parked tightly between cars on a city street. Many people stand in front of a large modern building. ATTEMPTING TRICKS ON BICYCLES AND SKATEBOARDS AT A SKATE PARK Colorful bird sitting on a branch of a tree. Man in blindfold and red garb holding glass of wine. A man is running to try and catch a frisbee. A sign indicating turns ahead in the night A deep pizza with cheese sliced into 6 pieces. Two horses racing with two men on them. A man takes his dog for a ride on a scooter. a sandwich and a cup on a table A bit of broccoli, celery and melon on a table. a couple of men are eating on a boat Outside shot of a restroom showing the door partially open. 
A street sign leaned over with the words High Gate Avenue on it. a view from below of a one way sign Two boys are playing a game of soccer. A kitchen with an island which includes a dishwasher, a stove, cabinets, a vent and two windows. a bath room with multiple mirrors and sinks Street sign saying Tow Zone with a teddy bear hanging from the pole. An orange subway car with purple and yellow graffiti is passing by two men. an image of a bear that is walking up a hill A man in a cap is sitting at a laptop. Two dogs standing in front of debris in the snow. A sandy beach covered in lawn chairs and umbrellas. Baby eating food from a blue plate and spoon A boy hitting a baseball with a bat on a field. a park bench with a blue umbrella among flowers and trees A cow is standing outside in the grass on a foggy day. a man that is walking around with a surfboard A yellow and blue train traveling under a bridge. A feminine shirtless man holding a bottle of wine in the kitchen with the refrigerator open. A couple of people in the snow on skis. A cross country skier is stopping along a path. Street signs one on a street corner surrounded by trees a cat siting on a blue bench in front of some trees a rusted boat resting on the shore A boy skateboarding along the top of a marble garden shelf. A bus that is driving down the road. There are books on all three shelves of this book shelf. An airplane and airport crews preparing for takeoff. A pizza cut into four slices with blue stuff on it. a women that is on a court with a racket a male in a white shirt with a black suitcase and people A double decker bus driving on a street. A guy is performing a trick on a skateboard. some sheep are standing way off in a field A red pick up truck with a large blue object in it's back. A little boy in a plaza holding a kite. Two men standing on a hill in snow skis. Close-up of green bananas still on the stalk a bunch of trains that are sitting on tracks. 
A large jetliner flying over a mountain next to a statue of Jesus. A pasta dish is featured along with a grilled flatbread. A man is standing in the grass holding a baseball bat. Person holding a camera in front of an orange display. A man standing in the doorway of a bus traveling down the road Many different cars parked on a city street. A white toilet with the seat up in a room. A bed that has some books on top of it. a number of birds flying over a body of water A red bowl of meat and vegetables on a wood table. A herd of zebra standing next to each other in water. A person in a blue coat snowboarding down a mountain. Group of people all on laptops during a meeting A plate of homemade cheesy pizza on a table. gentlemen in suits one wearing a bow tie and one a regular tie A man riding a skateboard down the side of a graffiti covered ramp. Behind a metal bar a giraffe is view-able. A young boy jumping into the air while wearing a catchers mitt. A couple of people sitting at a table with pizza. A skateboard rider on top of a handrail by a path in the city A dog is laying down with some stuffed animals. A bunch of animals gather together in the snow A bowl of soup with chopped broccoli on top. Two women talking and having a drink at a bar. A cow reaches through its fence to eat hay. A girl is playing frisbee in a courtyard area. A subway train stopped to except new passengers an image of a 2 zebras looking on We see a very old and beat up coke machine. A horse tied up to the side of a tree in the snow. A person that is working on a computer. a yellow green white and red double decker bus and a building A yellow street sign sitting on the side of a road. Two people ride an elephant on the side of a road. A man is holding a cellular phone against the rail. a person holding clothes near a bed in a bed room A man in sunglasses is getting ready to play tennis. A photo of an omelet and toast with coffee on the side. 
A delicious looking pizza with a variety of vegetable toppings stands out on a yellow plate. A child watches television while a panda bear sits by a purse. A white washer machine positioned in a bathroom. A dog walks through a kitchen with cabinets. The train does not have any cars attached to it. A cute teddy bear sitting on a table next to a bottled beverage. A gren and white bus on street next to a building. A sliced chocolate cake on a white plate. Hand with scissors cutting computer printout paper. A boy holding a skateboard with a two women and coffee design on it. A silver car driving down a rain soaked street with bikes on top of it. The train engines and cars have seen better days. Black-and-white photo of two benches on the street. A bowl of soup including vegetables and rice. a knife with a black handle broccoli and green beans A man on skis performs a jump in the snow. The sky is cloudy behind an illuminated street light. A bathroom vanity with candle, toothbrush's and holder and photo's of Marilyn Monroe. Two small ducklings on a field of grass. A dog sitting outside a large brick building. A man unloading sheep from the back of a truck onto a pile of mud. A lady eating a doughnut and drinking coffee. A red train sitting at an empty station. A woman helps a little girl take a bite of a large hot dog while they sit on a bench. a big tower that has a clock on top A filtered photograph of a person riding a motorcycle. A dirty bathtub sits in a bathroom with a big window on the side. A hot dog has a person's head on it. A crowd of people on a beach flying kites. A white plate with a piece of brad on top of it. a grey suitcase next to several other objects outdoors on the pavement. a large giraffe that is walking by some trees A couple of baseball players that are on a baseball field. A woman swinging a tennis racquet on a court. A jug shaped vase holding yellow flowers on a table. 
a ceramic set of two cups and a cake which is probably a sugar bowl A man sitting on a sofa holding up a laptop with writing on the monitor. A communal sink in a white and dingy bathroom. A small child putting peanut butter on some bread. The young man is going around the cone on his skateboard. there is a man pointing out to another man in the ocean A skateboard turned upside down in a street with shoes hovering over it A street sign pole with many street signs on it. Two three dimensional images of a woman with an umbrella. A woman standing in front of a table with lots of salad. a man that is standing up at home plate a person waking up and hitting the alarm button on a white clock Young woman using video game controller in living area. a flat screen television sitting on a entertainment center A large elephant walking through a wooded area A large airplane flying high up in the sky. A man standing in front of a TV playing a video game. A man in a a shirt and tie smiling at the camera Two green street signs sitting under a tree. A bowl of cherries beside apples, bananas, and eggplants Slices of pizza sitting on plates next to a glass. A parked airplane with the terminal gate to the plane. A cat sits on a desk in front of a computer. Two small kids on skis on the slopes A person cutting bananas in half on a cutting board. A small truck sitting on a road near a gang of bikers. A man cutting a piece of plastic with scissors. a baseball field and some players playing baseball A baseball player poses as if he hits a baseball. The pizza is ready to be cooked, then eaten. A fork sitting on a table next to a car shaped cupcake. The clock has many designs and sculptures carved around it. A tractor and a truck travel down a road. A blurry image of some bison laying on the grass. A number of motorbikes and cars parked in the field A white bull dog rolling around on it's back next to a cat. A large long train on a steel track. 
golden clock details on large clock tower clock surrounded by brick A woman playing tennis swings a racket overhead. a woman sits on a couch with a cat laying on her a tiny ass bed in a tiny ass room with a tiny ass tv A cat is sitting on the arm of a chair. a giraffe in its pen some bushes trees and grass A vase filled with lots of flowers sitting on top of a table. a white plate filled up with a lot of glazed donuts A farm animal on dirt outside of a home. A person holding a tiny piece of paper. The pizza sits on the board on the stove. A sub sandwich on a table at a restaurant A girl and boy playing on a fire hydrant. a close up of two people shaking hands over a motor cycle A white and grey freckled horse next to a brown horse in a valley filled with trees and tall grass. A woman in a white dress and someone with a striped umbrella seated by a pond. some cute brown and white cows looking towards the camera a male skateboarder in a white shirt is doing a trick An orange keychain is next to a red camera. a young man doing a jump with his skateboard in a skate park A giraffe bending over near a big pole. A woman standing in a twist position with arm extended, and a Frisbee in the air near her, in a grass park with trees, with people playing Frisbee, walking and lounging in the grass on a sunny day. A cat sleeping on a blanket on someone's bed A cluttered living room with figurines on a display case and photographs on the wall. A girl in a dress standing on a small skateboard. The man on the couch is playing a video game. A man is standing next to a motorcycle in a village. Several cars and people at bikes sitting at a red light. A study table where two laptops are kept open. some baseball players a pitcher catcher and an umpire two ladies and kids playing sports in a green yard A man in a white shirt and black shorts jumps near a soccer ball. A plate of bacon, sausage, and other breakfast foods. 
a man on skis fly through the air A long train with a yellow front stopped at station. Red motor scooter parked on the sand with a sunset in background. A fire hydrant is gushing water on a sidewalk. A man arranges toppings on the uncooked pizza. a group of friends sitting on a mountain posing for a picture Two teddy bears sit on a bed in a bedroom. an image of a group of giraffes at the zoo A person sitting down talking on a telephone. Skateboarder sitting down in the snow in front of another rider. Two boats docked on top of a gravely beach near the ocean. Two giraffes are walking through the enclosed area. a dog lying on the ground next to a red bicycle with a laundry basket attached. A person leaning on their ski poles with a snow covered background. A couple of men reaching up towards a blue kite. a green train is coming down a set of tracks a laptop sitting on a special rack on top of a desk Two blue and yellow trains parked next to each other on train tracks. A person cycles on a motorcycle down a road. A stove sitting next to a bunch of old box springs. A man, woman and child petting a goat at a petting zoo. a bear on a road near a field of green grass A man kneeling down on a beach next to the ocean. A man in a tie standing in front a a Budlight truck. A young cow looking forward while several others drink at a trough in the background A white sink sitting under a bathroom mirror. Partially eaten cake doughnut with sugar sprinkle topping. A grey automobile driving down a city street shaded by several trees. A man throwing a frisbee towards a man and two children. A toy plane flies through a cloudless sky. The baseball player is throwing the ball from the mound. A person riding a skate board in the air. Woman in dark, heavy dress cooking in a home kitchen. A metro train is pulling into the station. A large gray elephant walking across a road. a man with his arms out waiting to hit the tennis ball Two plates filled with plain hotdogs on a table. 
a room filled with a stove and surrounded by cabinets A woman on a bike with a baby seat holding a dog leash. People in the ocean are playing frisbee and sitting in small watercrafts. Flying bird silhouetted overhead against cloudy sky background. Kitchen accessories in a clean, organized kitchen. an old stone building with a clock mounted on the side. A pole with several different street signs on it. Two students waiting to cross a busy street. A collection of plush animals with big ears and eyes. living room angle with fireplace, bookshelf, furniture, and hardwood floors a black and yellow train sitting next to a fence A young boy sitting on a stone bench in an arid landscape a large wooden park bench next to some rails A store building with stuffed bears in the window. A man leads a painted elephant carrying tourists down the street. A woman sitting on top of a wooden chair at a table. A woman sitting in a vehicle using a cell phone. A whole pizza sits on a pan on the table. there are two skiers that are going down the hill A desktop computer that is sitting on a desk. a wooden table with so many tools on it Man posing in front of bicycle with a banana in his hand. White horse looking over shoulder in enclosure of wood some zebras are in their pen eating some food A couple of signs hang off of a building A commercial airplane on the runway with the jets on. A doughnut sitting on top of a napkin next to a cut of coffee on top of a doughnut table cloth. A toy baby in a toy stroller in a toy kitchen. A homemade pizza, salad and two glasses of wine on a table. A long train riding on train tracks through an empty field A horse is looking over a fence with a shield on its face. Several kids are playing frisbee outside in a yard. A giraffe is sticking its tongue out at some people a woman in a red top some glasses and a pizza A man riding on his bike and talking on the phone. A TV sitting in front of a picture on a wall. 
a teddy bear set on top of a child sleeping A man is driving with his dog in the back seat. a man takes a photo of a clean bathroom A plate that has a half eaten piece of cake. There is a group of small birds standing on the chairs. A wooden table topped with different kinds of foods. there are many men that are playing soccer on the field A brown, black, and white cat that is wearing a black hat A person on a field with a baseball bat. A man riding a skateboard down some steps. The train is pulling up to the platform. A red small engine plane in motion on a field. A man windsurfs with several other people in the background. A commuter train that is stopped at the station for loading of passengers. A side mirror with the image of the Eifel Tower reflecting in it. A tennis player lurches forward after hitting the ball toward the other side. Two plates of food next to two laptops. A woman in business attire walking on a sidewalk and talking on a cell phone. A woman on a bench reading a book USA 20 dollars totaling 120, held down by a cell phone with Coca cola cans nearby. Girl smiles for picture in busy Asian plaza. A small pizza sitting on a sheet of tin foil. Three people posing for a picture in front of a cell phone case. A person with their car open stands on the snow. This is a small and clean but cluttered kitchen. A homemade pizza with cheese and cucumbers in a pan. some one skiing on a snow filled hill a person helping someone prepare food on a buffet a bathroom with a toilet a curtain and wooden floors a cat starring at the camera and a television in the background LOTS OF CABLE CARS, ON LOTS OF TRACKS red flowers in a jar against a screen Man with unhappy face in clothing store shirt section. A large white bush stopped at a bus stop. A man and woman with blue shirts and bicycles on a sidewalk. Purple and gold bed and flowers against a red wall The two teddy bears are posed together to take a photo. 
A person giving a stuffed teddy bear a kiss Two sofas are facing each other in this well decorated living room. Two people carrying backpacks are cross country skiing. An elephant in the shade of a tree. A tower white with yellow trim tower features a large clock. A large bathroom with a frosted walk in shower A man with a racket walks on the pavement. A group of children standing next to each other on snow. A MAN IS SKATE BOARDING ON THE SIDE WALK Two boats that have groups of people in them. a person with a skate board and a back pack A baseball player holding a bat next to home plate. A man on a motorcycle talking to a woman in an SUV on residential street. A group of people riding skis on a snow covered slope. a sandwich in a plastic food basket on a table Several zebras walk through the tall green grass. frosted donuts in a display case to feast upon A bathroom with a toilet and a shower with a window. A triple layer cake with a white hand made out of frosting on top of it. A white plate topped with a salad next to a glass of OJ. Three carrots sitting on a plate in front of a knife. There are several plates with different pastries on the plate A kitchen with wood cabinets, white refrigerator, white stove and a microwave above the refrigerator. a man in a suit is holding up a beer A construction worker smashes away at the roof of a building. A hockey player on the court with a bunch of stuffed animals. A woman with some of her fingers in her mouth Two giraffes are eating their food from a feeder. a group of people loading up on a big airplane A sandwich and can of soda on a table. Man riding a colorful surfboard on green ocean waves. Golden lab with smile sitting in the bed of a red pickup truck. There is a small kitchen with black cabinets The steak and hot dogs are being cooked on the grill. 
a yellow and black train traveling along a train track there is a street sign that has been bent in the middle An Audi car on an oriental city street The jumbo jet flies over a building with it's landing gear down. A woman walking down a street holding an umbrella. A bike and some small birds on a field. A wooden bench by the water and some grass. A bright red umbrella with a view of the ocean and mountains behind it. A large pile of stuffed animals is outside. These are user manuals for an Apple mouse and keyboard. A bathroom features a large mirror and toiletries next to the sink. A group of chefs working in a kitchen that has a statue of a chef. An airplane parked on a runway at an airport. A sign at a railway crossing giving instructions on how not to get hurt. A beautiful blonde girl holding a Nintendo Wii controller next to a man . A group of wild animals walking along a gravel road. A woman is sitting down and talking on a cell phone. A man standing in a wooded area looking at trees. Dogs walking down a set of rickety porch steps. Two plates of food are sitting on a tray with forks. A very nice looking train by a plat form. Chairs and a table with a laptop on it sitting outside. Three zebras stand in tall grass near a wooded area A boy is flying a kite in a field. a desk covered with electronics, paperwork and a lamp A lavish bedroom furniture set of carved wood A zebra standing on the grass above a bird and rocks. Motion blur photograph of lights at night time A picture showing a long line of scooters parked on a city street. A man displaying a cake pan at a kitchen counter. Brown and white cat sitting in front of an open refrigerator. A little child standing next to a yellow fire hydrant. A fire hydrant on a side walk in front of a building. A person holds a fork and knife to cut pizza. A gondola boat ride on the canals of Venice. A bench on a pier near a ferris wheel in a park. A woman is riding a show horse at a competition. 
a bunch of skiers at a skiing resort on a clear day A dog rides in a cart pulled by a man on a bike. A living room filled with lots of furniture. A passenger bus that is driving down the street. Two vehicles are parked in a giant warehouse. Fruit on a plate next to a book. an orange truck people trees and a street and buildings there are many canoes that are in the water A number of street signs on some poles A long train sits at the station waiting for it's departure. a dog is playing with a water bottle A suitcase sitting next to a brick wall. A smiling woman pours a bucket of water into a toilet. The large herd of cows are all around the large field. A room has blue walls and a wooden floor. Atribe of people ride some elephants out side Craft tools and a project currently in progress A boy is riding a skateboard in order to skate off the ramp. Antique black and white photograph of a horse drawn tram A young girl opens her mouth while eating cake. View of a city bus through the side view mirror This red pot is filled with a variety of vegetables. A for rent sign hanging outside in front of a building . A picture of a Wii remote in its packaging. A room filled with flowers in front of windows. A group of people in grassy area with kites in the sky. Two young people are posing for the camera with their surf boards. A herd of elephant walking across a dirt covered ground. Orange train on tracks in the country side. A young woman on a tennis ball about to return a hit A man is taking a picture of himself in the mirror A red double-decker bus with a open top level. A batter up to plate in the middle of the swing. a man in a tie and a woman in a hat ride horses A quail looking bird is standing in a tree. This man is looking downward while his is skiing. A photograph of something in the image. A view of an empty kitchen with white and wood lined cabinets. The paper towel holder in the restroom hangs from rope. A view of a pizza cut into four slices. 
a couple of jets are flying in the sky a basket with a sandwich and some fries in it. Two men smiling while riding in a bus. A dog staring out the window at people standing outside. A screen of people playing a baseball game A bedroom with a four post bed decorated in black and white. A man standing next to a wall sized glass window. A small person rides a skateboard modified with large tires. A couple of street signs mounted to the side of a building. Girl on a couch with her computer on a table a group of people walking on the street during the day A person on a surfboard riding a wave. A dog is under a brown computer desk. The woman is playing a game of tennis on the court. a body of water with three boats sitting next to each other A bathroom vanity sink with a large mirror and hairdryer on the wall. People are sitting in a large room on couches with a fireplace. Several men in the kitchen with one cutting a piece of meat. There are two beds in a room side by side. there are two blur bullet trains on the tracks A lamb with several babies is laying in the grass. A bathtub with a colorful wall decoration is seen here. Small flowers are placed in a clear empty bottle. Horses in fenced area with grass and hay and adults nearby. Group of people standing around a kitchen area with food on it. Plates of various food items sit on tables. A baseball player is standing on the playing field. A very tall clock tower towering over a green tree. There are plates with food and drinks on the table. A man and woman in kitchen preparing food by a stove. two zebras in the field grazing on grass A man stands in an outdoor market selling a variety of fruit. A man ries his skateboard around bright green cones. a cat with its head burried in a shoe Two children on boogiebody boards in the ocean. A public bus parked in a bus station at night. A man skillfully water skiing in wild water. a couple of guys standing up with some snowboards in hand A cat is standing under a red car. 
A vase with a few large sticks in it next to a sink. The back of an elephant with tusks overlooking a road. Man standing on a soccer field holding a frisbee with a dog beside him. There is a clock displayed on the side of the building. A group of people sitting at a table eating food. There is a green plant inside a bottle A man calmly sitting on a bench with an Indian Head Dress on. A bi-plane with a wing walker on its wings. A large body of water with a train traveling over it. A woman brushing her hair standing in a living room. a man holding a tennis racket beside him during the game some oranges hanging from some branches of an orange tree A vase filled with purple flowers sitting on a table. An Apple laptop rests on a custom wooden stand. A large bed in a bedroom next to a fire place. A woman flying a kite on a rocky beach near the shore line. A man has fallen off of his surfboard. A man catches a wave on his surfboard and holds his arms up to balance. Two men in horse drawn carriage on city street. A person sitting on a snowboard going downhill A view of a bathroom with a mirror, towels and a tub. Two men in robes while one has a toothbrush in his mouth. A biplane in the sky in the middle of a turn. a clear road across the street from tall building and water a red baron pizza cooked in a microwave An all white and steel bathroom with 2 windows A car stops to pay a parking fee to a woman A black Chrome laptop sits on a desk. Some very tall pretty giraffes by some other animals. A couple green stoplights on an empty street. A little girl holding a green cup in front of a bowl of food. A young child with a colorful umbrella walks down a path in a coastal setting. A man is holding a small Dell laptop. The sink and mirror in a business bath room A table filled with plates of food sitting next to each other. A massive crowd of people standing around the Washington monument. A man doing a trick on a skateboard. 
a bathroom with a toilet, tub and cabinets A cat taking a nap upon a laptop computer on a desk. Bicyclists on a city street, most not using the bike lane Busy city traffic in an older part of town. A black and white photo of an old steam locomotive on a train track. A blue and white train in front of a building. a man sits by himself on a bench in a stone-paved square in front of a large bed of flowers A large amount of tables and chairs are by a clock. Young man pointing at computer keyboard with sprouts growing out of it. Baseball game with batter on base and umpire standing by. A sign reading "Car parking," is on a fence in front of a herd of cattle. Small black and white cow in grazing field. Large black motorcycle with a motor vehicle sitting on the side. A couple of women are walking down the street A plate with sandwich and french fries on it. A person is walking in the sand near the water. a large air plane on a run way A cat looking out a window with green trim. Person in pink snow gear on a snowboard. Jumbo jet flying over a group of trees. Toilet bowl sitting on the side of the road filled with papers. an image of two men working on their laptops Many cows travel down the side of a street The family is playing a game all together. A large fluffy cat is sitting on a chair next to a computer mouse. A man sitting and looking at something with his hand to his mouth. A man kneeling on the ground next to a couch working on his laptop. The tennis player stands ready for the next play. Two men hold up glasses of bear above two pizzas. A passenger plane that belongs to American Airlines taking off. A man checking on some food that is in a white oven A wooden table holding two glasses of wine and a plate with pizza. Lots of bags of luggage sitting on the floor of an airport. A group of people playing a game of frisbee. Two people riding on a boat on a large body of water. Pair of adults and teens having silly fun. A young man is using his skateboard on the street. 
A cat sitting in a plastic water bag on a hard wood floor. The elephant wades through very deep, calm waters. A female tennis player in action on the court. An animal on top of a table while a bear rides a bike. Four stuffed animals against a plain light colored background A person walking while brushing their teeth and wearing a red hat. A jumbo jet travels down the runway towards the camera. A crowded city sidewalk with lots of people. A giraffe in an enclosure looking at onlookers. A man sits and admires the architecture of a large bridge. A sign prohibiting skating on the sidewalk with black and red writing. A father taking a mirror photo of him and his daughter brushing their teeth A small boy is eating a sprinkled doughnut A boy on a skate board crossing the street The skateboard competition is geared to even the youngest boarders. A bunch of food is layed out on a white dish This is a shot of someone wearing a pair of skis. a black and white photo of two people cutting cake a giraffe standing in a pen next to a tree trunk. A handyman moving a refrigerator back into its place. A person wearing a Cat in the Hat costume in front of kids. A horse drawn carriage waiting outside a hotel. A fruit stand with grapes, oranges, apples and plums. Player returning volley during match play on tennis court. Someone who is applying some chocolate on a cake. There is a street light with two green arrows in different directions A plate with bagel, fruit and potatoes sitting on a table. A surfer is riding a wave while another swims to catch the next one. A giraffe inspects the roots of a fallen tree. A bathroom that features a vanity cabinet with sink, commode, overhead cabinet and mirror. a couple of doughnuts are under a display case People watching a man doing a skateboard trick at a skate park. A man cuddles with a woman who holds a banana. A young man siting at a picnic table holding a sandwich while a young girl looks on smiling. 
A hot dog sandwich with eyes on top of a plate. Two giraffe's by large rocks with and ostrich. Relax in the chairs next to the pool. A pizza with ham, cheese, olives and oysters on a plate. a black and red brick building with a white and black clock and sign Vehicle traffic on a city street in a snow storm. A NARROW HALLWAY WITH A TOILET IN THE BATHROOM A stop sign with bike rack next to street corner. A giraffe attempting to lick a woman's hand over a fence. A fluffy cat is sitting on a glass table. a woman swinging a tennis racket on a tennis court. The little girl hold the pole while the man sits on the fire hydrant. a person riding a surfboard in a body of water A woman sitting down holding a cell phone up to her ear Well hello there cat, are you up to something? a blue car sits in front of a red bus People are walking down a busy city street. A woman is brushing her teeth in the bathroom a black orange and white cat and a controller A table with a white plate of food that includes broccoli and chicken. A plate holds a portion of a broccoli casserole. The small bathroom has a shower, a toilet, and a sink in it. Little kid on a metal pedal tractor in a yard with sheep. A group of people on grassy field with kites in the sky. Several young children with ties in a school room. A man in grey shirt sitting at a table with plate of food. A kitchen with a sink, cabinets, and other accessories in it assorted decorations laying around on a plastic tarp Two persons in formal dress posing for a photograph. A person in a car smiling next to a suitcase. A young man dressed in a suit and a tie smiling at someone. Multiple person seating bench on the side of a city street. A gastric delight of sausage, broccoli and onions. A young boy riding a skateboard on a street in front of a house. A train riding on a track near a platform. The two woman are baking in the kitchen. A green double deck bus parked next to a railing. A boy runs in the grass while holding an umbrella. 
A young baseball player stands at the plate, in motion to hit an oncoming ball. THERE IS A BOAT ON THE GRASS IN FRONT OF THE YARD A kitchen with stone counters and a bar. This is a shot of a printed recipe for lemon marmalade. A large group of people flying kites in a field. A man cooking food over an open flame. A variety of people sit at tables and watch screens. A tennis match from above with a large crowd of spectators. A fire hydrant in the grass with a red top and yellow bottom. A toilet in bathtub in a home bathroom A large open field filled with a large group of cows. A kitchen with an oven and a sink. there are many small red trains on these tracks A pair of cats laying on each other on a desk. a man wearing a hat is on his boat and four birds A steep incline of snow, a strip of sky and what looks like a large red strut sticking out of the snow make a setting for a helmeted skier in otherwise regular clothes in a twisted posture the points of his or her skis ascending the slope. Old photograph depicting train in industrial area of city. A clock tower has a group of people inside it. A person pets a cat near a weathered building and plant. A living room with a large picture window, a fire place, a table and a couch and loveseat The bowls on the table are filled with food. a plastic toilet in a small bathroom stall Skateboarder performing aerial trick in large indoor area. two photos of one woman playing against a doubles team Three people are standing together in the snow on skis. A man in a wetsuit surfs a wave. A man is riding his dirt bike while wearing a helmet Four cows grazing and resting in the shade of trees. A long blue classic truck parked in a parking lot. A large passenger jet flying through a cloudy sky. A tennis player runs and swings his racket at the ball. A skier standing on top of a rail on a snowy slope A stop sign with a one way directional sign marks an intersection. A group of people are gathered for a photo. 
A black cow walking across a stream. A young man standing on a skateboard riding a wave. a bench near a river, situated close to a bridge A woman wearing a dress holding a brown teddy bear. some zebras gather around a watering hole in a herd A dog lays on a large turtle pillow panting. two people sitting in a living room touching each other with their feet A herd of sheep blocking a parking lot. Two men standing on a tennis court holding tennis racquets. A man wearing a shirt and tie next to a barn door. A couple of brown and white cows standing on top of a hill. A black-and-white photo of a man walking next to a fire hydrant. A smiling couple stand next to a bench at the bottom of an escalator. Small dog sitting on bed in bedroom of home. This is an unused bathroom with a sink, toilet, and bathtub. A woman stands near a red double-decker bus and uses her cell phone. A green, orange and white train in a train station. A group of men and women standing together. A man riding in a carriage with horses and an umbrella. A man holding a baseball bat wearing an old fashioned uniform The view of a large sized bathroom with a tile floor. a dog sitting in an open suitcase with a stuffed animal A man pets a small, baby elephant. An old vase with artwork of an octopus. The cows are ready to eat and drink their next meal. A girl with red dyed hair eating a banana a delivery truck and a couple of bikes A laptop displaying a picture of a man. Two giraffes eat leaves from some small trees. This metal plate contains lovely slices of tangerines and pomegranate. A dog whizzes by to run towards a yellow Frisbee. Young boy with giraffe in enclosed fenced area. A counter holds a plate with bananas on it. a yellow red and silver train on its track and some wires A white table topped with game controllers, remote controls, a keyboard and phones. A woman eating a pizza and smiling for the camera. 
view of a snow capped mountain from in a plane An old man with a beard and a bowler hat A sink with cups and towel next to it Several pictures of Asian style dishes and in the middle a person is eating. a giraffe standing close to a big rock A far view of a plane in a cloudy sky. This is the caboose of a freight train. a large group of people on a beach doing various activities. a man can be seen walking past a store front A cat is playing on a small computer. a man standing on a tennis court in front of a crowd There is a chair sitting in an empty lawn. A bathroom that has a mirror and a bathtub. A giant clock tower and a clear sky. A child with a bat standing at home plate waiting for the pitch from a pitching machine with his teammates in the field. A cabinet that has some stickers on it. A man and boy are looking at a cellphone. The restaurant kitchen is closed until lunch tomorrow. A couple of women reaching out to a tall giraffe. A black baseball mitt, ball, and baseball bat. A batter positions the bat in the air. a woman sitting at a table while using her white laptop A bedroom, well lit, has a couch, dresser, comforter. A brown clock tower rises above some trees. A group of people pose with a horse as a crowd looks on. A man standing on a sidewalk beside a park. A large blue street clock attached to a post. A close up shows a large bunch of broccoli. Animals next to shoreline in artist's painting at sunset. The woman on the bench was near the boy on the skateboard. A man smiles while eating a small piece of food A teddy bear sitting on ice with a knife stabbed in the belly. An on microwave unit is heating something in a cup. This desk has a one computer and two laptops on it. A metal stove and a counter in a room. A CHEF IS HOLINDG CARROTS IN HIS HAND SMILING Two cats sitting in their beds beside a window a clock hanging fron the ceiling announcing the time as 144 Rain falling on a city street filled with people and cars. 
Two men playing competitive frisbee against each other. Two adult horses grazing in a dry field. A flamingo with its hear scrunched back near its feathers. A blond married man in a green T-shirt sits in front of a computer keyboard taking a bite out of a donut. Small brown dog with colorful braids sitting on a couch. A cat sitting on top of a wooden table next to a yellow motorcycle. A picture of a person that is looking into the water. Two small elephants standing together in the wild. A little boy that is standing on a skateboard. there are four small beds all in the same room A cat sitting in a sink in the bathroom A dog and a cat standing side by side. THRE ARE TWO PEPLE THAT ARE SITTING ON THE BENCH a man is playing tennis on the court People are gathered around vintage cars at a car show. A seagull strolls on the beach during sunset. a group of people are crammed in a small area on motor bikes This guy is in the country about to fly a kite. Four remote controls attached to the side of a television. The pole of the stop sign is covered in vines. There are some people riding horses on a field A woman sitting in the sand holding a kite. A giraffe standing near a tree branch in the grass near a grove of trees. A laptop with keyboard and mouse separate on a desk Their is two pieces of bread on a white plate A living room with a decorated christmas tree in it Traditional view of crowded city residences with big bridge at the end of the street. A group of people sitting at a table eating food. A young boy is herding rams with a stick. A grinning man stands outside in the snow with a snowboard. Three people on a baseball field with catchers mask and baseball bat. A computer desk with a laptop computer on it. Two giraffes inside a white fence in a yard. a double parking meter near a tiled wall A person is walking through the snow on skis. a woman smiles at two babies who are laughing at each other A girl happily eating a pizza in a restaurant. 
Fresh fruits such as apples, pears, watermelon, and apricots. A group of men playing a game of frisbee on a field at night. two snow skiers coming down a snowy hill People skateboard down the sidewalk and the street. Two elephants standing next to each other in front of a face. The teddy bear sports some very unusual colors. An upside down sign saying "Road Work Ahead" A teddy bear is sitting down reading a book. A person in a black snowsuit pulls a kid on a sleigh who is holding a birghtly colored umbrella. A pair of yellow scissors sticking out of a cow pattern bag. A couple of sheep grazing on the grass in a pasture. Two girls walk down the street carrying pink umbrellas A man is performing on stage at an event. Several bunches of bananas growing on a tree there is a cat that has fallen asleep under a car A red sleeping area in a bedroom scene ONIONS, TOMATOES AND OLIVES ATOP A PLATE ON A TABLE A young child dressed like a chef cutting broccoli A normal bench sitting on a wooden bridge A small calf next to a large cow in a field. A man riding a wave on a surfboard in the ocean. a large pizza that is in a box Seven stuffed teddy bears lined up against a wall. Group of cars passing by a long row of apartment buildings. Several small boats are on the water on a foggy day. A bathroom that has some blue tile on the wall. A pizza has meat toppings on a square plate with other food items on a table. The bathtub and sink of a bathroom with a large mirror. a girl that is playing some tennis on a court A chocolate cake with candy on top of it. A fire hydrant stands in front of the entrance to an apartment. A zebra in front of a barn and pen. A bird holds its wings up as it wades in shallow water. A child in the window of a paper fire truck. A surfer hitting a trick on top of a wave. A cow laying on a green field next to it's baby. A very cute small boy holding up a cell phone. A holiday wreath with stuffed teddy bears and a penguin. 
A young man on a skateboard rides past a cafe. The items are on the conveyor being ready to be put on the wagon. A closeup of a waxed surfboard in a surf shop. A car at a show with people in background. A car's passenger side mirror reflects the image of a long freight train. A sandwich in a box with carrot sticks and an apple. Two men holding bottles with ties on their heads A man holding an umbrella on a sidewalk. A person with a tie is holding a baby. A red fire hydrant sitting on the side of a road. A man purses his lips while holding up an orange in front of his face. A giraffe is standing tall in an enclosure with large plants. A young boy sitting at a counter drinking from a straw. Two young people playing a game on the Nintendo Wii. A bowl of cherries on a table in front of different fruits. A person is on snow skis on a mountain top. A delicious looking donut or cinnamon roll covered in icing A young boy contemplating skating on the pipe. Some people in a large kitchen preparing food. A large airliner is taking off from the runway a pizza on a plate on a table A birthday cake is topped with a dog's head made out of frosting. There is no toilet paper in this tiny, claustrophobic bathroom. A baseball catcher standing and ready to throw a baseball as an umpire looks at him. A BMW motorcycle sitting in a marina with boats. a horse pulling a little carriage down the road A person with a black, grey and green striped tie on. a person sitting on a city street talking on a cell phone A highway scene with a bus and a car behind a cattle truck. A plain white bathroom with a sink and toilet. A couple of people standing in front of a TV. there are many fruits and vegetables on the table A messy living room with pictures on the wall Large pottery and bonsai trees are sitting outside. Donuts with frosting in foreground, plate underneath them all A blue motorcycle strapped onto a vehicle trailer. A man with a hat is sitting by a television. 
Skateboarder with leg tattoo riding on a skateboard A man placing a baseball on a tee for a child. A large bunch of baby bananas still green and in a basket. A woman poses for a photo while sitting on a bench by the seaside. Lady with sunglasses under pink umbrella at outdoor event. The stop light found at a Hocken avenue intersection. A man loads bananas high on top of a banana truck. A bus with its doors open is waiting at a bus stop. a tall giraffe peering over some trees and shrubbery Black and white photo of a skateboarder doing a trick. A pizza that is made of many various ingredients. A red church in between two plain buildings. A photo of a woman eating a hot dog. Small black object sitting on the inside of a toilet bowl. A very large commuter train is going down the track. Men at the beach with one holding a surfboard Two people looking into an empty, lighted wall oven. A sandwich with broccoli, onions, cucumbers and other food on it. a bird sitting on a fence against a lake two plates of food on a table with chairs a pair of scissors cutting sheet of plastic cups A group of people that are sitting at a table talking. white plates that are covered with assorted donuts A nice bathroom has a sink on glass. The horse is tied up to the post outside. A living room with orange colored walls, and a purple chair. a view of a flock of sheep grazing in a field. A man in a black jacket holds a toothbrush in his mouth as he stands near a woman with her eyes closed. An elaborate metal vase holds a decorative bouquet of flowers. A woman cross country skiing with her dog. a blue double decker bus traveling down a street Birds perched on iron poles in front of a tall building. A man is asleep in bed with a laptop open on his lap. Two horse drawn carriages travel down an old looking street. a male in a red shirt eating and some people and lights A motorized bicycle covered with greens and beans. Several planes are parked in an airport field. 
Herd of elephants standing in waterway near man in orange shirt. A professional baseball player in the middle of a swing. The parking lot by the market is full of cars. Two plates sit on a table as one plate holds a sandwich and the other holds a cup of soup. A young man is in action with a frisbee. A man sitting on a bench surrounded by trees. A desert and fork on a plate with multi colored polka dots. A group of men in uniform riding a bunch of horses. A blue and black motorcycle parked next to a silver truck. A man skiing in the air above a snow filled mountain top A cat is sleeping in a window sill. The basket of lemon is near a rubber duck in the large bathroom. Carrots, peppers, and zucchini resting on a paper towel. A crowd of people are under a tent with a giraffe. A table topped with a banana and other items like scissors. A snowboarder is at the top of a snowy hill. A very big building with a bunch of chickens. there are two very large beds inside of this room A man riding over a wave with a surfboard in the ocean. A room with beds and suitcases and other items a clock hanging down off a brick wall in a row of circular hanging light shades An old red fire truck with 3 kids sitting in it A man wrestling with a calf at a rodeo. there is a small plane that is flying in the sky A man riding a white and brown horse in the dirt. two white plates with pizza a pitcher of wine some glasses and silverware A giraffe running in an open barren desert. An orange and white food truck is parked inside. A bird stands near a car in the snow. A white vase that is holding a pink and white flower. A person wearing a large hat standing in front of a building. A white and blue train traveling down train tracks. Two large gray elephants standing in a dry grass covered field. A man and a woman standing in front of a train. A baseball player, catcher and umpire in a baseball field. Broccoli, carrot, and dome other items on a dish. 
A man surfing the waves on his surfboard in the ocean. A plant coming up from the inside of a square pipe two people driving motorcycles next to each other A small white building with a clock tower on it. A person on a skateboard up on a ledge. A picture of a blender with some liquid in it. A group of young boys riding scooters at a skate park. Two women share a red umbrella walking down the street. a person holding an open umbrella near a small pool A man that is holding on to a racquet. A cat sits on top of a toilet in front of a bathtub. The giraffe is standing with its head between the tree. a giraffe is eating food from the branches of trees A tall brown elephant walking through a lush green forest. Stuffed toy dog lays next to laptop that the woman is staring at. A group of people sitting around a wooden table with food. Three people are standing and throwing a frisbee. A desk with multiple computer monitors and a laptop. A man swinging a tennis racket at a tennis ball. a living room with two white couches, a fireplace and a window with a view A wooden desk topped with a computer monitor and keyboard. People are getting off a bus in the evening. a line of parking meters with buildings and vehicles in the background A full view of a plate full of delicious food. A horse standing in a secluded field of a Mountain Valley. A plane fitted with pontoons moving around in the water. A large dark sheep stands with two young ones. Two people sitting on a ski lift over snow and trees, one wearing skis and one wearing a snowboard on feet. A brown bear walking in an open area. two pizza pies sitting on top of wooden pizza racks on top of stovetops. A white kitchen filled with appliances next to a window. a rum cake vender in a yellow truck a laptop on a small table with a mouse a bunch of random stuff sitting together on a tablecloth A beautiful young lady walking a black and white dog past a hotel. A table that has two cakes on it. 
Series of propeller airplanes lined up at an airport. a silver gray subway train parked in a subway A paddle surfer riding a small wave in the ocean. Pots, pans and a collander displayed on a kitchen cart. A person doing a trick on a skateboard. A row of orange trees sitting along side of a dirt road. There are bicycles parked along a stone sidewalk. this tennis player stands waiting for her opponents serve Several people riding their bikes down a sidewalk. A woman that is sitting down with a book and umbrella. A woman is sitting with a congratulations sign A bunch of bananas are hanging from a rack A person is skiing on a lake while holding a rope attached to a parachute. The man had to bend down to kiss the horse. A group of cows walking down the middle of a street. A small boat floating on a lake at sunset. A bowl of oranges and bananas is in the center of the table while a plate of toast and eggs is towards the end. Bright white bathroom sink and shelf with folded towels on a shelf. A white bathroom area with a plant and yellow bottle. A cat stretching its paw over a keyboard. man riding a blue surfboard in the ocean a group of bikers driving down the street Man standing in office with glass walls eating a donut. A MAN IN A RED SHIRT AND JEANS PLAYING A VIDEO GAME. A man with a skateboard talking to another man. A cat rubbing its head against a person's shoe. Someone holds a donut in front of a box of donuts. a couple of giraffes are sitting in a pin Two large adult elephants have saddles on their backs. A laptop, a mouse, and a pen are on the wooden table. Three surf boarders talk on a dirty beach covered with seaweed. A view of the back of a bus from inside. An elephant is carrying people across a forested area. She is sleeping with her dog on the couch. A young boy sitting on a rug holding a cell phone. A man standing next to a woman as they prepare food. Two people sit on a bench in an grassy area in the midst of some building. 
A parking meter is on the curb of a hilly street. A umbrella stuck into sand at a beach with boats and hills in the background. Two men are riding on motorcycles through the air. The bus is stopped at the street corner. Two pieces of luggage leaned up against a tree. A cat stretched out next to a persons leg who is sitting in a chair holding a laptop in their lap. The snow boarder is snow boarding down the mountain. A man is seen walking out of a building. A green birdhouse sits on a wooden platform in a garden. A young elephant holds its trunk up to its mouth. Crates of different vegetables stacked next to each other A grey cat wearing a hat is getting petted. an image of a girl that is playing outside in the field Parked motorcycles and an old yellow school bus A couple of people on skis examining a park description sign. A picture of a stop sign with a small green smiley face sticker. A vase of flowers, money, and a bottle of wine sitting on a table. A clock sits above green bushes under a blue sky. Passengers wait on a platform for the arrival of a train. A large truck with crane scaffolding on the back. an athlete holding a tennis tacquet in a stadium Two uncooked pizzas has different ingredients on each. A man wearing a blue tie with the ten commandments on it. Five youths stand together holding tennis rackets on a court. a wall with a bunch of graffiti on it A man in gray and black holds up a small cell phone. The man is walking up the ski slope. a person with an orange beanie taking a picture of a gray train a plate full of vegetables sits on top of a table A woman and children surfing in the ocean. A few men standing in there military uniforms . A woman eating a doughnut sits behind a box of doughnuts. a close up of a person riding on the back of an elephant A planter box of vegetables in a fenced garden. A man standing on the sea shore with surf board in his hand. 
Purple teddy bear with book in its lap staged to look like its reading to a small orange stuffed bear beside it. Two giraffe standing next to each other under a cloudy sky. A woman is walking down the street with a red and white umbrella. The cat is sitting on a person with a laptop on their knees. A dual monitor station also hosts a cup of coffee, water, and a thin keyboard with a mouse. A white fishing boat being followed by birds Man and woman sitting at table enjoying meal with wait staff seen in background. Tennis court match with a player on each side of net and people in audience. The two ball players are setting in the dug out. Some people sit together for a meal. The very large, spceous bathroom has carpet and a jacuzzi. A surfer wears a completely black wetsuit including a head covering. One bird on top of another on a tree branch. A man holding a skateboard in front of a group of people. The intersection of a city street at a red light A group of people sitting around a table together. A young boy who is eating some food. A line of young skiers ski down a gentle slope. A delicious lookign healthy vege pizza in a box three giraffes behind a fence with a tree near by Giraffe and other animals graze in tall grasslands. A man standing next to a red motorcycle in a parking space. A man rides a cow through a parking lot. a close up of a person with a plate of food on a table A man in blue shirt standing by a brown and black dog. A large brown teddy bear laying on top of the ground. a bowl with some noodles inside of it Two people and a dog that are standing together. A man in brown shirt jumping with skateboard over gap. The man is on a ladder painting the walls. Several sheep standing and grazing in a yard. A collage of photos shows different foods being prepared. There are a lot of items laying in the bathroom floor. An old style white stove with a kettle on it. Two women are about to cut into a chocolate heart cake together. 
A man and little girl sitting on a bench near a parked airplane. two cake doughnuts with three strawberries and a cup of coffee Four fighter jets fly through the sky leaving a trail of smoke. A man sitting on a motorcycle near several bicycles with a partially visible person standing nearby. A young girl being pushed on a skateboard by her brother. One lamb, amongst other lambs, looking directly toward the camera two sheep sitting on a hill next to a fence A yellow fronted train is going down the tracks. This is a bathroom that is painted an ugly mustard color. there is a woman dressed in a costume holding a bear A parking meter and a car on a road. A train car with graffiti on the side of it. A person with a guitar hung on their body while playing a keyboard. A paper, laptop, cellphone, mouse and bottle sitting on a table. A haul of produce including squash, bananas, and mushrooms. a bath room with a toilet a sink and a bath tub A small herd of sheep grazing in a grassy field. A green vase filled with multi colored candy canes. A group of children playing in the snow. some soldiers cutting into a decorated sheet cake A red hammock set up in a wooded park. A snapshot of a family at a store taking a picture together. A blue motorcycle parked on the side of a road. Three men holding baseball bats dressed in full uniform the first man is holding the bat and the man in the middle has his hands crossed and the third man is holding the bat with both his hands cupped together. A man standing on a dock next to a boat. A man and woman playing a video game together. A giraffe standing next to a horse in the grass. A meat filled sandwich sitting next to a cup of chili. A couple of sheep in the middle of a grassy field. A bathroom with blue walls has a window, a sink, a bathub, and a toilet. A bike and a dog on the sidewalk outside a red building. A cat laying on a pillow on a couch People in casual sports uniforms running and jumping around. 
A large double decker bus is driving down a street. A photograph of a kitchen inside a house. A field with horses on a cloudy day. A Dilbert doll sits on a table next to drinks and a plate of donuts. A guy is doing stunts on his motorbike. A room full of American soldiers eating pizza. A kitchen, including a table, oven and cabinets. A vase full of flowers is sitting on a deck. A traffic light and street sign on the road. A cake that is shaped to look like a child's toy. A group of friends sits in their living room while playing video games. Black and white photograph of man on skateboard carrying a surfboard. Two small babies sitting in feeding chairs with spoons in their mouths. A woman that is kneeling under a elephants trunk. A church lit up at night in a town. Man with glasses talking on cell phone in car A dog and a man are herding sheep. A baseball game with a batter and a catcher. A family riding on the back of an elephant The man stands on a stage as his neck tie blows in the wind. Harvested bananas, still green, sit in a pile. A cat standing in the fridge with milk and juice. An electric train pulling into a train station. women sitting on a bed while man is getting dressed A pizza is sitting on a pizza stone fully cooked. A person waiting to perform a stunt on his skateboard on a quiet street. An orange cat laying on its' side. a hotdog a hamburger and some onion rings an old rust bucket truck with a cracked mirror Several teenagers are playing soccer in a field. A very blurry picture of an intersection taken from a moving car A white and red boat in water with lighthouse in background. A plane prepares to land on an airport runway. a meal with meat, rice, and vegetables Man poses for picture while sitting on the motorcycle a person riding a surf board with a sail A miniature blue train engine sits on the tracks in a rural setting. An Asian gentleman sitting in a blue chair at an open office area. 
people skiing down a roped off section learning a bunch of traffic driving on a city street Long empty white bus sitting out in the parking lot The back side of a small charter jet flying through the air. A balding man with glasses, standing near a bridge. An open door leading to a small bathroom a close up of a sandwich on a plate A homemade pizza with gourmet toppings cools on a plate. A woman standing in front of a large candle lit cake. A professional baseball player holding a bat during a game. Antique black and white photograph of surfers on a California beach there is a large pizza with toppings on it Two slices of pizza sitting on a white plate with soda near it. a large building with a fence in front of it . A restaurant sign hangs in from of a large oak tree. The pipe smoker enjoys his nightly smoky ritual. A group of snowboarders poses for a picture on top of a mountain. A man jumping up with is hands raised while playing Wii A duck is in the air flying over water. A group of snow skiers waiting at the top of a mountain. Skiers on a snowy slope stop for a rest. Some animals that are sitting in the street. Two small beds are sitting side by side An empty side walk with in a city A man flying through the air while riding a skateboard. A double decker bus driving while it snows. a little bathroom with a striped tiled floor some people a clock tower and a black and white clock A saddled horse tied to a rope on a beach The street sign in posted near people walking across a road. People watching two school buses crash on a dirt field. a zebra is walking around in the snow A metal wire fence confining sheep inside a grassy meadow. colorful head pieces on large elephants for entertainment Some very big trains one of them blowing smoke. Photo of a man riding an old styled bicycle near what appears to be the Golden Gate Bridge. A woman is walking and holding a kite A siamese cat playing on the bed with a tabby. A dozen surfboards are lined up on the beach shore. 
black furry dog sitting in front of yellow fire plug A man playing frisbee with a child in the park. A bus is making a left turn behind a white car. Man looking at camera taking a bite of food A road sign advertises luxury while a cow rests on a dirt lawn in front of run down buildings. A seagull holds a small fish in its beak Soda with a plate of food, such as, pork, macaroni, and corn. A desktop and a laptop sitting on a desk. A flock of birds in motion of a field of grass. A painting of green apples next to a bunch of bananas. Two cats lying stretched out on a bed. Two horses are standing together on the beach. A food truck that sells soft frozen lemonade that is parked near other cars and kites are flying overhead. A bouquet of different flowers is in a vase. A snowboarder catching some air over a bump. A woman ordering food in a dark restaurant. These are crab cakes served on lettuce leafs. A man is flying a kite at a park. a group of zebras together in the grass A school bus covered in art and a sign. A holder with toothbrushes, toothpaste, make-up and earrings. Two men stand holding skateboards in front of them. Modern looking living room with white flooring and furnishings Three red traffic lights suspended above an intersection by a cable. Unoccupied park benches near very unusual, leafless trees. a person riding a skate board on a skate park A photographer holding a camera is looking in a mirror. A busy street with many people walking down the sidewalk. This is a man and a dog walking towards the water. A large display with many watermelons and bananas. a silver and blue fire hydrant lights and grass diced meat and tomatoes are mixed with cheese and pasta in a large bowl. a person wearing a vest, collared shirt and tie in front of bookshelves a man standing on the street at the bus station A brown and white cow standing in front of an iron fence. 
a foot long hotdog and a regular hotdog and a mug of beer some people walk down a city sidewalk by stores A black cell phone resting on the table. Man standing in a living room holding up a Wii controller. A drink in a mason jar sitting beside a vase of pink flowers. A clean bathroom with a white toilet and black bath mat. A bunch of kids and some grown ups skiing. A woman holding a tennis racket while people watch from the stands. A lot of oranges are on a plate, with some having spilled onto a table. A large building that has a clock on it. A train with a red and yellow engine on a railroad track. A man and a woman holding Nintendo Wii controllers. I love the way the sun is creeping behind those two buidings A pair of glasses and a cell phone next to a laptop. a close up of a street sign with a building A man holding a phone up to take a selfie. a lady petting a giraffe behind a fence The dog is laying on a rug in the the living room. A man wearing a tee shirt eating a sandwhich. Sheep are laying down together in the snow. a vintage photo of some cows grazing on some grass A truck driving down a rural dirt road near a street light. A horse carrying a carriage getting a drink of water. A child wearing a hat, tie, and white shirt smiling A small baby is eating a long banana. A man gets ready to swing a tennis racket. Bananas on a table woman using a cell phone on another. A snow boarder is in mid air on the mountain. a train sitting next to a pedestrian sitting on a bench on a railway platform. A crowd of people standing next to a parked truck. A woman with a cake and bag on the street A woman seated and another standing with a cake and soda on the table a book and a tablet on a black desk A little girl in a green dress watching a herd of sheared sheep. Three horses are seeking the shade of a large cottonwood tree. A man in a red shirt in midair catching a flying disc. A sad, young girl sits on her bed, moping. A hill that is used for people to ski on. 
a kitchen with a stove and a refrigerator a collection of stuffed animals with some wearing party hats A view of a room with a couch, television, and a fireplace. A toddler holds a tennis racket that is bigger than they are. Farm animals graze in the grass in the sunshine. A skillet full of broccoli and vegetables cooking. There is a stuffed bear in an electric chair An area of a city street section off with police tape. He should be careful not to get sauce on his notebook. Blue umbrella in black and white photo of crowd of people A bathroom in the process of being remolded. An empty bench sitting under a nice big shade tree. Computer stand with large monitor in cluttered room. A scooter is parked on the street in front of a car. A man wearing a black jacket next to a brick wall. An old woman is playing with her two dogs A boy wearing a green shirt and helmet is leaning up against a black fence while standing on a skateboard. there is a sandwich and a bowl of food on a white plate A laptop and a tablet on a wooden table A person is wind sailing in the ocean. a silver oven and stove and some brown cabinets and bottles A small stuffed bear with a red hat. A gray and white cat sprawled out on a sandy surface outside. an upset adult baseball player throwing a baseball bat on first base a car and a rear view window on a dirt road. A bathroom with two small windows and a bathtub covered in a shower curtain. A baseball player in red shorts prepares to swing at the ball. Some people and chickens hang out in an undeveloped space. a person holding a hamster holding a piece of broccoli A surreal photo of a chair, a clock tower and a table suspended from the side of a building. There is a person in animal suit holding large toothbrushes. Two girls enjoy playing a game on the Nintendo Wii. A person in a shirt and tie is holding a can. A couple of ladies are playing tennis in this 3D image. 
A bag on the floor with various items around it such as sneakers, clipboard, scissors, insect repellent and paper towels. a guy taking a picture of some art work on the wall A beverage cooler and counter area in a small store. A man wearing a blue shirt maneuvers to volley a tennis ball. Mandarin oranges tangerine on yellow with blue trim bowl, white counter top. a few baskets of food that is on top of a table A small pizza has a curly topping on it. A wooden caddy is full of scissors and pens. A flat screen tv on a wooden shelf in front of a green wall. Two cows standing on a dirt road next to wild green brush. People spending time on a beach during the summer. A blue vase holding pink carnations and white daisies. The street sign is for Curran Street and 10th Street. A little girl standing next to a boat on a beach. Young men are playing frisbee in a park. A clean white stove with a stainless steel pot on it. A night scene of a traffic light in front of a parking lot. A black bear laying on top of a field near trees. A group of tourists are feeding some elephants. A kitchen with wood cabinetry and a double sink. The airplane is being serviced so it can make it's next flight. Young men playing on the beach with a cow in the foreground. Brown bear standing next to a big log. a close up of a pizza with broccoli a small boat on a beach with trees in the background a bunch of orange cones sitting in the road A couple of kids are on their laptops Swans are swimming in the pond at the park Two snow skiers pose to have their pictures made on their way uphill. A bathroom with tan tiled floors and a glass shower. several people are waiting to board a train A surfer standing on the beach in front of his board A woman is on skis riding down the snow covered sloped. An animal that is looking at something in the air. A man wearing a suit and tie and red hat with a silver buckle. 
a couple of kids that are playing some frizbee A kitchen is completely decorated in white and black. A baseball player holding a bat in both of his hands. Crowds of people on a street corner and a bus picks up people. A woman riding a bike down the street. A man bites in to a piece of food while outside A teddy bear sitting in a fake bath tub with a rubber ducky. A car crashed into the side of bus on a busy city street. Man with piercing riding a skate board through neighborhood A man in red jacket snowboarding down a snowy hill. a lady in a chair touching a vase that is on the floor A girl sitting in a chair holding a laptop in her hands. Covered and uncovered produce is sitting on tables at a market. A black and white image of a shipyard with some boats. A computer desk with various items around it. A couple of baseball players standing on top of a field. a woman with flowers in her hair staring at the horse next to her A skier skiing on a snowing day with trees in the background. A cloud rolling over a ski slope with skiers watching. Some women who are cooking a pizza on a grill. A little girl cutting up food on a cutting board. a brown horse feeding on the grass which is well cleaned A woman holding a cell phone while she smiles. An older man wearing a suit and tie. A couple of animals on a grass field. A wooden table with a hotdog and a pitcher of beer. An otherwise ordinary roof and chimney are offset by an ornate tower resting in the middle of the roof that features ornamental work, a walkway, a weather vane, and a clock. A person in blue ski pants on skis going down a slope A girl in a hat sitting on a dock near the water A woman laying on the floor next to a dog and a cat. An old smiling lady holding out a remote. Man behind counter in shop with coke cooler, newspapers, condiments on table. A skate boarder flying high in the air over steps. A red stop sign on the street in the snow. 
Two adults and one baby elephant walking in the woods A woman is jumping her horse over a piece of wood. Multicolored kites flying in the blue sky with a few clouds. A boat that is on some wooden cylinders on a beach. A window stands beyond a large tub in a room. A transit bus riding down a street with trees lined along it. Three surfers standing in the sand holding surfboards We see a blurry picture of a person riding a bike through a field with some cows. A small airplane flying over a field filled with people. A sign on a street post advises smiling. this grizzly bear is standing in some shallow water A woman walking up some steps towards a door. A fried piece of lobster sitting on top of a table. A person and a dog playing with frisbees. a table that is full of many different teddy bears Many skiers are walking through the snow with skis and poles. Someone is showing a text message to the camera. Slices of pizza in a box next to a DVD movie. A dumptruck is parked on a street near a hill. Two young cows standing next to each other. two brown bears lying together and relaxing on a rock an elephant extending his trunk out and on to the ground A close up of a plate of food containing eggs and toast. A zebra and her baby walk through dry grass. An older man in shorts with flip flops and an umbrella standing next to a luggage belt. A male and a female walking together in a military airport. a cat and a dog near one another A woman and child are in the kitchen eating food. A tennis player getting ready to serve the ball. A bathroom scene with focus on a mirror and a bathtub. A person in a ball cap sheering a sheep. a coin-operated parking meter stands beside a brick wall along a parking lot Men standing around outside on possibly a movie set A dog laying on a red couch in a room. A woman with glasses contemplates something as she rubs her chin. Two persons on the sea shore holding a ski board. Peddlers in boats on the waterway talking to people on the sidewalk. 
an older person on an air plane looking at a display on the back on a seat People are standing outside near a clock tower. A display of vintage items including an antique television, Barbie dolls and a lunch box. Young boy dressed in a large baseball uniform. a woman holding a mitt during a baseball game A large television screen in a large room. A photoshop of President Obama and a celebrity A cat sitting on a bench in front of a building. A man sitting on the floor by a window with an electronic device a train is moving forward letting out a huge puff of black smoke a man in glasses gazing at the pizza on the table A sign with plants and shade umbrellas sitting on the side of the road. An old blue truck is on a grassy area. Little girl covering her face and sitting in a wooden chair outside of a door. A humble kitchen has a stove and microwave. A herd of sheep standing outside of a pen. a large bunch of flowers outdoors in a field The cat is on the counter in the bathroom. White goose with young floating on water in daytime. A cow grazes from a junk pile, as a bird of prey soars overhead by the side of the road in a desolate setting. The single bird has a small head and a large body. Large group of food sitting on top of a table with white dishes. A toilet and sink are connected to a steel piece. A girl is standing outside flying a kite. A plate with meat, broccoli and cheese and a potoato. Several elephants walking on dirt and grass near body of water. Two young girls holding hands in front of giraffes A man rides an elephant across a body of water. a train that is on a train track Several countries have their flags displayed with flower memorials at the base of lighthouse. A fat hipster wearing a gray hat, a pink shirt, and a black butoniere. Motor bikes with multiple packages driving on city street. A group of tourists watch a herd of sheep in a field. 
a street sign with a sticker on it to make it look like someone on a cross Couple walking with an umbrella in the dark. A bathroom with a toilet and sink below a window. A white bird with a long black peak standing near the ocean. A person riding skis on top of a snow covered slope. brown cabinets in a kitchen with black appliances A man with a tennis racket and ball is on a tennis court. A foot long sandwich on a plate on a table. A tennis player holds his racket with two hands A living room filled with furniture sitting on a hard wood floor. a train covered in black dirt sitting in a fancy train station Children pay adept attention at a party as someone speaks. a man riding a boogie board in the water A ram laying down in the hay inside a wood enclosure. A large number of suitcases cordoned off by rope. A man eating a slice of pizza without holding the slice in his hands. a man is in a salon getting his hair dryed a lamppost during the day with two street sign This is a game of professional baseball being played, A motorcyclist walking away from his motorcycle that is parked beside the road. The horse is approaching a man wearing a camera. A ski slope with one skier on it doing the snowplow. A large skylight inside of a building with a high ceiling. a young man holding onto a bat by a sign woman takes a picture of herself in a mirror. A young woman kneeling behind a small stone wall. A set of bulls lying on the ground next to a boat. Two sheep stand next to a fence on grass. A whole sliced pizza and a can in a box. a bathroom with a toilet and a sign on the lid A woman balances an umbrella on her finger. A striped zebra is on short grass by a forest. an image of a cat on top of a couch Electric train car, on tracks with car carrier in background. a man with a green bandana holding onto a kite string There are different appliances in the middle of a kitchen. A man bent over in an open grassy field with something in his hand. 
THIS IS A PHOTO OF A SMALL HERD OF COWS WALKING DOWN THE ROAD Two laptops are stacked on top of each other on this desk. A painting of a blue fish flying through the canvas A single engine plane painted yellow flying overhead. a man in the kitchen cutting something on a cutting board An umbrella standing upright in a room on the floor near a wall. A woman is pointing and holding a hair dryer. Two women who are holding papers and wine glasses a street post with lights while clouds go by A large tower stands tall in front of a blue sky. A snowboarder posing for the camera on a snow bank. A group of rescue workers helping an overturned car two people standing side by side holding a glass of wine Two giraffes out in the sun either in a zoo or in the wild Train with its lights on a train track at night. A man flies a kite by the water side. a sanctuary sign and a tall clock tower Five giraffes in an enclosure on a sunny day. Fruit baskets and dips on display in a market. Beach umbrellas made of straw with the ocean in the background. A couple of girls holding tennis racquets and a ball. people in a field lfying many kites flying in the sky A elephant stands at a watering hole with its truck in its mouth. A shelf full of teddy bears on display. a train going down the tracks near a large city A bathroom with wooden door and a suitcase on metal a metal frame chair. A man in a red suit is on a white surfboard on top of a wave. A man walking in the sand with a surfboard. Multiple fire engines in the street in front of building. The boy is playing video games on the tv. Two long buses parked on the side of a road. A giraffe that is standing in a grassy area. Two women smile with skis on as they sit in a snow bank. A pair of tiny red scissors getting ready to cut. A cut in half sandwich on a plate next to a shake. 
a white plate on a table filled with pizza plices Woman places a piece of chocolate at the top of this treat a long train is crossing over a river Young boys on a couch with their stuffed animals and a laptop computer A man sitting in a chair with his legs crossed. A lady playing tennis on a court professionally. People are walking with horses on a trail of dirt and stone. A skier comes down the snowy slopes quickly. A male surfer riding a very small wave to shore. A cat is on the floor with some scissors. A dog that is sitting on a couch. An older stove sits in the kitchen next to a bottle of cleaner. An athletic middle aged male skier courses downhill. A train is traveling though a very beautiful mountain area. A stuffed bear is sitting next to some jars A dining space with a table and four chairs under a window and art on the wall. a small 3 storey building with a clock on the top A large empty bathroom with a walk in shower tub. A small child sitting in a sink brushing his teeth Double photos of two Rice University tennis players A meal laid out on a table outside at a restaurant. A stop sign on a pole in the grass. A boy getting ready to hit a baseball at a game. A man riding a skateboard into the air. large gothic styled church towering over cemetery A rusty fire hydrant is between two poles. A plate full of food accompanied by a glass of wine. An old train is making its way through the city. A ski slope scene with a skier on skis. A person and a laptop in a room. A black and white photo of a motorcycle. The four images each have different plates of food. a person riding a skate board on a ledge A pole with two wooden street signs in front of a bush. Fresh produce, including oranges and apples, is on display in bins in the sunshine. Two wine bottles on a table with one wine glass next to the bottles. There are many zebras out on the plain. A flock of birds are clinging to a tree. A piece of paper and some scissors on a table. 
A women holding a tennis racquet getting ready to play a game of tennis. A man carries a surfboard through the city. a small pizza that is on a white plate A white sheep standing in a wire pen. A tennis player prepares to return the ball. A large commercial air plane on the other side of a body of water. a group of zebras standing on a dirt and grass field Some ice cream with a fork on a clear plate. A kitchen filled with kitchen furniture and accessories. A man carrying a plate with food on it. A man standing in a field is throwing a frisbee. A plane that is on the ground in the air. A bathroom is reflected in a round mirror. an image of a skateboarder doing a trick down a ramp a dark gray horse grazing in the field This is an arrangement of pebbles and fruit with a butterfly sitting on an orange slice. A woman in her bra and a dress holding a giant green object. A zebra stands near a mound of dirt in a wooded area. a close up of a slice of pizza in a box A smart phone sitting next to a receipt on a table. A kite that is stuck in a tree. Book case with books and computer with keyboard A woman faces a truck that is loaded with luggage. The man jumps high to hit the tennis ball. three brown bears are cooling off in the water an image of two horses with noses nestled to each other A suitcase and a stroller full of miscellaneous items abandoned on a city sidewalk. A building with an ornate clock fastened to it near a flag. A group of different animals that's standing in the dirt. A bunch of fruit like banana along side each other. A young person stands in the kitchen, holding up a box of food, near the island counter. some people riding some bikes right by some boats A man on a striped board windsurfing in the ocean. Man preparing to serve ball on outdoor tennis court. The giraffe seems calm inside of the fence. A couple of people carrying surfboards under a pier. A person sitting at a table eating pizza and drinking wine. 
Two men standing in a living room next to each other. a ca dipping its head into a toilet bowl A group of surfers ride a wave on their surfboards A young child sitting on a surfboard at a beach. A person is riding a snowboard down a snowy hill. a hotel room with a nice tv and sofa setup A girl who is wearing a baseball glove. there is a woman cooking in a very large kitchen A young boy wearing a blue shirt standing next to a woman. some people walking on a pier and a skateboarder Cat sitting near a row of shoes and boots. A strand of beads on an open laptop computer. Two military men being honored with an award. A bird on a beach with the ocean in the background. A sub sandwich is fully loaded and must be eaten from a container. The cow is grazing in the tall grass. Green highway signs pointing in opposite directions next to a building A brick outdoor structure of the Delacourte Clock. The perspective of the skateboard picture creates an unusual scene. A boy and a girl pose for a prom picture. A wooden table with bowl of soup and cup with beverage in it. Two giraffes under the trees on a sunny day A surf board rider falling off his board while a ship sails out a sea. A plate with a sandwich and french fries with a drink in a glass. A person's hand holding a bitten into doughnut. Several cows laying in the grass on a sunny day. A small boat tied to a dock at a pier. an old silver and brown double parking meter A mountain covered in snow with a person on a snowboard. A woman grimaces in frustration with a video game remote. a truck sits parked next to a bench A cat sitting on the home office desk by an open window a steeple outside of a window with a clock Person on the tennis court bent over with racket in hand A large elephant stomps around on the dirt covered ground. A woman smiles from behind a bar displaying liquor bottles. A kitchen view of a refrigerator, with TV trays next to it. A skier standing in the snow next to a yellow and blue train. 
a man bouncing a tennis ball on a court before he serves A man wearing a backpack and holding a suitcase on the road side. A blue, yellow and brown house with a clock in front of the fence. a tiger striped cat hiding under a bed so many people at the beach swimming and resting part of a sandwich sitting on a table Three men sitting on a bench holding black luggage. a dog sitting in the driver seat of a truck A female in pajamas and hooded sweatshirt playing a video game. A group of people riding an elephant through a forest. a plastic cup of almonds some crackers and cheese Large made up bed in modern bedroom, with small desk. A woman tying a horse down to a trailer. A zebra runs across a field with antelope in the background. A man is standing in the middle of a living room. A piece of pie sits on a red plate. teddy bear like candy on a wooden table A man on a skateboard is riding on the ramp. A close up picture of a vase in front of 6 other vases. an airplane flying about many tall buildings and cars A clean living room with multiple sofas and a flatscreen television. People standing at a bar, eating appetizers and drinking wine. A bridge over water that has several trees on one side. A little girls peers into display of goods in a bakery. Someone is doing something right now that is fascinating. A bald headed man on top of a red motorcycle. The man is working on his cell phone by his desk. A woman pouring coffee into cups on a counter. A group of men on a field playing baseball. A motorcycle sits parked across from a herd of livestock. a man on a surf board riding on a wave A cat is licking up food from a blue plate. A red brick building sits on a corner and has a tower and a clock. A wrought iron bench sits above the sea shore. A garden filled with lots of green plants. A plate full of food that has carrots and some meat on it. Three beds in a white bedroom with two windows. 
a photographer wears a umbrella to get camera dry A child in a blue coat skiing on a ski slope. Some french toast sits on a plate next to coffee. A table with a plate of food, pitcher of orange juice, coffee and sugar packets. A brown and white dog with long ears holding a yellow frisbee in it's mouth. a woman is on her cell phone on the sidewalk Two planes that are flying in the sky. A table topped with plates and trays of food. A woman swinging at a incoming tennis ball Two Clydesdale horses being walked through a park. Skiers enjoying a day on the slopes in the sun A group of men are playing a game in a living room. The living room has a long grey couch and a rug under the coffee table. a man sits on a park bench surrounded by pidgeons an image of a cat with a tennis racket by a girl an old jet fighter with a propellor sitting in a plane graveyard A kids ski school with one instructor teaching A woman shops at a market with an assortment of fresh fruits. A fire hydrant and a little yellow ball person is between three yellow poles. The sausage is sitting on the side of the plate. A train decorated with candy canes and other Christmas decorations. An open laptop computer sitting on top of a wooden desk. A living room decorated with a modern theme. Some players in action on the soccer field. A very small bathroom has a toilet in it. A male is eating a large piece of food with his mouth wide open. A couple of people underneath a building with a clock. A large tower that has a clock on the side of it. Young boys are playing softball on a dirt field. Two bears playing in a water hole at a zoo. The vase has some beautiful flowers in it. A customized motor cycle with skulls on it A custom motorcycle on display at a motorcycle show. A black tour buss parked on side of road A person jumping up into the air on a skateboard. Blue passenger train passing through an open forrest. Black and white photograph of a women's tennis team A snow covered sign in a city neighborhood. 
a cat with some kittens laying on a bed Two water buffalo's standing together by a fence. The words Market Street are written on a white sign. A person leaning on an upright skate board in front of a building. Two people seated on a couch, one with glasses and holding remotes. A boat is sailing on the water in foggy conditions. A surfer waits at the water's edge on a rocky beach. A desk has a keyboard, monitor, and laptop on it. a closed up flower laying on a huge leaf This toilet has a weird plastic piece on it. The dining table is in the middle of the large kitchen. a guy in a black suit with a bright tie The city is next to a beach and many docked sail boats. A man riding a surfboard on top of a wave in the ocean. a kitchen sink with several white mugs hanging on the wall. A city in the night light up with lights A family poses together during a day out skiing. A black and silver fire hydrant sitting on a sidewalk in front of a brick building. A baseball player standing on top of a green field. A woman in the kitchen with others preparing a meal. A giraffe is eating in an enclosed space. A view of a few cocunuts in a basket. A tall shell gas station sign proclaims it is the Czech stop. A man standing in a room holding something in his hand. Woman with umbrella walking in the rain next to man. three small birds on a sandy beach A cat laying on top of a couch on a shoe. a chopping board with some cakes on it A lush green hillside covered in cows grazing. Three baseball players stand on a baseball field. A woman getting ready to light candles on a cake. there is a dog sitting in a room where there is sun People holding various phones in a group together A black tennis player swinging the racket towards the ball. a girl is turned around on a wood bench an image of a military man holding his daughter A man who is standing in front of a crowd talking. 
A man and woman holding coffee and talking to a woman in the city while walking their dog A close up of the push to walk button A couple of animals lounging on a hill in the open. A horse that is walking around by themself. A couple of computer monitors sitting on top of a wooden desk. A woman is sitting at an outdoor table using a cellphone. A long haired house cat, sitting in a shallow pot, is roaring. a man on a horse that is in side of a gate a polar bear sleeping on a rock ledge A women riding a scooter on a busy street. an image of a dog that is catching a frisbee Woman carrying bags eating a hotdog on a crowded street. A beach with a lot of kites flying in the air. A train going down the track with steam on top and a bicyclist riding beside it. a man in a a hat i standing with a horse A keyboard, mouse and monitor sit on a desk. Two large toilet sectionals in the middle of a grey bathroom. this is a cat in front of a tv A mirror, road signage and a skyscraper in the city A bathtub and sink under a window with a lace curtain. A person is holding a sandwich in one hand Some baseball players are playing a game. a boy on a skateboard is about to skate down the ramp Picture of a person that is reading a book. A white plate has a brown stripe design in the middle A sink in a kitchen under a microwave oven. A couple of black bears snuggling each other. A traffic light with a building in the background. a man on a surf board rides a wave A building with a sign that says Donuts above the door. Trays of a variety of different donuts for sale. close up of a pastry with a bite taken out of it A man is standing and talking on a cell phone. A bus is traveling down a city street that does not have much traffic. There is a flip phone in a banana shaped case A giraffe running around a field at a zoo. A parking meter reserved for the disabled outside of a boutique A man holds an oversized frisbee at the park. A toddler happily takes a bite of a donut. 
A beach that has people walking on the sand and in the water. A train sitting on top of tracks with steam pouring out of it. First bus on street currently not in service. A man flying through the air while riding a skateboard. The girls are checking-out where to put their surfboard in the water. A black cat rubbing up against a laptop. a close up of a small bird on a green surface A car with some surfboards in a field. A Delta airlines plane with the food services truck docked at the service door and a worker at the door. A sausage sandwich and greens sit on paper. A cake is being cut in front of little kids and parents. Cat standing on papers that are sitting next to a laptop. A studio apartment with a bed, a table, and a kitchen area. Woman poses on beach with two umbrellas in front of a floating boat A red car with various pizzas sticking out of its window. A church with a steeple and the sky in the background. A table full of food with a glass of water. Cows walking on a path between rocky outcrops. a man standing by a desk with a toothbrush in his mouth some cars and a motorcycle driving on a road A group of people watch as a man stands before them holding a string that is attached to a kite that flies in the cloudy blue sky. A gathering of people fly kites in the park A woman riding a motorcycle with a man on the back of it. There is a male surfer riding a wave while the sun goes down A black and white cityscape shows lots of people, mainly a tall, smiling man in suit and tie, who is paying attention to a woman standing beside a second smiling man in glasses and headset, who is also holding a microphone and notepad. A man making a face while biting a hot dog with cheese on it. Bright sunlight shining through a colorful window curtain. a white horse standing next to a stream, rocks and a green field. A man with a surfboard walking into the ocean. A kitchen sink near a couple of windows. 
A woman wearing a white shirt and black capris getting ready to fly a multi colored kite. Two boxes of donut with milk and juice on a dining room table. A green bus with a bike on the front of it driving. Four bears standing on a fallen tree outside. A young boy holding a blue baseball bat on top of a green field. An adorable little girl holding two ski poles. A motorcycle parked across from a business next to a highway. A red and white wings black bird sitting on wood Man and woman at an outdoor restaurant smiling for camera. a person riding a two thick wheeled bike on sand a group of people shopping for fresh fruit and vegetables at a market A baby elephant following behind a mother elephant several bottles displayed on counter in well decorated indoor area. A yellow train is traveling down the railroad tracks. Fireplace with brick border displaying many photos and decorative flowers. A view of a bathroom, that is very old looking. An airplane ready to let passengers get on. A rendering of an old fashioned water closet. A old time picture of a woman milking a cow. A pinto horse walking in a coral with two people. A dog standing on a chair eating out of a dog bowl. A cute puppy curiously looks to see whats going on. A woman gets a fresh glass of wine from a cask using a glass instrument. A bedroom packed full of home goods and luggage. Military colors being shown at a baseball game. A group of alpacas grazing on a dry hillside. A woman and her son picking out sweets at a bakery. A young skateboarder wearing safety equipment skateboarding down a sidewalk. A dinner plate with meat and vegetables on it. A kitchen large green hanging plant and a door. A bed with four pillows and the covers turned down. A large motorcycle is parked next to a brick wall. cows in a small wood and straw shack a vintage photo of a woman sitting on a horse with a man in a suit standing A woman smiling while holding a yellow banana. The lady is sitting with food in her hand. 
Pedestrians cross the street during a winter day. assorted foods separated in bowls on a white table a number of people standing in a kitchen area with a counter top Four pieces of pepperoni pizza on a plate. A man in a suit standing in front of a window A woman is walking her dogs on the city sidewalks through the newly fallen snow. People swim in a pool on a beach resort. A toddler in a t-shirt holding open a refrigerator door and looking inside A street lines if restaurants with signs hanging off of them. A baseball sitting in a baseball mitt on a blanket A lone kite is flying above the water and under a blue blanketed sky. Two road side workers chatting, one is holding a stop sign. A homemade square pizza fresh from the oven. A lanky skateboarder poses against a barn-red door. a dog stands inside of a boat as it stares at a camera A man surfs on a surfboard over a wave A foot ball fan is showing off his team spirit A man in black jacket with dog in snow. A tray full of breakfast items served on a plane. A young man in a black shirt and purple tie driving an automobile. Different style toys placed next to eachother and a batman costume. Two men are talking to each other while holding a skateboard. A pitcher throws a ball while the opposing team watches. A man in uniform is looking at his phone. A woman is pouring a bottle of wine into wine glasses. a man in a blue shirt and a orange tie A bird sitting on top of large pile of brush. A man casually throws a frisbee into the air. A woman in a Sailor Moon costume rides a motorcycle in a street full of people a laptop on the floor with a cat on the laptop A plethora of stop signs in the same vicinity of each other. A pizza with a sign with a cartoon mobster. There are horses walking beside of the cars. A cow lies down in a pen and looks at the camera. Three boys peel vegetables and cook at a counter. A police officer on a police motorcycle rides past a line of men in uniforms. 
A wide photo of two people kite surfing in the water. Several trays of pastries sit on a table. Highway road sign announcing exit ahead for vehicle traffic. A black and white photo of a woman asleep on a park bench surrounded by foilage. a toddler sitting at the end of a surfboard on the beach The worker is cleaning the eating area for the customers. A baseball player swinging a bat while standing next to home plate. A woman sitting beside a table full of fruits. A train with smoke coming out going down the tracks A airplane sitting on the tarmac at an airport. A gross bathroom has graffiti all over it. Dog trying to pick up an object with its mouth underneath a bench. a close up of a white keyboard with a black monitor Two military men are cutting a large cake. A woman walking down a street holding an umbrella. A red stop sign posted next to a tree next to a sidewalk. a clocktower standing high with lights on A dog and cat in a master bedroom looking at the camera. THERE IS A BATH TUB AND A SINK IN IT Boy in purple shirt holding a tennis rack on tennis court. A full view of a picture cloth with an animal. CHEF IN KITCHEN WEARS FACE MASK WHILE PREPARING FOOD. An assortment of fruits and vegetables sitting on a counter. Two people walk down a walking path. This is an image of several kids playing soccer. A man carrying a surf board into the water where there are other people. The pizza in the box is divided into four slices. A farm picture with an old cabinet and a horse with its head down. A plate of food including chicken, rice, and beets. A plate of food with onion and broccoli on it. Square white plate with a sandwich full of meat and dressing. A group of people walking down a wet sidewalk. Man riding white horse in the street while others watch. A man holding a camera standing in a crowd. a train on a track near a platform A man doing tricks on a skateboard outdoors in a city. A group of people mill about on a lawn of a building. 
A green highway sign beneath a beautiful blue sky. A toilet with a wooden seat is in a small bathroom. A big road sign listing three different locations Hotdog sandwiches sitting on ears of corn on a table. The young child is riding swiftly on a skateboard. A very close up look at a tasty looking pastry. Motorcycle police and their bikes with Battenburg markings There is a seagull flying towards beach umbrellas A man leans against a wooden box on wheels that contains a teddy bear and a basket. A large white bus is traveling through the city streets. A young woman walks in the rain, smiling and holding an umbrella. A neon green toilet and sink are by a large trash can. Some people gathered together on the snow covered ground. A man riding a surfboard on a wave in the ocean. The crowd of people are gathered in front of the building. this is an unmade bed with a flowery blanket A long line of skiers is waiting on a snow covered mountain. Smiling young girl holding video game controllers while standing A woman in a blue riding jacket rides a dark brown horse on a riding course. A brown dog laying on floor under a brown mat. A couple of people riding waves on top of boards. A sandwich on a plate with a side of coleslaw on a tray. A small boat is going down a water channel. A woman with an umbrella standing by a fountain at the park. A clock behind a fenced in area in a city setting. A cat sitting on a couch looking intently at something. Street signs showing streets with a one letter name A young man scoffing a huge slice of pizza from two paper plates. A table cluttered with a bunch of stuff. A woman wearing sunglasses and a hat is smiling. A train sitting parked on tracks next to a platform. A young woman looking at a store display and holding an umbrella. A clock that has been placed on a window sill. A giraffe in a grassy fenced in enclosure. The colorful lights are illuminating the darkened street. A stuffed zebra posed and being chased by stuffed wild dogs. 
A rear view mirror has the reflection of a truck. A small pizza sits on a granite counter top next to a napkin. Two firetrucks with their lights on are stopped on this road. A woman sits cross legged near a pile of eggs. Guy on bench looks over while eating pizza Chickens on a sandy beach with a motor boat in the background. a couple of zebras are inside of a caged area A bus stop sign that is on a pole. A English muffin lays on a plate next to a drink. An ornate antiqued pole holding a clock with trees in the background Small white sheep below another sheep eating in an open field. Several kites are flown along the shoreline on a cloudy day. two people holding surf boards on a beach A zebra standing on top of a dry grass field. A couple of giraffes looking attentively at the camera. A non passenger train sitting out on the tracks at a curve Colorful lights reflect off the items inside this bathroom stall A man standing behind a woman holding a bat. A man playing with his dog near the water. Two men playing frisbee in a large field The man is outside playing Frisbee with his dog. A large elephant standing in a grassy field. A man is using his board to surf a wave A young man wearing goggles with spiky hair dressed up like Robbin. A toilet sitting in a bathroom under a window. Sink with electric toothbrush and toothpaste sitting on the top. Plate of vegetables made from knitted yarn on wooden plate. A group of young people playing a game of frisbee. Two young girls sitting a big bench on the beach. A small dog with long hair sits on a computer desk. Several brown cows grazing in a field. a person standing next to a fire hydration that is spraying water A cat underneath a car on the pavement looking Rome underneath A very large jetliner sitting on top of a tarmac. A group of people sitting down at a table together sharing a meal. This truck has an open deck for the passengers. 
Interior of a public toilet stall in a country that squats to defecate A counter topped with small different shades red tiles Young boy in front of a large elephants cage. There is a parking meter with one side covered up. A large passenger jet flying through a cloudy blue sky. A black steam engine train sitting on top of rail road tracks. A very close up view of a very pretty bird. A store that has trees on the side of the building. A large stuffed white teddy bear sitting on a bed. A baseball player wearing a white and red suit with the number 19 gets ready to hit his bat. a large bed is in a white room Several pieces of furniture are in an empty parking area. A small kitchen with stainless appliances and red cabinet doors. The baseball between the pitcher and the batter during a game A vase of flowers on a table near a window A fighter plane is taxiing down a runway. The father and daughter are under an umbrella on the beach. A man walking on the sidewalk next to a suitcase leaning against a lamp. A cat laying in a bowl on top of a pillow. A woman is standing in front of a birthday cake. Cars driving on the street and people walking on the sidewalk in a city. A plane is sitting on the ground at the airport A pier stands in the ocean while people wade in the water. A green fire hydrant sitting in the middle of a sidewalk. There are apples and oranges on top of a table A person cut out a bird shape out of a piece of paper. a big crowd of people that are looking at a zebra A large bear walking around a zoo enclosure. a zebra standing next to a car on a bright day A woman flying a kite on the beach under a grey sky. A large flock of birds flying in the air. A soccer player is about to kick a soccer ball a fridge stove sink and dishwasher and a dinette set in a kitchen A group of people standing on top of a building near a large clock. Different types of fruit displayed on a table. Skateboarders waiting to hear the go ahead word to skate down a ramp. 
An open laptop computer sitting on top of a wooden desk. cars on the road that are nothing but blurry lights A bedroom scene with a bed and dresser. A young woman is pulling a casserole out of the oven. A man in a blue shirt serves a tennis ball A table holding two trays of cookies and a cake. A full view of some cows grazing on a field. A plethora of apples sitting inside a bowl. a home made pizza sits on a trey A white toilet missing seat in an old bathroom. a yellow and black train is on some tracks A sepia-tone photo of a man and a boy standing near a stove. A piece of cake sitting on a square plate. Man standing on side of busy street next to a mall. a small airplane sitting in the middle of an airstrip in a field a couple of surfers are walking out of the sea There is a group of people flying a kite together Two saucers have a doughnut and cappuccino on them, respectively. A woman seems to be doing yoga on a surfboard in the water Man holding dog mouth open to brush teeth in tiled area 3 microwaves cooking something and catching on fire. A man who is riding a wave on a surfboard. A clock on the wall inside a mass transit vehicle. A computer and keyboard are on a computer desk. Two men stand near another man who is jumping onto a bed. A group of people standing around a living room a heard of sheep on a grass field. A group of people ready their skiing equipment in the snow. A tennis player throws the ball up to hit it. The traffic light is in front of the building. a man waterskiing behind a white boat on a lake A black and white cat is sitting in a window. A sandwich and french fries on a paper plate a man is flipping through a book on a bed Disc on beach, with dog prints in sand A dog is standing next to a cat on a suitcase. A plate with asparagus, broccoli, carrots, cauliflower and a sandwich. A group of elephants standing together in a field of grass. A red fire hydrant with a hose sticking out of it. A airplane that is sitting on a tarmac. 
A display case holding various types of donuts in metal racks. small boy eating food from a white plate a large green and white clock tower in the middle of a plaza A skateboard zooms down the railing at the skate park. a man in a tie holding a cigarette and looking down A cross country skier on a trail, smiling. Some people riding a motorcycle near a bunch of motorcycles. man having fun with a video game system A man sitting on couch with two little girls. A group of people stand by a red lighthouse. The mother smiles as she holds the baby boy. A man teaching a girl how to play tennis. A broccoli head with onions and potatoes by a wooden wall. A woman standing next to a man near a traffic light. A person on some skis in the snow. an image of a woman sitting in a dark room A very long and wide road with some assorted vehicles. Two hotdogs and a side of french fries in yellow containers. A stop sign in front of a brick building. Four men in a lake attempting to stand up on a board together, with their hands raised in the air, and one man in the water. people are at outdoor seating with umbrellas overhead A tennis player prepares to hit a forehand on a red clay court. Small group with a folding table next to a decorative old bus. The bowl is full of broccoli and some kind of meat. an image of a place setting with soup and biscuits A bathroom with a large white tub and his and her sinks. We see a picture of many many teddy bears. A group of men sitting on a snow slope while attached to snowboard. A pitch approaches the batter in a baseball game. A man in glasses eats a slice of pizza. this photo is blurred it is of a house A black cat standing inside of a piece of luggage. Pack of zebras in a zoo standing together. A metal pole with three street signs pointing different directions. A man riding on a wave on top of a surfboard. 
THIS IS A PICTURE OF A TOILET AND SINK IN A BATHROOM Man smiling with hat in kitchen with mess around A young man in a baseball uniform with his arm pulled back. a red bench and some buildings and lights A bird sitting on the branch of a tree near leaves. A sink with some cups on the counter top. A man on a horse without a saddle stands on a hill. Low view of small passenger train moving through the countryside. A person with an umbrella and some cars on a street. A man is playing Wii tennis in his living room. A large jetliner flying through a cloudy blue sky. A man standing on a beach near luggage. an empty and clean wood floored home kitchen A brown and white dog sits on grass next to a Frisbee. Many kites fly above a crowded beach. A picture of a room with a table that has a vase and candles on it. a plant that has a yellow bird on it Young man exclaiming over an unripe green plantain. Two teddy bears that are sitting next to each other. Some ripe bananas are in a brown wicker basket. A young boy is skateboarding swiftly through a crowded park gazebo. Four people on skis standing in the snow A book on finance sitting on a bed. There is a blue and yellow train stopped at a train stop Two oranges and a banana laid out to look like a sad face there is a military truck that is stopped on the street A crowded harbor filled with small sailboats and other watercraft. a blue bus is parked by a bench Three cats on a bar watching television very closely. Chefs and cooks are preparing meals in a restaurant kichen A microwave and a cone on asphalt by bushes. a red and white plane and a blue and white plane A very tasty looking cheese and vegetable dish A person with a snowboard next to a man with skis. A person that is eating some food in her mouth. A very small kid in the road next to a big yellow bus. A movie cover with some food on top of a plate. A silhouette of a woman with a tennis racket. A teddy bear is sitting alone in a window. 
A woman sitting on a brown couch with two children. Littleboy been playing with a Nintendo Wii and amused Two children playing a miniature version of tennis on a city street. A group of sheep walking in a grassy pasture A small airplane in the sky and another in the water. A pair of giraffes is stretching up to a limb in perfect harmony. The bananas were cut to put chocolate inside them for a treat. Two benches are empty on a sunny day. Two young males playing a video game together in front of a tv. A man in green and a red haired woman sharing a laugh. two guys riding bicycles while carrying their surf boards A bunch of different types of tools in a play kitchen. a street sign on a wooden pole near a fire hydrant Fresh cut flowers in a glass vase on a tablecloth A banana sitting on top of a table next to a paper. A person is parasailing on the water under a cloudy sky A fire hydrant is placed in a wooded area A pizza sits on a table and it has cheese, olives and broccoli on it. Sheep grazing in a lush, green field on a lavish farm estate Older Americans ride in a simple parade float adorned with red, white and blue decorations. A person wearing a glove holding a chili dog. A close-up of the rear end of a propeller plane. A black park bench sitting near the water a person walking on a sie walk talking on a phone Several cows are on a sloping grassy hill. a blue and white plane flying over a lake. a bike with a tarp and boxes of items Two men standing on the street wearing a suit and tie A shirtless man with a hat and sunglasses holding a frisbee in one hand and in a stance where he is preparing to throw the frisbee. A tub and shower with a curtain in a bathroom. Some cars at a traffic light, one with a red sticker on the back A group of people play a game of frisbee. A bathroom with a separate area from the sink. two zebras walking next to each other in a desert area A man is sitting in a chair watching television with a remote control in his hand. 
That looks like some sort of huge satellite. A person is typing on a lap top and there is a person up on screen. a giraffe in a field with rocks and grace The giraffe is standing alone in the field. A can of soda and a cat with kitten next to a monitor. a bread with some noodles and minced meat a bird that is sitting on a log in some water A man wearing a bow tie walks in the rain with an umbrella. Large man in leather biker outfit with a small brown dog. The wooden boat is floating on the river near the bank. A kitchen with a stove, refrigerator and a microwave. People flying kites in the snow on a sunny day A plate with grapes, green vegetables, and noodles on a child's place mat. a cow walking in a crowded city street A closeup of a bull cow with horns on its head A person does a snowboard trick on a rail in the mountain There is a grilled sandwich on a white plate with sauce A stove is shown with a mixer next to it. dilapidated, dirty bathroom with mold and water damage A yellow bus and blue bus passing on the street A cheese pizza pie is in the serving dish on the counter. A soccer player blocks the goal during a nighttime soccer game. A person is snowboarding down a hill fast. A man holding a ball as he leaps into the air. A kitchen that has wooden floors and a bay window. The "Yoctangee Park" sign has a Native American on it. a park bench that is on top of some bricks Side by side view of two oval plates, one with fork, with chicken salad sandwiches and rosy new potatoes, by an open and an unopened bottle of lager, a pepper mill, paper towel roll, basket behind. A very large elephant in a field standing next to a pond There is a woman drinking from a fire hydrant and several other people nearby. you can see a large belt that is used to make donuts A man holding a tennis racquet on a tennis court. A man holding a racket playing tennis at the court A woman sitting on a bench reading a magazine. A young boy and girl playing on a ride. 
A large grizzly bear walking through tall grass. A pair of scissors next to a writing instrument of some sort. A small child using skis to ski down the hill. Surfers walk out through the surf toward large waves. A person is holding a computer and watching a flat screen t.v. A bus is stopped in the middle of the road. Elephants with passengers walking through a calm river. A little boy that is standing on a skateboard in the street. Meat, carrots, and a roll sit on a small white plate. A bathroom with a wall mounted toilet and TP dispenser. A herd of zebras in a tall grassy field a close up of a plate of fruit with apples A zebra standing in water next to grassy area. A large herd of elephants at the edge of a body of water. a kitchen with a refrigerator near a window A small elephant laying on the ground in the mud. A red train or trolley car is shown at a station. A man holding a baseball bat on a baseball field. two people riding skis across a snow covered forest. a person is touching a small stuffed bear A couple of horses grazing on a lush green field. A woman standing by a yellow fire hydrant A man plays with two young children in the grass. An elephant stands between two bushes on a dry field. A shoeless foot standing on top of a skate board A man stands on skis near a snowy mountain. A farm house and barns in the background with horses and farm animals in the yard in front of them. Several views of mean playing with a white disc on grass. An ornate wrought iron frame holds a sign reading GARAGE. a man playing tennis going for the return A child in white shirt laying on bed in wooden crib. A man and woman pose for a photograph while sitting on a moped. A close-up picture was taken of a giraffe. A yellow plate topped with different types of food. A women is laying on a board surfing a small wave. The zebras are eating the grass in their habitat. A man dressed up for a themed party. 
A black and white photo of an old train a women that is eating a very long hot dog A post with a clock and several birds sitting on it. An Asian stir fry on a plate with chicken, brown rice, and broccoli. Two men skateboarding down a road near some cones. The street sweeper has a safety triangle on the back A motion blur street scene of people and a bus. A man riding on top of an elephant. a car on a road with people standing on the side walk A stuffed bear and a stuffed bunny sitting beside of one another. A man taking a swing at a tennis ball A photo of a green and red train on a set of tracks. a vintage photo of some kids playing on a bed The skier is upside down in the air. A kitchen counter with an unassembled food processor. A group of people sitting on a train. A dog that is standing on top of a fire hydrant. Three white plates topped with pizza on an orange table. a black and white photo with a sign next to a building I cant see wht the images are in this one A planter is full of green plants along side a fence. a monument with many kites flying near by A living room with hard wood floors and furniture. A rusty looking parking meter is on the pavement. Three horses are walking through the grass wearing blankets. A man in a suit and tie beside a stack of suitcases. Two pieces of pepperoni, sausage and ham pizza on a plate. A bed with a stuffed animal looking out a window A batter, catcher, and umpire stand on a baseball mound. A closeup of a shelf displaying a canned beverage and a muffaletta sandwich. Man standing on a skateboard with a person sitting on it. A bathroom stall that says did you check your lipstick A large wooden clock hanging from the side of a large cement pole. A small blue and silver airplane spewing smoke at an air show. A bag sits on a white sheeted bed A white house with a red top next to the ocean. A large green truck with giant tires on it. A packed Chinese train is filled with commuters. 
There are several bananas tat are in te tabe A vase of flowers that is on a table. Two dolls with crazy hair and interesting clothes. THERE ARE BLACK AND WHITE KEYS ON THE KEY BOARD A computer desk is very cluttered with various items. A trellis and arbor with a bench under it A bunch of zebras grazing near a road where vehicles are driving by. A group of parked motorcycles at a parking lot A person can be seen trying to cross country ski as though they are on a slope. a person is skiing outside in the snow A market display with the rows of vegetables in baskets. A man leaning in to see the laptop he is using A man in black jacket playing a game with a Nintendo Wii controller. A cat is sitting on a laptop's keyboard An old man in a suit and tie is staring. Two pizzas sitting on top of a counter top. A four way traffic light showing the green light lite up. A rack with many accessories next to a refrigerator. Two UPS trucks are parked side by side beside a building. A skate boarder practicing his tricks on the ramp. a little bow outside in a yard by a bar, playing frisbee Guy in shirt and tie walking away from the chairs A man doing a trick on a motorcycle. a kite flying in the sky above a body of water A group of fruit, vegetables and eggs on a kitchen counter. A man standing next to his wife as she holds their baby. A modern kitchen is displayed with silver decor. All white bathroom with shelving unit over commode. A surfer in a wet suit rides a wave. A beautiful woman riding skis down a snow covered hill. A blue sign in front of a bamboo wall. A bathroom with white toliet and sink visible five used toothbrushes in a clear glass on top of a sink Black and white photograph of a man on a motorcycle. A bunch of young boys playing soccer and having lots of fun. An adult carries a child and a surfboard through the waves. A man holding a yellow frisbee in his right hand. Three boats filled with people floating down a river. 
a player squatting down to return the ball There are a collage of pictures of different foods A dog standing outside next to a car. A woman is taking the first bite of a banana. a steel bridge over the water with a train A baseball player swinging a bat towards a ball. Five snowboarders in yellow jackets perform a simultaneous jump. A barber giving a man a haircut with a blue smock on. A wooden table with two plates of food and a paper an oven and a small table in a home kitchen A person sitting at a table with a cup of coffee. An open computer next to books on a table a big zebra that is on a dirt ground Vehicles and people on a crowded city street. Two people posing next to a giant statue with a suit case. Three planes fly high in the sky in unison. A person on a surfboard riding a wave. A tray with four different types of food. A man holding a child with a toothbrush in its mouth. A LONE GIRAFFE IS GRAZING IN A OPEN FIELD A variety of vegetables hooked on sticks on a tray next to a remote control The wheel of a bicycle going down the street A mailed postcard of people in a boat being rowed Sausage and cheese on bread on a plate a person riding skis on a snowy surface a man standing on a porch holding a bat over his head Yellow construction trucks parked in line on a dirt road. A red train traveling out of a dark tunnel A living room filled with furniture and windows. A man is lacing his boots while several others are ready to ski. Mushrooms are used in many variety of dishes Several oranges hanging on tree branches in a grove. An antique semi with flames painted on it. The woman is standing by the elephants outside. there is a slice of pizza with mac and cheese on it A dog sits in the side car of a motorcycle. A bathroom with a wooden frame around the mirror A person on a skateboard near a building. Two single beds that are made up with a night stand between them. A large clock tower on top of a tall white building. A woman wearing a towel holding a blow dryer. 
A solar panel powers a public phone booth. Picture of a bathroom with three paintings over the toilet. A photo of a place during bike week. A person is handling broccoli on a cutting board. An orange and white cat standing in front of a flat screen TV A view of a person's legs sitting on a bench alone. A man stands on a white object while playing Wii. A large elephant is staring in front of a fence. A man sitting on the beach behind his surfboard. The bathroom is clean and ready to use. broccoli cauliflower and carrots in a white bowl The Master of Hounds leading the dogs out for a fox hunt. Crammed and congested city street in oriental area with many people and buildings. A cat walking through a kitchen by a eating tray. a person holding a tennis racket on a tennis court. a shops table filled with apples oranges and other fruits A man wearing skis poses for a picture in the snow. There are two beds in the bedroom, along with a desk and a television. A tight, rectangular kitchen space, with kiwi colored walls and a grey door, shows cabinet and counter spaces of pale wood, holding built in appliances, that borders a white tiled floor. a single person standing on the side of a snowy mountain A baby crying with a teddy bear in its arm. A man and a woman sitting on a motorcycle. A view of a tree with pink flowers as soon in a mirror. an image of the back end of a childs car seat The giraffe is standing inside of the pen. A dimly lit bedroom that has odd colored walls. Roadsigns showing stop lights, right and left turns and warning cyclists to dismount. Two polar bears are sleeping atop some rocks. Two bowls of food on top metal plates. There is not much space left for anything else. A person skateboarding on an outside basketball court. Carry on bag sitting on bench near metal railing. A sticker promoting vegetarianism has been placed on a stop sign. This restaurant provides laptop computers in the booths for each of its patrons. 
A yellow bowl filled with soup next to another bowl of soup. A couple of VW buses parked in front of a small brick house. a new kitchen cabinet with a sink being installed Two cakes shaped like trains are on gold foil. The room is crowded with many things including chairs, a bicycle, and a table with cups on it. A train going through a tunnel under a building An elephant is walking through the mud behind a gate. Several bruised oranges and lemons mixed together. A public toilet with the lid up in a stall A framed picture and reed diffuser sit on top of a toilet in a bathroom. A man in a white sweater placing a turkey in an oven. All aboard for a ride on the tourist train. A bathroom wall with three urinals on the walls and images of women peeking out behind trees on the wall. A tennis player is trying to hit the ball. This is a downhill skier sticking his pole into the mountain. A girl is having fun playing a video game of tennis. Which one would you choose to drive, the beauty or the beast? A single train at a train stop with many train tracks. A couple hold their cellphones while taking selfies. Several zebras walking together in the wild A street free sign sitting under street lights on a bridge. A memorial with various plaques and American Flags on it. a toy animal is wearing a feathery hat the white vase has drawings of women on it. Two birds are sitting on some gray cement. A man on skateboard riding a skate ramp. Some cats laying on a bed and posing for a picture. a train on a track near people A lady in red water clothes skiing on a lake. A beautiful woman sitting in a bed holding a tooth brush in her mouth. A wire basket of bananas and apples on a table. Some people sitting and painting a road divider A young man is playing frisbee in the park. A wooden bench under a tree in the field A girl takes her friends picture while wearing leis. The zebra is in the field standing all alone. Several cross country skiers prepare to start down a course. 
A woman skiing down a steep hill as snow flies up in the background. A older TV on a shelf with videos on shelves on either side. An eagle is standing on top of a pile of rocks. A woman playing with a dog while another person is skiing. The white toilet is sitting in the corner of the bathroom. Three sheep standing together in a grassy field. A lady is standing by the white truck. A man with a bald head has a cell phone to his ear. A chicken and cat walk in a barnyard. A living room with a large couch and a coffee table. a tall building with some clocks on it below a cloudy sky A bunch of broccoli that is near carrots Bunches of bananas are shown for display at the market. A sandwich cut in half and a cup of coffee. A baseball player holding a catchers mitt on top of a field. A wooden double door refrigerator with one side opened up. A number of seagulls stand in the shallow water as the tide sweeps over the beach. a man dressed nicely and sitting next to a female People holding signs on a one way street. A giraffe kissing a man with a shaved. THIS IS A SIDEWALK SHOT OF A PLACE CALLED THE LION a person sitting at a table eating food from a plate. The slice of pizza has large chunks of tomatoes on it. This is a spacious bathroom with an interesting tile pattern. Dinnerware with fruit painted on them beside a matching vase. Two girls are smiling and staring in their school uniform. A woman in a room with multiple cats laying and walking around. A vase of flowers on a white sheet. A person taking a piece of dessert from a plate. A man playing tennis going high for the ball. Boats in a river with trees alongside in a rural setting. A motorcycle rider gives a thumbs up to the camera. A truck is carrying a load of logs. this is a pair of women sitting on bikes A plate with a hot dog, chips and a strawberry. Woman in a field playing with spectators watching A man and woman that are standing in the sand. A small cat is walking behind a bike. 
A man in black jacket riding skis on a snowy slope. Two people standing near the ocean with sails in the sky. a man with a hat skiing on the snow towards a building A man that is standing on a court with a racquet. A woman is trying to catch a frisbee. A stop sign on a residential area has caring under the stop. a bench near a tree near a light pole a girl in front of a stop sign A surfer in a wetsuit in the curl of a wave Two rectangular boxes with chop sticks have food in them. a young man on the beach holding a sall A girl is at a table with two pizzas. a little boy standing beside a toilet in the bathroom a man in a wet suit stands on top of a rocky hill Two ponies are running through a grassy field. A man riding a skateboard down a cement ramp. a close up of a cat on the ground looking in a mirror A statue of a man riding a horse on a tower of rocks. a guy sitting on a balcony using his laptop A parking meter is next to white wires. A group of men playing a game of tennis on a dirt court. There are many giraffes standing among each other Player walking away from home plate carrying bat during game. A woman strikes her tennis racket against a ball. Guests gather around and converse at a wine tasting. Several people mounted on horses riding down a trail. a group of surfboards stuck in the sand near the ocean A table topped with ripe bananas sitting in piles. A young boy standing on a street holding a skateboard. A man wearing a black ski suit preparing to go down a snow covered hill. a very large pizza with a fork and a knife Man in wetsuit surfing next to a small wave. A Chinese lady on a boat wearing a Chinaman hat A pink cat creature sewn to the side of a pink bag. A dog who is sitting on a couch. Many people standing in a field with a red flag and many kites. A large white cat sitting on a table in front of a TV. a bathroom with a lot of toilet paper next to the toilet A few people are in outside in the snow, with their ski gear. 
Two military officers cut a cake with two civilians. this kite is being flown above a city A man on a horse in the middle of the street. A man holding a cigarette and talking on a cell phone. The polar bear is white and showing his teeth A black cat laying on a green pillow. A plate of pastries with fruit and a fork and knife. A man surfing waves in the ocean on his surf board. A group of black cows with horns standing in the middle of a street. A broken surfboard on a beach with trees in the background. A group of people sitting around a table eating. Two people are passing a man playing a piano on the street. Black and white photograph of a fence next to a fire hydrant. Young child brushing teeth using blue and white toothbrush. Vintage red truck parked on a parking lot alone. Two hot dogs with chili, cheese and tomatoes. a small dog is sleeping on a chiar A man on a surfboard rides a wave. A baseball player is winding up for the pitch. A row of many kites in the shape of cows fly along with other kites. A woman holding a plate of food and a glass. A red fire hydrant on sidewalk next to a wet street. a star shaped kite flying high in the sky. Four pans of food on a stove in a restaurant. The blue necktie shows a picture of a pocket watch. A stop sign on a one way street. Traffic signal on the side of a bridge outside. A team of two makes their way down the water on a primitive raft. Two red double decker buses passing in opposite directions. A skier posing for picture while straddling a tree. Four people with a group of elephants on a hill. two cups of coffee next to a white plate of pastry and icecream A backpack with rollers is sitting unattended in the middle of this forested dirt road. a fire hydrant sits off a city street A bowl of soup, a metal spoon, and an orange on a wood surface People on a slope snowboard and skiing next to trees. A toilet sitting in a bathroom that is being remodeled. A cardboard box contains some old vegetables and some trash. 
Line of fire trucks driving down a city street. A plate full of different types of food. A man is holding scissors to his own head. A big pretty commercial plane on the runway. a person sitting on the ground wearing a suit and tie A little blonde boy wearing a tie and purple shirt A vase filled with pink flowers on top of a table. Two trains stopped side by side in a railway station, both with platforms A bathroom with shower and plenty of toiletries. Slices of pizza on plates and drinking glasses. A parking meter reads COPE on one side and four dollar signs on the right. Two trains on parallel tracks near a station A men's public washroom with a blue floor. Four people sitting around a computer station talking. A man next to a woman with his horse by a house. A sign on a metal pole on a street. A man sitting on a couch using a laptop Pack of elephants in tall green grass as one has its trunk raised. A home made pizza with cheese is on a shelf. A keyboard, mouse, and wires on a desk. Man speaking on phone with large sideburns A bus is going down a rural highway road. An Asian meal with noodles, vegetables and soup. A woman pushing a baby carriage by a building. A horse drawn carriage stopped near the water. A donut is laying on a large noodle looking mat. A piece of cake on a white plate next to whipped cream. People on the sidewalk near a no left turn sign on a post this living room is done in colors of black and red A street sign prohibiting bicycles, skates, and skateboards. A dog lying on a cement porch in front of a brightly painted building with a motorbike next to it A beached sailboat in the sand with a chair next to it. Fire hydrant in non-traditional paint, whitish yellow paint with black polka dots in front of old style firehouse with USA flag. The reflection of two dogs being walked down the street A young man riding on top of a skateboard. A vase sitting on top of a wooden table in a living room. A woman is looking at pastries in the shop's window. 
A pizza with onions peppers and cheese and coke to drink An elephant standing on the ground near a lake. An adult on skis is standing near a group of children with skis on. A bathroom with red and white tiling and a toilet and floor drain. Two horse next to each other walking down a road . A man reaches to catch a frisbee in a grass field. a group of kids standing next to each other in a room A train is approaching alongside a body of water. People are walking on a beach alongside giant rock formations and flying a kite. A bright kitchen with blond wood table and chairs and side server. Cross country Skiier trekking through heavily forested land with snow. A boy learning to skateboard in a park Gourmet pizza cooked and sliced and on a plate A variety of furniture sits scattered in a storage facility. A woman wake boarding in a lake having fun. some different items of food in a glass case Two tennis players play tennis on the court. Several horses walking along the beach by the ocean. Some big baskets filled with tasty looking apples. A polar bear standing open mouthed on a glacier There is a blue bike leaning up against the wall. A picture of a car waiting at a intersection. A zebra grazing and standing on the grass. A set of professional knives attached to a mounted magnet. A woman is sitting on a leather couch smiling at another woman. An airplane flying over a big harvest moon Group of giraffes standing behind a caged in area. Various equine horses and zebras inside stalls under a tent. a bent white sign with a black pole The buses are parked on the side of the street Baked pizza with red tomatoes and green olives. A cat drinking water out of a water bowl. There is a cat standing in a toilet. A very pretty horse in front of a big metal structure. A man and a baby with toothbrushes in their mouth. A hot dog or sausage in a bun with bowl containing condiment and bacon on the side. Man chopping a chicken on a butcher block with a bottle of wine in front. 
A bench that has some water drops on it. A street pole that has a street name sign, a one way street sign and a map sign on it. A girl is standing in a field and flying a kite. A surfer posing for a photo with a surfboard A close up of a zebra's back with its neighbor's mane in the background. A cat watching water go down a sink drain A man laying on a bench and a woman next to him touching his face. A person wearing blue jeans and black tennis shoes riding a skateboard. A very close up view of some very tasty looking food. a person is reaching for a piece of pizza in a box A man in a suit and tie posing for a photo in a large building. A street sign where St. Stevens St crosses 17 Ave S. a man in white holding a plate playing A man is holding his arms out on his surfboard in the middle of the sea. The chef is putting ingredients on the pizza. Young baseball players on a field with a pitch being thrown A modern style kitchen filled with may different items. A home office features full bookcase, a laptop and a red leather chair. A man happy about a truckload of bananas. A varied collection of glass bottle containers on three sleves the girls are standing in a room with the window behind them A couple of women shaking hands on top of a tennis court. A person wearing a wetsuit with a surfboard under one arm. A baby boy sucking on a pacifier while wearing a diaper. A man posing for a brochure picture with Akieys translations on it. A woman is watching a kid and man playing Wii. A group of guys standing behind tables on a stage before a presentation. White bowl with assorted fruits being eaten by fork. We are looking past a speaker at a monitor. A white stuffed teddy bear sitting on a couch. A wooden desk has an open lap top on it and a pair of scissors. A refrigerator and a stove in a kitchen. A woman is smoking a cigarette and on her phone. A bowl full of food that is sitting on the table. A person standing on a sandy beach flying a kite. 
Street sign showing the name of the street in English and then in Asian characters below two bikes sitting on a walkway next to some trees Several men standing beside of each other in a line. A giraffe leans its head over the fence of an enclosure. A personal size pizza with tomatoes, spinach, and garlic. A school girl in a uniform in front of a window. a man is parasailing out at the beach A female showing an open door to a refrigerator. A bathroom with a sink, tub, shower head and mirror. A zebra with his head down eating grass. A man stands on a street corner next to a stop sign. A retail sign is hanging above a stop sign alert for added effect. A mounted police officer riding down a city street past parked cars. A passenger train moving along a railway in the country side. Tw people ride horses with trees in the background. A white car at intersection of two roads. a young man staring into the camera sticking his nose in between the handles of a pair of old metal scissors A giraffe standing next to another giraffe on a lush green field. A room full of electronics and musical equipment A man riding a wave on top of a white surfboard. A clock tower with sculptures and a bell. Two people in a park throwing frisbees at the camera. The two zebras are standing together on the land. A made up bed in a well decorated room with art pieces on the wall. Women hiding from the sun on a city bench People sitting at the umbrella covered tables next to the river A train going down the rails passes under a pedestrian walkway. A cat standing by some bushes outside in the woods. A bathroom with tiled floors and double sinks Three stop signs in the middle of the street. Two men are snowboarding and skiing in the snow. Man with camera holding kite in park setting. A man is sitting behind a laptop computer. Two tennis players are walking by the tennis net. The man with the red and black bookbag is walking toward the building. An adult man helping a youth on a skateboard. 
this is a sign at a gas station A piece of thin-crust pizza sits on a plate. Zebras inside of a fenced in field eating grass A young girl scrunches up her face as she holds a video game remote. A cactus sits in a pretty green vase. A city bus stopped at a crosswalk on a street. A girl sitting at a table drinking from a bottle. A person with there feet propped up in a chair in front of computer equipment. A street side shop next to an intersection. A train sitting at a train station platform. a big plate of food that is on a table A man swinging a tennis racquet on a tennis court. An old lady smiling in a pink kitchen. Bray pickup truck parked in driveway of residential home. A white kitchen with stainless steel appliances and granite counter top. This is a black and white family picture taken in the mid 1900s, of Grandpap, and his progeny A picture of something and it appears like food. A flock of ducks floating on top of water. A woman with an umbrella stands with her belongings on the ground. A white, yellow and blue airplane on a runway. A resort with palm trees, bridge, people and bushes. a surfer on a surf board in the midst of a wave a person standing in a living room playing nintendo wii Several people are riding in a horse race. A red city bus driving through city streets. Bird walking in the water near the shore edge. A white and black bedroom with a white bed. A plate of food, dishes with food, and a pot of flowers sitting on top of a table. Two pictures of a burger, onion rings and a beer. Several pans containing a few slices of pizza are displayed on a table. A man with his arms crossed in a Santa hat and wearing a tie. The young person is carrying their surfboard into the water. A refrigerator in a kitchen next to a dining room. an image of a broken fighter plane on the runway The employee is carrying gas canisters on a bicycle. A skateboarder comes off his board on a ramp. Some kites flying over some buildings in the snow. 
there is a very tall tower with a clock on it A new roll of toilet paper is on the back of a toilet. a crowd of people are looking off of a balcony a child on top of a buket on the front lawn A man riding on the back of a green motorcycle. Boats docked in the water with a cloudy sky above. a close up of a number of different remote controls A woman sitting in front of a plate of food. The person is reading music and playing a keyboard. Commercial passenger jet at gate on airport tarmac. A man is surfing on a wave while another floats with is board. Two people are drinking red wine from wine glasses. A child standing completely upright in front of a refrigerator. A woman is on a red surfboard in the ocean. A male and a female sitting together, the female is texting on her phone. A person guiding a child down a hill on skis. A white airplane is on a crowded airport runway. A person is standing on a snowboard near a bridge. A giraffe is eating grass in an open field. A number of peach trees on a sunny day. a man talking to a group of kids as a cow stands in a cage a stuffed animal sits in front of a book Several people are playing at a beach with a boat in the distance. The man is sitting on a low ledge. a dog sitting in the passenger seat of a big truck a person taking a photo in a mirror A couch and furniture in a small room. Three men who are standing around a campsite. A sign mounted to a pole that reads " No Stops ". A cat curled up on a bed next to a stuffed animals. A jet is sitting on the tarmac with blue sky's around Broccoli dish in a bowl with a fork inside of it. there is a card board bus with a cat sitting in it A man holding a large umbrella with some girls and a woman underneath. Three people are posing next to a raw pizza. A single person skiis down part of a mountain A person holding a half eaten hot dog with toppings. A small child is in the kitchen with an adult and dog. 
This white passenger bus is waiting at a stop A train on railroad tracks beside a platform. Two dogs sleeping on a semi made bed A woman standing next to the ocean flying a colorful kite. An ornate clock is surrounded by artwork and white arch. A hitter watching the baseball approach during an at bat A young child holding a skate board and pointing into the distance. a small plane flying through a blue sky A dog sitting on a bed in a sweater with a indifferent look. Three elephants with seats and umbrellas stopped by a body of water A street crossing with a street sign for Mulholland and a no-U-turn sign. Chicken and assorted vegetables are frying in a pan. A photo of people looking off into the distance holding an umbrella. A skateboarder launching his skateboard into the air as he rides it. The little girl ordered a piece of cake at the restaurant. A dark road with power lines and street lights. a transportation bus parked in a parking lot A boy cutting a pizza on a wooden cutting board. A small bird sitting in the sand at the beach. A zebra, walking on dry grass, is seen from the rear. there is a piece of cake on a plate on the table Clock tower overlooking a red shuttle bus outside. A young foal nuzzling its mother in the nose. Two boys perform skateboard stunts on the street The cat is sitting on top of the remote. Various home appliances are lined up on the sidewalk. A child smiles in front of a container of carrots with a stuffed rabbit. A casserole sitting on a counter with apples and a measuring cup behind it. A strawberry pound cake with a slice taken out. a red and white airplane ascending in the sky A plate of food including rice, broccoli, protein and a sauce in a bowl. Two girls eating food at a Chinese restaurant A formally dressed man with a martini poses with two women in evening gowns. A big pair of scissors is on a wooden box. A tall vase full of orange flowers sitting on a table. Group of people crossing a busy city street in the rain. 
A male college student playing frisbee in the park. A big commercial plane flying low by a bridge. Cars stopped on a road blocked by a herd of sheep. a woman is holding a toothbrush up to a masked face there are many birds that are standing by the water A large cut pizza on a dining table. A person putting sliced carrots into a dish. A male emo hipster wearing a furry jacket in front of a laptop computer. A pile of carrots, radishes, green beans, and broccoli on a cutting board. two bulls in a field between bushes with a sky background A couple of birds are walking around the grass A cat relaxing on a plaid couch on a person's clothes. A work out ball sits on a chair near a cluttered desk. A cat walking into a kitchen with a phone and fridge visible. A man near the ocean catching a frisbee on the beach. A cow with a tag is staring at a viewer. This is an image of a man with an umbrella. A person on a cell phone by a big stone wall. A man and a elephant that are standing in the dirt. A crumbling bathroom has a sink and a medicine cabinet. A female pedestrian stands in the center of a crosswalk as a double-decker bus quickly approaches. The young batter wearing a helmet prepares to swing. The sheep are all standing around together in front of a monument. A crow sits on the roof of a blue car. a kitty standing in an empty food dish eating from aniother Panorama of a field with cows next to a dirt road A clock repairman working at his table displays his wares on the wall. A painting of a house showing the bathroom, kitchen and bedroom. a street sign on a pole with a sky background A photo taken through a window at houses on a hill. Three people near a truck in a sunflower field. a person riding a surf board in a wave tunnel A panda is eating a frozen treat with fruit in it. A cat sitting on a window sill outside. A cake decorated with things from a barbershop A group of elephants walking down a river with people riding them. A man is kissing a woman on the cheek. 
Vintage tour guides stand next to an early bus. A man holding up two ripe bananas in front of a house. A white plate topped with carrots, potatoes and dumplings. Shoes rest on a carpet next to a drawer with a picture on top. A plate with waffles, butter and a fork and knife. Man crouched over a suitcase looking at the items inside A stop sign on a corner of a road A woman on a surfboard in the ocean. An old goat with big horns resting in the shade A child in a room with a remote in hand. A new home banner sits beside a small curvy road. An orange has a frown drawn on it with a knife in it. a bathroom with a bath tub a sink and a mirror Many kites in a field launched and launching A clock that is on the side of a building. A woman sitting on the ground next to two dogs. Food sits on top of a refrigerator covered in magnets. a person standing in a living room playing nintendo wii A gray cat is wearing a red knitted rabbit hat cozy. A clock in a busy city at night. Two patrolman on horseback standing in front of an establishment. A large crowd watches a professional tennis match. A large black and white cow standing in a desert field. An orange cat laying on top of a wooden bench. A close up of a motorcycle parked on the sidewalk next to a door. A jumbo jet Fed Ex plane on a runway of an airport. A brown teddy bear in a forest with trees and shrubbery. a couple of birds stand on a grass hill A decal of a skateboarding man is applied to the wall. A bathroom with a pedestal fan in ti. an inflatable blue car on the beach with a man walking beside it Hungry man enjoys lunch at a local restaurant. A small dog sitting on top of a computer in a bag Abstract picture featuring girl on tennis court with racket. A couple dressed to be married are pretending to talk on cell phones. Friends playing and taking pictures with a camera phone. A girl is petting a horse out in a field. Traffic on a city street with busses, trucks and cars. 
A giant giraffe made of building bricks, outside of a building. A train sits on the rails beside the station. A toilet has several toilet paper roll dispensers. A young boy using a lap top by a table Guy in a helmet on a skateboard in red. A young man on a skateboard doing tricks on the cement A couple of people near a truck on the road. Many bunches of bananas sit atop this grocery store display case. A man is skateboarding while at the park. A professional baseball player running around a base on the field A flat screen and a keyboard and mouse on the desk A city garbage truck with three men in the front. A scene of something that is quite attractive. A man in a shop holding a picture of two men. A woman tennis player is waving at the fans while she holds her tennis racket. A road shot has a radio antennae and a small section of windshield, a brown hillside, the vanishing road beside it, bikers, close to the antennae, and far away, and off in the distance, signs, a car, and a big blue sky. A heard of cows stands in front of a man with a tractor. A baseball team talks with coaches on the outside of the field. A crowd is watching a baseball game being played. Television and computers on with no one utilizing them. a baseball player throws a pitch to a batter A child holds the line of a kite flying in the wind. A man standing over many doughnuts on display. A group of young men standing next to each other on a ski slope. A penguin is running through a pasture as sheep graze. A small portable set of burners with a tea kettle are on the counter top of a neat, clean efficiency kitchen. A reflection of a dog in a vehicle's side view mirror. A car driving by a herd of sheep. A tennis racket is laying on the floor of a tiled room. A fighter is jet flying through the clouds A miniature blue bow of fruits next to a penny. There are two pieces of cake on a plate and a glass of pumpkin juice. A boat that is inside of the water. 
A Thomas the train engine model cake with writing on the platform. A FedEx plane moving on the snowy runway. There is a white cake and some small cookies view of a bathroom with white toilet and white sink A workspace inside an office with snowy trees outside the window. The next hitter in the baseball game saunters to the plate. A little boy that is holding a bat. A large mirror reflecting a bus driving down a street. Silver and green train sitting at a train station. A clock that is sitting on top of a metal pole. A horse jumping over a wooden jump at a horse show. a person racing on a motorcycle on a race track. A smiling young buxom woman is displaying a sandwich and a glass of beer. A woman standing in a room with a remote. Two giraffes that are standing by each other in a field. a baseball player swings his bat at a ball A person on skis riding down a race course on a hill. A large clock fixed to a building as vehicles pass by. Two giraffes rubbing their heads and necks together. A person standing in the snow with their hand up to their face. A small boy skiing down a snowy hill A passenger bus that has two levels driving down a street. A view of a city street through the windshield of a vehicle. A man working on a propeller driven airplane. there is a dog laying on a couch with many blankets on it A living room filled with furniture and a TV. A young child swinging a baseball bat at a baseball. A man and his boys play Wii Fit in their home A jockey is on top of his horse number 6. A row boat is tied up to a dock. A man on his motorcycle with a teddy bear attached. two large elephants walk on the green grass A woman is cooking over an open flame in the cabin A woman standing near a kitchen counter talking to someone Tennis player getting ready to back hand the ball over the net. a brown and white dog is riding a skateboard A young boy is riding his skateboard down a hill. A dog holding a yellow frisbee in it's mouth. 
A plate of fruit near some other bottles of liquid An elegant white vase of colorful flowers rests on the windowsill. A bathroom with a small sink vanity and a toilet. A woman in heels pulling a suitcase behind her Two horses pulling some carts in the street A flock of birds sitting on top of a set of power lines. A vintage photo shows students sitting at their desks. People sitting on the beach and sitting on beach chairs. A man is at bat in either a baseball or softball game an old laptop and a dog rest on a bed a blonde girl is wearing a clip on tie A cat looking up between two plastic bottles Two cat lying on a floor playing with each other A man performing a jump on a skateboard. White show horses and handlers performing during public event. A girl eagerly bites into a hot dog bun A man in an orange shirt pushing a stroller. someone jumping up to get a frisbee out of a tree A train is passing through a residential area with houses, trees, cars and pedestrians. a man sitting on a rock while he watches elephants in the water A white jet airliner on runway with mountains in background. A red tray that has some food and an orange drink on it. A tour bus unloading at a rest stop. Two shirtless men playing Frisbee in a grassy area A woman leading a brown horse down a sandy beach. A man surfing a small wave in the ocean. An ad for Costa Rica shows a beach scene with surfers. A toddler is in the bathroom holding his ear with one hand and his other hand is closed together. A red and white tow truck tows a white car down the street. A passenger train is passing a cargo ship. Two gray elephants standing next to each other. People are riding horses through a parking lot. A close up of a dog wearing a Christmas themed hat. Man holding paddle in air on surfboard with patch in corner A busy looking street area in an asian country. A tall giraffe standing next to a tree. 
People sitting in a chair lift in a purely white landscape A man in black coat standing under umbrella next to a building. A young man sitting on the beach with a surf board. A passenger train parked at a train depot. A little girl sitting at a table with a piece of cake. Three people are riding down a street while buses are in the background. A person standing on skis with a backpack in front of them. Two young boys are seated with their legs crossed. Men are lying on a couch with a computer on the table. A kite is being flown by a man in the distance. A group of skiers posing for a photo. Two hot dogs smothered in salsa on hot dog buns. A hotdog and fries sit on a table. Many elephants are walking near a muddy watering hole. A boater smiles as he paddles his canoe. A rusted up bard sinking into a body of water. A bathroom with a sink, toilet and picture on the wall. A pizza is on a plate of tin foil. Dinnerplate with me vegetables and other condiments. A young man is sitting in a chair and has mismatched outfit and a name badge. A man on a skateboard performing a trick. A brown and white dog and person standing on a wooden floor. people on the beach playing with a brown cow A desk area with a window view with mugs, tablets, and books. A city bridge with a clock on the top of it Two zebras grazing on grass in a field. a woman with a blue umbrella standing by some stairs A person on a surfboard is riding a big wave. A large clock hanging off the side of a building. The cat is lying on top of a pair of shoes. Shrimp, broccoli and carrots are in white dishes. Canoes and motor boats sit along the water's edge. A persons legs with a dark colored cat rubbing against their legs and shoes. A RETRO FOOD CHOPPER IN A CORNER ON THE COUNTER A picture of a tennis player about to hit a ball. A yellow and green fire hydrant on the side of a street with peeling paint. Several elephants are standing in a desolate field. A picture of some food on a plate. 
Two skate boarders riding down a paved path. A person on a court with a tennis racket. A group of people in a park with food. a person jumping a skateboard into the air A view out the window of an airport terminal A polar bear is looking over the grass at something off camera. A woman serves a tennis ball during a match. many people in a kitchen area preparing a pizza Two identical airplanes are flying side by side with people doing tricks on top of them. A black and white dog sitting in the grass next to a frisbee. THREE MEN SITTING ON A HUGE GREEN BENCH IN FRONT OF A BIG YELLOW BUILDING. a young boy holding a tennis racket A clock tower with ornate designs above a bridge. a big plane sits parked as a bunch of people watch A baseball player catching a baseball in a catchers mitt. A man sleeping in clothing on a bed. this is a computer and books on a desk a man is standing surrounded by a lot of luggage A man jumping up on a blue tennis court with a black tennis racket in his hand. A dog laying under a brown computer desk. Antique black truck with a barrel in the bed. A bunch of stuffed bears and gift boxes in a suitcase. A horse near another horse in a building. View of a smartphone sitting on a computer keyboard. A blue train traveling down tracks next to a building. A man with a kite on a hill A couple of bears on a shore near some water. A wedding cake design with roses and wine glasses. A group of people standing with some motor bikes. A baseball player dropping his bat and beginning to run A skateboarder does a trick in a crowded skatepark covered in graffiti. Two people in the snow on skis taking pictures. a line of kites that look like cows next to the road An arrangement of items from a woman's purse including wallet, cell phone, MP3 player, gloves, hairbrush, eyeglasses case and day planner A man holds the hand of a child as they look at a row of cows. 
a bathroom with fancy sink in the corner the toilet in this bathroom is in disrepair The city bus is parked on the side of the road. A traffic light with a bike signal on a pole. A dog and a sheep are separated by a wire fence The entire pizza is in a box atop the dishwasher a soldier is receiving an award from a man in a suit A group of giraffes stand next to a building and tree. a white green and black sign and a bicycle without wheels A teddy bear with a book is placed in a wooden chair A bobble head baseball figurine on a desk. a meal on a table which includes pizza in a box and a bottle of beer along with a beer mug a small group of zebra and giraffe in a savanah many trains on tracks near a building A man standing next to a red motorcycle. A man smiles while holding his cell phone. a close up of a school bus parked in a lot A person holding a rope hovers over the ocean. A boy and a girl under an umbrella. A man standing on a lush green field holding a kite. A zebra standing in dry grass has dark and light stripes Computer desk with monitors and large monitor displays. People are gathered around at an outdoor table. A loving couple who has fallen to sleep together on a couch. this is a pizza cut into slices a man that is walking with something in his hand A fire hydrant with a painting of a face on it. a woman in red is riding a horse A young boy standing ready to hit at the plate in a baseball game. A group of people seated at a table in a restaurant A man seated on a park bench with his head down A very clean, modern living area has a very comfortable couch-bed and a wide screen TV. A player is in motion as he reaches back to throw the ball. Two elephants touching each others trunks beside each other. A man in racing gear and number under a banner. Two men sitting on a yellow boat in the water. several elephant type large yard ornaments setting outside. an image of a plane that is taking off in the center A man and woman standing in front of some pizza. 
a woman in a bathing suit standing near water taking a picture A picture of a giraffe walking around its enclosure. showing lemon, red pepper, zucchini, ginger, and yellow squash A toothbrush holder with tooth brushes inside of it. A red double deck bus traveling along a city street. There are baby birds in a birds nest Two fancy dressed people ride on horses down the street. A large black bear walking across a lush green field. A woman holding a teddy bear in a costume while wearing a really tight shirt. A person snowboarding down a snow covered hill. Apples, plums, peaches and pears sitting on a metal counter top. A woman is reading a book as she sits on bench with a sign in front of it. Woman on a Kitchen counter on the phone with paint. many people of all ages skiing on the snow clad mountains. A pizza on a large knotty pine kitchen table A group of policemen on motorcycles in a city. A crowd of people standing around each other in front of a shack. A large white polar bear standing on a icy pool. A suitcase sitting in a living room of a home. Boys playing with a colorful kite in a park A street sign on a pole next to a building. A cat that is laying down near a shoe. A woman with long red hair packing herself into a suitcase. Two people talking and a young lady that is reading a book on a bench. Many sheep are out in the green grass. a person swinging a baseball bat at a ball Man on grassy field getting ready to catch yellow frisbee. A piece of asparagus quiche and carrot salad on a white plate Multi colored cat laying on and among shoes and boots. A shaggy mother pony and her foal in a field. A person is getting a slice of pizza from a platter. A close-up photo of a young cauliflower plant. A comfort bus is driving on the street. a woman standing by a window while talking on the phone Skateboarder jumps high off of a ramp into the air. 
A parent and child playing with a plastic basebat and ball on the beach A grey bird perched on a tree branch A MAN IS ON THE SNOW BOARD IN THE SNOW Adults gathered in living room playing video games. Two woman at a table full of wine A "pet crossing" sign with a peace sign on it is on a pole by a tree near the highway. A group of zebra standing in the tall grass. A man in an apron standing at a table full of oranges. A bed near an open window with a small fan in the windowsill. A newly married couple kissing next to a food van. there is a small bowl with a lot of food in it A boy running and flying a kite in a field. A cat lying on a pink blanket sleeping. A large mirror with black framing on the wall of a bathroom above the sink. A bunch of people walking and doing things down the street a plate that has some cut up vegetables on it A bunch of doughnuts that are on a tray. A man is holding a bunch of banana's An elephant scratching his ear in the sun. Two horses pulling an old fashioned style carriage down an urban street A baseball player holding bat while standing on a field. two people standing next to an elephant in fenced enclosure. A red and white napkin covered with fries, a burger and coleslaw Several objects displayed on a kitchen table including bread, oranges and plating. there is a blue left turn sign on this street pole A man riding on a bicycle down a street while holding a surfboard under one arm. A woman with an intense look rared back with her tennis racquet. Young soccer players on field during match play. A plate of fruit next to a cup of coffee. A baseball player holding a bat standing next to home plate. Some young ladies in swimsuits sitting on a dock over water. A boy riding on the back of a motorcycle near a truck with pineapples in it. A stop sign at an intersection that has stickers and leaves on it. A beige and white bathroom with white toilet and honey colored hardwood vanity A little girl wearing a hat has one foot on a skateboard. 
Two desktop computers sitting on top of a desk. The desktop computer has three different working screens. a few people that are standing next some motorcycles A bird on a table eating from plates of food A desk with a midi controller to make music with. A boy and a girl on a boat while another boy is standing on land with one foot on the boat. A slice of rich and decadent cake covered with frosting sits on a plate. A tugboat sits beside a ferry on placid water with a mountain in the distance. A man is holding two sandwiches one in each hand. The cattle are standing in the dirt path. Display case full of several kinds of donuts in a shop. A momma zebra and her baby running through a field. A man in a black jacket taking a picture of a sink area. A cat sitting on the edge of a sink in a bathroom. Teddy bears seemingly hug one another against a dark background A small bathroom with a sink and vanity. a big animal that is in some grass A big sandy beach with some kites flying in the air. A man laying in bed with a book over his face. A baby plays with a teddybear while sitting on a green blanket outside. A small cat has it's front paws inside a toilet. A bird sitting on a house eave in a backyard. Some people sitting at a table with open luggage and papers A man in black jacket flying a kite on a beach. A urinal in a public restroom near a wooden table. A man in a very fashionable cleanly decorated bedroom. A container with a meat sandwich and fork is sitting on the grass. This seems to be a bear laying on the snow. A bedroom with a bed, desk and a television. Far shot of a clock on the side of a building. A rural street at an intersection with cars in the distance parked on the curb. A black and white colored cat on top of a wooden bench. a woman riding a surfboard on a wave in the ocean. a bath room with a toilet and a shower A gray bathroom is lit up to show to sinks. Four planes fly through the air in a black and white photograph. 
A baseball player is in motion with his bat. A slice of pizza is on a plate on a table. there is a large bowl of food on top of a table a train on a track near a platform A pizza with pasta on top and olives and pepperonis. A grey teddy bear with a red bow and a card. People in a square near a small clock tower. A striped cat laying on a wooden bench a silver oven some pots pans a knife and cabinets A young boy skis down a slope with adults standing in the background. A young boy eating mushrooms near a pizza. A bear looks ahead from a field of vegetation. A housewife holds a platter of food in the kitchen. Two giraffes in a zoo enclosure stand by a wall. a garbage truck in the city late at night A group of elephants gathered near some poles. A man holds a skateboard in his hand. The official box for the Wii game showing a hand holding a controller. A man walking across a field holding a baseball bat. A line of people crowd the sidewalk beside a business. Two men watch as yellow aircraft flies over a lake. A plate of food that includes meat, broccoli and potatoes. a vintage photo of a cake walking on a toilet A apple that is taped to the back of a laptop. An old lady is smiling happily sitting on a motorcycle. a brown teddy bear is sitting on a green bed A man with sunglasses talking on a cellphone. A WOMAN IS EATING A SANDWICH OUT ON THE GRASS A kitchen with hard wood floors and wooden cabinets Giraffes standing together and other animals in the background. A fat gray tiger cat laying on top of bed up against a pillow. A bunch of luggage bags with tags on the floor An empty chair is set in front of two computers at a work desk. A Ferris wheel is visible behind the building's clock tower. A group of people are putting their sweet treats all towards each other. a double decked bus parked by a stadium A group of people comparing cell phones together The two tables are each covered with food and plates. 
Young girl posed with a bunch of cell phones and a "New Years" party hat. The man in the hat walks along using his cell phone. A man that is leaning over a tray of doughnuts. Two Zebras are eating grass together in the wild. A group of cars driving past a mcdonalds near a bridge. A woman performing a shot in a tennis match A Canadian airplane with a big red maple leaf is flying high. Plane next to a boarding ramp under a cloudy sky. Sandwich made of two doughnuts sitting on top of a plastic plate. a girl balancing on a surf board while a man watches behind. A cross country skier stretches on an open field of snow. A man walking across a field holding a wand near a dog. A beach setting with tons of people around the shore. A train station stands majestically and functionally while passengers wait for their train. A table with a keyboard and some other items. A little baby zebra running around in a fenced in area. Some motorcycles are being displayed in a window. a dried up stream stands two zebras and there are other animals in the background with trees. Three people posing with sundaes in glass bowls. A black dog laying on a tile floor next to wall. A folder sitting on top of a wooden bench. An empty double-decker bus rests against the curb, alongside some buildings. A row of outdoor food tables look very primitive. A plate with two doughnuts, strawberries, and coffee. This is someones bathroom sink in their home. A tall clock with a small tree beside it. Two women who love and care for their horses. A man skateboarding in an old abandoned pool. a person on a train station platform A long train sitting on a railroad track. The baby boat is drinking milk from it's mother. A meal at a restaurant of a salad, a toasted sandwich and a pickle A group of men playing instrument next to a wooden wall. A set of coffee mugs sitting together on a small wooden table near a bedside. 
a big airplane that is parked on some concrete A small dog rests in a large dog bed, snuggled on a blanket. Several zebras in an open area during a not so sunny day. Women sitting at the table eating meals at the restaurant A man bends over an open toilet and looks in it. People at a park, taking walks, sitting on the grass and throwing Frisbees. A train engine carrying carts down a track past some buildings. A woman hovering over food on a wooden table. A cat laying on a TV in the middle of the room. A man rides a bicycle carrying snow skis. A mockup of an African elephant stands in a museum this is a bird sitting in some grass A woman chops vegetables in a kitchen. A classroom with a rug on the floor that looks like a computer keyboard Cupcakes with frosting sit on a foil covered tray. An old photo of a man with a pipe and a beer. A woman with bleeding nose and blood stained shirt looks into a cellphone. A nice shiny suitcase is positioned alongside sneakers for a quick getaway. A monk is looking at a mobile phone among ancient architecture. Many sheep grazing next to a busy road. There is a person sitting on a motorcycle. A small girl eating a plate of food with a fork A lush green field with colorful kites flying above it. White dog sticking his nose out from under red and white striped bed ruffle. A giraffe resting it's head on a fence at a zoo. A busy street full of cars and buses with buildings in the background. A plane flying with a smaller plane above it. An elephant is walking towards a tree in a park. A large elephant walking across a field of grass. A small black and brown dog standing next to a cow. a red bus is parking in the field. A bus makes its way through the city street. Boats in a river on a foggy day. This unusual animal figurine sits in front of a clock. A group of men are in discussion around bananas. Two zebras who are in a field together. A desktop computer has two keyboards and two mice. Several people walking around near a white van. 
An office with file cabinets, a keyboard and chairs. An elephant guided by a man in a blue shirt and followed by another elephant. A pan sitting on top of a stove top under a wooden spoon. Two women standing on a purple tennis court. Munching in the grass is a daily habit. A yellow school bus parked in a parking lot full of snow. a man holding a cell phone towards the camera Balls of garbage sitting on top of a toilet. A group of men cutting a giant sheet cake. A tram is traveling down a green track A pizza that is topped with an assortment of items and sliced. A man wearing a red baseball cap walks along a grass field with a backpack A big bear is standing next to the bars. Several people fly kites above a paved outdoor area. there are many beer signs on the side of buildings The clown is driving down the grassy area. A bus that is parked in a lot next to another bus. A cat sitting on haunches next to a wooden door. a man doing a jump with a skateboard in the road A skier leans as she makes a turn down the hill. people sitting at tables next to a building in the background A stop sign is posted near a road with a bridge in the background. Men on horseback going through a crowd of people. A group of people on line at an intersection. A couple both holding a knife and cutting their wedding cake together. A meatball sub served with french fries on the side. a desk with a keyboard and a monitor on it A beautiful woman playing a game of tennis. A person cutting out pictures of clothing items. A giraffe out amid the trees and grass. a double decker bus going down a road beside some stands Two boys playing frisbee on a soccer field. There is a small yellow bird standing on a fence A living room filled with furniture beneath a window. A couple of me playing tennis on a plane flying in the air. Three men are standing on a baseball field with a crowd watching. A family are in their skies posing for the camera. A small red belt clip cell phone case. 
A dresser with a clock and a potted plant on it. A person wearing brightly colored clothes is riding a motorcycle. A dimly light living room with wooden floors and large windows. A heard of Zebras moving with another animal group across a field. A girl is showing off a stadium hot dog Someone is using a small grill to melt his sandwich. A bright green frog on a bright green plant. a close up picture of a large variety of fruit A plate of food on a wooden serving tray A snowboarder with ski poles in midair facing the ground. The person is putting toppings on his food. a man in an orange and white striped shirt with some scissors and machines Two women share some chocolate cake and coffee. two people eating food off of a paper plate A skateboarder is featured at different positions on a ramp. People standing next to a bus with a cat face on the front. A group of walkers are seen while passengers ride in a train. A bright bedroom with a red bedspread and someone laying on the bed. A polar bear standing high on a rock. A man and a woman smiling while holding an electric keyboard. A river that has many boats floating in it. The two bears are wondering about the point of the camera. A woman showing a teddy bear to another woman and child. A table that has been served soup and fruit. A view of a kitchen from the doorway. A car is seen in the reflection of the microwave. A catcher reaching out to catch a ball while the batter is swinging. Cars line up to coin meters on at a busy sidewalk A guy sitting at a table in front of a birthday cake with candles in the cake A highway sign on a rocky slope along side the road. this is boats sitting in water near grass A dog chasing a group of birds outside. A man and a woman standing beside each other. A cluttered and dirty kitchen counter top, with food spread around. A tomato and an apple sitting on a table. There is a sink and toilet in the bathroom. A fire hydrant with writing on it on a street corner. 
A man in a suit holding a red ukulele A young child smiles as he holds a tennis racket. A herd of cattle walking along a sandy beach. a brown teddy bear and some wooden block toys a couple of dogs running around a field A guy riding a bike and carrying a surfboard turns to look behind. Unattended luggage in a roped section of an airport lobby. a man holding a sandwich and another on a plate A person holding up a cell phone taking a picture. White flowers in a tall brown contrasting vase. A white horse looking out over a fence. The sky is dim as the sun changes positions behind a building. A group of men riding on a horse drawn carriage. A wood bench under a tree in front of some bushes. A bus broke down on the side of the highway and all the passengers had to file out onto the side of the road. A cellphone, piece of fruit and cup are on the table. A living room area with wood accents on the wall and floor. someone that is holding a wii remote in their hand A sign for the Atlantic City Convention Center. A man is standing behind many different fruits. a close up of a sandwich on a plate A tall white clock tower with a black clock on each of it's sides. Plate of food, including hot dog, ribs, beans, and corn. The surfer in the wetsuit is coming through a very big wave. A group of people fly guiding on the sand. a person riding on an elephants head walking on a dirt road the woman is giving the solider something to eat A bench is sitting in front of the water. A toddler with a pacifier wearing a neck tie There is a suitcase with items surrounding it. The baseball player is getting ready to take his turn at bat. A green sign that says rockaway beach on a post. this image is of a boy with a skateboard doing tricks A boy does an ollie in a skate park on his skateboard. A bowl filled with soup sitting on top of a white place mat. A trash can on a corner has a microwave in it. A picture of a dog sitting in the backseat of a car. 
A person standing in a living room with a fire place. Motorcycle police are on large bikes in a crowd of people. A picture of a full bathroom with a large tub. Balloons and banners decorate the open fair grounds. A woman holding a tennis racquet on a tennis court. a person walking on a city street with an umbrella A vespa parked with a cover in a fence A fire place sitting in a living room under a mirror. A classroom with a purple chair and a chalkboard. The interior of a bathroom made of stone and colored glass. A pizza laying on top of a wooden board. Man with no shirt holding frisbee in grassy, rocky area Scissors, a hole punch, and paper laying next to each other. The luggage boxes are downloaded from the aeroplane. A stunning skyline sits in the back drop of traffic lights. all of the parking meters on this street are covered with plastic bags there are many people laying in the sand at this beach People are purchasing food from a fruit salesman. A woman is standing looking down at luggage. A smiling man at a table has a wine glass. A man riding a wave on a surfboard near a para sail chute. A tennis player on sand in the middle of a play. A person that is in the snow having some fun. A group of people are sitting by a truck on the ground. Several cakes are on display in the bakery a laptop sitting on a table, with a beer and tv in background. View of a snowy mountain outside the windshield of an aircraft. This kitchen layout appears choppy and full of "blocks". A plate with a wide variety of food on it. a woman in a dress and a tennis racket in hand A couple of men on horses and people on bicycles in a courtyard area in the nighttime. people skiing on a snowy ski bank while wearing ski wear. A smart device sitting inside of a white bunny bat. A clock that is embedded in the ornate top of a building. A pair of woman lunge after a tennis ball on a doubles tennis court. A young person sitting in his seat working on his laptop. A dog looking out a window of a car. 
A living room with two blue couches and entertainment equipment. A giraffe looks at the back of its enclosure. A plate with food and a newspaper on a table. person cutting paper with scissors at a table an older person standing playing nintendo wii system A young man in striped shorts rides the waves on a surfboard. A brightly lit, quaint and clean living room. A man is surfing on a wave in the ocean. A group of three men riding snowboards on a snow covered slope. A person is holding a nintendo wii controller A small pizza sliced into four pieces garnished with green leaves. a group of people under umbrellas at a beach A person is flying a kite high in the air. A fluffy cat is sitting on the sidewalk. a toothbrush holder with four toothbrushes in it An office with a two desks and a filing cabinet. The person is flying a kite with two strings. Bananas are hung up to ripen at an outdoor market. a woman is standing by a sink in a kitchen A woman sitting at a table holding up a pair of scissors. A black and white kitten is asleep on a keyboard laptop. Couple sitting at a table in a restaurant with pieces of cake. Carrots, celery, nuts, onions, and bay leaves are mixed together in a bowl. A man standing on top of a snowy mountain A kitchen area with a stove, refrigerator and sink. a white cat covering itself with an umbrella a bird is standing on a green bench A group of holiday bears are arranged in a group. A big white bird standing in front of rows of benches. A little boy holding a baseball bat getting ready to swing A futuristic bike parked in front of a sail boat. An elephant is spraying water out of it's trunk. The yellow train is headed towards the final destination. Black statue on marble base surrounded by security ropes. Table and chairs set up at the back of a church. this is a traing riding through a city an open suitcase and a closed suitcase on the floor and a cat on the bed A man about to hit a tennis ball with a racket. 
people walking on a path around log cabins a man on the tennis court with his arms stretched out There are two zebras in a rocky plain A man with a small backpack cross country skiing A tennis player standing on a tennis court looking up. A woman holding up a large carrot in a backyard. A person walking in the ocean with a surfboard under their arm. two slices of pizza sitting on a plate next to a fork a couple of people on skis ride through the snow A bunch of bananas hand from a banana tree. A couple of men playing a game of frisbee. a hot pocket sandwich laying on butcher paper These families are riding on the backs of elephants A fresh vegetable shop in a vegetable market. A person on a skateboard on a street. A man riding a surfboard in the ocean on water. A skate boarder falling down in a very big ramp. Livestock, people, and vehicles on asphalt near a building. The city bus is parked in the parking lot. A person with a kid on top of a horse. A person playing tennis on an outdoor court with trees. A statue stands in a courtyard near a colorful flower bed. A man riding on the back of a horse. The cabin of a small boat has two couches Two decker bus entering leaving Winchester Bus Station. a big sign saing where to go for parking A view shows the bedroom and bathroom close together. Two zebras are facing away from each other. A child flying a butterfly kite while another child rides a scooter. A couple of giraffe standing on a lush green field. A baseball player waiting for his turn in baseball game. Two people ridding horses on a dirt trail with woods behind them. A person walking with a small brown pony on a leash. A blue tent sitting in the middle of a forest. The child's bedroom has two low beds and storage space for toys and entertainment. people walking with umbrellas in a rainy london england A bedroom with bicycle, computer desk and checkered bedspread. a man doing a trick on a skaeboard The celery and carrots are on a cutting board with a knife. 
A hand holds a piece of fruit with the peel cut off. Group of black chairs sitting underneath a blue umbrella. two benches sitting on the beach by some trees A group of green traffic lights on a street filled with snow. A dog running behind three sheep in an open field.. A piece of pizza on a white plate with multiple toppings. Several airplanes can be seen at the airport but there is also snow on the ground here. The little girl in pink shirt and beige pants throws the frisbee. A car driving down a street near stores with bicycles outside of them. The female tennis player is heading towards her next match. there is a bench on top of bricks by the water Lady loses her ski on a snowy hill. A yellow cat wears a blue plastic sports hat. Two boys carrying hot dogs and other snacks at an outdoor sporting event. A beach with an area with umbrellas and an open area without them. A close up of an apple mouse and the numberpad of the keyboard. A person looking at their cell phone at another person taking a picture. A chicken sandwich and sweet potato fries on a plate. Two teenage boys playing a game of frisbee. Two people holding umbrellas looking at a statue of a man. A group of people waiting in line to board a train. A few snow skiers are going a mountain slope. A woman sitting on a bench at a park. A man riding a skateboard while a group of people watch. a close up of two slices of pizza on a plate A plate of vegetables arranged with flowers and herbs. an air plane at an air port run way A giraffe laying on the ground looking forward. A living room filled with furniture and a flat screen TV. A giraffe standing underneath a beautiful rainbow in a cloudy sky.. A coffee cup, food, and a passport sitting near each other. The meal is prepared and ready to be eaten. a soldier is carrying a couple of bags Dinner is served in a tray on the table. A blue sign that is pointing to the restrooms. A person on a motorcycle making a sharp turn in the dirt. 
a black and white photo with a double decker bus in color A woman wearing a white t-shirt and visor with pink shorts playing on a tennis court. Messy apartment in the middle of packing for travel. A close-up of a laptop on a desk with a book. The group of friends are enjoying their drinks. A washroom with many photos hanging all over the wall A girl is dressed in all red holding a red umbrella. a bunch of stuff is loaded in the back of a red truck A giraffe with dark spots lounges in the grass. A beach with several kites flying just slightly off the ground. A tall tower with a clock on it at night. A man sitting at a table using a laptop computer. A market is shown on the side of the road. Seven carrots of varying sizes lie on a table a person riding a bike wit ha dog in a basket A person on a skateboard being watched by a crowd. A street sign showing the intersection of Main and B. A yellow bus that is sitting in the grass. A bowl of food contains meat and broccoli. Surfer on knees on surfboard while riding wave. A herd of sheep grazing on a lush green field. A group of men standing around each other playing a game of baseball. a dog sits under neath a chair with a person in it A girl and a man are playing Frisbee on a lawn. A man is holding a box of some sorts near a bus and someone wearing a strange outfit. a little girl playing a game of wii golf A beautiful young woman laying on her stomach in front of a laptop. A group of skiers with backpacks carrying their skies up a mountainside. some people walking across a road with a sign on it A close up shot of a giraffe against a blurry background. Clock in middle of a sculpture on top of building a man standing next to a laptop and bottles of beer A brick sidewalk of various colored bricks next to a street with cars driving on it. A plate with broccoli, potatoes and a meat with sauce arranged on it. This is a picture of a women trying to figure out where her keys are. 
A woman gesturing with her hands and sitting at a table with a computer. A pile of veggies next to a pile of bananas. A skateboarder doing tricks on a ramp in the sun A man and girl are standing on a field holding baseball gloves. Two people hold up tennis racquets over a net. A pizza is topped with vegetable strips and garnishment. A cup with three pairs of scissors sitting on a table. a close up of a cat paw near a book Two skiers are traversing up a tall mountain. a man with a hat and a baseball bat swinging at a ball sandy deserted umbrella lined beach with houses on top the cliff An historic training sitting on railroad tracks. A vase filled with flowers on top of a table. Someone flying a kite in the sand on the beach. two little bird on a tree touching beaks A kitchen painted white with an automatic dishwasher and a large window. Many cartoon modeled objects are in the sand. A blue, white and red fire hydrant sitting on a sidewalk. A woman sitting on a beach taking a picture of a number of kites the table is set with many things to eat. This girl is happily filling her plate with the healthy and creative food choices served at the buffet in this yard party. Fresh fruit for sale hang by the side of the street. Two giraffes are standing side by side in a field. A man standing behind a white frisbee on a lush green field. A large number of cattle confined in a small area. A person riding a brown horse in full dress. A large airplane sits at a gate at an airport. A group of children sitting in a red wooden canoe on the seashore. A skier flips upside in the air performing a high jump in the mountains. A couple of men racing motorcycles next to each other. A painting of a luminous glass bottle seems to glow with inner light. The dog is lying on the white sheet. A cow sculpture sits on top of the grass. A group of motorcycles are sitting in front of a building. A tennis player looks at a tennis ball as she lifts up a tennis racket. 
A man holding a square shaped pizza pie. A line up of motorcycle cops riding motorcycles on a street. A person holding a skateboard with a dog tucked in their jacket. A boardwalk with a fence and bench lit by streetlights. Two yaks are standing in a grassy field. a barbecue sandwich with onions in a paper tray A jet waits on the runway of a mountain airport the horse is bending its head over and grazen A small yellow plane is leaving the hangar. A tree-lined city street with car and motorcycle traffic. He is skateboarding down the wall at the skateboard park. The gray elephant family is crossing the ditch. A businessman giving a slide show presentation in a meeting room. The two young children are sitting at the table together. A girl in white shirt and blue shorts playing tennis. An odd looking mechanism sits on a dirt road while beyond it someone rides a bicycle and in the background small flags are flying. A fuzzy black cat is sitting on a laptop computer. a plate with some eggs chicken and tomatoes on it a woman is standing at the beach with a surfboard Two computer monitors sitting next to each other. A person sitting down eating a sandwich next to a street. Serious looking couple with light brown Teddy bear, side sun light. Two trains sitting side by side on the tracks. A woman in glasses is taking a bite out of food. a group of sheep are all outside in the soil together An empty bathroom with a toilet and sink. An egg is served on top of a small pizza. This is a public restroom that is fully tiled. A group of people standing around a van in the rain. Two men are in a green train with yellow lettering. The little boy is brushing his teeth with a toothbrush. A dog jumping into the air to catch a toy. a baseball player getting ready to hit the ball with a bat A pie sitting on top of a stove top oven. A bunch of very cute signs hanging by a business. a close up of a bowl of fruit with oranges The traffic light is visible for all of us to see. 
a couple of people on a motorcycle dressed as santa A person snowboarding down a slope at an angle. Two women walk near a man skateboarding with a child. The fork sits next to a piece of chocolate cake. Some grilled fish is on a white plate with a fork and some carrots. Cars driving on a road near traffic lights. Slice of baked dessert item on platter ready for consumption. A man holds his arms in the air while standing in the snow. A man standing in a bathroom brushing teeth while wearing monster mask. A plate of pizza on a restaurant table. A black and white cat sits on a wooden porch The man watches his reflection while brushing his teeth in the mirror. two bento box meals with meat and vegetables A digital clock shows the current time at 653 A baseball player holding a baseball bat in the game. an image of a close up of food with meat and veggies A small kitten figurine on top of a cellphone screen. Two light brown cows standing inside of gated corral. A beach filled with lots of people next to the ocean. A street light that shows, horse crossing on it. a large truck in a field with trees in the background A man riding on the back of a white surfboard with two small dogs. a bench at a train station with seating on the front and back A woman presenting cupcakes with lit candles to a baby. A toilet in a stall with the toilet seat up. There is a man dressed in a purple tie and black suit. a street full of people walking and one riding a small motorcycle A large group of people looking at an elephant behind a fence. an image of a group of people in the woods playing with frisbee a elephant balances on a stepping stool There are three people posing with their drinks. A city street at twilight showing a bus crossing the intersection and people standing on the corner. Tabby cat sitting on the hood of a blue car. A young man jumping a metal railing on top of a skateboard. 
A person with their hand on the mouse of the computer A white bathroom with a toilet and a brown and white tiled floor. The large tray has a large sandwich, two pickle slices, and a bucket of fries. A group of people sitting around a living room. Two birds sit near a plate of partially eaten food. some boys having some food at a table together A dish with mean inside of it . Three people smiling and sitting at an outdoor dining table that has place settings for four plates. A baby sitting in the middle of a bunch of teddy bears. A tennis player about to hit the ball. A train is waiting at the station for passengers. A para sailer approaching the beach on a sunny day A man riding a skateboard is making a jump over a bench. A city bus stopped in front of a building. A piece of cake with many colours on a plate An umpire gets ready to call a player safe or out. a person with a red umbrella a building and a car A blue jacket laying on top of a fire hydrant A young baseball player winds up for a pitch. Images on the same man song tricks on a skateboard A pretty little girl standing on a hardwood floor. People are laying on a sunny beach near the water. A woman looking at a website on her computer. A person with skis and gears standing in the snow. a sky full of kites floating in the wind A stuffed blue bear with a tag in a room. A man eats a pizza in a small restaurant chain a man wearing protective gear is on a skateboard A woman surfer walking along the beach sand. A silver oven door is reflecting the wooden floor. 2 girls laughing while one holds a telephone A long white bath tub near a white toilet bowl A white plate of food on a table. There are many doughnuts and pastries arranged on platters A silver colored refrigeration unit, in a kitchen. A person that is playing in a tennis game. A dog chews on a box in a grassy yard. the girl id licking the spoon of batter A man throwing a Frisbee in a parkland A small apple tree sitting next to a wooden fence. 
A book shelf filled with lots of books. A living area with a futon, chair and a window. A display case at a store filled with lots of different vegetables. A cat is sitting on a car hood on a wintery day. A man playing tennis with two people watching the game a clean bathroom that has a big mirror A person that is surfing in the water. A pile of submarine sandwiches sitting in a stack. A white table with a bottle of soda and a hotdog. A plate of chocolate donuts and one has sprinkles on top sitting on a blue platter on a table. A man in a baseball uniform about to throw a ball. A child eating a slice of pizza at a table. A group of motorcycles parked on a dirt parking lot in a mountainous region. A large kitchen with a metallic refrigerator freezer and a center island. A man who is in the air with a skateboard. A young man riding a skateboard on a walkway. A young person on skis on a ski slope A banana split with white and dark chocolate A white dog sits in a basket with wheels on the floor. A woman poses with a large teddy bear. A corner of a room with a very big sink near a toilet. A man and a woman with cell phone in hand behind table of food trays. A man unpacking a laptop computer in his living room A tennis player gets ready to hit the ball. A sign for a restaurant and bar on a building. A young child brushes their teeth with a blue toothbrush. A small toy is sitting on a plate of pizza. A red, yellow and white transit bus traveleing down a street. A herd of dairy cows in a field behind a fence. a cat looking out from an open doorway A baby sitting on a kitchen floor in front of an open refrigerator. A piece of broccoli partially surrounded by knife blades. a few drag queens make some cake and eat it A cat sitting on top of a desk. Black and white photo of three suitcases stacked on top of each other. A man sitting at a table with pizza. A surfer riding on a wave well if it's crash in the ocean A man and dog are interacting on a bed. 
Two men sitting in the snow with their snowboards on while one man is standing. A large truck on a open city road. Four remote controls are placed next to a Universal remote still in its package. A person selecting some bananas from a bunch. A young woman feeding cattle on a dairy farm. Three young men eating food while sitting on an indoor bench. a person on a bicycle wearing a hat in a parking lot A teddy bear dressed in a pair of underwear sitting on a chair. Two zebras that are standing together in a field. Two people sit on a couch by a guitar. A cat that is standing over a bowl. A silver train parked in front of a train station. a waterway and a train going over it on a train bridge A cake that has dogs around and on top of it. A beach area that has seagulls on the rocks and sand near the water. A woman surfer riding a wave crashing behind her. a man on a snowboard is on a ramp A cartoon image of a man on a pair of skis. Batter, catcher and umpire at a baseball game. A picture of some oranges stacked on top of each other. A man standing on home base with a baseball bat. A large teddy bear sits at a yard sale. A room with holes in the wall and dirt on the bed looking utterly disgusting. A white sink sitting next to a toilet under a window. A utility truck parked on a incline covered in graffiti. A woman sits at a table with an open laptop in front of a screen. A group of people standing around a elephant. A man sleeping on a couch holding a ripe banana. People in a hall, bags and suitcases on the conveyor belt The group is going skiing on the snow. a girl choking up on the bat waiting to hit the ball A person taking a picture on their cell phone A person flying through the air while riding a skateboard. a cat layling on a red blanket and looking relaxed A personal pizza and beer on a table Several cows are standing near each other in the grass. Is that a tiny computer next to the phone? Two people stand next to a grill with hot dogs talking. 
One white sheep standing still on the pasture near a dried up tree. This is the grill of a large truck. A man plays with a frisbee in a grassy field. A train on the tracks up on a bridge. A large airplane mid flight among the clouds. The people are walking down the street with their umbrellas up. a bus that is parked in a parking lot A group of bikers make their way up the city street, as a line of buses park by the sidewalk on the opposite side of the street. A small bathroom with a vanity on one side and the shower on the other. A toy kitchen with a play sink, stove and oven. A girl wearing protective gear while riding a skateboard. Two teddy bears in front of two vases of flowers. A man in a den playing with remote controllers. A bag of luggage filled with personal items. Small celebration cake on a table with happy birthday decorations. woman in a hat feed a giraffe out of hand Two giraffes, and antelope and some zebra in tall dry grass A man on top of a car standing next to a group of mountain goats. A cat is in front of an open refrigerator door. THIS IS A PICTURE OF A KITCHEN ISLAND WITH SEATING A woman is paddle boarding down the river. A young lady playing soccer alone on a soccer field. A big bunch of ripe yellow bananas on display. A pair of men playing a game with some remote controllers. A laptop and some suitcases in a room. A group of people standing on the beach watching a low flyinf plane go overhead. Kitteh at rest on somebody's black and white shoes a couple of people sitting on a couch plays with a wii remote Plate of food including rice, meat, and vegetables. A blurry image of an object with signs behind it and motor bikes. A baseball player in a white uniform holds a bat up while standing near a. catcher and an umpire on home plate. A group of white sheep walking through a wide grassy field. A kitchen with many cups on the window sill. One tall giraffe on top of the dry terrain. A man riding a red scooter down the street. 
A man is standing on base at a baseball game. An elephant is the focal point in this photo. A little girl in a store playing with four large white Teddy bears. Woman in red shirt on a horse in a river. a man is talking on a phone outside An older man is examining a table of bananas. The front of a store with its doors wide open Two children sitting on a skateboard riding it on down a slope. Dogs gather to eat food out of a metal bowl. A bunch of hot dogs sitting next to each other on a table. A cowboy boot filled with flowers sitting on a bannister. A white toilet sitting next to a white bath tub. Two bears in a sunset sitting on a hill. A young man holding blue handled scissors to his tongue. A train is shown next to a platform. Women smiling looking into a mirror while fixing their hair. a living room with some antiques and a book case A kitchen with light wooden cabinets and an island in the middle. A dog is tied to a cart on the side of a motorcycle Meat with lentils, rice and vegetables sit on a blue plate on a wooden table. a fire hydrant stands before a partially visible cave A costumed employee is holding an open umbrella. A zebra herd standing around in the grass. An ornate building is viewed by a crowd. Several Air Canada jetliners parked at an airport. A man in a suit standing beside his bicycle. Two small beds are now together to form a single bed. a close up of a person holding a book near a dog A bunch of sheet and geese in a field with a bible quote A woman surfing in the ocean and riding a wave that is crashing behind her. Birds are in the water and sticking their heads in An Asian man riding a motor scooter on a street some boats parked on the side of the river A toothbrush and a mirror in a bathroom. Winded dog sitting and eagerly waiting for a frisbee to be thrown. A piece of luggage sits by train tracks with passengers waiting. A bunch of cats sitting in a fenced in enclosure Two plates have what looks like a hot dog and seaweed. 
This fruit basket contains orange and green fruit. A shot of feet riding down a street on the skateboard. A person that is brushing his teeth in a room. a little wooden bench sitting in front of some trees A group of people riding on a bush. Two Zebra in an empty field with trees and buildings behind the field. a dirty kitchen ith various appliances in it There are two people standing outside on a balcony of a very large living room A cat drinking from a toilet in a bathroom with toothbrushes. A curly brown dog is laying beside a novel. A police vehicle carries away a car from the scene of an accident. A piece of broccoli next to a kitchen knife setting on a painted wooden bench with the paint chipping of it. A dog doing tricks commanded by a person. Two planes sitting in a field on a cloudy day. The airplane is in the air flying over the mountains. A gray dog has a pink frisbee in it's mouth. a close up of a woman wearing a shirt and tie a green and white street sign in a busy intersection in a city Two horses are sniffing a frosted cake as a lady stands in front of them with a plate. Two hotdogs with a hand full of fish snacks A red brick tower with a clock in it. A computer mouse sitting on top of a laptop keyboard. The two buddies are cross country skiing through the mountainous region A shiny kitchen gas stove and oven with a black counter. a cat almost all the way inside the bowl of a toilet A desk with two monitors, a keyboard, a mouse, and a binder. A kitchen with a refrigerator, ovens, a sink, and cabinets. a black cat is laying next to his colorful toy A glass vase with a green plant in it sitting in front of a window. A young woman sits at a computer in an office. A girl standing under an umbrella reading a book. A bunch of stuffed teddy bears with flag shirts A foot next to a snowboard on the ground. a house very big showing a city clock A baseball player jumps over another to catch a ball. 
A person sitting on the floor playing computer games by holding remote. Skis displayed on a sedan mounted ski rack. A woman eating a hot dog bun covered in sesame seeds. A bowl of rice, meat, peas, and carrots. A stop sign, a kosher butcher sign, and a Rite Aid sign The concert audience is composed of many young Indian men, some taking pictures of the performer. A giraffe standing with a bird flying in the distance. a baseball player swings his bat at a ball A boy and his younger sister looking at a steam engine A surfer is gliding through a small wave. A young woman is playing a tennis game. A horse galloping through the sand on a farm. A child's hands hover over a small uncooked pizza sitting on a tabletop. Three people posing for a picture in a parking lot. The person is sitting while holding the string of a colorful kite. A large herd of sheep are grazing in the snow. A man sits on a bench looking at a book in the subway. A group of men stand playing a video game. there is a male skier that is riding down a mountain Small crocheted teddy bear on the side of a quilted blanket. A bus is in traffic near a sidewalk and eatery. A white and black passenger bus at a paved intersection. A group of people hand flowers to a man. Two people with green shirts caring for some animals. A variety of fruit - including oranges, apples, pears, and Kiwi fruit - sit in a cardboard box. Mature man speaking on microphone in front of curtains Some young soldiers are looking at their pictures. A passenger bus parked in a parking lot. a couple of elephants are in a field A bike standing on a sidewalk next to a road at sunset. A tabby cat sleeping with its head on a laptop keyboard. A train yard in a city with a train in the distance There is a close up photo of an elephants face wearing a garment Horses stand around a horse trailer grazing and drinking. A red fire hydrant outside a shopping center. Two teenage girls performing chores in a kitchen. 
Two men overlooking the activities of students on small computers. Two birds are flying over a sandy beach. there are many different dishes on this table A woman smiles as she eats a lunch of Chinese food. The cat is at the desk near the computer. A birthday cake is shaped like a sheep. a topless man laying on the bed some sheep standing together while surrounded by some tall grass The are two bananas, the brand of them are dole. A small bathroom with a yellow towel on the floor and a rack with magazines and various other items. A picture of an open air zone that looks incredible. People walking on the train platform pulling luggage bags on wheels The room in the old house is ready for the new mother and baby, decorated with vintage finds. A group of kids is skiing in the daylight. a bunch of urinals are lined up on the walls A group of men standing next to each other holding a racquet. A cat that is laying down next to apples. THERE ARE DIFFERENT SIGNS ON THE STREET A woman sitting on the ground in an organized room. A person that is eating some food on a table. A giraffe with his long neck bent over and his mouth on the ground in an outdoor area. large plate of french fries in sauce on a white tabletop a yellow car turning on a somewhat busy road A group of people on a street next to a food truck. Two people pose together for a photo of themselves on a ski resort besides the ski lift The pinnacle of the building is illuminated at night. Three elephants standing by a man made waterfall. some table and chairs sitting around a building with a clock on the top of it A group of boats tied to the rocks near the shore. An acoustic machine, speakers and remote control are sitting on a table. Three people pulling suitcases behind them on a wet pavement Two sheep are standing on some short grass. A simple computer desk with a desktop monitor keyboard and mouse and a laptop computer. The two people are in the kitchen cooking. A dirty train is sitting on the train tracks. 
A woman ists on a chair while a child stands under an umbrella with red dots. A small baby bird on a piece of metal. a all white bathroom with blue tape on the walls A blender pitcher on the counter near a sink. A horse with a white stripe is in the woods. The dog is in a field on the side of a parking lot. a close up of a propeller on a plane in the air An elephant is walking across a dirt road. People are standing outside of an old airplane. There is a blue pick up truck broken down on the road Man standing up playing a video game on a TV. A bed with a comforter that is slightly pulled down and pillows that have a note on one of the pillows. A sign that reads 'plaza drive' is being displayed. A mother and her child giraffe walking in tall grass. a white motorcycle is parked in a spot A living area with a couch and a television. A boat is going down the middle of a channel. A three-piece bathroom with wood shelves and a round mirror. A black bear is standing by the rock Three zebras walking in a dry grass field. A young person riding a body board on a wave. A tan building facade with a bench out front. A little girl standing on top of a tiled floor. A black bear lying down in the grass next to trees. A train is moving swiftly through the station tracks. A red tray of food on a table. Banana on table with three colored plastic wafers. A woman holding a dog above a bowl on a counter. The Christmas presents are left in the kitchen. A glass full of drink is on the table next to a slice of pizza. A bus stop and sidewalk near a park. Woman standing in grassy area near baseball field. Red wine being poured into five crystal glasses. A girl dressed in pink sports gear stands on a snowboard at the top of a snowy slope. Women playing in field with flying disc during competition. A kitchen area with stoves, coffee maker and cutting board. A small elephant toy pushed against an orange. A pizza with mozzarella, tomato, and basil on a table with silverware. 
a bunch of cars that are on a street a woman in white shirt talking on a cellphone. Three people ride their horses down a beach. The large room has a lot of furniture in it. a group of motor bikes parked in front of a store Dog sitting in the back basket of a bike outside the shop A lady scratching her head in the bathroom. A long boat with an ad on it floats down the river A front end of a boat sitting over a body of water. Three people are getting off the train with their luggage A black and white image of a line of umbrellas A wall dedicated to white cloth with suitcases out front A small airplane sitting on the tarmac at an airport. a person riding a surf board on a wave Two trains on separate tracks travel through a city A towel with his nose right next to the camera looking towards it A lot if people are in the conference too Two teenagers with backpacks are on the street corner. An animal is eating some food out of a bucket. A bushel of greens are on the table with various fruits. A man wearing a suit and a blue tie A man holding a broom on a surfboard with a dog. Black and white boat sitting at a pier near a building. part of a road with assorted food on tables for sale a girl petting a pony on the back of it's neck A green train engine moves down the tracks with many cars behind it. A couple of giraffe standing in front of a cage eating hay. A young girl stands on her bunk bed holding a paper. A group of three women sitting at a table sharing a cup of tea. Two halves of a sandwich that is on a plate. some people are playing ball in a field A bearded man poses with his breakfast meal at a cafe A toilet that is sitting in a bathroom under a window. There are two children who are holding tennis rackets. Multiple images overlaid of several women playing frisbee. A slice of lemon pie with frosting on a white plate. Group of women on a soccer field with the ball in the air. 
a male in a green shirt a bowl some food and a pan Several giraffes stand near each other in a large grassy area A ski boarder riding up a big hill doing tricks. An abstract graffiti on what looks like an old train A boy holds a baseball glove on his left hand. A person doing a tail slide on a rail in a skate park. Three people in an art gallery using their phones. A teen is seen mid-jump while flipping his skateboard in an indoor skate park. A fat cat laying on a rug and shoes. A hotel room features a balcony over looking the water A man riding skis across a snow covered slope. The shadow of a skateboarder in the middle of a stunt. two small children sit next to each other A plate with a hot dog, chips and a strawberry on it. A person laying down on a bench outside. a person in a black shirt a horse water and trees bathroom with its door open and is very clean A hot dog sitting on top of a white plate. Sheep and lamb standing in pasture by stone fence. An old picture of two women with two small sheep A person near a large screen with others at a long table. A man and a woman cutting a cake with a large knife. A room with a tea pot and two blue and white vases. The skiers are ready to try the snowy slopes. A woman looking up at someone taking a picture A person with something in their mouth while holding a cell phone. A group of giraffes eating leaves off trees. an image of a dog eating on his plates A baby eats some cake with a fork while several people hover over him. An orange truck driving down a street full of men in the back. A red train sits next to a passenger platform at a station. Group of different types of vegetables sitting on a metal railing. Man dressed in black snowboarding down a mountain. The little girl pokes her finger into the sheep cage. a tower with a clock on it in front of a street light A red light on a yellow contraption in a n intersection Group of people riding on the top of an elephant. 
a white and red boat with a bunch the people on it A group of young men riding skateboards in a skate park. a dog passing in front of a girl on her cell phone Lighthouse on a point with sailboats near it Brown cat sticking its face into a pair of white shoes. A harbor with various boats and people walking on the pier. A lone giraffe walking in dry vegetation in front of a tree. Two luggage cases near a desk and bed A clock with glow in the dark hands, sits in a dark room. Man playing game with Nintendo Wii control next to kid carrying a cup. Kids swimming and surfing in shallow water on a beach. A small snowman with a person holding a carrot next to it two chicken patties filled with cheese in the center A work desk with a computer books and keyboard A plate that has different types of food on it. The kitchen has wooden cupboards, plenty of counter space, and a sink adjacent to the oven. A group of sexy young ladies wearing bikini tops. two giraffes headed into a building and another one standing by the fence A girl serves a tennis ball on the court. A person is holding a purple bear with no eyes against a yellow back ground. A person is holding onto a cellphone somewhere. A white coat on a bench on paver stones. A man in a tie sitting on a wooden log. A small girl is on the beach near a kite. A couple of men adjusting their ties in front of red steps. a person riding a horse next to a baseball field A yellow and grey train on train tracks. there is a man with a beard sitting in the grass The little boy is pulling the suitcase by the handle. TWO PIECES OF PIZZA BOTH DIFFERENT IN A BOX A large bear standing on top of a stone ground. A street scene showing a group of cars stopped at a red light. There are two brown eggs in a metal bowl Two men on a boat with a dog on the front A woman walking a dog by a table of food. Two people under an umbrella on a wet sidewalk with stars. A tall building with a massive golden clock on it's face. 
A large building with a clock and some trees. Two men standing in a kitchen preparing food. There is a red light on a traffic light two women riding down the snowy hills on sleds A very old fashion looking red smaller bus. A group of people are around a birthday cake. A close up of a cut into piece of food A large plain with a couple zebras and many antelope. A baseball player wearing green and white standing next to a baseball player wearing red and gray on a baseball field. A young woman sitting near a tree eating food. A man is throwing a Frisbee into the air. A beer and a slice of pizza on a table The large bathroom is reflected in the mirror. an image of a bear that is in the woods A naked woman sitting in a large suitcase. A photo shopped photo is shown with a tiny fire hydrant. Person holding a toothbrush under a faucet with running water. A bunch of sheep together in a very narrow area. A airplane that is sitting on a tarmac. The surfer is riding the wave on his surf board. an image of two zebras in the middle of the wilderness A man in a T-shirt is typing on a laptop. Airplane at airport loading gate under hazy skies. Two giraffes are neck to neck in an enclosure. A man holding a surfboard is standing by the ocean. A zebra grazing on grass in it's natural habitat. A lighted fish tank above a toilet in a bathroom A young man holds his skateboard while in a courtyard that is next to a large rock building. the traffic signs are easy to read for the street Big Been clock tower in London, England on an overcast day. A wooden table with a purple laptop and orange pen. A dirty brown teddy bear in a trash can. Many people flying kites on a cloudy day. Something delicious and sweet is done baking in the oven. Two large piece of broccoli laying on a piece of paper. Large sized truck with a medium sized black dog in the passenger seat. A kitchen that has a wooden cabinets with a wine holder. 
two people standing in the snow by a sign A batter is getting ready to swing at a pitch. A white clock tower at the top of a tiled building. A person is doing a skateboard trick outdoors. A remote control sitting on a wooden table. a group of people stand under neath a tent on a beach A baseball player throwing a ball on a baseball field. Two people waiting at an intersection carrying umbrellas Some people that are walking on a sidewalk while it is raining. An orange motorcycle is next to a red car. A laptop and two controllers on a small table in front of a couch. A living room has a large animal cage in it. Large amounts of desserts set on different platters. An assembly line machine has many goods on it as two people stand in the background. Two people are playing video games in a living room. A young boy rides his skateboard amongst pedestrians. Group of white sheep walking in a field of grass together. A keyboard, mouse, and computer monitor on a desk. A living room filled with furniture and a flat screen TV. cherry tomatoes and various food dishes on a table top A building with large windows sitting inside of a building. A couple of women standing with a boy inside of a kitchen. man crosses skis while jumping in the air A small sewing kit sitting next to a pair of scissors. A yellow and blue bus is going down the street. Two large plates off a variety of food . Horses communing with each other on a shady street. A group of people order food from a food truck. A man cooking a large number of hot dogs on a grill. Many motorcycles are parked side by side. Three pieces of cheese bread are on a plate. A man flying through the air while riding a skateboard. A bunch of keyboards with mice on top of them. A woman cutting a birthday cake on a tray. Two toilets sitting on a sidewalk with a cardboard box. A black bear is emerging from the grass to cross a paved street. A laptop with a small screen is chained to a desk. 
A man on a piece of equipment resembling a bicycle that has very large wheels. A kitchen counter has a coffee pot and microwave. A man is hitting the tennis ball with a racket A black Sony remote control being held in a hand A young man standing next to a racecar on a display lot. A man doing a trick on a skateboard off a rail. One giraffe from a group of two reaches through a gate toward a group of people standing outside the gate. a woman and child watching a herd of elephants in a gated area A group of men in suits sitting on couches talking. A man at a table with a bowl of food. a kitchen cupboard with the doors open and plates and bowls on the shelves A cruise ship docked for letting passengers off to port Cars move through an intersection below a green stoplight. A big yellow train travelling by a road. A black cat is on a laptop computer. Two travel bags on shelf with a metal rail. Three Giraffes are standing in a row and they are all different sizes. A woman sitting at a table eating a giant hamburger. A bathroom with a white bath tub and a sink. a big sausage in a roll with cheese and cups of sauce and a person A white dog on a bed looking in a box. A photo taking of the inside of the building looking at three balconies and the clock. Wisps of smoke on a public street at night. A wooden chair that has a black vase with two flower holders at the top, and two sets of flowers in the vase. A beach covered in kites next to an umbrella. Many people sit at a table eating a meal. A giraffe standing in a small piece of shade. A dog laying on the side of a car door. The evening sky on the lake foretells hope "Red sky at night, sailors delight." a small boat parked next to a bigger boat in the water A woman stands with an umbrella next to a building. A red bike locked up next to a a pay meter. Pick up truck parked by side of road with white building in distance a group of people that are flying kites Poultry and broccoli on white pizza, with lemon slice. 
A man dressed up in zombie costume is wondering around the street. A kitchen counter full of freshly picked vegetables. A bird themed clock sitting inside of a green box. A giraffe munching on leaves with man standing in front. a man is sitting at a table on a train A falcon sitting in a pond of water. a big colorful buss parked on the side of a road A airplane flying through the sky with a leaf on it's tail. An intersectional street sign stands in front of a vast mansion. A broccoli and cheese quiche with a piece missing. an image of a boy walking on the beach with surfboard TWO PEOPLE ARE TRYING TO GET A BICYCLE IN THE BACK OF A VAN Two red and white cows standing in a pasture. A man holding a frisbee on a beach with a clouded sky. A young child is swing at a ball with a plastic bat. A man looking at a laptop next to a beer can and speaker. An Alaska Airlines passenger jet sitting on top of a runway. People reaching for sandwiches on a plate sitting on a countertop. A man about to run to first base after just swinging a bat A group of three boys sitting on top of a couch. A woman is jumping in the air with a frisbee A yellow and black bird perched on top of a dead sunflower plant. dog sitting on dog chair with toy next to its paws A close up of a care with an advertisement for a movie. Pastries shaped like bear heads are displayed for sale in Japan. Someone is touching a white plate that has a sandwich and chips on it. People at a table with cups and a plate with donuts on it. A large bathroom with tile flooring and white fixtures a trunk of a car filled with a lot of luggage lady in the jacket is sitting on the concrete bench smiling. A group of cows laying next to some trees. some rice chicken broccoli and carrots on a black plate Some yellow school buses parked in a row. A woman in grey shirt on park bench with cellphone and bicycle. A couple of foreign language road signs. 
two zebra standing in front of some goats Many people hold umbrellas on the street during a rainy day A dog catching a frisbee in its teeth in a field Plates of food are on a ledge overlooking a soccer game. A person sitting on a beach with some animals. A desk with two laptops on it and both turned on. Two people watch TV on a couch with their legs propped up. fourt plates of vegetables and fruit sitting individually in each A little boy standing next to a sheep smiling. A city street is busy with cars and a clock tower above. A teddy bear sits on a stair railing. A black dog in the snow playing with the Frisbee. Busy traffic in a city intersection at night. A bathroom complete with a toilet, sink and window A man in a suit sits alone on a bus. People walk through a shop with flowers on the table. A counter top that has a mug on it. A ship is sailing away from the dock. Two bears relaxing in a pond side by side. A woman with sun glasses on a cell phone. Ski patrol with helicopter at accident on steep ski slope. This is an incredible picture show of individuals having a fabulous time. an airplane that is parked out in a grassy field A man performing a skateboarding trick on a rail. A toilet sitting next to a sink, towel, vase and mirror in a bathroom. an image of a tour bus that is parked outside a house A woman texting on her phone, while sitting in a chair. Person wearing grey clothing on a motorcycle on a city street. Two young child skiers are headed down a small slope. A table with two people and two pizzas on the table, one at each place setting. an old black and white photo of four people sitting on a bench some little kids sitting in the grass with a green frisbee A dog watching another dog on a television at home The apple computer is sitting on the bed. A young boy playing with a plastic ball and bat. a close up of a cat laying on a dresser and watching tv A girl holding a wii remote looking forward A chicken burger and french fries laid on a plate. 
A group of men, standing while playing video games. a clean bathroom with some flowers and a window A herd of sheep are grazing in a field. Several giraffes are near a fallen tree on the grass. Small groups of people, including a person walking a dog, are scattered about an outdoor area, encompassing some streets, that is filled with classic cars. An elderly woman poses for a picture in the park. It is never too young to teach a child about tooth brushing. A man with a tie, dress shirt, sweater and headphones. A man on a bench is looking at a boat in the water. A skier is performing an advanced trick on a slope A bunch of street lights in a town hanging from ropes A woman concentrating on her work at a table in a sunny room Three men in military suits are sitting on a bench, A boy and a girl sitting down to eat a pizza. A carrot sitting on top of a wooden cutting board next to a small green knife. a man sitting in a lawn chair eating food An airplane is lit up as it sits on a runway. View of a subway train through a mirror. a cup that has some flowers in it Two bears are romping in the water with one showing teeth. A train going down a track beside many skyscrapers A young boy wearing a baseball uniform and holding a baseball bat. A horse walking down the road, in the daytime. a bunch of people on skate boards ride on some cement A person is preparing a meal in a large home kicthen. a bunch of motorcycles sits parked on a street curb A man surfing on a wave in the ocean. A view of a bathroom that is in the process of being remodeled. Group of people holding orange and blue frisbees. The backyard of a big house with outdoor seating furniture. A white cake with decorations of penguins and a Merry Christmas message. Two people are playing tennis in an outside court. Two elephants are in a field of grass together. A boy sleeps with his head on a pillow and an arm around his cat. a lonely horse tied up in the desert. An photo of a lake, fire hydrant, and sign. 
A pair of zebras cross a dirt road in the plains. A sub sandwhich sitting on a napkin next to a glass of water. a home made pizza sitting on a table top A large metallic refrigerator freezer combo in a kitchen. A man with two children posing on snowy ski slopes. A couple of men standing between two large elephants. Two men on a boat in a lake near a house. A skier and snowboarder going down the snowy hill. A small personal pizza sits on a small white plate. A man rides his bike on a deserted street. a computer room with shelfing that displaying various electronic devices A cut in half sandwich sitting on top of a white plate. A woman talking on a cell phone while wearing a bag. A cow that is standing in the grass. A street with a street sign and a stop light a kid skating very high on the walking steps A man flying though the air while riding skis. A group of people riding sailboats on blue water. A man standing next to a woman under a kite in a tree. Two woman playing with Wii remotes and a man in short shorts sitting in a chair watching. a table full of different kinds of pizzas A mom holding her baby while working on her laptop. A man on a surfboard riding an ocean wave. two vehicles are sharing space wide enough for just one a white steeple near the roof of a neighboring building. Pair of elephants walking along the shore of pond in desert. A silver bowl filled with salad on top of a table. The food on the plate looks really healthy and hearty. A young man playing tennis on a tennis court. Two people are sliding down the mountain slope. Two women a man and a boy all riding horses down a river path. Man laying on ground with skateboards under hand and feet being nailed by another man there is a baseball player that has hit the ball A strange looking shower curtain in an ordinary looking bathroom. A man standing next to a parked motorcycle. A small white dog is standing on a desk chair A black and white dog walking down a sidewalk. 
The people are playing the game in the living room. The motorcycle racers speed down the curvy track. A woman laying in bed while clutching a blanket. A person riding on the back of a brown horse through a dirt field. A PUBLIC BATHROOM WITH CLEAN FLOORS AND WINDOW Christmas teddy bear next to a coffee cup of a candycanes a man swinging a tennis racquet at a tennis ball A bot watches while a man cuts a blue and yellow cake. Signs on the corner of an east London street by apartment buildings A bus drives down the street in a town. A group of people sit holding glasses and smiling at a table with several bottles. a blond woman with a spoon and a blender a man riding a motor bike with a usa flag on the back Passengers board the transit bus from the station at the loading zone. A baseball player slides toward a base as another waits to catch a ball. A group of people sitting on a trail side with a dog looking onward. Several animals cross the road with a human behind them. The cat is laying on the pink blanket by a window. Thirteen children and one adult dressed in baseball attire holding sports equipment. A dog is lying down on the unmade bed an image of man riding his bike down the street A couch that has several blankets on it. A person slicing something with a dog watching An adorable little gir sitting on a park bench. Three red motorcycles with riders in protective gear are on the street. Two cows stare out while being in the meadow. A turkey sandwich smothered in cheese on a plate with vegetables. An elephant walking into watering hole while a mother and child watch. A bed with a brilliantly colored bedspread and pillows. some fruit and veggies sitting on a counter Grape tomatoes, apples, and an onion are on a table. A cat is wearing a small blue backpack. A woman is eating food as she sits in a crowd. The baseball player reaches out to catch a ball. A hotdog on a colorful plate with ketchup, some ketchup spilt on the table. 
A view through a bathroom doorway without a doorway, showing turquoise tile and an unfinished wall section. A small boat washes up onto the beach. A fence is put up in a desert climate. Very large bicycle sitting in the middle of a freshly polished flooring. Two plastic baskets filled with food sitting on top of a table. Looking down at skiers holding their skis on the ski slope A lady dressed warm on a bike in the street. Chicken sandwich, french fries, herb tomato, pepper salad with sour cream and ketchup condiment. Three people standing outside a small airplane on wet pavement. a man jumping over a black box with a skateboard People crossing the street in a busy, overcast city. A group of people play a game of tennis. A filthy bathroom with a grimy tub and toilet and grime covered floor and walls. A "Greenwave" bus stopped at a bus stop next to brick buildings. Three skiers jump to the snowy ground in front of a tree line. a stuffed elephant with a brown stuffed teddy bear leaning on it A horse stands is front of people on a sidewalk. a girl sitting on a bench looking at her cell phone Two men sitting on the street in front of a building. Many horses are walking near the guard rail down the side of a street. a number of kites flying in the sky above a field of people A messy desk with a computer that shows a young child on the screen. a table top sitting inside of a kitchen Two giraffes neck up closeup from behind at dusk. Two dogs are sleeping together on the bed. A tan clock tower with a black and white clock. The bagel sandwich has many ingredients inside of it. A desert sitting in a plate that has congratulations written in chocolate A spacious bedroom with access to a balcony. A deep red and white airplane sitting in front of a hanger. Rings radiate from a gray bird in the water. A machine with multiple clocks on it with wheels. 
a model airplane sitting next to a bigger plane A man vigorously serves the ball during a tennis match A toilet sitting in a unique bathroom with painted and designed walls. A person riding skis on top of a snow covered slope. a kid eating from a blue plate and a spoon A woman with a suitcase sitting outside at a park. The aerial view shows a crowd with many umbrellas below. A man holding a ball in his hand in a room. Black and white vintage picture of a man in a suit with glasses. An Apple mouse sits on a desk next to a keyboard. A tennis player reaching with his tennis racket at the ball. The breakfast setup includes pancakes with a cherry. A baseball player is at the plate about to bat. A black case on the ground with a small tire and jack. A animal with a very scared look on his face and a red thing on his head. A white toilet sitting in a bathroom surrounded by tiled walls. A cat on a table next to a vase of flowers. A young girl sitting on a bench holding a toothbrush Giraffes, zebras and ostriches in a large enclosure. A sheep stands alert with it's face to the camera while it's offspring, head hidden by the sheep's wool, drinks it's mother's milk. A dog jumps in the air to catch a white Frizbee on a grass field. a person holding a cell phone near a corch Several male horse riders crossing a river to shore. A white horse is standing on grass in the country. Food stands with red umbrellas on a crowded street. vintage black and white photograph of two baseball players People out in the ocean on surfboards by a large cliff. Three mirrors mounted on a tiled wall with lights. A person standing on the snowboard on top of the snow. A red phone sitting on a table by a folder. A man is walking down a main street. The street sign indicated the names of the two streets. A woman holding a tennis racket in her hand. A close shot of a grilled cheese sandwich on a plate. 
a tennis player getting ready to swing a racket at a ball a red and white bus a bicycle and some people A white toilet sitting under a bathroom window. A white dish plated with corn, carrots, tomatoes, onions, olives, herbs and oil. a tan teddy bear a white sheep and two other bears two girls soccer teams are playing soccer and player from each side fight for the ball. A box filled with two slices of pizza and sewing equipment. A young girl standing on a grate with a racket. A cupcake, piece of cake, and tort with raspberries. Street sign with plants growing around it on the side of the street. A girl standing next to a bed standing next to a bed. An elephant stands in front of a body of water. Displays of deliscious looking dessert in store window. Two people are walking over tracks with stuffed animals near two other men and a lady standing by a model train. Closeup of two laptop computers sitting on a desk. A Singapore Airlines commercial aircraft landing on the runway next to the water. A young boy jumping in the air on a skateboard People that are making a pizza from start to finish. A bus parked in front of a building and beside a fence. A man on a snowboard in the snow. A Skiier on trail hillside posing for picture with hands out A man sticks his tongue out to have his picture taken. a man in a suit in front of a white truck A couple of tennis players on a large, fenced-in outdoor court. Small slice of pizza sitting on a table next to the bottle of beer. a table top with some trey of food on it A major league baseball player in the batting box. five bagels are sitting on a silver tray A small sandwich with lettuce and tomato on it. A yellow wooden bench swing hanging from chains. Two plates with dessert crepes and a cup of coffee on a red tablecloth. Some carrots and bananas in a small bowl Several people enjoy a day at the sandy beach. A hand holding an apple with the tip of a knife piercing the fruit. 
some people trees two blue umbrellas and chairs A beautiful young lady looking into an empty microwave oven with lust. A brown horse in a grassy field with trees behind. A teddy bear with a red bow holds two red, white and blue pom poms A clock mounted to a wall next to tall buildings. a white horse at the top of a hill A toilet with the lid opened placed beside a shelf. A Women chef outside holding a pan with food in it. A train has graffiti on it while it sits on the track A man preparing to swing his bat as another holds a glove. A blue motorcycle with rusty tailpipe, parked beside a truck. a train on a track near a platform A bowl of antipasta with sausage and beans in it An airport scene where aeroplanes are landed on the ground. Skiers doing stunts over a hill of snow. a piece of bead with some sliced cheese and bananas on it a woman laying on a bed but peeking at someone A large horse studding next to a baby horse. Two plates of breakfast foods on a restaurant table. A piece of wood has a fresh pizza on it. Several people riding on horses at the beach. A black and white picture of a man in a suit wearing a tie. A baby in a high-chair being handed his first birthday cake. an image of a living room setting with fireplace this is a piece of broccoli on a table A large crowd watches as a pitcher throws a ball. A collection of vegetables inside a grocery a store. A desk contianing a computer monitor, telephone, modem, CD drive, and a cat above a keyboard drawer containing a keyboard and a mouse which is above a tangle of wires and next to a bed. a table with a white plate and knife with food on it . A man riding a motorcycle in the middle of the street. A tennis player in blue returns a volley. A man with a baseball bat that is standing in the dirt. Several pieces of luggage and bags near moving trucks. Two slice of cake and a fork rest on a plate A woman sitting on chair holding an umbrella. 
a male wearing white is playing tennis on a court A peeson at a table is eating a small pizza A little girl is standing in front of a refrigerator. A faded red fire hydrant on a sidewalk near a building. a person riding skis on a snowy slope there is a woman standing outside talking on the phone Multi colored scissors with a multi colored ribbon. A sign in front of a railroad explaining how to board the train. a snowboarder flies through the air with an onlooker taking a picture A large propeller plane sitting on top of an airport tarmac. A man holding his arm out, holding a game remote control. A parking lot full of open blue umbrellas. A pair of surfers approach the water's edge, where the waves spread thinly over the compacted sand. A kid in the grass swinging a baseball bat. a bunch of boats are sitting in a harbor The little boy is too close to the stove in the kitchen. This is an image of the inside of a modern kitchen. A man looking at a large pepperoni pizza. A woman posing on a skateboard on a sidewalk. Two plastic model airplanes lie on the ground. A bathroom with a white toilet and a white sink A family on the beach points into the water. A person is riding a surfboard on the water. Two sheep in a vast field during the day. a lady standing in front of potted plants. A guy on a surf board riding a wave. Compute on desk in next to green wall area of living space. A male taking a picture of himself wearing a cardboard Happy Father's Day tie BUNK BEDS WITH LADDER TO TOP BED WITH STRIPED SHEETS Two cats are laying on the keyboard of a computer. A couple of sheep are on a grassy field. A crowd of people crossing a cross walk. A black cat laying by two pairs of slippers on carpet. An oncoming railroad train traveling down the tracks. A woman is hugging an orange fire hydrant. A young surfer surfboarding in the ocean doing tricks A donkey painted with stripes has a snack while hitched to a decorated wagon in Mexico. 
A steer standing next to one that is laying down. Three baseball players are on the field during a game. A baseball player swinging while the catcher waits for the ball. A small baby lying in an open suitcase. Jet-skis sitting on the sand in front of the water. Two parked motorcycles in a lot near a large field. A beach with many umbrella's and chairs with people by them. iphone playing game while donuts are in background A giraffe walking through a grassy area near some rocks. a woman reaching up while jumping to hit a tennis ball A boy is holding a teddy bear figure. A woman is on a tennis court in mid serve. a woman holding a tennis racket by the side of a road. A blue room with a brown double door and a closet full of clothes with a pink television on a stand. A giraffe standing on top of a lush green field. near trees. Motorcycles parked with pedestrians nearby at outdoor event. People in a field flying a kite with large clouds in the sky. A blender filled with liquid on a counter. Two men are working on a train at a station. A man that is bent over in a boat. A person riding a ski lift over orange traffic cones. A man on skis going down a hill away from other skiers A train moves through a heavily forested area a male is riding a horse and some cows a street and trees A woman is tossing an omelet in a frying pan. A calico cat is standing outside a shoe store looking in. A gothic building with a magnificent clock tower featuring gothic columns and arches. Pictured from above are clothing and shoes scattered on a wood floor. Two giraffes on a hill and one is walking towards the other. Many people are flying their kites on the beach area. A plate of food containing a sandwich with a tooth pick, lettuce, tomato fries and cole slaw. A man is standing by some parked motorcycles. The line of people are riding horses through the plains. A close-up image of a black dog in a room. Many motorbikes are parked on the side of a city street. 
An individual is hiking in the snow with some skiing utensils. A sign for handicapped parking with mountains in the background. A bedroom with a bed, desk, and tv with paper and pen on the table A large open kitchen has wooden cabinets and white appliances. A man and woman sitting closely on a bean bag type chair together, and the man is holding a banana in his hand. A dog herding the sheep by running towards them. A white sheep standing on top of a dirt road. A giraffe stands tall among grass and trees. An equestrian lady riding on a brown horse. a shelf holding onto some assorted paperback books a plate of pizza on a table a small boat parked on the ground on display A large clock anchored on top of a building A passenger jet flies over houses on a coastline. Cattle walking on dirt path through green mountainous area. a giraffe standing beside a building and part of a tree A young person is jumping his skateboarder off of a lodge. The cows are looking at the photographer taking the picture. A girl with a coat and hat on is pulling luggage. a red and silver train is coming down a hill and snow A field and a fence sitting in front of a group of buses. A rider is dressed in red riding gear while sitting on a coordinating red motorcycle. A YOUNG LADY ON THE COURT PLAYING TENNIS. An orange and bottle of orange liquid on a table. a close up of a woman in pigtails a shirt and tie A tall white and red light house sitting on a green hill. A european city in nice a sunny bright day A man holding a cake knife and stretches it out toward a cake as he stands next to a woman in a darkened room. Many cattle are on the field while people ride them in the background. Two people skiing on cross-country skis on the snow. A woman wearing plastic gloves handing out fruit slices from behind a table. a counter top with a microwave inside of it A surfer on a surfboard flying over the crest of a wave. A man holding a baby, eating and sitting at a table with two pizza atop. 
A multi colored train comes around the bend on the tracks A dirty train sits on the railroad track. The Big Ben clock tower towering over the city of London. A group of three men standing next to each other. A group of zebras that are standing in the dirt. a popular sporting event being being witnessed by spectators A girl with big glasses is brushing her teeth A young man in a bathroom dancing while looking at his reflection in the mirror. A motorcycle has a red and white plastic container on the side. A sandwich is on a delicately-designed plate with other place settings. A person wearing skiis and jumping off a snow hill. A large panda bear laying down in a forest. A grey cat smelling a cut filled donut on a plate. A cat sits on a fence under an umbrella with ghost lights A motorcycle rider riding on the street near a grassy hill. A small train traveling on the railroad tracks A couple is cutting a wedding cake together. A group of people sit in a boat with a bike. This meal has four pastries, grapes, strawberries, and sauce. A shirtless male tennis player awaiting the ball. Two women wearing hats standing near a fence. A man rides a wave on a surfboard. a girl that is kicking a soccer ball around A person is holding a cup with food and a plastic sword in it. A tennis player returning a tennis ball hit to him. a little kid that is sitting on a toilet in a bathroom a city neighborhood with a stop sign on the corner A woman standing with a bag in a mirror. An umbrella strapped to a bicycle a rain shower. A locomotive on the tracks near buildings and wires. this bed is very large and is under a window Colorful red bar stools are lined up in a kitchen. some stools a white refrigerator and wood table and chairs a one way sign in front of a tree pointing to the right A person on the street with a skateboard. A female tennis player in action on a court. A purse has it's contents laid out on a table. 
a man and a woman sitting on a river with an umbrella Various fruits and vegetables sitting on a table. An adult riding a bike next to a little boy. Two people and a dog sit on a sidewalk and watch a commercial bus pass them. A person is holding a donut with two fingers. A cat sitting on top of a black refrigerator.. A large tall building with a small bird flying over the top. A knife point on the surface of an apple. The little boy is standing in front of the new refrigerator. A family of zebras and a giraffe in a grass field. a large group of cattle have been fed fresh hay. A train traveling down tracks next to a mountain. A man with a white beard sits wearing a top hat and a suit. A vase and two candles sitting on top of a table. A red truck with a flame paint job. A brass clock stands in a train station. a river with frozen sections floating in it A child holding a dragon kite standing in the grass. A bird walking on a beach with something it it's mouth A kite on on the ground on a grassy field A train is on the tracks that is red and yellow. A yellow and green bus is going next to grass. A baseball player sliding in to a base while the baseman tries to tag him out A train moving along a track during the day. A snowboarder sliding down a hill in the snow a close up of some white puffy balls A mother elephant and her baby walking through the brush. A group of teens are playing frisbee in a field with a view. The man is snowboarding down the snow covered hill. A table with a laptop, a phone and a drinking glass. A man standing on the sidewalk with his skate board. A man in a black suit carries a checkered umbrella as he walks on a crosswalk. A large brown dog holds a Frisbee in his mouth. A steak covered with seasonings of mushrooms and broccoli. Night time view of pole with too many signs on it resulting in joke street name. Airplane flying low over the treeline and field beyond. Different types of foods and vegetables side by side. 
A train prepares to depart from a station. Birthday cake decorated with a frosting in the shape of a truck. A sandy beach next to the ocean covered in kites. there are many people that are riding a elephant Person doing a trick on a skateboard on side of building as others walk by. A person sits next to a laptop on the wooden table. A large hill in a green pasture of grazing cattle. A MAN DOING A BICYCLE TRICK AMONG OTHER BICYCLISTS A portrait of a group of tennis players and coaches. A train on a track going under a trellis. A close-up of a stop sign in a snowy landscape. A bus is leading the pack heading towards the hotel. A pastry with fruit, mug and fork sitting on a counter. A photo taken from a field looking at a train going by. A yellow commuter train sitting at a station. There is a plate of mushroom pizza on a table. a brown bear walking on the side of the road a woman is walking and talking on her phone a bathroom with a toilet, a sink and a mirror Two men running for a frisbe on a field. A smiling woman in a formal dress holds an umbrella. A brown bear walking around in the river. An espresso machine brewing fresh coffee and a toaster. A woman riding a bike with a woman on back. a little dog running up to two bulls next to some bushes A baseball player swinging a baseball bat A locomotive engine blowing steam as it comes down a track. A mime soaked in the blood of the innocent while standing in a park. A chefs knife and a cutting board with uncut mushrooms and half of an onion. A woman smiles as she stands in skis on a snowy hill. A couple of people that are in the snow. A street sign reads "Jack Kerouac" on a street corner. A row of bikes sitting next to each other as people ride bikes past them. A fire hydrant located in a clearing in the woods. A prepared pizza is sitting on a table. A woman in a cowboy hat and Texas flag on a horse. A skateboarder is mid air doing a trick The gourmet pizza includes several very special ingredients. 
A person in a wet suit riding a wave A green motorcycle sits parked by a gas station. a pair of scissors with long shears sitting on a pattern A pond of water with three giraffe walking in the dirt. A person sitting on the sidewalk holding an umbrella. The giraffe is standing alone in the wilderness. A woman holding a banana in front of her mouth. A satellite dish is near the produce hanging above a door. A fake zebra is shown in the lobby of a hotel. Guacamole sits on a white plate with a garnish of shredded carrots. Several people walk out of a bus onto the street. A park bench has four people sitting on it under a large tree. A group of people standing on top of a sky slope. A red garbage truck and a man behind it. a close up of a laptop and a mouse on a small table A white toilet sitting next to a white sink. An orange cat trying to look underneath a closed door Cat sleeping in a high chair in the kitchen. A male tennis player in action on the court. A group of five sheep standing in a row A white refrigerator freezer sitting in a park. A bird sticking it's beak in the water. A motorcycle has a paint design in green. A couple of brown bears sitting and standing next to a brick wall. a person riding a surf board on a rivier two people in a kitchen area preparing food a couple of white couches in a room All of those bikes look exactly the same. A giraffe standing in front of trees and an open field. a big kitchen that has a lot of open space Many pictures and toys are posted in the office A dog is running around some cows in a field. A broken pair of sissors with a half of an orange handle. A person holding a smart device in their hand. A jet fighter sitting on top of a field of green grass. The young giraffe are eating from a branch. A woman looking at a group of giraffes. Two planes parked next to a runway on the grass. a white dining table and two chairs by a window and a cat in the corner A COUNTER FULL OF DESSERT INGREDIENTS AND BEER. 
Two brown horses in a pasture eating grass. A baseball player is on home plate with his bat. Three zebras are standing together in the dirt a man doing a rail slide on a skateboard A young Giraffe enjoying the sun on the grass. A street lamp is on a street with a sign and flowers. a desk that has a computer and a keyboard on it The lady is holding a baby eating dessert. A black bear in the background on a grassy slope. People are standing and sitting near the street. Kitchen with silver appliances and brown cabinets. A side mirror of a vehicle showing a street sign. A large bus on a open city street. A teddy bear is being tied on a pose with pink ribbon. An orange and white cat is sitting in a car. A fenced in area shows two leafy and low-hanging tree branches, casting shadows, and making shade for two horses that are grazing at some patchy grass. A plate with some meat, bread and salad on it a person trying to get something out of a plastic case Row of black suitcases on a wooden floor. A man holds a candle in one hand and an umbrella in another. A red bike is parked between others as people walk past Someone in the air on a snow board Lots of construction materials at a childrens park Three giraffe standing next to a man in front of a blue barrel. two men and two women receiving some kind of reward An old picture of three student in the library with there teacher. Broccoli, green beans and various other foods in a tray. An older gentlemen is wearing a black suit with a white shirt and tie and a red flower in his lapel. A street corner view from the bottom of a clock tower. The lunch was in a box and had carrots, berries, grapes and a sandwich. A picture of a meal of artisan pizza. Two riders on the backs of horses riding along the beach. A tour bus making a right turn as people wait. A bedroom with a window, armoire, chair and table with plant. Group of four people standing and playing a video game. 
Urban area intersection with traffic signals displayed at sunset. A cat sitting on a bench in front of a building. a person holding a surf board in a body of water A snowboarder is at the edge of an outdoor jump. A person riding a bike on the road near some stores A man stands with a beer in his hand. An assortment of veggies sitting on top of a wooden table. A surfer's surfboard is going straight up on a turbulent wave. Penguin balloon, an orange, coins and beverage at computer. A bride and groom walking from the church with umbrellas. Pedestrians cross at a crosswalk in a crowded city. A row of parked buses sitting in front of a buiding. a close up of two slices of pizza on a plate A finger that is pointing at bread on a plate. Plastic bento box lunch example with fresh food a lady on a phone sitting on a couch A large continental jet sitting on a tarmac at an airport. A man propped up against a bike looking at a cell phone. A collection of sailboats docked in a harbor. A man standing under a street sign looking at paper. a man is sitting in front of a small cake A man pouring a drink into a glass while a woman watches across the counter. A very cute looking girl on a cell phone. Woman standing on the porch holding a tennis racket. a close up of a dog near a door way a couple of signs are hanging on a wall Three people are handing bunches of bananas to a fourth person. A young man roasting a chicken in an oven. A girl raring back at a soccer ball on a field. A sandwich cut in half sitting on top of a wrapper. There is a little girl playing with a ball. a baseball player that is standing at home plate Four dogs playing with a Frisbee on a lawn. A Soutwest Airlines jet airplaine taxiing along a runway. A white bed that is in its room. a giraffe walks through a bunch of bus A little girl standing on skis in a snowy area. A small front is lying down on the leaf. A beautiful young woman standing on a tennis court. 
a bathroom that has a sink and some lights A woman in black jacket sitting in snow with snowboard. A person standing looking at a large statue with clocks built into it. A stop sign with people walking down the sidewalk. A young woman riding a skateboard at a skate park. A bus is stopping to pick up people in the snow. a group of young people watching a young boy skateboard down a rail over some steps A group of cow standing in a patch of dirt in a pasture. A woman that is sitting near a coin meter. a toilet and a bidet sit in a bathroom next to a garbage can The herd of sheep is walking near cars on a street. A cellphone sitting on a table with a cup. a white bus is driving on the dirt A man riding a skateboard down a street. two people standing in the snow mountain with their skis A couple hugging each other to pose for the camera A skateboard that is sitting on a beach. a women that has a large pizza on a table The horse looks at the camera while the people talk amongst themselves. a man doing a trick on a skateboard going down a hill A man holding a box of food while wearing glasses. Two guitarist playing while people sing in the background A small car is parked in front of a scooter A lady flying a kite with a black dog nearby. a lady o a urban street holding a see through umbrella with two men standing behind her. a half of a pepperoni pizza on paper A young boy that is holding a baseball bat. two people playing basketball at an apartment complex A street scene looking down at cars and motorcycles parked. a man standing while attempting a trick with a white frisbee The zebra is standing alone grazing in the grassy field. A group of women are walking with cups. Two odd-looking birds wander around in a field. The blue bus has arrived and parked on the side. A small brown ukulele sits on a small wooden table next to a vase. A man against a concrete wall talking on a mobile phone. An airplane hooked up to the umbilical walkway at an airport. 
A guy letting a bird eat from the palm of his hand. Three people stand around a small aircraft on a wet runway. Three zebras graze in a field with grass and trees. a large hotdog with lots of mustard and a Hawaiian punch soda a group of boats parked next to a dock in the water A young boy swinging a baseball bat during a game. A beautiful young woman in a bikini feeding a baby food. A airplane that is in the sky near clouds. A blue and red tour bus standing by a building with a tile roof. a sign with soem names on the top of it a disk with a computer sitting by two windows with a view elephants at the zoo standing in front of a waterfall A spoon next to a plate with fish, rice, beans and broccoli. a line of people that has skies on A couple skiers on a snowy mountain side The handle bars in the restroom are sturdy. A split image of two different women holding a object resembling an arrow A man standing next to a little girl on top of a field. A group of cats looking out of a window. A boy swinging a tennis racquet on a court with other kids. a red white and black sign of a man working A wooden paneled door opens to a spacious bathroom. A white-bearded man stands holding a puppy and a stuffed animal. A pocket sandwich filed with meat, cheese and a pickle. A cat sitting next to a bowl filled with water and roses. A white plate topped with a pizza and a knife. a female in a red dress is on a bed with a laptop A white plate topped with meat veggies and rice with sauce. A piece of bread sitting on top of a plate. A sandy beach covered in lawn chairs with blue umbrella over them. two people in the air standing on snow boards in the snow A man is at a table with three plates of food. a person standing next to a truck with its hood open in a parking lot A group of seagulls are flying over a wooden dock that is sitting in a lake during the early part of the evening. a person cutting a pizza with a knife A brown puppy passed out after drinking a bottle of coke. 
A large adorable cat resting on a big soft pillow. A sheep is laying on its side while another sits against a fence. there is a pink fridge and a pink stove in the grass A train on the tracks next to a wooded area. A room filled with furniture and boxes and clutter. A man does a handstand on his skateboard. A man sitting on a kitchen floor has tools spread out beside him and is holding a drill. A couple of zebra standing in the tall grass. A stop sign covered in stickers next to tall buildings. A man and a woman on a touch looking at a smart phone. A boat drives in a large body of water. This plate has meat, broccoli, and a potato. a number of sheep in a field with dogs A man preparing food in a kitchen on top of a stove. Several children are playing in a fire hydrant. A small kitten lies next to a laptop. Two sheep grazing in a field with buildings in the background. A very tasty looking dish with some assorted veggies. A top of a building that has a clock and is flying a flag. A digital clock on a bus can be seen above people's heads. this is a man standing in a field A man on his stomach in a white bed. Two cows are sitting on an open field during the daytime. two men stand in the sand in a baseball diamond, while one hands the other a bat Small boy holding up broken umbrella Ina parking lot. Three blue pieces of luggage stacked on top of each other. There is a toilet with the seat up in the bathroom. A woman riding a gray horse in the middle of a street. A person stands between two tents set up inside of a cabin. Cow tethered with chain eating hay in outdoor field. A young boy standing on a grass covered field under a flying kite. A man standing on the beach next to a surfboard. a close up of a pot of flowers with a box of flowers A person engaging in a water sport with skis on. A woman with skis is standing on the snow. a boat is docked in some water next to a house and a bridge A skier skis down a slope, with blue and red course markers in the background. 
A vintage photograph of a man riding a motorcycle. A public bathroom sink and hand drying area. A train engine is pulling cars down a stretch of track. A oddly colored zebra laying down on the dirt A plate with food on it next a a spoon and some more plates. Two children and a woman on a play-mat in a living room. A wedding cake with a bride and groom on top. A group of elephants marches down the city street in front of a large building. A woman smiling and talking on a cell phone. A group of zebra standing on top of a dirt field. a room that has a bunch of beds in it Sheep are gathered around a lone tree on the hill Various lights on the front of a white vehicle. a car with a mirror view of a dog walking behind it A man standing in the doorway of an umbrella and parasol shop. A man with a glove that is in the dirt. A train driving down the tracks near trees and a building. Two baskets on a table underneath hanging items. A pink plate with white polka dots and a slice of chocolate cake and white frosting. A giraffe walking across a dry grass field. A train on the tracks at a train station. RED, WHITE, BLUE AND YELLOW TRAIN COMING DOWN THE TRACKS Ambulance and fallen over motorcycle from viewpoint of injured. A bench and trash can are seen in this picture. A person skateboarding on a street barefoot with one foot up A living room in a well decorated house. A horse eating grass next to an old fence and building. A couple of large white airplanes and trucks. A empty bench in front of a green bush up against a building. Two horses pulling an older styled coach passing a home. A close up of stuffed animal bear face. A man tying a windsor knot in his tie. A long table covered and used as a desk The men are in the bathroom using it together. A food entree is served on a plate. Two smiling men are cutting into a cake. An old advertisement for Maxwell coffee with a family sitting around a table. A dog preparing to catch a frisbee in its mouth. 
A couple of people laying on top of surf boards near the shoreline. Commuter bus at roadway intersection in urban area at dusk. a man is riding down a ramp on a skateboard An man and a young girl on a motorcycle. a magazine cover showing a man getting ready to kick a soccer ball A couple of boxes filled with lots of donuts. a black cat next to a box of fruit and vegetables looking up at the camera A bird is standing on the shore next to the water. The dog is laying on top of the couch. A group of people cutting a cake with a sword. Man on the back of a surfboard riding on a wave. an image of two people on the beach The large bathroom mirror is clean and spotless. Young boy throwing a ball up and catching it Surfer and black outfit coming down the front of a wave. Two striped zebras are on knee high grazing grass. A little girl putting a blue umbrella over a yellow fire hydrant. A person in black is skiing down a snowy hill with trees. A man that is being pulled by a boat on a board. A group of children in a classroom with windows around. THERE ARE MEN SIGING WITH ALL OF THEM WEARIGN YELLOW TIES A picture of a empty street very late at night. a room with a big chair with some boxes behind it The teddy bear was posed at the table as if he was drinking. The person is flying a kite at the beach on an over cast day. A woman displays a homemade pizza dotted with mozzarella and herbs. A red traffic sign next to a uphill alley. A woman standing next to a building holding a phone. Four men jumping into the air to catch a frisbee A gray and white bellied bird stands on a branch A man walking down a sidewalk next to a busy city street. a group of signs that are next to some trees A living room with a corner chair and a scatter rug. Two men in pajamas are holding Nintendo Wii controllers. A large red fridge is sitting on the red carpet. A small young child is holding an umbrella in the sun. 
Different styled sinks next to each other under mirrors A woman cutting a cake at a bachelorette party. A man with a surfboard in the ocean. A couple of kids laying on top of booths. This is a road sign for La Brea Ave A baby giraffe standing with other young giraffes in captivity. A laptop computer sitting on top of a bed. A female skier competing in a skiing competition. A large pizza prepared and ready to go in the oven A man poses and smiles while holding a doughnut. A plate of sausages, bread and butter, and potato salad. a person holding a surf board on a beach near the water A train going down the train track. A man sitting at the kitchen counter looking at a picture. Police car is parked in front of a hydrant A blanket with various items that include a mouse, computer hard drive and a keyboard. Nine men pose together near a coach and a dog. Woman deliver serve in a professional tennis match A cake is frosted with a surfing teddy graham on the side. A young man is surfing behind the giant wave. Several people are standing around a decorated elephant. A large cat sits on the sofa arm next to a girl using a computer A red car is parked next to a black truck. The young girl smiles holding a donut with sprinkles. A man holding out his white eight bit tie. A cow makes its way down the street next to city traffic. The meal consists of beef, brocolli, and other vegetables. A woman in a red coat can be seen in the background talking on a phone. There are Indian people riding in a cow drawn carriage A woman swinging a tennis racquet at a ball. A person walking a dog on a sidewalk lined with vehicles. a motorcycle with two people driving by a car Some white cattle roaming down the street of a town. a black cat with it's head stuck in a boot A white and blue bus driving down a road next to trees. some fog traffic lights street lights and buildings A bus on the side of the road in traffic. A giraffe sits in the grass next to horned animals. 
People being social outside a large colorful amusement tent. A skateboarder jumps very high at a skate park. A child watches an animal on a rock platform in a zoo. A skateboarder skating off the top of an outdoor stairway. It is surprizing that these flying kites don't get tangled together. there is a lot of old stoves on the ground two big red double decker buses on the road A giraffe looks like a statue in the dirt. A woman and two teenagers are holding on to a stop sign. Two young men in dress clothes and ties standing in front of an outside door. Three empty wood benches sitting in a woody area. A kitchen area with a stove, microwave and counter space. some buildings and some boats are docked in a harbor A guy is going up the ramp with a skateboard. Two men are playing Frisbee in the park. Two people, most likely a couple, are on the bench. a black bear walks through the woods in the distance A bathroom scene with a sink, toilet and shower. A bathroom with a toilet, sink, mirror and shower stall. A person with a surfboard walks along a beach. a couple of people are skiing down a snowy hill People standing at a table putting toppings on their hotdogs. The blue white bus sign next to the trees on the campus. The silver refrigerator is across the kitchen from a black stove. A stop sign that is right by a road. A man swinging a tennis racket at a ball on a tennis court. A street is displayed at night with time lapse photography. An old school bus painted white with curtained windows parked under a freeway A small elephant is standing next to the other elephants You male poses against stone wall with leg up. The sun sets over the trees beyond some docks. A full view of a market place full of sheep and items. A grey and white cat laying behind a laptop. The contents of a refrigerator filled to over flowing a black bear pokes its head out of a field of tall grass A single piece of pizza sitting on a paper plate. A man in a suit carefully adjusts his tie. 
a bunch of boats all lined up on a dock. A man sitting on a couch holding a Nintendo Wii controller. A sink that is in front of a mirror. Three young girls holding ribbons in the snow. Some kids are talking together outside of a house A black and white cat laying down resting its head on a cushion. Two zebra standing next to each other next to a tree. A large clock suspended over a street sign. An adult black horse and a young brown horse interacting. A train sits on the tracks at an empty train yard Two men playing a video game as other look on. Toilet next to a sink with it's counter cluttered with bottles of lotion and stuff. a sandwich sits next to some fries A horse pulling a carriage wearing a straw hat A bunch of bananas on a small chair. A man riding a skateboard being towed by a woman on a bike. a suitcase with writing on it sitting next to a guitar A black cat laying down on a laptop. A young black cat resting on a colorful surface. A plate with two hot dogs covered in slaw, and french fries At sunset, a surfboard upside down on the wet sand. A man holds a toothbrush in his mouth. A vintage baseball team of ten pose for a photo. A group of people standing in the dirt near large tents. Two buckets with a bowl sitting between them. A recently remodeled kitchen with marble and wooden furnishings. A bus is passing through a city intersection. A person making a strange face at a very large pizza pie. A table with a laptop, phone and other devices sitting on it. Two people working at a market with oranges and apples. A group of people are playing soccer on a soccer field. A small dog chewing on a teddy bear A man holding a Nintendo Wii game controller. Modern espresso machine on counter in residential kitchen. A brown horse is on the grass with two people. there is a surfer that is walking towards the water Two zebras standing near a pile of sticks and a wooden fence. Three women and one man wear various skis on their feet while wearing swimming clothes. 
A thirtieth birthday cake with candles on it. A zombie apocalypse is happening on the street. A dog is on a beach with people in the background. a person bends down to put air into a car tire The people are having a discussion about cell phones on the table. Antique black and white photograph of a couple on their wedding day A woman using a laptop computer on top of a wooden table. Man folding banner while holding stick in unfinished carpet A herd of sheep crossing the road under a cloudy sky. some old wooden doors decorated with scissors for handles A toilet with a wooden seat is open. There is a large cooking pot and some staples on sitting on the shelf. A person is holding up a large colorful umbrella A man on his bike is between the busy traffic, including two buses. A girl is holding the strings to a kite. A snowboarder in the middle of a jump, with a mountain in the background. A yellow fire hydrant sitting in a plant with a green top. A building with a clock on the front and side of it. A truck that is in front of a building. To buses side by side with one being a double Decker bus. a woman some pizzas drinks and bottles and bowls A group of zebras are with a group of giraffes. A baseball player mid swing during a game. A bowl fo soup sitting on top of a wooden table. A person reaches for the cabinet as the cat sits in the sink. A large clock is on the colored wall of this building. A woman underneath a umbrella on a street. Some very pretty giraffes standing in some trees. Small boats sit unused in water by a dock. A neck tie that is knitted or crocheted from yarn. A cat leans halfway off of a bed. A group of people standing outside of a building Two side by side zebras are near the tall grass. The people are sitting down together having a meeting. A baseball pitcher pitches a ball while standing on a baseball field. A dog sits by and watches his owner. A bench sitting in front of a brick wall on a patio. A thomas the tank train traveling down tracks. 
A mirror hanging on the wall reflecting a toothbrush. A passenger train that is pulling into a station. a room with a brown sofa,computer on a table next to a window and a red book shelf A jet that is flying in the sky. A white toiler in a very small bathroom. A plate with a variety of Indian food on it. A white dog sitting on a ledge of a window. A man and a woman sit on a bench overlooking the water. a male is on his stomach riding a wave on a surfboard A woman standing on top of a lush green field. Open packed suitcase with too many extra clothes to fit. A woman dressed in military uniform speaks to a child. Sheep are grazing in the fenced in area. A man sitting at a table eating pizza slices A cat lies on a laptop and paws the keyboard A cat is sitting on the floor staring at the TV. A tennis racket being held by a person and balancing a tennis ball at the top of the racket. A white train colliding with a black car. THERE IS A WOMAN WALKING WITH AN UMBREALLA A man pushing a luggage bard through the middle of an airport. a girl is getting close to a giraffe A green bus near a curb in front of a brick building. A baseball player on the backswing of hitting a pitch A bedroom with wooden floors in an apartment. A brown stuffed teddy bear wearing a red bow tie. A man throwing up in a toilet, with his head in it.. A man doing a jump over a wave on a surfboard A bowl of apples and tangerines on a table. A man sitting on a big white horse. A cat enjoying the warmth of a laptop. A person making food inside of a factory on a machine. A child wearing skis stands on snow and smiles at the camera. a pan that has a big pizza on it A truck in the middle of the street. A man showing a women an image on a projector. A freshly made pizza sits on a cutting board and pizza wheel. A small child heading down the mountain on a snowboard A mid sized transport plane sitting on a tarmac at an airport. A man is standing in the street near a frisbee. 
The boys are standing beside a group of motorcycles. A person holding a dog's leash and looking at books. a truck on a city street in front of another vehicle A showroom in a high end furnitureinterior design store. A man in a sports jacket is sitting in front of a microphone. Airplane being loaded at a terminal on a cloudy day. Spectators watching a professional baseball game's action closely A man standing next to a woman with an open umbrella. A man in a baseball uniform hitting a ball. An apple on the ground, and an orange on the ground in a picture beside it. Man riding a bike on a wet street in an urban setting The skier in the red coat is doing a flip in the air. The furry cat is looking at it's own reflection in the mirror. A slice of pizza sitting on top of a white paper plate. A woman walking down a street holding an umbrella. Bridge and groom walking down a path surrounded by a crowd. A man smiles as he plays a guitar. A batter and catcher during a baseball game. Teddy bear in sweater sitting on shelf near plant. Two horses on sand face each other while one urinates. a clock that is on the outside of a building A person riding the waves on surf board. Old fashioned furniture arranged around a parlor on an oriental rug. A wooden table that has several types of pastries sitting on it. A white and black cat standing partially in an open refrigerator. A man with glasses is wearing three ties while holding a camera. A black cat resting in an flower pot A bunch of fruits and vegetables for sale on display A bulky laptop computer on a desk near a lamp. The people are waiting for the train to get there. a brown and black ox and a white and black one and grass An old bathroom with a sink and toilet. a close up of a clock on a pole near a building A blurry dog holds a frisbee in it's mouth. The two elephants are very close to each other. A couple of people are riding horses on a beach. A few items laid out on a towel on a table. 
A man looks at a hot dog he is eating. A herd of zebra grazing on a grass covered hillside. A beach with people surf boarding in the waves. people walking pulling their bags and the security looking at them A young child enjoying a serving of cake and ice cream. A living room with a computer desk in one corner, a coffee table and television. A teddy bear with no face made from denim. Black and white photo of woman on chair holding strap of leopard or cheetah skinned hand bag on ground. there are many people snowboarding down a hill Chef at counter with baked goods, baking pans and containers of toppings. Oranges and lemons sitting together on a white plate. A group of people sit on a boat on the water. A woman with short hair looks at a cell phone screen. a yellow sign of a person carrying a surf board A close up shot of horse, with it's baby in the back. A white plate topped with eggs, sausage and a cut in half tomato. A dog running in a field with people around. A room filled of shelves topped with lots of items. three baseball players on a dirt baseball field a wooden table with the tail of a cat and a plate of cookies A giraffe standing next to a tall wooden pole. A woman stands beside a pony wearing a blanket A beautifully appointed bathroom with classic color and amenities A green and white bus driving past a building. a fryer that has a bunch of doughnuts in it A man uses his laptop on a kitchen counter. A train that is riding on the tracks near the street. A child holds at bat at a baseball game while people watch in the background. A close up photo of a brown bear. a brown bear standing in the shade in the wood The people are trying to climb the mountain. A man breaking slices of pizza on a pan A yellow school bus reflected in a side mirror. Four red birds perched on a branch in front of the clock tower. Man in black blazer pouring wine in glasses. A young person holding a frisbee while standing on a field. 
A bearded man in dark clothing sleeping on a sofa. A black cow is looking over a grass covered chain link fence. a female playing tennis on a clay court. Four boys dressed up one talking while the other's are listening. A group of young children sitting on top of a bean bag chair. a man surfing on his surf board doing a trick a bathroom with a sink, mirror on a tiled floor with a door open A man is holding a bunch of green bananas in his yard. 3 dogs sitting in front of a fruit and veggie stand. A train sits in a train yard with an animal. The young child is learning how to ski. A MAN IS ON HIS SKATE BOARD IN THE PARK Small bathroom with toilet, bath tub and sink. Two giraffes are standing by a tree and eating. A little girl riding a horse next to another girl. a guys tie all up closes its black with strips there is a cat that is sitting on the kitchen counter A teen-aged boy standing near a jail replica. A boy holding spoons over a pan filled with food. Two buses next to each other in front of a fence. A businessman showing off a unique red tie. A cat sleeps in the sunlight beside a computer. A woman who is holding her little dog. A woman is crouched next to a suitcase on a city sidewalk, she is surrounded by people standing over her. A man standing with an umbrella in one hand and a flashlight in the other A red European passenger train sitting on the rails. a red truck parked on a bridge with people in the back A small plane sitting on top of an airport tarmac. A man folding his towel on the beach while his dog stands in the sand. Ben clock made as a model with bystanders walking by. A young man tossing a frisbee in a forest. A man that is holding a frisbee in his hand. A fire hydrant is surrounded by and covered with snow. Ten people and their dog pose for a picture while skiing. A blue clock with clear leaves coming out of it. There is a tower with a clock at the top. A boy laying on a bed with a black kitten. 
a little toy fire engine sitting on the ground outdoors Two men in military uniforms holding a large key in front of a house. a group of people excited to eat pizza A guy on a skate board near some graffiti. A cutting board with a long pizza and knife on it. A picture of a fire hydrant on the side of the road. A child's highchair has a little cat in it. A cat is laying inside a briefcase in a room. A group of people in white lab coats leading a group of cows. a white box with different kinds of donuts A bunch of stuffed toys inside of a homemade castle Two white toilets, white towels, and a shower. there are two zebras standing next to each other A computer mouse sitting on top of a table. A large bird sitting on top of a speed limit sign. Many people walk down the street with umbrellas in hand. Young boy taking swing with bat outdoors in play field. A man is on his roof with a large umbrella. A skateboarder doing a stunt on the edge of a ramp. A cat sitting on top of a shelf by a computer. Multiple vehicles parked curbside next to parking meters. A little boy that is standing on a skateboard. Two people are looking at a truck while a dog is being walked. The person ski's downhill on the mountain of snow. Large number of snow skiers at the bottom of a slope. a herd of zebra standing next to each other. Two young men retrieve plastic flying discs in the park. A large sandwich on some paper by a knife. A PICTURE OF A BATHROOM WITH A PLAID SHOWER CURTAIN. A green pan that is on a stove. A man brushing his teeth in front of a mirror. a large pizza is sitting on a pan A zebra stands near a giraffe in the wilderness. A man flying through the air while riding a snowboard. Baked pizza displayed on serving dish with beverages on small table. There is a family out on the ski slopes. A sign that reads public market center is shown. Young man looking into the inside of a refrigerator through bottles. 
a small girl in a white shirt and another person a dog is under a man with a laptop lady wearing work out clothes and glasses with a cat in her lap A clock on a stone tower is against the blue sky. Large striped zebra walking down a patch of grass. A man standing on top of a beach under a cloudy sky. A couple of elephants standing next to each other. A messy baby eats the broccoli off of the table. A man plowing the field with two horses on the country side a group of females standing in a grassy field playing frisby an intersection with different poles filled with street lights and a camera Panda bear climbing tree with paw over limb. A man with glasses playing with a Nintendo Wii. A giraffe sitting on a rocky dirt and grass covered ground. A man on a phone on a ddr pad An orderly bathroom is seen in this picture. A man standing with a dog in a field of grass. The person with the bag is walking down the street. The elephant family is walking down the road. A skier in all white standing in the snow. The grinch riding a motorcycle with a small dog with antlers. there is a withe toilet and the tub has a blue curtain A elephant fenced in a large land area . Identical street signs pointing in the opposite directions of each other. A man and a young girl on a motorcycle. a man with a white beard and hat on a cellphone A person with their feet propped up by a flower vase and couch. A living room arrangement looking into a kitchen and dining room. Two surfers-are in the Ocean one stands and look's at his board A man flying through the air while riding a snow board. A small kitchen with a stove and refrigerator. A man swinging a tennis racquet at a tennis ball. A giraffe looking alert at the camera in a field. A view of a shower and toilet from above. Two men standing in a living room holding Wii remotes and nun-chucks. Emotional person hugging a stuffed bear while sitting in a plain room. A street sign with two streets and two block numbers. 
A herd of sheep crossing a bridge over a river. A small bedroom picture taken through a fisheye lens A picture of a person fixing a road sign. the woman is sitting at a table in a purple chair A very cute elephant covered in mud in some tall grass. several people play video games with remote controls A group of people taking pictures of two pizzas in open boxes on a counter A white toilet sitting next to a bathroom sink. A hamburger and fries sitting on wax paper. A nice hotel has a full living suite A A bowl and a sandwich on an orange plate on a table. a tennis player swinging a racket to hit a ball A group of colorful umbrellas sitting next to each other. A picture of some trash being wasted in a trash. A truck driver adjusted the straps on his load. Two groups of people rowing in boats side by side. A young man riding a motorcycle having a good time. A girl walking behind an open fire hydrant spraying water. a woman is petting an elephant and a fence The man is holding up his chat pad in his hand A man looks into the mirror as he styles his hair. A refrigerator and table and chairs in a garage. A boy with a racquet swinging at a tennis ball. A peanut butter bagel is sitting on a white plate with several other food items surrounding it. A white bed sitting next to two windows. A giraffe putting it's head in a leafy green tree. A bird sits in a fruit tree with many leaves Three teddy bears dressed up for Christmas on display A maroon vehicle stops at the stop sign. A woman spooning cookie dough onto a cookie sheet. The silhouette of a group of people and a horse. A boy in grey shirt sheering a sheep by wall. An army jeep with an American Flag sitting at an airport. A young boy skinning carrots into a sink A happy stray puppy lies in the street. A street sign on a busy sidewalk corner An oreo cookie and chocolate dessert on a plate. a display shelf with a few bananas on it A man in a pink bow tie and a pink shirt is being hugged by a man in a blue shirt. 
Two street signs indicating no parking or towing. A picture of a bunch food sitting on a table. Several bicycles sit parked nest to each other. A vintage airplane museum, with people walking underneath displays of WWII-era planes in a hangar. A group of people on skies with contestant numbers. two boys are playing a video game and people are watching The side of the building has a large clock and several windows. a group of people standing playing nintendo wii A tabby cat is laying in an open packed suitcase. A skier putting their feet in the skies. A chocolate bunt cake is adorned with cashews. A group of giraffes on a jungle path. A passenger jet rolling along a runway at an airport. Several vehicles are stopped at an intersection behind a red light. A young man performing a skate board trick outside. a bright day and skiing in the mountains A woman in shorts and heels waiting on a train platform Traditional looking around the umbrella girl with old clothing. Small piece of bread and a donut sitting on a white napkin. A man sits on a surfboard in shallow ocean water A clock on a tower in the middle of a brick building. The women sits in shade working on her laptop. Dad, son and teddy bear are all smiling and happy. an image of a baby eating a spoon there are people sitting at a table using lap tops a living room with a person playing with a kid Chocalate covered deserts on a stick on the table. A small white-and-brown dog curled up on a flower-print pillow. three people standing at the zoo watching a elephant A man riding skis down a snow covered slope. A train crossing the road with cars waiting. A man is wearing a pink shirt and a tie. The two airplanes are close on a runway. A bowl of chicken, lo mein noodles and vegetables. A crowd of people mill about on the street. Two people skiing on a snowy mountain with a building in the background. A man on a surfboard performing a trick. A young boy flying a kite near a house. 
A minimalist room features white appliances and beige walls. There are two horses walking in a grassy field Two brown horses pulling a carriage as people sitting on the side of the road watch. A man shaving his face with another man hiding behind him. A kid laying down with a stuffed dog on him. a group of three people talking to each other on the sidewalk with a skateboard Two giraffes standing next to each other in their natural habitat. A man flying a kite in an open field under cloudy skies. A woman is sitting on a canoe going down a river. A group of people with surfboards enjoying a small river. a cat with its hair sticking out as it looks at a dog by the window a polar bear swimming in the water by a wall A kitten that appears to be focused on a computer mouse. A group of men standing next to each other. Three packages of toilet paper sitting on top of a toilet seat. A motor scooter has multiple rear view mirrors. A little girl crawling out of a piece of luggage. Two men with racquets on a tennis court. a little girl sits on a bench by herself An industrial kitchen has a double oven with glass doors next to a shelf of dishes and utensils. Many laptops and their assorted wires atop a wooden bench. Colorful Adirondack chairs at the end of a pier. There are four goats and one giraffe standing in a group. Purple orchid and colored leaves in a green vase. a bunch of different colored vases on a table A giraffe and a baby giraffe standing in an enclosure. pink double decker bus with two woman pictured on side The view inside a suit case, and a backpack. A dog standing on top of a boat in a body of water. A group of men doing tricks on skateboard next to ramp. a close up of a buses rear view mirror There is a bowl of food with bread and a plate of fruit. Three children sitting at a table with food and drinks. 
Stop sign at the intersection of two rather rural roads horses graze and drink from the water at a lake a black and white dog is herding some white animals A large orange striped cat laying next to a computer keyboard. A brown cardboard box with glazed doughnuts and wax paper. a bed sitting inside of a bedroom on a wooden floor. A person holding a pair of scissors in one hand. A small group of giraffes walk across the savannah. A bundled up woman skier falling in the snow. A person riding a horse and wearing armor in front of a crowd. A table with two drinks and glasses flanked by two chairs. A display case with various types of pastries. A couple of cats are sitting next to a dirty door. The guy with the white shirt and baseball cap is milking the cow. A striped plane flying up into the sky as the sun shines behind it. A man's torso wearing a brown patterned tie, pens in pocket and a large checked shirt. A slice of strawberry cheesecake on a plate with a fork The buffet features several different types of pizza. A man is leisurely crossing the street on a skateboard The cat is wandering around in front of the cardboard boxes. A view from a house looking outside at the front of a black car. Freshly cooked food and salad on a paper plate with a fork a airplane that is flying through the air The right hand of someone unpacking a Wii remote and sports games A large clock is displayed on the side of a building. a close up of a plate of food Young child playing baseball in a local park league A plate loaded full with well cooked food Two women and a man posing for a photo on the dance floor. Zebras and wildebeest walking in their natural environment A female Tennis player is holding her racket while the crowd and man look on. A man is cooking a pan full of various foods. A white dog in grassy field with red frisbee. A clock and a picture hung above a big window. A gooey piece of pizza with peppers, cheese and onions. A fire place sitting inside of a living room. 
a woman walking down the street with a baby carriage three groups of yellow flowers in vases on table Two males are watching something on a camcorder. there is a toilet with dirt on it A baby laying on its tummy on a bed is looking at a blue elephant. A laptop next to a wall in a room. A jet airliner leaves a faint trail of smoke during landing. a laptop sits in front of a group of people Jet parked with no one around in the area. A white polar bear is laying in the snow. A man is riding a motorcycle across the sandy shore line. A group of children sit on a bench outside. A group of people standing near surfboards in the sand. Small children wearing a cast holding up a Wii controller. A man is playing tennis on a dirt court. A man eating food while wearing a gray hat. a man that is cutting a pizza that is on a stove A sleeping black cat sitting on a pizza box. Three doughnut holes sit on a white plate with a doughnut that has been topped with topping and drizzled with sauce. A store with items on display in it's front windows. a person holding an apple near a tree A woman on a cell phone sitting on the ground. A man handing another man something inside of a room. A person riding a skateboard while wearing blue shoes. A bunch of people waiting on a subway train. A horse drawn wagon driving down a dirt road. A man standing over a table presenting food. A city street filled with lots of traffic. People standing on surfboards on waves in the water. A close-up of a metal statue of a bird landing on the nest. An owl among a few leaves, next to a wire fence. A women in a blue shirt cuddles up with her cat Two girls looking at a calf in a fence. A man in a grassy field about to catch a frisbee. The food is a mixture of pizza, salad, and wine. a group of people walking on a city street An old fashion looking clock tower near some bright lights. A person stands under an umbrella on a sunny day. The is a line of elephants in the street. 
An abstract designed bowl holding a bunch of oranges. A couple cargo trucks parked outside of a few shops. A man riding his surfboard through the waves. The head of the black and white horse has a red decoration. An elephant standing in water and surrounded by grass. A group of people standing around a green tent next to a horse, A clock tower in the middle of a road. a clock on a wooden pole in the middle of a beach A little boy brushing his teeth with a tooth brush. A double decker bus driving down a road. A woman playing games on a laptop computer. Closeup of a pastry with white and brown frosted petals. A bird perched on a wooden peg ready to take flight. A teenager standing on a ramp while holding a skateboard. A young child riding on the back of a sheep. A wooden doll is next to a teddy bear. Gray and white dog sitting on top of the bed with a black cat. A person in a wet suit in the water engaging in a water sport. A guy wearing a black wet suit on a white board, surfing. A man in a suit waits in a room with a tv. A black and white picture showing small children in a dormitory setting. A bed sitting in a room near two lamps and a couple of pictures on the wall. Three boys hanging out in a living room with the T.V. on in the background A table full of assorted snacks and plates. A red double decker bus parked on the side of a road. A hallway lined with doors and filled with suitcases. a engine sits parked inside of a ware house A yellow and blue fire hydrant in front of a building. plated vegetables on white dish displayed on hard surface. a van that is parked by some people with umbrellas A cat lying on an open laptop that is on a bed. An elephant standing next to a tree outside. Trio of zebras stands idle on the savanna. a dark picture of two men on skate boards two cats, one orange and one gray, sit on a shelf intended for shoes A submarine sandwich cut in half on a white plate next to a cup of coffee. A boy with a blue jacket is smiling on a ski-slope. 
A young child with a spoon eating a slice of cake An assorted group of standing and reclining cell phones. Two pizzas on a wooden table with a person seated. A person in their car views a ram in the street. There is a horse race going on in a carriage cart Signs displaying foot and seating area hanging inside restaurant A traffic light with an orange and a red having faces drawn on them. an image of a flamingo drinking something orange A group of people holding umbrellas standing behind a sign for a umbrella drive. A little girl is playing a game on the television. A cat is in a bathroom standing on an open toilet. A woman stands by her luggage and carries a large bag. A woman standing on a surfboard riding a wave. A healthy meal of fruits and vegetables on a table. THERE ARE CARS AND A TRUCK THAT IS PARKED IN THE PARKING LOT A train on the railroad track in an underground subway. TWO CONTAINERS OF FOOD SITTING ON TOP OF CONCRETE STEP Two men are sitting on a couch and their ties have been tied together. A clothes line with clothes hanging from it and cattle in the background A young girl is taking a nap next to her mother. A stuffed animal with colorful decorations on it and clothes hanging on a wall. A small bird perched on the handle of a bicycle. A giraffe standing next to a tree covered in leaves. A hand holding a piece of food at a table. A guy on a snow board in the dark. A display of historic pots and artifacts on display steps. A group of women standing under a red and white umbrella. A woman holds an electronic device in front of the camera. A piece of cake sits on top of a plate. A herd of cows make their way across a river. A woman sitting at a table with a little girl and a man. A man with a toothbrush in his mouth and uncombed hair takes a picture of himself at his computer desk. A banana, red pepper, carrot, and green apple A man standing on a baseball field while wearing a glove. A clock tower on top of a building with a wind indicator. 
A woman sitting on a bench with a dog sitting on the ground by her. A bunch of people in a building doing different things A group of people stands around and looks at a phone. A British Airways airplane taking off into the sky. A dog with a white hat at the field A woman trying to take a frisbee from her dog. A group of people at the beach flying kites A shot of a field and road taken from outside of a vehicle window. A train passing through a railway station.Railway platform is seen. A bathroom has a sink on legs and round lights. A cat sits in the foreground looking at the camera while a bright yellow motor cycle is in the background. A kitchen with a large white counter top. The cat naps on a shelf near the desk. A person eating food from a white plate next to a glass of wine. The toaster adorned with a face sits atop the tiled surface. A cat is standing on a board game Large and small elephants standing near a watering hole in the grass. A small restroom that is painted the color blue. There is a fruit slushie next to a very sloppy chili dog. A airport runway filled with jetliners next to large tanks. A kid in a white shirt stands on the grass while another boy stands on a pathway near a hovering white Frisbee. A high shot of a counter with a microwave and other food. A fruit market with shops of banana and apple.people buying banana. Delicious looking pasta with a variety of noodles Two young men sit on a couch in a sloppy room with a laptop, a phone, and a flat screen tv. A plate of food with broccoli and beef. A dog sleeping on a rug next to a stuffed animal. Boats floating on a lake near a dock. Sea birds gather on a broken pier surrounded by algae. Several bundles of fruit hanging from a plant. A crowded city street with a row of bicycles A black bear that is walking on a branch. A small white bird walking across a lush green field. A woman sits on the curb talking on her phone. A high statue with a clock inside on a very nice day. 
A man riding a skateboard down a wooden ramp. An intersection of two streets in front of a home. A man with a snowboard that is standing up. The baseball player is running from home plate. A train is going down the tracks in the dark. A person riding on a skateboard down a ramp. A large building with a clock tower on top of it Three ladies and a man sitting in a room with drinks on the table.Two of them playing video games. A bed with covers turned down and a messenger bag against a pillow. An old building sits in the background behind an illuminated signal light. The man is sitting on the post beside the water. A bathroom with a large tub next to a toilet and sink. A baseball player pitching a baseball on a field. A happy couple taking a selfie while sharing a drink. three people and one is petting an elephant BLACK AND WHITE PHOTO OF A WOMAN, TWO CHILDREN,HORSE,COW AND A DOG Several boys on a field playing with a frisbee. A crock pot on top of a microwave on top of a refrigerator. A table topped with two pizza and plates next to glasses. A man carries a surf board as a dog walks beside him. A young man doing a jump off a ramp at a skate park. The man in the suit is cutting the cake. People are flying their kites in the sky. Several people are standing in a living room while one examines a remote. A man riding skis down a snow covered slope. The Central Railway Station tracks in an old photograph. A bathroom with a sink and toilet and very small mirror. A woman in a black helmet jumping a hurdle while riding a horse. Two zeba standing on a dried grass plain looking off into the distance. A man and a woman smiling at the camera inside a large building. A wet floor sign is between a toilet and a urinal. Two marble vases one containing white flowers, the other green grass. A woman in black jacket sitting at a park bench in woods. A banana sitting in a bowl that is on the table. A dinking room table in the living room right next to the fire place. 
A girl with long brown hair with streaks of red lays on a bed and looks at an open laptop computer. Several elephant statues on display in a mall. a red bus that is in line with other cars A clock tower on a roundabout next to a building. Smoothie ingredients are in a blender including blueberries, strawberries, and bananas. A man on the beach is playing Frisbee. a back to the future mcclaren and time machine toy A bear is standing outdoors in the wilderness. A toilet and bathtub are in a bathroom. this is a pink box with food inside of it A trolley bus is coming down the street near trees. A woman takes a close up photo with her cat. A counter with a bunch of bananas and oranges on it. The boy in the green shirt and green hat is holding a baseball mitt. Adult riding breaking wave in open ocean on sunny day. four white and blue street signs on a wooden pole A giraffe is stepping on a log in a grassy area. A bathroom with a metal sink and an odd shaped toilet. A piece of chocolate cake is on a plate with a fork. a couple of men that have wine bottles in hand a porcelain toilet that must be used by crouching over it rather than sitting on it The person is bodyboarding as the waves crash around him. A dog that is swimming in some water. Multiple men climbing and hiking through the snowy mountains A tray that has two forks, a bowl , and food on it. Two birds perched up on a large tree branch. A lone skier, dressed all in black, going down a hill. People standing on the top of a green hill area with kites flying in the blue sky. A man wearing a tie next to a woman. A woman sits on a bed in a dark room. An older boy and a young boy are playing a video game. Two horses in a rope corral in a courtyard with one being groomed by a woman. Brown and white cows lined up against a barbed wire fence. A clock tower with a toy doll display below it. People riding and pushing tricycle carriages down the street. A train driving along tracks next to a city street. 
A boat is docked alone on the side of a river. Some cattle next to a brick building and a guy on motorcycle. Woman sitting at a restaurant holding a wine glass. A large bear in a tree biting into a branch. A toilet in a stall with a sink attached to the toilet tank and a console attached to the lid. Some baby bears are having fun on a sunny day. A person in red jacket snowboarding down a snowy hill. A farm with dozens of sheep in an enclosure. A kitchen that has a hanging rack and a refrigerator. very clean bathroom with white towels and some bathing soaps A couple at a cafe each on their respective cell phones a big giraffe and a small giraffe are in their pen A large dog in a room with yellow walls. A woman walks through a busy area holding a purple umbrella a giraffe eating food from a food dispenser A tennis player is running on a tennis court. A subway car stops at a station, its doors open. A snowboarder standing on a snowy mountain looking out. Some people standing on a surf board on the beach. A desert with some fruit on a plate. An adult and child elephant are eating grass. a black and white dog standing in front of a glass window The multi-colored cat is standing on a luggage bag. a bunch of stuff in a home living room A group of cows grazing near a passing train a big swimming pool that has some people in it A train with closed doors near a platform. Kids playing baseball while parents watch from benches. two women with a basket sitting at the bottom of the stairs A rack of bow ties hanging from clothes pins. Crabs walk across the sand along the ocean. Two old style planes flying side by side in the sky. a truck has pulled off the road to look at an elephant A group of nine jet planes flies in formation. A woman carrying a surf board by the ocean. a pink and white plate with some banana slices on bread and a drink A man that is on a surfboard in the air. A glass vase sitting on top of a table. A close up of raw meat and meat cooking in a deep fryer. 
A kitten is standing in a refrigerator shelf. A skateboarder skating next to a concrete street divider. a close up of an electric blender on a table a person in a kitchen preparing food Two Teddy bears sit next to each other. A group of people set on the ground talking in a park. A group of young people gather with surfboards on a tropical beach. Three giraffes standing around inside of their enclosure. A black and white street sign with a white building behind it. a young person laying on a couch with s nintendo wii remote A sandwich with peppers and ale are setting on a table. Busses paused to a stop at a bus stop. A red and blue dump truck traveling along a city street. A sprinkled doughnut sitting on a white napkin next to the bag it came in. a large elephant that is standing in grass A brown and black cat licking a woman's face. Two hitched horses standing next to each other with pink coverings on their heads. A tennis player leans into her stroke on the court A pizza slice on plate, beer in mugs and beer bottle on a kitchen table with place mats. A donut has a bunch of nuts on top of it. A train sitting on top of train tracks near forest. A couple of giraffe standing on top of a grass covered field. A half unmade bed in a hotel room a close up of a person cutting a pizza with scissors A man in a suit and tie holding a water bottle and people with cameras standing around him. A man trying to get his dog to herd goats. A man in wetsuit riding a white surfboard on wave. A woman stands ready with a tennis racket. A person and skateboard in air over a ledge by a sidewalk of city road with cars. A teenager rides a skateboard down the stair railing. The two giraffes are standing together in the grasslands. A large passenger jet flying over an airport. A cat laying on top of a blue dresser near a chair. a blurry photo of an empty city street a woman is holding a teddy bear in a room A man watches a flatscreen TV set above wrapped gifts. 
a couple of computers are sitting on a desk A bathroom mirror that is trimmed in gold and reflecting the room. The back side of a vehicle packed with bags. Small horse sitting beside a large brown horse. a plate of pizza sits on a checkered table cover The Asian kid is gleefully playing with the cellular telephone. A woman petting the trunk of a elephant. a man flying through the air on top of a skateboard. There is a room with various items in the picture. This is a baseball player trying to hit a ball The urinal on the ground has a toilet scrubber next to it. Four giraffes are behind a fence in the dirt. A pink kite is flying in the sky at a beach. An orange cat laying on a black laptop in living room. A man pitching a baseball on top of a field. A female runner eating a banana during her run. A guy in a bandana leaning over a laptop. A elephant standing in a field with lots of grass. Three zebras are running in bright green grass. Two guys are shaking hands while one grips the tennis racket. A laptop on a wooden chair of some sort. a street with two stop signs and people walking down the street A picture of a hotel room having just been cleaned. A brown dog touches noses with a sheep. A little boy holding a baseball bat on a field. A group of people that are in a market. A zebra waling in a field of dead grass by some trees. Yellow and black older snowmobile inside room with blue walls. A man standing up in front of doors with a folder in his hands. A smiling woman with scissors cutting a sign A clean white toilet with the lid down in a bathroom. A woman that has purple socks and a book. There is a large gray elephant standing next to a tree. A small white living room with sofa and lunge chair a brown and white cat has its paws on a laptop A group of people in a room playing video games A picture of a bear that is in the grass. Several people in a group are flying very colorful kites. 
there are two statues of zebras at a exhibit The bowl of greens is near a wooden bowl. Bunches of bananas are placed on flat newspapers. A giraffe looking up at a tree behind some large rocks. Four laptops sitting on a cluttered desk with a phone and a pair of headphones. A man in a black shirt opens an oven door while looking at the inside of the oven. A group of airplanes are parked at a runway and a truck is parked next to a plane. The couple is sharing a piece of cake while being photographed. a man on a pole with an umbrella A group of people skiing down a mountain in the snow. A man carrying a basket filled with fruit and clippers. A green double decker city bus by the curb. Black and white photograph of people with umbrella next to cars in snow. Orange tiger stuffed animal sitting on the bed of a pickup truck. A plate that has sandwiches and chips on it. Man sits on his parked motorcycle with body of water and bridge behind him. A cat lays on top of a blanket and sleeps. A person on a motor bike on a road. A pizza with a few toppings is on a plate. A brown basket filled with bananas and apples. A blue and white fire hydrant on a street. Fresh produce is arranged in a grocery store display. a man is sitting at a table with some food A wooden crate holding bananas under a roof area. some people in orange are standing together outside Two people are flying a kite on a hill. a young person and an older person holding a kite a fire hydrant is spraying water onto the street A green bus parked in front of a tall building. there is a male tennis player on the court in a game A bunch of stuffed animals stacked on top of each other. Beach goers enjoying sunny day on sandy beach at ocean. Two firetrucks are ready to be deployed to a fire. A city bus drives down a quiet road. A large elephant walking in front of a vehicle. Two giraffe's standing in the shade under a canopy. Woman on cell phone in city at night. A girl sits on a bench on grass outside a red door. 
A variety of Apple Ipod products on display. Two giraffe standing on top of a muddy puddle of water. A panda bear that is holding a stick. A view of a baseball field from behind home plate A blue and yellow mass transit bus turning a corner. A plate of food on a table with a tall glass A man standing and holding a tennis racket on court. Small dog playing with toilet paper in bathroom. Two men and two women make breakfast plates in a kitchen. Two people in skis standing together a snowy hill. People are riding horses through the grassy plain. A building with three steeples and a clock in the center. A parking meter on the sidewalk of a busy street. a square shaped pizza with bacon, an egg and tomatoes on a white paper plate. A full view of suitcases with some clothes on it. underside of a plane flying through a cloudy sky A large group of people on the street. Two men sit in front of large baskets of fruits and vegetables. THERE IS A WOMAN THAT IS STANDING ON THE STREET Two triangular street signs on grass next to brick pathway. there is a small dog that has fallen asleep with a book A pizza on a tray with a fork and glasses on the table A dog laying in the back of a moving truck. A small bird standing on the ground next to body of water. A couple of elephants standing on a lush green forest. A close up of a duck walking on a path. Toddler enjoying playing with a colorful kite in a grassy field a statue standing next to a clock and some bells Three plates with a different dessert on each. The view of a bathroom showing a toilet with a small waste bin next to it. A blowup seat in the back of a blow up raft The plate has a picture of a kitty on it. Horse drawn carriage with a pair of black horses in front. The man is pitching the baseball on the field. A planter that is standing on a stand. A very large doughnut sits atop a building as an advertisement. A sleeping dog laying on a stone walkway. 
a man skating on the road very fast An empty bus is parked in front of a building. A black table has a white vase with flowers. Male tennis player on the middle of the court. a close up of doughnuts on a plate on a table A skier performs a trick in the air off a ramp A fire hydrant sits on the curb in the snow. A small bathroom has an open skylight in the ceiling. The small dog wearing a pink scarf stands in the yard near a bowl. A living room complete with a couch, chair and television. a couple of benches in front of a body of water. A zebra standing in a sandy spot surrounded by green ground cover. A boat travels in one direction of the ocean while a smaller pleasure craft travels in the opposite direction. A laptop set up on a wooden table. A chair lift over a long ski run. A stop sign on a pole in a city Cardboard boxes stacked up in a living room A fireman is on top of the truck ladder A big ocean wave with someone trying to stay on the surfboard. A man hitting a tennis ball on the tennis court. A man is in a kayak in a pool with a ball. People sitting at tables working on laptop computers. a pizza with pesto sauce sitting on some oven mitts A group of people riding motorcycles is going down a road. The large double decker bus is coming around a corner. A brown horse grazing in a grassy area. China Airlines plane in air with landing gear out. a sandwich has a bite taken out of it A suite case that has a large quantity of glasses in it. The fluffy cat is sitting on top of a toilet in the bathroom. A spacious bathroom with two sinks and a claw foot tub. Two young people are riding a bike together next to the parked vehicles. The three trains are stopped on the railroad tracks. A kitchen with brown cabinets has an island. I am unable to see the image above. there is a man sitting outside at a table with a large pizza A train engine with train cars behind it, riding on a set of tracks with smoke blowing from the engine. 
A vase filled with a large yellow and black sunflower and other flowers. A group of people standing around a room together. women standing next to a truck on display A large jet sitting on top of an airport tarmac. there are many blue and white umbrellas on this beach A boy is sitting at a table eating pizza. A cat sitting on top of a television. A white cow surrounded by many dark cows inside a coral. Computer screen with the keyboard and printer sitting next to it. Partially open door leading to a kitchen from a hallway. an Olympic event going on with many skiers A box of doughnuts being held open by a hand A FedEx truck waits at the bottom of a San Francisco hill. A city bus driving down the street to georgetown some baseball players are playing baseball on a field Two women riding on the back of somebody else motorcycles Residential pantry with food items stocked on shelves. A very large building, that appears to be a truck. A person feeding a giraffe while wearing a hat. A young man riding a boogie board on top of a wave pool Professional dirt biker with woman on backseat of bike. Two zebras grazing on flowers in a pasture. The powdered pastry has filling in the middle of it. Boy with a football book and his dog outside. A table topped with plates and bowls of food. He is hitting the baseball with the bat. a baseball player is swinging at the pitch An intersection shows an expanse of empty road and then a car coming out from under a large arch that looks like a giant Chinese letter and stands between two buildings that stand at the forefront of am open walled walkway and retail venues. A man in his car using his phone A motorcycle police officer is pulled alongside fellow officers in car. Two elephants cross a dirt road between two stands of trees. The man is throwing the baseball during the game. A skier in a green jacket going down a slope covered in snow. A large passenger airplane flying against a partly cloudy sky. A young boy eats a piece of pizza. 
A flock of birds flying through the sky. A giant panda sitting on logs lazily yawning. a man is making pizza in his brick stove oven A picture taken from between an individuals knees at the sky. A part of hands with scissors trimming a plant. A banana next to a sprig of vanilla and a shot glass. A group of people in the snow, putting on snowboards. A chair sitting in the middle of the room, in a black and white photo. Two riders dressed as knights are on horseback. A medium-sized brown-colored worm wiggles as a large yellow slimy slug looks on. A half eaten pizza on a table with dishes. A beautifully maintained bedroom with rustic charm features natural wood. Two parking meters that are almost covered in snow. Some pancakes with icecream and bananas and a coffee People riding elephants who are wading through a river. A small group of penguins approaching a pool of water with one already swimming A group of people sitting around a table eating food. A stop sign flashes with an exit sign below it. a bunch of small children holding tennis rackets on a tennis court A black dog is laying on a white pillow a guy grinding his skateboard on a wooden post A bear that is standing in front of a rock. A man standing in front of a TV holding a Wii game controller. A man swings his Wii controller back in a living room. A large clock next to other smaller clocks set to different time zones. a room showing a cooker and an oven A man with a helmet holding wires attached to something in the sky. An elephant with tusks is standing between two fences. A very big bright colored truck and a van on a narrow road. The ball player is preparing to pitch the ball. The unicycle is on the curb in front of a parking meter. A book in french laying on a bed. A giraffe standing inside an enclosure with two deer. A train with multiple cars passing by trees. 
Spectators watching men on horses riding in an ANZAC Day parade in Australia two police riding horses on a london street A door that is opened wiith a chair inside . An empty bathroom with 2 toilets next to each other. Two black bears sit on the ground beside a structure made of wooden logs while another stands on top of it. THERE IS A METET THAT IS ON THE STREET ON THE SIDE WALK The young woman is jumping into the air as birds fly over the ocean behind her. Three stop lights and one way signs are in the intersection. A cat lays in the window on a sunny day. A man on a skate board who is touching the ground. a bamboo tray holding several bowls of asian food A man riding on the back of a brown horse down a street. A group of people with toy swords in a crowd. Two women in the snow on skis in front of a large building. A pigeon that is sitting on top of a head stone. A herd of zebra and horses standing next to each other. A man holding a baby in front of a plate with cake. A person clothed head to toe in white paints a room. A BIG GROUP OF PEOPLE FLYING KITES IN A FIELD A bathroom with a tub and shower and a sink. A baby bear standing among some tall grass. A living room filled with furniture and a flat screen TV. Bright red umbrella open on the sand of a beach. a motorcycle parked on a side walk near a brick building lose up of various trays of croissants and muffins. A man is taking a selfie with a mountain range in the background. a zebra in some brown grass and some green plants A car and motorcycle riding on a pavement road. A row of pizzas sit on tables underneath lamps. Some people in an arena with other people watching from the stands. A living room near an open window has furniture and an area rug on the floor. A man and woman dressed in wedding attire walking out of a building together. A bunch of bananas sit next to a cup of coffee. A woman relaxes on her bed and uses her computer A man with gray hair is holding a colorful kite. 
A polar bear grazing in a vibrant green grass A purple skateboard sitting at the back of a bus isle. A black horse and white horse graze for grass a guy and a girl getting ready to stand up on their surf boards A long red table with dishes on it seats many people in a room. Small toy train engine set with a train station. View of adult elephant seen through the trees a couple of cars pass through a city street A woman in the process of serving a tennis ball. A woman with a shorn sheep on a grate. A smiling young woman uses a computer in the kitchen. a photo of city buildings near beautiful plants A group of cows laying in a green pasture or grazing. A giraffe looking ahead in front of a stone wall. A man wearing a purple die and work shirt A radio sitting on a table next to a record player. A stove top in a storage type of room with several spices on the stove. Green onions sit on a cutting board along with carrot sticks. A man setting at a table in a restaurant cutting his food. A picture of a large cathedral with clock in the center. Two plush bears are found as a gift along with a Starbucks cup A row of floor height urinals in a public restroom. Two shake boarders playing on the street with one individual sitting under a tree. A child playing with his hand-held game system. A birthday cake that is decorated with a dolphin and sea horse on it A white bowl with a few pieces of broccoli. A train is coming down the track near a hillside. There is a figurine by the computer keyboard in the office. A bedroom with a plain neatly made bed with no headboard A man petting a giraffe whose face it over the fence An elephant standing on rocks next to a wood bridge. A large bathroom has a tiny window and a tub and toilet and sink and mirror. A group of children playing with a ball. A cooked pizza made with various separated toppings A man is snowboarding and is mid air over the snow. The person in the black and white photo is jumping up with a skateboard. 
A big clock tower topped with a walk and an American flag, stands tall against a blue sky, far ahead of city skyline, and right above a lot of teal-roofed domiciles. a blue bus parked at a street corner. Couple of people out in the ocean on surfboards The dogs are playing together out in the yard. An adorable little girl holding her hand over her mouth. A dog catching a Frisbee in a park, with people in the background. A woman with short, brown hair is looking into a circular mirror and holding a camera up to her cheek. A man and a small child fly a butterfly kite in a park. A snowboarder jumping through the air and performing a trick. A very beautiful kitchen with very modern updates people bringing their vegetables to the market by boat A bowl with steamed broccoli topped with nuts in it. A wooden table topped with cooking tools next to a sink. A man is jumping up to catch a Frisbee between his legs. Two men playing a game with steering wheel controllers. a giraffe eating some leaves off a tree A fire hydrant that was busted and is shooting water out. A woman walking past a table with a plate of food on top of it. A wooden table topped with lots of camera equipment. A stemmed bottle is holding a slender flower in a window sill with a view of rain. A blender filled with food on top of a counter. A person took a picture of his torso and legs while laying on the top of a bunk bed. A boy with a helmet stands next to a clock. A man that is holding a banana in his mouth. A caste all it up, reflecting off of the water. A male maneuvering up a ramp while on his skateboard. A red pick up truck with a plow blade drives down a snowy suburban road. Several toilets are place outside on a lawn. a bird eating out of a pizza box that is on the ground An airplane flying under the clouds in daytime. A hand with a gold ring is posed over a wireless keyboard, beside a wired mouse A couple of people dancing in some sand with no shoes on. 
The cat is observing its own visage in the circular make-up mirror. A group of people standing around a baby elephant in a river. a guy attempting a trick with his skateboard while othes watch A man and two boys herd 5 sheep into a truck. A sub sandwich in a box next to two hot dogs. Men in suits smiling and walking across a green soccer field. Four tanned men and a girl at an event. Two born bears walking though a forest surrounded by trees. A skiers lies on her back with the skis straight up. A building at a railroad crossing billows smoke. Construction loading truck driving in front of a building. The nose of an airplane sits on the landing strip, boarding passengers. A bowl filled with pasta, veggies and seasoning. a small dog is walking next to the fruit stand. A cup of Starbucks coffee is sitting on the side of a court. A street scene with cars on the road and people on the sidewalk. A person taking a photo in a mirror on a mass transit vehicle. A woman sitting on a bench in a stone alcove. A picture of a boat and some water. A small plane is getting ready for a flight Airport during a snowstorm with planes awaiting boarding. A dog and cat sitting on a couch A little girl makes a pizza with a smiley face. A woman in sunglasses petting the trunk of an elephant. A woman holds a mirror and tool up to a woman's mouth. The skier is sitting down in the snow. A street pole has an enormous number of signs on it. The carrots in the dish are marinating in beer. There is a woman holding a wine glass and a man wearing a necklace. A person holding a Chocolate Lab dog while the dog holds an old teddy bear. there is a male wake boarder holding on to a rope in the water three young people holding wine glasses laughing a cow walking on a city street near people A young man holding a white frisbee next to poles. A kitchen mostly empty with lots of cupboard and counter space. The food is seasoned and ready to be cooked. Two men in a kitchen are standing by a refrigerator. 
Two baseball players are walking on the field. A man riding skis down a snow covered slope. Some lemons are in a vase and oranges and grapes are in a plate. Two men are holding tennis balls and rackets. A circular mirror reflecting a woman's stomach in turquoise shirt. A birthday cake with gum drops and a bag of Cheetos cheese bacon snacks on a table. A blue and silver railroad train placed on the tracks A man welding the back of an oven. a close up of a box of open pizza The Helen J sitting in the ocean not moving. A yellow fire hydrant on a street corner. It looks like a human figure hanging in the tree limbs, partially concealed by foliage. A person sitting on a wooden bench outside. A guy in a helmet skate boards down the street. a girl is on a phone standing near a sign A group of young men standing on a basketball court. A cutting board with green peppers already cut and some awaiting their cutting. A man standing on a field holding a catchers mitt. A woman standing in front of the Eiffel Tower surrounded by photo shopped animals. A kitchen area with a double sink, a stove, a refrigerator and several other kitchen utensils. Picture of reflection in a mirror of a kitchen Two people holding surfboards on the shoreline of the beach A few surfers ride a good wave in the ocean. A building lined street with three lanes and light traffic. A small bathroom with a toilet next to a cabinet. A woman holding a white teddy bear next to a wood cabinet. Family room with furniture, fireplace and wood flooring. A tennis player is in air while extending his arm up to return the ball. Two little dogs hiding in the pillows of a couch. A bear dressed in a green outfit sitting outside. A man teaching a boy how to play baseball An office break room with table, microwave, sink and lockers. The boy throws a baseball to another boy who is ready to hit it. A home kitchen stripped down to be painted. 
an assortment of fruit including oranges and bananas The top of a church showing steeples and windows. A service truck at an airport terminal with planes reflected in the windows. A computer screen and keyboard on a desk. A boat heading upriver to a harbor town. A group of bicyclists are riding down a path Baseball players are in action as a crowd watches. A sheepdog prepares to guide a sheep into a corral. Two people that are standing on ski's in the snow. A person that is about to catch a frisbee. There is a bowl of food and a sandwich on a plate A fire hydrant sitting on the side of a road. A crane is stacked high with lots of luggage. A man with no fashion sense holding several frisbees. Two cats are laying down together on what seems to be a table cloth. Rows of handmade grass umbrellas lying on their sides. a red double decked bus advertising a shop The woman walks through sand with a black horse. A heard of cows with yellow tags on their ears in a field of grass. A plane is flying low during the evening. Snow covered mountains can be seen past the boats on the water. People hold six corn dogs with various mustard designs a couple skiers skiing through cones down a slope A man prepares to serve a tennis ball. A cat sitting behind storage containers and a computer. Two emergency vehicles on a driveway next to a garage. A sidewalk is next to many different signs. Some men with snowboards standing on a hill image does not appear in this particular one A giraffe standing on top of a dirt field. Three people check on a number of bicycles in a showroom Group of mixed fruits sitting inside a metal basket. A blue and yellow train is parked on the tracks. A brown and white dog laying next to luggage at an airport. A grassy field with three zebra grazing from the ground. A baseball player holds a bat while standing next to home plate. a tennis player swings his tennis racket A hotel lobby with a table and flowers in a vase. Two planes are on a runway beside trucks. 
a man in a black hat standing next to and holding the reigns of a horse A boy takes a selfie in a bathroom with Harry Potter decorations. A very big cute giraffe by a pretty palm tree. A group of people walking around a shopping center. A smiling man is playing tennis on a brown court. a small and dirty zebra inside of a corral A young boy holding a baseball bat to his face. A crowded store with several different displays of goods for sale. A black cat staring into the distance in a room A young girl on a bench with a kite. A bike on a pole in front of a brick building. A man is performing tricks on a bicycle. A busy city intersection with public transit and pedestrians. A dark room with a bed and black chair. A man doing a trick on a skateboard on a ramp. A man holding the string to a kite in a park. A man holding a tennis racket with a ball in the air on the tennis court. A person in the snow with two dogs on leashes. A brown bear walking with rocks in the background. A clock above a glove resting on a leopard print ledge. A cat places its mouth on a computer keyboard. A couple of people eating a slice of pizza. A silver fire hydrant with a blue top at a road corner. A man on a a fake horse is in the parade. The inside of a vehicle driving down a highway with a tv playing an image. A large group of elephants are in the water. A colorful plate of avocado, carrot, and cabbage. A young man doing tricks on his skate board. An airplane sitting on top of an airport tarmac. A man sitting on the raised cement border around a tree and looking at his cellphone. This is a small bathroom with a towel on the floor. there is a man drinking whine from a glass A red truck with patriotic bunting drags a parade float. two black cats are drinking out of a toilet A herd of goats standing on a public street. Sandwiches on buns topped with black olives and tomato. 
class Attention(nn.Module):
    """
    Multi-head self-attention that additionally returns a head-averaged
    attention map and accumulates a FLOPs estimate into the cache dictionary.

    The layer layout (a single fused qkv projection, optional per-head q/k
    normalisation, attention dropout, output projection, output dropout) is
    built in ``__init__``; ``forward`` picks between PyTorch's fused
    scaled-dot-product kernel and an explicit softmax path.
    """
    fused_attn: Final[bool]

    def __init__(
            self,
            dim: int,
            num_heads: int = 8,
            qkv_bias: bool = False,
            qk_norm: bool = False,
            attn_drop: float = 0.,
            proj_drop: float = 0.,
            norm_layer: nn.Module = nn.LayerNorm,
    ) -> None:
        super().__init__()
        assert dim % num_heads == 0, 'dim should be divisible by num_heads'

        per_head = dim // num_heads
        self.num_heads = num_heads
        self.head_dim = per_head
        self.scale = per_head ** -0.5
        self.fused_attn = use_fused_attn()

        # Sub-modules are created in the same order as before so that
        # parameter registration (and random initialisation order) is unchanged.
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        if qk_norm:
            self.q_norm = norm_layer(per_head)
            self.k_norm = norm_layer(per_head)
        else:
            self.q_norm = nn.Identity()
            self.k_norm = nn.Identity()
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: torch.Tensor, cache_dic, current, fresh_indices=None) -> torch.Tensor:
        """
        Run self-attention over ``x`` of shape (B, N, C).

        cache_dic: dict; this method reads ``cache_dic['cache_type']``, adds the
            FLOPs estimate to ``cache_dic['flops']`` and, when the cache type is
            'kv-norm', stores the per-token L2 norm of V under
            ``cache_dic['cache'][-1][current['layer']]['v_norm']``.
        current: dict; only ``current['layer']`` is read (kv-norm case).
        fresh_indices: accepted for interface compatibility; not used here.

        Returns ``(x, attn_map)`` where ``x`` is (B, N, C) and ``attn_map`` is
        the softmax attention averaged over heads, shape (B, N, N), or ``None``
        when the fused kernel was used.

        Original author's note: about 0.4ms extra cost on A800, mainly tensor
        operations.
        """
        batch, n_tokens, channels = x.shape
        heads = self.num_heads
        head_dim = self.head_dim

        # (B, N, 3*C) -> (3, B, heads, N, head_dim) -> three (B, heads, N, head_dim) tensors
        packed = self.qkv(x).reshape(batch, n_tokens, 3, heads, head_dim)
        q, k, v = packed.permute(2, 0, 3, 1, 4).unbind(0)

        if cache_dic['cache_type'] == 'kv-norm':
            # Record ||v||_2 per head and token: (B, heads, N)
            cache_dic['cache'][-1][current['layer']]['v_norm'] = torch.norm(v, dim=-1, p=2)

        q = self.q_norm(q)
        k = self.k_norm(k)

        # The explicit path is required whenever the attention map itself is
        # needed (cache_type 'attention') or the fused kernel is unavailable.
        use_fused = (self.fused_attn) and (cache_dic['cache_type'] != 'attention')
        if not use_fused:
            q = q * self.scale
            attn_map = (q @ k.transpose(-2, -1)).softmax(dim=-1)  # extra cost for attn
            x = self.attn_drop(attn_map) @ v
            attn_map = attn_map.mean(dim=1)  # average over heads -> (B, N, N)
        else:
            x = F.scaled_dot_product_attention(
                q, k, v,
                dropout_p=self.attn_drop.p if self.training else 0.,
            )
            attn_map = None

        x = x.transpose(1, 2).reshape(batch, n_tokens, channels)
        x = self.proj_drop(self.proj(x))

        # FLOPs estimate; the same formula is used for both attention paths.
        pairwise = batch * heads * n_tokens * n_tokens
        flops = (
            batch * n_tokens * channels * 3 * channels * 2   # QKV projection
            + batch * heads * n_tokens * head_dim            # Scale q
            + pairwise * head_dim * 2                        # Q @ K
            + pairwise * 5                                   # Softmax
            + pairwise * head_dim * 2                        # Attn @ V
            + batch * n_tokens * channels * channels * 2     # Projection
        )
        cache_dic['flops'] += flops

        return x, attn_map
def local_selection_with_bonus(score, bonus_ratio, grid_size=2):
    '''
    Uniform Spatial Distribution s4 mentioned in the paper.

    The tokens of each sample are treated as a square grid, which is split
    into non-overlapping grid_size x grid_size blocks. Inside every block the
    highest-scoring token receives a bonus of (its own score * bonus_ratio);
    all other tokens keep their score.

    score: torch.Tensor of shape (batch_size, num_tokens); num_tokens must be a
        perfect square whose side length is divisible by grid_size.
    bonus_ratio: multiplier applied to the block maximum to obtain the bonus.
    grid_size: side length of one block, in tokens.

    Returns a new tensor of shape (batch_size, num_tokens); the input is not
    modified.

    Raises ValueError when the tokens cannot be arranged into whole blocks of a
    square grid. (Previously only `num_tokens % grid_size**2` was asserted,
    which let non-square token counts through to an obscure reshape error.)
    '''
    batch_size, num_tokens = score.shape
    if num_tokens == 0:
        # Nothing to select from; return an independent empty tensor.
        return score.clone()

    if grid_size <= 0:
        raise ValueError(f"grid_size must be positive, got {grid_size}.")

    image_size = int(round(num_tokens ** 0.5))
    if image_size * image_size != num_tokens:
        raise ValueError(
            f"The number of tokens ({num_tokens}) must form a square grid."
        )
    if image_size % grid_size != 0:
        raise ValueError("The number of tokens must be divisible by the block size.")

    blocks_per_side = image_size // grid_size
    block_size = grid_size * grid_size

    # Step 1: group the scores by block -> [batch_size, num_blocks, block_size]
    blocks = score.reshape(batch_size, blocks_per_side, grid_size, blocks_per_side, grid_size)
    blocks = blocks.permute(0, 1, 3, 2, 4).reshape(batch_size, -1, block_size)

    # Step 2: locate the best token of every block -> [batch_size, num_blocks, 1]
    max_scores, max_indices = blocks.max(dim=-1, keepdim=True)

    # Step 3: one-hot mask marking the best token of every block
    mask = torch.zeros_like(blocks)
    mask.scatter_(-1, max_indices, 1)

    # Step 4: add the bonus to the marked tokens only
    blocks = blocks + (mask * max_scores * bonus_ratio)

    # Step 5: undo the grouping (the permutation used in step 1 is its own inverse)
    restored = blocks.reshape(batch_size, blocks_per_side, blocks_per_side, grid_size, grid_size)
    restored = restored.permute(0, 1, 3, 2, 4).reshape(batch_size, num_tokens)

    return restored
def cal_type(cache_dic, current):
    '''
    Determine calculation type for this step.

    Writes the decision to current['type']:
      * 'full' on the first step (step == num_steps - 1) and on every step
        whose index is a multiple of cache_dic['cal_threshold'];
      * 'ToCa' on every other step.

    cache_dic: dict; only cache_dic['cal_threshold'] is read, and only when the
        current step is not the first one.
    current: dict with 'step' and 'num_steps'; 'type' is set in place.
    '''
    is_first_step = current['step'] == (current['num_steps'] - 1)
    if is_first_step:
        # The first step is always a full computation, regardless of any interval.
        current['type'] = 'full'
        return

    fresh_interval = cache_dic['cal_threshold']
    if current['step'] % fresh_interval == 0:
        current['type'] = 'full'
    else:
        # The original code distinguished odd and even offsets inside the
        # interval (a hook for switching between 'ToCa' and 'FORA'), but both
        # branches assigned 'ToCa', so they are merged here.
        current['type'] = 'ToCa'
''' # reset the cache index to 0 cache_dic['cache_index'][-1][current['layer']][current['module']] = torch.zeros(tokens.shape[0], tokens.shape[1], dtype=torch.int, device=tokens.device) if current['layer'] == 0: cache_dic['cache_index']['layer_index'][current['module']] = torch.zeros(tokens.shape[0], tokens.shape[1], dtype=torch.int, device=tokens.device) #if current['layer'] == 27: force_scheduler(cache_dic, current) ================================================ FILE: DiT-ToCa/cache_functions/force_scheduler.py ================================================ import torch def force_scheduler(cache_dic, current): ''' Force Activation Cycle Scheduler ''' if cache_dic['fresh_ratio'] == 0: # FORA linear_step_weight = 0.0 else: # ToCa linear_step_weight = 0.4 #0.4 step_factor = torch.tensor(1 + linear_step_weight - 2 * linear_step_weight * current['step'] / current['num_steps']) threshold = torch.round(cache_dic['fresh_threshold'] / step_factor) if (current['step'] in range(int(current['num_steps']*0.2),int(current['num_steps']*0.4))) and (cache_dic['fresh_ratio'] != 0): # We find that in these 20% steps, the model is extremely sensitive for cache, i.e. worse temporal redundancy. threshold = 2 cache_dic['cal_threshold'] = threshold ================================================ FILE: DiT-ToCa/cache_functions/fresh_ratio_scheduler.py ================================================ import torch def fresh_ratio_scheduler(cache_dic, current): ''' Return the fresh ratio for the current step. 
''' fresh_ratio = cache_dic['fresh_ratio'] fresh_ratio_schedule = cache_dic['fresh_ratio_schedule'] step = current['step'] num_steps = current['num_steps'] threshold = cache_dic['fresh_threshold'] weight = 0.9 if fresh_ratio_schedule == 'constant': return fresh_ratio elif fresh_ratio_schedule == 'linear': return fresh_ratio * (1 + weight - 2 * weight * step / num_steps) elif fresh_ratio_schedule == 'exp': #return 0.5 * (0.052 ** (step/num_steps)) return fresh_ratio * (weight ** (step / num_steps)) elif fresh_ratio_schedule == 'linear-mode': mode = (step % threshold)/threshold - 0.5 mode_weight = 0.1 return fresh_ratio * (1 + weight - 2 * weight * step / num_steps + mode_weight * mode) elif fresh_ratio_schedule == 'layerwise': return fresh_ratio * (1 + weight - 2 * weight * current['layer'] / 27) elif fresh_ratio_schedule == 'linear-layerwise': step_weight = 0.4 step_factor = 1 + step_weight - 2 * step_weight * step / num_steps layer_weight = 0.8 layer_factor = 1 + layer_weight - 2 * layer_weight * current['layer'] / 27 module_weight = 2.5 module_time_weight = 0.6 module_factor = (1 - (1-module_time_weight) * module_weight) if current['module']=='attn' else (1 + module_time_weight * module_weight) return fresh_ratio * layer_factor * step_factor * module_factor ###### Recommended Configurations ###### elif fresh_ratio_schedule == 'ToCa-ddim50': # Proposed scheduling method in toca. # step wise scheduling, we find there is little differece if change the weight of step factor, so this is not a key factor. step_weight = 2.0 #0.4 #0.0 # 2.0 step_factor = 1 + step_weight - 2 * step_weight * step / num_steps # layer wise scheduling, important. Meaning caculate more in the front layers, less in the back layers. layer_weight = -0.2#0.8 #0.0 # -0.2 layer_factor = 1 + layer_weight - 2 * layer_weight * current['layer'] / 27 # Module wise scheduling, important. Meaning caculate more in the mlp module, less in the attn module. 
module_weight = 2.5 # no calculations for attn module (2.5 * 0.4 = 1.0), compuation is transformed to mlp module. module_time_weight = 0.6 # estimated from the time and flops of mlp and attn module, may change in different situations. module_factor = (1 - (1-module_time_weight) * module_weight) if current['module']=='attn' else (1 + module_time_weight * module_weight) return fresh_ratio * layer_factor * step_factor * module_factor elif fresh_ratio_schedule == 'ToCa-ddpm250': # Proposed scheduling method in toca. # step wise scheduling, we find there is little differece if change the weight of step factor, so this is not a key factor. step_weight = 0.4 #0.0 # 2.0 step_factor = 1 + step_weight - 2 * step_weight * step / num_steps # layer wise scheduling, important. Meaning caculate more in the front layers, less in the back layers. layer_weight = 0.8 #0.0 # -0.2 layer_factor = 1 + layer_weight - 2 * layer_weight * current['layer'] / 27 # Module wise scheduling, important. Meaning caculate more in the mlp module, less in the attn module. module_weight = 2.5 # no calculations for attn module (2.5 * 0.4 = 1.0), compuation is transformed to mlp module. module_time_weight = 0.6 # estimated from the time and flops of mlp and attn module, may change in different situations. module_factor = (1 - (1-module_time_weight) * module_weight) if current['module']=='attn' else (1 + module_time_weight * module_weight) return fresh_ratio * layer_factor * step_factor * module_factor else: raise ValueError("unrecognized fresh ratio schedule", fresh_ratio_schedule) ================================================ FILE: DiT-ToCa/cache_functions/global_force_fresh.py ================================================ from .force_scheduler import force_scheduler def global_force_fresh(cache_dic, current): ''' Return whether to force fresh tokens globally. 
''' last_steps = (current['step'] <= 2) first_step = (current['step'] == (current['num_steps'] - 1)) force_fresh = cache_dic['force_fresh'] if not first_step: fresh_threshold = cache_dic['cal_threshold'] else: fresh_threshold = cache_dic['fresh_threshold'] if force_fresh == 'global': # global force fresh means force activate all tokens in this step. return (first_step or (current['step']% fresh_threshold == 0)) elif force_fresh == 'local': # fresh locally cause much worse results, for the misalignment of cache and computed tokens. return first_step elif force_fresh == 'none': return first_step else: raise ValueError("unrecognized force fresh strategy", force_fresh) ================================================ FILE: DiT-ToCa/cache_functions/score_evaluate.py ================================================ import torch import torch.nn as nn from .scores import attn_score, similarity_score, norm_score, kv_norm_score def score_evaluate(cache_dic, tokens, current) -> torch.Tensor: ''' Return the score tensor (B, N) for the given tokens. Mainly include s1, (s2,) s3 mentioned in the paper. ''' #if ((not current['is_force_fresh']) and (cache_dic['force_fresh'] == 'local')): ## abandoned branch, if you want to explore the local force fresh strategy, this may help. # force_fresh_mask = torch.as_tensor((cache_dic['cache_index'][-1][current['layer']][current['module']] >= 2 * cache_dic['fresh_threshold']), dtype = int) # 2 because the threshold is for step, not module # force_len = force_fresh_mask.sum(dim=1) # force_indices = force_fresh_mask.argsort(dim = -1, descending = True)[:, :force_len.min()] # # force_indices = force_indices[:, torch.randperm(force_indices.shape[1])] if cache_dic['cache_type'] == 'random': # select tokens randomly, but remember to keep the same for cfg and no cfg. 
score = torch.rand(int(tokens.shape[0]*0.5), tokens.shape[1], device=tokens.device) score = torch.cat([score, score], dim=0).to(tokens.device) elif cache_dic['cache_type'] == 'straight': # abandon the cache, just return 1 hhh, obviously no use. score = torch.ones(tokens.shape[0], tokens.shape[1]).to(tokens.device) elif cache_dic['cache_type'] == 'attention': # Recommended selection method in the paper. # cache_dic['attn_map'][step][layer] (B, N, N), the last dimention has get softmaxed # calculate the attention score, for DiT, there is no cross-attention, so just self-attention score s1 applied. score = attn_score(cache_dic, current) # if you'd like to add some randomness to the score as SiTo does to avoid tokens been over cached. This works, but we have another elegant way. #score = score + 0.0 * torch.rand_like(score, device= score.device) elif cache_dic['cache_type'] == 'kv-norm': score = kv_norm_score(cache_dic, current) elif cache_dic['cache_type'] == 'similarity': # why don't we calculate similarity score? # This is natural but we find it cost **TOO MUCH TIME**, cause in DiT series models, you can calculate similarity for scoring every where. score = similarity_score(cache_dic, current, tokens) elif cache_dic['cache_type'] == 'norm': # an interesting exploration, but not used in the final version. # use norm as the selectioon method is probably because of the norm of the tokens may indicate the importance of the token. but it is not the case. score = norm_score(cache_dic, current, tokens) elif cache_dic['cache_type'] == 'compress': # if you want to combine any of the methods mentioned, we have not tried this yet hhh. 
score1 = torch.rand(int(tokens.shape[0]*0.5), tokens.shape[1]) score1 = torch.cat([score1, score1], dim=0).to(tokens.device) score2 = cache_dic['attn_map'][-1][current['layer']].sum(dim=1)#.mean(dim=0) # (B, N) # normalize score2 = score2 / score2.max(dim=1, keepdim=True)[0] score = 0.5 * score1 + 0.5 * score2 # abandon the branch, if you want to explore the local force fresh strategy, this may help. #if ((not current['is_force_fresh']) and (cache_dic['force_fresh'] == 'local')): # current['is_force_fresh'] is False, cause when it is True, no cut and fresh are needed # #print(torch.ones_like(force_indices, dtype=float, device=force_indices.device).dtype) # score.scatter_(dim=1, index=force_indices, src=torch.ones_like(force_indices, dtype=torch.float32, # device=force_indices.device)) if (True and (cache_dic['force_fresh'] == 'global')): # apply s3 mentioned in the paper, the "True" above is for a switch to turn on/off the s3. soft_step_score = cache_dic['cache_index'][-1][current['layer']][current['module']].float() / (cache_dic['fresh_threshold']) # layer wise s3, not used in the final version. seems it is not necessary to add if step wise is applied. 
#soft_layer_score = cache_dic['cache_index']['layer_index'][current['module']].float() / (27) score = score + cache_dic['soft_fresh_weight'] * soft_step_score #+ 0.1 *soft_layer_score #cfg_score, no_cfg_score = torch.split(score, len(score)//2, dim = 0) #score = 0.5* cfg_score + 0.5* no_cfg_score #score = torch.cat([score,score], dim=0) return score.to(tokens.device) ================================================ FILE: DiT-ToCa/cache_functions/scores.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F def attn_score(cache_dic, current): ''' Attention Score s1 (s2, but dit doesn't contain cross-attention for s2) ''' #self_attn_score = 1- cache_dic['attn_map'][-1][current['layer']].diagonal(dim1=1, dim2=2) #self_attn_score = F.normalize(self_attn_score, dim=1, p=2) attention_score = F.normalize(cache_dic['attn_map'][-1][current['layer']].sum(dim=1), dim=1, p=2) #score = self_attn_score score = attention_score return score def similarity_score(cache_dic, current, tokens): cosine_sim = F.cosine_similarity(tokens, cache_dic['cache'][-1][current['layer']][current['module']], dim=-1) return F.normalize(1- cosine_sim, dim=-1, p=2) def norm_score(cache_dic, current, tokens): norm = tokens.norm(dim=-1, p=2) return F.normalize(norm, dim=-1, p=2) def kv_norm_score(cache_dic, current): # (B, num_heads, N) #k_norm = cache_dic['cache'][-1][current['layer']]['k_norm'] v_norm = cache_dic['cache'][-1][current['layer']]['v_norm'] kv_norm = 1- v_norm return F.normalize(kv_norm.sum(dim = -2), p=2) ================================================ FILE: DiT-ToCa/cache_functions/token_merge.py ================================================ import torch def token_merge(cache_dic, tokens, current, fresh_indices, stale_indices): ''' An abandoned branch in exploring if token merge helps. The answer is no, at least no for training-free strategy. 
''' if (current['layer'] % 1 == 0): fresh_tokens = torch.gather(input = tokens, dim = 1, index = fresh_indices.unsqueeze(-1).expand(-1, -1, tokens.shape[-1])) stale_tokens = torch.gather(input = tokens, dim = 1, index = stale_indices.unsqueeze(-1).expand(-1, -1, tokens.shape[-1])) method = 'similarity' if method == 'distance': descending = False distance = torch.cdist(stale_tokens, fresh_tokens, p=1) stale_fresh_dist, stale_fresh_indices_allstale = torch.min(distance, dim=2) elif method == 'similarity': descending = True fresh_tokens = torch.nn.functional.normalize(fresh_tokens, p=2, dim=-1) stale_tokens = torch.nn.functional.normalize(stale_tokens, p=2, dim=-1) similarity = stale_tokens @ fresh_tokens.transpose(1, 2) stale_fresh_dist, stale_fresh_indices_allstale = torch.max(similarity, dim=2) saved_topk_stale = int((stale_fresh_dist > 0.995).sum(dim=1).min()) merged_stale_sequence = torch.sort(stale_fresh_dist, dim=1, descending=descending)[1][:,:saved_topk_stale] stale_fresh_indices = stale_fresh_indices_allstale.gather(1, merged_stale_sequence) merged_stale_sequence = stale_indices.gather(1, merged_stale_sequence) merged_stale_fresh_indices = fresh_indices.gather(1, stale_fresh_indices) cache_dic['merged_stale_fresh_indices'] = merged_stale_fresh_indices cache_dic['merged_stale_sequence'] = merged_stale_sequence ================================================ FILE: DiT-ToCa/cache_functions/update_cache.py ================================================ import torch def update_cache(fresh_indices, fresh_tokens, cache_dic, current, fresh_attn_map=None): ''' Update the cache with the fresh tokens. ''' step = current['step'] layer = current['layer'] module = current['module'] # Update the cached tokens at the positions if module == 'attn': # this branch is not used in the final version, but if you explore the partial fresh strategy of attention, it works. 
indices = fresh_indices.sort(dim=1, descending=False)[0] cache_dic['attn_map'][-1][layer].scatter_(dim=1, index=indices.unsqueeze(-1).expand(-1, -1, fresh_attn_map.shape[-1]), src=fresh_attn_map) elif module == 'mlp': indices = fresh_indices cache_dic['cache'][-1][layer][module].scatter_(dim=1, index=indices.unsqueeze(-1).expand(-1, -1, fresh_tokens.shape[-1]), src=fresh_tokens) ================================================ FILE: DiT-ToCa/diffusion/__init__.py ================================================ # Modified from OpenAI's diffusion repos # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py from . import gaussian_diffusion as gd from .respace import SpacedDiffusion, space_timesteps def create_diffusion( timestep_respacing, noise_schedule="linear", use_kl=False, sigma_small=False, predict_xstart=False, learn_sigma=True, rescale_learned_sigmas=False, diffusion_steps=1000 ): betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps) if use_kl: loss_type = gd.LossType.RESCALED_KL elif rescale_learned_sigmas: loss_type = gd.LossType.RESCALED_MSE else: loss_type = gd.LossType.MSE if timestep_respacing is None or timestep_respacing == "": timestep_respacing = [diffusion_steps] return SpacedDiffusion( use_timesteps=space_timesteps(diffusion_steps, timestep_respacing), betas=betas, model_mean_type=( gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X ), model_var_type=( ( gd.ModelVarType.FIXED_LARGE if not sigma_small else gd.ModelVarType.FIXED_SMALL ) if not learn_sigma else gd.ModelVarType.LEARNED_RANGE ), loss_type=loss_type # rescale_timesteps=rescale_timesteps, ) ================================================ FILE: DiT-ToCa/diffusion/diffusion_utils.py ================================================ 
# Modified from OpenAI's diffusion repos # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py import torch as th import numpy as np def normal_kl(mean1, logvar1, mean2, logvar2): """ Compute the KL divergence between two gaussians. Shapes are automatically broadcasted, so batches can be compared to scalars, among other use cases. """ tensor = None for obj in (mean1, logvar1, mean2, logvar2): if isinstance(obj, th.Tensor): tensor = obj break assert tensor is not None, "at least one argument must be a Tensor" # Force variances to be Tensors. Broadcasting helps convert scalars to # Tensors, but it does not work for th.exp(). logvar1, logvar2 = [ x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor) for x in (logvar1, logvar2) ] return 0.5 * ( -1.0 + logvar2 - logvar1 + th.exp(logvar1 - logvar2) + ((mean1 - mean2) ** 2) * th.exp(-logvar2) ) def approx_standard_normal_cdf(x): """ A fast approximation of the cumulative distribution function of the standard normal. """ return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3)))) def continuous_gaussian_log_likelihood(x, *, means, log_scales): """ Compute the log-likelihood of a continuous Gaussian distribution. :param x: the targets :param means: the Gaussian mean Tensor. :param log_scales: the Gaussian log stddev Tensor. :return: a tensor like x of log probabilities (in nats). """ centered_x = x - means inv_stdv = th.exp(-log_scales) normalized_x = centered_x * inv_stdv log_probs = th.distributions.Normal(th.zeros_like(x), th.ones_like(x)).log_prob(normalized_x) return log_probs def discretized_gaussian_log_likelihood(x, *, means, log_scales): """ Compute the log-likelihood of a Gaussian distribution discretizing to a given image. :param x: the target images. 
It is assumed that this was uint8 values, rescaled to the range [-1, 1]. :param means: the Gaussian mean Tensor. :param log_scales: the Gaussian log stddev Tensor. :return: a tensor like x of log probabilities (in nats). """ assert x.shape == means.shape == log_scales.shape centered_x = x - means inv_stdv = th.exp(-log_scales) plus_in = inv_stdv * (centered_x + 1.0 / 255.0) cdf_plus = approx_standard_normal_cdf(plus_in) min_in = inv_stdv * (centered_x - 1.0 / 255.0) cdf_min = approx_standard_normal_cdf(min_in) log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12)) log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12)) cdf_delta = cdf_plus - cdf_min log_probs = th.where( x < -0.999, log_cdf_plus, th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))), ) assert log_probs.shape == x.shape return log_probs ================================================ FILE: DiT-ToCa/diffusion/gaussian_diffusion.py ================================================ # Modified from OpenAI's diffusion repos # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py import math import numpy as np import torch as th import enum from .diffusion_utils import discretized_gaussian_log_likelihood, normal_kl from cache_functions import cache_init def mean_flat(tensor): """ Take the mean over all non-batch dimensions. """ return tensor.mean(dim=list(range(1, len(tensor.shape)))) class ModelMeanType(enum.Enum): """ Which type of output the model predicts. """ PREVIOUS_X = enum.auto() # the model predicts x_{t-1} START_X = enum.auto() # the model predicts x_0 EPSILON = enum.auto() # the model predicts epsilon class ModelVarType(enum.Enum): """ What is used as the model's output variance. 
The LEARNED_RANGE option has been added to allow the model to predict values between FIXED_SMALL and FIXED_LARGE, making its job easier. """ LEARNED = enum.auto() FIXED_SMALL = enum.auto() FIXED_LARGE = enum.auto() LEARNED_RANGE = enum.auto() class LossType(enum.Enum): MSE = enum.auto() # use raw MSE loss (and KL when learning variances) RESCALED_MSE = ( enum.auto() ) # use raw MSE loss (with RESCALED_KL when learning variances) KL = enum.auto() # use the variational lower-bound RESCALED_KL = enum.auto() # like KL, but rescale to estimate the full VLB def is_vb(self): return self == LossType.KL or self == LossType.RESCALED_KL def _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, warmup_frac): betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64) warmup_time = int(num_diffusion_timesteps * warmup_frac) betas[:warmup_time] = np.linspace(beta_start, beta_end, warmup_time, dtype=np.float64) return betas def get_beta_schedule(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps): """ This is the deprecated API for creating beta schedules. See get_named_beta_schedule() for the new library of schedules. 
""" if beta_schedule == "quad": betas = ( np.linspace( beta_start ** 0.5, beta_end ** 0.5, num_diffusion_timesteps, dtype=np.float64, ) ** 2 ) elif beta_schedule == "linear": betas = np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64) elif beta_schedule == "warmup10": betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.1) elif beta_schedule == "warmup50": betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.5) elif beta_schedule == "const": betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64) elif beta_schedule == "jsd": # 1/T, 1/(T-1), 1/(T-2), ..., 1 betas = 1.0 / np.linspace( num_diffusion_timesteps, 1, num_diffusion_timesteps, dtype=np.float64 ) else: raise NotImplementedError(beta_schedule) assert betas.shape == (num_diffusion_timesteps,) return betas def get_named_beta_schedule(schedule_name, num_diffusion_timesteps): """ Get a pre-defined beta schedule for the given name. The beta schedule library consists of beta schedules which remain similar in the limit of num_diffusion_timesteps. Beta schedules may be added, but should not be removed or changed once they are committed to maintain backwards compatibility. """ if schedule_name == "linear": # Linear schedule from Ho et al, extended to work for any number of # diffusion steps. scale = 1000 / num_diffusion_timesteps return get_beta_schedule( "linear", beta_start=scale * 0.0001, beta_end=scale * 0.02, num_diffusion_timesteps=num_diffusion_timesteps, ) elif schedule_name == "squaredcos_cap_v2": return betas_for_alpha_bar( num_diffusion_timesteps, lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2, ) else: raise NotImplementedError(f"unknown beta schedule: {schedule_name}") def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. 
:param num_diffusion_timesteps: the number of betas to produce. :param alpha_bar: a lambda that takes an argument t from 0 to 1 and produces the cumulative product of (1-beta) up to that part of the diffusion process. :param max_beta: the maximum beta to use; use values lower than 1 to prevent singularities. """ betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) return np.array(betas) class GaussianDiffusion: """ Utilities for training and sampling diffusion models. Original ported from this codebase: https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42 :param betas: a 1-D numpy array of betas for each diffusion timestep, starting at T and going to 1. """ def __init__( self, *, betas, model_mean_type, model_var_type, loss_type ): self.model_mean_type = model_mean_type self.model_var_type = model_var_type self.loss_type = loss_type # Use float64 for accuracy. 
betas = np.array(betas, dtype=np.float64) self.betas = betas assert len(betas.shape) == 1, "betas must be 1-D" assert (betas > 0).all() and (betas <= 1).all() self.num_timesteps = int(betas.shape[0]) alphas = 1.0 - betas self.alphas_cumprod = np.cumprod(alphas, axis=0) self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1]) self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0) assert self.alphas_cumprod_prev.shape == (self.num_timesteps,) # calculations for diffusion q(x_t | x_{t-1}) and others self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod) self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod) self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod) self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod) self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1) # calculations for posterior q(x_{t-1} | x_t, x_0) self.posterior_variance = ( betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod) ) # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain self.posterior_log_variance_clipped = np.log( np.append(self.posterior_variance[1], self.posterior_variance[1:]) ) if len(self.posterior_variance) > 1 else np.array([]) self.posterior_mean_coef1 = ( betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod) ) self.posterior_mean_coef2 = ( (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod) ) def q_mean_variance(self, x_start, t): """ Get the distribution q(x_t | x_0). :param x_start: the [N x C x ...] tensor of noiseless inputs. :param t: the number of diffusion steps (minus 1). Here, 0 means one step. :return: A tuple (mean, variance, log_variance), all of x_start's shape. 
""" mean = _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape) log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape) return mean, variance, log_variance def q_sample(self, x_start, t, noise=None): """ Diffuse the data for a given number of diffusion steps. In other words, sample from q(x_t | x_0). :param x_start: the initial data batch. :param t: the number of diffusion steps (minus 1). Here, 0 means one step. :param noise: if specified, the split-out normal noise. :return: A noisy version of x_start. """ if noise is None: noise = th.randn_like(x_start) assert noise.shape == x_start.shape return ( _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise ) def q_posterior_mean_variance(self, x_start, x_t, t): """ Compute the mean and variance of the diffusion posterior: q(x_{t-1} | x_t, x_0) """ assert x_start.shape == x_t.shape posterior_mean = ( _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t ) posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape) posterior_log_variance_clipped = _extract_into_tensor( self.posterior_log_variance_clipped, t, x_t.shape ) assert ( posterior_mean.shape[0] == posterior_variance.shape[0] == posterior_log_variance_clipped.shape[0] == x_start.shape[0] ) return posterior_mean, posterior_variance, posterior_log_variance_clipped def p_mean_variance(self, model, x, t, current=None, cache_dic=None, clip_denoised=True, denoised_fn=None, model_kwargs=None): #def p_mean_variance(self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None): """ Apply the model to get p(x_{t-1} | x_t), as well as a prediction of the initial x, x_0. 
:param model: the model, which takes a signal and a batch of timesteps as input. :param x: the [N x C x ...] tensor at time t. :param t: a 1-D Tensor of timesteps. :param clip_denoised: if True, clip the denoised signal into [-1, 1]. :param denoised_fn: if not None, a function which applies to the x_start prediction before it is used to sample. Applies before clip_denoised. :param model_kwargs: if not None, a dict of extra keyword arguments to pass to the model. This can be used for conditioning. :return: a dict with the following keys: - 'mean': the model mean output. - 'variance': the model variance output. - 'log_variance': the log of 'variance'. - 'pred_xstart': the prediction for x_0. """ if model_kwargs is None: model_kwargs = {} B, C = x.shape[:2] assert t.shape == (B,) model_output = model(x, t, current=current, cache_dic=cache_dic, **model_kwargs) if isinstance(model_output, tuple): model_output, extra = model_output else: extra = None if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]: assert model_output.shape == (B, C * 2, *x.shape[2:]) model_output, model_var_values = th.split(model_output, C, dim=1) min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape) max_log = _extract_into_tensor(np.log(self.betas), t, x.shape) # The model_var_values is [-1, 1] for [min_var, max_var]. frac = (model_var_values + 1) / 2 model_log_variance = frac * max_log + (1 - frac) * min_log model_variance = th.exp(model_log_variance) else: model_variance, model_log_variance = { # for fixedlarge, we set the initial (log-)variance like so # to get a better decoder log likelihood. 
ModelVarType.FIXED_LARGE: ( np.append(self.posterior_variance[1], self.betas[1:]), np.log(np.append(self.posterior_variance[1], self.betas[1:])), ), ModelVarType.FIXED_SMALL: ( self.posterior_variance, self.posterior_log_variance_clipped, ), }[self.model_var_type] model_variance = _extract_into_tensor(model_variance, t, x.shape) model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape) def process_xstart(x): if denoised_fn is not None: x = denoised_fn(x) if clip_denoised: return x.clamp(-1, 1) return x if self.model_mean_type == ModelMeanType.START_X: pred_xstart = process_xstart(model_output) else: pred_xstart = process_xstart( self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output) ) model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t) assert model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape return { "mean": model_mean, "variance": model_variance, "log_variance": model_log_variance, "pred_xstart": pred_xstart, "extra": extra, } def _predict_xstart_from_eps(self, x_t, t, eps): assert x_t.shape == eps.shape return ( _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps ) def _predict_eps_from_xstart(self, x_t, t, pred_xstart): return ( _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None): """ Compute the mean for the previous step, given a function cond_fn that computes the gradient of a conditional log probability with respect to x. In particular, cond_fn computes grad(log(p(y|x))), and we want to condition on y. This uses the conditioning strategy from Sohl-Dickstein et al. (2015). 
""" gradient = cond_fn(x, t, **model_kwargs) new_mean = p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float() return new_mean def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None): """ Compute what the p_mean_variance output would have been, should the model's score function be conditioned by cond_fn. See condition_mean() for details on cond_fn. Unlike condition_mean(), this instead uses the conditioning strategy from Song et al (2020). """ alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"]) eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, t, **model_kwargs) out = p_mean_var.copy() out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps) out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t) return out def p_sample( self, model, x, t, clip_denoised=True, current=None, cache_dic=None, denoised_fn=None, cond_fn=None, model_kwargs=None, ): """ Sample x_{t-1} from the model at the given timestep. :param model: the model to sample from. :param x: the current tensor at x_{t-1}. :param t: the value of t, starting at 0 for the first diffusion step. :param clip_denoised: if True, clip the x_start prediction to [-1, 1]. :param denoised_fn: if not None, a function which applies to the x_start prediction before it is used to sample. :param cond_fn: if not None, this is a gradient function that acts similarly to the model. :param model_kwargs: if not None, a dict of extra keyword arguments to pass to the model. This can be used for conditioning. :return: a dict containing the following keys: - 'sample': a random sample from the model. - 'pred_xstart': a prediction of x_0. 
def p_sample_loop(
    self,
    model,
    shape,
    noise=None,
    clip_denoised=True,
    denoised_fn=None,
    cond_fn=None,
    model_kwargs=None,
    device=None,
    progress=False,
):
    """
    Run the full ancestral sampling chain and return the final sample.

    This drains p_sample_loop_progressive() and keeps only the last
    yielded dict.

    :param model: the model module.
    :param shape: the shape of the samples, (N, C, H, W).
    :param noise: if specified, the noise from the encoder to sample.
                  Should be of the same shape as `shape`.
    :param clip_denoised: if True, clip x_start predictions to [-1, 1].
    :param denoised_fn: if not None, a function which applies to the
                        x_start prediction before it is used to sample.
    :param cond_fn: if not None, this is a gradient function that acts
                    similarly to the model.
    :param model_kwargs: if not None, a dict of extra keyword arguments to
                         pass to the model. This can be used for conditioning.
    :param device: if specified, the device to create the samples on.
                   If not specified, use a model parameter's device.
    :param progress: if True, show a tqdm progress bar.
    :return: a non-differentiable batch of samples.
    """
    steps = self.p_sample_loop_progressive(
        model,
        shape,
        noise=noise,
        clip_denoised=clip_denoised,
        denoised_fn=denoised_fn,
        cond_fn=cond_fn,
        model_kwargs=model_kwargs,
        device=device,
        progress=progress,
    )
    last = None
    for last in steps:
        # Only the final step's output is needed.
        pass
    return last["sample"]
def ddim_reverse_sample(
    self,
    model,
    x,
    t,
    clip_denoised=True,
    denoised_fn=None,
    cond_fn=None,
    model_kwargs=None,
    eta=0.0,
):
    """
    Sample x_{t+1} from the model using DDIM reverse ODE.

    Only the deterministic path (eta == 0.0) is supported.

    :return: a dict with "sample" (the x_{t+1} estimate) and
             "pred_xstart" (the x_0 prediction used to build it).
    """
    assert eta == 0.0, "Reverse ODE only for deterministic path"
    # NOTE(review): unlike ddim_sample(), no `current`/`cache_dic` are
    # forwarded here - confirm p_mean_variance's defaults cover that.
    out = self.p_mean_variance(
        model,
        x,
        t,
        clip_denoised=clip_denoised,
        denoised_fn=denoised_fn,
        model_kwargs=model_kwargs,
    )
    if cond_fn is not None:
        out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
    pred_xstart = out["pred_xstart"]

    # Usually our model outputs epsilon, but we re-derive it
    # in case we used x_start or x_prev prediction.
    recip_coef = _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape)
    recipm1_coef = _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
    eps = (recip_coef * x - pred_xstart) / recipm1_coef

    alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)

    # Equation 12. reversed
    signal_part = pred_xstart * th.sqrt(alpha_bar_next)
    noise_part = th.sqrt(1 - alpha_bar_next) * eps
    mean_pred = signal_part + noise_part

    return {"sample": mean_pred, "pred_xstart": pred_xstart}
def training_losses(self, model, x_start, t, model_kwargs=None, noise=None):
    """
    Compute training losses for a single timestep.

    The branch taken depends on self.loss_type:
    KL / RESCALED_KL use the variational-bound term from _vb_terms_bpd();
    MSE / RESCALED_MSE regress the model output onto a target selected by
    self.model_mean_type, optionally adding a "vb" term when the variance
    is learned.

    :param model: the model to evaluate loss on.
    :param x_start: the [N x C x ...] tensor of inputs.
    :param t: a batch of timestep indices.
    :param model_kwargs: if not None, a dict of extra keyword arguments to
        pass to the model. This can be used for conditioning.
    :param noise: if specified, the specific Gaussian noise to try to remove.
    :return: a dict with the key "loss" containing a tensor of shape [N].
             Some mean or variance settings may also have other keys
             ("mse", and "vb" when the variance is learned).
    :raises NotImplementedError: if self.loss_type is none of the handled
             loss types.
    """
    # Normalise the optional arguments before use.
    if model_kwargs is None:
        model_kwargs = {}
    if noise is None:
        noise = th.randn_like(x_start)
    # Diffuse x_start to timestep t with the chosen noise.
    x_t = self.q_sample(x_start, t, noise=noise)

    terms = {}

    if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
        # Pure variational-bound objective.
        terms["loss"] = self._vb_terms_bpd(
            model=model,
            x_start=x_start,
            x_t=x_t,
            t=t,
            clip_denoised=False,
            model_kwargs=model_kwargs,
        )["output"]
        if self.loss_type == LossType.RESCALED_KL:
            terms["loss"] *= self.num_timesteps
    elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
        model_output = model(x_t, t, **model_kwargs)

        if self.model_var_type in [
            ModelVarType.LEARNED,
            ModelVarType.LEARNED_RANGE,
        ]:
            # With a learned variance the model emits 2*C channels:
            # the first C are the mean-type prediction, the rest the variance values.
            B, C = x_t.shape[:2]
            assert model_output.shape == (B, C * 2, *x_t.shape[2:])
            model_output, model_var_values = th.split(model_output, C, dim=1)
            # Learn the variance using the variational bound, but don't let
            # it affect our mean prediction.
            frozen_out = th.cat([model_output.detach(), model_var_values], dim=1)
            # The lambda stands in for the model and always returns frozen_out,
            # so no second forward pass is made.
            terms["vb"] = self._vb_terms_bpd(
                model=lambda *args, r=frozen_out: r,
                x_start=x_start,
                x_t=x_t,
                t=t,
                clip_denoised=False,
            )["output"]
            if self.loss_type == LossType.RESCALED_MSE:
                # Divide by 1000 for equivalence with initial implementation.
                # Without a factor of 1/1000, the VB term hurts the MSE term.
                terms["vb"] *= self.num_timesteps / 1000.0

        # Regression target depends on what the model is parameterised to predict.
        target = {
            ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(
                x_start=x_start, x_t=x_t, t=t
            )[0],
            ModelMeanType.START_X: x_start,
            ModelMeanType.EPSILON: noise,
        }[self.model_mean_type]
        assert model_output.shape == target.shape == x_start.shape
        terms["mse"] = mean_flat((target - model_output) ** 2)
        # Total loss is MSE, plus the variational term when one was computed.
        if "vb" in terms:
            terms["loss"] = terms["mse"] + terms["vb"]
        else:
            terms["loss"] = terms["mse"]
    else:
        raise NotImplementedError(self.loss_type)

    return terms
def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
    """
    Compute the entire variational lower-bound, measured in bits-per-dim,
    as well as other related quantities.

    Timesteps are visited from the last one down to 0; at each step fresh
    noise is drawn, x_start is diffused to that step, and the bound term
    plus two MSE diagnostics are recorded.

    :param model: the model to evaluate loss on.
    :param x_start: the [N x C x ...] tensor of inputs.
    :param clip_denoised: if True, clip denoised samples.
    :param model_kwargs: if not None, a dict of extra keyword arguments to
        pass to the model. This can be used for conditioning.
    :return: a dict containing the following keys:
             - total_bpd: the total variational lower-bound, per batch element.
             - prior_bpd: the prior term in the lower-bound.
             - vb: an [N x T] tensor of terms in the lower-bound.
             - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
             - mse: an [N x T] tensor of epsilon MSEs for each timestep.
    """
    device = x_start.device
    batch_size = x_start.shape[0]

    vb_terms, xstart_errors, eps_errors = [], [], []
    for step in reversed(range(self.num_timesteps)):
        t_batch = th.tensor([step] * batch_size, device=device)
        noise = th.randn_like(x_start)
        x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)

        # Calculate VLB term at the current timestep
        with th.no_grad():
            out = self._vb_terms_bpd(
                model,
                x_start=x_start,
                x_t=x_t,
                t=t_batch,
                clip_denoised=clip_denoised,
                model_kwargs=model_kwargs,
            )
        pred_xstart = out["pred_xstart"]

        vb_terms.append(out["output"])
        xstart_errors.append(mean_flat((pred_xstart - x_start) ** 2))
        eps = self._predict_eps_from_xstart(x_t, t_batch, pred_xstart)
        eps_errors.append(mean_flat((eps - noise) ** 2))

    # One column per visited timestep.
    vb = th.stack(vb_terms, dim=1)
    xstart_mse = th.stack(xstart_errors, dim=1)
    mse = th.stack(eps_errors, dim=1)

    prior_bpd = self._prior_bpd(x_start)
    total_bpd = vb.sum(dim=1) + prior_bpd
    return {
        "total_bpd": total_bpd,
        "prior_bpd": prior_bpd,
        "vb": vb,
        "xstart_mse": xstart_mse,
        "mse": mse,
    }
def space_timesteps(num_timesteps, section_counts):
    """
    Create a list of timesteps to use from an original diffusion process,
    given the number of timesteps we want to take from equally-sized portions
    of the original process.

    For example, if there's 300 timesteps and the section counts are [10,15,20]
    then the first 100 timesteps are strided to be 10 timesteps, the second 100
    are strided to be 15 timesteps, and the final 100 are strided to be 20.

    If the stride is a string starting with "ddim", then the fixed striding
    from the DDIM paper is used, and only one section is allowed.

    :param num_timesteps: the number of diffusion steps in the original
                          process to divide up.
    :param section_counts: either a list of numbers, or a string containing
                           comma-separated numbers, indicating the step count
                           per section. As a special case, use "ddimN" where N
                           is a number of steps to use the striding from the
                           DDIM paper.
    :return: a set of diffusion steps from the original process to use.
    :raises ValueError: if no integer stride yields exactly N steps for
                        "ddimN", or if a section is asked for more steps
                        than it contains.
    """
    if isinstance(section_counts, str):
        if section_counts.startswith("ddim"):
            desired_count = int(section_counts[len("ddim"):])
            # Look for an integer stride that produces exactly the requested count.
            for stride in range(1, num_timesteps):
                strided = range(0, num_timesteps, stride)
                if len(strided) == desired_count:
                    return set(strided)
            # Report the count that was requested, not the length of the
            # original process.
            raise ValueError(
                f"cannot create exactly {desired_count} steps with an integer stride"
            )
        section_counts = [int(x) for x in section_counts.split(",")]

    # Split the original process into near-equal sections; the first
    # `extra` sections absorb the remainder, one step each.
    size_per = num_timesteps // len(section_counts)
    extra = num_timesteps % len(section_counts)
    start_idx = 0
    all_steps = []
    for i, section_count in enumerate(section_counts):
        size = size_per + (1 if i < extra else 0)
        if size < section_count:
            raise ValueError(
                f"cannot divide section of {size} steps into {section_count}"
            )
        # Fractional stride so the first and last step of the section are both taken.
        if section_count <= 1:
            frac_stride = 1
        else:
            frac_stride = (size - 1) / (section_count - 1)
        cur_idx = 0.0
        taken_steps = []
        for _ in range(section_count):
            taken_steps.append(start_idx + round(cur_idx))
            cur_idx += frac_stride
        all_steps += taken_steps
        start_idx += size
    return set(all_steps)
def __init__(self, use_timesteps, **kwargs):
    """
    Build a diffusion process restricted to the retained timesteps.

    A base GaussianDiffusion is created from kwargs to obtain its
    alphas_cumprod; for every retained step a new beta is derived from the
    ratio of consecutive retained cumulative alphas. The resulting betas
    replace kwargs["betas"] before the parent constructor runs, and
    timestep_map records the original index of each retained step.

    :param use_timesteps: a collection (sequence or set) of timesteps from
                          the original diffusion process to retain.
    :param kwargs: the kwargs to create the base diffusion process.
    """
    self.use_timesteps = set(use_timesteps)
    self.timestep_map = []
    self.original_num_steps = len(kwargs["betas"])

    base_diffusion = GaussianDiffusion(**kwargs)  # pylint: disable=missing-kwoa
    retained_betas = []
    prev_cumprod = 1.0
    for step, cumprod in enumerate(base_diffusion.alphas_cumprod):
        if step not in self.use_timesteps:
            continue
        retained_betas.append(1 - cumprod / prev_cumprod)
        prev_cumprod = cumprod
        self.timestep_map.append(step)

    kwargs["betas"] = np.array(retained_betas)
    super().__init__(**kwargs)
class _WrappedModel:
    """
    Callable adapter that maps spaced timestep indices back to the indices
    of the original diffusion process before invoking the wrapped model.
    """

    def __init__(self, model, timestep_map, original_num_steps):
        """
        :param model: the callable to wrap, invoked as model(x, ts, **kwargs).
        :param timestep_map: sequence whose i-th entry is the original
                             timestep for spaced index i.
        :param original_num_steps: length of the original schedule; stored
                                   but not read by __call__ (timestep
                                   rescaling is disabled).
        """
        self.timestep_map = timestep_map
        self.original_num_steps = original_num_steps
        self.model = model

    def __call__(self, x, ts, **kwargs):
        """
        Translate ts through timestep_map and forward everything to the model.

        The lookup table is created on ts's device with ts's dtype.
        """
        lookup = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
        original_ts = lookup[ts]
        return self.model(x, original_ts, **kwargs)
def sample(self, batch_size, device):
    """
    Importance-sample timesteps for a batch.

    Timesteps are drawn with probability proportional to self.weights();
    each draw is paired with the weight 1 / (num_steps * p) so that the
    reweighted objective keeps the same mean.

    :param batch_size: the number of timesteps.
    :param device: the torch device to save to.
    :return: a tuple (timesteps, weights):
             - timesteps: a tensor of timestep indices.
             - weights: a tensor of weights to scale the resulting losses.
    """
    raw_weights = self.weights()
    probs = raw_weights / np.sum(raw_weights)
    num_steps = len(probs)

    picked = np.random.choice(num_steps, size=(batch_size,), p=probs)
    importance = 1 / (num_steps * probs[picked])

    indices = th.from_numpy(picked).long().to(device)
    weights = th.from_numpy(importance).float().to(device)
    return indices, weights
class LossSecondMomentResampler(LossAwareSampler):
    """
    Timestep sampler that weights each step by the root-mean-square of its
    most recently observed losses.

    A fixed-size history of losses is kept per timestep. Until every
    timestep has a full history, weights() is uniform.
    """

    def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
        """
        :param diffusion: the diffusion object; only num_timesteps is read.
        :param history_per_term: how many recent losses to keep per timestep.
        :param uniform_prob: fraction of probability mass spread uniformly
                             over all timesteps once warmed up.
        """
        self.diffusion = diffusion
        self.history_per_term = history_per_term
        self.uniform_prob = uniform_prob
        self._loss_history = np.zeros(
            [diffusion.num_timesteps, history_per_term], dtype=np.float64
        )
        # `np.int` was only an alias of the builtin int and was removed in
        # NumPy 1.24; the builtin gives the same dtype.
        self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=int)

    def weights(self):
        """
        Return the per-timestep sampling weights.

        Uniform ones until warmed up; afterwards the normalised RMS of each
        timestep's loss history, mixed with a uniform floor of uniform_prob.
        """
        if not self._warmed_up():
            return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
        weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1))
        weights /= np.sum(weights)
        weights *= 1 - self.uniform_prob
        weights += self.uniform_prob / len(weights)
        return weights

    def update_with_all_losses(self, ts, losses):
        """
        Record one loss per timestep into the history buffers.

        :param ts: a list of int timesteps.
        :param losses: a list of float losses, one per timestep.
        """
        for t, loss in zip(ts, losses):
            if self._loss_counts[t] == self.history_per_term:
                # Shift out the oldest loss term.
                self._loss_history[t, :-1] = self._loss_history[t, 1:]
                self._loss_history[t, -1] = loss
            else:
                # History not full yet: fill the next free slot.
                self._loss_history[t, self._loss_counts[t]] = loss
                self._loss_counts[t] += 1

    def _warmed_up(self):
        """Return True once every timestep has a full loss history."""
        return (self._loss_counts == self.history_per_term).all()
""" assert model_name in pretrained_models local_path = f'pretrained_models/{model_name}' if not os.path.isfile(local_path): os.makedirs('pretrained_models', exist_ok=True) web_path = f'https://dl.fbaipublicfiles.com/DiT/models/{model_name}' download_url(web_path, 'pretrained_models') model = torch.load(local_path, map_location=lambda storage, loc: storage) return model if __name__ == "__main__": # Download all DiT checkpoints for model in pretrained_models: download_model(model) print('Done.') ================================================ FILE: DiT-ToCa/environment-dit.yml ================================================ name: base channels: - pytorch - nvidia - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/ - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch/ - defaults dependencies: - _libgcc_mutex=0.1=main - _openmp_mutex=5.1=1_gnu - aiohttp=3.9.5=py312h5eee18b_0 - aiosignal=1.2.0=pyhd3eb1b0_0 - anaconda-anon-usage=0.4.4=py312hfc0e8ea_100 - archspec=0.2.3=pyhd3eb1b0_0 - arrow-cpp=16.1.0=hc1eb8f0_0 - aws-c-auth=0.6.19=h5eee18b_0 - aws-c-cal=0.5.20=hdbd6064_0 - aws-c-common=0.8.5=h5eee18b_0 - aws-c-compression=0.2.16=h5eee18b_0 - aws-c-event-stream=0.2.15=h6a678d5_0 - aws-c-http=0.6.25=h5eee18b_0 - aws-c-io=0.13.10=h5eee18b_0 - aws-c-mqtt=0.7.13=h5eee18b_0 - aws-c-s3=0.1.51=hdbd6064_0 - aws-c-sdkutils=0.1.6=h5eee18b_0 - aws-checksums=0.1.13=h5eee18b_0 - aws-crt-cpp=0.18.16=h6a678d5_0 - aws-sdk-cpp=1.10.55=h721c034_0 - blas=1.0=mkl - boltons=23.0.0=py312h06a4308_0 - boost-cpp=1.82.0=hdb19cb5_2 - bottleneck=1.3.7=py312ha883a20_0 - brotli-python=1.0.9=py312h6a678d5_8 - bzip2=1.0.8=h5eee18b_6 - c-ares=1.19.1=h5eee18b_0 - ca-certificates=2024.7.2=h06a4308_0 - certifi=2024.7.4=py312h06a4308_0 - cffi=1.16.0=py312h5eee18b_1 - charset-normalizer=2.0.4=pyhd3eb1b0_0 - conda=24.7.1=py312h06a4308_0 - 
conda-content-trust=0.2.0=py312h06a4308_1 - conda-libmamba-solver=24.1.0=pyhd3eb1b0_0 - conda-package-handling=2.2.0=py312h06a4308_1 - conda-package-streaming=0.9.0=py312h06a4308_0 - cryptography=42.0.5=py312hdda0065_1 - cuda-cudart=12.1.105=0 - cuda-cupti=12.1.105=0 - cuda-libraries=12.1.0=0 - cuda-nvrtc=12.1.105=0 - cuda-nvtx=12.1.105=0 - cuda-opencl=12.6.37=0 - cuda-runtime=12.1.0=0 - cuda-version=12.6=3 - datasets=2.19.1=py312h06a4308_0 - diffusers=0.18.2=py312he106c6f_0 - diffusers-base=0.18.2=py312he106c6f_0 - diffusers-torch=0.18.2=py312he106c6f_0 - dill=0.3.8=py312h06a4308_0 - distro=1.9.0=py312h06a4308_0 - expat=2.6.2=h6a678d5_0 - ffmpeg=4.3=hf484d3e_0 - fmt=9.1.0=hdb19cb5_1 - freetype=2.12.1=h4a9f257_0 - frozendict=2.4.2=py312h06a4308_0 - frozenlist=1.4.0=py312h5eee18b_0 - gflags=2.2.2=h6a678d5_1 - glog=0.5.0=h6a678d5_1 - gmp=6.2.1=h295c915_3 - gnutls=3.6.15=he1e5248_0 - huggingface_accelerate=0.21.0=py312h06a4308_0 - huggingface_hub=0.23.1=py312h06a4308_0 - icu=73.1=h6a678d5_0 - idna=3.7=py312h06a4308_0 - importlib-metadata=7.0.1=py312h06a4308_0 - intel-openmp=2023.1.0=hdb19cb5_46306 - jinja2=3.1.4=py312h06a4308_0 - jpeg=9e=h5eee18b_3 - jsonpatch=1.33=py312h06a4308_1 - jsonpointer=2.1=pyhd3eb1b0_0 - krb5=1.20.1=h143b758_1 - lame=3.100=h7b6447c_0 - lcms2=2.12=h3be6417_0 - ld_impl_linux-64=2.38=h1181459_1 - lerc=3.0=h295c915_0 - libabseil=20240116.2=cxx17_h6a678d5_0 - libarchive=3.6.2=h6ac8c49_3 - libboost=1.82.0=h109eef0_2 - libbrotlicommon=1.0.9=h5eee18b_8 - libbrotlidec=1.0.9=h5eee18b_8 - libbrotlienc=1.0.9=h5eee18b_8 - libcublas=12.1.0.26=0 - libcufft=11.0.2.4=0 - libcufile=1.11.0.15=0 - libcurand=10.3.7.37=0 - libcurl=8.7.1=h251f7ec_0 - libcusolver=11.4.4.55=0 - libcusparse=12.0.2.55=0 - libdeflate=1.17=h5eee18b_1 - libedit=3.1.20230828=h5eee18b_0 - libev=4.33=h7f8727e_1 - libevent=2.1.12=hdbd6064_1 - libffi=3.4.4=h6a678d5_1 - libgcc-ng=11.2.0=h1234567_1 - libgomp=11.2.0=h1234567_1 - libgrpc=1.62.2=h2d74bed_0 - libiconv=1.16=h5eee18b_3 - 
libidn2=2.3.4=h5eee18b_0 - libjpeg-turbo=2.0.0=h9bf148f_0 - libmamba=1.5.8=hfe524e5_2 - libmambapy=1.5.8=py312h2dafd23_2 - libnghttp2=1.57.0=h2d74bed_0 - libnpp=12.0.2.50=0 - libnvjitlink=12.1.105=0 - libnvjpeg=12.1.1.14=0 - libpng=1.6.39=h5eee18b_0 - libprotobuf=4.25.3=he621ea3_0 - libsolv=0.7.24=he621ea3_1 - libssh2=1.11.0=h251f7ec_0 - libstdcxx-ng=11.2.0=h1234567_1 - libtasn1=4.19.0=h5eee18b_0 - libthrift=0.15.0=h1795dd8_2 - libtiff=4.5.1=h6a678d5_0 - libunistring=0.9.10=h27cfd23_0 - libuuid=1.41.5=h5eee18b_0 - libwebp-base=1.3.2=h5eee18b_0 - libxml2=2.10.4=hfdd30dd_2 - llvm-openmp=14.0.6=h9e868ea_0 - lz4-c=1.9.4=h6a678d5_1 - menuinst=2.0.2=py312h06a4308_1 - mkl=2023.1.0=h213fc3f_46344 - mkl-service=2.4.0=py312h5eee18b_1 - mkl_fft=1.3.8=py312h5eee18b_0 - mkl_random=1.2.4=py312hdb19cb5_0 - mpmath=1.3.0=py312h06a4308_0 - multidict=6.0.4=py312h5eee18b_0 - multiprocess=0.70.15=py312h06a4308_0 - ncurses=6.4=h6a678d5_0 - nettle=3.7.3=hbbd107a_1 - networkx=3.3=py312h06a4308_0 - numexpr=2.8.7=py312hf827012_0 - numpy=1.26.4=py312hc5e2394_0 - numpy-base=1.26.4=py312h0da6c21_0 - openh264=2.1.1=h4ff587b_0 - openjpeg=2.5.2=he7f1fd0_0 - openssl=3.0.14=h5eee18b_0 - orc=2.0.1=h2d29ad5_0 - packaging=23.2=py312h06a4308_0 - pandas=2.2.2=py312h526ad5a_0 - pcre2=10.42=hebb0a14_1 - pip=24.0=py312h06a4308_0 - platformdirs=3.10.0=py312h06a4308_0 - pluggy=1.0.0=py312h06a4308_1 - pyarrow=16.1.0=py312h526ad5a_0 - pybind11-abi=5=hd3eb1b0_0 - pycosat=0.6.6=py312h5eee18b_1 - pycparser=2.21=pyhd3eb1b0_0 - pysocks=1.7.1=py312h06a4308_0 - python=3.12.3=h996f2a0_1 - python-dateutil=2.9.0post0=py312h06a4308_2 - python-tzdata=2023.3=pyhd3eb1b0_0 - python-xxhash=2.0.2=py312h5eee18b_1 - pytorch=2.4.0=py3.12_cuda12.1_cudnn9.1.0_0 - pytorch-cuda=12.1=ha16c6d3_5 - pytorch-mutex=1.0=cuda - pytz=2024.1=py312h06a4308_0 - pyyaml=6.0.1=py312h5eee18b_0 - re2=2022.04.01=h295c915_0 - readline=8.2=h5eee18b_0 - regex=2024.7.24=py312h5eee18b_0 - reproc=14.2.4=h6a678d5_2 - reproc-cpp=14.2.4=h6a678d5_2 - 
requests=2.31.0=py312h06a4308_1 - ruamel.yaml=0.17.21=py312h5eee18b_0 - s2n=1.3.27=hdbd6064_0 - safetensors=0.4.2=py312hb7cc22b_1 - setuptools=69.5.1=py312h06a4308_0 - six=1.16.0=pyhd3eb1b0_1 - snappy=1.2.1=h6a678d5_0 - sqlite=3.45.3=h5eee18b_0 - tbb=2021.8.0=hdb19cb5_0 - tk=8.6.14=h39e8969_0 - tokenizers=0.19.1=py312ha11519a_0 - torchaudio=2.4.0=py312_cu121 - torchtriton=3.0.0=py312 - tqdm=4.66.2=py312he106c6f_0 - transformers=4.41.2=py312h06a4308_0 - truststore=0.8.0=py312h06a4308_0 - typing_extensions=4.11.0=py312h06a4308_0 - tzdata=2024a=h04d1e81_0 - urllib3=2.1.0=py312h06a4308_1 - utf8proc=2.6.1=h5eee18b_1 - wheel=0.43.0=py312h06a4308_0 - xxhash=0.8.0=h7f8727e_3 - xz=5.4.6=h5eee18b_1 - yaml=0.2.5=h7b6447c_0 - yaml-cpp=0.8.0=h6a678d5_1 - yarl=1.9.3=py312h5eee18b_0 - zipp=3.17.0=py312h06a4308_0 - zlib=1.2.13=h5eee18b_1 - zstandard=0.22.0=py312h2c38b39_0 - zstd=1.5.5=hc292b87_2 - pip: - absl-py==2.1.0 - anyio==4.4.0 - argon2-cffi==23.1.0 - argon2-cffi-bindings==21.2.0 - arrow==1.3.0 - asttokens==2.4.1 - async-lru==2.0.4 - attrs==23.2.0 - babel==2.15.0 - beautifulsoup4==4.12.3 - bleach==6.1.0 - brokenaxes==0.6.2 - comm==0.2.2 - contourpy==1.2.1 - cycler==0.12.1 - debugpy==1.8.1 - decorator==5.1.1 - defusedxml==0.7.1 - executing==2.0.1 - fastjsonschema==2.19.1 - filelock==3.14.0 - fonttools==4.53.0 - fqdn==1.5.1 - fsspec==2024.5.0 - grpcio==1.64.0 - h11==0.14.0 - httpcore==1.0.5 - httpx==0.27.0 - ipykernel==6.29.4 - ipython==8.25.0 - ipywidgets==8.1.3 - isoduration==20.11.0 - jedi==0.19.1 - json5==0.9.25 - jsonschema==4.22.0 - jsonschema-specifications==2023.12.1 - jupyter-client==8.6.2 - jupyter-core==5.7.2 - jupyter-events==0.10.0 - jupyter-lsp==2.2.5 - jupyter-server==2.14.1 - jupyter-server-terminals==0.5.3 - jupyterlab==4.2.1 - jupyterlab-language-pack-zh-cn==4.2.post1 - jupyterlab-pygments==0.3.0 - jupyterlab-server==2.27.2 - jupyterlab-widgets==3.0.11 - kiwisolver==1.4.5 - markdown==3.6 - markupsafe==2.1.5 - matplotlib==3.9.0 - matplotlib-inline==0.1.7 - 
mistune==3.0.2 - nbclient==0.10.0 - nbconvert==7.16.4 - nbformat==5.10.4 - nest-asyncio==1.6.0 - notebook-shim==0.2.4 - nvidia-cublas-cu12==12.1.3.1 - nvidia-cuda-cupti-cu12==12.1.105 - nvidia-cuda-nvrtc-cu12==12.1.105 - nvidia-cuda-runtime-cu12==12.1.105 - nvidia-cudnn-cu12==9.1.0.70 - nvidia-cufft-cu12==11.0.2.54 - nvidia-curand-cu12==10.3.2.106 - nvidia-cusolver-cu12==11.4.5.107 - nvidia-cusparse-cu12==12.1.0.106 - nvidia-nccl-cu12==2.20.5 - nvidia-nvjitlink-cu12==12.5.40 - nvidia-nvtx-cu12==12.1.105 - overrides==7.7.0 - pandocfilters==1.5.1 - parso==0.8.4 - pexpect==4.9.0 - pillow==10.3.0 - prometheus-client==0.20.0 - prompt-toolkit==3.0.45 - protobuf==5.27.0 - psutil==5.9.8 - ptyprocess==0.7.0 - pure-eval==0.2.2 - pygments==2.18.0 - pyparsing==3.1.2 - python-json-logger==2.0.7 - pytorch-fid==0.3.0 - pyzmq==26.0.3 - referencing==0.35.1 - rfc3339-validator==0.1.4 - rfc3986-validator==0.1.1 - rpds-py==0.18.1 - scipy==1.14.1 - send2trash==1.8.3 - sniffio==1.3.1 - soupsieve==2.5 - stack-data==0.6.3 - supervisor==4.2.5 - sympy==1.12.1 - tensorboard==2.16.2 - tensorboard-data-server==0.7.2 - terminado==0.18.1 - timm==1.0.8 - tinycss2==1.3.0 - torch==2.4.0 - torchvision==0.19.0 - tornado==6.4 - traitlets==5.14.3 - triton==3.0.0 - types-python-dateutil==2.9.0.20240316 - typing-extensions==4.12.1 - uri-template==1.3.0 - wcwidth==0.2.13 - webcolors==1.13 - webencodings==0.5.1 - websocket-client==1.8.0 - werkzeug==3.0.3 - widgetsnbextension==4.0.11 prefix: /root/miniconda3 ================================================ FILE: DiT-ToCa/models.py ================================================ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. 
def modulate(x, shift, scale):
    """
    Scale and then shift ``x`` with per-sample modulation vectors.

    ``shift`` and ``scale`` each get a singleton axis inserted at dim 1 so
    that they broadcast along dim 1 of ``x``.
    """
    token_scale = 1 + scale.unsqueeze(1)
    token_shift = shift.unsqueeze(1)
    return x * token_scale + token_shift
class DiTBlock(nn.Module):
    """
    A DiT block with adaptive layer norm zero (adaLN-Zero) conditioning.

    ``forward`` dispatches on ``current['type']``, which ``cal_type`` writes
    at the start of every call:

    * ``'full'``  - run attention and MLP on all tokens and store both
      outputs in ``cache_dic['cache'][-1][layer]``.
    * ``'ToCa'``  - reuse the cached attention output and run the MLP only on
      the tokens selected by ``cache_cutfresh``; the results are handed to
      ``update_cache`` (presumably merged into the cached MLP output, which
      is read back right afterwards).
    * ``'FORA'``  - reuse the cached attention and MLP outputs as they are.
    * anything else - the block is skipped; only the final block replaces
      ``x`` with the cached ``'noise'`` entry.

    :param hidden_size: token embedding width.
    :param num_heads: number of attention heads.
    :param mlp_ratio: MLP hidden width as a multiple of ``hidden_size``.
    :param final_layer_index: value of ``current['layer']`` identifying the
        last block of the stack. The default 27 matches the depth-28
        configurations; stacks of another depth must pass ``depth - 1``,
        otherwise the ``'noise'`` entry is never written or restored.
    :param block_kwargs: forwarded to ``Attention``.
    """

    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, *, final_layer_index=27, **block_kwargs):
        super().__init__()
        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.attn = Attention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        mlp_hidden_dim = int(hidden_size * mlp_ratio)
        approx_gelu = lambda: nn.GELU(approximate="tanh")
        self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
        )
        # Plain Python attributes (not parameters/buffers), so checkpoints are unaffected.
        self.mlp_hidden_dim = mlp_hidden_dim
        self.final_layer_index = final_layer_index

    @staticmethod
    def _adaln_flops(B, C):
        """FLOPs charged for adaLN_modulation: SiLU plus the C -> 6C linear layer."""
        return B * C + B * C * 6 * C

    def _mlp_flops(self, B, N, C):
        """FLOPs charged for the MLP applied to a (B, N, C) token tensor."""
        hidden = self.mlp_hidden_dim
        first_projection = B * N * C * hidden * 2
        second_projection = B * N * hidden * C * 2
        gelu = B * N * hidden * 6
        return first_projection + second_projection + gelu

    def forward(self, x, c, current, cache_dic):
        """
        Run the block in the mode selected by ``cal_type``.

        :param x: token tensor; unpacked as ``(B, N, C)``.
        :param c: conditioning vector fed to ``adaLN_modulation``.
        :param current: mutable per-call state; ``'layer'`` is read,
            ``'type'`` is set by ``cal_type`` and ``'module'`` is updated
            here before each sub-module runs.
        :param cache_dic: cache state; ``'cache'``, ``'attn_map'`` and
            ``'flops'`` are read/written, ``'test_FLOPs'`` is optional.
        :return: the updated token tensor.
        """
        B, N, C = x.shape
        layer = current['layer']

        # FLOPs are accumulated locally and added to cache_dic['flops'] once at the end.
        flops = 0
        test_FLOPs = cache_dic.get('test_FLOPs', False)

        # Determine the working mode for this call.
        cal_type(cache_dic, current)
        block_type = current['type']

        if block_type == 'full':
            # Force Activation: compute all tokens and save them in cache.
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=1)
            if test_FLOPs:
                flops += 2 * B * N * C  # LayerNorm (norm1 and norm2)
                flops += self._adaln_flops(B, C)

            current['module'] = 'attn'
            attn_output, attn_map = self.attn(
                modulate(self.norm1(x), shift_msa, scale_msa), cache_dic=cache_dic, current=current
            )
            cache_dic['cache'][-1][layer]['attn'] = attn_output
            cache_dic['attn_map'][-1][layer] = attn_map
            force_init(cache_dic, current, x)
            x = x + gate_msa.unsqueeze(1) * attn_output

            current['module'] = 'mlp'
            mlp_output = self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
            cache_dic['cache'][-1][layer]['mlp'] = mlp_output
            force_init(cache_dic, current, x)
            x = x + gate_mlp.unsqueeze(1) * mlp_output

            if test_FLOPs:
                flops += self._mlp_flops(B, N, C)

        elif block_type == 'ToCa':
            # Partial Computation: compute only fresh tokens and save them in cache,
            # no attention token computation in the final version.
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=1)
            if test_FLOPs:
                # Same LayerNorm charge as the full branch (kept from the original accounting).
                flops += 2 * B * N * C
                flops += self._adaln_flops(B, C)

            current['module'] = 'attn'
            x = x + gate_msa.unsqueeze(1) * cache_dic['cache'][-1][layer]['attn']

            current['module'] = 'mlp'
            fresh_indices, fresh_tokens = cache_cutfresh(cache_dic, x, current)
            fresh_tokens = self.mlp(modulate(self.norm2(fresh_tokens), shift_mlp, scale_mlp))
            update_cache(fresh_indices, fresh_tokens=fresh_tokens, cache_dic=cache_dic, current=current)
            x = x + gate_mlp.unsqueeze(1) * cache_dic['cache'][-1][layer]['mlp']

            if test_FLOPs:
                # Only the fresh tokens went through the MLP.
                B_fresh, N_fresh, C_fresh = fresh_tokens.shape
                flops += self._mlp_flops(B_fresh, N_fresh, C_fresh)

        elif block_type == 'FORA':
            # Reuse both cached outputs; only the modulation gates are recomputed.
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=1)
            if test_FLOPs:
                flops += self._adaln_flops(B, C)

            current['module'] = 'attn'
            x = x + gate_msa.unsqueeze(1) * cache_dic['cache'][-1][layer]['attn']
            current['module'] = 'mlp'
            x = x + gate_mlp.unsqueeze(1) * cache_dic['cache'][-1][layer]['mlp']

        else:
            current['module'] = 'skipped'
            if current['layer'] == self.final_layer_index:
                # Skipped step: the last block hands back the stored output of the stack.
                x = cache_dic['cache'][-1]['noise']

        cache_dic['flops'] += flops
        if current['layer'] == self.final_layer_index:
            # Remember the output of the whole stack for later skipped steps.
            cache_dic['cache'][-1]['noise'] = x
        return x
def unpatchify(self, x):
    """
    Fold a sequence of flattened patches back into an image batch.

    x: (N, T, patch_size**2 * C), where T must be a perfect square
    imgs: (N, C, H, W) with H = W = sqrt(T) * patch_size

    Raises AssertionError when T is not a perfect square.
    """
    c = self.out_channels
    p = self.x_embedder.patch_size[0]
    num_tokens = x.shape[1]
    # Integer square root avoids float rounding for large token counts.
    h = w = math.isqrt(num_tokens)
    assert h * w == num_tokens
    x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
    # (n, h, w, p, q, c) -> (n, c, h, p, w, q): channel first, patch rows/cols next to grid rows/cols.
    x = torch.einsum('nhwpqc->nchpwq', x)
    imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p))
    return imgs
def forward_with_cfg(self, x, t, current, cache_dic, y, cfg_scale, **kwargs):
    """
    Forward pass of DiT that batches the conditional and unconditional
    passes together and then applies classifier-free guidance.

    Only the first half of ``x`` is used; it is duplicated so both halves of
    the batch share the same input. Extra keyword arguments are accepted and
    ignored.
    """
    # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
    batch = len(x) // 2
    first_half = x[:batch]
    combined = torch.cat([first_half, first_half], dim=0)
    model_out = self.forward(combined, t, current, cache_dic, y)

    # For exact reproducibility reasons, guidance is applied on only three
    # channels by default. The standard approach applies it to all channels,
    # i.e. it would split at self.in_channels instead of 3.
    eps = model_out[:, :3]
    rest = model_out[:, 3:]

    cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
    guided = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
    guided_full = torch.cat([guided, guided], dim=0)
    return torch.cat([guided_full, rest], dim=1)
#################################################################################
#                                   DiT Configs                                  #
#################################################################################

# Backbone hyper-parameters shared by every patch-size variant of a model size.
_DIT_BACKBONES = {
    'XL': {'depth': 28, 'hidden_size': 1152, 'num_heads': 16},
    'L': {'depth': 24, 'hidden_size': 1024, 'num_heads': 16},
    'B': {'depth': 12, 'hidden_size': 768, 'num_heads': 12},
    'S': {'depth': 12, 'hidden_size': 384, 'num_heads': 6},
}


def _build_dit(size, patch_size, /, **kwargs):
    """Instantiate ``DiT`` for backbone ``size`` with the given patch size."""
    return DiT(patch_size=patch_size, **_DIT_BACKBONES[size], **kwargs)


def DiT_XL_2(**kwargs):
    return _build_dit('XL', 2, **kwargs)


def DiT_XL_4(**kwargs):
    return _build_dit('XL', 4, **kwargs)


def DiT_XL_8(**kwargs):
    return _build_dit('XL', 8, **kwargs)


def DiT_L_2(**kwargs):
    return _build_dit('L', 2, **kwargs)


def DiT_L_4(**kwargs):
    return _build_dit('L', 4, **kwargs)


def DiT_L_8(**kwargs):
    return _build_dit('L', 8, **kwargs)


def DiT_B_2(**kwargs):
    return _build_dit('B', 2, **kwargs)


def DiT_B_4(**kwargs):
    return _build_dit('B', 4, **kwargs)


def DiT_B_8(**kwargs):
    return _build_dit('B', 8, **kwargs)


def DiT_S_2(**kwargs):
    return _build_dit('S', 2, **kwargs)


def DiT_S_4(**kwargs):
    return _build_dit('S', 4, **kwargs)


def DiT_S_8(**kwargs):
    return _build_dit('S', 8, **kwargs)


# Registry mapping the public model names to their factory functions.
DiT_models = {
    'DiT-XL/2': DiT_XL_2, 'DiT-XL/4': DiT_XL_4, 'DiT-XL/8': DiT_XL_8,
    'DiT-L/2': DiT_L_2, 'DiT-L/4': DiT_L_4, 'DiT-L/8': DiT_L_8,
    'DiT-B/2': DiT_B_2, 'DiT-B/4': DiT_B_4, 'DiT-B/8': DiT_B_8,
    'DiT-S/2': DiT_S_2, 'DiT-S/4': DiT_S_4, 'DiT-S/8': DiT_S_8,
}
def main(args):
    """
    Sample images from a DiT checkpoint with classifier-free guidance,
    decode them with the VAE and write them to ``sample.png``.

    :param args: parsed command-line namespace (see the argparse setup at the
        bottom of this script).
    """
    import time  # local import: only needed for the CPU timing fallback

    # Setup PyTorch:
    torch.manual_seed(args.seed)
    torch.set_grad_enabled(False)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    if args.ckpt is None:
        assert args.model == "DiT-XL/2", "Only DiT-XL/2 models are available for auto-download."
        assert args.image_size in [256, 512]
        assert args.num_classes == 1000

    # Load model:
    latent_size = args.image_size // 8
    model = DiT_models[args.model](
        input_size=latent_size,
        num_classes=args.num_classes
    ).to(device)
    # Auto-download a pre-trained model or load a custom DiT checkpoint from train.py:
    ckpt_path = args.ckpt or f"/root/autodl-tmp/pretrained_models/DiT/DiT-XL-2-{args.image_size}x{args.image_size}.pt"
    state_dict = find_model(ckpt_path)
    model.load_state_dict(state_dict)
    model.eval()  # important!
    diffusion = create_diffusion(str(args.num_sampling_steps))
    vae = AutoencoderKL.from_pretrained(f"/root/autodl-tmp/pretrained_models/stabilityai/sd-vae-ft-{args.vae}").to(device)

    # Labels to condition the model with (feel free to change):
    class_labels = [985]

    # Create sampling noise (one latent per requested class label):
    n = len(class_labels)
    z = torch.randn(n, 4, latent_size, latent_size, device=device)
    y = torch.tensor(class_labels, device=device)

    # Setup classifier-free guidance:
    z = torch.cat([z, z], 0)
    # The null ("dropped") label is index num_classes of the label embedding
    # table; it equals 1000 only for the default ImageNet setup.
    y_null = torch.tensor([args.num_classes] * n, device=device)
    y = torch.cat([y, y_null], 0)

    model_kwargs = dict(y=y, cfg_scale=args.cfg_scale)
    model_kwargs['cache_type'] = args.cache_type
    model_kwargs['fresh_ratio'] = args.fresh_ratio
    model_kwargs['force_fresh'] = args.force_fresh
    model_kwargs['fresh_threshold'] = args.fresh_threshold
    model_kwargs['ratio_scheduler'] = args.ratio_scheduler
    model_kwargs['soft_fresh_weight'] = args.soft_fresh_weight
    model_kwargs['test_FLOPs'] = args.test_FLOPs

    # CUDA events only work on a CUDA device; fall back to a wall clock on CPU.
    use_cuda_timer = device == "cuda"
    if use_cuda_timer:
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
    else:
        wall_start = time.perf_counter()

    sample_loop = diffusion.ddim_sample_loop if args.ddim_sample else diffusion.p_sample_loop
    samples = sample_loop(
        model.forward_with_cfg, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=True, device=device
    )

    if use_cuda_timer:
        end.record()
        torch.cuda.synchronize()
        elapsed_seconds = start.elapsed_time(end) * 0.001  # elapsed_time is in milliseconds
    else:
        elapsed_seconds = time.perf_counter() - wall_start
    print(f"Total Sampling took {elapsed_seconds} seconds")

    samples, _ = samples.chunk(2, dim=0)  # Remove null class samples
    samples = vae.decode(samples / 0.18215).sample

    # Save and display images:
    save_image(samples, "sample.png", nrow=4, normalize=True, value_range=(-1, 1))
choices=list(DiT_models.keys()), default="DiT-XL/2") parser.add_argument("--vae", type=str, choices=["ema", "mse"], default="mse") parser.add_argument("--image-size", type=int, choices=[256, 512], default=256) parser.add_argument("--num-classes", type=int, default=1000) parser.add_argument("--cfg-scale", type=float, default=1.5) parser.add_argument("--num-sampling-steps", type=int, default=250) parser.add_argument("--seed", type=int, default=0) parser.add_argument("--ckpt", type=str, default=None, help="Optional path to a DiT checkpoint (default: auto-download a pre-trained DiT-XL/2 model).") parser.add_argument("--ddim-sample", action="store_true", default=False) parser.add_argument("--cache-type", type=str, choices=['random', 'attention','similarity','norm', 'compress','kv-norm'], default='attention') # only attention is supported currently parser.add_argument("--fresh-ratio", type=float, default=0.07) parser.add_argument("--ratio-scheduler", type=str, default='ToCa', choices=['linear', 'cosine', 'exp', 'constant','linear-mode','layerwise','ToCa-ddpm250', 'ToCa-ddim50']) # 'ToCa' is the proposed scheduler in Final version of the paper parser.add_argument("--force-fresh", type=str, choices=['global', 'local'], default='global', help="Force fresh strategy. global: fresh all tokens. 
def create_npz_from_sample_folder(sample_dir, num=50_000):
    """
    Builds a single .npz file from a folder of .png samples.

    Reads ``{sample_dir}/000000.png`` through the file numbered ``num - 1``
    (six-digit, zero-padded names), stacks them into one uint8 array and
    saves it under the key ``arr_0`` in ``{sample_dir}.npz``.

    :param sample_dir: folder holding the numbered .png files.
    :param num: how many images to read.
    :return: path of the written .npz file.
    """
    samples = []
    for i in tqdm(range(num), desc="Building .npz file from samples"):
        # Close every image file explicitly instead of relying on implicit cleanup;
        # astype() copies the pixels, so the array stays valid after the file is closed.
        with Image.open(f"{sample_dir}/{i:06d}.png") as sample_pil:
            samples.append(np.asarray(sample_pil).astype(np.uint8))
    samples = np.stack(samples)
    # All samples must share one height/width and have three channels.
    assert samples.shape == (num, samples.shape[1], samples.shape[2], 3)
    npz_path = f"{sample_dir}.npz"
    np.savez(npz_path, arr_0=samples)
    print(f"Saved .npz file to {npz_path} [shape={samples.shape}].")
    return npz_path
diffusion = create_diffusion(str(args.num_sampling_steps)) vae = AutoencoderKL.from_pretrained(f"/root/autodl-tmp/pretrained_models/stabilityai/sd-vae-ft-{args.vae}").to(device) #vae = AutoencoderKL.from_pretrained(f"/root/autodl-tmp/pretrained_models").to(device) assert args.cfg_scale >= 1.0, "In almost all cases, cfg_scale be >= 1.0" using_cfg = args.cfg_scale > 1.0 # Create folder to save samples: model_string_name = args.model.replace("/", "-") ckpt_string_name = os.path.basename(args.ckpt).replace(".pt", "") if args.ckpt else "pretrained" folder_name = f"ToCa-{model_string_name}-{ckpt_string_name}-size-{args.image_size}-vae-{args.vae}-" \ f"cfg-{args.cfg_scale}-seed-{args.global_seed}-step-{args.num_sampling_steps}-num-{args.num_fid_samples}"\ f"-{args.cache_type}-{args.fresh_ratio}-{args.ratio_scheduler}-{args.force_fresh}-{args.fresh_threshold}"\ f"-softweight-{args.soft_fresh_weight}" sample_folder_dir = f"{args.sample_dir}/{folder_name}" if rank == 0: os.makedirs(sample_folder_dir, exist_ok=True) print(f"Saving .png samples at {sample_folder_dir}") dist.barrier() # Figure out how many samples we need to generate on each GPU and how many iterations we need to run: n = args.per_proc_batch_size global_batch_size = n * dist.get_world_size() # To make things evenly-divisible, we'll sample a bit more than we need and then discard the extra samples: total_samples = int(math.ceil(args.num_fid_samples / global_batch_size) * global_batch_size) if rank == 0: print(f"Total number of images that will be sampled: {total_samples}") assert total_samples % dist.get_world_size() == 0, "total_samples must be divisible by world_size" samples_needed_this_gpu = int(total_samples // dist.get_world_size()) assert samples_needed_this_gpu % n == 0, "samples_needed_this_gpu must be divisible by the per-GPU batch size" iterations = int(samples_needed_this_gpu // n) pbar = range(iterations) pbar = tqdm(pbar) if rank == 0 else pbar total = 0 for _ in pbar: # Sample inputs: z = 
if __name__ == "__main__":
    # Command-line entry point for DDP sampling; every option is forwarded to main() via args.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, choices=list(DiT_models.keys()), default="DiT-XL/2")
    parser.add_argument("--vae", type=str, choices=["ema", "mse"], default="ema")
    parser.add_argument("--sample-dir", type=str, default="/root/autodl-tmp/samples")  # Change this to your desired sample directory
    parser.add_argument("--per-proc-batch-size", type=int, default=32)
    parser.add_argument("--num-fid-samples", type=int, default=50_000)
    parser.add_argument("--image-size", type=int, choices=[256, 512], default=256)
    parser.add_argument("--num-classes", type=int, default=1000)
    parser.add_argument("--cfg-scale", type=float, default=1.5)
    parser.add_argument("--num-sampling-steps", type=int, default=250)
    parser.add_argument("--global-seed", type=int, default=0)
    parser.add_argument("--tf32", action=argparse.BooleanOptionalAction, default=True,
                        help="By default, use TF32 matmuls. This massively accelerates sampling on Ampere GPUs.")
    parser.add_argument("--ckpt", type=str, default=None,
                        help="Optional path to a DiT checkpoint (default: auto-download a pre-trained DiT-XL/2 model).")
    parser.add_argument("--ddim-sample", action="store_true", default=False)
    parser.add_argument("--fresh-ratio", type=float, default=0.07)

    # NOTE(review): only 'attention' is reported as supported, yet the default here is
    # 'random' (sample.py defaults to 'attention') - confirm which default is intended.
    parser.add_argument("--cache-type", type=str,
                        choices=['random', 'attention', 'similarity', 'norm', 'compress', 'kv-norm'],
                        default='random')  # only attention supported currently

    # 'ToCa' is the proposed scheduler in Final version of the paper. It must be listed in
    # choices: argparse does not validate defaults, so without it the default value could
    # not be passed explicitly on the command line.
    parser.add_argument("--ratio-scheduler", type=str, default='ToCa',
                        choices=['ToCa', 'linear', 'cosine', 'exp', 'constant', 'linear-mode', 'layerwise',
                                 'ToCa-ddpm250', 'ToCa-ddim50'])
    parser.add_argument("--force-fresh", type=str, choices=['global', 'local'], default='global',  # only global is supported currently, local causes bad results
                        help="Force fresh strategy. global: fresh all tokens. local: fresh tokens acheiving fresh step threshold.")
    parser.add_argument("--fresh-threshold", type=int, default=4)  # N in the paper
    parser.add_argument("--soft-fresh-weight", type=float, default=0.25,  # lambda_3 in the paper
                        help="soft weight for updating the stale tokens by adding extra scores.")
    parser.add_argument("--test-FLOPs", action="store_true", default=False)
    #parser.add_argument("--merge-weight", type=float, default=0.0) # never used in the paper, just for exploration

    args = parser.parse_args()
    main(args)
""" ema_params = OrderedDict(ema_model.named_parameters()) model_params = OrderedDict(model.named_parameters()) for name, param in model_params.items(): # TODO: Consider applying only to params that require_grad to avoid small numerical changes of pos_embed ema_params[name].mul_(decay).add_(param.data, alpha=1 - decay) def requires_grad(model, flag=True): """ Set requires_grad flag for all parameters in a model. """ for p in model.parameters(): p.requires_grad = flag def cleanup(): """ End DDP training. """ dist.destroy_process_group() def create_logger(logging_dir): """ Create a logger that writes to a log file and stdout. """ if dist.get_rank() == 0: # real logger logging.basicConfig( level=logging.INFO, format='[\033[34m%(asctime)s\033[0m] %(message)s', datefmt='%Y-%m-%d %H:%M:%S', handlers=[logging.StreamHandler(), logging.FileHandler(f"{logging_dir}/log.txt")] ) logger = logging.getLogger(__name__) else: # dummy logger (does nothing) logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) return logger def center_crop_arr(pil_image, image_size): """ Center cropping implementation from ADM. https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126 """ while min(*pil_image.size) >= 2 * image_size: pil_image = pil_image.resize( tuple(x // 2 for x in pil_image.size), resample=Image.BOX ) scale = image_size / min(*pil_image.size) pil_image = pil_image.resize( tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC ) arr = np.array(pil_image) crop_y = (arr.shape[0] - image_size) // 2 crop_x = (arr.shape[1] - image_size) // 2 return Image.fromarray(arr[crop_y: crop_y + image_size, crop_x: crop_x + image_size]) ################################################################################# # Training Loop # ################################################################################# def main(args): """ Trains a new DiT model. 
""" assert torch.cuda.is_available(), "Training currently requires at least one GPU." # Setup DDP: dist.init_process_group("nccl") assert args.global_batch_size % dist.get_world_size() == 0, f"Batch size must be divisible by world size." rank = dist.get_rank() device = rank % torch.cuda.device_count() seed = args.global_seed * dist.get_world_size() + rank torch.manual_seed(seed) torch.cuda.set_device(device) print(f"Starting rank={rank}, seed={seed}, world_size={dist.get_world_size()}.") # Setup an experiment folder: if rank == 0: os.makedirs(args.results_dir, exist_ok=True) # Make results folder (holds all experiment subfolders) experiment_index = len(glob(f"{args.results_dir}/*")) model_string_name = args.model.replace("/", "-") # e.g., DiT-XL/2 --> DiT-XL-2 (for naming folders) experiment_dir = f"{args.results_dir}/{experiment_index:03d}-{model_string_name}" # Create an experiment folder checkpoint_dir = f"{experiment_dir}/checkpoints" # Stores saved model checkpoints os.makedirs(checkpoint_dir, exist_ok=True) logger = create_logger(experiment_dir) logger.info(f"Experiment directory created at {experiment_dir}") else: logger = create_logger(None) # Create model: assert args.image_size % 8 == 0, "Image size must be divisible by 8 (for the VAE encoder)." 
latent_size = args.image_size // 8 model = DiT_models[args.model]( input_size=latent_size, num_classes=args.num_classes ) # Note that parameter initialization is done within the DiT constructor ema = deepcopy(model).to(device) # Create an EMA of the model for use after training requires_grad(ema, False) model = DDP(model.to(device), device_ids=[rank]) diffusion = create_diffusion(timestep_respacing="") # default: 1000 steps, linear noise schedule vae = AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-{args.vae}").to(device) logger.info(f"DiT Parameters: {sum(p.numel() for p in model.parameters()):,}") # Setup optimizer (we used default Adam betas=(0.9, 0.999) and a constant learning rate of 1e-4 in our paper): opt = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0) # Setup data: transform = transforms.Compose([ transforms.Lambda(lambda pil_image: center_crop_arr(pil_image, args.image_size)), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True) ]) dataset = ImageFolder(args.data_path, transform=transform) sampler = DistributedSampler( dataset, num_replicas=dist.get_world_size(), rank=rank, shuffle=True, seed=args.global_seed ) loader = DataLoader( dataset, batch_size=int(args.global_batch_size // dist.get_world_size()), shuffle=False, sampler=sampler, num_workers=args.num_workers, pin_memory=True, drop_last=True ) logger.info(f"Dataset contains {len(dataset):,} images ({args.data_path})") # Prepare models for training: update_ema(ema, model.module, decay=0) # Ensure EMA is initialized with synced weights model.train() # important! 
This enables embedding dropout for classifier-free guidance ema.eval() # EMA model should always be in eval mode # Variables for monitoring/logging purposes: train_steps = 0 log_steps = 0 running_loss = 0 start_time = time() logger.info(f"Training for {args.epochs} epochs...") for epoch in range(args.epochs): sampler.set_epoch(epoch) logger.info(f"Beginning epoch {epoch}...") for x, y in loader: x = x.to(device) y = y.to(device) with torch.no_grad(): # Map input images to latent space + normalize latents: x = vae.encode(x).latent_dist.sample().mul_(0.18215) t = torch.randint(0, diffusion.num_timesteps, (x.shape[0],), device=device) model_kwargs = dict(y=y) loss_dict = diffusion.training_losses(model, x, t, model_kwargs) loss = loss_dict["loss"].mean() opt.zero_grad() loss.backward() opt.step() update_ema(ema, model.module) # Log loss values: running_loss += loss.item() log_steps += 1 train_steps += 1 if train_steps % args.log_every == 0: # Measure training speed: torch.cuda.synchronize() end_time = time() steps_per_sec = log_steps / (end_time - start_time) # Reduce loss history over all processes: avg_loss = torch.tensor(running_loss / log_steps, device=device) dist.all_reduce(avg_loss, op=dist.ReduceOp.SUM) avg_loss = avg_loss.item() / dist.get_world_size() logger.info(f"(step={train_steps:07d}) Train Loss: {avg_loss:.4f}, Train Steps/Sec: {steps_per_sec:.2f}") # Reset monitoring variables: running_loss = 0 log_steps = 0 start_time = time() # Save DiT checkpoint: if train_steps % args.ckpt_every == 0 and train_steps > 0: if rank == 0: checkpoint = { "model": model.module.state_dict(), "ema": ema.state_dict(), "opt": opt.state_dict(), "args": args } checkpoint_path = f"{checkpoint_dir}/{train_steps:07d}.pt" torch.save(checkpoint, checkpoint_path) logger.info(f"Saved checkpoint to {checkpoint_path}") dist.barrier() model.eval() # important! This disables randomized embedding dropout # do any sampling/FID calculation/etc. with ema (or model) in eval mode ... 
logger.info("Done!") cleanup() if __name__ == "__main__": # Default args here will train DiT-XL/2 with the hyperparameters we used in our paper (except training iters). parser = argparse.ArgumentParser() parser.add_argument("--data-path", type=str, required=True) parser.add_argument("--results-dir", type=str, default="results") parser.add_argument("--model", type=str, choices=list(DiT_models.keys()), default="DiT-XL/2") parser.add_argument("--image-size", type=int, choices=[256, 512], default=256) parser.add_argument("--num-classes", type=int, default=1000) parser.add_argument("--epochs", type=int, default=1400) parser.add_argument("--global-batch-size", type=int, default=256) parser.add_argument("--global-seed", type=int, default=0) parser.add_argument("--vae", type=str, choices=["ema", "mse"], default="ema") # Choice doesn't affect training parser.add_argument("--num-workers", type=int, default=4) parser.add_argument("--log-every", type=int, default=100) parser.add_argument("--ckpt-every", type=int, default=50_000) args = parser.parse_args() main(args) ================================================ FILE: DrawBench200.txt ================================================ A red colored car. A black colored car. A pink colored car. A black colored dog. A red colored dog. A blue colored dog. A green colored banana. A red colored banana. A black colored banana. A white colored sandwich. A black colored sandwich. An orange colored sandwich. A pink colored giraffe. A yellow colored giraffe. A brown colored giraffe. A red car and a white sheep. A blue bird and a brown bear. A green apple and a black backpack. A green cup and a blue cell phone. A yellow book and a red vase. A white car and a red sheep. A brown bird and a blue bear. A black apple and a green backpack. A blue cup and a green cell phone. A red book and a yellow vase. A horse riding an astronaut. A pizza cooking an oven. A bird scaring a scarecrow. A blue coloured pizza. Hovering cow abducting aliens. 
A panda making latte art. A shark in the desert. An elephant under the sea. Rainbow coloured penguin. A fish eating a pelican. One car on the street. Two cars on the street. Three cars on the street. Four cars on the street. Five cars on the street. One dog on the street. Two dogs on the street. Three dogs on the street. Four dogs on the street. Five dogs on the street. One cat and one dog sitting on the grass. One cat and two dogs sitting on the grass. One cat and three dogs sitting on the grass. Two cats and one dog sitting on the grass. Two cats and two dogs sitting on the grass. Two cats and three dogs sitting on the grass. Three cats and one dog sitting on the grass. Three cats and two dogs sitting on the grass. Three cats and three dogs sitting on the grass. A triangular purple flower pot. A purple flower pot in the shape of a triangle. A triangular orange picture frame. An orange picture frame in the shape of a triangle. A triangular pink stop sign. A pink stop sign in the shape of a triangle. A cube made of denim. A cube with the texture of denim. A sphere made of kitchen tile. A sphere with the texture of kitchen tile. A cube made of brick. A cube with the texture of brick. A collection of nail is sitting on a table. A single clock is sitting on a table. A couple of glasses are sitting on a table. An illustration of a large red elephant sitting on a small blue mouse. An illustration of a small green elephant standing behind a large red mouse. A small blue book sitting on a large red book. "A stack of 3 plates. A blue plate is on the top, sitting on a blue plate. The blue plate is in the middle, sitting on a green plate. The green plate is on the bottom." "A stack of 3 cubes. A red cube is on the top, sitting on a red cube. The red cube is in the middle, sitting on a green cube. The green cube is on the bottom." "A stack of 3 books. A green book is on the top, sitting on a red book. The red book is in the middle, sitting on a blue book. 
The blue book is on the bottom." "An emoji of a baby panda wearing a red hat, green gloves, red shirt, and green pants." "An emoji of a baby panda wearing a red hat, blue gloves, green shirt, and blue pants." A fisheye lens view of a turtle sitting in a forest. A side view of an owl sitting in a field. A cross-section view of a brain. "A vehicle composed of two wheels held in a frame one behind the other, propelled by pedals and steered with handlebars attached to the front wheel." "A large motor vehicle carrying passengers by road, typically one serving the public on a fixed route and for a fare." "A small vessel propelled on water by oars, sails, or an engine." A connection point by which firefighters can tap into a water supply. "A machine next to a parking space in a street, into which the driver puts money so as to be authorized to park the vehicle for a particular length of time." "A device consisting of a circular canopy of cloth on a folding metal frame supported by a central rod, used as protection against rain or sometimes sun." "A separate seat for one person, typically with a back and four legs." An appliance or compartment which is artificially kept cool and used to store food and drink. A mechanical or electrical device for measuring time. "An instrument used for cutting cloth, paper, and other thin material, consisting of two blades laid one on top of the other and fastened in the middle so as to allow them to be opened and closed by a thumb and finger inserted through rings on the end of their handles." "A large plant-eating domesticated mammal with solid hoofs and a flowing mane and tail, used for riding, racing, and to carry and pull loads." A long curved fruit which grows in clusters and has soft pulpy flesh and yellow skin when ripe. "A small domesticated carnivorous mammal with soft fur, a short snout, and retractable claws. It is widely kept as a pet or for catching mice, and many breeds have been developed." 
"A domesticated carnivorous mammal that typically has a long snout, an acute sense of smell, nonretractable claws, and a barking, howling, or whining voice." "An organ of soft nervous tissue contained in the skull of vertebrates, functioning as the coordinating center of sensation and intellectual and nervous activity." "An American multinational technology company that focuses on artificial intelligence, search engine, online advertising, cloud computing, computer software, quantum computing, e-commerce, and consumer electronics." "A large keyboard musical instrument with a wooden case enclosing a soundboard and metal strings, which are struck by hammers when the keys are depressed. The strings' vibration is stopped by dampers when the keys are released and can be regulated for length and volume by two or three pedals." "A type of digital currency in which a record of transactions is maintained and new units of currency are generated by the computational solution of mathematical problems, and which operates independently of a central bank." "A large thick-skinned semiaquatic African mammal, with massive jaws and large tusks." A machine resembling a human being and able to replicate certain human movements and functions automatically. Paying for a quarter-sized pizza with a pizza-sized quarter. An oil painting of a couple in formal evening wear going home get caught in a heavy downpour with no umbrellas. "A grocery store refrigerator has pint cartons of milk on the top shelf, quart cartons on the middle shelf, and gallon plastic jugs on the bottom shelf." "In late afternoon in January in New England, a man stands in the shadow of a maple tree." An elephant is behind a tree. You can see the trunk on one side and the back legs on the other. A tomato has been put on top of a pumpkin on a kitchen stool. There is a fork sticking into the pumpkin. The scene is viewed from above. A pear cut into seven pieces arranged in a ring. "A donkey and an octopus are playing a game. 
The donkey is holding a rope on one end, the octopus is holding onto the other. The donkey holds the rope in its mouth. A cat is jumping over the rope." "Supreme Court Justices play a baseball game with the FBI. The FBI is at bat, the justices are on the field." Abraham Lincoln touches his toes while George Washington does chin-ups. Lincoln is barefoot. Washington is wearing boots. Tcennis rpacket. Bzaseball galove. Rbefraigerator. Dininrg tablez. Pafrking metr. "A smafml vessef epropoeilled on watvewr by ors, sauls, or han engie." "A sjmall domesticated carnivorious mammnal with sof fuh,y a sthort sout, and retracwtablbe flaws. It iw widexly kept as a pet or for catchitng mic, ad many breeds zhlyde beefn develvoked." "An instqrumemnt used for cutting cloth, paper, axdz othr thdin mteroial, consamistng of two blades lad one on tvopb of the other and fhastned in tle mixdqdjle so as to bllow them txo be pened and closed by thumb and fitngesr inserted tgrough rings on kthe end oc thei vatndlzes." "A domesticated carnivvorous mzammal that typicbally hfaas a lons sfnout, an acxujte sense off osmell, noneetractaaln crlaws, anid xbarkring,y howlingu, or whining rvoiche." "A ldarge keybord msical instroument lwith a woden case enmclosig a qsouvnkboajrd and mfgtal strivgf, which are strucrk b hammrs when the nels are depresdsmed.f lhe strsingsj' vibration ie stopped by damperds when the keys re released and can bce regulavewdd for lengh and vnolume y two or three pedalvs." A train on top of a surfboard. A wine glass on top of a dog. A bicycle on top of a boat. An umbrella on top of a spoon. A laptop on top of a teddy bear. A giraffe underneath a microwave. A donut underneath a toilet. A hair drier underneath a sheep. A tennis racket underneath a traffic light. A zebra underneath a broccoli. A banana on the left of an apple. A couch on the left of a chair. A car on the left of a bus. A cat on the left of a dog. A carrot on the left of a broccoli. 
A pizza on the right of a suitcase. A cat on the right of a tennis racket. A stop sign on the right of a refrigerator. A sheep to the right of a wine glass. A zebra to the right of a fire hydrant. Acersecomicke. Jentacular. Matutinal. Peristeronic. Artophagous. Backlotter. Octothorpe. A church with stained glass windows depicting a hamburger and french fries. "Painting of the orange cat Otto von Garfield, Count of Bismarck-Schönhausen, Duke of Lauenburg, Minister-President of Prussia. Depicted wearing a Prussian Pickelhaube and eating his favorite meal - lasagna." "A baby fennec sneezing onto a strawberry, detailed, macro, studio light, droplets, backlit ears." A photo of a confused grizzly bear in calculus class. An ancient Egyptian painting depicting an argument over whose turn it is to take out the trash. "A fluffy baby sloth with a knitted hat trying to figure out a laptop, close up, highly detailed, studio lighting, screen reflecting in its eyes." "A tiger in a lab coat with a 1980s Miami vibe, turning a well oiled science content machine, digital art." A 1960s yearbook photo with animals dressed as humans. Lego Arnold Schwarzenegger. A yellow and black bus cruising through the rainforest. A medieval painting of the wifi not working. "An IT-guy trying to fix hardware of a PC tower is being tangled by the PC cables like Laokoon. Marble, copy after Hellenistic original from ca. 200 BC. Found in the Baths of Trajan, 1506." "35mm macro shot a kitten licking a baby duck, studio lighting." McDonalds Church. Photo of an athlete cat explaining it's latest scandal at a press conference to journalists. Greek statue of a man tripping over a cat. "An old photograph of a 1920s airship shaped like a pig, floating over a wheat field." Photo of a cat singing in a barbershop quartet. "A painting by Grant Wood of an astronaut couple, american gothic style." An oil painting portrait of the regal Burger King posing with a Whopper. 
"A keyboard made of water, the water is made of light, the light is turned off." Painting of Mona Lisa but the view is from behind of Mona Lisa. Hyper-realistic photo of an abandoned industrial site during a storm. A screenshot of an iOS app for ordering different types of milk. "A real life photography of super mario, 8k Ultra HD." Colouring page of large cats climbing the eifel tower in a cyberpunk future. Photo of a mega Lego space station inside a kid's bedroom. A spider with a moustache bidding an equally gentlemanly grasshopper a good day during his walk to work. A photocopy of a photograph of a painting of a sculpture of a giraffe. "A bridge connecting Europe and North America on the Atlantic Ocean, bird's eye view." "A maglev train going vertically downward in high speed, New York Times photojournalism." A magnifying glass over a page of a 1950s batman comic. "A car playing soccer, digital art." Darth Vader playing with raccoon in Mars during sunset. A 1960s poster warning against climate change. Illustration of a mouse using a mushroom as an umbrella. A realistic photo of a Pomeranian dressed up like a 1980s professional wrestler with neon green and neon orange face paint and bright green wrestling tights with bright orange boots. A pyramid made of falafel with a partial solar eclipse in the background. A storefront with 'Hello World' written on it. A storefront with 'Diffusion' written on it. A storefront with 'Text to Image' written on it. A storefront with 'NeurIPS' written on it. A storefront with 'Deep Learning' written on it. A storefront with 'Google Brain Toronto' written on it. A storefront with 'Google Research Pizza Cafe' written on it. A sign that says 'Hello World'. A sign that says 'Diffusion'. A sign that says 'Text to Image'. A sign that says 'NeurIPS'. A sign that says 'Deep Learning'. A sign that says 'Google Brain Toronto'. A sign that says 'Google Research Pizza Cafe'. 
New York Skyline with 'Hello World' written with fireworks on the sky. New York Skyline with 'Diffusion' written with fireworks on the sky. New York Skyline with 'Text to Image' written with fireworks on the sky. New York Skyline with 'NeurIPS' written with fireworks on the sky. New York Skyline with 'Deep Learning' written with fireworks on the sky. New York Skyline with 'Google Brain Toronto' written with fireworks on the sky. New York Skyline with 'Google Research Pizza Cafe' written with fireworks on the sky. ================================================ FILE: LICENSE ================================================ GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. 
Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. 
To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. 
If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. 
For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 
No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". 
c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 
If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. 
If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. <one line to give the program's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: <program> Copyright (C) <year> <name of author> This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see <https://www.gnu.org/licenses/>. The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read <https://www.gnu.org/licenses/why-not-lgpl.html>. ================================================ FILE: Open-Sora/Dockerfile ================================================ FROM hpcaitech/pytorch-cuda:2.1.0-12.1.0 # metainformation LABEL org.opencontainers.image.source = "https://github.com/hpcaitech/Open-Sora" LABEL org.opencontainers.image.licenses = "Apache License 2.0" LABEL org.opencontainers.image.base.name = "docker.io/library/hpcaitech/pytorch-cuda:2.1.0-12.1.0" # Set the working directory WORKDIR /workspace/Open-Sora # Copy the current directory contents into the container at /workspace/Open-Sora COPY . . # install library dependencies RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y # install flash attention RUN pip install flash-attn --no-build-isolation # install apex RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git # install xformers RUN pip install xformers --index-url https://download.pytorch.org/whl/cu121 # install this project RUN pip install -v .
================================================ FILE: Open-Sora/LICENSE ================================================ Copyright 2024 HPC-AI Technology Inc. All rights reserved. Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright 2024 HPC-AI Technology Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ========================================================================= This project is inspired by the listed projects and is subject to the following licenses: 1. Latte (https://github.com/Vchitect/Latte/blob/main/LICENSE) Copyright 2024 Latte Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 2. 
PixArt-alpha (https://github.com/PixArt-alpha/PixArt-alpha/blob/master/LICENSE) Copyright (C) 2024 PixArt-alpha/PixArt-alpha This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see https://www.gnu.org/licenses/. 3. dpm-solver (https://github.com/LuChengTHU/dpm-solver/blob/main/LICENSE) MIT License Copyright (c) 2022 Cheng Lu Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 4. 
DiT (https://github.com/facebookresearch/DiT/blob/main/LICENSE.txt) Attribution-NonCommercial 4.0 International ======================================================================= Creative Commons Corporation ("Creative Commons") is not a law firm and does not provide legal services or legal advice. Distribution of Creative Commons public licenses does not create a lawyer-client or other relationship. Creative Commons makes its licenses and related information available on an "as-is" basis. Creative Commons gives no warranties regarding its licenses, any material licensed under their terms and conditions, or any related information. Creative Commons disclaims all liability for damages resulting from their use to the fullest extent possible. Using Creative Commons Public Licenses Creative Commons public licenses provide a standard set of terms and conditions that creators and other rights holders may use to share original works of authorship and other material subject to copyright and certain other rights specified in the public license below. The following considerations are for informational purposes only, are not exhaustive, and do not form part of our licenses. Considerations for licensors: Our public licenses are intended for use by those authorized to give the public permission to use material in ways otherwise restricted by copyright and certain other rights. Our licenses are irrevocable. Licensors should read and understand the terms and conditions of the license they choose before applying it. Licensors should also secure all rights necessary before applying our licenses so that the public can reuse the material as expected. Licensors should clearly mark any material not subject to the license. This includes other CC- licensed material, or material used under an exception or limitation to copyright. 
More considerations for licensors: wiki.creativecommons.org/Considerations_for_licensors Considerations for the public: By using one of our public licenses, a licensor grants the public permission to use the licensed material under specified terms and conditions. If the licensor's permission is not necessary for any reason--for example, because of any applicable exception or limitation to copyright--then that use is not regulated by the license. Our licenses grant only permissions under copyright and certain other rights that a licensor has authority to grant. Use of the licensed material may still be restricted for other reasons, including because others have copyright or other rights in the material. A licensor may make special requests, such as asking that all changes be marked or described. Although not required by our licenses, you are encouraged to respect those requests where reasonable. More_considerations for the public: wiki.creativecommons.org/Considerations_for_licensees ======================================================================= Creative Commons Attribution-NonCommercial 4.0 International Public License By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions. Section 1 -- Definitions. a. 
Adapted Material means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image. b. Adapter's License means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License. c. Copyright and Similar Rights means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights. d. Effective Technological Measures means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements. e. Exceptions and Limitations means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material. f. Licensed Material means the artistic or literary work, database, or other material to which the Licensor applied this Public License. g. Licensed Rights means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license. h. 
Licensor means the individual(s) or entity(ies) granting rights under this Public License. i. NonCommercial means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange. j. Share means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them. k. Sui Generis Database Rights means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world. l. You means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning. Section 2 -- Scope. a. License grant. 1. Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to: a. reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and b. produce, reproduce, and Share Adapted Material for NonCommercial purposes only. 2. Exceptions and Limitations. For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions. 3. 
Term. The term of this Public License is specified in Section 6(a). 4. Media and formats; technical modifications allowed. The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a) (4) never produces Adapted Material. 5. Downstream recipients. a. Offer from the Licensor -- Licensed Material. Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License. b. No downstream restrictions. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material. 6. No endorsement. Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i). b. Other rights. 1. Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise. 2. 
Patent and trademark rights are not licensed under this Public License. 3. To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes. Section 3 -- License Conditions. Your exercise of the Licensed Rights is expressly made subject to the following conditions. a. Attribution. 1. If You Share the Licensed Material (including in modified form), You must: a. retain the following if it is supplied by the Licensor with the Licensed Material: i. identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated); ii. a copyright notice; iii. a notice that refers to this Public License; iv. a notice that refers to the disclaimer of warranties; v. a URI or hyperlink to the Licensed Material to the extent reasonably practicable; b. indicate if You modified the Licensed Material and retain an indication of any previous modifications; and c. indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License. 2. You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information. 3. If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable. 4. 
If You Share Adapted Material You produce, the Adapter's License You apply must not prevent recipients of the Adapted Material from complying with this Public License. Section 4 -- Sui Generis Database Rights. Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material: a. for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only; b. if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material; and c. You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database. For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights. Section 5 -- Disclaimer of Warranties and Limitation of Liability. a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. b. 
TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. c. The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability. Section 6 -- Term and Termination. a. This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically. b. Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates: 1. automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or 2. upon express reinstatement by the Licensor. For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License. c. For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License. d. Sections 1, 5, 6, 7, and 8 survive termination of this Public License. Section 7 -- Other Terms and Conditions. a. The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed. b. 
Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License. Section 8 -- Interpretation. a. For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License. b. To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions. c. No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor. d. Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority. ======================================================================= Creative Commons is not a party to its public licenses. Notwithstanding, Creative Commons may elect to apply one of its public licenses to material it publishes and in those instances will be considered the “Licensor.” The text of the Creative Commons public licenses is dedicated to the public domain under the CC0 Public Domain Dedication. 
Except for the limited purpose of indicating that material is shared under a Creative Commons public license or as otherwise permitted by the Creative Commons policies published at creativecommons.org/policies, Creative Commons does not authorize the use of the trademark "Creative Commons" or any other trademark or logo of Creative Commons without its prior written consent including, without limitation, in connection with any unauthorized modifications to any of its public licenses or any other arrangements, understandings, or agreements concerning use of licensed material. For the avoidance of doubt, this paragraph does not form part of the public licenses. Creative Commons may be contacted at creativecommons.org. 5. OpenDiT (https://github.com/NUS-HPC-AI-Lab/OpenDiT/blob/master/LICENSE) Copyright OpenDiT Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: Open-Sora/README.md ================================================

## Open-Sora: Democratizing Efficient Video Production for All We design and implement **Open-Sora**, an initiative dedicated to **efficiently** producing high-quality video. We hope to make the model, tools and all details accessible to all. By embracing **open-source** principles, Open-Sora not only democratizes access to advanced video generation techniques, but also offers a streamlined and user-friendly platform that simplifies the complexities of video generation. With Open-Sora, our goal is to foster innovation, creativity, and inclusivity within the field of content creation. [[中文文档](/docs/zh_CN/README.md)] [[潞晨云](https://cloud.luchentech.com/)|[OpenSora镜像](https://cloud.luchentech.com/doc/docs/image/open-sora/)|[视频教程](https://www.bilibili.com/video/BV1ow4m1e7PX/?vd_source=c6b752764cd36ff0e535a768e35d98d2)] ## 📰 News - **[2024.06.17]** 🔥 We released **Open-Sora 1.2**, which includes **3D-VAE**, **rectified flow**, and **score condition**. The video quality is greatly improved. [[checkpoints]](#model-weights) [[report]](/docs/report_03.md) [[blog]](https://hpc-ai.com/blog/open-sora-from-hpc-ai-tech-team-continues-open-source-generate-any-16-second-720p-hd-video-with-one-click-model-weights-ready-to-use) - **[2024.04.25]** 🤗 We released the [Gradio demo for Open-Sora](https://huggingface.co/spaces/hpcai-tech/open-sora) on Hugging Face Spaces. - **[2024.04.25]** We released **Open-Sora 1.1**, which supports **2s~15s, 144p to 720p, any aspect ratio** text-to-image, **text-to-video, image-to-video, video-to-video, infinite time** generation. In addition, a full video processing pipeline is released. [[checkpoints]](#model-weights) [[report]](/docs/report_02.md) - **[2024.03.18]** We released **Open-Sora 1.0**, a fully open-source project for video generation. Open-Sora 1.0 supports a full pipeline of video data preprocessing, training with acceleration, inference, and more. Our model can produce 2s 512x512 videos with only 3 days of training. 
[[checkpoints]](#open-sora-10-model-weights) [[blog]](https://hpc-ai.com/blog/open-sora-v1.0) [[report]](/docs/report_01.md) - **[2024.03.04]** Open-Sora provides training with 46% cost reduction. [[blog]](https://hpc-ai.com/blog/open-sora) ## 🎥 Latest Demo 🔥 You can experience Open-Sora on our [🤗 Gradio application on Hugging Face](https://huggingface.co/spaces/hpcai-tech/open-sora). More samples and corresponding prompts are available in our [Gallery](https://hpcaitech.github.io/Open-Sora/). | **4s 720×1280** | **4s 720×1280** | **4s 720×1280** | | ---------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- | | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/7895aab6-ed23-488c-8486-091480c26327) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/20f07c7b-182b-4562-bbee-f1df74c86c9a) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/3d897e0d-dc21-453a-b911-b3bda838acc2) | | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/644bf938-96ce-44aa-b797-b3c0b513d64c) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/272d88ac-4b4a-484d-a665-8d07431671d0) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/ebbac621-c34e-4bb4-9543-1c34f8989764) | | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/a1e3a1a3-4abd-45f5-8df2-6cced69da4ca) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/d6ce9c13-28e1-4dff-9644-cc01f5f11926) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/561978f8-f1b0-4f4d-ae7b-45bec9001b4a) |
OpenSora 1.1 Demo | **2s 240×426** | **2s 240×426** | | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/f7ce4aaa-528f-40a8-be7a-72e61eaacbbd) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/5d58d71e-1fda-4d90-9ad3-5f2f7b75c6a9) | | **2s 426×240** | **4s 480×854** | | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/34ecb4a0-4eef-4286-ad4c-8e3a87e5a9fd) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c1619333-25d7-42ba-a91c-18dbc1870b18) | | **16s 320×320** | **16s 224×448** | **2s 426×240** | | ------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------- | | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/3cab536e-9b43-4b33-8da8-a0f9cf842ff2) | 
[](https://github.com/hpcaitech/Open-Sora/assets/99191637/9fb0b9e0-c6f4-4935-b29e-4cac10b373c4) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/3e892ad2-9543-4049-b005-643a4c1bf3bf) |
OpenSora 1.0 Demo | **2s 512×512** | **2s 512×512** | **2s 512×512** | | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- | | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/de1963d3-b43b-4e68-a670-bb821ebb6f80) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/13f8338f-3d42-4b71-8142-d234fbd746cc) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/fa6a65a6-e32a-4d64-9a9e-eabb0ebb8c16) | | A serene night scene in a forested area. [...] The video is a time-lapse, capturing the transition from day to night, with the lake and forest serving as a constant backdrop. | A soaring drone footage captures the majestic beauty of a coastal cliff, [...] The water gently laps at the rock base and the greenery that clings to the top of the cliff. | The majestic beauty of a waterfall cascading down a cliff into a serene lake. [...] The camera angle provides a bird's eye view of the waterfall. | | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/64232f84-1b36-4750-a6c0-3e610fa9aa94) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/983a1965-a374-41a7-a76b-c07941a6c1e9) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/ec10c879-9767-4c31-865f-2e8d6cf11e65) | | A bustling city street at night, filled with the glow of car headlights and the ambient light of streetlights. [...] | The vibrant beauty of a sunflower field. The sunflowers are arranged in neat rows, creating a sense of order and symmetry. [...] | A serene underwater scene featuring a sea turtle swimming through a coral reef. 
The turtle, with its greenish-brown shell [...] | Videos are downsampled to `.gif` for display. Click for original videos. Prompts are trimmed for display, see [here](/assets/texts/t2v_samples.txt) for full prompts.
## 🔆 New Features/Updates - 📍 **Open-Sora 1.2** released. Model weights are available [here](#model-weights). See our **[report 1.2](/docs/report_03.md)** for more details. - ✅ Support rectified flow scheduling. - ✅ Support more conditioning including fps, aesthetic score, motion strength and camera motion. - ✅ Trained our 3D-VAE for temporal dimension compression. - 📍 **Open-Sora 1.1** released. Model weights are available [here](#model-weights). It is trained on **0s~15s, 144p to 720p, various aspect ratios** videos. See our **[report 1.1](/docs/report_02.md)** for more discussions. - 🔧 **Data processing pipeline v1.1** is released. An automatic [processing pipeline](#data-processing) from raw videos to (text, video clip) pairs is provided, including scene cutting $\rightarrow$ filtering (aesthetic, optical flow, OCR, etc.) $\rightarrow$ captioning $\rightarrow$ managing. With this tool, you can easily build your video dataset.
View more - ✅ Improved ST-DiT architecture includes rope positional encoding, qk norm, longer text length, etc. - ✅ Support training with any resolution, aspect ratio, and duration (including images). - ✅ Support image and video conditioning and video editing, and thus support animating images, connecting videos, etc. - 📍 **Open-Sora 1.0** released. Model weights are available [here](#model-weights). With only 400K video clips and 200 H800 days (compared with 152M samples in Stable Video Diffusion), we are able to generate 2s 512×512 videos. See our **[report 1.0](docs/report_01.md)** for more discussions. - ✅ Three-stage training from an image diffusion model to a video diffusion model. We provide the weights for each stage. - ✅ Support training acceleration including accelerated transformer, faster T5 and VAE, and sequence parallelism. Open-Sora improves **55%** training speed when training on 64x512x512 videos. Details are located in [acceleration.md](docs/acceleration.md). - 🔧 **Data preprocessing pipeline v1.0**, including [downloading](tools/datasets/README.md), [video cutting](tools/scene_cut/README.md), and [captioning](tools/caption/README.md) tools. Our data collection plan can be found at [datasets.md](docs/datasets.md). - ✅ We find VQ-VAE from [VideoGPT](https://wilson1yan.github.io/videogpt/index.html) has low quality and thus adopt a better VAE from [Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original). We also find patching in the time dimension deteriorates the quality. See our **[report](docs/report_01.md)** for more discussions. - ✅ We investigate different architectures including DiT, Latte, and our proposed STDiT. Our **STDiT** achieves a better trade-off between quality and speed. See our **[report](docs/report_01.md)** for more discussions. - ✅ Support clip and T5 text conditioning. - ✅ By viewing images as one-frame videos, our project supports training DiT on both images and videos (e.g., ImageNet & UCF101). 
See [commands.md](docs/commands.md) for more instructions. - ✅ Support inference with official weights from [DiT](https://github.com/facebookresearch/DiT), [Latte](https://github.com/Vchitect/Latte), and [PixArt](https://pixart-alpha.github.io/). - ✅ Refactor the codebase. See [structure.md](docs/structure.md) to learn the project structure and how to use the config files.
### TODO list sorted by priority
View more - [x] Training Video-VAE and adapt our model to new VAE. - [x] Scaling model parameters and dataset size. - [x] Incorporate a better scheduler (rectified flow). - [x] Evaluation pipeline. - [x] Complete the data processing pipeline (including dense optical flow, aesthetics scores, text-image similarity, etc.). See [the dataset](/docs/datasets.md) for more information. - [x] Support image and video conditioning. - [x] Support variable aspect ratios, resolutions, durations.
## Contents - [Installation](#installation) - [Model Weights](#model-weights) - [Gradio Demo](#gradio-demo) - [Inference](#inference) - [Data Processing](#data-processing) - [Training](#training) - [Evaluation](#evaluation) - [VAE Training & Evaluation](#vae-training--evaluation) - [Contribution](#contribution) - [Citation](#citation) - [Acknowledgement](#acknowledgement) Other useful documents and links are listed below. - Report: each version is trained from an image base separately (not continuously trained), while a newer version will incorporate the techniques from the previous version. - [report 1.2](docs/report_03.md): rectified flow, 3d-VAE, score condition, evaluation, etc. - [report 1.1](docs/report_02.md): multi-resolution/length/aspect-ratio, image/video conditioning/editing, data preprocessing, etc. - [report 1.0](docs/report_01.md): architecture, captioning, etc. - [acceleration.md](docs/acceleration.md) - Repo structure: [structure.md](docs/structure.md) - Config file explanation: [config.md](docs/config.md) - Useful commands: [commands.md](docs/commands.md) - Data processing pipeline and dataset: [datasets.md](docs/datasets.md) - Each data processing tool's README: [dataset conventions and management](/tools/datasets/README.md), [scene cutting](/tools/scene_cut/README.md), [scoring](/tools/scoring/README.md), [caption](/tools/caption/README.md) - Evaluation: [eval/README.md](/eval/README.md) - Gallery: [gallery](https://hpcaitech.github.io/Open-Sora/) ## Installation ### Install from Source For CUDA 12.1, you can install the dependencies with the following commands. Otherwise, please refer to [Installation Documentation](docs/installation.md) for more instructions on different CUDA versions, and additional dependencies for data preprocessing, VAE, and model evaluation.
```bash # create a virtual env and activate (conda as an example) conda create -n opensora python=3.9 conda activate opensora # download the repo git clone https://github.com/hpcaitech/Open-Sora cd Open-Sora # install torch, torchvision and xformers pip install -r requirements/requirements-cu121.txt # the default installation is for inference only pip install -v . # for development mode, `pip install -v -e .` ``` (Optional, recommended for fast speed, especially for training) To enable `layernorm_kernel` and `flash_attn`, you need to install `apex` and `flash-attn` with the following commands. ```bash # install flash attention # set enable_flash_attn=False in config to disable flash attention pip install packaging ninja pip install flash-attn --no-build-isolation # install apex # set enable_layernorm_kernel=False in config to disable apex pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git ``` ### Use Docker Run the following command to build a docker image from Dockerfile provided. ```bash docker build -t opensora . ``` Run the following command to start the docker container in interactive mode. ```bash docker run -ti --gpus all -v .:/workspace/Open-Sora opensora ``` ## Model Weights ### Open-Sora 1.2 Model Weights | Model | Model Size | Data | #iterations | Batch Size | URL | | --------- | ---------- | ---- | ----------- | ---------- | ------------------------------------------------------------- | | Diffusion | 1.1B | 30M | 70k | Dynamic | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v3) | | VAE | 384M | 3M | 1M | 8 | [:link:](https://huggingface.co/hpcai-tech/OpenSora-VAE-v1.2) | See our **[report 1.2](docs/report_03.md)** for more infomation. Weight will be automatically downloaded when you run the inference script. 
> For users from mainland China, try `export HF_ENDPOINT=https://hf-mirror.com` to successfully download the weights. ### Open-Sora 1.1 Model Weights
View more | Resolution | Model Size | Data | #iterations | Batch Size | URL | | ------------------ | ---------- | -------------------------- | ----------- | ------------------------------------------------- | -------------------------------------------------------------------- | | mainly 144p & 240p | 700M | 10M videos + 2M images | 100k | [dynamic](/configs/opensora-v1-1/train/stage2.py) | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage2) | | 144p to 720p | 700M | 500K HQ videos + 1M images | 4k | [dynamic](/configs/opensora-v1-1/train/stage3.py) | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage3) | See our **[report 1.1](docs/report_02.md)** for more information. :warning: **LIMITATION**: This version contains known issues which we are going to fix in the next version (as we save computation resources for the next release). In addition, the video generation may fail for long durations, and high resolution will have noisy results due to this problem.
### Open-Sora 1.0 Model Weights
View more | Resolution | Model Size | Data | #iterations | Batch Size | GPU days (H800) | URL | | ---------- | ---------- | ------ | ----------- | ---------- | --------------- | --------------------------------------------------------------------------------------------- | | 16×512×512 | 700M | 20K HQ | 20k | 2×64 | 35 | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x512x512.pth) | | 16×256×256 | 700M | 20K HQ | 24k | 8×64 | 45 | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x256x256.pth) | | 16×256×256 | 700M | 366K | 80k | 8×64 | 117 | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-16x256x256.pth) | Training orders: 16x256x256 $\rightarrow$ 16x256x256 HQ $\rightarrow$ 16x512x512 HQ. Our model's weight is partially initialized from [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha). The number of parameters is 724M. More information about training can be found in our **[report](/docs/report_01.md)**. More about the dataset can be found in [datasets.md](/docs/datasets.md). HQ means high quality. :warning: **LIMITATION**: Our model is trained on a limited budget. The quality and text alignment is relatively poor. The model performs badly, especially on generating human beings and cannot follow detailed instructions. We are working on improving the quality and text alignment.
## Gradio Demo 🔥 You can experience Open-Sora on our [🤗 Gradio application](https://huggingface.co/spaces/hpcai-tech/open-sora) on Hugging Face online. ### Local Deployment If you want to deploy gradio locally, we have also provided a [Gradio application](./gradio) in this repository, you can use the following the command to start an interactive web application to experience video generation with Open-Sora. ```bash pip install gradio spaces python gradio/app.py ``` This will launch a Gradio application on your localhost. If you want to know more about the Gradio applicaiton, you can refer to the [Gradio README](./gradio/README.md). To enable prompt enhancement and other language input (e.g., 中文输入), you need to set the `OPENAI_API_KEY` in the environment. Check [OpenAI's documentation](https://platform.openai.com/docs/quickstart) to get your API key. ```bash export OPENAI_API_KEY=YOUR_API_KEY ``` ### Getting Started In the Gradio application, the basic options are as follows: ![Gradio Demo](assets/readme/gradio_basic.png) The easiest way to generate a video is to input a text prompt and click the "**Generate video**" button (scroll down if you cannot find). The generated video will be displayed in the right panel. Checking the "**Enhance prompt with GPT4o**" will use GPT-4o to refine the prompt, while "**Random Prompt**" button will generate a random prompt by GPT-4o for you. Due to the OpenAI's API limit, the prompt refinement result has some randomness. Then, you can choose the **resolution**, **duration**, and **aspect ratio** of the generated video. Different resolution and video length will affect the video generation speed. 
On a 80G H100 GPU, the generation speed (with `num_sampling_step=30`) and peak memory usage is: | | Image | 2s | 4s | 8s | 16s | | ---- | ------- | -------- | --------- | --------- | --------- | | 360p | 3s, 24G | 18s, 27G | 31s, 27G | 62s, 28G | 121s, 33G | | 480p | 2s, 24G | 29s, 31G | 55s, 30G | 108s, 32G | 219s, 36G | | 720p | 6s, 27G | 68s, 41G | 130s, 39G | 260s, 45G | 547s, 67G | Note that besides text to video, you can also use **image to video generation**. You can upload an image and then click the "**Generate video**" button to generate a video with the image as the first frame. Or you can fill in the text prompt and click the "**Generate image**" button to generate an image with the text prompt, and then click the "**Generate video**" button to generate a video with the image generated with the same model. ![Gradio Demo](assets/readme/gradio_option.png) Then you can specify more options, including "**Motion Strength**", "**Aesthetic**" and "**Camera Motion**". If "Enable" not checked or the choice is "none", the information is not passed to the model. Otherwise, the model will generate videos with the specified motion strength, aesthetic score, and camera motion. For the **aesthetic score**, we recommend using values higher than 6. For **motion strength**, a smaller value will lead to a smoother but less dynamic video, while a larger value will lead to a more dynamic but likely more blurry video. Thus, you can try without it and then adjust it according to the generated video. For the **camera motion**, sometimes the model cannot follow the instruction well, and we are working on improving it. You can also adjust the "**Sampling steps**", this is directly related to the generation speed as it is the number of denoising. A number smaller than 30 usually leads to a poor generation results, while a number larger than 100 usually has no significant improvement. 
The "**Seed**" is used for reproducibility, you can set it to a fixed number to generate the same video. The "**CFG Scale**" controls how much the model follows the text prompt, a smaller value will lead to a more random video, while a larger value will lead to a more text-following video (7 is recommended). For more advanced usage, you can refer to [Gradio README](./gradio/README.md#advanced-usage). ## Inference ### Open-Sora 1.2 Command Line Inference The basic command line inference is as follows: ```bash # text to video python scripts/inference.py configs/opensora-v1-2/inference/sample.py \ --num-frames 4s --resolution 720p --aspect-ratio 9:16 \ --prompt "a beautiful waterfall" ``` You can add more options to the command line to customize the generation. ```bash python scripts/inference.py configs/opensora-v1-2/inference/sample.py \ --num-frames 4s --resolution 720p --aspect-ratio 9:16 \ --num-sampling-steps 30 --flow 5 --aes 6.5 \ --prompt "a beautiful waterfall" ``` For image to video generation and other functionalities, the API is compatible with Open-Sora 1.1. See [here](docs/commands.md) for more instructions. If your installation do not contain `apex` and `flash-attn`, you need to disable them in the config file, or via the folowing command. ```bash python scripts/inference.py configs/opensora-v1-2/inference/sample.py \ --num-frames 4s --resolution 720p \ --layernorm-kernel False --flash-attn False \ --prompt "a beautiful waterfall" ``` ### Sequence Parallelism Inference To enable sequence parallelism, you need to use `torchrun` to run the inference script. The following command will run the inference with 2 GPUs. ```bash # text to video CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node 2 scripts/inference.py configs/opensora-v1-2/inference/sample.py \ --num-frames 4s --resolution 720p --aspect-ratio 9:16 \ --prompt "a beautiful waterfall" ``` :warning: **LIMITATION**: The sequence parallelism is not supported for gradio deployment. 
For now, the sequence parallelism is only supported when the dimension can be divided by the number of GPUs. Thus, it may fail for some cases. We tested 4 GPUs for 720p and 2 GPUs for 480p. ### GPT-4o Prompt Refinement We find that GPT-4o can refine the prompt and improve the quality of the generated video. With this feature, you can also use other languages (e.g., Chinese) as the prompt. To enable this feature, you need to prepare your OpenAI API key in the environment: ```bash export OPENAI_API_KEY=YOUR_API_KEY ``` Then you can run inference with `--llm-refine True` to enable the GPT-4o prompt refinement, or leave the prompt empty to get a random prompt generated by GPT-4o. ```bash python scripts/inference.py configs/opensora-v1-2/inference/sample.py \ --num-frames 4s --resolution 720p --llm-refine True ``` ### Open-Sora 1.1 Command Line Inference
View more Since Open-Sora 1.1 supports inference with dynamic input size, you can pass the input size as an argument. ```bash # text to video python scripts/inference.py configs/opensora-v1-1/inference/sample.py --prompt "A beautiful sunset over the city" --num-frames 32 --image-size 480 854 ``` If your installation does not contain `apex` and `flash-attn`, you need to disable them in the config file, or via the following command. ```bash python scripts/inference.py configs/opensora-v1-1/inference/sample.py --prompt "A beautiful sunset over the city" --num-frames 32 --image-size 480 854 --layernorm-kernel False --flash-attn False ``` See [here](docs/commands.md#inference-with-open-sora-11) for more instructions including text-to-image, image-to-video, video-to-video, and infinite time generation.
### Open-Sora 1.0 Command Line Inference
View more We have also provided an offline inference script. Run the following commands to generate samples, the required model weights will be automatically downloaded. To change sampling prompts, modify the txt file passed to `--prompt-path`. See [here](docs/structure.md#inference-config-demos) to customize the configuration. ```bash # Sample 16x512x512 (20s/sample, 100 time steps, 24 GB memory) torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x512x512.py --ckpt-path OpenSora-v1-HQ-16x512x512.pth --prompt-path ./assets/texts/t2v_samples.txt # Sample 16x256x256 (5s/sample, 100 time steps, 22 GB memory) torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path OpenSora-v1-HQ-16x256x256.pth --prompt-path ./assets/texts/t2v_samples.txt # Sample 64x512x512 (40s/sample, 100 time steps) torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/64x512x512.py --ckpt-path ./path/to/your/ckpt.pth --prompt-path ./assets/texts/t2v_samples.txt # Sample 64x512x512 with sequence parallelism (30s/sample, 100 time steps) # sequence parallelism is enabled automatically when nproc_per_node is larger than 1 torchrun --standalone --nproc_per_node 2 scripts/inference.py configs/opensora/inference/64x512x512.py --ckpt-path ./path/to/your/ckpt.pth --prompt-path ./assets/texts/t2v_samples.txt ``` The speed is tested on H800 GPUs. For inference with other models, see [here](docs/commands.md) for more instructions. To lower the memory usage, set a smaller `vae.micro_batch_size` in the config (slightly lower sampling speed).
## Data Processing High-quality data is crucial for training good generation models. To this end, we establish a complete pipeline for data processing, which could seamlessly convert raw videos to high-quality video-text pairs. The pipeline is shown below. For detailed information, please refer to [data processing](docs/data_processing.md). Also check out the [datasets](docs/datasets.md) we use. ![Data Processing Pipeline](assets/readme/report_data_pipeline.png) ## Training ### Open-Sora 1.2 Training The training process is same as Open-Sora 1.1. ```bash # one node torchrun --standalone --nproc_per_node 8 scripts/train.py \ configs/opensora-v1-2/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT # multiple nodes colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py \ configs/opensora-v1-2/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT ``` ### Open-Sora 1.1 Training
View more Once you prepare the data in a `csv` file, run the following commands to launch training on a single node. ```bash # one node torchrun --standalone --nproc_per_node 8 scripts/train.py \ configs/opensora-v1-1/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT # multiple nodes colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py \ configs/opensora-v1-1/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT ```
### Open-Sora 1.0 Training
View more Once you prepare the data in a `csv` file, run the following commands to launch training on a single node. ```bash # 1 GPU, 16x256x256 torchrun --nnodes=1 --nproc_per_node=1 scripts/train.py configs/opensora/train/16x256x256.py --data-path YOUR_CSV_PATH # 8 GPUs, 64x512x512 torchrun --nnodes=1 --nproc_per_node=8 scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT ``` To launch training on multiple nodes, prepare a hostfile according to [ColossalAI](https://colossalai.org/docs/basics/launch_colossalai/#launch-with-colossal-ai-cli), and run the following commands. ```bash colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT ``` For training other models and advanced usage, see [here](docs/commands.md) for more instructions.
## Evaluation We support evaluation based on: - Validation loss - [VBench](https://github.com/Vchitect/VBench/tree/master) score - VBench-i2v score - Batch generation for human evaluation All the evaluation code is released in `eval` folder. Check the [README](/eval/README.md) for more details. Our [report](/docs/report_03.md#evaluation) also provides more information about the evaluation during training. The following table shows Open-Sora 1.2 greatly improves Open-Sora 1.0. | Model | Total Score | Quality Score | Semantic Score | | -------------- | ----------- | ------------- | -------------- | | Open-Sora V1.0 | 75.91% | 78.81% | 64.28% | | Open-Sora V1.2 | 79.23% | 80.71% | 73.30% | ## VAE Training & Evaluation We train a VAE pipeline that consists of a spatial VAE followed by a temporal VAE. For more details, refer to [VAE Documentation](docs/vae.md). Before you run the following commands, follow our [Installation Documentation](docs/installation.md) to install the required dependencies for VAE and Evaluation. If you want to train your own VAE, we need to prepare data in the csv following the [data processing](#data-processing) pipeline, then run the following commands. Note that you need to adjust the number of trained epochs (`epochs`) in the config file accordingly with respect to your own csv data size. 
```bash # stage 1 training, 380k steps, 8 GPUs torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage1.py --data-path YOUR_CSV_PATH # stage 2 training, 260k steps, 8 GPUs torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage2.py --data-path YOUR_CSV_PATH # stage 3 training, 540k steps, 24 GPUs torchrun --nnodes=3 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage3.py --data-path YOUR_CSV_PATH ``` To evaluate the VAE performance, you need to run VAE inference first to generate the videos, then calculate scores on the generated videos: ```bash # video generation torchrun --standalone --nnodes=1 --nproc_per_node=1 scripts/inference_vae.py configs/vae/inference/video.py --ckpt-path YOUR_VAE_CKPT_PATH --data-path YOUR_CSV_PATH --save-dir YOUR_VIDEO_DIR # the original videos will be saved to `YOUR_VIDEO_DIR_ori` # the reconstructed videos through the pipeline will be saved to `YOUR_VIDEO_DIR_rec` # the reconstructed videos through the spatial VAE only will be saved to `YOUR_VIDEO_DIR_spatial` # score calculation python eval/vae/eval_common_metric.py --batch_size 2 --real_video_dir YOUR_VIDEO_DIR_ori --generated_video_dir YOUR_VIDEO_DIR_rec --device cuda --sample_fps 24 --crop_size 256 --resolution 256 --num_frames 17 --sample_rate 1 --metric ssim psnr lpips flolpips ``` ## Contribution Thanks goes to these wonderful contributors: If you wish to contribute to this project, please refer to the [Contribution Guideline](./CONTRIBUTING.md). ## Acknowledgement Here we only list a few of the projects. For other works and datasets, please refer to our report. - [ColossalAI](https://github.com/hpcaitech/ColossalAI): A powerful large model parallel acceleration and optimization system. - [DiT](https://github.com/facebookresearch/DiT): Scalable Diffusion Models with Transformers. - [OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT): An acceleration for DiT training. 
We adopt valuable acceleration strategies for training progress from OpenDiT. - [PixArt](https://github.com/PixArt-alpha/PixArt-alpha): An open-source DiT-based text-to-image model. - [Latte](https://github.com/Vchitect/Latte): An attempt to efficiently train DiT for video. - [StabilityAI VAE](https://huggingface.co/stabilityai/sd-vae-ft-mse-original): A powerful image VAE model. - [CLIP](https://github.com/openai/CLIP): A powerful text-image embedding model. - [T5](https://github.com/google-research/text-to-text-transfer-transformer): A powerful text encoder. - [LLaVA](https://github.com/haotian-liu/LLaVA): A powerful image captioning model based on [Mistral-7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) and [Yi-34B](https://huggingface.co/01-ai/Yi-34B). - [PLLaVA](https://github.com/magic-research/PLLaVA): A powerful video captioning model. - [MiraData](https://github.com/mira-space/MiraData): A large-scale video dataset with long durations and structured caption. We are grateful for their exceptional work and generous contribution to open source. Special thanks go to the authors of [MiraData](https://github.com/mira-space/MiraData) and [Rectified Flow](https://github.com/gnobitab/RectifiedFlow) for their valuable advice and help. We wish to express gratitude towards AK for sharing this project on social media and Hugging Face for providing free GPU resources for our online Gradio demo. 
## Citation ```bibtex @software{opensora, author = {Zangwei Zheng and Xiangyu Peng and Tianji Yang and Chenhui Shen and Shenggui Li and Hongxin Liu and Yukun Zhou and Tianyi Li and Yang You}, title = {Open-Sora: Democratizing Efficient Video Production for All}, month = {March}, year = {2024}, url = {https://github.com/hpcaitech/Open-Sora} } ``` ## Star History [![Star History Chart](https://api.star-history.com/svg?repos=hpcaitech/Open-Sora&type=Date)](https://star-history.com/#hpcaitech/Open-Sora&Date) ================================================ FILE: Open-Sora/assets/texts/VBench/all_category.txt ================================================ a black dog wearing halloween costume spider making a web bat eating fruits while hanging a snake crawling on a wooden flooring a close up video of a dragonfly macro shot of ladybug on green leaf plant chameleon eating ant a bee feeding on nectars bird nests on a tree captured with moving camera a squirrel eating nuts close up video of snail top view of a hermit crab crawling on a wooden surface cat licking another cat red dragonfly perched on green leaf close up view of a brown caterpillar crawling on green leaf ants eating dead spider an eagle on a tree branch a frog eating an ant white rabbit near the fence a gorilla eating a carrot close up of wolf a meerkat looking around a hyena in a zoo lemur eating grass leaves an owl being trained by a man a lizard on a bamboo brown chicken hunting for its food video of parrots perched on bird stand underwater footage of an octopus in a coral reef a cute pomeranian dog playing with a soccer ball white fox on rock close up footage of a horse figurine giraffe feeding on a tree in a savannah curious cat sitting and looking around hummingbird hawk moth flying near pink flowers close up of a scorpion on a rock close up on fish in net koala eating leaves from a branch a pod of dolphins swirling in the sea catching forage fish low angle view of a hawk perched on a tree branch a 
lion standing on wild grass deer grazing in the field elephant herd in a savanna close up on lobster under water hedgehog crossing road in forest a sheep eating yellow flowers from behind a wire fence twin sisters and a turtle a pig wallowing in mud flock of goose eating on the lake water cow in a field irritated with flies a close up shot of a fly cheetah lying on the grass close up of a lemur close up shot of a kangaroo itching in the sand a tortoise covered with algae turkey in cage a great blue heron bird in the lakeside crab with shell in aquarium a seagull walking on shore an american crocodile a tiger walking inside a cage alligator in the nature a raccoon climbing a tree wild rabbit in a green meadow group of ring tailed lemurs a clouded leopard on a tree branch duck grooming its feathers an african penguin walking on a beach a video of a peacock close up shot of a wild bear baby rhino plays with mom porcupine climbs tree branches close up of a natterjack toad on a rock a sleeping orangutan mother whale swimming with babies a bear wearing red jersey pink jellyfish swimming underwater in a blue sea beautiful clown fish swimming animation of disposable objects shaped as a whale paper cut out of a pair of hands a whale and a heart vertical video of camel roaming in the field during daytime a still video of mosquito biting human a curious sloth hanging from a tree branch a plastic flamingo bird stumbles from the wind a wolf in its natural habitat a monkey sitting in the stone and scratching his head bat hanging upside down a red panda eating leaves snake on ground a harbour seal swimming near the shore shark swimming in the sea otter on branch while eating goat standing over a rock a troop of monkey on top of a mountain a zebra eating grass on the field a colorful butterfly perching on a bud a snail crawling on a leaf zookeeper showering a baby elephant a beetle emerging from the sand a nine banded armadillo searching for food an apartment building with balcony 
asian garden and medieval castle illuminated tower in berlin a wooden house overseeing the lake a crowd of people in a plaza in front of a government building a church interior jewish friends posing with hanukkah menorah in a cabin house a destroyed building after a missile attack in ukraine abandoned building in the woods drone video of an abandoned school building in pripyat ukraine elegant university building architecture and designs of buildings in central london a pancake tower with chocolate syrup and strawberries on top an ancient white building friends hanging out at a coffee house house front door with christmas decorations city night dark building a bird house hanging on a tree branch sacred sculpture in a temple high angle shot of a clock tower modern wooden house interior the interior of an abandoned building opera house overlooking sea a concrete structure near the green trees dome like building in scotland low angle shot of a building tower on hill a miniature house eiffel tower from the seine river low angle footage of an apartment building island with pier and antique building asian historic architecture drone footage of a beautiful mansion mosque in the middle east building a tent and hammock in the forest camping site top view of a high rise building house covered in snow skyscraper at night house in village a casino with people outside the building silhouette of a building a woman climbing a tree house drone view of house near lake during golden hour an under construction concrete house a watch tower by the sea exterior view of arabic style building video of a hotel building red paper lantern decorations hanging outside a building house on seashore aerial footage of the palace of culture and science building in warsaw poland aerial video of stuttgart tv tower in germany aerial view of the highway and building in a city drone shot of a skyscraper san francisco california usa waterfall and house view of the sky through a building drone footage of a 
house on top of the mountain abandoned house in the nature clouds hovering over a mansion light house on the ocean buddhist temple at sunrise people walking by a graveyard near a mosque at sunset view of lifeguard tower on the beach scenic view of a house in the mountains the landscape in front of a government building aerial footage of a building and its surrounding landscape in winter time lapse of a cloudy sky behind a transmission tower blue ocean near the brown castle fog over temple house in countryside top view building under construction turkish flag waving on old tower the georgian building close up shot of a steel structure the atrium and interior design of a multi floor building city view reflected on a glass building aerial view of a luxurious house with pool an unpaved road leading to the house drone footage of a lookout tower in mountain landscape wind turbines on hill behind building time lapse footage of the sun light in front of a small house porch a building built with lots of stairways overcast over house on seashore the view of the sydney opera house from the other side of the harbor candle on a jar and a house figurine on a surface video of a farm and house a dilapidated building made of bricks a view of a unique building from a moving vehicle aerial footage of a tall building in cambodia push in shot of a huge house a beach house built over a seawall protected from the sea waves exotic house surrounded by trees drone video of a house surrounded by tropical vegetation drone footage of a building beside a pond observation tower on hill in forest a tree house in the woods a video of vessel structure during daytime fire in front of illuminated building at night a footage of a wooden house on a wheat field tilt shot of a solar panel below a light tower water tower on the desert freshly baked finger looking cookies video of fake blood in wine glass halloween food art a person slicing a vegetable a serving of pumpkin dish in a plate close up view of 
green leafy vegetable a birthday cake in the plate video of a slice papaya fruit a muffin with a burning candle and a love sign by a ceramic mug a jack o lantern designed cookie baked bread with chocolate a broccoli soup on wooden table a freshly brewed coffee on a pink mug grabbing sourdough neapolitan style pizza slices person cooking mushrooms in frying pan rice grains placed on a reusable cloth bag slices of kiwi fruit grilling a steak on a pan grill close up of bread popping out of a toaster man eating noodle preparing a cocktail drink close up pasta with bacon on plate milk and cinnamon rolls boy getting a dumpling using chopsticks a mother preparing food with her kids man using his phone while eating fresh salmon salad on a plate cutting cucumbers into long thin slices as ingredient for sushi roll a steaming cup of tea by the window a glass filled with beer a kid eating popcorn while watching tv close up shot of fried fish on the plate a man eating a donut person making a vegetarian dish spreading cheese on bagel close up view of a man drinking red wine a couple having breakfast in a restaurant a student eating her sandwich girl peeling a banana red rice in a small bowl pancake with blueberry on the top green apple fruit on white wooden table a man eating a taco by the bar making of a burrito squeezing lemon into salad a chef cutting sushi rolls video of a delicious dessert deep frying a crab on a wok in high fire close up video of a orange juice video of a cooked chicken breast woman holding a pineapple a woman eating a bar of chocolate decorating christmas cookie squeezing a slice of fruit tuna sashimi on a plate a strawberry fruit mixed in an alcoholic drink preparing hot dogs in a grill a woman cutting a tomato an orange fruit cut in half a coconut fruit with drinking straw woman holding a dragon fruit a woman pouring hot beverage on a cup waffles with whipped cream and fruit focus shot of an insect at the bottom of a fruit preparing a healthy broccoli 
dish man eating snack at picnic close up video of a grilled shrimp skewer a woman mixing a smoothie drinks close up video of woman having a bite of jelly businessman drinking whiskey at the bar counter of a hotel lounge cutting an onion with a knife over a wooden chopping board fresh lemonade in bottles grilling a meat on a charcoal grill people enjoying asian cuisine close up footage of a hot dish on a clay pot pork ribs dish waffle with strawberry and syrup for breakfast tofu dish with rose garnish uncooked pork meat egg yolk being dumped over gourmet dish tasty brunch dish close up little boy pretending to eat the watermelon slicing roasted beef close up of a chef adding teriyaki sauce to a dish flat lay mexican dish a person placing an octopus dish on a marble surface close up of tea leaves brewing in a glass kettle adding fresh herbs to soup dish a scoop of roasted coffee beans fresh dim sum set up on a bamboo steam tray for cooking a girl putting ketchup on food at the kitchen cooking on electric stove a woman with a slice of a pie grapes and wine on a wooden board man taking picture of his food hamburger and fries on restaurant table close up video of japanese food a cracker sandwich with cheese filling for snack barista preparing matcha tea close up of onion rings being deep fried people carving a pumpkin people sitting on a sofa a man with a muertos face painting man walking in the dark men in front of their computer editing photos men loading christmas tree on tow truck woman washing the dishes woman adding honey to the cinnamon rolls two women kissing and smiling three women looking at watercolor paintings a family wearing paper bag masks a family posing for the camera a boy covering a rose flower with a dome glass boy sitting on grass petting a dog a girl in her tennis sportswear a girl coloring the cardboard silhouette of the couple during sunset couple dancing with body paint a child playing with water a woman with her child sitting on a couch in the 
living room a group of friend place doing hand gestures of agreement friends having a group selfie friends talking while on the basketball court group of people protesting a group of campers with a cute dog a group of photographers taking pictures at the north western gardens in llandudno north wales a group of students laughing and talking a group of martial artist warming up a person playing golf a person walking on a wet wooden bridge person doing a leg exercise ice hockey athlete on rink a young athlete training in swimming chess player dusting a chessboard baseball player holding his bat a bearded man putting a vinyl record on a vinyl player an orchestra finishes a performance people applauding the performance of the kids band performance at the recording studio father and his children playing jenga game people playing a board game man playing a video game a man video recording the movie in theater man and a woman eating while watching a movie movie crew talking together a director explaining the movie scene man and woman listening to music on car man playing music couple dancing slow dance with sun glare a ballerina practicing in the dance studio father and son holding hands father and daughter talking together a mother and her kids engaged in a video call mother and daughter reading a book together a mother teaching her daughter playing a violin kid in a halloween costume a happy kid playing the ukulele a chef slicing a cucumber chef wearing his gloves properly brother and sister using hammock girl applying sunblock to her brother a girl pushing the chair while her sister is on the chair colleagues talking in office building fighter practice kicking a woman fighter in her cosplay costume an engineer holding blueprints while talking with her colleague a young woman looking at vr controllers with her friend workmates teasing a colleague in the work a male police officer talking on the radio teacher holding a marker while talking teacher writing on her notebook 
a young student attending her online classes a student showing his classmates his wand a male vendor selling fruits a shirtless male climber a sound engineer listening to music female talking to a psychiatrist in a therapy session young female activist posing with flag a man in a hoodie and woman with a red bandana talking to each other and smiling a medium close up of women wearing kimonos a male interviewer listening to a person talking a social worker having a conversation with the foster parents a farm worker harvesting onions worker packing street food worker and client at barber shop elderly man lifting kettlebell mom assisting son in riding a bicycle dad watching her daughter eat young guy with vr headset pregnant woman exercising with trainer a fortune teller talking to a client wizard doing a ritual on a woman a footage of an actor on a movie scene a man holding a best actor trophy a singer of a music band a young singer performing on stage young dancer practicing at home seller showing room to a couple cab driver talking to passenger a policeman talking to the car driver kids celebrating halloween at home little boy helping mother in kitchen video of a indoor green plant a girl arranges a christmas garland hanging by the kitchen cabinet candle burning in dark room couple having fun and goofing around the bedroom girls jumping up and down in the bedroom woman and man in pajamas working from home a muslim family sitting and talking in the living room family enjoying snack time while sitting in the living room woman holding an animal puppet and a little girl playing together at the living room kids playing in the indoor tent young people celebrating new year at the office a woman writing on the sticky note in the office a woman exercising at home over a yoga mat girls preparing easter decorations at home dog on floor in room turning on a fluorescent light inside a room colleagues talking to each other near the office windows a woman recording herself while 
exercising at home music room different kind of tools kept in a utility room sofa beds and other furniture a girl finding her brother reading a book in the bedroom an elegant ceramic plant pot and hanging plant on indoor furniture inside a bedroom interior design of the bar section living room with party decoration firewood burning in dark room a young woman playing the ukulele at home woman painting at home a woman in a locker room video of a bathroom interior the interior design of a jewish synagogue a woman in protective suit disinfecting the kitchen modern minimalist home interior modern interior design of a coffee shop person arranging minimalist furniture aerial shot of interior of the warehouse a room of a manufacturing facility interior of catholic interior design of a restaurant a female model in a changing room looking herself in mirror men walking in the office hallway people sitting in a conference room the interior design of a shopping mall chandeliers in room lucerne railway station interior a female fencer posing in a foggy room a toolbox and a paint roller beside a huge package in a room bedroom in hotel a woman lying in the operating room a chef holding and checking kitchen utensils a couple singing in the shower room together a woman cleaning mess in the living room an empty meeting room with natural light person dancing in a dark room close up on blood in hospital room a couple resting on their home floor a young female staff at courier office a man entering the gym locker room a bored man sitting by the tv at home woman dancing in indoor garden rubble in the interior of an abandoned house indoor farm in a greenhouse man doing handstand in indoor garden an abandoned indoor swimming pool home decorations on top of a cabinet graffiti art on the interior walls of an abandoned mansion indoor wall climbing activity sunlight inside a room teenage girl roller skating at indoor rink home deco with lighted baby in the shower room men enjoying office 
christmas party a bedroom with a brick wall actors prepping in the dressing room kids playing at an indoor playground a person sanitizing an office space using smoke machine mother and daughter choosing clothes at home a woman sitting by the indoor fire pit man standing on the corner of the room while looking around person assembling furniture a family stacking cardboard boxes in a room family having fun in the dining room person disinfecting a room a woman washing strawberries in the kitchen sink modern office waiting room close up view of a person slicing with a kitchen knife boiling coffee on a stove in the kitchen modern equipment used in a home studio interior of a recording studio people working in a call center office band performing at a home concert a group of people watching a concert in a room people packing their furniture young employees in office holding a certificate a criminal inside a dark room handcuffed in a table couple browsing and looking for furniture in the store workspace at home video of a indoor green plant close up view of a plant close up shot of a burning plant plucking leaves from plant a plant on gold pot with glass lid a branch of a tree and a plant a leafless tree close up shot of fern leaf close up video of strawberry plant plant with blooming flowers close up video of flower petals watering yellow plant beautiful flower decoration cannabis flower in a jar a footage of the tree leaves a red leaf plant close up view of a white christmas tree snow pouring on a tree close up shot of white flowers on the tree leaves in the trees daytime a dead tree lying on a grass field tree branches in a flowing river purple flowers with leaves a coconut tree by the house close up on flower in winter bamboo leaves backlit by the sun close up video of a wet flower a man putting a flower in a box dropping flower petals on a wooden bowl a close up shot of gypsophila flower variety of succulent plants on a garden variety of trees and plants in a 
botanical garden forest of deciduous trees a stack of dried leaves burning in a forest tall forest trees on a misty morning close up view of dewdrops on a leaf close up view of white petaled flower removing a pineapple leaf a dragonfly perched on a leaf butterfly pollinating flower person visiting and checking a corn plant woman picking beans from a plant woman plucking mint leaves single tree in the middle of farmland a plant on a soil drone footage of a tree on farm field a tractor harvesting lavender flower people putting christmas ornaments on a christmas tree jack o lantern hanging on a tree tree with halloween decoration flower field near the waterfall truck carrying the tree logs raindrops falling on leaves shot of a palm tree swaying with the wind squirrels on a tree branch person holding a flower a fallen tree trunk tree with golden leaves cherry tree wind blows through leaves of the tree in autumn a leaf on a glass the long trunks of tall trees in the forest trees in the forest during sunny day close up video of tree bark reflection of tree branches trunks of many trees in the forest tree leaves providing shades from the sun leaves swaying in the wind low angle shot of baobab tree bare trees in forest a plant surrounded by fallen leaves a couple preparing food and pruning a plant a man cutting a tree bark oranges on a tree branch plant connected on the stones video of a sawmill machine cutting tree log women drying flower petals macro view of an agave plant a video of a person tying a plant on a string green moss in forest nature coconut tree near sea under blue sky the canopy of a coconut tree a man leaning on a tree at the beach a full grown plant on a pot candle wax dripping on flower petals close up of leaves in autumn a woman opening a book with a flower inside a man holding leaves looking at the camera a shadow of a swaying plant a tree and concrete structure under a blue and cloudy sky trimming excess leaves on a potted plant the changing color of 
the tree leaves during autumn season a gooseberry tree swayed by the wind forest trees and a medieval castle at sunset woman cut down tree an old oak tree in a park across the street from a hotel wild flowers growing in a forest ground a mossy fountain and green plants in a botanical garden mansion with beautiful garden ants on a dragon fruit flower scenery of desert landscape landscape agriculture farm tractor burning slash piles in the forest graveyard at sunset view of a jack o lantern with pumpkins in a smoky garden sun view through a spider web view of the sea from an abandoned building close up view of a full moon close up view of lighted candles close up view of swaying white flowers and leaves scenery of a relaxing beach selective focus video of grass during sunny day aerial view of brown dry landscape fireworks display in the sky at night a bonfire near river mountain view waterfalls in between mountain a picturesque view of nature exotic view of a riverfront city tall trees in the forest under the clear sky snow on branches in forest stream in the nature an airplane flying above the sea of clouds scenic video of sunset view of houses with bush fence under a blue and cloudy sky scenic view from wooden pathway scenic view of a tropical beach drone footage of waves crashing on beach shore a scenic view of the golden hour at norway time lapse video of foggy mountain forest brown mountain during fall season video of ocean during daytime boat sailing in the ocean top view of yachts beautiful scenery of flowing waterfalls and river wild ducks paddling on the lake surface a relaxing scenery of beach view under cloudy sky natural rock formations on beach under cloudy sky a palm tree against blue sky video of sailboat on a lake during sunset aerial view of snow piles time lapse of a sunset sky in the countryside aerial footage of a statue time lapse video of a farm during sunset clouds formation in the sky at sunset aerial shot of a village drone shot of a 
beautiful sunrise at the mountains time lapse video of foggy morning during sunrise sun shining between tree leaves at sunrise video of lake during dawn vehicles traveling on roadway under cloudy sky view of golden domed church a monument under the blue sky firecrackers in the sky view of fruit signage in the farm a dark clouds over shadowing the full moon view of the amazon river a big river swamp in a dense forest a blooming cherry blossom tree under a blue sky with white clouds a river waterfall cascading down the plunge basin flooded landscape with palm trees a blurry waterfall background waterfall in the mountains aerial footage of a city at night pond by small waterfall in forest aerial view of farmlands at the bay of lake rice terraces in the countryside a highway built across an agricultural area in the countryside gloomy morning in the countryside drone shot of an abandoned coliseum on a snowy mountain top boat sailing in the middle of ocean drone shot of the grass field natural landscape of mountain and sea with islets developed into a community aerial view of zaporizhia in ukraine aerial footage of a herd an aerial footage of a red sky grass and plants growing in the remains of an abandoned house view from hill on city aerial view on orthodox church aerial view of bay in croatia a footage of a frozen river overlooking view of a city at daylight view outside the cemetery clear sky with moon over meadow clouds over railway aerial footage of moving vehicles on the road at night aerial view of town and park top view of skyscrapers top view of the empire state building in manhattan top view of the central park in new york city sheep running in a grass field clear sky over factory smoke and fire in birds eye view view of a pathway with snow melting on its side ferry under bridge on river near city in malaysia mountain slopes covered in green vegetation panoramic view of a town surrounded by snow covered mountains aerial view of a palace top view of vehicles 
driving on the intersection a graveyard by a church in a mountain landscape a modern railway station in malaysia use for public transportation drone footage of amsterdam metro station train arriving at a station red vehicle driving on field close up view of flashing emergency vehicle lighting vehicle with fertilizer on field a highway built across an agricultural area in the countryside drone footage of motorcycles driving on country road between agricultural fields a road in the woods under fog footage of a car driving through a wheat field vehicle stops for an ambulance passing through city traffic emergency vehicle parked outside the casino zombies attacking a woman and a boy inside a car woman seating inside the car while chewing video of passengers riding a double decker bus during night traffic in london street at night elderly couple checking engine of automobile a green vintage automobile with an open hood parked in a parking area close up of a prototype automobile with exposed engine on the back seat of the car aerial view of road in forest train departing from station aerial view of a train passing by a bridge video of a train tracks video footage of a subway video of blinking traffic lights couple walking out on the subway time lapse of a subway tunnel monitor board inside the subway metro train at night zoom in video of a tram passing by city young man using laptop in the tram man reading a book at bus stop close up shot of a moving taxi night travel in london street on a public bus red bus in a rainy city flow of traffic in the city close up shot of a yellow taxi turning left two women calling for a taxi drone view of an illuminated bridge across a river policeman in police car talking on radio airplane taking off at night view through window in airplane an airplane in the sky helicopter landing on the street a pilot getting out of a helicopter a helicopter flying under blue sky boat sailing in the middle of the ocean girl playing with a toy boat 
silhouette of a boat on sea during golden hour a boat travelling around the lake road on mountain ridge ship sailing on danube river slow motion video of a ship water trail in the sea drone footage of a wreck ship on shore a white yacht traveling on a river and passing under the bridge female teenagers drinking champagne in the yacht video of yacht sailing in the ocean red combine harvester on road on field a woman sitting on a bicycle while using a mobile phone a woman sitting on a motorcycle looking around three teenagers fixing a bicycle a woman in a halloween costume posing on a motorcycle a parked motorcycle on a foggy roadside cable car near sea shore a truck travelling in the road footage of the road without any traffic a road sign love padlocks on a bridge camera moving at highway construction site vehicles driving on highway a motorbike on highway at timelapse mode point of view of a car driving through a tunnel time lapse of heavy traffic on an avenue ferry boat on city canal black vintage car in museum a zigzag road across a forest people crossing the road video of a kayak boat in a river a person paddling a wooden boat in a lake a car charging in the parking area cars parked on the road footage of the street with people and vehicle passing by in the rain traffic on busy city street a woman getting out of the car to walk with their dog yacht sailing through the ocean people in queue to military ship man wearing motorcycle helmet looking at the camera empty seats in the bus empty boat on the water cargo train traveling on the mountainside cruise ship in harbor counting down at traffic lights pressing the car ignition fire truck driving on the road a footage of a broken bicycle drone footage of an ambulance on the road slow motion footage of a racing car ship sailing on sea against sunset big cargo ship passing on the shore back view of man and woman walking on unpaved road ================================================ FILE: 
Open-Sora/assets/texts/VBench/all_dimension.txt ================================================ In a still frame, a stop sign a toilet, frozen in time a laptop, frozen in time A tranquil tableau of alley A tranquil tableau of bar A tranquil tableau of barn A tranquil tableau of bathroom A tranquil tableau of bedroom A tranquil tableau of cliff In a still frame, courtyard In a still frame, gas station A tranquil tableau of house indoor gymnasium, frozen in time A tranquil tableau of indoor library A tranquil tableau of kitchen A tranquil tableau of palace In a still frame, parking lot In a still frame, phone booth A tranquil tableau of restaurant A tranquil tableau of tower A tranquil tableau of a bowl A tranquil tableau of an apple A tranquil tableau of a bench A tranquil tableau of a bed A tranquil tableau of a chair A tranquil tableau of a cup A tranquil tableau of a dining table In a still frame, a pear A tranquil tableau of a bunch of grapes A tranquil tableau of a bowl on the kitchen counter A tranquil tableau of a beautiful, handcrafted ceramic bowl A tranquil tableau of an antique bowl A tranquil tableau of an exquisite mahogany dining table A tranquil tableau of a wooden bench in the park A tranquil tableau of a beautiful wrought-iron bench surrounded by blooming flowers In a still frame, a park bench with a view of the lake A tranquil tableau of a vintage rocking chair was placed on the porch A tranquil tableau of the jail cell was small and dimly lit, with cold, steel bars A tranquil tableau of the phone booth was tucked away in a quiet alley a dilapidated phone booth stood as a relic of a bygone era on the sidewalk, frozen in time A tranquil tableau of the old red barn stood weathered and iconic against the backdrop of the countryside A tranquil tableau of a picturesque barn was painted a warm shade of red and nestled in a picturesque meadow In a still frame, within the desolate desert, an oasis unfolded, characterized by the stoic presence of palm 
trees and a motionless, glassy pool of water In a still frame, the Parthenon's majestic Doric columns stand in serene solitude atop the Acropolis, framed by the tranquil Athenian landscape In a still frame, the Temple of Hephaestus, with its timeless Doric grace, stands stoically against the backdrop of a quiet Athens In a still frame, the ornate Victorian streetlamp stands solemnly, adorned with intricate ironwork and stained glass panels A tranquil tableau of the Stonehenge presented itself as an enigmatic puzzle, each colossal stone meticulously placed against the backdrop of tranquility In a still frame, in the vast desert, an oasis nestled among dunes, featuring tall palm trees and an air of serenity static view on a desert scene with an oasis, palm trees, and a clear, calm pool of water A tranquil tableau of an ornate Victorian streetlamp standing on a cobblestone street corner, illuminating the empty night A tranquil tableau of a tranquil lakeside cabin nestled among tall pines, its reflection mirrored perfectly in the calm water In a still frame, a vintage gas lantern, adorned with intricate details, gracing a historic cobblestone square In a still frame, a tranquil Japanese tea ceremony room, with tatami mats, a delicate tea set, and a bonsai tree in the corner A tranquil tableau of the Parthenon stands resolute in its classical elegance, a timeless symbol of Athens' cultural legacy A tranquil tableau of in the heart of Plaka, the neoclassical architecture of the old city harmonizes with the ancient ruins A tranquil tableau of in the desolate beauty of the American Southwest, Chaco Canyon's ancient ruins whispered tales of an enigmatic civilization that once thrived amidst the arid landscapes A tranquil tableau of at the edge of the Arabian Desert, the ancient city of Petra beckoned with its enigmatic rock-carved façades In a still frame, amidst the cobblestone streets, an Art Nouveau lamppost stood tall A tranquil tableau of in the quaint village square, 
a traditional wrought-iron streetlamp featured delicate filigree patterns and amber-hued glass panels A tranquil tableau of the lampposts were adorned with Art Deco motifs, their geometric shapes and frosted glass creating a sense of vintage glamour In a still frame, in the picturesque square, a Gothic-style lamppost adorned with intricate stone carvings added a touch of medieval charm to the setting In a still frame, in the heart of the old city, a row of ornate lantern-style streetlamps bathed the narrow alleyway in a warm, welcoming light A tranquil tableau of in the heart of the Utah desert, a massive sandstone arch spanned the horizon A tranquil tableau of in the Arizona desert, a massive stone bridge arched across a rugged canyon A tranquil tableau of in the corner of the minimalist tea room, a bonsai tree added a touch of nature's beauty to the otherwise simple and elegant space In a still frame, amidst the hushed ambiance of the traditional tea room, a meticulously arranged tea set awaited, with porcelain cups, a bamboo whisk In a still frame, nestled in the Zen garden, a rustic teahouse featured tatami seating and a traditional charcoal brazier A tranquil tableau of a country estate's library featured elegant wooden shelves A tranquil tableau of beneath the shade of a solitary oak tree, an old wooden park bench sat patiently A tranquil tableau of beside a tranquil pond, a weeping willow tree draped its branches gracefully over the water's surface, creating a serene tableau of reflection and calm A tranquil tableau of in the Zen garden, a perfectly raked gravel path led to a serene rock garden In a still frame, a tranquil pond was fringed by weeping cherry trees, their blossoms drifting lazily onto the glassy surface In a still frame, within the historic library's reading room, rows of antique leather chairs and mahogany tables offered a serene haven for literary contemplation A tranquil tableau of a peaceful orchid garden showcased a variety of delicate 
blooms A tranquil tableau of in the serene courtyard, a centuries-old stone well stood as a symbol of a bygone era, its mossy stones bearing witness to the passage of time a bird and a cat a cat and a dog a dog and a horse a horse and a sheep a sheep and a cow a cow and an elephant an elephant and a bear a bear and a zebra a zebra and a giraffe a giraffe and a bird a chair and a couch a couch and a potted plant a potted plant and a tv a tv and a laptop a laptop and a remote a remote and a keyboard a keyboard and a cell phone a cell phone and a book a book and a clock a clock and a backpack a backpack and an umbrella an umbrella and a handbag a handbag and a tie a tie and a suitcase a suitcase and a vase a vase and scissors scissors and a teddy bear a teddy bear and a frisbee a frisbee and skis skis and a snowboard a snowboard and a sports ball a sports ball and a kite a kite and a baseball bat a baseball bat and a baseball glove a baseball glove and a skateboard a skateboard and a surfboard a surfboard and a tennis racket a tennis racket and a bottle a bottle and a chair an airplane and a train a train and a boat a boat and an airplane a bicycle and a car a car and a motorcycle a motorcycle and a bus a bus and a traffic light a traffic light and a fire hydrant a fire hydrant and a stop sign a stop sign and a parking meter a parking meter and a truck a truck and a bicycle a toilet and a hair drier a hair drier and a toothbrush a toothbrush and a sink a sink and a toilet a wine glass and a chair a cup and a couch a fork and a potted plant a knife and a tv a spoon and a laptop a bowl and a remote a banana and a keyboard an apple and a cell phone a sandwich and a book an orange and a clock broccoli and a backpack a carrot and an umbrella a hot dog and a handbag a pizza and a tie a donut and a suitcase a cake and a vase an oven and scissors a toaster and a teddy bear a microwave and a frisbee a refrigerator and skis a bicycle and an airplane a car and a train a 
motorcycle and a boat a person and a toilet a person and a hair drier a person and a toothbrush a person and a sink A person is riding a bike A person is marching A person is roller skating A person is tasting beer A person is clapping A person is drawing A person is petting animal (not cat) A person is eating watermelon A person is playing harp A person is wrestling A person is riding scooter A person is sweeping floor A person is skateboarding A person is dunking basketball A person is playing flute A person is stretching leg A person is tying tie A person is skydiving A person is shooting goal (soccer) A person is playing piano A person is finger snapping A person is canoeing or kayaking A person is laughing A person is digging A person is clay pottery making A person is shooting basketball A person is bending back A person is shaking hands A person is bandaging A person is push up A person is catching or throwing frisbee A person is playing trumpet A person is flying kite A person is filling eyebrows A person is shuffling cards A person is folding clothes A person is smoking A person is tai chi A person is squat A person is playing controller A person is throwing axe A person is giving or receiving award A person is air drumming A person is taking a shower A person is planting trees A person is sharpening knives A person is robot dancing A person is rock climbing A person is hula hooping A person is writing A person is bungee jumping A person is pushing cart A person is cleaning windows A person is cutting watermelon A person is cheerleading A person is washing hands A person is ironing A person is cutting nails A person is hugging A person is trimming or shaving beard A person is jogging A person is making bed A person is washing dishes A person is grooming dog A person is doing laundry A person is knitting A person is reading book A person is baby waking up A person is massaging legs A person is brushing teeth A person is crawling baby A person is 
motorcycling A person is driving car A person is sticking tongue out A person is shaking head A person is sword fighting A person is doing aerobics A person is strumming guitar A person is riding or walking with horse A person is archery A person is catching or throwing baseball A person is playing chess A person is rock scissors paper A person is using computer A person is arranging flowers A person is bending metal A person is ice skating A person is climbing a rope A person is crying A person is dancing ballet A person is getting a haircut A person is running on treadmill A person is kissing A person is counting money A person is barbequing A person is peeling apples A person is milking cow A person is shining shoes A person is making snowman A person is sailing a person swimming in ocean a person giving a presentation to a room full of colleagues a person washing the dishes a person eating a burger a person walking in the snowstorm a person drinking coffee in a cafe a person playing guitar a bicycle leaning against a tree a bicycle gliding through a snowy field a bicycle slowing down to stop a bicycle accelerating to gain speed a car stuck in traffic during rush hour a car turning a corner a car slowing down to stop a car accelerating to gain speed a motorcycle cruising along a coastal highway a motorcycle turning a corner a motorcycle slowing down to stop a motorcycle gliding through a snowy field a motorcycle accelerating to gain speed an airplane soaring through a clear blue sky an airplane taking off an airplane landing smoothly on a runway an airplane accelerating to gain speed a bus turning a corner a bus stuck in traffic during rush hour a bus accelerating to gain speed a train speeding down the tracks a train crossing over a tall bridge a train accelerating to gain speed a truck turning a corner a truck anchored in a tranquil bay a truck stuck in traffic during rush hour a truck slowing down to stop a truck accelerating to gain speed a boat sailing 
smoothly on a calm lake a boat slowing down to stop a boat accelerating to gain speed a bird soaring gracefully in the sky a bird building a nest from twigs and leaves a bird flying over a snowy forest a cat grooming itself meticulously with its tongue a cat playing in park a cat drinking water a cat running happily a dog enjoying a peaceful walk a dog playing in park a dog drinking water a dog running happily a horse bending down to drink water from a river a horse galloping across an open field a horse taking a peaceful walk a horse running to join a herd of its kind a sheep bending down to drink water from a river a sheep taking a peaceful walk a sheep running to join a herd of its kind a cow bending down to drink water from a river a cow chewing cud while resting in a tranquil barn a cow running to join a herd of its kind an elephant spraying itself with water using its trunk to cool down an elephant taking a peaceful walk an elephant running to join a herd of its kind a bear catching a salmon in its powerful jaws a bear sniffing the air for scents of food a bear climbing a tree a bear hunting for prey a zebra bending down to drink water from a river a zebra running to join a herd of its kind a zebra taking a peaceful walk a giraffe bending down to drink water from a river a giraffe taking a peaceful walk a giraffe running to join a herd of its kind a person a bicycle a car a motorcycle an airplane a bus a train a truck a boat a traffic light a fire hydrant a stop sign a parking meter a bench a bird a cat a dog a horse a sheep a cow an elephant a bear a zebra a giraffe a backpack an umbrella a handbag a tie a suitcase a frisbee skis a snowboard a sports ball a kite a baseball bat a baseball glove a skateboard a surfboard a tennis racket a bottle a wine glass a cup a fork a knife a spoon a bowl a banana an apple a sandwich an orange broccoli a carrot a hot dog a pizza a donut a cake a chair a couch a potted plant a bed a dining table a toilet a tv a laptop a 
remote a keyboard a cell phone a microwave an oven a toaster a sink a refrigerator a book a clock a vase scissors a teddy bear a hair drier a toothbrush a red bicycle a green bicycle a blue bicycle a yellow bicycle an orange bicycle a purple bicycle a pink bicycle a black bicycle a white bicycle a red car a green car a blue car a yellow car an orange car a purple car a pink car a black car a white car a red bird a green bird a blue bird a yellow bird an orange bird a purple bird a pink bird a black bird a white bird a black cat a white cat an orange cat a yellow cat a red umbrella a green umbrella a blue umbrella a yellow umbrella an orange umbrella a purple umbrella a pink umbrella a black umbrella a white umbrella a red suitcase a green suitcase a blue suitcase a yellow suitcase an orange suitcase a purple suitcase a pink suitcase a black suitcase a white suitcase a red bowl a green bowl a blue bowl a yellow bowl an orange bowl a purple bowl a pink bowl a black bowl a white bowl a red chair a green chair a blue chair a yellow chair an orange chair a purple chair a pink chair a black chair a white chair a red clock a green clock a blue clock a yellow clock an orange clock a purple clock a pink clock a black clock a white clock a red vase a green vase a blue vase a yellow vase an orange vase a purple vase a pink vase a black vase a white vase A beautiful coastal beach in spring, waves lapping on sand, Van Gogh style A beautiful coastal beach in spring, waves lapping on sand, oil painting A beautiful coastal beach in spring, waves lapping on sand by Hokusai, in the style of Ukiyo A beautiful coastal beach in spring, waves lapping on sand, black and white A beautiful coastal beach in spring, waves lapping on sand, pixel art A beautiful coastal beach in spring, waves lapping on sand, in cyberpunk style A beautiful coastal beach in spring, waves lapping on sand, animated style A beautiful coastal beach in spring, waves lapping on sand, watercolor painting A beautiful 
coastal beach in spring, waves lapping on sand, surrealism style The bund Shanghai, Van Gogh style The bund Shanghai, oil painting The bund Shanghai by Hokusai, in the style of Ukiyo The bund Shanghai, black and white The bund Shanghai, pixel art The bund Shanghai, in cyberpunk style The bund Shanghai, animated style The bund Shanghai, watercolor painting The bund Shanghai, surrealism style a shark is swimming in the ocean, Van Gogh style a shark is swimming in the ocean, oil painting a shark is swimming in the ocean by Hokusai, in the style of Ukiyo a shark is swimming in the ocean, black and white a shark is swimming in the ocean, pixel art a shark is swimming in the ocean, in cyberpunk style a shark is swimming in the ocean, animated style a shark is swimming in the ocean, watercolor painting a shark is swimming in the ocean, surrealism style A panda drinking coffee in a cafe in Paris, Van Gogh style A panda drinking coffee in a cafe in Paris, oil painting A panda drinking coffee in a cafe in Paris by Hokusai, in the style of Ukiyo A panda drinking coffee in a cafe in Paris, black and white A panda drinking coffee in a cafe in Paris, pixel art A panda drinking coffee in a cafe in Paris, in cyberpunk style A panda drinking coffee in a cafe in Paris, animated style A panda drinking coffee in a cafe in Paris, watercolor painting A panda drinking coffee in a cafe in Paris, surrealism style A cute happy Corgi playing in park, sunset, Van Gogh style A cute happy Corgi playing in park, sunset, oil painting A cute happy Corgi playing in park, sunset by Hokusai, in the style of Ukiyo A cute happy Corgi playing in park, sunset, black and white A cute happy Corgi playing in park, sunset, pixel art A cute happy Corgi playing in park, sunset, in cyberpunk style A cute happy Corgi playing in park, sunset, animated style A cute happy Corgi playing in park, sunset, watercolor painting A cute happy Corgi playing in park, sunset, surrealism style Gwen Stacy reading a book, Van 
Gogh style Gwen Stacy reading a book, oil painting Gwen Stacy reading a book by Hokusai, in the style of Ukiyo Gwen Stacy reading a book, black and white Gwen Stacy reading a book, pixel art Gwen Stacy reading a book, in cyberpunk style Gwen Stacy reading a book, animated style Gwen Stacy reading a book, watercolor painting Gwen Stacy reading a book, surrealism style A boat sailing leisurely along the Seine River with the Eiffel Tower in background, Van Gogh style A boat sailing leisurely along the Seine River with the Eiffel Tower in background, oil painting A boat sailing leisurely along the Seine River with the Eiffel Tower in background by Hokusai, in the style of Ukiyo A boat sailing leisurely along the Seine River with the Eiffel Tower in background, black and white A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pixel art A boat sailing leisurely along the Seine River with the Eiffel Tower in background, in cyberpunk style A boat sailing leisurely along the Seine River with the Eiffel Tower in background, animated style A boat sailing leisurely along the Seine River with the Eiffel Tower in background, watercolor painting A boat sailing leisurely along the Seine River with the Eiffel Tower in background, surrealism style A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, Van Gogh style A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, oil painting A couple in formal evening wear going home get caught in a heavy downpour with umbrellas by Hokusai, in the style of Ukiyo A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, black and white A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pixel art A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, in cyberpunk style A couple in formal evening wear going home get caught in a heavy 
downpour with umbrellas, animated style A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, watercolor painting A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, surrealism style An astronaut flying in space, Van Gogh style An astronaut flying in space, oil painting An astronaut flying in space by Hokusai, in the style of Ukiyo An astronaut flying in space, black and white An astronaut flying in space, pixel art An astronaut flying in space, in cyberpunk style An astronaut flying in space, animated style An astronaut flying in space, watercolor painting An astronaut flying in space, surrealism style Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, Van Gogh style Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, oil painting Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks by Hokusai, in the style of Ukiyo Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, black and white Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pixel art Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, in cyberpunk style Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. 
the canyons twist and bend through the high elevated mountain peaks, animated style Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, watercolor painting Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, surrealism style A beautiful coastal beach in spring, waves lapping on sand, in super slow motion A beautiful coastal beach in spring, waves lapping on sand, zoom in A beautiful coastal beach in spring, waves lapping on sand, zoom out A beautiful coastal beach in spring, waves lapping on sand, pan left A beautiful coastal beach in spring, waves lapping on sand, pan right A beautiful coastal beach in spring, waves lapping on sand, tilt up A beautiful coastal beach in spring, waves lapping on sand, tilt down A beautiful coastal beach in spring, waves lapping on sand, with an intense shaking effect A beautiful coastal beach in spring, waves lapping on sand, featuring a steady and smooth perspective A beautiful coastal beach in spring, waves lapping on sand, racking focus The bund Shanghai, in super slow motion The bund Shanghai, zoom in The bund Shanghai, zoom out The bund Shanghai, pan left The bund Shanghai, pan right The bund Shanghai, tilt up The bund Shanghai, tilt down The bund Shanghai, with an intense shaking effect The bund Shanghai, featuring a steady and smooth perspective The bund Shanghai, racking focus a shark is swimming in the ocean, in super slow motion a shark is swimming in the ocean, zoom in a shark is swimming in the ocean, zoom out a shark is swimming in the ocean, pan left a shark is swimming in the ocean, pan right a shark is swimming in the ocean, tilt up a shark is swimming in the ocean, tilt down a shark is swimming in the ocean, with an intense shaking effect a shark is swimming in the ocean, featuring a 
steady and smooth perspective a shark is swimming in the ocean, racking focus A panda drinking coffee in a cafe in Paris, in super slow motion A panda drinking coffee in a cafe in Paris, zoom in A panda drinking coffee in a cafe in Paris, zoom out A panda drinking coffee in a cafe in Paris, pan left A panda drinking coffee in a cafe in Paris, pan right A panda drinking coffee in a cafe in Paris, tilt up A panda drinking coffee in a cafe in Paris, tilt down A panda drinking coffee in a cafe in Paris, with an intense shaking effect A panda drinking coffee in a cafe in Paris, featuring a steady and smooth perspective A panda drinking coffee in a cafe in Paris, racking focus A cute happy Corgi playing in park, sunset, in super slow motion A cute happy Corgi playing in park, sunset, zoom in A cute happy Corgi playing in park, sunset, zoom out A cute happy Corgi playing in park, sunset, pan left A cute happy Corgi playing in park, sunset, pan right A cute happy Corgi playing in park, sunset, tilt up A cute happy Corgi playing in park, sunset, tilt down A cute happy Corgi playing in park, sunset, with an intense shaking effect A cute happy Corgi playing in park, sunset, featuring a steady and smooth perspective A cute happy Corgi playing in park, sunset, racking focus Gwen Stacy reading a book, in super slow motion Gwen Stacy reading a book, zoom in Gwen Stacy reading a book, zoom out Gwen Stacy reading a book, pan left Gwen Stacy reading a book, pan right Gwen Stacy reading a book, tilt up Gwen Stacy reading a book, tilt down Gwen Stacy reading a book, with an intense shaking effect Gwen Stacy reading a book, featuring a steady and smooth perspective Gwen Stacy reading a book, racking focus A boat sailing leisurely along the Seine River with the Eiffel Tower in background, in super slow motion A boat sailing leisurely along the Seine River with the Eiffel Tower in background, zoom in A boat sailing leisurely along the Seine River with the Eiffel Tower in background, zoom 
out A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pan left A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pan right A boat sailing leisurely along the Seine River with the Eiffel Tower in background, tilt up A boat sailing leisurely along the Seine River with the Eiffel Tower in background, tilt down A boat sailing leisurely along the Seine River with the Eiffel Tower in background, with an intense shaking effect A boat sailing leisurely along the Seine River with the Eiffel Tower in background, featuring a steady and smooth perspective A boat sailing leisurely along the Seine River with the Eiffel Tower in background, racking focus A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, in super slow motion A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, zoom in A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, zoom out A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pan left A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pan right A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, tilt up A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, tilt down A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, with an intense shaking effect A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, featuring a steady and smooth perspective A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, racking focus An astronaut flying in space, in super slow motion An astronaut flying in space, zoom in An astronaut flying in space, zoom out An astronaut flying in space, pan left An astronaut flying in space, pan right An astronaut 
flying in space, tilt up An astronaut flying in space, tilt down An astronaut flying in space, with an intense shaking effect An astronaut flying in space, featuring a steady and smooth perspective An astronaut flying in space, racking focus Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, in super slow motion Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, zoom in Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, zoom out Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pan left Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pan right Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, tilt up Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, tilt down Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, with an intense shaking effect Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, featuring a steady and smooth perspective Snow rocky mountains peaks canyon. 
snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, racking focus Close up of grapes on a rotating table. Turtle swimming in ocean. A storm trooper vacuuming the beach. A panda standing on a surfboard in the ocean in sunset. An astronaut feeding ducks on a sunny afternoon, reflection from the water. Two pandas discussing an academic paper. Sunset time lapse at the beach with moving clouds and colors in the sky. A fat rabbit wearing a purple robe walking through a fantasy landscape. A koala bear playing piano in the forest. An astronaut flying in space. Fireworks. An animated painting of fluffy white clouds moving in sky. Flying through fantasy landscapes. A bigfoot walking in the snowstorm. A squirrel eating a burger. A cat wearing sunglasses and working as a lifeguard at a pool. Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks. Splash of turquoise water in extreme slow motion, alpha channel included. an ice cream is melting on the table. a drone flying over a snowy forest. a shark is swimming in the ocean. Aerial panoramic video from a drone of a fantasy land. a teddy bear is swimming in the ocean. time lapse of sunrise on mars. golden fish swimming in the ocean. An artist brush painting on a canvas close up. A drone view of celebration with Christmas tree and fireworks, starry sky - background. happy dog wearing a yellow turtleneck, studio, portrait, facing camera, dark background Origami dancers in white paper, 3D render, on white background, studio shot, dancing modern dance. Campfire at night in a snowy forest with starry sky in the background. a fantasy landscape A 3D model of a 1800s victorian house. this is how I do makeup in the morning. A raccoon that looks like a turtle, digital art. Robot dancing in Times Square. Busy freeway at night. 
Balloon full of water exploding in extreme slow motion. An astronaut is riding a horse in the space in a photorealistic style. Macro slo-mo. Slow motion cropped closeup of roasted coffee beans falling into an empty bowl. Sewing machine, old sewing machine working. Motion colour drop in water, ink swirling in water, colourful ink in water, abstraction fancy dream cloud of ink. Few big purple plums rotating on the turntable. water drops appear on the skin during rotation. isolated on the white background. close-up. macro. Vampire makeup face of beautiful girl, red contact lenses. Ashtray full of butts on table, smoke flowing on black background, close-up Pacific coast, carmel by the sea ocean and waves. A teddy bear is playing drum kit in NYC Times Square. A corgi is playing drum kit. An Iron man is playing the electronic guitar, high electronic guitar. A raccoon is playing the electronic guitar. A boat sailing leisurely along the Seine River with the Eiffel Tower in background by Vincent van Gogh A corgi's head depicted as an explosion of a nebula A fantasy landscape A future where humans have achieved teleportation technology A jellyfish floating through the ocean, with bioluminescent tentacles A Mars rover moving on Mars A panda drinking coffee in a cafe in Paris A space shuttle launching into orbit, with flames and smoke billowing out from the engines A steam train moving on a mountainside A super cool giant robot in Cyberpunk Beijing A tropical beach at sunrise, with palm trees and crystal-clear water in the foreground Cinematic shot of Van Gogh's selfie, Van Gogh style Gwen Stacy reading a book Iron Man flying in the sky The bund Shanghai, oil painting Yoda playing guitar on the stage A beautiful coastal beach in spring, waves lapping on sand by Hokusai, in the style of Ukiyo A beautiful coastal beach in spring, waves lapping on sand by Vincent van Gogh A boat sailing leisurely along the Seine River with the Eiffel Tower in background A car moving slowly on an 
empty street, rainy evening A cat eating food out of a bowl A cat wearing sunglasses at a pool A confused panda in calculus class A cute fluffy panda eating Chinese food in a restaurant A cute happy Corgi playing in park, sunset A cute raccoon playing guitar in a boat on the ocean A happy fuzzy panda playing guitar nearby a campfire, snow mountain in the background A lightning striking atop of eiffel tower, dark clouds in the sky A modern art museum, with colorful paintings A panda cooking in the kitchen A panda playing on a swing set A polar bear is playing guitar A raccoon dressed in suit playing the trumpet, stage background A robot DJ is playing the turntable, in heavy raining futuristic tokyo rooftop cyberpunk night, sci-fi, fantasy A shark swimming in clear Caribbean ocean A super robot protecting city A teddy bear washing the dishes An epic tornado attacking above a glowing city at night, the tornado is made of smoke An oil painting of a couple in formal evening wear going home get caught in a heavy downpour with umbrellas Clown fish swimming through the coral reef Hyper-realistic spaceship landing on Mars The bund Shanghai, vibrant color Vincent van Gogh is painting in the room Yellow flowers swing in the wind alley amusement park aquarium arch art gallery bathroom bakery shop ballroom bar barn basement beach bedroom bridge botanical garden cafeteria campsite campus carrousel castle cemetery classroom cliff crosswalk construction site corridor courtyard desert downtown driveway farm food court football field forest road fountain gas station glacier golf course indoor gymnasium harbor highway hospital house iceberg industrial area jail cell junkyard kitchen indoor library lighthouse laboratory mansion marsh mountain indoor movie theater indoor museum music studio nursery ocean office palace parking lot pharmacy phone booth raceway restaurant river science museum shower ski slope sky skyscraper baseball stadium staircase street supermarket indoor swimming 
pool tower outdoor track train railway train station platform underwater coral reef valley volcano waterfall windmill a bicycle on the left of a car, front view a car on the right of a motorcycle, front view a motorcycle on the left of a bus, front view a bus on the right of a traffic light, front view a traffic light on the left of a fire hydrant, front view a fire hydrant on the right of a stop sign, front view a stop sign on the left of a parking meter, front view a parking meter on the right of a bench, front view a bench on the left of a truck, front view a truck on the right of a bicycle, front view a bird on the left of a cat, front view a cat on the right of a dog, front view a dog on the left of a horse, front view a horse on the right of a sheep, front view a sheep on the left of a cow, front view a cow on the right of an elephant, front view an elephant on the left of a bear, front view a bear on the right of a zebra, front view a zebra on the left of a giraffe, front view a giraffe on the right of a bird, front view a bottle on the left of a wine glass, front view a wine glass on the right of a cup, front view a cup on the left of a fork, front view a fork on the right of a knife, front view a knife on the left of a spoon, front view a spoon on the right of a bowl, front view a bowl on the left of a bottle, front view a potted plant on the left of a remote, front view a remote on the right of a clock, front view a clock on the left of a vase, front view a vase on the right of scissors, front view scissors on the left of a teddy bear, front view a teddy bear on the right of a potted plant, front view a frisbee on the left of a sports ball, front view a sports ball on the right of a baseball bat, front view a baseball bat on the left of a baseball glove, front view a baseball glove on the right of a tennis racket, front view a tennis racket on the left of a frisbee, front view a toilet on the left of a hair drier, front view a hair drier on the right of a 
toothbrush, front view a toothbrush on the left of a sink, front view a sink on the right of a toilet, front view a chair on the left of a couch, front view a couch on the right of a bed, front view a bed on the left of a tv, front view a tv on the right of a dining table, front view a dining table on the left of a chair, front view an airplane on the left of a train, front view a train on the right of a boat, front view a boat on the left of an airplane, front view an oven on the top of a toaster, front view an oven on the bottom of a toaster, front view a toaster on the top of a microwave, front view a toaster on the bottom of a microwave, front view a microwave on the top of an oven, front view a microwave on the bottom of an oven, front view a banana on the top of an apple, front view a banana on the bottom of an apple, front view an apple on the top of a sandwich, front view an apple on the bottom of a sandwich, front view a sandwich on the top of an orange, front view a sandwich on the bottom of an orange, front view an orange on the top of a carrot, front view an orange on the bottom of a carrot, front view a carrot on the top of a hot dog, front view a carrot on the bottom of a hot dog, front view a hot dog on the top of a pizza, front view a hot dog on the bottom of a pizza, front view a pizza on the top of a donut, front view a pizza on the bottom of a donut, front view a donut on the top of broccoli, front view a donut on the bottom of broccoli, front view broccoli on the top of a banana, front view broccoli on the bottom of a banana, front view skis on the top of a snowboard, front view skis on the bottom of a snowboard, front view a snowboard on the top of a kite, front view a snowboard on the bottom of a kite, front view a kite on the top of a skateboard, front view a kite on the bottom of a skateboard, front view a skateboard on the top of a surfboard, front view a skateboard on the bottom of a surfboard, front view a surfboard on the top of skis, 
front view a surfboard on the bottom of skis, front view ================================================ FILE: Open-Sora/assets/texts/VBench/all_i2v.txt ================================================ a close up of a blue and orange liquid{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close up of a blue and orange liquid.jpg", "mask_strategy": "0"} a close up of a blue and orange liquid, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close up of a blue and orange liquid.jpg", "mask_strategy": "0"} a close up of a blue and orange liquid, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close up of a blue and orange liquid.jpg", "mask_strategy": "0"} a close up of a blue and orange liquid, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close up of a blue and orange liquid.jpg", "mask_strategy": "0"} a close up of a blue and orange liquid, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close up of a blue and orange liquid.jpg", "mask_strategy": "0"} a close up of a blue and orange liquid, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close up of a blue and orange liquid.jpg", "mask_strategy": "0"} a close up of a blue and orange liquid, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close up of a blue and orange liquid.jpg", "mask_strategy": "0"} a close up of a blue and orange liquid, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close up of a blue and orange liquid.jpg", "mask_strategy": "0"} A black and white abstract video featuring mesmerizing bubbles{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A black and white abstract video featuring mesmerizing bubbles.jpg", "mask_strategy": "0"} A black and white abstract video featuring mesmerizing bubbles, camera pans left{"reference_path": 
"/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A black and white abstract video featuring mesmerizing bubbles.jpg", "mask_strategy": "0"} A black and white abstract video featuring mesmerizing bubbles, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A black and white abstract video featuring mesmerizing bubbles.jpg", "mask_strategy": "0"} A black and white abstract video featuring mesmerizing bubbles, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A black and white abstract video featuring mesmerizing bubbles.jpg", "mask_strategy": "0"} A black and white abstract video featuring mesmerizing bubbles, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A black and white abstract video featuring mesmerizing bubbles.jpg", "mask_strategy": "0"} A black and white abstract video featuring mesmerizing bubbles, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A black and white abstract video featuring mesmerizing bubbles.jpg", "mask_strategy": "0"} A black and white abstract video featuring mesmerizing bubbles, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A black and white abstract video featuring mesmerizing bubbles.jpg", "mask_strategy": "0"} A black and white abstract video featuring mesmerizing bubbles, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A black and white abstract video featuring mesmerizing bubbles.jpg", "mask_strategy": "0"} a blue and white smoke is swirly in the dark{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a blue and white smoke is swirly in the dark.jpg", "mask_strategy": "0"} a blue and white smoke is swirly in the dark, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a blue and white smoke is swirly in the dark.jpg", "mask_strategy": "0"} a blue and white smoke is swirly in the dark, camera pans right{"reference_path": 
"/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a blue and white smoke is swirly in the dark.jpg", "mask_strategy": "0"} a blue and white smoke is swirly in the dark, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a blue and white smoke is swirly in the dark.jpg", "mask_strategy": "0"} a blue and white smoke is swirly in the dark, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a blue and white smoke is swirly in the dark.jpg", "mask_strategy": "0"} a blue and white smoke is swirly in the dark, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a blue and white smoke is swirly in the dark.jpg", "mask_strategy": "0"} a blue and white smoke is swirly in the dark, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a blue and white smoke is swirly in the dark.jpg", "mask_strategy": "0"} a blue and white smoke is swirly in the dark, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a blue and white smoke is swirly in the dark.jpg", "mask_strategy": "0"} a close-up view of a sea fan in the water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of a sea fan in the water.jpg", "mask_strategy": "0"} a close-up view of a sea fan in the water, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of a sea fan in the water.jpg", "mask_strategy": "0"} a close-up view of a sea fan in the water, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of a sea fan in the water.jpg", "mask_strategy": "0"} a close-up view of a sea fan in the water, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of a sea fan in the water.jpg", "mask_strategy": "0"} a close-up view of a sea fan in the water, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of a 
sea fan in the water.jpg", "mask_strategy": "0"} a close-up view of a sea fan in the water, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of a sea fan in the water.jpg", "mask_strategy": "0"} a close-up view of a sea fan in the water, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of a sea fan in the water.jpg", "mask_strategy": "0"} a close-up view of a sea fan in the water, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of a sea fan in the water.jpg", "mask_strategy": "0"} a visually captivating abstract video, rich in color, set against a dramatic black background{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a visually captivating abstract video, rich in color, set against a dramatic black background.jpg", "mask_strategy": "0"} a visually captivating abstract video, rich in color, set against a dramatic black background, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a visually captivating abstract video, rich in color, set against a dramatic black background.jpg", "mask_strategy": "0"} a visually captivating abstract video, rich in color, set against a dramatic black background, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a visually captivating abstract video, rich in color, set against a dramatic black background.jpg", "mask_strategy": "0"} a visually captivating abstract video, rich in color, set against a dramatic black background, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a visually captivating abstract video, rich in color, set against a dramatic black background.jpg", "mask_strategy": "0"} a visually captivating abstract video, rich in color, set against a dramatic black background, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a visually captivating abstract 
video, rich in color, set against a dramatic black background.jpg", "mask_strategy": "0"} a visually captivating abstract video, rich in color, set against a dramatic black background, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a visually captivating abstract video, rich in color, set against a dramatic black background.jpg", "mask_strategy": "0"} a visually captivating abstract video, rich in color, set against a dramatic black background, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a visually captivating abstract video, rich in color, set against a dramatic black background.jpg", "mask_strategy": "0"} a visually captivating abstract video, rich in color, set against a dramatic black background, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a visually captivating abstract video, rich in color, set against a dramatic black background.jpg", "mask_strategy": "0"} a purple and yellow abstract painting with a black background{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a purple and yellow abstract painting with a black background.jpg", "mask_strategy": "0"} a purple and yellow abstract painting with a black background, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a purple and yellow abstract painting with a black background.jpg", "mask_strategy": "0"} a purple and yellow abstract painting with a black background, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a purple and yellow abstract painting with a black background.jpg", "mask_strategy": "0"} a purple and yellow abstract painting with a black background, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a purple and yellow abstract painting with a black background.jpg", "mask_strategy": "0"} a purple and yellow abstract painting with a black background, camera tilts down{"reference_path": 
"/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a purple and yellow abstract painting with a black background.jpg", "mask_strategy": "0"} a purple and yellow abstract painting with a black background, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a purple and yellow abstract painting with a black background.jpg", "mask_strategy": "0"} a purple and yellow abstract painting with a black background, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a purple and yellow abstract painting with a black background.jpg", "mask_strategy": "0"} a purple and yellow abstract painting with a black background, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a purple and yellow abstract painting with a black background.jpg", "mask_strategy": "0"} a dynamic video of a blurry neon light in the dark, radiating captivating colors{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg", "mask_strategy": "0"} a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg", "mask_strategy": "0"} a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg", "mask_strategy": "0"} a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg", "mask_strategy": "0"} a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera tilts 
down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg", "mask_strategy": "0"} a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg", "mask_strategy": "0"} a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg", "mask_strategy": "0"} a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg", "mask_strategy": "0"} a view of a star trail in the night sky{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a star trail in the night sky.jpg", "mask_strategy": "0"} a view of a star trail in the night sky, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a star trail in the night sky.jpg", "mask_strategy": "0"} a view of a star trail in the night sky, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a star trail in the night sky.jpg", "mask_strategy": "0"} a view of a star trail in the night sky, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a star trail in the night sky.jpg", "mask_strategy": "0"} a view of a star trail in the night sky, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a star trail in the night sky.jpg", "mask_strategy": "0"} a view of a star trail in the night sky, camera zooms in{"reference_path": 
"/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a star trail in the night sky.jpg", "mask_strategy": "0"} a view of a star trail in the night sky, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a star trail in the night sky.jpg", "mask_strategy": "0"} a view of a star trail in the night sky, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a star trail in the night sky.jpg", "mask_strategy": "0"} an aerial view of a small town on the edge of the ocean{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a small town on the edge of the ocean.jpg", "mask_strategy": "0"} an aerial view of a small town on the edge of the ocean, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a small town on the edge of the ocean.jpg", "mask_strategy": "0"} an aerial view of a small town on the edge of the ocean, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a small town on the edge of the ocean.jpg", "mask_strategy": "0"} an aerial view of a small town on the edge of the ocean, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a small town on the edge of the ocean.jpg", "mask_strategy": "0"} an aerial view of a small town on the edge of the ocean, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a small town on the edge of the ocean.jpg", "mask_strategy": "0"} an aerial view of a small town on the edge of the ocean, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a small town on the edge of the ocean.jpg", "mask_strategy": "0"} an aerial view of a small town on the edge of the ocean, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a small town on the edge of the ocean.jpg", 
"mask_strategy": "0"} an aerial view of a small town on the edge of the ocean, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a small town on the edge of the ocean.jpg", "mask_strategy": "0"} Colorful buildings on the seaside cliffs{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/Colorful buildings on the seaside cliffs.jpg", "mask_strategy": "0"} Colorful buildings on the seaside cliffs, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/Colorful buildings on the seaside cliffs.jpg", "mask_strategy": "0"} Colorful buildings on the seaside cliffs, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/Colorful buildings on the seaside cliffs.jpg", "mask_strategy": "0"} Colorful buildings on the seaside cliffs, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/Colorful buildings on the seaside cliffs.jpg", "mask_strategy": "0"} Colorful buildings on the seaside cliffs, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/Colorful buildings on the seaside cliffs.jpg", "mask_strategy": "0"} Colorful buildings on the seaside cliffs, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/Colorful buildings on the seaside cliffs.jpg", "mask_strategy": "0"} Colorful buildings on the seaside cliffs, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/Colorful buildings on the seaside cliffs.jpg", "mask_strategy": "0"} Colorful buildings on the seaside cliffs, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/Colorful buildings on the seaside cliffs.jpg", "mask_strategy": "0"} a bunch of houses that are on a hillside{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bunch of houses that are on a hillside.jpg", "mask_strategy": "0"} a bunch of houses that are on a hillside, camera pans left{"reference_path": 
"/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bunch of houses that are on a hillside.jpg", "mask_strategy": "0"} a bunch of houses that are on a hillside, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bunch of houses that are on a hillside.jpg", "mask_strategy": "0"} a bunch of houses that are on a hillside, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bunch of houses that are on a hillside.jpg", "mask_strategy": "0"} a bunch of houses that are on a hillside, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bunch of houses that are on a hillside.jpg", "mask_strategy": "0"} a bunch of houses that are on a hillside, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bunch of houses that are on a hillside.jpg", "mask_strategy": "0"} a bunch of houses that are on a hillside, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bunch of houses that are on a hillside.jpg", "mask_strategy": "0"} a bunch of houses that are on a hillside, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bunch of houses that are on a hillside.jpg", "mask_strategy": "0"} a building that is sitting on the side of a pond{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a building that is sitting on the side of a pond.jpg", "mask_strategy": "0"} a building that is sitting on the side of a pond, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a building that is sitting on the side of a pond.jpg", "mask_strategy": "0"} a building that is sitting on the side of a pond, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a building that is sitting on the side of a pond.jpg", "mask_strategy": "0"} a building that is sitting on the side of a pond, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a building that is 
sitting on the side of a pond.jpg", "mask_strategy": "0"} a building that is sitting on the side of a pond, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a building that is sitting on the side of a pond.jpg", "mask_strategy": "0"} a building that is sitting on the side of a pond, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a building that is sitting on the side of a pond.jpg", "mask_strategy": "0"} a building that is sitting on the side of a pond, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a building that is sitting on the side of a pond.jpg", "mask_strategy": "0"} a building that is sitting on the side of a pond, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a building that is sitting on the side of a pond.jpg", "mask_strategy": "0"} an aerial view of a busy city with a bridge in the background{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a busy city with a bridge in the background.jpg", "mask_strategy": "0"} an aerial view of a busy city with a bridge in the background, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a busy city with a bridge in the background.jpg", "mask_strategy": "0"} an aerial view of a busy city with a bridge in the background, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a busy city with a bridge in the background.jpg", "mask_strategy": "0"} an aerial view of a busy city with a bridge in the background, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a busy city with a bridge in the background.jpg", "mask_strategy": "0"} an aerial view of a busy city with a bridge in the background, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a busy city with a bridge in the background.jpg", 
"mask_strategy": "0"} an aerial view of a busy city with a bridge in the background, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a busy city with a bridge in the background.jpg", "mask_strategy": "0"} an aerial view of a busy city with a bridge in the background, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a busy city with a bridge in the background.jpg", "mask_strategy": "0"} an aerial view of a busy city with a bridge in the background, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a busy city with a bridge in the background.jpg", "mask_strategy": "0"} a bridge that is over a body of water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is over a body of water.jpg", "mask_strategy": "0"} a bridge that is over a body of water, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is over a body of water.jpg", "mask_strategy": "0"} a bridge that is over a body of water, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is over a body of water.jpg", "mask_strategy": "0"} a bridge that is over a body of water, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is over a body of water.jpg", "mask_strategy": "0"} a bridge that is over a body of water, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is over a body of water.jpg", "mask_strategy": "0"} a bridge that is over a body of water, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is over a body of water.jpg", "mask_strategy": "0"} a bridge that is over a body of water, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is over a body of water.jpg", "mask_strategy": "0"} a bridge that 
is over a body of water, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is over a body of water.jpg", "mask_strategy": "0"} a pile of wood sitting next to a log house{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pile of wood sitting next to a log house.jpg", "mask_strategy": "0"} a pile of wood sitting next to a log house, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pile of wood sitting next to a log house.jpg", "mask_strategy": "0"} a pile of wood sitting next to a log house, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pile of wood sitting next to a log house.jpg", "mask_strategy": "0"} a pile of wood sitting next to a log house, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pile of wood sitting next to a log house.jpg", "mask_strategy": "0"} a pile of wood sitting next to a log house, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pile of wood sitting next to a log house.jpg", "mask_strategy": "0"} a pile of wood sitting next to a log house, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pile of wood sitting next to a log house.jpg", "mask_strategy": "0"} a pile of wood sitting next to a log house, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pile of wood sitting next to a log house.jpg", "mask_strategy": "0"} a pile of wood sitting next to a log house, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pile of wood sitting next to a log house.jpg", "mask_strategy": "0"} a view of a snowy mountain side with many buildings{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a snowy mountain side with many buildings.jpg", "mask_strategy": "0"} a view of a snowy mountain side with many buildings, camera pans left{"reference_path": 
"/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a snowy mountain side with many buildings.jpg", "mask_strategy": "0"} a view of a snowy mountain side with many buildings, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a snowy mountain side with many buildings.jpg", "mask_strategy": "0"} a view of a snowy mountain side with many buildings, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a snowy mountain side with many buildings.jpg", "mask_strategy": "0"} a view of a snowy mountain side with many buildings, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a snowy mountain side with many buildings.jpg", "mask_strategy": "0"} a view of a snowy mountain side with many buildings, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a snowy mountain side with many buildings.jpg", "mask_strategy": "0"} a view of a snowy mountain side with many buildings, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a snowy mountain side with many buildings.jpg", "mask_strategy": "0"} a view of a snowy mountain side with many buildings, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a snowy mountain side with many buildings.jpg", "mask_strategy": "0"} san francisco skyline at sunset{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/san francisco skyline at sunset.jpg", "mask_strategy": "0"} san francisco skyline at sunset, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/san francisco skyline at sunset.jpg", "mask_strategy": "0"} san francisco skyline at sunset, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/san francisco skyline at sunset.jpg", "mask_strategy": "0"} san francisco skyline at sunset, camera tilts up{"reference_path": 
"/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/san francisco skyline at sunset.jpg", "mask_strategy": "0"} san francisco skyline at sunset, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/san francisco skyline at sunset.jpg", "mask_strategy": "0"} san francisco skyline at sunset, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/san francisco skyline at sunset.jpg", "mask_strategy": "0"} san francisco skyline at sunset, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/san francisco skyline at sunset.jpg", "mask_strategy": "0"} san francisco skyline at sunset, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/san francisco skyline at sunset.jpg", "mask_strategy": "0"} a castle on top of a hill covered in snow{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a castle on top of a hill covered in snow.jpg", "mask_strategy": "0"} a castle on top of a hill covered in snow, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a castle on top of a hill covered in snow.jpg", "mask_strategy": "0"} a castle on top of a hill covered in snow, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a castle on top of a hill covered in snow.jpg", "mask_strategy": "0"} a castle on top of a hill covered in snow, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a castle on top of a hill covered in snow.jpg", "mask_strategy": "0"} a castle on top of a hill covered in snow, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a castle on top of a hill covered in snow.jpg", "mask_strategy": "0"} a castle on top of a hill covered in snow, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a castle on top of a hill covered in snow.jpg", "mask_strategy": "0"} a castle on top of a hill covered in snow, camera zooms 
out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a castle on top of a hill covered in snow.jpg", "mask_strategy": "0"} a castle on top of a hill covered in snow, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a castle on top of a hill covered in snow.jpg", "mask_strategy": "0"} an aerial view of big ben and the houses of parliament in london{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of big ben and the houses of parliament in london.jpg", "mask_strategy": "0"} an aerial view of big ben and the houses of parliament in london, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of big ben and the houses of parliament in london.jpg", "mask_strategy": "0"} an aerial view of big ben and the houses of parliament in london, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of big ben and the houses of parliament in london.jpg", "mask_strategy": "0"} an aerial view of big ben and the houses of parliament in london, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of big ben and the houses of parliament in london.jpg", "mask_strategy": "0"} an aerial view of big ben and the houses of parliament in london, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of big ben and the houses of parliament in london.jpg", "mask_strategy": "0"} an aerial view of big ben and the houses of parliament in london, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of big ben and the houses of parliament in london.jpg", "mask_strategy": "0"} an aerial view of big ben and the houses of parliament in london, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of big ben and the houses of parliament in london.jpg", "mask_strategy": "0"} an aerial view of big ben 
and the houses of parliament in london, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of big ben and the houses of parliament in london.jpg", "mask_strategy": "0"} a beach with a lot of buildings on the side of a cliff{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a beach with a lot of buildings on the side of a cliff.jpg", "mask_strategy": "0"} a beach with a lot of buildings on the side of a cliff, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a beach with a lot of buildings on the side of a cliff.jpg", "mask_strategy": "0"} a beach with a lot of buildings on the side of a cliff, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a beach with a lot of buildings on the side of a cliff.jpg", "mask_strategy": "0"} a beach with a lot of buildings on the side of a cliff, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a beach with a lot of buildings on the side of a cliff.jpg", "mask_strategy": "0"} a beach with a lot of buildings on the side of a cliff, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a beach with a lot of buildings on the side of a cliff.jpg", "mask_strategy": "0"} a beach with a lot of buildings on the side of a cliff, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a beach with a lot of buildings on the side of a cliff.jpg", "mask_strategy": "0"} a beach with a lot of buildings on the side of a cliff, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a beach with a lot of buildings on the side of a cliff.jpg", "mask_strategy": "0"} a beach with a lot of buildings on the side of a cliff, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a beach with a lot of buildings on the side of a cliff.jpg", "mask_strategy": "0"} an alley way in an old european city{"reference_path": 
"/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an alley way in an old european city.jpg", "mask_strategy": "0"} an alley way in an old european city, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an alley way in an old european city.jpg", "mask_strategy": "0"} an alley way in an old european city, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an alley way in an old european city.jpg", "mask_strategy": "0"} an alley way in an old european city, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an alley way in an old european city.jpg", "mask_strategy": "0"} an alley way in an old european city, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an alley way in an old european city.jpg", "mask_strategy": "0"} an alley way in an old european city, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an alley way in an old european city.jpg", "mask_strategy": "0"} an alley way in an old european city, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an alley way in an old european city.jpg", "mask_strategy": "0"} an alley way in an old european city, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an alley way in an old european city.jpg", "mask_strategy": "0"} the golden gate bridge in san franscisco is lit up by the setting sun{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the golden gate bridge in san franscisco is lit up by the setting sun.jpg", "mask_strategy": "0"} the golden gate bridge in san franscisco is lit up by the setting sun, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the golden gate bridge in san franscisco is lit up by the setting sun.jpg", "mask_strategy": "0"} the golden gate bridge in san franscisco is lit up by the setting sun, camera pans right{"reference_path": 
"/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the golden gate bridge in san franscisco is lit up by the setting sun.jpg", "mask_strategy": "0"} the golden gate bridge in san franscisco is lit up by the setting sun, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the golden gate bridge in san franscisco is lit up by the setting sun.jpg", "mask_strategy": "0"} the golden gate bridge in san franscisco is lit up by the setting sun, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the golden gate bridge in san franscisco is lit up by the setting sun.jpg", "mask_strategy": "0"} the golden gate bridge in san franscisco is lit up by the setting sun, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the golden gate bridge in san franscisco is lit up by the setting sun.jpg", "mask_strategy": "0"} the golden gate bridge in san franscisco is lit up by the setting sun, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the golden gate bridge in san franscisco is lit up by the setting sun.jpg", "mask_strategy": "0"} the golden gate bridge in san franscisco is lit up by the setting sun, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the golden gate bridge in san franscisco is lit up by the setting sun.jpg", "mask_strategy": "0"} the great wall of china in autumn{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the great wall of china in autumn.jpg", "mask_strategy": "0"} the great wall of china in autumn, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the great wall of china in autumn.jpg", "mask_strategy": "0"} the great wall of china in autumn, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the great wall of china in autumn.jpg", "mask_strategy": "0"} the great wall of china in autumn, camera tilts up{"reference_path": 
"/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the great wall of china in autumn.jpg", "mask_strategy": "0"} the great wall of china in autumn, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the great wall of china in autumn.jpg", "mask_strategy": "0"} the great wall of china in autumn, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the great wall of china in autumn.jpg", "mask_strategy": "0"} the great wall of china in autumn, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the great wall of china in autumn.jpg", "mask_strategy": "0"} the great wall of china in autumn, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the great wall of china in autumn.jpg", "mask_strategy": "0"} the town of hallstatt is surrounded by mountains and water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the town of hallstatt is surrounded by mountains and water.jpg", "mask_strategy": "0"} the town of hallstatt is surrounded by mountains and water, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the town of hallstatt is surrounded by mountains and water.jpg", "mask_strategy": "0"} the town of hallstatt is surrounded by mountains and water, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the town of hallstatt is surrounded by mountains and water.jpg", "mask_strategy": "0"} the town of hallstatt is surrounded by mountains and water, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the town of hallstatt is surrounded by mountains and water.jpg", "mask_strategy": "0"} the town of hallstatt is surrounded by mountains and water, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the town of hallstatt is surrounded by mountains and water.jpg", "mask_strategy": "0"} the town of hallstatt is surrounded by mountains and water, camera zooms 
in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the town of hallstatt is surrounded by mountains and water.jpg", "mask_strategy": "0"} the town of hallstatt is surrounded by mountains and water, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the town of hallstatt is surrounded by mountains and water.jpg", "mask_strategy": "0"} the town of hallstatt is surrounded by mountains and water, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the town of hallstatt is surrounded by mountains and water.jpg", "mask_strategy": "0"} tokyo skyline at night{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tokyo skyline at night.jpg", "mask_strategy": "0"} tokyo skyline at night, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tokyo skyline at night.jpg", "mask_strategy": "0"} tokyo skyline at night, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tokyo skyline at night.jpg", "mask_strategy": "0"} tokyo skyline at night, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tokyo skyline at night.jpg", "mask_strategy": "0"} tokyo skyline at night, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tokyo skyline at night.jpg", "mask_strategy": "0"} tokyo skyline at night, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tokyo skyline at night.jpg", "mask_strategy": "0"} tokyo skyline at night, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tokyo skyline at night.jpg", "mask_strategy": "0"} tokyo skyline at night, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tokyo skyline at night.jpg", "mask_strategy": "0"} a large wave crashes into a lighthouse{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse.jpg", "mask_strategy": "0"} a large wave 
crashes into a lighthouse, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse.jpg", "mask_strategy": "0"} a large wave crashes into a lighthouse, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse.jpg", "mask_strategy": "0"} a large wave crashes into a lighthouse, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse.jpg", "mask_strategy": "0"} a large wave crashes into a lighthouse, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse.jpg", "mask_strategy": "0"} a large wave crashes into a lighthouse, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse.jpg", "mask_strategy": "0"} a large wave crashes into a lighthouse, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse.jpg", "mask_strategy": "0"} a large wave crashes into a lighthouse, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse.jpg", "mask_strategy": "0"} a church sits on top of a hill under a cloudy sky{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a church sits on top of a hill under a cloudy sky.jpg", "mask_strategy": "0"} a church sits on top of a hill under a cloudy sky, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a church sits on top of a hill under a cloudy sky.jpg", "mask_strategy": "0"} a church sits on top of a hill under a cloudy sky, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a church sits on top of a hill under a cloudy sky.jpg", "mask_strategy": "0"} a church sits on top of a hill under a cloudy sky, camera tilts up{"reference_path": 
"/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a church sits on top of a hill under a cloudy sky.jpg", "mask_strategy": "0"} a church sits on top of a hill under a cloudy sky, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a church sits on top of a hill under a cloudy sky.jpg", "mask_strategy": "0"} a church sits on top of a hill under a cloudy sky, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a church sits on top of a hill under a cloudy sky.jpg", "mask_strategy": "0"} a church sits on top of a hill under a cloudy sky, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a church sits on top of a hill under a cloudy sky.jpg", "mask_strategy": "0"} a church sits on top of a hill under a cloudy sky, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a church sits on top of a hill under a cloudy sky.jpg", "mask_strategy": "0"} the parthenon in acropolis, greece{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the parthenon in acropolis, greece.jpg", "mask_strategy": "0"} the parthenon in acropolis, greece, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the parthenon in acropolis, greece.jpg", "mask_strategy": "0"} the parthenon in acropolis, greece, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the parthenon in acropolis, greece.jpg", "mask_strategy": "0"} the parthenon in acropolis, greece, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the parthenon in acropolis, greece.jpg", "mask_strategy": "0"} the parthenon in acropolis, greece, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the parthenon in acropolis, greece.jpg", "mask_strategy": "0"} the parthenon in acropolis, greece, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the parthenon in acropolis, greece.jpg", "mask_strategy": 
"0"} the parthenon in acropolis, greece, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the parthenon in acropolis, greece.jpg", "mask_strategy": "0"} the parthenon in acropolis, greece, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the parthenon in acropolis, greece.jpg", "mask_strategy": "0"} a large crowd of people walking in a shopping mall{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large crowd of people walking in a shopping mall.jpg", "mask_strategy": "0"} a large crowd of people walking in a shopping mall, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large crowd of people walking in a shopping mall.jpg", "mask_strategy": "0"} a large crowd of people walking in a shopping mall, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large crowd of people walking in a shopping mall.jpg", "mask_strategy": "0"} a large crowd of people walking in a shopping mall, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large crowd of people walking in a shopping mall.jpg", "mask_strategy": "0"} a large crowd of people walking in a shopping mall, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large crowd of people walking in a shopping mall.jpg", "mask_strategy": "0"} a large crowd of people walking in a shopping mall, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large crowd of people walking in a shopping mall.jpg", "mask_strategy": "0"} a large crowd of people walking in a shopping mall, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large crowd of people walking in a shopping mall.jpg", "mask_strategy": "0"} a large crowd of people walking in a shopping mall, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large crowd of people walking in a shopping mall.jpg", 
"mask_strategy": "0"} the pyramids of giza, egypt{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the pyramids of giza, egypt.jpg", "mask_strategy": "0"} the pyramids of giza, egypt, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the pyramids of giza, egypt.jpg", "mask_strategy": "0"} the pyramids of giza, egypt, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the pyramids of giza, egypt.jpg", "mask_strategy": "0"} the pyramids of giza, egypt, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the pyramids of giza, egypt.jpg", "mask_strategy": "0"} the pyramids of giza, egypt, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the pyramids of giza, egypt.jpg", "mask_strategy": "0"} the pyramids of giza, egypt, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the pyramids of giza, egypt.jpg", "mask_strategy": "0"} the pyramids of giza, egypt, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the pyramids of giza, egypt.jpg", "mask_strategy": "0"} the pyramids of giza, egypt, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the pyramids of giza, egypt.jpg", "mask_strategy": "0"} a stage door painted with a star on the side of a brick wall{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a stage door painted with a star on the side of a brick wall.jpg", "mask_strategy": "0"} a stage door painted with a star on the side of a brick wall, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a stage door painted with a star on the side of a brick wall.jpg", "mask_strategy": "0"} a stage door painted with a star on the side of a brick wall, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a stage door painted with a star on the side of a brick wall.jpg", "mask_strategy": "0"} a stage door 
painted with a star on the side of a brick wall, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a stage door painted with a star on the side of a brick wall.jpg", "mask_strategy": "0"} a stage door painted with a star on the side of a brick wall, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a stage door painted with a star on the side of a brick wall.jpg", "mask_strategy": "0"} a stage door painted with a star on the side of a brick wall, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a stage door painted with a star on the side of a brick wall.jpg", "mask_strategy": "0"} a stage door painted with a star on the side of a brick wall, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a stage door painted with a star on the side of a brick wall.jpg", "mask_strategy": "0"} a stage door painted with a star on the side of a brick wall, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a stage door painted with a star on the side of a brick wall.jpg", "mask_strategy": "0"} a light house on the edge of the water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a light house on the edge of the water.jpg", "mask_strategy": "0"} a light house on the edge of the water, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a light house on the edge of the water.jpg", "mask_strategy": "0"} a light house on the edge of the water, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a light house on the edge of the water.jpg", "mask_strategy": "0"} a light house on the edge of the water, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a light house on the edge of the water.jpg", "mask_strategy": "0"} a light house on the edge of the water, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a light house on the 
edge of the water.jpg", "mask_strategy": "0"} a light house on the edge of the water, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a light house on the edge of the water.jpg", "mask_strategy": "0"} a light house on the edge of the water, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a light house on the edge of the water.jpg", "mask_strategy": "0"} a light house on the edge of the water, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a light house on the edge of the water.jpg", "mask_strategy": "0"} an asian city street at night with people and bicycles{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an asian city street at night with people and bicycles.jpg", "mask_strategy": "0"} an asian city street at night with people and bicycles, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an asian city street at night with people and bicycles.jpg", "mask_strategy": "0"} an asian city street at night with people and bicycles, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an asian city street at night with people and bicycles.jpg", "mask_strategy": "0"} an asian city street at night with people and bicycles, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an asian city street at night with people and bicycles.jpg", "mask_strategy": "0"} an asian city street at night with people and bicycles, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an asian city street at night with people and bicycles.jpg", "mask_strategy": "0"} an asian city street at night with people and bicycles, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an asian city street at night with people and bicycles.jpg", "mask_strategy": "0"} an asian city street at night with people and bicycles, camera zooms out{"reference_path": 
"/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an asian city street at night with people and bicycles.jpg", "mask_strategy": "0"} an asian city street at night with people and bicycles, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an asian city street at night with people and bicycles.jpg", "mask_strategy": "0"} a couple of wooden benches in the middle of a street{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a couple of wooden benches in the middle of a street.jpg", "mask_strategy": "0"} a couple of wooden benches in the middle of a street, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a couple of wooden benches in the middle of a street.jpg", "mask_strategy": "0"} a couple of wooden benches in the middle of a street, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a couple of wooden benches in the middle of a street.jpg", "mask_strategy": "0"} a couple of wooden benches in the middle of a street, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a couple of wooden benches in the middle of a street.jpg", "mask_strategy": "0"} a couple of wooden benches in the middle of a street, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a couple of wooden benches in the middle of a street.jpg", "mask_strategy": "0"} a couple of wooden benches in the middle of a street, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a couple of wooden benches in the middle of a street.jpg", "mask_strategy": "0"} a couple of wooden benches in the middle of a street, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a couple of wooden benches in the middle of a street.jpg", "mask_strategy": "0"} a couple of wooden benches in the middle of a street, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a couple of wooden benches in the middle of a 
street.jpg", "mask_strategy": "0"} a pagoda sits on top of a mountain in japan{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pagoda sits on top of a mountain in japan.jpg", "mask_strategy": "0"} a pagoda sits on top of a mountain in japan, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pagoda sits on top of a mountain in japan.jpg", "mask_strategy": "0"} a pagoda sits on top of a mountain in japan, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pagoda sits on top of a mountain in japan.jpg", "mask_strategy": "0"} a pagoda sits on top of a mountain in japan, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pagoda sits on top of a mountain in japan.jpg", "mask_strategy": "0"} a pagoda sits on top of a mountain in japan, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pagoda sits on top of a mountain in japan.jpg", "mask_strategy": "0"} a pagoda sits on top of a mountain in japan, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pagoda sits on top of a mountain in japan.jpg", "mask_strategy": "0"} a pagoda sits on top of a mountain in japan, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pagoda sits on top of a mountain in japan.jpg", "mask_strategy": "0"} a pagoda sits on top of a mountain in japan, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pagoda sits on top of a mountain in japan.jpg", "mask_strategy": "0"} a red bus driving down a snowy street at night{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a red bus driving down a snowy street at night.jpg", "mask_strategy": "0"} a red bus driving down a snowy street at night, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a red bus driving down a snowy street at night.jpg", "mask_strategy": "0"} a red bus driving down a snowy 
street at night, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a red bus driving down a snowy street at night.jpg", "mask_strategy": "0"} a red bus driving down a snowy street at night, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a red bus driving down a snowy street at night.jpg", "mask_strategy": "0"} a red bus driving down a snowy street at night, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a red bus driving down a snowy street at night.jpg", "mask_strategy": "0"} a red bus driving down a snowy street at night, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a red bus driving down a snowy street at night.jpg", "mask_strategy": "0"} a red bus driving down a snowy street at night, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a red bus driving down a snowy street at night.jpg", "mask_strategy": "0"} a red bus driving down a snowy street at night, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a red bus driving down a snowy street at night.jpg", "mask_strategy": "0"} a snow covered street{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a snow covered street.jpg", "mask_strategy": "0"} a snow covered street, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a snow covered street.jpg", "mask_strategy": "0"} a snow covered street, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a snow covered street.jpg", "mask_strategy": "0"} a snow covered street, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a snow covered street.jpg", "mask_strategy": "0"} a snow covered street, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a snow covered street.jpg", "mask_strategy": "0"} a snow covered street, camera zooms in{"reference_path": 
"/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a snow covered street.jpg", "mask_strategy": "0"} a snow covered street, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a snow covered street.jpg", "mask_strategy": "0"} a snow covered street, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a snow covered street.jpg", "mask_strategy": "0"} a house with snow on the ground{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a house with snow on the ground.jpg", "mask_strategy": "0"} a house with snow on the ground, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a house with snow on the ground.jpg", "mask_strategy": "0"} a house with snow on the ground, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a house with snow on the ground.jpg", "mask_strategy": "0"} a house with snow on the ground, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a house with snow on the ground.jpg", "mask_strategy": "0"} a house with snow on the ground, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a house with snow on the ground.jpg", "mask_strategy": "0"} a house with snow on the ground, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a house with snow on the ground.jpg", "mask_strategy": "0"} a house with snow on the ground, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a house with snow on the ground.jpg", "mask_strategy": "0"} a house with snow on the ground, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a house with snow on the ground.jpg", "mask_strategy": "0"} cars parked on the side of the road during a snowstorm{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/cars parked on the side of the road during a snowstorm.jpg", "mask_strategy": "0"} cars parked on the side of the road during a 
snowstorm, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/cars parked on the side of the road during a snowstorm.jpg", "mask_strategy": "0"} cars parked on the side of the road during a snowstorm, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/cars parked on the side of the road during a snowstorm.jpg", "mask_strategy": "0"} cars parked on the side of the road during a snowstorm, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/cars parked on the side of the road during a snowstorm.jpg", "mask_strategy": "0"} cars parked on the side of the road during a snowstorm, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/cars parked on the side of the road during a snowstorm.jpg", "mask_strategy": "0"} cars parked on the side of the road during a snowstorm, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/cars parked on the side of the road during a snowstorm.jpg", "mask_strategy": "0"} cars parked on the side of the road during a snowstorm, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/cars parked on the side of the road during a snowstorm.jpg", "mask_strategy": "0"} cars parked on the side of the road during a snowstorm, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/cars parked on the side of the road during a snowstorm.jpg", "mask_strategy": "0"} a group of statues on the side of a building{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of statues on the side of a building.jpg", "mask_strategy": "0"} a group of statues on the side of a building, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of statues on the side of a building.jpg", "mask_strategy": "0"} a group of statues on the side of a building, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of 
statues on the side of a building.jpg", "mask_strategy": "0"} a group of statues on the side of a building, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of statues on the side of a building.jpg", "mask_strategy": "0"} a group of statues on the side of a building, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of statues on the side of a building.jpg", "mask_strategy": "0"} a group of statues on the side of a building, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of statues on the side of a building.jpg", "mask_strategy": "0"} a group of statues on the side of a building, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of statues on the side of a building.jpg", "mask_strategy": "0"} a group of statues on the side of a building, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of statues on the side of a building.jpg", "mask_strategy": "0"} a city street at night during a snow storm{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a city street at night during a snow storm.jpg", "mask_strategy": "0"} a city street at night during a snow storm, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a city street at night during a snow storm.jpg", "mask_strategy": "0"} a city street at night during a snow storm, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a city street at night during a snow storm.jpg", "mask_strategy": "0"} a city street at night during a snow storm, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a city street at night during a snow storm.jpg", "mask_strategy": "0"} a city street at night during a snow storm, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a city street at night during a snow storm.jpg", "mask_strategy": 
"0"} a city street at night during a snow storm, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a city street at night during a snow storm.jpg", "mask_strategy": "0"} a city street at night during a snow storm, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a city street at night during a snow storm.jpg", "mask_strategy": "0"} a city street at night during a snow storm, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a city street at night during a snow storm.jpg", "mask_strategy": "0"} tower bridge in london{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tower bridge in london.jpg", "mask_strategy": "0"} tower bridge in london, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tower bridge in london.jpg", "mask_strategy": "0"} tower bridge in london, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tower bridge in london.jpg", "mask_strategy": "0"} tower bridge in london, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tower bridge in london.jpg", "mask_strategy": "0"} tower bridge in london, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tower bridge in london.jpg", "mask_strategy": "0"} tower bridge in london, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tower bridge in london.jpg", "mask_strategy": "0"} tower bridge in london, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tower bridge in london.jpg", "mask_strategy": "0"} tower bridge in london, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tower bridge in london.jpg", "mask_strategy": "0"} chinese pagoda in the middle of a snowy day{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/chinese pagoda in the middle of a snowy day.jpg", "mask_strategy": "0"} chinese pagoda in the 
middle of a snowy day, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/chinese pagoda in the middle of a snowy day.jpg", "mask_strategy": "0"} chinese pagoda in the middle of a snowy day, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/chinese pagoda in the middle of a snowy day.jpg", "mask_strategy": "0"} chinese pagoda in the middle of a snowy day, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/chinese pagoda in the middle of a snowy day.jpg", "mask_strategy": "0"} chinese pagoda in the middle of a snowy day, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/chinese pagoda in the middle of a snowy day.jpg", "mask_strategy": "0"} chinese pagoda in the middle of a snowy day, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/chinese pagoda in the middle of a snowy day.jpg", "mask_strategy": "0"} chinese pagoda in the middle of a snowy day, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/chinese pagoda in the middle of a snowy day.jpg", "mask_strategy": "0"} chinese pagoda in the middle of a snowy day, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/chinese pagoda in the middle of a snowy day.jpg", "mask_strategy": "0"} a dark alleyway with a bus driving down it{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dark alleyway with a bus driving down it.jpg", "mask_strategy": "0"} a dark alleyway with a bus driving down it, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dark alleyway with a bus driving down it.jpg", "mask_strategy": "0"} a dark alleyway with a bus driving down it, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dark alleyway with a bus driving down it.jpg", "mask_strategy": "0"} a dark alleyway with a bus driving down it, camera tilts up{"reference_path": 
"/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dark alleyway with a bus driving down it.jpg", "mask_strategy": "0"} a dark alleyway with a bus driving down it, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dark alleyway with a bus driving down it.jpg", "mask_strategy": "0"} a dark alleyway with a bus driving down it, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dark alleyway with a bus driving down it.jpg", "mask_strategy": "0"} a dark alleyway with a bus driving down it, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dark alleyway with a bus driving down it.jpg", "mask_strategy": "0"} a dark alleyway with a bus driving down it, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dark alleyway with a bus driving down it.jpg", "mask_strategy": "0"} a monastery sits on top of a cliff in bhutan{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a monastery sits on top of a cliff in bhutan.jpg", "mask_strategy": "0"} a monastery sits on top of a cliff in bhutan, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a monastery sits on top of a cliff in bhutan.jpg", "mask_strategy": "0"} a monastery sits on top of a cliff in bhutan, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a monastery sits on top of a cliff in bhutan.jpg", "mask_strategy": "0"} a monastery sits on top of a cliff in bhutan, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a monastery sits on top of a cliff in bhutan.jpg", "mask_strategy": "0"} a monastery sits on top of a cliff in bhutan, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a monastery sits on top of a cliff in bhutan.jpg", "mask_strategy": "0"} a monastery sits on top of a cliff in bhutan, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a monastery 
sits on top of a cliff in bhutan.jpg", "mask_strategy": "0"} a monastery sits on top of a cliff in bhutan, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a monastery sits on top of a cliff in bhutan.jpg", "mask_strategy": "0"} a monastery sits on top of a cliff in bhutan, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a monastery sits on top of a cliff in bhutan.jpg", "mask_strategy": "0"} the dome of the rock in jerusalem{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the dome of the rock in jerusalem.jpg", "mask_strategy": "0"} the dome of the rock in jerusalem, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the dome of the rock in jerusalem.jpg", "mask_strategy": "0"} the dome of the rock in jerusalem, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the dome of the rock in jerusalem.jpg", "mask_strategy": "0"} the dome of the rock in jerusalem, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the dome of the rock in jerusalem.jpg", "mask_strategy": "0"} the dome of the rock in jerusalem, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the dome of the rock in jerusalem.jpg", "mask_strategy": "0"} the dome of the rock in jerusalem, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the dome of the rock in jerusalem.jpg", "mask_strategy": "0"} the dome of the rock in jerusalem, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the dome of the rock in jerusalem.jpg", "mask_strategy": "0"} the dome of the rock in jerusalem, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the dome of the rock in jerusalem.jpg", "mask_strategy": "0"} an aerial view of a futuristic building on a cliff overlooking a body of water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of 
a futuristic building on a cliff overlooking a body of water.jpg", "mask_strategy": "0"} an aerial view of a futuristic building on a cliff overlooking a body of water, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a futuristic building on a cliff overlooking a body of water.jpg", "mask_strategy": "0"} an aerial view of a futuristic building on a cliff overlooking a body of water, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a futuristic building on a cliff overlooking a body of water.jpg", "mask_strategy": "0"} an aerial view of a futuristic building on a cliff overlooking a body of water, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a futuristic building on a cliff overlooking a body of water.jpg", "mask_strategy": "0"} an aerial view of a futuristic building on a cliff overlooking a body of water, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a futuristic building on a cliff overlooking a body of water.jpg", "mask_strategy": "0"} an aerial view of a futuristic building on a cliff overlooking a body of water, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a futuristic building on a cliff overlooking a body of water.jpg", "mask_strategy": "0"} an aerial view of a futuristic building on a cliff overlooking a body of water, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a futuristic building on a cliff overlooking a body of water.jpg", "mask_strategy": "0"} an aerial view of a futuristic building on a cliff overlooking a body of water, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a futuristic building on a cliff overlooking a body of water.jpg", "mask_strategy": "0"} a reflection of a city with buildings in the 
water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a reflection of a city with buildings in the water.jpg", "mask_strategy": "0"} a reflection of a city with buildings in the water, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a reflection of a city with buildings in the water.jpg", "mask_strategy": "0"} a reflection of a city with buildings in the water, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a reflection of a city with buildings in the water.jpg", "mask_strategy": "0"} a reflection of a city with buildings in the water, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a reflection of a city with buildings in the water.jpg", "mask_strategy": "0"} a reflection of a city with buildings in the water, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a reflection of a city with buildings in the water.jpg", "mask_strategy": "0"} a reflection of a city with buildings in the water, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a reflection of a city with buildings in the water.jpg", "mask_strategy": "0"} a reflection of a city with buildings in the water, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a reflection of a city with buildings in the water.jpg", "mask_strategy": "0"} a reflection of a city with buildings in the water, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a reflection of a city with buildings in the water.jpg", "mask_strategy": "0"} a bar with chairs and a television on the wall{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bar with chairs and a television on the wall.jpg", "mask_strategy": "0"} a bar with chairs and a television on the wall, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bar with chairs and a television on the wall.jpg", "mask_strategy": "0"} 
a bar with chairs and a television on the wall, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bar with chairs and a television on the wall.jpg", "mask_strategy": "0"} a bar with chairs and a television on the wall, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bar with chairs and a television on the wall.jpg", "mask_strategy": "0"} a bar with chairs and a television on the wall, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bar with chairs and a television on the wall.jpg", "mask_strategy": "0"} a bar with chairs and a television on the wall, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bar with chairs and a television on the wall.jpg", "mask_strategy": "0"} a bar with chairs and a television on the wall, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bar with chairs and a television on the wall.jpg", "mask_strategy": "0"} a bar with chairs and a television on the wall, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bar with chairs and a television on the wall.jpg", "mask_strategy": "0"} a living room filled with lots of books on a wall{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with lots of books on a wall.jpg", "mask_strategy": "0"} a living room filled with lots of books on a wall, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with lots of books on a wall.jpg", "mask_strategy": "0"} a living room filled with lots of books on a wall, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with lots of books on a wall.jpg", "mask_strategy": "0"} a living room filled with lots of books on a wall, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with lots of books on a 
wall.jpg", "mask_strategy": "0"} a living room filled with lots of books on a wall, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with lots of books on a wall.jpg", "mask_strategy": "0"} a living room filled with lots of books on a wall, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with lots of books on a wall.jpg", "mask_strategy": "0"} a living room filled with lots of books on a wall, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with lots of books on a wall.jpg", "mask_strategy": "0"} a living room filled with lots of books on a wall, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with lots of books on a wall.jpg", "mask_strategy": "0"} a living room filled with furniture next to a stone wall{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with furniture next to a stone wall.jpg", "mask_strategy": "0"} a living room filled with furniture next to a stone wall, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with furniture next to a stone wall.jpg", "mask_strategy": "0"} a living room filled with furniture next to a stone wall, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with furniture next to a stone wall.jpg", "mask_strategy": "0"} a living room filled with furniture next to a stone wall, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with furniture next to a stone wall.jpg", "mask_strategy": "0"} a living room filled with furniture next to a stone wall, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with furniture next to a stone wall.jpg", "mask_strategy": "0"} a living room filled with furniture next to 
a stone wall, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with furniture next to a stone wall.jpg", "mask_strategy": "0"} a living room filled with furniture next to a stone wall, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with furniture next to a stone wall.jpg", "mask_strategy": "0"} a living room filled with furniture next to a stone wall, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with furniture next to a stone wall.jpg", "mask_strategy": "0"} a table and chairs in a room with sunlight coming through the window{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with sunlight coming through the window.jpg", "mask_strategy": "0"} a table and chairs in a room with sunlight coming through the window, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with sunlight coming through the window.jpg", "mask_strategy": "0"} a table and chairs in a room with sunlight coming through the window, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with sunlight coming through the window.jpg", "mask_strategy": "0"} a table and chairs in a room with sunlight coming through the window, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with sunlight coming through the window.jpg", "mask_strategy": "0"} a table and chairs in a room with sunlight coming through the window, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with sunlight coming through the window.jpg", "mask_strategy": "0"} a table and chairs in a room with sunlight coming through the window, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs 
in a room with sunlight coming through the window.jpg", "mask_strategy": "0"} a table and chairs in a room with sunlight coming through the window, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with sunlight coming through the window.jpg", "mask_strategy": "0"} a table and chairs in a room with sunlight coming through the window, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with sunlight coming through the window.jpg", "mask_strategy": "0"} a room filled with lots of shelves filled with books{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with lots of shelves filled with books.jpg", "mask_strategy": "0"} a room filled with lots of shelves filled with books, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with lots of shelves filled with books.jpg", "mask_strategy": "0"} a room filled with lots of shelves filled with books, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with lots of shelves filled with books.jpg", "mask_strategy": "0"} a room filled with lots of shelves filled with books, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with lots of shelves filled with books.jpg", "mask_strategy": "0"} a room filled with lots of shelves filled with books, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with lots of shelves filled with books.jpg", "mask_strategy": "0"} a room filled with lots of shelves filled with books, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with lots of shelves filled with books.jpg", "mask_strategy": "0"} a room filled with lots of shelves filled with books, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with lots of 
shelves filled with books.jpg", "mask_strategy": "0"} a room filled with lots of shelves filled with books, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with lots of shelves filled with books.jpg", "mask_strategy": "0"} an art gallery with paintings on the walls{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an art gallery with paintings on the walls.jpg", "mask_strategy": "0"} an art gallery with paintings on the walls, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an art gallery with paintings on the walls.jpg", "mask_strategy": "0"} an art gallery with paintings on the walls, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an art gallery with paintings on the walls.jpg", "mask_strategy": "0"} an art gallery with paintings on the walls, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an art gallery with paintings on the walls.jpg", "mask_strategy": "0"} an art gallery with paintings on the walls, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an art gallery with paintings on the walls.jpg", "mask_strategy": "0"} an art gallery with paintings on the walls, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an art gallery with paintings on the walls.jpg", "mask_strategy": "0"} an art gallery with paintings on the walls, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an art gallery with paintings on the walls.jpg", "mask_strategy": "0"} an art gallery with paintings on the walls, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an art gallery with paintings on the walls.jpg", "mask_strategy": "0"} a room with a lot of pictures on the walls{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a lot of pictures on the walls.jpg", "mask_strategy": "0"} a room with a lot of 
pictures on the walls, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a lot of pictures on the walls.jpg", "mask_strategy": "0"} a room with a lot of pictures on the walls, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a lot of pictures on the walls.jpg", "mask_strategy": "0"} a room with a lot of pictures on the walls, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a lot of pictures on the walls.jpg", "mask_strategy": "0"} a room with a lot of pictures on the walls, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a lot of pictures on the walls.jpg", "mask_strategy": "0"} a room with a lot of pictures on the walls, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a lot of pictures on the walls.jpg", "mask_strategy": "0"} a room with a lot of pictures on the walls, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a lot of pictures on the walls.jpg", "mask_strategy": "0"} a room with a lot of pictures on the walls, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a lot of pictures on the walls.jpg", "mask_strategy": "0"} a painting of a cloudy sky next to an easel{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a painting of a cloudy sky next to an easel.jpg", "mask_strategy": "0"} a painting of a cloudy sky next to an easel, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a painting of a cloudy sky next to an easel.jpg", "mask_strategy": "0"} a painting of a cloudy sky next to an easel, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a painting of a cloudy sky next to an easel.jpg", "mask_strategy": "0"} a painting of a cloudy sky next to an easel, camera tilts up{"reference_path": 
"/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a painting of a cloudy sky next to an easel.jpg", "mask_strategy": "0"} a painting of a cloudy sky next to an easel, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a painting of a cloudy sky next to an easel.jpg", "mask_strategy": "0"} a painting of a cloudy sky next to an easel, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a painting of a cloudy sky next to an easel.jpg", "mask_strategy": "0"} a painting of a cloudy sky next to an easel, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a painting of a cloudy sky next to an easel.jpg", "mask_strategy": "0"} a painting of a cloudy sky next to an easel, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a painting of a cloudy sky next to an easel.jpg", "mask_strategy": "0"} a living room with a christmas tree and a rocking chair{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a christmas tree and a rocking chair.jpg", "mask_strategy": "0"} a living room with a christmas tree and a rocking chair, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a christmas tree and a rocking chair.jpg", "mask_strategy": "0"} a living room with a christmas tree and a rocking chair, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a christmas tree and a rocking chair.jpg", "mask_strategy": "0"} a living room with a christmas tree and a rocking chair, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a christmas tree and a rocking chair.jpg", "mask_strategy": "0"} a living room with a christmas tree and a rocking chair, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a christmas tree and a rocking chair.jpg", "mask_strategy": "0"} a living room with 
a christmas tree and a rocking chair, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a christmas tree and a rocking chair.jpg", "mask_strategy": "0"} a living room with a christmas tree and a rocking chair, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a christmas tree and a rocking chair.jpg", "mask_strategy": "0"} a living room with a christmas tree and a rocking chair, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a christmas tree and a rocking chair.jpg", "mask_strategy": "0"} a kitchen with a sink and a lot of glasses on the counter{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a kitchen with a sink and a lot of glasses on the counter.jpg", "mask_strategy": "0"} a kitchen with a sink and a lot of glasses on the counter, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a kitchen with a sink and a lot of glasses on the counter.jpg", "mask_strategy": "0"} a kitchen with a sink and a lot of glasses on the counter, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a kitchen with a sink and a lot of glasses on the counter.jpg", "mask_strategy": "0"} a kitchen with a sink and a lot of glasses on the counter, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a kitchen with a sink and a lot of glasses on the counter.jpg", "mask_strategy": "0"} a kitchen with a sink and a lot of glasses on the counter, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a kitchen with a sink and a lot of glasses on the counter.jpg", "mask_strategy": "0"} a kitchen with a sink and a lot of glasses on the counter, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a kitchen with a sink and a lot of glasses on the counter.jpg", "mask_strategy": "0"} a kitchen with a sink and a lot of 
glasses on the counter, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a kitchen with a sink and a lot of glasses on the counter.jpg", "mask_strategy": "0"} a kitchen with a sink and a lot of glasses on the counter, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a kitchen with a sink and a lot of glasses on the counter.jpg", "mask_strategy": "0"} a wooden table in front of a brick wall with bottles on the wall{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a wooden table in front of a brick wall with bottles on the wall.jpg", "mask_strategy": "0"} a wooden table in front of a brick wall with bottles on the wall, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a wooden table in front of a brick wall with bottles on the wall.jpg", "mask_strategy": "0"} a wooden table in front of a brick wall with bottles on the wall, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a wooden table in front of a brick wall with bottles on the wall.jpg", "mask_strategy": "0"} a wooden table in front of a brick wall with bottles on the wall, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a wooden table in front of a brick wall with bottles on the wall.jpg", "mask_strategy": "0"} a wooden table in front of a brick wall with bottles on the wall, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a wooden table in front of a brick wall with bottles on the wall.jpg", "mask_strategy": "0"} a wooden table in front of a brick wall with bottles on the wall, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a wooden table in front of a brick wall with bottles on the wall.jpg", "mask_strategy": "0"} a wooden table in front of a brick wall with bottles on the wall, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a wooden table in front of a brick 
wall with bottles on the wall.jpg", "mask_strategy": "0"} a wooden table in front of a brick wall with bottles on the wall, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a wooden table in front of a brick wall with bottles on the wall.jpg", "mask_strategy": "0"} a room filled with paintings and statues{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with paintings and statues.jpg", "mask_strategy": "0"} a room filled with paintings and statues, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with paintings and statues.jpg", "mask_strategy": "0"} a room filled with paintings and statues, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with paintings and statues.jpg", "mask_strategy": "0"} a room filled with paintings and statues, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with paintings and statues.jpg", "mask_strategy": "0"} a room filled with paintings and statues, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with paintings and statues.jpg", "mask_strategy": "0"} a room filled with paintings and statues, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with paintings and statues.jpg", "mask_strategy": "0"} a room filled with paintings and statues, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with paintings and statues.jpg", "mask_strategy": "0"} a room filled with paintings and statues, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with paintings and statues.jpg", "mask_strategy": "0"} an outdoor dining area surrounded by plants and a brick walkway{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an outdoor dining area surrounded by plants and a brick walkway.jpg", 
"mask_strategy": "0"} an outdoor dining area surrounded by plants and a brick walkway, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an outdoor dining area surrounded by plants and a brick walkway.jpg", "mask_strategy": "0"} an outdoor dining area surrounded by plants and a brick walkway, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an outdoor dining area surrounded by plants and a brick walkway.jpg", "mask_strategy": "0"} an outdoor dining area surrounded by plants and a brick walkway, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an outdoor dining area surrounded by plants and a brick walkway.jpg", "mask_strategy": "0"} an outdoor dining area surrounded by plants and a brick walkway, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an outdoor dining area surrounded by plants and a brick walkway.jpg", "mask_strategy": "0"} an outdoor dining area surrounded by plants and a brick walkway, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an outdoor dining area surrounded by plants and a brick walkway.jpg", "mask_strategy": "0"} an outdoor dining area surrounded by plants and a brick walkway, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an outdoor dining area surrounded by plants and a brick walkway.jpg", "mask_strategy": "0"} an outdoor dining area surrounded by plants and a brick walkway, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an outdoor dining area surrounded by plants and a brick walkway.jpg", "mask_strategy": "0"} a room filled with books and teddy bears{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with books and teddy bears.jpg", "mask_strategy": "0"} a room filled with books and teddy bears, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with books and teddy 
bears.jpg", "mask_strategy": "0"} a room filled with books and teddy bears, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with books and teddy bears.jpg", "mask_strategy": "0"} a room filled with books and teddy bears, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with books and teddy bears.jpg", "mask_strategy": "0"} a room filled with books and teddy bears, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with books and teddy bears.jpg", "mask_strategy": "0"} a room filled with books and teddy bears, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with books and teddy bears.jpg", "mask_strategy": "0"} a room filled with books and teddy bears, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with books and teddy bears.jpg", "mask_strategy": "0"} a room filled with books and teddy bears, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with books and teddy bears.jpg", "mask_strategy": "0"} a table and chairs in a room with a plant in the corner{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with a plant in the corner.jpg", "mask_strategy": "0"} a table and chairs in a room with a plant in the corner, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with a plant in the corner.jpg", "mask_strategy": "0"} a table and chairs in a room with a plant in the corner, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with a plant in the corner.jpg", "mask_strategy": "0"} a table and chairs in a room with a plant in the corner, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with a plant in the 
corner.jpg", "mask_strategy": "0"} a table and chairs in a room with a plant in the corner, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with a plant in the corner.jpg", "mask_strategy": "0"} a table and chairs in a room with a plant in the corner, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with a plant in the corner.jpg", "mask_strategy": "0"} a table and chairs in a room with a plant in the corner, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with a plant in the corner.jpg", "mask_strategy": "0"} a table and chairs in a room with a plant in the corner, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with a plant in the corner.jpg", "mask_strategy": "0"} a living room with a couch, table, and a window{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a couch, table, and a window.jpg", "mask_strategy": "0"} a living room with a couch, table, and a window, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a couch, table, and a window.jpg", "mask_strategy": "0"} a living room with a couch, table, and a window, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a couch, table, and a window.jpg", "mask_strategy": "0"} a living room with a couch, table, and a window, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a couch, table, and a window.jpg", "mask_strategy": "0"} a living room with a couch, table, and a window, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a couch, table, and a window.jpg", "mask_strategy": "0"} a living room with a couch, table, and a window, camera zooms in{"reference_path": 
"/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a couch, table, and a window.jpg", "mask_strategy": "0"} a living room with a couch, table, and a window, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a couch, table, and a window.jpg", "mask_strategy": "0"} a living room with a couch, table, and a window, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a couch, table, and a window.jpg", "mask_strategy": "0"} a modern living room with wood floors and a tv{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a modern living room with wood floors and a tv.jpg", "mask_strategy": "0"} a modern living room with wood floors and a tv, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a modern living room with wood floors and a tv.jpg", "mask_strategy": "0"} a modern living room with wood floors and a tv, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a modern living room with wood floors and a tv.jpg", "mask_strategy": "0"} a modern living room with wood floors and a tv, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a modern living room with wood floors and a tv.jpg", "mask_strategy": "0"} a modern living room with wood floors and a tv, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a modern living room with wood floors and a tv.jpg", "mask_strategy": "0"} a modern living room with wood floors and a tv, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a modern living room with wood floors and a tv.jpg", "mask_strategy": "0"} a modern living room with wood floors and a tv, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a modern living room with wood floors and a tv.jpg", "mask_strategy": "0"} a modern living room with wood floors and a tv, camera 
static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a modern living room with wood floors and a tv.jpg", "mask_strategy": "0"} a room with a desk and a chair in it{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a desk and a chair in it.jpg", "mask_strategy": "0"} a room with a desk and a chair in it, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a desk and a chair in it.jpg", "mask_strategy": "0"} a room with a desk and a chair in it, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a desk and a chair in it.jpg", "mask_strategy": "0"} a room with a desk and a chair in it, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a desk and a chair in it.jpg", "mask_strategy": "0"} a room with a desk and a chair in it, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a desk and a chair in it.jpg", "mask_strategy": "0"} a room with a desk and a chair in it, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a desk and a chair in it.jpg", "mask_strategy": "0"} a room with a desk and a chair in it, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a desk and a chair in it.jpg", "mask_strategy": "0"} a room with a desk and a chair in it, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a desk and a chair in it.jpg", "mask_strategy": "0"} a large waterfall in the middle of a building{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a building.jpg", "mask_strategy": "0"} a large waterfall in the middle of a building, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a building.jpg", "mask_strategy": "0"} a large waterfall in the middle of a 
building, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a building.jpg", "mask_strategy": "0"} a large waterfall in the middle of a building, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a building.jpg", "mask_strategy": "0"} a large waterfall in the middle of a building, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a building.jpg", "mask_strategy": "0"} a large waterfall in the middle of a building, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a building.jpg", "mask_strategy": "0"} a large waterfall in the middle of a building, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a building.jpg", "mask_strategy": "0"} a large waterfall in the middle of a building, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a building.jpg", "mask_strategy": "0"} a chair in a room next to some drawings{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a chair in a room next to some drawings.jpg", "mask_strategy": "0"} a chair in a room next to some drawings, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a chair in a room next to some drawings.jpg", "mask_strategy": "0"} a chair in a room next to some drawings, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a chair in a room next to some drawings.jpg", "mask_strategy": "0"} a chair in a room next to some drawings, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a chair in a room next to some drawings.jpg", "mask_strategy": "0"} a chair in a room next to some drawings, camera tilts down{"reference_path": 
"/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a chair in a room next to some drawings.jpg", "mask_strategy": "0"} a chair in a room next to some drawings, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a chair in a room next to some drawings.jpg", "mask_strategy": "0"} a chair in a room next to some drawings, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a chair in a room next to some drawings.jpg", "mask_strategy": "0"} a chair in a room next to some drawings, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a chair in a room next to some drawings.jpg", "mask_strategy": "0"} a living room with hardwood floors and a white couch{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with hardwood floors and a white couch.jpg", "mask_strategy": "0"} a living room with hardwood floors and a white couch, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with hardwood floors and a white couch.jpg", "mask_strategy": "0"} a living room with hardwood floors and a white couch, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with hardwood floors and a white couch.jpg", "mask_strategy": "0"} a living room with hardwood floors and a white couch, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with hardwood floors and a white couch.jpg", "mask_strategy": "0"} a living room with hardwood floors and a white couch, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with hardwood floors and a white couch.jpg", "mask_strategy": "0"} a living room with hardwood floors and a white couch, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with hardwood floors and a white couch.jpg", "mask_strategy": "0"} a living room with hardwood floors and a white couch, camera 
zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with hardwood floors and a white couch.jpg", "mask_strategy": "0"} a living room with hardwood floors and a white couch, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with hardwood floors and a white couch.jpg", "mask_strategy": "0"} two people in a canoe on a lake with mountains in the background{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two people in a canoe on a lake with mountains in the background.jpg", "mask_strategy": "0"} two people in a canoe on a lake with mountains in the background, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two people in a canoe on a lake with mountains in the background.jpg", "mask_strategy": "0"} two people in a canoe on a lake with mountains in the background, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two people in a canoe on a lake with mountains in the background.jpg", "mask_strategy": "0"} two people in a canoe on a lake with mountains in the background, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two people in a canoe on a lake with mountains in the background.jpg", "mask_strategy": "0"} two people in a canoe on a lake with mountains in the background, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two people in a canoe on a lake with mountains in the background.jpg", "mask_strategy": "0"} two people in a canoe on a lake with mountains in the background, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two people in a canoe on a lake with mountains in the background.jpg", "mask_strategy": "0"} two people in a canoe on a lake with mountains in the background, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two people in a canoe on a lake with mountains in the background.jpg", 
"mask_strategy": "0"} two people in a canoe on a lake with mountains in the background, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two people in a canoe on a lake with mountains in the background.jpg", "mask_strategy": "0"} an aerial view of a snowy road in a forest{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a snowy road in a forest.jpg", "mask_strategy": "0"} an aerial view of a snowy road in a forest, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a snowy road in a forest.jpg", "mask_strategy": "0"} an aerial view of a snowy road in a forest, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a snowy road in a forest.jpg", "mask_strategy": "0"} an aerial view of a snowy road in a forest, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a snowy road in a forest.jpg", "mask_strategy": "0"} an aerial view of a snowy road in a forest, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a snowy road in a forest.jpg", "mask_strategy": "0"} an aerial view of a snowy road in a forest, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a snowy road in a forest.jpg", "mask_strategy": "0"} an aerial view of a snowy road in a forest, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a snowy road in a forest.jpg", "mask_strategy": "0"} an aerial view of a snowy road in a forest, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a snowy road in a forest.jpg", "mask_strategy": "0"} a view of a waterfall from a distance{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a waterfall from a distance.jpg", "mask_strategy": "0"} a view of a waterfall from a distance, camera 
pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a waterfall from a distance.jpg", "mask_strategy": "0"} a view of a waterfall from a distance, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a waterfall from a distance.jpg", "mask_strategy": "0"} a view of a waterfall from a distance, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a waterfall from a distance.jpg", "mask_strategy": "0"} a view of a waterfall from a distance, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a waterfall from a distance.jpg", "mask_strategy": "0"} a view of a waterfall from a distance, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a waterfall from a distance.jpg", "mask_strategy": "0"} a view of a waterfall from a distance, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a waterfall from a distance.jpg", "mask_strategy": "0"} a view of a waterfall from a distance, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a waterfall from a distance.jpg", "mask_strategy": "0"} a group of hot air balloons flying over a valley{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a valley.jpg", "mask_strategy": "0"} a group of hot air balloons flying over a valley, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a valley.jpg", "mask_strategy": "0"} a group of hot air balloons flying over a valley, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a valley.jpg", "mask_strategy": "0"} a group of hot air balloons flying over a valley, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons 
flying over a valley.jpg", "mask_strategy": "0"} a group of hot air balloons flying over a valley, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a valley.jpg", "mask_strategy": "0"} a group of hot air balloons flying over a valley, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a valley.jpg", "mask_strategy": "0"} a group of hot air balloons flying over a valley, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a valley.jpg", "mask_strategy": "0"} a group of hot air balloons flying over a valley, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a valley.jpg", "mask_strategy": "0"} an aerial view of a group of islands in the middle of a lake{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a group of islands in the middle of a lake.jpg", "mask_strategy": "0"} an aerial view of a group of islands in the middle of a lake, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a group of islands in the middle of a lake.jpg", "mask_strategy": "0"} an aerial view of a group of islands in the middle of a lake, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a group of islands in the middle of a lake.jpg", "mask_strategy": "0"} an aerial view of a group of islands in the middle of a lake, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a group of islands in the middle of a lake.jpg", "mask_strategy": "0"} an aerial view of a group of islands in the middle of a lake, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a group of islands in the middle of a lake.jpg", "mask_strategy": 
"0"} an aerial view of a group of islands in the middle of a lake, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a group of islands in the middle of a lake.jpg", "mask_strategy": "0"} an aerial view of a group of islands in the middle of a lake, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a group of islands in the middle of a lake.jpg", "mask_strategy": "0"} an aerial view of a group of islands in the middle of a lake, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a group of islands in the middle of a lake.jpg", "mask_strategy": "0"} an aerial view of a rocky beach in indonesia{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a rocky beach in indonesia.jpg", "mask_strategy": "0"} an aerial view of a rocky beach in indonesia, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a rocky beach in indonesia.jpg", "mask_strategy": "0"} an aerial view of a rocky beach in indonesia, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a rocky beach in indonesia.jpg", "mask_strategy": "0"} an aerial view of a rocky beach in indonesia, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a rocky beach in indonesia.jpg", "mask_strategy": "0"} an aerial view of a rocky beach in indonesia, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a rocky beach in indonesia.jpg", "mask_strategy": "0"} an aerial view of a rocky beach in indonesia, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a rocky beach in indonesia.jpg", "mask_strategy": "0"} an aerial view of a rocky beach in indonesia, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial 
view of a rocky beach in indonesia.jpg", "mask_strategy": "0"} an aerial view of a rocky beach in indonesia, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a rocky beach in indonesia.jpg", "mask_strategy": "0"} fireworks in the night sky over a city{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/fireworks in the night sky over a city.jpg", "mask_strategy": "0"} fireworks in the night sky over a city, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/fireworks in the night sky over a city.jpg", "mask_strategy": "0"} fireworks in the night sky over a city, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/fireworks in the night sky over a city.jpg", "mask_strategy": "0"} fireworks in the night sky over a city, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/fireworks in the night sky over a city.jpg", "mask_strategy": "0"} fireworks in the night sky over a city, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/fireworks in the night sky over a city.jpg", "mask_strategy": "0"} fireworks in the night sky over a city, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/fireworks in the night sky over a city.jpg", "mask_strategy": "0"} fireworks in the night sky over a city, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/fireworks in the night sky over a city.jpg", "mask_strategy": "0"} fireworks in the night sky over a city, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/fireworks in the night sky over a city.jpg", "mask_strategy": "0"} a large wave crashes into a lighthouse on a stormy day{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse on a stormy day.jpg", "mask_strategy": "0"} a large wave crashes into a lighthouse on a stormy day, camera pans 
left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse on a stormy day.jpg", "mask_strategy": "0"} a large wave crashes into a lighthouse on a stormy day, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse on a stormy day.jpg", "mask_strategy": "0"} a large wave crashes into a lighthouse on a stormy day, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse on a stormy day.jpg", "mask_strategy": "0"} a large wave crashes into a lighthouse on a stormy day, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse on a stormy day.jpg", "mask_strategy": "0"} a large wave crashes into a lighthouse on a stormy day, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse on a stormy day.jpg", "mask_strategy": "0"} a large wave crashes into a lighthouse on a stormy day, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse on a stormy day.jpg", "mask_strategy": "0"} a large wave crashes into a lighthouse on a stormy day, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse on a stormy day.jpg", "mask_strategy": "0"} a mountain range with a sky background{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with a sky background.jpg", "mask_strategy": "0"} a mountain range with a sky background, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with a sky background.jpg", "mask_strategy": "0"} a mountain range with a sky background, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with a sky background.jpg", "mask_strategy": "0"} a 
mountain range with a sky background, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with a sky background.jpg", "mask_strategy": "0"} a mountain range with a sky background, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with a sky background.jpg", "mask_strategy": "0"} a mountain range with a sky background, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with a sky background.jpg", "mask_strategy": "0"} a mountain range with a sky background, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with a sky background.jpg", "mask_strategy": "0"} a mountain range with a sky background, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with a sky background.jpg", "mask_strategy": "0"} a large bonfire is burning in the night sky{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large bonfire is burning in the night sky.jpg", "mask_strategy": "0"} a large bonfire is burning in the night sky, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large bonfire is burning in the night sky.jpg", "mask_strategy": "0"} a large bonfire is burning in the night sky, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large bonfire is burning in the night sky.jpg", "mask_strategy": "0"} a large bonfire is burning in the night sky, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large bonfire is burning in the night sky.jpg", "mask_strategy": "0"} a large bonfire is burning in the night sky, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large bonfire is burning in the night sky.jpg", "mask_strategy": "0"} a large bonfire is burning in the night sky, camera zooms in{"reference_path": 
"/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large bonfire is burning in the night sky.jpg", "mask_strategy": "0"} a large bonfire is burning in the night sky, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large bonfire is burning in the night sky.jpg", "mask_strategy": "0"} a large bonfire is burning in the night sky, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large bonfire is burning in the night sky.jpg", "mask_strategy": "0"} a close-up view of the flames of a fireplace{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of the flames of a fireplace.jpg", "mask_strategy": "0"} a close-up view of the flames of a fireplace, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of the flames of a fireplace.jpg", "mask_strategy": "0"} a close-up view of the flames of a fireplace, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of the flames of a fireplace.jpg", "mask_strategy": "0"} a close-up view of the flames of a fireplace, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of the flames of a fireplace.jpg", "mask_strategy": "0"} a close-up view of the flames of a fireplace, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of the flames of a fireplace.jpg", "mask_strategy": "0"} a close-up view of the flames of a fireplace, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of the flames of a fireplace.jpg", "mask_strategy": "0"} a close-up view of the flames of a fireplace, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of the flames of a fireplace.jpg", "mask_strategy": "0"} a close-up view of the flames of a fireplace, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a 
close-up view of the flames of a fireplace.jpg", "mask_strategy": "0"} a farm in the middle of the day{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a farm in the middle of the day.jpg", "mask_strategy": "0"} a farm in the middle of the day, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a farm in the middle of the day.jpg", "mask_strategy": "0"} a farm in the middle of the day, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a farm in the middle of the day.jpg", "mask_strategy": "0"} a farm in the middle of the day, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a farm in the middle of the day.jpg", "mask_strategy": "0"} a farm in the middle of the day, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a farm in the middle of the day.jpg", "mask_strategy": "0"} a farm in the middle of the day, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a farm in the middle of the day.jpg", "mask_strategy": "0"} a farm in the middle of the day, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a farm in the middle of the day.jpg", "mask_strategy": "0"} a farm in the middle of the day, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a farm in the middle of the day.jpg", "mask_strategy": "0"} a flock of birds flying over a tree at sunset{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a flock of birds flying over a tree at sunset.jpg", "mask_strategy": "0"} a flock of birds flying over a tree at sunset, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a flock of birds flying over a tree at sunset.jpg", "mask_strategy": "0"} a flock of birds flying over a tree at sunset, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a flock of birds flying over a tree at sunset.jpg", 
"mask_strategy": "0"} a flock of birds flying over a tree at sunset, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a flock of birds flying over a tree at sunset.jpg", "mask_strategy": "0"} a flock of birds flying over a tree at sunset, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a flock of birds flying over a tree at sunset.jpg", "mask_strategy": "0"} a flock of birds flying over a tree at sunset, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a flock of birds flying over a tree at sunset.jpg", "mask_strategy": "0"} a flock of birds flying over a tree at sunset, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a flock of birds flying over a tree at sunset.jpg", "mask_strategy": "0"} a flock of birds flying over a tree at sunset, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a flock of birds flying over a tree at sunset.jpg", "mask_strategy": "0"} a captivating scene featuring a spiral galaxy shining brilliantly in the night sky{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg", "mask_strategy": "0"} a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg", "mask_strategy": "0"} a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg", "mask_strategy": "0"} a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a captivating 
scene featuring a spiral galaxy shining brilliantly in the night sky.jpg", "mask_strategy": "0"} a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg", "mask_strategy": "0"} a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg", "mask_strategy": "0"} a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg", "mask_strategy": "0"} a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg", "mask_strategy": "0"} a mountain with snow on it{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain with snow on it.jpg", "mask_strategy": "0"} a mountain with snow on it, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain with snow on it.jpg", "mask_strategy": "0"} a mountain with snow on it, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain with snow on it.jpg", "mask_strategy": "0"} a mountain with snow on it, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain with snow on it.jpg", "mask_strategy": "0"} a mountain with snow on it, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain with snow on it.jpg", "mask_strategy": "0"} a mountain with 
snow on it, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain with snow on it.jpg", "mask_strategy": "0"} a mountain with snow on it, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain with snow on it.jpg", "mask_strategy": "0"} a mountain with snow on it, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain with snow on it.jpg", "mask_strategy": "0"} a bridge that is in the middle of a river{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is in the middle of a river.jpg", "mask_strategy": "0"} a bridge that is in the middle of a river, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is in the middle of a river.jpg", "mask_strategy": "0"} a bridge that is in the middle of a river, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is in the middle of a river.jpg", "mask_strategy": "0"} a bridge that is in the middle of a river, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is in the middle of a river.jpg", "mask_strategy": "0"} a bridge that is in the middle of a river, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is in the middle of a river.jpg", "mask_strategy": "0"} a bridge that is in the middle of a river, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is in the middle of a river.jpg", "mask_strategy": "0"} a bridge that is in the middle of a river, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is in the middle of a river.jpg", "mask_strategy": "0"} a bridge that is in the middle of a river, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is in the middle of a river.jpg", "mask_strategy": "0"} a group of people 
standing on top of a green hill{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of people standing on top of a green hill.jpg", "mask_strategy": "0"} a group of people standing on top of a green hill, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of people standing on top of a green hill.jpg", "mask_strategy": "0"} a group of people standing on top of a green hill, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of people standing on top of a green hill.jpg", "mask_strategy": "0"} a group of people standing on top of a green hill, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of people standing on top of a green hill.jpg", "mask_strategy": "0"} a group of people standing on top of a green hill, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of people standing on top of a green hill.jpg", "mask_strategy": "0"} a group of people standing on top of a green hill, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of people standing on top of a green hill.jpg", "mask_strategy": "0"} a group of people standing on top of a green hill, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of people standing on top of a green hill.jpg", "mask_strategy": "0"} a group of people standing on top of a green hill, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of people standing on top of a green hill.jpg", "mask_strategy": "0"} a sandy beach with a wooden pier in the water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with a wooden pier in the water.jpg", "mask_strategy": "0"} a sandy beach with a wooden pier in the water, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with a wooden pier in the water.jpg", 
"mask_strategy": "0"} a sandy beach with a wooden pier in the water, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with a wooden pier in the water.jpg", "mask_strategy": "0"} a sandy beach with a wooden pier in the water, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with a wooden pier in the water.jpg", "mask_strategy": "0"} a sandy beach with a wooden pier in the water, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with a wooden pier in the water.jpg", "mask_strategy": "0"} a sandy beach with a wooden pier in the water, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with a wooden pier in the water.jpg", "mask_strategy": "0"} a sandy beach with a wooden pier in the water, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with a wooden pier in the water.jpg", "mask_strategy": "0"} a sandy beach with a wooden pier in the water, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with a wooden pier in the water.jpg", "mask_strategy": "0"} a lake surrounded by mountains and flowers{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a lake surrounded by mountains and flowers.jpg", "mask_strategy": "0"} a lake surrounded by mountains and flowers, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a lake surrounded by mountains and flowers.jpg", "mask_strategy": "0"} a lake surrounded by mountains and flowers, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a lake surrounded by mountains and flowers.jpg", "mask_strategy": "0"} a lake surrounded by mountains and flowers, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a lake surrounded by mountains and flowers.jpg", "mask_strategy": "0"} a lake surrounded by 
mountains and flowers, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a lake surrounded by mountains and flowers.jpg", "mask_strategy": "0"} a lake surrounded by mountains and flowers, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a lake surrounded by mountains and flowers.jpg", "mask_strategy": "0"} a lake surrounded by mountains and flowers, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a lake surrounded by mountains and flowers.jpg", "mask_strategy": "0"} a lake surrounded by mountains and flowers, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a lake surrounded by mountains and flowers.jpg", "mask_strategy": "0"} a hot-air balloon flying over a desert landscape{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a hot-air balloon flying over a desert landscape.jpg", "mask_strategy": "0"} a hot-air balloon flying over a desert landscape, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a hot-air balloon flying over a desert landscape.jpg", "mask_strategy": "0"} a hot-air balloon flying over a desert landscape, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a hot-air balloon flying over a desert landscape.jpg", "mask_strategy": "0"} a hot-air balloon flying over a desert landscape, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a hot-air balloon flying over a desert landscape.jpg", "mask_strategy": "0"} a hot-air balloon flying over a desert landscape, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a hot-air balloon flying over a desert landscape.jpg", "mask_strategy": "0"} a hot-air balloon flying over a desert landscape, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a hot-air balloon flying over a desert landscape.jpg", "mask_strategy": "0"} a hot-air balloon flying over 
a desert landscape, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a hot-air balloon flying over a desert landscape.jpg", "mask_strategy": "0"} a hot-air balloon flying over a desert landscape, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a hot-air balloon flying over a desert landscape.jpg", "mask_strategy": "0"} several hot air balloons flying over a city{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/several hot air balloons flying over a city.jpg", "mask_strategy": "0"} several hot air balloons flying over a city, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/several hot air balloons flying over a city.jpg", "mask_strategy": "0"} several hot air balloons flying over a city, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/several hot air balloons flying over a city.jpg", "mask_strategy": "0"} several hot air balloons flying over a city, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/several hot air balloons flying over a city.jpg", "mask_strategy": "0"} several hot air balloons flying over a city, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/several hot air balloons flying over a city.jpg", "mask_strategy": "0"} several hot air balloons flying over a city, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/several hot air balloons flying over a city.jpg", "mask_strategy": "0"} several hot air balloons flying over a city, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/several hot air balloons flying over a city.jpg", "mask_strategy": "0"} several hot air balloons flying over a city, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/several hot air balloons flying over a city.jpg", "mask_strategy": "0"} a group of hot air balloons flying over a field{"reference_path": 
"/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a field.jpg", "mask_strategy": "0"} a group of hot air balloons flying over a field, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a field.jpg", "mask_strategy": "0"} a group of hot air balloons flying over a field, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a field.jpg", "mask_strategy": "0"} a group of hot air balloons flying over a field, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a field.jpg", "mask_strategy": "0"} a group of hot air balloons flying over a field, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a field.jpg", "mask_strategy": "0"} a group of hot air balloons flying over a field, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a field.jpg", "mask_strategy": "0"} a group of hot air balloons flying over a field, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a field.jpg", "mask_strategy": "0"} a group of hot air balloons flying over a field, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a field.jpg", "mask_strategy": "0"} a large wave crashes over a rocky cliff{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes over a rocky cliff.jpg", "mask_strategy": "0"} a large wave crashes over a rocky cliff, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes over a rocky cliff.jpg", "mask_strategy": "0"} a large wave crashes over a rocky cliff, camera pans right{"reference_path": 
"/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes over a rocky cliff.jpg", "mask_strategy": "0"} a large wave crashes over a rocky cliff, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes over a rocky cliff.jpg", "mask_strategy": "0"} a large wave crashes over a rocky cliff, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes over a rocky cliff.jpg", "mask_strategy": "0"} a large wave crashes over a rocky cliff, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes over a rocky cliff.jpg", "mask_strategy": "0"} a large wave crashes over a rocky cliff, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes over a rocky cliff.jpg", "mask_strategy": "0"} a large wave crashes over a rocky cliff, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes over a rocky cliff.jpg", "mask_strategy": "0"} the sun is setting over a lake in the mountains{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is setting over a lake in the mountains.jpg", "mask_strategy": "0"} the sun is setting over a lake in the mountains, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is setting over a lake in the mountains.jpg", "mask_strategy": "0"} the sun is setting over a lake in the mountains, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is setting over a lake in the mountains.jpg", "mask_strategy": "0"} the sun is setting over a lake in the mountains, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is setting over a lake in the mountains.jpg", "mask_strategy": "0"} the sun is setting over a lake in the mountains, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is setting over 
a lake in the mountains.jpg", "mask_strategy": "0"} the sun is setting over a lake in the mountains, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is setting over a lake in the mountains.jpg", "mask_strategy": "0"} the sun is setting over a lake in the mountains, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is setting over a lake in the mountains.jpg", "mask_strategy": "0"} the sun is setting over a lake in the mountains, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is setting over a lake in the mountains.jpg", "mask_strategy": "0"} a mountain range with snow on the ground{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with snow on the ground.jpg", "mask_strategy": "0"} a mountain range with snow on the ground, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with snow on the ground.jpg", "mask_strategy": "0"} a mountain range with snow on the ground, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with snow on the ground.jpg", "mask_strategy": "0"} a mountain range with snow on the ground, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with snow on the ground.jpg", "mask_strategy": "0"} a mountain range with snow on the ground, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with snow on the ground.jpg", "mask_strategy": "0"} a mountain range with snow on the ground, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with snow on the ground.jpg", "mask_strategy": "0"} a mountain range with snow on the ground, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with snow on the ground.jpg", "mask_strategy": "0"} a mountain range with snow 
on the ground, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with snow on the ground.jpg", "mask_strategy": "0"} sun rays shining through clouds over a lake{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/sun rays shining through clouds over a lake.jpg", "mask_strategy": "0"} sun rays shining through clouds over a lake, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/sun rays shining through clouds over a lake.jpg", "mask_strategy": "0"} sun rays shining through clouds over a lake, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/sun rays shining through clouds over a lake.jpg", "mask_strategy": "0"} sun rays shining through clouds over a lake, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/sun rays shining through clouds over a lake.jpg", "mask_strategy": "0"} sun rays shining through clouds over a lake, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/sun rays shining through clouds over a lake.jpg", "mask_strategy": "0"} sun rays shining through clouds over a lake, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/sun rays shining through clouds over a lake.jpg", "mask_strategy": "0"} sun rays shining through clouds over a lake, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/sun rays shining through clouds over a lake.jpg", "mask_strategy": "0"} sun rays shining through clouds over a lake, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/sun rays shining through clouds over a lake.jpg", "mask_strategy": "0"} a boat sits on the shore of a lake with mt fuji in the background{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a boat sits on the shore of a lake with mt fuji in the background.jpg", "mask_strategy": "0"} a boat sits on the shore of a lake with mt fuji in the background, 
camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a boat sits on the shore of a lake with mt fuji in the background.jpg", "mask_strategy": "0"} a boat sits on the shore of a lake with mt fuji in the background, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a boat sits on the shore of a lake with mt fuji in the background.jpg", "mask_strategy": "0"} a boat sits on the shore of a lake with mt fuji in the background, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a boat sits on the shore of a lake with mt fuji in the background.jpg", "mask_strategy": "0"} a boat sits on the shore of a lake with mt fuji in the background, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a boat sits on the shore of a lake with mt fuji in the background.jpg", "mask_strategy": "0"} a boat sits on the shore of a lake with mt fuji in the background, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a boat sits on the shore of a lake with mt fuji in the background.jpg", "mask_strategy": "0"} a boat sits on the shore of a lake with mt fuji in the background, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a boat sits on the shore of a lake with mt fuji in the background.jpg", "mask_strategy": "0"} a boat sits on the shore of a lake with mt fuji in the background, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a boat sits on the shore of a lake with mt fuji in the background.jpg", "mask_strategy": "0"} a foggy road with trees in the distance{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy road with trees in the distance.jpg", "mask_strategy": "0"} a foggy road with trees in the distance, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy road with trees in the distance.jpg", "mask_strategy": "0"} a foggy road with trees in the 
distance, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy road with trees in the distance.jpg", "mask_strategy": "0"} a foggy road with trees in the distance, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy road with trees in the distance.jpg", "mask_strategy": "0"} a foggy road with trees in the distance, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy road with trees in the distance.jpg", "mask_strategy": "0"} a foggy road with trees in the distance, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy road with trees in the distance.jpg", "mask_strategy": "0"} a foggy road with trees in the distance, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy road with trees in the distance.jpg", "mask_strategy": "0"} a foggy road with trees in the distance, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy road with trees in the distance.jpg", "mask_strategy": "0"} two swans swimming on a lake in the fog{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two swans swimming on a lake in the fog.jpg", "mask_strategy": "0"} two swans swimming on a lake in the fog, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two swans swimming on a lake in the fog.jpg", "mask_strategy": "0"} two swans swimming on a lake in the fog, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two swans swimming on a lake in the fog.jpg", "mask_strategy": "0"} two swans swimming on a lake in the fog, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two swans swimming on a lake in the fog.jpg", "mask_strategy": "0"} two swans swimming on a lake in the fog, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two swans swimming on a lake in the fog.jpg", 
"mask_strategy": "0"} two swans swimming on a lake in the fog, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two swans swimming on a lake in the fog.jpg", "mask_strategy": "0"} two swans swimming on a lake in the fog, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two swans swimming on a lake in the fog.jpg", "mask_strategy": "0"} two swans swimming on a lake in the fog, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two swans swimming on a lake in the fog.jpg", "mask_strategy": "0"} the sun is shining through the trees near a waterfall{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is shining through the trees near a waterfall.jpg", "mask_strategy": "0"} the sun is shining through the trees near a waterfall, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is shining through the trees near a waterfall.jpg", "mask_strategy": "0"} the sun is shining through the trees near a waterfall, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is shining through the trees near a waterfall.jpg", "mask_strategy": "0"} the sun is shining through the trees near a waterfall, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is shining through the trees near a waterfall.jpg", "mask_strategy": "0"} the sun is shining through the trees near a waterfall, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is shining through the trees near a waterfall.jpg", "mask_strategy": "0"} the sun is shining through the trees near a waterfall, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is shining through the trees near a waterfall.jpg", "mask_strategy": "0"} the sun is shining through the trees near a waterfall, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the 
sun is shining through the trees near a waterfall.jpg", "mask_strategy": "0"} the sun is shining through the trees near a waterfall, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is shining through the trees near a waterfall.jpg", "mask_strategy": "0"} a sandy beach with palm trees on the shore{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with palm trees on the shore.jpg", "mask_strategy": "0"} a sandy beach with palm trees on the shore, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with palm trees on the shore.jpg", "mask_strategy": "0"} a sandy beach with palm trees on the shore, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with palm trees on the shore.jpg", "mask_strategy": "0"} a sandy beach with palm trees on the shore, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with palm trees on the shore.jpg", "mask_strategy": "0"} a sandy beach with palm trees on the shore, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with palm trees on the shore.jpg", "mask_strategy": "0"} a sandy beach with palm trees on the shore, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with palm trees on the shore.jpg", "mask_strategy": "0"} a sandy beach with palm trees on the shore, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with palm trees on the shore.jpg", "mask_strategy": "0"} a sandy beach with palm trees on the shore, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with palm trees on the shore.jpg", "mask_strategy": "0"} an aerial view of a body of water and a beach{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a body of water and a beach.jpg", "mask_strategy": 
"0"} an aerial view of a body of water and a beach, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a body of water and a beach.jpg", "mask_strategy": "0"} an aerial view of a body of water and a beach, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a body of water and a beach.jpg", "mask_strategy": "0"} an aerial view of a body of water and a beach, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a body of water and a beach.jpg", "mask_strategy": "0"} an aerial view of a body of water and a beach, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a body of water and a beach.jpg", "mask_strategy": "0"} an aerial view of a body of water and a beach, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a body of water and a beach.jpg", "mask_strategy": "0"} an aerial view of a body of water and a beach, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a body of water and a beach.jpg", "mask_strategy": "0"} an aerial view of a body of water and a beach, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a body of water and a beach.jpg", "mask_strategy": "0"} a foggy field that has trees in the grass{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy field that has trees in the grass.jpg", "mask_strategy": "0"} a foggy field that has trees in the grass, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy field that has trees in the grass.jpg", "mask_strategy": "0"} a foggy field that has trees in the grass, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy field that has trees in the grass.jpg", "mask_strategy": "0"} a foggy field that has trees in the 
grass, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy field that has trees in the grass.jpg", "mask_strategy": "0"} a foggy field that has trees in the grass, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy field that has trees in the grass.jpg", "mask_strategy": "0"} a foggy field that has trees in the grass, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy field that has trees in the grass.jpg", "mask_strategy": "0"} a foggy field that has trees in the grass, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy field that has trees in the grass.jpg", "mask_strategy": "0"} a foggy field that has trees in the grass, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy field that has trees in the grass.jpg", "mask_strategy": "0"} a foggy landscape with trees and hills in the distance{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy landscape with trees and hills in the distance.jpg", "mask_strategy": "0"} a foggy landscape with trees and hills in the distance, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy landscape with trees and hills in the distance.jpg", "mask_strategy": "0"} a foggy landscape with trees and hills in the distance, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy landscape with trees and hills in the distance.jpg", "mask_strategy": "0"} a foggy landscape with trees and hills in the distance, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy landscape with trees and hills in the distance.jpg", "mask_strategy": "0"} a foggy landscape with trees and hills in the distance, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy landscape with trees and hills in the distance.jpg", "mask_strategy": "0"} a 
foggy landscape with trees and hills in the distance, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy landscape with trees and hills in the distance.jpg", "mask_strategy": "0"} a foggy landscape with trees and hills in the distance, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy landscape with trees and hills in the distance.jpg", "mask_strategy": "0"} a foggy landscape with trees and hills in the distance, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy landscape with trees and hills in the distance.jpg", "mask_strategy": "0"} a large wave in the ocean with a lot of spray coming from it{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave in the ocean with a lot of spray coming from it.jpg", "mask_strategy": "0"} a large wave in the ocean with a lot of spray coming from it, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave in the ocean with a lot of spray coming from it.jpg", "mask_strategy": "0"} a large wave in the ocean with a lot of spray coming from it, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave in the ocean with a lot of spray coming from it.jpg", "mask_strategy": "0"} a large wave in the ocean with a lot of spray coming from it, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave in the ocean with a lot of spray coming from it.jpg", "mask_strategy": "0"} a large wave in the ocean with a lot of spray coming from it, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave in the ocean with a lot of spray coming from it.jpg", "mask_strategy": "0"} a large wave in the ocean with a lot of spray coming from it, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave in the ocean with a lot of spray coming from it.jpg", 
"mask_strategy": "0"} a large wave in the ocean with a lot of spray coming from it, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave in the ocean with a lot of spray coming from it.jpg", "mask_strategy": "0"} a large wave in the ocean with a lot of spray coming from it, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave in the ocean with a lot of spray coming from it.jpg", "mask_strategy": "0"} a waterfall is shown in the middle of a lush green hillside{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a waterfall is shown in the middle of a lush green hillside.jpg", "mask_strategy": "0"} a waterfall is shown in the middle of a lush green hillside, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a waterfall is shown in the middle of a lush green hillside.jpg", "mask_strategy": "0"} a waterfall is shown in the middle of a lush green hillside, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a waterfall is shown in the middle of a lush green hillside.jpg", "mask_strategy": "0"} a waterfall is shown in the middle of a lush green hillside, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a waterfall is shown in the middle of a lush green hillside.jpg", "mask_strategy": "0"} a waterfall is shown in the middle of a lush green hillside, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a waterfall is shown in the middle of a lush green hillside.jpg", "mask_strategy": "0"} a waterfall is shown in the middle of a lush green hillside, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a waterfall is shown in the middle of a lush green hillside.jpg", "mask_strategy": "0"} a waterfall is shown in the middle of a lush green hillside, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a waterfall is shown in the 
middle of a lush green hillside.jpg", "mask_strategy": "0"} a waterfall is shown in the middle of a lush green hillside, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a waterfall is shown in the middle of a lush green hillside.jpg", "mask_strategy": "0"} an aerial view of a curvy road in the middle of a forest{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a curvy road in the middle of a forest.jpg", "mask_strategy": "0"} an aerial view of a curvy road in the middle of a forest, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a curvy road in the middle of a forest.jpg", "mask_strategy": "0"} an aerial view of a curvy road in the middle of a forest, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a curvy road in the middle of a forest.jpg", "mask_strategy": "0"} an aerial view of a curvy road in the middle of a forest, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a curvy road in the middle of a forest.jpg", "mask_strategy": "0"} an aerial view of a curvy road in the middle of a forest, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a curvy road in the middle of a forest.jpg", "mask_strategy": "0"} an aerial view of a curvy road in the middle of a forest, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a curvy road in the middle of a forest.jpg", "mask_strategy": "0"} an aerial view of a curvy road in the middle of a forest, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a curvy road in the middle of a forest.jpg", "mask_strategy": "0"} an aerial view of a curvy road in the middle of a forest, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a curvy road in the middle 
of a forest.jpg", "mask_strategy": "0"} a mountain covered in snow with evergreen trees{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain covered in snow with evergreen trees.jpg", "mask_strategy": "0"} a mountain covered in snow with evergreen trees, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain covered in snow with evergreen trees.jpg", "mask_strategy": "0"} a mountain covered in snow with evergreen trees, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain covered in snow with evergreen trees.jpg", "mask_strategy": "0"} a mountain covered in snow with evergreen trees, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain covered in snow with evergreen trees.jpg", "mask_strategy": "0"} a mountain covered in snow with evergreen trees, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain covered in snow with evergreen trees.jpg", "mask_strategy": "0"} a mountain covered in snow with evergreen trees, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain covered in snow with evergreen trees.jpg", "mask_strategy": "0"} a mountain covered in snow with evergreen trees, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain covered in snow with evergreen trees.jpg", "mask_strategy": "0"} a mountain covered in snow with evergreen trees, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain covered in snow with evergreen trees.jpg", "mask_strategy": "0"} a very large waterfall in the middle of the day{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a very large waterfall in the middle of the day.jpg", "mask_strategy": "0"} a very large waterfall in the middle of the day, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a very large waterfall in the 
middle of the day.jpg", "mask_strategy": "0"} a very large waterfall in the middle of the day, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a very large waterfall in the middle of the day.jpg", "mask_strategy": "0"} a very large waterfall in the middle of the day, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a very large waterfall in the middle of the day.jpg", "mask_strategy": "0"} a very large waterfall in the middle of the day, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a very large waterfall in the middle of the day.jpg", "mask_strategy": "0"} a very large waterfall in the middle of the day, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a very large waterfall in the middle of the day.jpg", "mask_strategy": "0"} a very large waterfall in the middle of the day, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a very large waterfall in the middle of the day.jpg", "mask_strategy": "0"} a very large waterfall in the middle of the day, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a very large waterfall in the middle of the day.jpg", "mask_strategy": "0"} a large waterfall in the middle of a lush green hillside{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a lush green hillside.jpg", "mask_strategy": "0"} a large waterfall in the middle of a lush green hillside, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a lush green hillside.jpg", "mask_strategy": "0"} a large waterfall in the middle of a lush green hillside, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a lush green hillside.jpg", "mask_strategy": "0"} a large waterfall in the middle of a lush green hillside, camera tilts 
up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a lush green hillside.jpg", "mask_strategy": "0"} a large waterfall in the middle of a lush green hillside, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a lush green hillside.jpg", "mask_strategy": "0"} a large waterfall in the middle of a lush green hillside, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a lush green hillside.jpg", "mask_strategy": "0"} a large waterfall in the middle of a lush green hillside, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a lush green hillside.jpg", "mask_strategy": "0"} a large waterfall in the middle of a lush green hillside, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a lush green hillside.jpg", "mask_strategy": "0"} a brown bear in the water with a fish in its mouth{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a brown bear in the water with a fish in its mouth.jpg", "mask_strategy": "0"} a close-up of a hippopotamus eating grass in a field{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up of a hippopotamus eating grass in a field.jpg", "mask_strategy": "0"} a sea turtle swimming in the ocean under the water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sea turtle swimming in the ocean under the water.jpg", "mask_strategy": "0"} two bees are flying over a lavender plant{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two bees are flying over a lavender plant.jpg", "mask_strategy": "0"} the otter is standing in the water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the otter is standing in the water.jpg", "mask_strategy": "0"} a dog carrying a soccer ball in its 
mouth{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dog carrying a soccer ball in its mouth.jpg", "mask_strategy": "0"} an eagle is flying over a mountain with trees in the background{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an eagle is flying over a mountain with trees in the background.jpg", "mask_strategy": "0"} a couple of horses are running in the dirt{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a couple of horses are running in the dirt.jpg", "mask_strategy": "0"} a highland cow with long horns standing in a field{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a highland cow with long horns standing in a field.jpg", "mask_strategy": "0"} a monkey is holding a banana in its mouth{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a monkey is holding a banana in its mouth.jpg", "mask_strategy": "0"} a large rhino grazing in the grass near a bush{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large rhino grazing in the grass near a bush.jpg", "mask_strategy": "0"} a butterfly sits on top of a purple flower{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a butterfly sits on top of a purple flower.jpg", "mask_strategy": "0"} an alligator is covered in green plants in the water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an alligator is covered in green plants in the water.jpg", "mask_strategy": "0"} a red panda eating bamboo in a zoo{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a red panda eating bamboo in a zoo.jpg", "mask_strategy": "0"} a monochromatic video capturing a cat's gaze into the camera{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a monochromatic video capturing a cat's gaze into the camera.jpg", "mask_strategy": "0"} a frog sitting on top of water lily leaves{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a frog sitting on top of water lily leaves.jpg", "mask_strategy": 
"0"} a lion is roaring in the wild{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a lion is roaring in the wild.jpg", "mask_strategy": "0"} a seagull is flying towards a person's hand{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a seagull is flying towards a person's hand.jpg", "mask_strategy": "0"} a yellow and white jellyfish is floating in the ocean{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a yellow and white jellyfish is floating in the ocean.jpg", "mask_strategy": "0"} a group of jellyfish swimming in an aquarium{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of jellyfish swimming in an aquarium.jpg", "mask_strategy": "0"} a clown fish hiding in a purple anemone{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a clown fish hiding in a purple anemone.jpg", "mask_strategy": "0"} a snake sitting on the ground next to a bowl{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a snake sitting on the ground next to a bowl.jpg", "mask_strategy": "0"} a brown and white cow eating hay{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a brown and white cow eating hay.jpg", "mask_strategy": "0"} a seal swimming in the water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a seal swimming in the water.jpg", "mask_strategy": "0"} a panda bear is eating a piece of bamboo{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a panda bear is eating a piece of bamboo.jpg", "mask_strategy": "0"} a small bird sits on a moss covered branch{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a small bird sits on a moss covered branch.jpg", "mask_strategy": "0"} a bird with a fish in its beak flying over a field{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bird with a fish in its beak flying over a field.jpg", "mask_strategy": "0"} a large flock of birds flying in the sky{"reference_path": 
"/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large flock of birds flying in the sky.jpg", "mask_strategy": "0"} a bald eagle flying over a tree filled forest{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bald eagle flying over a tree filled forest.jpg", "mask_strategy": "0"} a giraffe walking in a field{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a giraffe walking in a field.jpg", "mask_strategy": "0"} a lioness yawning in a field{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a lioness yawning in a field.jpg", "mask_strategy": "0"} a little crab scurried on the sandy beach{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a little crab scurried on the sandy beach.jpg", "mask_strategy": "0"} a warthog is walking in the grass{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a warthog is walking in the grass.jpg", "mask_strategy": "0"} a penguin walking on a beach near the water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a penguin walking on a beach near the water.jpg", "mask_strategy": "0"} a tiger walking through a wooded area{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a tiger walking through a wooded area.jpg", "mask_strategy": "0"} a tiger walking on a dirt path in the woods{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a tiger walking on a dirt path in the woods.jpg", "mask_strategy": "0"} a small monkey holding a piece of food in it's mouth{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a small monkey holding a piece of food in it's mouth.jpg", "mask_strategy": "0"} a squirrel sitting on the ground eating a piece of bread{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a squirrel sitting on the ground eating a piece of bread.jpg", "mask_strategy": "0"} a group of fish swimming over a coral reef{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of fish swimming over a coral reef.jpg", 
"mask_strategy": "0"} a toad is sitting on top of some moss{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a toad is sitting on top of some moss.jpg", "mask_strategy": "0"} a great white shark swimming in the ocean{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a great white shark swimming in the ocean.jpg", "mask_strategy": "0"} a group of camels resting in the desert{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of camels resting in the desert.jpg", "mask_strategy": "0"} two sheep grazing in the grass next to a wooden bridge{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two sheep grazing in the grass next to a wooden bridge.jpg", "mask_strategy": "0"} an elephant walking through a forest{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an elephant walking through a forest.jpg", "mask_strategy": "0"} a white rooster standing in a grassy field{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a white rooster standing in a grassy field.jpg", "mask_strategy": "0"} a zebra walking across a dirt road near a field{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a zebra walking across a dirt road near a field.jpg", "mask_strategy": "0"} cars are driving down a street lined with tall trees{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/cars are driving down a street lined with tall trees.jpg", "mask_strategy": "0"} the cars on the street are waiting for the traffic lights{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the cars on the street are waiting for the traffic lights.jpg", "mask_strategy": "0"} a bicycle leaning against a fence in the snow{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bicycle leaning against a fence in the snow.jpg", "mask_strategy": "0"} a blue fishing boat is navigating in the ocean next to a cruise ship{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a blue fishing boat is 
navigating in the ocean next to a cruise ship.jpg", "mask_strategy": "0"} a blue car driving down a dirt road near train tracks{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a blue car driving down a dirt road near train tracks.jpg", "mask_strategy": "0"} a sailboat is drifting on the ocean{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sailboat is drifting on the ocean.jpg", "mask_strategy": "0"} a couple of boats floating on a body of water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a couple of boats floating on a body of water.jpg", "mask_strategy": "0"} a city street with cars driving in the rain{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a city street with cars driving in the rain.jpg", "mask_strategy": "0"} a red and white tram traveling down a snowy street{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a red and white tram traveling down a snowy street.jpg", "mask_strategy": "0"} a city bus driving down a snowy street at night{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a city bus driving down a snowy street at night.jpg", "mask_strategy": "0"} a green toy car is sitting on the ground{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a green toy car is sitting on the ground.jpg", "mask_strategy": "0"} a train traveling down tracks through the woods with leaves on the ground{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a train traveling down tracks through the woods with leaves on the ground.jpg", "mask_strategy": "0"} a man in a small boat fishing in the ocean{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man in a small boat fishing in the ocean.jpg", "mask_strategy": "0"} an airplane is flying through the sky at sunset{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an airplane is flying through the sky at sunset.jpg", "mask_strategy": "0"} an old rusty car sits in the middle of a 
field{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an old rusty car sits in the middle of a field.jpg", "mask_strategy": "0"} a motorcycle driving down a road{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a motorcycle driving down a road.jpg", "mask_strategy": "0"} a blue train traveling through a lush green area{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a blue train traveling through a lush green area.jpg", "mask_strategy": "0"} a white car is swiftly driving on a dirt road near a bush, kicking up dust{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a white car is swiftly driving on a dirt road near a bush, kicking up dust.jpg", "mask_strategy": "0"} a large cargo ship sailing in the water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large cargo ship sailing in the water.jpg", "mask_strategy": "0"} the red Alfa sports car is speeding down the road{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the red Alfa sports car is speeding down the road.jpg", "mask_strategy": "0"} two cars that have been involved in a violent collision{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two cars that have been involved in a violent collision.jpg", "mask_strategy": "0"} a red double decker bus driving down a street{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a red double decker bus driving down a street.jpg", "mask_strategy": "0"} A red sports car driving through sand, kicking up a large amount of dust{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A red sports car driving through sand, kicking up a large amount of dust.jpg", "mask_strategy": "0"} a yellow toy car parked on a rock near the water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a yellow toy car parked on a rock near the water.jpg", "mask_strategy": "0"} a space shuttle taking off into the sky{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a 
space shuttle taking off into the sky.jpg", "mask_strategy": "0"} a steam train traveling through the woods{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a steam train traveling through the woods.jpg", "mask_strategy": "0"} a group of buses parked at a bus station{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of buses parked at a bus station.jpg", "mask_strategy": "0"} A bunch of cars are driving on a highway{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A bunch of cars are driving on a highway.jpg", "mask_strategy": "0"} a white and blue airplane flying in the sky{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a white and blue airplane flying in the sky.jpg", "mask_strategy": "0"} A space station orbited above the Earth{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A space station orbited above the Earth.jpg", "mask_strategy": "0"} A yellow boat is cruising in front of a bridge{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A yellow boat is cruising in front of a bridge.jpg", "mask_strategy": "0"} tangerines in a metal bowl on a table{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tangerines in a metal bowl on a table.jpg", "mask_strategy": "0"} a shadow of a hand reaching for a leaf{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a shadow of a hand reaching for a leaf.jpg", "mask_strategy": "0"} A teddy bear is climbing over a wooden fence{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A teddy bear is climbing over a wooden fence.jpg", "mask_strategy": "0"} a book on fire with flames coming out of it{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a book on fire with flames coming out of it.jpg", "mask_strategy": "0"} a close-up of a pink rose with water droplets on it{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up of a pink rose with water droplets on it.jpg", "mask_strategy": 
"0"} a person is cooking meat on a grill with flames{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person is cooking meat on a grill with flames.jpg", "mask_strategy": "0"} a snowman wearing a santa hat and scarf{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a snowman wearing a santa hat and scarf.jpg", "mask_strategy": "0"} a person holding a sparkler in their hand{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person holding a sparkler in their hand.jpg", "mask_strategy": "0"} a teddy bear sitting on a moss covered ground{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a teddy bear sitting on a moss covered ground.jpg", "mask_strategy": "0"} a statue of a lion is sitting on a pedestal{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a statue of a lion is sitting on a pedestal.jpg", "mask_strategy": "0"} metal balls are suspended in the air{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/metal balls are suspended in the air.jpg", "mask_strategy": "0"} a close up of a bunch of green grapes{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close up of a bunch of green grapes.jpg", "mask_strategy": "0"} a close-up view of a green plant with unfurled fronds{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of a green plant with unfurled fronds.jpg", "mask_strategy": "0"} an orange mushroom sitting on top of a tree stump in the woods{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an orange mushroom sitting on top of a tree stump in the woods.jpg", "mask_strategy": "0"} a stack of pancakes covered in syrup and fruit{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a stack of pancakes covered in syrup and fruit.jpg", "mask_strategy": "0"} a plate of spaghetti with spinach and tomatoes{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a plate of spaghetti with spinach and tomatoes.jpg", 
"mask_strategy": "0"} a pink lotus flower in the middle of a pond{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pink lotus flower in the middle of a pond.jpg", "mask_strategy": "0"} a person holding a sparkler in front of a sunset{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person holding a sparkler in front of a sunset.jpg", "mask_strategy": "0"} a pink rose is blooming in a garden{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pink rose is blooming in a garden.jpg", "mask_strategy": "0"} a snow man holding a lantern in the snow{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a snow man holding a lantern in the snow.jpg", "mask_strategy": "0"} a stack of chocolate cookies with a bite taken out of it{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a stack of chocolate cookies with a bite taken out of it.jpg", "mask_strategy": "0"} a white plate topped with eggs, toast, tomatoes, and a sausage{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a white plate topped with eggs, toast, tomatoes, and a sausage.jpg", "mask_strategy": "0"} a yellow water lily is floating in a pond{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a yellow water lily is floating in a pond.jpg", "mask_strategy": "0"} an astronaut floating in space with the earth in the background{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an astronaut floating in space with the earth in the background.jpg", "mask_strategy": "0"} A little girl, lost in thought, is quietly sitting on the bus{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A little girl, lost in thought, is quietly sitting on the bus.jpg", "mask_strategy": "0"} a man holding a tray in front of a brick wall{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man holding a tray in front of a brick wall.jpg", "mask_strategy": "0"} an older man playing a saxophone on the street{"reference_path": 
"/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an older man playing a saxophone on the street.jpg", "mask_strategy": "0"} an older man jogging by the water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an older man jogging by the water.jpg", "mask_strategy": "0"} a person riding a skateboard on a concrete floor{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person riding a skateboard on a concrete floor.jpg", "mask_strategy": "0"} a woman with long black hair is posing for a picture{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman with long black hair is posing for a picture.jpg", "mask_strategy": "0"} a woman sitting on the ground in front of a guitar{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman sitting on the ground in front of a guitar.jpg", "mask_strategy": "0"} a little girl wearing a purple helmet riding a blue bike{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a little girl wearing a purple helmet riding a blue bike.jpg", "mask_strategy": "0"} a young boy is jumping in the mud{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a young boy is jumping in the mud.jpg", "mask_strategy": "0"} a man sitting in the driver's seat of a car wearing sunglasses{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man sitting in the driver's seat of a car wearing sunglasses.jpg", "mask_strategy": "0"} a little boy jumping in the air over a puddle of water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a little boy jumping in the air over a puddle of water.jpg", "mask_strategy": "0"} a woman with afro hair is smiling while wearing earphones{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman with afro hair is smiling while wearing earphones.jpg", "mask_strategy": "0"} a smiling woman with her hands clasped{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a smiling woman with her hands clasped.jpg", "mask_strategy": 
"0"} a young boy standing in a field with horses in the background{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a young boy standing in a field with horses in the background.jpg", "mask_strategy": "0"} a young man is covered in colored powder{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a young man is covered in colored powder.jpg", "mask_strategy": "0"} a woman with curly hair is drinking a beer{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman with curly hair is drinking a beer.jpg", "mask_strategy": "0"} an old man standing in the middle of a field holding a bunch of plants{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an old man standing in the middle of a field holding a bunch of plants.jpg", "mask_strategy": "0"} a man standing on a boat with a net{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man standing on a boat with a net.jpg", "mask_strategy": "0"} a woman in a hat is putting salt into a basket{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman in a hat is putting salt into a basket.jpg", "mask_strategy": "0"} a young girl smelling a pink flower{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a young girl smelling a pink flower.jpg", "mask_strategy": "0"} a young boy leaning on a wooden pole{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a young boy leaning on a wooden pole.jpg", "mask_strategy": "0"} a man in a hat sitting in front of a brick oven{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man in a hat sitting in front of a brick oven.jpg", "mask_strategy": "0"} a man in a mexican outfit holding an acoustic guitar{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man in a mexican outfit holding an acoustic guitar.jpg", "mask_strategy": "0"} a snowboarder is in the air doing a trick{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a snowboarder is in the air doing a 
trick.jpg", "mask_strategy": "0"} a man riding a horse with a spear in his hand{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man riding a horse with a spear in his hand.jpg", "mask_strategy": "0"} a woman carrying a bundle of plants over their head{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman carrying a bundle of plants over their head.jpg", "mask_strategy": "0"} a person jumping in the air over a fence{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person jumping in the air over a fence.jpg", "mask_strategy": "0"} a man on a surfboard riding a wave in the ocean{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man on a surfboard riding a wave in the ocean.jpg", "mask_strategy": "0"} a man sitting on steps playing an acoustic guitar{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man sitting on steps playing an acoustic guitar.jpg", "mask_strategy": "0"} a man swinging a tennis racquet at a tennis ball{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man swinging a tennis racquet at a tennis ball.jpg", "mask_strategy": "0"} a man riding a mountain bike on top of a rocky hill{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man riding a mountain bike on top of a rocky hill.jpg", "mask_strategy": "0"} a man riding a bike down a street{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man riding a bike down a street.jpg", "mask_strategy": "0"} a man is running on a dirt road{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man is running on a dirt road.jpg", "mask_strategy": "0"} A man in a black suit and a sombrero, shouting loudly{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A man in a black suit and a sombrero, shouting loudly.jpg", "mask_strategy": "0"} a man standing on top of a sand dune in the desert{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man standing on top of a sand dune 
in the desert.jpg", "mask_strategy": "0"} a person riding a motorcycle down a road{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person riding a motorcycle down a road.jpg", "mask_strategy": "0"} a man standing on top of a mountain with a backpack{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man standing on top of a mountain with a backpack.jpg", "mask_strategy": "0"} a man with a skull face paint smoking a cigar and holding a guitar{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man with a skull face paint smoking a cigar and holding a guitar.jpg", "mask_strategy": "0"} a man in sunglasses laying on a wooden bench{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man in sunglasses laying on a wooden bench.jpg", "mask_strategy": "0"} an older woman sitting in a room with a cigarette in her hand{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an older woman sitting in a room with a cigarette in her hand.jpg", "mask_strategy": "0"} a man sitting on the ground playing a musical instrument{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man sitting on the ground playing a musical instrument.jpg", "mask_strategy": "0"} a person riding a horse in a polo match{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person riding a horse in a polo match.jpg", "mask_strategy": "0"} a woman in a kimono holding an umbrella{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman in a kimono holding an umbrella.jpg", "mask_strategy": "0"} a person riding a dirt bike{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person riding a dirt bike.jpg", "mask_strategy": "0"} a person riding an atv on a dirt track{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person riding an atv on a dirt track.jpg", "mask_strategy": "0"} a person riding a wave on a surfboard{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person riding 
a wave on a surfboard.jpg", "mask_strategy": "0"} a woman in a wetsuit is swimming in the ocean{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman in a wetsuit is swimming in the ocean.jpg", "mask_strategy": "0"} a man snorkling in the ocean{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man snorkling in the ocean.jpg", "mask_strategy": "0"} a beautiful woman in a blue sari posing in front of a wall{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a beautiful woman in a blue sari posing in front of a wall.jpg", "mask_strategy": "0"} a woman wearing a shawl in front of a mountain{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman wearing a shawl in front of a mountain.jpg", "mask_strategy": "0"} a woman is making bread in an oven{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman is making bread in an oven.jpg", "mask_strategy": "0"} a woman smiles while holding a yellow flower{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman smiles while holding a yellow flower.jpg", "mask_strategy": "0"} A young boy is lifting a bundle of dry grass wrapped in waterproof fabric over his head{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A young boy is lifting a bundle of dry grass wrapped in waterproof fabric over his head.jpg", "mask_strategy": "0"} two people performing a sword fight in front of a forest{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two people performing a sword fight in front of a forest.jpg", "mask_strategy": "0"} a woman in a colorful shirt is cooking food{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman in a colorful shirt is cooking food.jpg", "mask_strategy": "0"} an older woman is drinking a bottle of water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an older woman is drinking a bottle of water.jpg", "mask_strategy": "0"} a smiling woman sitting at a table with food and 
drinks{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a smiling woman sitting at a table with food and drinks.jpg", "mask_strategy": "0"} a woman wearing a hijab reading a book on the beach{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman wearing a hijab reading a book on the beach.jpg", "mask_strategy": "0"} a woman wearing a headscarf is reaching for an olive tree{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman wearing a headscarf is reaching for an olive tree.jpg", "mask_strategy": "0"} a woman in a white dress jumping in the air in a field of pink flowers{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman in a white dress jumping in the air in a field of pink flowers.jpg", "mask_strategy": "0"} a woman wearing a conical hat sits on a boat{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman wearing a conical hat sits on a boat.jpg", "mask_strategy": "0"} an older woman sitting in front of an old building{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an older woman sitting in front of an old building.jpg", "mask_strategy": "0"} a woman is praying in front of a buddhist temple{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman is praying in front of a buddhist temple.jpg", "mask_strategy": "0"} a woman with green hair smiling for the camera{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman with green hair smiling for the camera.jpg", "mask_strategy": "0"} A group of people in a yellow raft is rowing through turbulent waters{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A group of people in a yellow raft is rowing through turbulent waters.jpg", "mask_strategy": "0"} a man carrying a woman on his back in a field{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man carrying a woman on his back in a field.jpg", "mask_strategy": "0"} an indian police officer talking to an old 
woman{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an indian police officer talking to an old woman.jpg", "mask_strategy": "0"} two people scuba diving in the ocean{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two people scuba diving in the ocean.jpg", "mask_strategy": "0"} A man and woman dressed as sugar skulls in a field of flowers, sharing a loving gaze with each other{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A man and woman dressed as sugar skulls in a field of flowers, sharing a loving gaze with each other.jpg", "mask_strategy": "0"} a group of people watching a cow race{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of people watching a cow race.jpg", "mask_strategy": "0"} a man and a child riding bumper cars in an amusement park{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man and a child riding bumper cars in an amusement park.jpg", "mask_strategy": "0"} a group of motorcyclists racing on a dirt track{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of motorcyclists racing on a dirt track.jpg", "mask_strategy": "0"} a man and a woman are boxing in a boxing ring{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man and a woman are boxing in a boxing ring.jpg", "mask_strategy": "0"} a man holding a baby in his arms{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man holding a baby in his arms.jpg", "mask_strategy": "0"} a man and a woman sitting on a bench playing instruments{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man and a woman sitting on a bench playing instruments.jpg", "mask_strategy": "0"} two men are standing next to each other with a bicycle{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two men are standing next to each other with a bicycle.jpg", "mask_strategy": "0"} a man and a boy sitting on a beach near the ocean{"reference_path": 
"/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man and a boy sitting on a beach near the ocean.jpg", "mask_strategy": "0"} two men in white clothing standing next to each other{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two men in white clothing standing next to each other.jpg", "mask_strategy": "0"} a group of men riding horses in a dusty arena{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of men riding horses in a dusty arena.jpg", "mask_strategy": "0"} a soccer player in a yellow and black shirt is chasing a soccer ball{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a soccer player in a yellow and black shirt is chasing a soccer ball.jpg", "mask_strategy": "0"} a group of women sitting on the steps of a building{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of women sitting on the steps of a building.jpg", "mask_strategy": "0"} a group of people gathered around a red checkered blanket{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of people gathered around a red checkered blanket.jpg", "mask_strategy": "0"} a group of people in orange jumpsuits running along a river{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of people in orange jumpsuits running along a river.jpg", "mask_strategy": "0"} a woman walking down a sidewalk with a bag{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman walking down a sidewalk with a bag.jpg", "mask_strategy": "0"} a busy street with cars and people on motorcycles{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a busy street with cars and people on motorcycles.jpg", "mask_strategy": "0"} a man in a mask is walking through a crowd of people{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man in a mask is walking through a crowd of people.jpg", "mask_strategy": "0"} a man and a woman walking under an umbrella next to a brick wall{"reference_path": 
"/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man and a woman walking under an umbrella next to a brick wall.jpg", "mask_strategy": "0"} a group of people riding bikes down a street{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of people riding bikes down a street.jpg", "mask_strategy": "0"} An old person is holding a cup on the street, and people around are curiously looking at him{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/An old person is holding a cup on the street, and people around are curiously looking at him.jpg", "mask_strategy": "0"} two young girls playing with leaves in the woods{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two young girls playing with leaves in the woods.jpg", "mask_strategy": "0"} One person is riding on the back of a horse led by another person{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/One person is riding on the back of a horse led by another person.jpg", "mask_strategy": "0"} an older woman and a young girl are knitting together{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an older woman and a young girl are knitting together.jpg", "mask_strategy": "0"} three geishas walking down the street in traditional clothing{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/three geishas walking down the street in traditional clothing.jpg", "mask_strategy": "0"} two men riding bikes down a road near a forest{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two men riding bikes down a road near a forest.jpg", "mask_strategy": "0"} two women carrying bowls on their heads{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two women carrying bowls on their heads.jpg", "mask_strategy": "0"} two women eating pizza at a restaurant{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two women eating pizza at a restaurant.jpg", "mask_strategy": "0"} two young women studying in a library{"reference_path": 
"/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two young women studying in a library.jpg", "mask_strategy": "0"} pink water lilies in a pond with leaves{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/pink water lilies in a pond with leaves.jpg", "mask_strategy": "0"} a group of succulents in a rock garden{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of succulents in a rock garden.jpg", "mask_strategy": "0"} a close up view of a bunch of snowdrop flowers{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close up view of a bunch of snowdrop flowers.jpg", "mask_strategy": "0"} a close up of leaves with water droplets on them{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close up of leaves with water droplets on them.jpg", "mask_strategy": "0"} a close-up of a sea anemone in the water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up of a sea anemone in the water.jpg", "mask_strategy": "0"} a plant with water droplets on it{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a plant with water droplets on it.jpg", "mask_strategy": "0"} a group of cactus plants in the desert{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of cactus plants in the desert.jpg", "mask_strategy": "0"} a close-up view of a plant with spiky leaves{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of a plant with spiky leaves.jpg", "mask_strategy": "0"} A budding and blossoming flower bud seedling{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A budding and blossoming flower bud seedling.jpg", "mask_strategy": "0"} a field of orange flowers near the ocean'{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a field of orange flowers near the ocean'.jpg", "mask_strategy": "0"} a close-up view of a bunch of pink flowers{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of a bunch of pink 
flowers.jpg", "mask_strategy": "0"} pink water lilies in a pond{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/pink water lilies in a pond.jpg", "mask_strategy": "0"} reeds blowing in the wind against a cloudy sky{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/reeds blowing in the wind against a cloudy sky.jpg", "mask_strategy": "0"} two tall cacti in the middle of the desert{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two tall cacti in the middle of the desert.jpg", "mask_strategy": "0"} a sea anemone on a coral reef{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sea anemone on a coral reef.jpg", "mask_strategy": "0"} a dandelion blowing in the wind{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dandelion blowing in the wind.jpg", "mask_strategy": "0"} A boiling pot cooking vegetables{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A boiling pot cooking vegetables.jpg", "mask_strategy": "0"} a woman stirring food in a pan on the stove{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman stirring food in a pan on the stove.jpg", "mask_strategy": "0"} two eggs are fried in a frying pan on the stove{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two eggs are fried in a frying pan on the stove.jpg", "mask_strategy": "0"} fried onion rings in a basket{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/fried onion rings in a basket.jpg", "mask_strategy": "0"} a pot is sitting on top of a campfire{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pot is sitting on top of a campfire.jpg", "mask_strategy": "0"} a chef is preparing a dish with mushrooms on a wooden board{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a chef is preparing a dish with mushrooms on a wooden board.jpg", "mask_strategy": "0"} a hand holding a slice of pizza{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a hand 
holding a slice of pizza.jpg", "mask_strategy": "0"} A person is using tongs to pick up meat from a plate{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A person is using tongs to pick up meat from a plate.jpg", "mask_strategy": "0"} The meat is picked up from the grill with tongs{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/The meat is picked up from the grill with tongs.jpg", "mask_strategy": "0"} A person is whisking eggs, and the egg whites and yolks are gently streaming out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A person is whisking eggs, and the egg whites and yolks are gently streaming out.jpg", "mask_strategy": "0"} a person is putting sauce on a burger{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person is putting sauce on a burger.jpg", "mask_strategy": "0"} A person is making dumplings{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A person is making dumplings.jpg", "mask_strategy": "0"} a pan filled with fried food{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pan filled with fried food.jpg", "mask_strategy": "0"} Chopsticks are slowly picking up the buns from the plastic container{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/Chopsticks are slowly picking up the buns from the plastic container.jpg", "mask_strategy": "0"} a basket of french fries in a fryer{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a basket of french fries in a fryer.jpg", "mask_strategy": "0"} a table with lobsters and drinks on it{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table with lobsters and drinks on it.jpg", "mask_strategy": "0"} a person pouring coffee into a pot on a stove{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person pouring coffee into a pot on a stove.jpg", "mask_strategy": "0"} a kettle is sitting on top of a campfire{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a 
kettle is sitting on top of a campfire.jpg", "mask_strategy": "0"} Chopsticks are picking up noodles from the bowl{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/Chopsticks are picking up noodles from the bowl.jpg", "mask_strategy": "0"} a person is cooking eggs on an outdoor grill{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person is cooking eggs on an outdoor grill.jpg", "mask_strategy": "0"} a person is cooking food in a wok on a stove{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person is cooking food in a wok on a stove.jpg", "mask_strategy": "0"} a person is holding up a burger with his hands{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person is holding up a burger with his hands.jpg", "mask_strategy": "0"} A person is pouring water into a teacup{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A person is pouring water into a teacup.jpg", "mask_strategy": "0"} a person pouring seasoning into a pot of food{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person pouring seasoning into a pot of food.jpg", "mask_strategy": "0"} a person holding a taco in their hand{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person holding a taco in their hand.jpg", "mask_strategy": "0"} a person slicing salmon on a cutting board{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person slicing salmon on a cutting board.jpg", "mask_strategy": "0"} a bunch of food is cooking on a grill over an open fire{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bunch of food is cooking on a grill over an open fire.jpg", "mask_strategy": "0"} a close up of a piece of sushi on chopsticks{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close up of a piece of sushi on chopsticks.jpg", "mask_strategy": "0"} a group of pots on a stove with flames in the background{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group 
of pots on a stove with flames in the background.jpg", "mask_strategy": "0"} a person cooking vegetables in a pan on a stove{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person cooking vegetables in a pan on a stove.jpg", "mask_strategy": "0"} a large pot of soup filled with vegetables and meat{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large pot of soup filled with vegetables and meat.jpg", "mask_strategy": "0"} a person holding chopsticks over a bowl of food{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person holding chopsticks over a bowl of food.jpg", "mask_strategy": "0"} ================================================ FILE: Open-Sora/assets/texts/VBench/prompts_per_category/animal.txt ================================================ a black dog wearing halloween costume spider making a web bat eating fruits while hanging a snake crawling on a wooden flooring a close up video of a dragonfly macro shot of ladybug on green leaf plant chameleon eating ant a bee feeding on nectars bird nests on a tree captured with moving camera a squirrel eating nuts close up video of snail top view of a hermit crab crawling on a wooden surface cat licking another cat red dragonfly perched on green leaf close up view of a brown caterpillar crawling on green leaf ants eating dead spider an eagle on a tree branch a frog eating an ant white rabbit near the fence a gorilla eating a carrot close up of wolf a meerkat looking around a hyena in a zoo lemur eating grass leaves an owl being trained by a man a lizard on a bamboo brown chicken hunting for its food video of parrots perched on bird stand underwater footage of an octopus in a coral reef a cute pomeranian dog playing with a soccer ball white fox on rock close up footage of a horse figurine giraffe feeding on a tree in a savannah curious cat sitting and looking around hummingbird hawk moth flying near pink flowers close up of a scorpion on a rock close up on fish in net 
koala eating leaves from a branch a pod of dolphins swirling in the sea catching forage fish low angle view of a hawk perched on a tree branch a lion standing on wild grass deer grazing in the field elephant herd in a savanna close up on lobster under water hedgehog crossing road in forest a sheep eating yellow flowers from behind a wire fence twin sisters and a turtle a pig wallowing in mud flock of goose eating on the lake water cow in a field irritated with flies a close up shot of a fly cheetah lying on the grass close up of a lemur close up shot of a kangaroo itching in the sand a tortoise covered with algae turkey in cage a great blue heron bird in the lakeside crab with shell in aquarium a seagull walking on shore an american crocodile a tiger walking inside a cage alligator in the nature a raccoon climbing a tree wild rabbit in a green meadow group of ring tailed lemurs a clouded leopard on a tree branch duck grooming its feathers an african penguin walking on a beach a video of a peacock close up shot of a wild bear baby rhino plays with mom porcupine climbs tree branches close up of a natterjack toad on a rock a sleeping orangutan mother whale swimming with babies a bear wearing red jersey pink jellyfish swimming underwater in a blue sea beautiful clown fish swimming animation of disposable objects shaped as a whale paper cut out of a pair of hands a whale and a heart vertical video of camel roaming in the field during daytime a still video of mosquito biting human a curious sloth hanging from a tree branch a plastic flamingo bird stumbles from the wind a wolf in its natural habitat a monkey sitting in the stone and scratching his head bat hanging upside down a red panda eating leaves snake on ground a harbour seal swimming near the shore shark swimming in the sea otter on branch while eating goat standing over a rock a troop of monkey on top of a mountain a zebra eating grass on the field a colorful butterfly perching on a bud a snail crawling on a leaf 
zookeeper showering a baby elephant a beetle emerging from the sand a nine banded armadillo searching for food ================================================ FILE: Open-Sora/assets/texts/VBench/prompts_per_category/architecture.txt ================================================ an apartment building with balcony asian garden and medieval castle illuminated tower in berlin a wooden house overseeing the lake a crowd of people in a plaza in front of a government building a church interior jewish friends posing with hanukkah menorah in a cabin house a destroyed building after a missile attack in ukraine abandoned building in the woods drone video of an abandoned school building in pripyat ukraine elegant university building architecture and designs of buildings in central london a pancake tower with chocolate syrup and strawberries on top an ancient white building friends hanging out at a coffee house house front door with christmas decorations city night dark building a bird house hanging on a tree branch sacred sculpture in a temple high angle shot of a clock tower modern wooden house interior the interior of an abandoned building opera house overlooking sea a concrete structure near the green trees dome like building in scotland low angle shot of a building tower on hill a miniature house eiffel tower from the seine river low angle footage of an apartment building island with pier and antique building asian historic architecture drone footage of a beautiful mansion mosque in the middle east building a tent and hammock in the forest camping site top view of a high rise building house covered in snow skyscraper at night house in village a casino with people outside the building silhouette of a building a woman climbing a tree house drone view of house near lake during golden hour an under construction concrete house a watch tower by the sea exterior view of arabic style building video of a hotel building red paper lantern decorations hanging outside a building 
house on seashore aerial footage of the palace of culture and science building in warsaw poland aerial video of stuttgart tv tower in germany aerial view of the highway and building in a city drone shot of a skyscraper san francisco california usa waterfall and house view of the sky through a building drone footage of a house on top of the mountain abandoned house in the nature clouds hovering over a mansion light house on the ocean buddhist temple at sunrise people walking by a graveyard near a mosque at sunset view of lifeguard tower on the beach scenic view of a house in the mountains the landscape in front of a government building aerial footage of a building and its surrounding landscape in winter time lapse of a cloudy sky behind a transmission tower blue ocean near the brown castle fog over temple house in countryside top view building under construction turkish flag waving on old tower the georgian building close up shot of a steel structure the atrium and interior design of a multi floor building city view reflected on a glass building aerial view of a luxurious house with pool an unpaved road leading to the house drone footage of a lookout tower in mountain landscape wind turbines on hill behind building time lapse footage of the sun light in front of a small house porch a building built with lots of stairways overcast over house on seashore the view of the sydney opera house from the other side of the harbor candle on a jar and a house figurine on a surface video of a farm and house a dilapidated building made of bricks a view of a unique building from a moving vehicle aerial footage of a tall building in cambodia push in shot of a huge house a beach house built over a seawall protected from the sea waves exotic house surrounded by trees drone video of a house surrounded by tropical vegetation drone footage of a building beside a pond observation tower on hill in forest a tree house in the woods a video of vessel structure during daytime fire in front of 
illuminated building at night a footage of a wooden house on a wheat field tilt shot of a solar panel below a light tower water tower on the desert ================================================ FILE: Open-Sora/assets/texts/VBench/prompts_per_category/food.txt ================================================ freshly baked finger looking cookies video of fake blood in wine glass halloween food art a person slicing a vegetable a serving of pumpkin dish in a plate close up view of green leafy vegetable a birthday cake in the plate video of a slice papaya fruit a muffin with a burning candle and a love sign by a ceramic mug a jack o lantern designed cookie baked bread with chocolate a broccoli soup on wooden table a freshly brewed coffee on a pink mug grabbing sourdough neapolitan style pizza slices person cooking mushrooms in frying pan rice grains placed on a reusable cloth bag slices of kiwi fruit grilling a steak on a pan grill close up of bread popping out of a toaster man eating noodle preparing a cocktail drink close up pasta with bacon on plate milk and cinnamon rolls boy getting a dumpling using chopsticks a mother preparing food with her kids man using his phone while eating fresh salmon salad on a plate cutting cucumbers into long thin slices as ingredient for sushi roll a steaming cup of tea by the window a glass filled with beer a kid eating popcorn while watching tv close up shot of fried fish on the plate a man eating a donut person making a vegetarian dish spreading cheese on bagel close up view of a man drinking red wine a couple having breakfast in a restaurant a student eating her sandwich girl peeling a banana red rice in a small bowl pancake with blueberry on the top green apple fruit on white wooden table a man eating a taco by the bar making of a burrito squeezing lemon into salad a chef cutting sushi rolls video of a delicious dessert deep frying a crab on a wok in high fire close up video of a orange juice video of a cooked chicken breast 
woman holding a pineapple a woman eating a bar of chocolate decorating christmas cookie squeezing a slice of fruit tuna sashimi on a plate a strawberry fruit mixed in an alcoholic drink preparing hot dogs in a grill a woman cutting a tomato an orange fruit cut in half a coconut fruit with drinking straw woman holding a dragon fruit a woman pouring hot beverage on a cup waffles with whipped cream and fruit focus shot of an insect at the bottom of a fruit preparing a healthy broccoli dish man eating snack at picnic close up video of a grilled shrimp skewer a woman mixing a smoothie drinks close up video of woman having a bite of jelly businessman drinking whiskey at the bar counter of a hotel lounge cutting an onion with a knife over a wooden chopping board fresh lemonade in bottles grilling a meat on a charcoal grill people enjoying asian cuisine close up footage of a hot dish on a clay pot pork ribs dish waffle with strawberry and syrup for breakfast tofu dish with rose garnish uncooked pork meat egg yolk being dumped over gourmet dish tasty brunch dish close up little boy pretending to eat the watermelon slicing roasted beef close up of a chef adding teriyaki sauce to a dish flat lay mexican dish a person placing an octopus dish on a marble surface close up of tea leaves brewing in a glass kettle adding fresh herbs to soup dish a scoop of roasted coffee beans fresh dim sum set up on a bamboo steam tray for cooking a girl putting ketchup on food at the kitchen cooking on electric stove a woman with a slice of a pie grapes and wine on a wooden board man taking picture of his food hamburger and fries on restaurant table close up video of japanese food a cracker sandwich with cheese filling for snack barista preparing matcha tea close up of onion rings being deep fried ================================================ FILE: Open-Sora/assets/texts/VBench/prompts_per_category/human.txt ================================================ people carving a pumpkin people 
sitting on a sofa a man with a muertos face painting man walking in the dark men in front of their computer editing photos men loading christmas tree on tow truck woman washing the dishes woman adding honey to the cinnamon rolls two women kissing and smiling three women looking at watercolor paintings a family wearing paper bag masks a family posing for the camera a boy covering a rose flower with a dome glass boy sitting on grass petting a dog a girl in her tennis sportswear a girl coloring the cardboard silhouette of the couple during sunset couple dancing with body paint a child playing with water a woman with her child sitting on a couch in the living room a group of friend place doing hand gestures of agreement friends having a group selfie friends talking while on the basketball court group of people protesting a group of campers with a cute dog a group of photographers taking pictures at the north western gardens in llandudno north wales a group of students laughing and talking a group of martial artist warming up a person playing golf a person walking on a wet wooden bridge person doing a leg exercise ice hockey athlete on rink a young athlete training in swimming chess player dusting a chessboard baseball player holding his bat a bearded man putting a vinyl record on a vinyl player an orchestra finishes a performance people applauding the performance of the kids band performance at the recording studio father and his children playing jenga game people playing a board game man playing a video game a man video recording the movie in theater man and a woman eating while watching a movie movie crew talking together a director explaining the movie scene man and woman listening to music on car man playing music couple dancing slow dance with sun glare a ballerina practicing in the dance studio father and son holding hands father and daughter talking together a mother and her kids engaged in a video call mother and daughter reading a book together a mother 
teaching her daughter playing a violin kid in a halloween costume a happy kid playing the ukulele a chef slicing a cucumber chef wearing his gloves properly brother and sister using hammock girl applying sunblock to her brother a girl pushing the chair while her sister is on the chair colleagues talking in office building fighter practice kicking a woman fighter in her cosplay costume an engineer holding blueprints while talking with her colleague a young woman looking at vr controllers with her friend workmates teasing a colleague in the work a male police officer talking on the radio teacher holding a marker while talking teacher writing on her notebook a young student attending her online classes a student showing his classmates his wand a male vendor selling fruits a shirtless male climber a sound engineer listening to music female talking to a psychiatrist in a therapy session young female activist posing with flag a man in a hoodie and woman with a red bandana talking to each other and smiling a medium close up of women wearing kimonos a male interviewer listening to a person talking a social worker having a conversation with the foster parents a farm worker harvesting onions worker packing street food worker and client at barber shop elderly man lifting kettlebell mom assisting son in riding a bicycle dad watching her daughter eat young guy with vr headset pregnant woman exercising with trainer a fortune teller talking to a client wizard doing a ritual on a woman a footage of an actor on a movie scene a man holding a best actor trophy a singer of a music band a young singer performing on stage young dancer practicing at home seller showing room to a couple cab driver talking to passenger a policeman talking to the car driver ================================================ FILE: Open-Sora/assets/texts/VBench/prompts_per_category/lifestyle.txt ================================================ kids celebrating halloween at home little boy helping mother in 
kitchen video of a indoor green plant a girl arranges a christmas garland hanging by the kitchen cabinet candle burning in dark room couple having fun and goofing around the bedroom girls jumping up and down in the bedroom woman and man in pajamas working from home a muslim family sitting and talking in the living room family enjoying snack time while sitting in the living room woman holding an animal puppet and a little girl playing together at the living room kids playing in the indoor tent young people celebrating new year at the office a woman writing on the sticky note in the office a woman exercising at home over a yoga mat girls preparing easter decorations at home dog on floor in room turning on a fluorescent light inside a room colleagues talking to each other near the office windows a woman recording herself while exercising at home music room different kind of tools kept in a utility room sofa beds and other furniture a girl finding her brother reading a book in the bedroom an elegant ceramic plant pot and hanging plant on indoor furniture inside a bedroom interior design of the bar section living room with party decoration firewood burning in dark room a young woman playing the ukulele at home woman painting at home a woman in a locker room video of a bathroom interior the interior design of a jewish synagogue a woman in protective suit disinfecting the kitchen modern minimalist home interior modern interior design of a coffee shop person arranging minimalist furniture aerial shot of interior of the warehouse a room of a manufacturing facility interior of catholic interior design of a restaurant a female model in a changing room looking herself in mirror men walking in the office hallway people sitting in a conference room the interior design of a shopping mall chandeliers in room lucerne railway station interior a female fencer posing in a foggy room a toolbox and a paint roller beside a huge package in a room bedroom in hotel a woman lying in the 
operating room a chef holding and checking kitchen utensils a couple singing in the shower room together a woman cleaning mess in the living room an empty meeting room with natural light person dancing in a dark room close up on blood in hospital room a couple resting on their home floor a young female staff at courier office a man entering the gym locker room a bored man sitting by the tv at home woman dancing in indoor garden rubble in the interior of an abandoned house indoor farm in a greenhouse man doing handstand in indoor garden an abandoned indoor swimming pool home decorations on top of a cabinet graffiti art on the interior walls of an abandoned mansion indoor wall climbing activity sunlight inside a room teenage girl roller skating at indoor rink home deco with lighted baby in the shower room men enjoying office christmas party a bedroom with a brick wall actors prepping in the dressing room kids playing at an indoor playground a person sanitizing an office space using smoke machine mother and daughter choosing clothes at home a woman sitting by the indoor fire pit man standing on the corner of the room while looking around person assembling furniture a family stacking cardboard boxes in a room family having fun in the dining room person disinfecting a room a woman washing strawberries in the kitchen sink modern office waiting room close up view of a person slicing with a kitchen knife boiling coffee on a stove in the kitchen modern equipment used in a home studio interior of a recording studio people working in a call center office band performing at a home concert a group of people watching a concert in a room people packing their furniture young employees in office holding a certificate a criminal inside a dark room handcuffed in a table couple browsing and looking for furniture in the store workspace at home ================================================ FILE: Open-Sora/assets/texts/VBench/prompts_per_category/plant.txt 
================================================ video of a indoor green plant close up view of a plant close up shot of a burning plant plucking leaves from plant a plant on gold pot with glass lid a branch of a tree and a plant a leafless tree close up shot of fern leaf close up video of strawberry plant plant with blooming flowers close up video of flower petals watering yellow plant beautiful flower decoration cannabis flower in a jar a footage of the tree leaves a red leaf plant close up view of a white christmas tree snow pouring on a tree close up shot of white flowers on the tree leaves in the trees daytime a dead tree lying on a grass field tree branches in a flowing river purple flowers with leaves a coconut tree by the house close up on flower in winter bamboo leaves backlit by the sun close up video of a wet flower a man putting a flower in a box dropping flower petals on a wooden bowl a close up shot of gypsophila flower variety of succulent plants on a garden variety of trees and plants in a botanical garden forest of deciduous trees a stack of dried leaves burning in a forest tall forest trees on a misty morning close up view of dewdrops on a leaf close up view of white petaled flower removing a pineapple leaf a dragonfly perched on a leaf butterfly pollinating flower person visiting and checking a corn plant woman picking beans from a plant woman plucking mint leaves single tree in the middle of farmland a plant on a soil drone footage of a tree on farm field a tractor harvesting lavender flower people putting christmas ornaments on a christmas tree jack o lantern hanging on a tree tree with halloween decoration flower field near the waterfall truck carrying the tree logs raindrops falling on leaves shot of a palm tree swaying with the wind squirrels on a tree branch person holding a flower a fallen tree trunk tree with golden leaves cherry tree wind blows through leaves of the tree in autumn a leaf on a glass the long trunks of tall trees in the 
forest trees in the forest during sunny day close up video of tree bark reflection of tree branches trunks of many trees in the forest tree leaves providing shades from the sun leaves swaying in the wind low angle shot of baobab tree bare trees in forest a plant surrounded by fallen leaves a couple preparing food and pruning a plant a man cutting a tree bark oranges on a tree branch plant connected on the stones video of a sawmill machine cutting tree log women drying flower petals macro view of an agave plant a video of a person tying a plant on a string green moss in forest nature coconut tree near sea under blue sky the canopy of a coconut tree a man leaning on a tree at the beach a full grown plant on a pot candle wax dripping on flower petals close up of leaves in autumn a woman opening a book with a flower inside a man holding leaves looking at the camera a shadow of a swaying plant a tree and concrete structure under a blue and cloudy sky trimming excess leaves on a potted plant the changing color of the tree leaves during autumn season a gooseberry tree swayed by the wind forest trees and a medieval castle at sunset woman cut down tree an old oak tree in a park across the street from a hotel wild flowers growing in a forest ground a mossy fountain and green plants in a botanical garden mansion with beautiful garden ants on a dragon fruit flower ================================================ FILE: Open-Sora/assets/texts/VBench/prompts_per_category/scenery.txt ================================================ scenery of desert landscape landscape agriculture farm tractor burning slash piles in the forest graveyard at sunset view of a jack o lantern with pumpkins in a smoky garden sun view through a spider web view of the sea from an abandoned building close up view of a full moon close up view of lighted candles close up view of swaying white flowers and leaves scenery of a relaxing beach selective focus video of grass during sunny day aerial view of brown 
dry landscape fireworks display in the sky at night a bonfire near river mountain view waterfalls in between mountain a picturesque view of nature exotic view of a riverfront city tall trees in the forest under the clear sky snow on branches in forest stream in the nature an airplane flying above the sea of clouds scenic video of sunset view of houses with bush fence under a blue and cloudy sky scenic view from wooden pathway scenic view of a tropical beach drone footage of waves crashing on beach shore a scenic view of the golden hour at norway time lapse video of foggy mountain forest brown mountain during fall season video of ocean during daytime boat sailing in the ocean top view of yachts beautiful scenery of flowing waterfalls and river wild ducks paddling on the lake surface a relaxing scenery of beach view under cloudy sky natural rock formations on beach under cloudy sky a palm tree against blue sky video of sailboat on a lake during sunset aerial view of snow piles time lapse of a sunset sky in the countryside aerial footage of a statue time lapse video of a farm during sunset clouds formation in the sky at sunset aerial shot of a village drone shot of a beautiful sunrise at the mountains time lapse video of foggy morning during sunrise sun shining between tree leaves at sunrise video of lake during dawn vehicles traveling on roadway under cloudy sky view of golden domed church a monument under the blue sky firecrackers in the sky view of fruit signage in the farm a dark clouds over shadowing the full moon view of the amazon river a big river swamp in a dense forest a blooming cherry blossom tree under a blue sky with white clouds a river waterfall cascading down the plunge basin flooded landscape with palm trees a blurry waterfall background waterfall in the mountains aerial footage of a city at night pond by small waterfall in forest aerial view of farmlands at the bay of lake rice terraces in the countryside a highway built across an agricultural area 
in the countryside gloomy morning in the countryside drone shot of an abandoned coliseum on a snowy mountain top boat sailing in the middle of ocean drone shot of the grass field natural landscape of mountain and sea with islets developed into a community aerial view of zaporizhia in ukraine aerial footage of a herd an aerial footage of a red sky grass and plants growing in the remains of an abandoned house view from hill on city aerial view on orthodox church aerial view of bay in croatia a footage of a frozen river overlooking view of a city at daylight view outside the cemetery clear sky with moon over meadow clouds over railway aerial footage of moving vehicles on the road at night aerial view of town and park top view of skyscrapers top view of the empire state building in manhattan top view of the central park in new york city sheep running in a grass field clear sky over factory smoke and fire in birds eye view view of a pathway with snow melting on its side ferry under bridge on river near city in malaysia mountain slopes covered in green vegetation panoramic view of a town surrounded by snow covered mountains aerial view of a palace top view of vehicles driving on the intersection a graveyard by a church in a mountain landscape ================================================ FILE: Open-Sora/assets/texts/VBench/prompts_per_category/vehicles.txt ================================================ a modern railway station in malaysia use for public transportation drone footage of amsterdam metro station train arriving at a station red vehicle driving on field close up view of flashing emergency vehicle lighting vehicle with fertilizer on field a highway built across an agricultural area in the countryside drone footage of motorcycles driving on country road between agricultural fields a road in the woods under fog footage of a car driving through a wheat field vehicle stops for an ambulance passing through city traffic emergency vehicle parked outside the 
casino zombies attacking a woman and a boy inside a car woman seating inside the car while chewing video of passengers riding a double decker bus during night traffic in london street at night elderly couple checking engine of automobile a green vintage automobile with an open hood parked in a parking area close up of a prototype automobile with exposed engine on the back seat of the car aerial view of road in forest train departing from station aerial view of a train passing by a bridge video of a train tracks video footage of a subway video of blinking traffic lights couple walking out on the subway time lapse of a subway tunnel monitor board inside the subway metro train at night zoom in video of a tram passing by city young man using laptop in the tram man reading a book at bus stop close up shot of a moving taxi night travel in london street on a public bus red bus in a rainy city flow of traffic in the city close up shot of a yellow taxi turning left two women calling for a taxi drone view of an illuminated bridge across a river policeman in police car talking on radio airplane taking off at night view through window in airplane an airplane in the sky helicopter landing on the street a pilot getting out of a helicopter a helicopter flying under blue sky boat sailing in the middle of the ocean girl playing with a toy boat silhouette of a boat on sea during golden hour a boat travelling around the lake road on mountain ridge ship sailing on danube river slow motion video of a ship water trail in the sea drone footage of a wreck ship on shore a white yacht traveling on a river and passing under the bridge female teenagers drinking champagne in the yacht video of yacht sailing in the ocean red combine harvester on road on field a woman sitting on a bicycle while using a mobile phone a woman sitting on a motorcycle looking around three teenagers fixing a bicycle a woman in a halloween costume posing on a motorcycle a parked motorcycle on a foggy roadside cable car 
near sea shore a truck travelling in the road footage of the road without any traffic a road sign love padlocks on a bridge camera moving at highway construction site vehicles driving on highway a motorbike on highway at timelapse mode point of view of a car driving through a tunnel time lapse of heavy traffic on an avenue ferry boat on city canal black vintage car in museum a zigzag road across a forest people crossing the road video of a kayak boat in a river a person paddling a wooden boat in a lake a car charging in the parking area cars parked on the road footage of the street with people and vehicle passing by in the rain traffic on busy city street a woman getting out of the car to walk with their dog yacht sailing through the ocean people in queue to military ship man wearing motorcycle helmet looking at the camera empty seats in the bus empty boat on the water cargo train traveling on the mountainside cruise ship in harbor counting down at traffic lights pressing the car ignition fire truck driving on the road a footage of a broken bicycle drone footage of an ambulance on the road slow motion footage of a racing car ship sailing on sea against sunset big cargo ship passing on the shore back view of man and woman walking on unpaved road ================================================ FILE: Open-Sora/assets/texts/VBench/prompts_per_dimension/appearance_style.txt ================================================ A beautiful coastal beach in spring, waves lapping on sand, Van Gogh style A beautiful coastal beach in spring, waves lapping on sand, oil painting A beautiful coastal beach in spring, waves lapping on sand by Hokusai, in the style of Ukiyo A beautiful coastal beach in spring, waves lapping on sand, black and white A beautiful coastal beach in spring, waves lapping on sand, pixel art A beautiful coastal beach in spring, waves lapping on sand, in cyberpunk style A beautiful coastal beach in spring, waves lapping on sand, animated style A beautiful 
coastal beach in spring, waves lapping on sand, watercolor painting A beautiful coastal beach in spring, waves lapping on sand, surrealism style The bund Shanghai, Van Gogh style The bund Shanghai, oil painting The bund Shanghai by Hokusai, in the style of Ukiyo The bund Shanghai, black and white The bund Shanghai, pixel art The bund Shanghai, in cyberpunk style The bund Shanghai, animated style The bund Shanghai, watercolor painting The bund Shanghai, surrealism style a shark is swimming in the ocean, Van Gogh style a shark is swimming in the ocean, oil painting a shark is swimming in the ocean by Hokusai, in the style of Ukiyo a shark is swimming in the ocean, black and white a shark is swimming in the ocean, pixel art a shark is swimming in the ocean, in cyberpunk style a shark is swimming in the ocean, animated style a shark is swimming in the ocean, watercolor painting a shark is swimming in the ocean, surrealism style A panda drinking coffee in a cafe in Paris, Van Gogh style A panda drinking coffee in a cafe in Paris, oil painting A panda drinking coffee in a cafe in Paris by Hokusai, in the style of Ukiyo A panda drinking coffee in a cafe in Paris, black and white A panda drinking coffee in a cafe in Paris, pixel art A panda drinking coffee in a cafe in Paris, in cyberpunk style A panda drinking coffee in a cafe in Paris, animated style A panda drinking coffee in a cafe in Paris, watercolor painting A panda drinking coffee in a cafe in Paris, surrealism style A cute happy Corgi playing in park, sunset, Van Gogh style A cute happy Corgi playing in park, sunset, oil painting A cute happy Corgi playing in park, sunset by Hokusai, in the style of Ukiyo A cute happy Corgi playing in park, sunset, black and white A cute happy Corgi playing in park, sunset, pixel art A cute happy Corgi playing in park, sunset, in cyberpunk style A cute happy Corgi playing in park, sunset, animated style A cute happy Corgi playing in park, sunset, watercolor painting A cute happy 
Corgi playing in park, sunset, surrealism style Gwen Stacy reading a book, Van Gogh style Gwen Stacy reading a book, oil painting Gwen Stacy reading a book by Hokusai, in the style of Ukiyo Gwen Stacy reading a book, black and white Gwen Stacy reading a book, pixel art Gwen Stacy reading a book, in cyberpunk style Gwen Stacy reading a book, animated style Gwen Stacy reading a book, watercolor painting Gwen Stacy reading a book, surrealism style A boat sailing leisurely along the Seine River with the Eiffel Tower in background, Van Gogh style A boat sailing leisurely along the Seine River with the Eiffel Tower in background, oil painting A boat sailing leisurely along the Seine River with the Eiffel Tower in background by Hokusai, in the style of Ukiyo A boat sailing leisurely along the Seine River with the Eiffel Tower in background, black and white A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pixel art A boat sailing leisurely along the Seine River with the Eiffel Tower in background, in cyberpunk style A boat sailing leisurely along the Seine River with the Eiffel Tower in background, animated style A boat sailing leisurely along the Seine River with the Eiffel Tower in background, watercolor painting A boat sailing leisurely along the Seine River with the Eiffel Tower in background, surrealism style A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, Van Gogh style A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, oil painting A couple in formal evening wear going home get caught in a heavy downpour with umbrellas by Hokusai, in the style of Ukiyo A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, black and white A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pixel art A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, in 
cyberpunk style A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, animated style A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, watercolor painting A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, surrealism style An astronaut flying in space, Van Gogh style An astronaut flying in space, oil painting An astronaut flying in space by Hokusai, in the style of Ukiyo An astronaut flying in space, black and white An astronaut flying in space, pixel art An astronaut flying in space, in cyberpunk style An astronaut flying in space, animated style An astronaut flying in space, watercolor painting An astronaut flying in space, surrealism style Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, Van Gogh style Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, oil painting Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks by Hokusai, in the style of Ukiyo Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, black and white Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pixel art Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, in cyberpunk style Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. 
the canyons twist and bend through the high elevated mountain peaks, animated style Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, watercolor painting Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, surrealism style ================================================ FILE: Open-Sora/assets/texts/VBench/prompts_per_dimension/color.txt ================================================ a red bicycle a green bicycle a blue bicycle a yellow bicycle an orange bicycle a purple bicycle a pink bicycle a black bicycle a white bicycle a red car a green car a blue car a yellow car an orange car a purple car a pink car a black car a white car a red bird a green bird a blue bird a yellow bird an orange bird a purple bird a pink bird a black bird a white bird a black cat a white cat an orange cat a yellow cat a red umbrella a green umbrella a blue umbrella a yellow umbrella an orange umbrella a purple umbrella a pink umbrella a black umbrella a white umbrella a red suitcase a green suitcase a blue suitcase a yellow suitcase an orange suitcase a purple suitcase a pink suitcase a black suitcase a white suitcase a red bowl a green bowl a blue bowl a yellow bowl an orange bowl a purple bowl a pink bowl a black bowl a white bowl a red chair a green chair a blue chair a yellow chair an orange chair a purple chair a pink chair a black chair a white chair a red clock a green clock a blue clock a yellow clock an orange clock a purple clock a pink clock a black clock a white clock a red vase a green vase a blue vase a yellow vase an orange vase a purple vase a pink vase a black vase a white vase ================================================ FILE: Open-Sora/assets/texts/VBench/prompts_per_dimension/human_action.txt 
================================================ A person is riding a bike A person is marching A person is roller skating A person is tasting beer A person is clapping A person is drawing A person is petting animal (not cat) A person is eating watermelon A person is playing harp A person is wrestling A person is riding scooter A person is sweeping floor A person is skateboarding A person is dunking basketball A person is playing flute A person is stretching leg A person is tying tie A person is skydiving A person is shooting goal (soccer) A person is playing piano A person is finger snapping A person is canoeing or kayaking A person is laughing A person is digging A person is clay pottery making A person is shooting basketball A person is bending back A person is shaking hands A person is bandaging A person is push up A person is catching or throwing frisbee A person is playing trumpet A person is flying kite A person is filling eyebrows A person is shuffling cards A person is folding clothes A person is smoking A person is tai chi A person is squat A person is playing controller A person is throwing axe A person is giving or receiving award A person is air drumming A person is taking a shower A person is planting trees A person is sharpening knives A person is robot dancing A person is rock climbing A person is hula hooping A person is writing A person is bungee jumping A person is pushing cart A person is cleaning windows A person is cutting watermelon A person is cheerleading A person is washing hands A person is ironing A person is cutting nails A person is hugging A person is trimming or shaving beard A person is jogging A person is making bed A person is washing dishes A person is grooming dog A person is doing laundry A person is knitting A person is reading book A person is baby waking up A person is massaging legs A person is brushing teeth A person is crawling baby A person is motorcycling A person is driving car A person is sticking tongue out A person 
is shaking head A person is sword fighting A person is doing aerobics A person is strumming guitar A person is riding or walking with horse A person is archery A person is catching or throwing baseball A person is playing chess A person is rock scissors paper A person is using computer A person is arranging flowers A person is bending metal A person is ice skating A person is climbing a rope A person is crying A person is dancing ballet A person is getting a haircut A person is running on treadmill A person is kissing A person is counting money A person is barbequing A person is peeling apples A person is milking cow A person is shining shoes A person is making snowman A person is sailing ================================================ FILE: Open-Sora/assets/texts/VBench/prompts_per_dimension/multiple_objects.txt ================================================ a bird and a cat a cat and a dog a dog and a horse a horse and a sheep a sheep and a cow a cow and an elephant an elephant and a bear a bear and a zebra a zebra and a giraffe a giraffe and a bird a chair and a couch a couch and a potted plant a potted plant and a tv a tv and a laptop a laptop and a remote a remote and a keyboard a keyboard and a cell phone a cell phone and a book a book and a clock a clock and a backpack a backpack and an umbrella an umbrella and a handbag a handbag and a tie a tie and a suitcase a suitcase and a vase a vase and scissors scissors and a teddy bear a teddy bear and a frisbee a frisbee and skis skis and a snowboard a snowboard and a sports ball a sports ball and a kite a kite and a baseball bat a baseball bat and a baseball glove a baseball glove and a skateboard a skateboard and a surfboard a surfboard and a tennis racket a tennis racket and a bottle a bottle and a chair an airplane and a train a train and a boat a boat and an airplane a bicycle and a car a car and a motorcycle a motorcycle and a bus a bus and a traffic light a traffic light and a fire hydrant a fire hydrant 
and a stop sign a stop sign and a parking meter a parking meter and a truck a truck and a bicycle a toilet and a hair drier a hair drier and a toothbrush a toothbrush and a sink a sink and a toilet a wine glass and a chair a cup and a couch a fork and a potted plant a knife and a tv a spoon and a laptop a bowl and a remote a banana and a keyboard an apple and a cell phone a sandwich and a book an orange and a clock broccoli and a backpack a carrot and an umbrella a hot dog and a handbag a pizza and a tie a donut and a suitcase a cake and a vase an oven and scissors a toaster and a teddy bear a microwave and a frisbee a refrigerator and skis a bicycle and an airplane a car and a train a motorcycle and a boat a person and a toilet a person and a hair drier a person and a toothbrush a person and a sink ================================================ FILE: Open-Sora/assets/texts/VBench/prompts_per_dimension/object_class.txt ================================================ a person a bicycle a car a motorcycle an airplane a bus a train a truck a boat a traffic light a fire hydrant a stop sign a parking meter a bench a bird a cat a dog a horse a sheep a cow an elephant a bear a zebra a giraffe a backpack an umbrella a handbag a tie a suitcase a frisbee skis a snowboard a sports ball a kite a baseball bat a baseball glove a skateboard a surfboard a tennis racket a bottle a wine glass a cup a fork a knife a spoon a bowl a banana an apple a sandwich an orange broccoli a carrot a hot dog a pizza a donut a cake a chair a couch a potted plant a bed a dining table a toilet a tv a laptop a remote a keyboard a cell phone a microwave an oven a toaster a sink a refrigerator a book a clock a vase scissors a teddy bear a hair drier a toothbrush ================================================ FILE: Open-Sora/assets/texts/VBench/prompts_per_dimension/overall_consistency.txt ================================================ Close up of grapes on a rotating table. 
Turtle swimming in ocean. A storm trooper vacuuming the beach. A panda standing on a surfboard in the ocean in sunset. An astronaut feeding ducks on a sunny afternoon, reflection from the water. Two pandas discussing an academic paper. Sunset time lapse at the beach with moving clouds and colors in the sky. A fat rabbit wearing a purple robe walking through a fantasy landscape. A koala bear playing piano in the forest. An astronaut flying in space. Fireworks. An animated painting of fluffy white clouds moving in sky. Flying through fantasy landscapes. A bigfoot walking in the snowstorm. A squirrel eating a burger. A cat wearing sunglasses and working as a lifeguard at a pool. Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks. Splash of turquoise water in extreme slow motion, alpha channel included. an ice cream is melting on the table. a drone flying over a snowy forest. a shark is swimming in the ocean. Aerial panoramic video from a drone of a fantasy land. a teddy bear is swimming in the ocean. time lapse of sunrise on mars. golden fish swimming in the ocean. An artist brush painting on a canvas close up. A drone view of celebration with Christmas tree and fireworks, starry sky - background. happy dog wearing a yellow turtleneck, studio, portrait, facing camera, dark background Origami dancers in white paper, 3D render, on white background, studio shot, dancing modern dance. Campfire at night in a snowy forest with starry sky in the background. a fantasy landscape A 3D model of a 1800s victorian house. this is how I do makeup in the morning. A raccoon that looks like a turtle, digital art. Robot dancing in Times Square. Busy freeway at night. Balloon full of water exploding in extreme slow motion. An astronaut is riding a horse in the space in a photorealistic style. Macro slo-mo. 
Slow motion cropped closeup of roasted coffee beans falling into an empty bowl. Sewing machine, old sewing machine working. Motion colour drop in water, ink swirling in water, colourful ink in water, abstraction fancy dream cloud of ink. Few big purple plums rotating on the turntable. water drops appear on the skin during rotation. isolated on the white background. close-up. macro. Vampire makeup face of beautiful girl, red contact lenses. Ashtray full of butts on table, smoke flowing on black background, close-up Pacific coast, carmel by the sea ocean and waves. A teddy bear is playing drum kit in NYC Times Square. A corgi is playing drum kit. An Iron man is playing the electronic guitar, high electronic guitar. A raccoon is playing the electronic guitar. A boat sailing leisurely along the Seine River with the Eiffel Tower in background by Vincent van Gogh A corgi's head depicted as an explosion of a nebula A fantasy landscape A future where humans have achieved teleportation technology A jellyfish floating through the ocean, with bioluminescent tentacles A Mars rover moving on Mars A panda drinking coffee in a cafe in Paris A space shuttle launching into orbit, with flames and smoke billowing out from the engines A steam train moving on a mountainside A super cool giant robot in Cyberpunk Beijing A tropical beach at sunrise, with palm trees and crystal-clear water in the foreground Cinematic shot of Van Gogh's selfie, Van Gogh style Gwen Stacy reading a book Iron Man flying in the sky The bund Shanghai, oil painting Yoda playing guitar on the stage A beautiful coastal beach in spring, waves lapping on sand by Hokusai, in the style of Ukiyo A beautiful coastal beach in spring, waves lapping on sand by Vincent van Gogh A boat sailing leisurely along the Seine River with the Eiffel Tower in background A car moving slowly on an empty street, rainy evening A cat eating food out of a bowl A cat wearing sunglasses at a pool A confused panda in calculus class A cute 
fluffy panda eating Chinese food in a restaurant A cute happy Corgi playing in park, sunset A cute raccoon playing guitar in a boat on the ocean A happy fuzzy panda playing guitar nearby a campfire, snow mountain in the background A lightning striking atop of eiffel tower, dark clouds in the sky A modern art museum, with colorful paintings A panda cooking in the kitchen A panda playing on a swing set A polar bear is playing guitar A raccoon dressed in suit playing the trumpet, stage background A robot DJ is playing the turntable, in heavy raining futuristic tokyo rooftop cyberpunk night, sci-fi, fantasy A shark swimming in clear Caribbean ocean A super robot protecting city A teddy bear washing the dishes An epic tornado attacking above a glowing city at night, the tornado is made of smoke An oil painting of a couple in formal evening wear going home get caught in a heavy downpour with umbrellas Clown fish swimming through the coral reef Hyper-realistic spaceship landing on Mars The bund Shanghai, vibrant color Vincent van Gogh is painting in the room Yellow flowers swing in the wind ================================================ FILE: Open-Sora/assets/texts/VBench/prompts_per_dimension/scene.txt ================================================ alley amusement park aquarium arch art gallery bathroom bakery shop ballroom bar barn basement beach bedroom bridge botanical garden cafeteria campsite campus carrousel castle cemetery classroom cliff crosswalk construction site corridor courtyard desert downtown driveway farm food court football field forest road fountain gas station glacier golf course indoor gymnasium harbor highway hospital house iceberg industrial area jail cell junkyard kitchen indoor library lighthouse laboratory mansion marsh mountain indoor movie theater indoor museum music studio nursery ocean office palace parking lot pharmacy phone booth raceway restaurant river science museum shower ski slope sky skyscraper baseball stadium staircase street 
supermarket indoor swimming pool tower outdoor track train railway train station platform underwater coral reef valley volcano waterfall windmill ================================================ FILE: Open-Sora/assets/texts/VBench/prompts_per_dimension/spatial_relationship.txt ================================================ a bicycle on the left of a car, front view a car on the right of a motorcycle, front view a motorcycle on the left of a bus, front view a bus on the right of a traffic light, front view a traffic light on the left of a fire hydrant, front view a fire hydrant on the right of a stop sign, front view a stop sign on the left of a parking meter, front view a parking meter on the right of a bench, front view a bench on the left of a truck, front view a truck on the right of a bicycle, front view a bird on the left of a cat, front view a cat on the right of a dog, front view a dog on the left of a horse, front view a horse on the right of a sheep, front view a sheep on the left of a cow, front view a cow on the right of an elephant, front view an elephant on the left of a bear, front view a bear on the right of a zebra, front view a zebra on the left of a giraffe, front view a giraffe on the right of a bird, front view a bottle on the left of a wine glass, front view a wine glass on the right of a cup, front view a cup on the left of a fork, front view a fork on the right of a knife, front view a knife on the left of a spoon, front view a spoon on the right of a bowl, front view a bowl on the left of a bottle, front view a potted plant on the left of a remote, front view a remote on the right of a clock, front view a clock on the left of a vase, front view a vase on the right of scissors, front view scissors on the left of a teddy bear, front view a teddy bear on the right of a potted plant, front view a frisbee on the left of a sports ball, front view a sports ball on the right of a baseball bat, front view a baseball bat on the left of a baseball 
glove, front view a baseball glove on the right of a tennis racket, front view a tennis racket on the left of a frisbee, front view a toilet on the left of a hair drier, front view a hair drier on the right of a toothbrush, front view a toothbrush on the left of a sink, front view a sink on the right of a toilet, front view a chair on the left of a couch, front view a couch on the right of a bed, front view a bed on the left of a tv, front view a tv on the right of a dining table, front view a dining table on the left of a chair, front view an airplane on the left of a train, front view a train on the right of a boat, front view a boat on the left of an airplane, front view an oven on the top of a toaster, front view an oven on the bottom of a toaster, front view a toaster on the top of a microwave, front view a toaster on the bottom of a microwave, front view a microwave on the top of an oven, front view a microwave on the bottom of an oven, front view a banana on the top of an apple, front view a banana on the bottom of an apple, front view an apple on the top of a sandwich, front view an apple on the bottom of a sandwich, front view a sandwich on the top of an orange, front view a sandwich on the bottom of an orange, front view an orange on the top of a carrot, front view an orange on the bottom of a carrot, front view a carrot on the top of a hot dog, front view a carrot on the bottom of a hot dog, front view a hot dog on the top of a pizza, front view a hot dog on the bottom of a pizza, front view a pizza on the top of a donut, front view a pizza on the bottom of a donut, front view a donut on the top of broccoli, front view a donut on the bottom of broccoli, front view broccoli on the top of a banana, front view broccoli on the bottom of a banana, front view skis on the top of a snowboard, front view skis on the bottom of a snowboard, front view a snowboard on the top of a kite, front view a snowboard on the bottom of a kite, front view a kite on the top of a 
skateboard, front view a kite on the bottom of a skateboard, front view a skateboard on the top of a surfboard, front view a skateboard on the bottom of a surfboard, front view a surfboard on the top of skis, front view a surfboard on the bottom of skis, front view ================================================ FILE: Open-Sora/assets/texts/VBench/prompts_per_dimension/subject_consistency.txt ================================================ a person swimming in ocean a person giving a presentation to a room full of colleagues a person washing the dishes a person eating a burger a person walking in the snowstorm a person drinking coffee in a cafe a person playing guitar a bicycle leaning against a tree a bicycle gliding through a snowy field a bicycle slowing down to stop a bicycle accelerating to gain speed a car stuck in traffic during rush hour a car turning a corner a car slowing down to stop a car accelerating to gain speed a motorcycle cruising along a coastal highway a motorcycle turning a corner a motorcycle slowing down to stop a motorcycle gliding through a snowy field a motorcycle accelerating to gain speed an airplane soaring through a clear blue sky an airplane taking off an airplane landing smoothly on a runway an airplane accelerating to gain speed a bus turning a corner a bus stuck in traffic during rush hour a bus accelerating to gain speed a train speeding down the tracks a train crossing over a tall bridge a train accelerating to gain speed a truck turning a corner a truck anchored in a tranquil bay a truck stuck in traffic during rush hour a truck slowing down to stop a truck accelerating to gain speed a boat sailing smoothly on a calm lake a boat slowing down to stop a boat accelerating to gain speed a bird soaring gracefully in the sky a bird building a nest from twigs and leaves a bird flying over a snowy forest a cat grooming itself meticulously with its tongue a cat playing in park a cat drinking water a cat running happily a dog enjoying a 
peaceful walk a dog playing in park a dog drinking water a dog running happily a horse bending down to drink water from a river a horse galloping across an open field a horse taking a peaceful walk a horse running to join a herd of its kind a sheep bending down to drink water from a river a sheep taking a peaceful walk a sheep running to join a herd of its kind a cow bending down to drink water from a river a cow chewing cud while resting in a tranquil barn a cow running to join a herd of its kind an elephant spraying itself with water using its trunk to cool down an elephant taking a peaceful walk an elephant running to join a herd of its kind a bear catching a salmon in its powerful jaws a bear sniffing the air for scents of food a bear climbing a tree a bear hunting for prey a zebra bending down to drink water from a river a zebra running to join a herd of its kind a zebra taking a peaceful walk a giraffe bending down to drink water from a river a giraffe taking a peaceful walk a giraffe running to join a herd of its kind ================================================ FILE: Open-Sora/assets/texts/VBench/prompts_per_dimension/temporal_flickering.txt ================================================ In a still frame, a stop sign a toilet, frozen in time a laptop, frozen in time A tranquil tableau of alley A tranquil tableau of bar A tranquil tableau of barn A tranquil tableau of bathroom A tranquil tableau of bedroom A tranquil tableau of cliff In a still frame, courtyard In a still frame, gas station A tranquil tableau of house indoor gymnasium, frozen in time A tranquil tableau of indoor library A tranquil tableau of kitchen A tranquil tableau of palace In a still frame, parking lot In a still frame, phone booth A tranquil tableau of restaurant A tranquil tableau of tower A tranquil tableau of a bowl A tranquil tableau of an apple A tranquil tableau of a bench A tranquil tableau of a bed A tranquil tableau of a chair A tranquil tableau of a cup A tranquil 
tableau of a dining table In a still frame, a pear A tranquil tableau of a bunch of grapes A tranquil tableau of a bowl on the kitchen counter A tranquil tableau of a beautiful, handcrafted ceramic bowl A tranquil tableau of an antique bowl A tranquil tableau of an exquisite mahogany dining table A tranquil tableau of a wooden bench in the park A tranquil tableau of a beautiful wrought-iron bench surrounded by blooming flowers In a still frame, a park bench with a view of the lake A tranquil tableau of a vintage rocking chair was placed on the porch A tranquil tableau of the jail cell was small and dimly lit, with cold, steel bars A tranquil tableau of the phone booth was tucked away in a quiet alley a dilapidated phone booth stood as a relic of a bygone era on the sidewalk, frozen in time A tranquil tableau of the old red barn stood weathered and iconic against the backdrop of the countryside A tranquil tableau of a picturesque barn was painted a warm shade of red and nestled in a picturesque meadow In a still frame, within the desolate desert, an oasis unfolded, characterized by the stoic presence of palm trees and a motionless, glassy pool of water In a still frame, the Parthenon's majestic Doric columns stand in serene solitude atop the Acropolis, framed by the tranquil Athenian landscape In a still frame, the Temple of Hephaestus, with its timeless Doric grace, stands stoically against the backdrop of a quiet Athens In a still frame, the ornate Victorian streetlamp stands solemnly, adorned with intricate ironwork and stained glass panels A tranquil tableau of the Stonehenge presented itself as an enigmatic puzzle, each colossal stone meticulously placed against the backdrop of tranquility In a still frame, in the vast desert, an oasis nestled among dunes, featuring tall palm trees and an air of serenity static view on a desert scene with an oasis, palm trees, and a clear, calm pool of water A tranquil tableau of an ornate Victorian streetlamp standing on a 
cobblestone street corner, illuminating the empty night A tranquil tableau of a tranquil lakeside cabin nestled among tall pines, its reflection mirrored perfectly in the calm water In a still frame, a vintage gas lantern, adorned with intricate details, gracing a historic cobblestone square In a still frame, a tranquil Japanese tea ceremony room, with tatami mats, a delicate tea set, and a bonsai tree in the corner A tranquil tableau of the Parthenon stands resolute in its classical elegance, a timeless symbol of Athens' cultural legacy A tranquil tableau of in the heart of Plaka, the neoclassical architecture of the old city harmonizes with the ancient ruins A tranquil tableau of in the desolate beauty of the American Southwest, Chaco Canyon's ancient ruins whispered tales of an enigmatic civilization that once thrived amidst the arid landscapes A tranquil tableau of at the edge of the Arabian Desert, the ancient city of Petra beckoned with its enigmatic rock-carved façades In a still frame, amidst the cobblestone streets, an Art Nouveau lamppost stood tall A tranquil tableau of in the quaint village square, a traditional wrought-iron streetlamp featured delicate filigree patterns and amber-hued glass panels A tranquil tableau of the lampposts were adorned with Art Deco motifs, their geometric shapes and frosted glass creating a sense of vintage glamour In a still frame, in the picturesque square, a Gothic-style lamppost adorned with intricate stone carvings added a touch of medieval charm to the setting In a still frame, in the heart of the old city, a row of ornate lantern-style streetlamps bathed the narrow alleyway in a warm, welcoming light A tranquil tableau of in the heart of the Utah desert, a massive sandstone arch spanned the horizon A tranquil tableau of in the Arizona desert, a massive stone bridge arched across a rugged canyon A tranquil tableau of in the corner of the minimalist tea room, a bonsai tree added a touch of nature's beauty to the 
otherwise simple and elegant space In a still frame, amidst the hushed ambiance of the traditional tea room, a meticulously arranged tea set awaited, with porcelain cups, a bamboo whisk In a still frame, nestled in the Zen garden, a rustic teahouse featured tatami seating and a traditional charcoal brazier A tranquil tableau of a country estate's library featured elegant wooden shelves A tranquil tableau of beneath the shade of a solitary oak tree, an old wooden park bench sat patiently A tranquil tableau of beside a tranquil pond, a weeping willow tree draped its branches gracefully over the water's surface, creating a serene tableau of reflection and calm A tranquil tableau of in the Zen garden, a perfectly raked gravel path led to a serene rock garden In a still frame, a tranquil pond was fringed by weeping cherry trees, their blossoms drifting lazily onto the glassy surface In a still frame, within the historic library's reading room, rows of antique leather chairs and mahogany tables offered a serene haven for literary contemplation A tranquil tableau of a peaceful orchid garden showcased a variety of delicate blooms A tranquil tableau of in the serene courtyard, a centuries-old stone well stood as a symbol of a bygone era, its mossy stones bearing witness to the passage of time ================================================ FILE: Open-Sora/assets/texts/VBench/prompts_per_dimension/temporal_style.txt ================================================ A beautiful coastal beach in spring, waves lapping on sand, in super slow motion A beautiful coastal beach in spring, waves lapping on sand, zoom in A beautiful coastal beach in spring, waves lapping on sand, zoom out A beautiful coastal beach in spring, waves lapping on sand, pan left A beautiful coastal beach in spring, waves lapping on sand, pan right A beautiful coastal beach in spring, waves lapping on sand, tilt up A beautiful coastal beach in spring, waves lapping on sand, tilt down A beautiful coastal 
beach in spring, waves lapping on sand, with an intense shaking effect A beautiful coastal beach in spring, waves lapping on sand, featuring a steady and smooth perspective A beautiful coastal beach in spring, waves lapping on sand, racking focus The bund Shanghai, in super slow motion The bund Shanghai, zoom in The bund Shanghai, zoom out The bund Shanghai, pan left The bund Shanghai, pan right The bund Shanghai, tilt up The bund Shanghai, tilt down The bund Shanghai, with an intense shaking effect The bund Shanghai, featuring a steady and smooth perspective The bund Shanghai, racking focus a shark is swimming in the ocean, in super slow motion a shark is swimming in the ocean, zoom in a shark is swimming in the ocean, zoom out a shark is swimming in the ocean, pan left a shark is swimming in the ocean, pan right a shark is swimming in the ocean, tilt up a shark is swimming in the ocean, tilt down a shark is swimming in the ocean, with an intense shaking effect a shark is swimming in the ocean, featuring a steady and smooth perspective a shark is swimming in the ocean, racking focus A panda drinking coffee in a cafe in Paris, in super slow motion A panda drinking coffee in a cafe in Paris, zoom in A panda drinking coffee in a cafe in Paris, zoom out A panda drinking coffee in a cafe in Paris, pan left A panda drinking coffee in a cafe in Paris, pan right A panda drinking coffee in a cafe in Paris, tilt up A panda drinking coffee in a cafe in Paris, tilt down A panda drinking coffee in a cafe in Paris, with an intense shaking effect A panda drinking coffee in a cafe in Paris, featuring a steady and smooth perspective A panda drinking coffee in a cafe in Paris, racking focus A cute happy Corgi playing in park, sunset, in super slow motion A cute happy Corgi playing in park, sunset, zoom in A cute happy Corgi playing in park, sunset, zoom out A cute happy Corgi playing in park, sunset, pan left A cute happy Corgi playing in park, sunset, pan right A cute happy Corgi 
playing in park, sunset, tilt up A cute happy Corgi playing in park, sunset, tilt down A cute happy Corgi playing in park, sunset, with an intense shaking effect A cute happy Corgi playing in park, sunset, featuring a steady and smooth perspective A cute happy Corgi playing in park, sunset, racking focus Gwen Stacy reading a book, in super slow motion Gwen Stacy reading a book, zoom in Gwen Stacy reading a book, zoom out Gwen Stacy reading a book, pan left Gwen Stacy reading a book, pan right Gwen Stacy reading a book, tilt up Gwen Stacy reading a book, tilt down Gwen Stacy reading a book, with an intense shaking effect Gwen Stacy reading a book, featuring a steady and smooth perspective Gwen Stacy reading a book, racking focus A boat sailing leisurely along the Seine River with the Eiffel Tower in background, in super slow motion A boat sailing leisurely along the Seine River with the Eiffel Tower in background, zoom in A boat sailing leisurely along the Seine River with the Eiffel Tower in background, zoom out A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pan left A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pan right A boat sailing leisurely along the Seine River with the Eiffel Tower in background, tilt up A boat sailing leisurely along the Seine River with the Eiffel Tower in background, tilt down A boat sailing leisurely along the Seine River with the Eiffel Tower in background, with an intense shaking effect A boat sailing leisurely along the Seine River with the Eiffel Tower in background, featuring a steady and smooth perspective A boat sailing leisurely along the Seine River with the Eiffel Tower in background, racking focus A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, in super slow motion A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, zoom in A couple in formal evening wear going home get 
caught in a heavy downpour with umbrellas, zoom out A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pan left A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pan right A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, tilt up A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, tilt down A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, with an intense shaking effect A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, featuring a steady and smooth perspective A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, racking focus An astronaut flying in space, in super slow motion An astronaut flying in space, zoom in An astronaut flying in space, zoom out An astronaut flying in space, pan left An astronaut flying in space, pan right An astronaut flying in space, tilt up An astronaut flying in space, tilt down An astronaut flying in space, with an intense shaking effect An astronaut flying in space, featuring a steady and smooth perspective An astronaut flying in space, racking focus Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, in super slow motion Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, zoom in Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, zoom out Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. 
the canyons twist and bend through the high elevated mountain peaks, pan left Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pan right Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, tilt up Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, tilt down Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, with an intense shaking effect Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, featuring a steady and smooth perspective Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. 
the canyons twist and bend through the high elevated mountain peaks, racking focus ================================================ FILE: Open-Sora/assets/texts/imagenet_id.txt ================================================ 207 360 387 974 88 979 417 279 ================================================ FILE: Open-Sora/assets/texts/imagenet_labels.txt ================================================ golden retriever otter lesser panda geyser macaw valley balloon golden panda ================================================ FILE: Open-Sora/assets/texts/rand_types.txt ================================================ 随机电影镜头 随机电影镜头 随机电影镜头 随机电影镜头 随机电影镜头 随机任务镜头 随机任务镜头 随机任务镜头 随机任务镜头 随机任务镜头 随机游戏镜头 随机游戏镜头 随机游戏镜头 随机游戏镜头 随机游戏镜头 随机开车镜头 随机开车镜头 随机开车镜头 随机开车镜头 随机开车镜头 随机动物镜头 随机动物镜头 随机动物镜头 随机动物镜头 随机动物镜头 随机森林镜头 随机森林镜头 随机森林镜头 随机森林镜头 随机森林镜头 随机动漫镜头 随机动漫镜头 随机动漫镜头 随机动漫镜头 随机动漫镜头 随机舞蹈镜头 随机舞蹈镜头 随机舞蹈镜头 随机舞蹈镜头 随机舞蹈镜头 ================================================ FILE: Open-Sora/assets/texts/t2i_samples.txt ================================================ A small cactus with a happy face in the Sahara desert. Bright scene, aerial view,ancient city, fantasy, gorgeous light, mirror reflection, high detail, wide angle lens. Nature vs human nature, surreal, UHD, 8k, hyper details, rich colors, photograph. Poster of a mechanical cat, techical Schematics viewed from front. Luffy from ONEPIECE, handsome face, fantasy. Real beautiful woman. A alpaca made of colorful building blocks, cyberpunk. artistic ================================================ FILE: Open-Sora/assets/texts/t2i_sigma.txt ================================================ Eiffel Tower was Made up of more than 2 million translucent straws to look like a cloud, with the bell tower at the top of the building, Michel installed huge foam-making machines in the forest to blow huge amounts of unpredictable wet clouds in the building's classic architecture. 
A gorgeously rendered papercraft world of a coral reef, rife with colorful fish and sea creatures. Full body shot, a French woman, Photography, French Streets background, backlighting, rim light, Fujifilm. Close-up photos of models, hazy light and shadow, laser metal hair accessories, soft and beautiful, light gold pupils, white eyelashes, low saturation, real skin details, clear pores and fine lines, light reflection and refraction, ultra-clear, cinematography, award-winning works. A litter of golden retriever puppies playing in the snow. Their heads pop out of the snow, covered in. Lego model, future rocket station, intricate details, high resolution, unreal engine, UHD One giant, sharp, metal square mirror in the center of the frame, four young people on the foreground, background sunny palm oil planation, tropical, realistic style, photography, nostalgic, green tone, mysterious, dreamy, bright color. Modern luxury contemporary luxury home interiors house, in the style of mimicking ruined materials, ray tracing, haunting houses, and stone, capture the essence of nature, gray and bronze, dynamic outdoor shots. Over the shoulder game perspective, game screen of Diablo 4, Inside the gorgeous palace is the wet ground, The necromancer knelt before the king, and a horde of skeletons he summoned stood at his side, cinematic light. A curvy timber house near a sea, designed by Zaha Hadid, represent the image of a cold, modern architecture, at night, white lighting, highly detailed. 
================================================ FILE: Open-Sora/assets/texts/t2v_car.txt ================================================ |0|A car driving on the in forest.|2|A car driving in the desert.|4|A car driving near the coast.|6|A car driving in the city.|8|A car driving near a mountain.|10|A car driving on the surface of a river.|12|A car driving on the surface of the earch.|14|A car driving in the universe.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4", "mask_strategy": "0,0,0,0,16,0.4"} ================================================ FILE: Open-Sora/assets/texts/t2v_latte.txt ================================================ Yellow and black tropical fish dart through the sea. An epic tornado attacking above aglowing city at night. Slow pan upward of blazing oak fire in an indoor fireplace. a cat wearing sunglasses and working as a lifeguard at pool. Sunset over the sea. A dog in astronaut suit and sunglasses floating in space. A astronaut in flying in space, 4k, high resolution ================================================ FILE: Open-Sora/assets/texts/t2v_pllava.txt ================================================ a close-up shot of a woman standing in a room with a white wall and a plant on the left side. the woman has curly hair and is wearing a green tank top. she is looking to the side with a neutral expression on her face. the lighting in the room is soft and appears to be natural, coming from the left side of the frame. the focus is on the woman, with the background being out of focus. there are no texts or other objects in the video. the style of the video is a simple, candid portrait with a shallow depth of field. a serene scene of a pond filled with water lilies. the water is a deep blue, providing a striking contrast to the pink and white flowers that float on its surface. the flowers, in full bloom, are the main focus of the video. 
they are scattered across the pond, with some closer to the camera and others further away, creating a sense of depth. the pond is surrounded by lush greenery, adding a touch of nature to the scene. the video is taken from a low angle, looking up at the flowers, which gives a unique perspective and emphasizes their beauty. the overall composition of the video suggests a peaceful and tranquil setting, likely a garden or a park. a professional setting where a woman is presenting a slide from a presentation. she is standing in front of a projector screen, which displays a bar chart. the chart is colorful, with bars of different heights, indicating some sort of data comparison. the woman is holding a pointer, which she uses to highlight specific parts of the chart. she is dressed in a white blouse and black pants, and her hair is styled in a bun. the room has a modern design, with a sleek black floor and a white ceiling. the lighting is bright, illuminating the woman and the projector screen. the focus of the image is on the woman and the projector screen, with the background being out of focus. there are no texts visible in the image. the relative positions of the objects suggest that the woman is the main subject of the image, and the projector screen is the object of her attention. the image does not provide any information about the content of the presentation or the context of the meeting. a bustling city street from the perspective of a car. the car, a sleek black sedan, is in motion, driving down the street. the dashboard of the car is visible in the foreground, providing a view of the road ahead. the street is lined with parked cars on both sides, their colors muted in the bright sunlight. buildings rise on either side of the street, their windows reflecting the sunlight. the sky above is a clear blue, and the sun is shining brightly, casting a warm glow on the scene. 
the street is busy with pedestrians and other vehicles, adding to the dynamic nature of the scene. the video does not contain any text. the relative positions of the objects suggest a typical city street scene with the car in the foreground, the parked cars on either side, and the buildings in the background. the sunlight illuminates the scene, highlighting the colors and details of the objects. the pedestrians and other vehicles are in motion, adding a sense of life and activity to the scene. the buildings provide a sense of depth and scale to the image. the video does not contain any text or countable objects. the a serene scene in a park. the sun is shining brightly, casting a warm glow on the lush green trees and the grassy field. the camera is positioned low, looking up at the towering trees, which are the main focus of the image. the trees are dense and full of leaves, creating a canopy of green that fills the frame. the sunlight filters through the leaves, creating a beautiful pattern of light and shadow on the ground. the overall atmosphere of the video is peaceful and tranquil, evoking a sense of calm and relaxation. a moment in a movie theater. a couple is seated in the middle of the theater, engrossed in the movie they are watching. the man is dressed in a casual outfit, complete with a pair of sunglasses, while the woman is wearing a cozy sweater. they are seated on a red theater seat, which stands out against the dark surroundings. the theater itself is dimly lit, with the screen displaying the movie they are watching. the couple appears to be enjoying the movie, their attention completely absorbed by the on-screen action. the theater is mostly empty, with only a few other seats visible in the background. the video does not contain any text or additional objects. the relative positions of the objects are such that the couple is in the foreground, while the screen and the other seats are in the background. 
the focus of the video is clearly on the couple and their shared experience of watching a movie in a theater. a scene where a person is examining a dog. the person is wearing a blue shirt with the word "volunteer" printed on it. the dog is lying on its side, and the person is using a stethoscope to listen to the dog's heartbeat. the dog appears to be a golden retriever and is looking directly at the camera. the background is blurred, but it seems to be an indoor setting with a white wall. the person's focus is on the dog, and they seem to be checking its health. the dog's expression is calm, and it seems to be comfortable with the person's touch. the overall atmosphere of the video is calm and professional. a close-up shot of a woman applying makeup. she is using a black brush to apply a dark powder to her face. the woman has blonde hair and is wearing a black top. the background is black, which contrasts with her skin tone and the makeup. the focus is on her face and the brush, with the rest of her body and the background being out of focus. the lighting is soft and even, highlighting the texture of the makeup and the woman's skin. there are no texts or other objects in the video. the woman's expression is neutral, and she is looking directly at the camera. the video does not contain any action, as it is a still shot of a woman applying makeup. the relative position of the woman and the brush is such that the brush is in her hand and is being used to apply the makeup to her face. the video does not contain any other objects or actions. the woman is the only person in the video, and she is the main subject. the video does not contain any sound. the description is based on the visible content of the video and does not include any assumptions or interpretations. a young woman is seated in a black gaming chair in a room filled with computer monitors and other gaming equipment. she is wearing a red tank top and black pants, and her hair is styled in loose waves. 
the room is dimly lit, with the glow of the monitors casting a soft light on her face. she is holding a black game controller in her hands, and her attention is focused on the screen in front of her. the room is filled with other gaming equipment, including keyboards and mice, and there are other chairs and desks scattered around the room. the woman appears to be engrossed in her game, her posture relaxed yet focused. the room is quiet, the only sound coming from the beeps and boops of the game. the woman is the only person in the room, adding a sense of solitude to the scene. the video does not contain any text. the relative positions of the objects suggest a well-organized gaming setup, with the woman at the center, surrounded by her gaming equipment. the video does not contain any action, but the woman's focused expression suggests that she is in the middle of an intense g a breathtaking aerial view of a coastal landscape at sunset. the sky, painted in hues of orange and pink, serves as a stunning backdrop to the scene. the sun, partially obscured by the horizon, casts a warm glow on the landscape below. the foreground of the image is dominated by a rocky cliff, its rugged surface adding a touch of raw beauty to the scene. the cliff's edge is adorned with patches of green vegetation, providing a stark contrast to the otherwise barren landscape. the middle ground of the image reveals a winding road that hugs the coastline. the road, appearing as a thin line against the vast expanse of the landscape, guides the viewer's eye towards the horizon. in the background, the silhouette of mountains can be seen, their peaks shrouded in a light mist. the mountains, along with the road, add depth to the image, creating a sense of distance and scale. overall, the video presents a serene and majestic coastal landscape, captured at the perfect moment of sunset. 
the colors ================================================ FILE: Open-Sora/assets/texts/t2v_ref.txt ================================================ Drone view of waves crashing against the rugged cliffs along Big Sur’s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway. In an ornate, historical hall, a massive tidal wave peaks and begins to crash. Two surfers, seizing the moment, skillfully navigate the face of the wave. Pirate ship in a cosmic maelstrom nebula. Drone view of waves crashing against the rugged cliffs along Big Sur’s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway. A sad small cactus with in the Sahara desert becomes happy. A car driving on a road in the middle of a desert. ================================================ FILE: Open-Sora/assets/texts/t2v_samples.txt ================================================ A soaring drone footage captures the majestic beauty of a coastal cliff, its red and yellow stratified rock faces rich in color and against the vibrant turquoise of the sea. Seabirds can be seen taking flight around the cliff's precipices. 
As the drone slowly moves from different angles, the changing sunlight casts shifting shadows that highlight the rugged textures of the cliff and the surrounding calm sea. The water gently laps at the rock base and the greenery that clings to the top of the cliff, and the scene gives a sense of peaceful isolation at the fringes of the ocean. The video captures the essence of pristine natural beauty untouched by human structures. A majestic beauty of a waterfall cascading down a cliff into a serene lake. The waterfall, with its powerful flow, is the central focus of the video. The surrounding landscape is lush and green, with trees and foliage adding to the natural beauty of the scene. The camera angle provides a bird's eye view of the waterfall, allowing viewers to appreciate the full height and grandeur of the waterfall. The video is a stunning representation of nature's power and beauty. A vibrant scene of a snowy mountain landscape. The sky is filled with a multitude of colorful hot air balloons, each floating at different heights, creating a dynamic and lively atmosphere. The balloons are scattered across the sky, some closer to the viewer, others further away, adding depth to the scene. Below, the mountainous terrain is blanketed in a thick layer of snow, with a few patches of bare earth visible here and there. The snow-covered mountains provide a stark contrast to the colorful balloons, enhancing the visual appeal of the scene. In the foreground, a few cars can be seen driving along a winding road that cuts through the mountains. The cars are small compared to the vastness of the landscape, emphasizing the grandeur of the surroundings. The overall style of the video is a mix of adventure and tranquility, with the hot air balloons adding a touch of whimsy to the otherwise serene mountain landscape. The video is likely shot during the day, as the lighting is bright and even, casting soft shadows on the snow-covered mountains. 
The vibrant beauty of a sunflower field. The sunflowers, with their bright yellow petals and dark brown centers, are in full bloom, creating a stunning contrast against the green leaves and stems. The sunflowers are arranged in neat rows, creating a sense of order and symmetry. The sun is shining brightly, casting a warm glow on the flowers and highlighting their intricate details. The video is shot from a low angle, looking up at the sunflowers, which adds a sense of grandeur and awe to the scene. The sunflowers are the main focus of the video, with no other objects or people present. The video is a celebration of nature's beauty and the simple joy of a sunny day in the countryside. A serene underwater scene featuring a sea turtle swimming through a coral reef. The turtle, with its greenish-brown shell, is the main focus of the video, swimming gracefully towards the right side of the frame. The coral reef, teeming with life, is visible in the background, providing a vibrant and colorful backdrop to the turtle's journey. Several small fish, darting around the turtle, add a sense of movement and dynamism to the scene. The video is shot from a slightly elevated angle, providing a comprehensive view of the turtle's surroundings. The overall style of the video is calm and peaceful, capturing the beauty and tranquility of the underwater world. A vibrant underwater scene. A group of blue fish, with yellow fins, are swimming around a coral reef. The coral reef is a mix of brown and green, providing a natural habitat for the fish. The water is a deep blue, indicating a depth of around 30 feet. The fish are swimming in a circular pattern around the coral reef, indicating a sense of motion and activity. The overall scene is a beautiful representation of marine life. A bustling city street at night, filled with the glow of car headlights and the ambient light of streetlights. The scene is a blur of motion, with cars speeding by and pedestrians navigating the crosswalks. 
The cityscape is a mix of towering buildings and illuminated signs, creating a vibrant and dynamic atmosphere. The perspective of the video is from a high angle, providing a bird's eye view of the street and its surroundings. The overall style of the video is dynamic and energetic, capturing the essence of urban life at night. A snowy forest landscape with a dirt road running through it. The road is flanked by trees covered in snow, and the ground is also covered in snow. The sun is shining, creating a bright and serene atmosphere. The road appears to be empty, and there are no people or animals visible in the video. The style of the video is a natural landscape shot, with a focus on the beauty of the snowy forest and the peacefulness of the road. The dynamic movement of tall, wispy grasses swaying in the wind. The sky above is filled with clouds, creating a dramatic backdrop. The sunlight pierces through the clouds, casting a warm glow on the scene. The grasses are a mix of green and brown, indicating a change in seasons. The overall style of the video is naturalistic, capturing the beauty of the landscape in a realistic manner. The focus is on the grasses and their movement, with the sky serving as a secondary element. The video does not contain any human or animal elements. A serene night scene in a forested area. The first frame shows a tranquil lake reflecting the star-filled sky above. The second frame reveals a beautiful sunset, casting a warm glow over the landscape. The third frame showcases the night sky, filled with stars and a vibrant Milky Way galaxy. The video is a time-lapse, capturing the transition from day to night, with the lake and forest serving as a constant backdrop. The style of the video is naturalistic, emphasizing the beauty of the night sky and the peacefulness of the forest. 
================================================ FILE: Open-Sora/assets/texts/t2v_short.txt ================================================ A fat rabbit wearing a purple robe walking through a fantasy landscape Waves crashing against a lone lighthouse, ominous lighting A mystical forest showcasing the adventures of travelers who enter A blue-haired mage singing A surreal landscape with floating islands and waterfalls in the sky craft A blue bird standing in water A young man walks alone by the seaside Pink rose on a glass surface with droplets, close-up Drove viewpoint, a subway train coming out of a tunnel Space with all planets green and pink color with background of bright white stars A city floating in an astral space, with stars and nebulae Sunrise on top of a high-rise building Pink and cyan powder explosions Deers in the woods gaze into the camera under the sunlight In a flash of lightning, a wizard appeared from thin air, his long robes billowing in the wind A futuristic cyberpunk cityscape at night with towering neon-lit skyscrapers A scene where the trees, flowers, and animals come together to create a symphony of nature A ghostly ship sailing through the clouds, navigating through a sea under a moonlit sky A sunset with beautiful beach A young man walking alone in the forest ================================================ FILE: Open-Sora/assets/texts/t2v_sora.txt ================================================ A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 
Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field. A movie trailer featuring the adventures of the 30 year old space man wearing a red wool knitted motorcycle helmet, blue sky, salt desert, cinematic style, shot on 35mm film, vivid colors. Drone view of waves crashing against the rugged cliffs along Big Sur’s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway. Animated scene features a close-up of a short fluffy monster kneeling beside a melting red candle. The art style is 3D and realistic, with a focus on lighting and texture. The mood of the painting is one of wonder and curiosity, as the monster gazes at the flame with wide eyes and open mouth. Its pose and expression convey a sense of innocence and playfulness, as if it is exploring the world around it for the first time. The use of warm colors and dramatic lighting further enhances the cozy atmosphere of the image. A gorgeously rendered papercraft world of a coral reef, rife with colorful fish and sea creatures. This close-up shot of a Victoria crowned pigeon showcases its striking blue plumage and red chest. Its crest is made of delicate, lacy feathers, while its eye is a striking red color. 
The bird’s head is tilted slightly to the side, giving the impression of it looking regal and majestic. The background is blurred, drawing attention to the bird’s striking appearance. Photorealistic closeup video of two pirate ships battling each other as they sail inside a cup of coffee. A young man at his 20s is sitting on a piece of cloud in the sky, reading a book. Historical footage of California during the gold rush. A close up view of a glass sphere that has a zen garden within it. There is a small dwarf in the sphere who is raking the zen garden and creating patterns in the sand. Extreme close up of a 24 year old woman’s eye blinking, standing in Marrakech during magic hour, cinematic film shot in 70mm, depth of field, vivid colors, cinematic A cartoon kangaroo disco dances. A beautiful homemade video showing the people of Lagos, Nigeria in the year 2056. Shot with a mobile phone camera. A petri dish with a bamboo forest growing within it that has tiny red pandas running around. The camera rotates around a large stack of vintage televisions all showing different programs — 1950s sci-fi movies, horror movies, news, static, a 1970s sitcom, etc, set inside a large New York museum gallery. 3D animation of a small, round, fluffy creature with big, expressive eyes explores a vibrant, enchanted forest. The creature, a whimsical blend of a rabbit and a squirrel, has soft blue fur and a bushy, striped tail. It hops along a sparkling stream, its eyes wide with wonder. The forest is alive with magical elements: flowers that glow and change colors, trees with leaves in shades of purple and silver, and small floating lights that resemble fireflies. The creature stops to interact playfully with a group of tiny, fairy-like beings dancing around a mushroom ring. The creature looks up in awe at a large, glowing tree that seems to be the heart of the forest. 
The camera follows behind a white vintage SUV with a black roof rack as it speeds up a steep dirt road surrounded by pine trees on a steep mountain slope, dust kicks up from it’s tires, the sunlight shines on the SUV as it speeds along the dirt road, casting a warm glow over the scene. The dirt road curves gently into the distance, with no other cars or vehicles in sight. The trees on either side of the road are redwoods, with patches of greenery scattered throughout. The car is seen from the rear following the curve with ease, making it seem as if it is on a rugged drive through the rugged terrain. The dirt road itself is surrounded by steep hills and mountains, with a clear blue sky above with wispy clouds. Reflections in the window of a train traveling through the Tokyo suburbs. A drone camera circles around a beautiful historic church built on a rocky outcropping along the Amalfi Coast, the view showcases historic and magnificent architectural details and tiered pathways and patios, waves are seen crashing against the rocks below as the view overlooks the horizon of the coastal waters and hilly landscapes of the Amalfi Coast Italy, several distant people are seen walking and enjoying vistas on patios of the dramatic ocean views, the warm glow of the afternoon sun creates a magical and romantic feeling to the scene, the view is stunning captured with beautiful photography. A large orange octopus is seen resting on the bottom of the ocean floor, blending in with the sandy and rocky terrain. Its tentacles are spread out around its body, and its eyes are closed. The octopus is unaware of a king crab that is crawling towards it from behind a rock, its claws raised and ready to attack. The crab is brown and spiny, with long legs and antennae. The scene is captured from a wide angle, showing the vastness and depth of the ocean. The water is clear and blue, with rays of sunlight filtering through. The shot is sharp and crisp, with a high dynamic range. 
The octopus and the crab are in focus, while the background is slightly blurred, creating a depth of field effect. A flock of paper airplanes flutters through a dense jungle, weaving around trees as if they were migrating birds. A cat waking up its sleeping owner demanding breakfast. The owner tries to ignore the cat, but the cat tries new tactics and finally the owner pulls out a secret stash of treats from under the pillow to hold the cat off a little longer. Borneo wildlife on the Kinabatangan River A Chinese Lunar New Year celebration video with Chinese Dragon. Tour of an art gallery with many beautiful works of art in different styles. Beautiful, snowy Tokyo city is bustling. The camera moves through the bustling city street, following several people enjoying the beautiful snowy weather and shopping at nearby stalls. Gorgeous sakura petals are flying through the wind along with snowflakes. A stop motion animation of a flower growing out of the windowsill of a suburban house. The story of a robot’s life in a cyberpunk setting. An extreme close-up of an gray-haired man with a beard in his 60s, he is deep in thought pondering the history of the universe as he sits at a cafe in Paris, his eyes focus on people offscreen as they walk as he sits mostly motionless, he is dressed in a wool coat suit coat with a button-down shirt , he wears a brown beret and glasses and has a very professorial appearance, and the end he offers a subtle closed-mouth smile as if he found the answer to the mystery of life, the lighting is very cinematic with the golden light and the Parisian streets and city in the background, depth of field, cinematic 35mm film. A beautiful silhouette animation shows a wolf howling at the moon, feeling lonely, until it finds its pack. New York City submerged like Atlantis. Fish, whales, sea turtles and sharks swim through the streets of New York. A litter of golden retriever puppies playing in the snow. Their heads pop out of the snow, covered in. 
Step-printing scene of a person running, cinematic film shot in 35mm. Five gray wolf pups frolicking and chasing each other around a remote gravel road, surrounded by grass. The pups run and leap, chasing each other, and nipping at each other, playing. Basketball through hoop then explodes. Archeologists discover a generic plastic chair in the desert, excavating and dusting it with great care. A grandmother with neatly combed grey hair stands behind a colorful birthday cake with numerous candles at a wood dining room table, expression is one of pure joy and happiness, with a happy glow in her eye. She leans forward and blows out the candles with a gentle puff, the cake has pink frosting and sprinkles and the candles cease to flicker, the grandmother wears a light blue blouse adorned with floral patterns, several happy friends and family sitting at the table can be seen celebrating, out of focus. The scene is beautifully captured, cinematic, showing a 3/4 view of the grandmother and the dining room. Warm color tones and soft lighting enhance the mood. The camera directly faces colorful buildings in Burano Italy. An adorable dalmation looks through a window on a building on the ground floor. Many people are walking and cycling along the canal streets in front of the buildings. An adorable happy otter confidently stands on a surfboard wearing a yellow lifejacket, riding along turquoise tropical waters near lush tropical islands, 3D digital render art style. This close-up shot of a chameleon showcases its striking color changing capabilities. The background is blurred, drawing attention to the animal’s striking appearance. A corgi vlogging itself in tropical Maui. A white and orange tabby cat is seen happily darting through a dense garden, as if chasing something. Its eyes are wide and happy as it jogs forward, scanning the branches, flowers, and leaves as it walks. The path is narrow as it makes its way between all the plants. 
the scene is captured from a ground-level angle, following the cat closely, giving a low and intimate perspective. The image is cinematic with warm tones and a grainy texture. The scattered daylight between the leaves and plants above creates a warm contrast, accentuating the cat’s orange fur. The shot is clear and sharp, with a shallow depth of field. Aerial view of Santorini during the blue hour, showcasing the stunning architecture of white Cycladic buildings with blue domes. The caldera views are breathtaking, and the lighting creates a beautiful, serene atmosphere. Tiltshift of a construction site filled with workers, equipment, and heavy machinery. A giant, towering cloud in the shape of a man looms over the earth. The cloud man shoots lighting bolts down to the earth. A Samoyed and a Golden Retriever dog are playfully romping through a futuristic neon city at night. The neon lights emitted from the nearby buildings glistens off of their fur. The Glenfinnan Viaduct is a historic railway bridge in Scotland, UK, that crosses over the west highland line between the towns of Mallaig and Fort William. It is a stunning sight as a steam train leaves the bridge, traveling over the arch-covered viaduct. The landscape is dotted with lush greenery and rocky mountains, creating a picturesque backdrop for the train journey. The sky is blue and the sun is shining, making for a beautiful day to explore this majestic spot. 
def set_grad_checkpoint(model, use_fp32_attention=False, gc_step=1):
    """Mark ``model`` and all of its submodules for gradient checkpointing.

    Three attributes are written onto every module visited by ``model.apply``:
    ``grad_checkpointing`` (always ``True``), ``fp32_attention`` and
    ``grad_checkpointing_step``.

    Args:
        model: the ``nn.Module`` to tag.
        use_fp32_attention: value stored as ``fp32_attention``.
        gc_step: value stored as ``grad_checkpointing_step``.
    """
    assert isinstance(model, nn.Module)

    flags = {
        "grad_checkpointing": True,
        "fp32_attention": use_fp32_attention,
        "grad_checkpointing_step": gc_step,
    }

    def _tag(submodule):
        for attr_name, attr_value in flags.items():
            setattr(submodule, attr_name, attr_value)

    model.apply(_tag)


def auto_grad_checkpoint(module, *args, **kwargs):
    """Call ``module``, going through activation checkpointing if it is tagged.

    A module without a truthy ``grad_checkpointing`` attribute is called
    directly. A tagged iterable module is run through ``checkpoint_sequential``
    using the ``grad_checkpointing_step`` of its first element; any other tagged
    module is run through ``checkpoint``. Both use ``use_reentrant=False``.
    """
    if not getattr(module, "grad_checkpointing", False):
        return module(*args, **kwargs)

    if isinstance(module, Iterable):
        # The step is read from the first child, not from the container itself.
        segments = module[0].grad_checkpointing_step
        return checkpoint_sequential(module, segments, *args, use_reentrant=False, **kwargs)

    return checkpoint(module, *args, use_reentrant=False, **kwargs)
def _gather(
    input_: torch.Tensor,
    world_size: int,
    group: dist.ProcessGroup,
    gather_dim: int,
):
    """Collect ``input_`` from every rank of ``group`` into a list of tensors.

    The previous body read ``gather_list`` before assigning it, so every call
    raised ``UnboundLocalError``, and it forwarded a ``gather_dim`` keyword to
    ``dist.gather``. The receive buffers are now always allocated and the
    exchange uses ``dist.all_gather``, like the other ``_gather`` in this module.

    Args:
        input_: local tensor contributed by this rank.
        world_size: number of ranks in ``group``; one buffer is allocated per rank.
        group: process group to communicate over.
        gather_dim: accepted for signature compatibility only. It is unused
            because the per-rank tensors are returned as a list, not concatenated.

    Returns:
        A list of ``world_size`` tensors shaped like ``input_``.

    NOTE(review): a later ``_gather(input_, pg, dim=-1)`` in this module rebinds
    the name, so this definition is unreachable after import. Confirm whether it
    should be removed or renamed.
    """
    input_ = input_.contiguous()
    gather_list = [torch.empty_like(input_) for _ in range(world_size)]
    dist.all_gather(gather_list, input_, group=group)
    return gather_list
def _gather(input_, pg: dist.ProcessGroup, dim=-1):
    """All-gather ``input_`` across ``pg`` and concatenate the pieces along ``dim``.

    Args:
        input_: local tensor; it is made contiguous before anything else.
        pg: process group to gather over.
        dim: dimension along which the per-rank tensors are concatenated.

    Returns:
        The contiguous input itself when ``pg`` has a single rank, otherwise the
        contiguous concatenation of every rank's tensor along ``dim``.

    Raises:
        AssertionError: if more than one rank is involved and ``input_`` is not
            on a CUDA device.
    """
    input_ = input_.contiguous()
    world_size = dist.get_world_size(pg)

    # Nothing to exchange when only one rank is involved.
    if world_size == 1:
        return input_

    # One receive buffer per rank, filled by the collective.
    tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
    assert input_.device.type == "cuda"
    dist.all_gather(tensor_list, input_, group=pg)

    return torch.cat(tensor_list, dim=dim).contiguous()
# Process groups registered for this process, keyed by role ("data" / "sequence").
_GLOBAL_PARALLEL_GROUPS = {}


def set_data_parallel_group(group: dist.ProcessGroup):
    """Register ``group`` under the "data" key."""
    _GLOBAL_PARALLEL_GROUPS.update(data=group)


def get_data_parallel_group():
    """Return the group registered under "data", or ``dist.group.WORLD`` if none is."""
    fallback = dist.group.WORLD
    return _GLOBAL_PARALLEL_GROUPS.get("data", fallback)


def set_sequence_parallel_group(group: dist.ProcessGroup):
    """Register ``group`` under the "sequence" key."""
    _GLOBAL_PARALLEL_GROUPS.update(sequence=group)


def get_sequence_parallel_group():
    """Return the group registered under "sequence", or ``None`` if none is."""
    return _GLOBAL_PARALLEL_GROUPS.get("sequence")
def __del__(self):
    """Destroy the process groups in ProcessGroupMesh.

    ``pg_mesh`` is only assigned near the end of ``__init__``. If construction
    fails earlier (for example at the ``world_size % sp_size`` assertion), the
    finalizer still runs on the partially built object, so a missing attribute
    must not raise here.
    """
    pg_mesh = getattr(self, "pg_mesh", None)
    if pg_mesh is not None:
        pg_mesh.destroy_mesh_process_groups()
class T5LayerNorm(nn.Module):
    """T5-style layer norm: scale by the reciprocal root mean square, no mean
    subtraction and no bias (https://arxiv.org/abs/1910.07467)."""

    def __init__(self, hidden_size, eps=1e-6):
        """Create the learnable scale (initialised to ones) and store ``eps``."""
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Normalise ``hidden_states`` over the last dimension and apply the scale."""
        # Accumulate the mean of squares in fp32 even when the input is half precision.
        mean_square = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.variance_epsilon)
        normed = hidden_states * inv_rms

        # Cast back down only when the scale itself is stored in half precision.
        if self.weight.dtype in (torch.float16, torch.bfloat16):
            normed = normed.to(self.weight.dtype)

        return self.weight * normed

    @staticmethod
    def from_native_module(module, *args, **kwargs):
        """Build a ``T5LayerNorm`` from a module whose class is named ``FusedRMSNorm``.

        The new layer takes ``normalized_shape`` and ``eps`` from ``module``,
        copies its weight values and is moved to the weight's device.
        """
        source_cls_name = module.__class__.__name__
        assert source_cls_name == "FusedRMSNorm", (
            "Recovering T5LayerNorm requires the original layer to be apex's Fused RMS Norm."
            "Apex's fused norm is automatically used by Hugging Face Transformers https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py#L265C5-L265C48"
        )

        native_weight = module.weight
        rebuilt = T5LayerNorm(module.normalized_shape, eps=module.eps)
        rebuilt.weight.data.copy_(native_weight.data)
        return rebuilt.to(native_weight.device)
def get_h_w(a, ts, eps=1e-4):
    """Return an integer ``(h, w)`` for ratio ``a`` (= h / w) and target area ``ts``.

    Each side is nudged up by ``eps`` and then snapped: the ceiling is used when
    it is even, otherwise the floor. The width is derived from the already
    snapped height.
    """

    def _snap(value):
        upper = math.ceil(value)
        return math.floor(value) if upper % 2 else upper

    h = _snap((ts * a) ** 0.5 + eps)
    w = _snap(h / a + eps)
    return h, w


def get_aspect_ratios_dict(ars, ts=360 * 640):
    """Map each ratio in ``ars``, formatted with two decimals, to its ``get_h_w`` size."""
    sizes = {}
    for ratio in ars:
        sizes[f"{ratio:.2f}"] = get_h_w(ratio, ts)
    return sizes


def get_ar(ratio):
    """Convert an ``"H:W"`` string into the float ``H / W``."""
    height_text, width_text = ratio.split(":")
    height = int(height_text)
    width = int(width_text)
    return height / width
8294400 ASPECT_RATIO_4K = { "0.38": (1764, 4704), "0.43": (1886, 4400), "0.48": (1996, 4158), "0.50": (2036, 4072), "0.53": (2096, 3960), "0.54": (2118, 3918), "0.62": (2276, 3642), "0.56": (2160, 3840), # base "0.67": (2352, 3528), "0.75": (2494, 3326), "1.00": (2880, 2880), "1.33": (3326, 2494), "1.50": (3528, 2352), "1.78": (3840, 2160), "1.89": (3958, 2096), "2.00": (4072, 2036), "2.08": (4156, 1994), } # S = 3686400 ASPECT_RATIO_2K = { "0.38": (1176, 3136), "0.43": (1256, 2930), "0.48": (1330, 2770), "0.50": (1358, 2716), "0.53": (1398, 2640), "0.54": (1412, 2612), "0.56": (1440, 2560), # base "0.62": (1518, 2428), "0.67": (1568, 2352), "0.75": (1662, 2216), "1.00": (1920, 1920), "1.33": (2218, 1664), "1.50": (2352, 1568), "1.78": (2560, 1440), "1.89": (2638, 1396), "2.00": (2716, 1358), "2.08": (2772, 1330), } # S = 2073600 ASPECT_RATIO_1080P = { "0.38": (882, 2352), "0.43": (942, 2198), "0.48": (998, 2080), "0.50": (1018, 2036), "0.53": (1048, 1980), "0.54": (1058, 1958), "0.56": (1080, 1920), # base "0.62": (1138, 1820), "0.67": (1176, 1764), "0.75": (1248, 1664), "1.00": (1440, 1440), "1.33": (1662, 1246), "1.50": (1764, 1176), "1.78": (1920, 1080), "1.89": (1980, 1048), "2.00": (2036, 1018), "2.08": (2078, 998), } # S = 921600 ASPECT_RATIO_720P = { "0.38": (588, 1568), "0.43": (628, 1466), "0.48": (666, 1388), "0.50": (678, 1356), "0.53": (698, 1318), "0.54": (706, 1306), "0.56": (720, 1280), # base "0.62": (758, 1212), "0.67": (784, 1176), "0.75": (832, 1110), "1.00": (960, 960), "1.33": (1108, 832), "1.50": (1176, 784), "1.78": (1280, 720), "1.89": (1320, 698), "2.00": (1358, 680), "2.08": (1386, 666), } # S = 409920 ASPECT_RATIO_480P = { "0.38": (392, 1046), "0.43": (420, 980), "0.48": (444, 925), "0.50": (452, 904), "0.53": (466, 880), "0.54": (470, 870), "0.56": (480, 854), # base "0.62": (506, 810), "0.67": (522, 784), "0.75": (554, 738), "1.00": (640, 640), "1.33": (740, 555), "1.50": (784, 522), "1.78": (854, 480), "1.89": (880, 466), "2.00": 
(906, 454), "2.08": (924, 444), } # S = 230400 ASPECT_RATIO_360P = { "0.38": (294, 784), "0.43": (314, 732), "0.48": (332, 692), "0.50": (340, 680), "0.53": (350, 662), "0.54": (352, 652), "0.56": (360, 640), # base "0.62": (380, 608), "0.67": (392, 588), "0.75": (416, 554), "1.00": (480, 480), "1.33": (554, 416), "1.50": (588, 392), "1.78": (640, 360), "1.89": (660, 350), "2.00": (678, 340), "2.08": (692, 332), } # S = 102240 ASPECT_RATIO_240P = { "0.38": (196, 522), "0.43": (210, 490), "0.48": (222, 462), "0.50": (226, 452), "0.53": (232, 438), "0.54": (236, 436), "0.56": (240, 426), # base "0.62": (252, 404), "0.67": (262, 393), "0.75": (276, 368), "1.00": (320, 320), "1.33": (370, 278), "1.50": (392, 262), "1.78": (426, 240), "1.89": (440, 232), "2.00": (452, 226), "2.08": (462, 222), } # S = 36864 ASPECT_RATIO_144P = { "0.38": (117, 312), "0.43": (125, 291), "0.48": (133, 277), "0.50": (135, 270), "0.53": (139, 262), "0.54": (141, 260), "0.56": (144, 256), # base "0.62": (151, 241), "0.67": (156, 234), "0.75": (166, 221), "1.00": (192, 192), "1.33": (221, 165), "1.50": (235, 156), "1.78": (256, 144), "1.89": (263, 139), "2.00": (271, 135), "2.08": (277, 132), } # from PixArt # S = 8294400 ASPECT_RATIO_2880 = { "0.25": (1408, 5760), "0.26": (1408, 5568), "0.27": (1408, 5376), "0.28": (1408, 5184), "0.32": (1600, 4992), "0.33": (1600, 4800), "0.34": (1600, 4672), "0.40": (1792, 4480), "0.42": (1792, 4288), "0.47": (1920, 4096), "0.49": (1920, 3904), "0.51": (1920, 3776), "0.55": (2112, 3840), "0.59": (2112, 3584), "0.68": (2304, 3392), "0.72": (2304, 3200), "0.78": (2496, 3200), "0.83": (2496, 3008), "0.89": (2688, 3008), "0.93": (2688, 2880), "1.00": (2880, 2880), "1.07": (2880, 2688), "1.12": (3008, 2688), "1.21": (3008, 2496), "1.28": (3200, 2496), "1.39": (3200, 2304), "1.47": (3392, 2304), "1.70": (3584, 2112), "1.82": (3840, 2112), "2.03": (3904, 1920), "2.13": (4096, 1920), "2.39": (4288, 1792), "2.50": (4480, 1792), "2.92": (4672, 1600), "3.00": (4800, 
1600), "3.12": (4992, 1600), "3.68": (5184, 1408), "3.82": (5376, 1408), "3.95": (5568, 1408), "4.00": (5760, 1408), } # S = 4194304 ASPECT_RATIO_2048 = { "0.25": (1024, 4096), "0.26": (1024, 3968), "0.27": (1024, 3840), "0.28": (1024, 3712), "0.32": (1152, 3584), "0.33": (1152, 3456), "0.35": (1152, 3328), "0.40": (1280, 3200), "0.42": (1280, 3072), "0.48": (1408, 2944), "0.50": (1408, 2816), "0.52": (1408, 2688), "0.57": (1536, 2688), "0.60": (1536, 2560), "0.68": (1664, 2432), "0.72": (1664, 2304), "0.78": (1792, 2304), "0.82": (1792, 2176), "0.88": (1920, 2176), "0.94": (1920, 2048), "1.00": (2048, 2048), "1.07": (2048, 1920), "1.13": (2176, 1920), "1.21": (2176, 1792), "1.29": (2304, 1792), "1.38": (2304, 1664), "1.46": (2432, 1664), "1.67": (2560, 1536), "1.75": (2688, 1536), "2.00": (2816, 1408), "2.09": (2944, 1408), "2.40": (3072, 1280), "2.50": (3200, 1280), "2.89": (3328, 1152), "3.00": (3456, 1152), "3.11": (3584, 1152), "3.62": (3712, 1024), "3.75": (3840, 1024), "3.88": (3968, 1024), "4.00": (4096, 1024), } # S = 1048576 ASPECT_RATIO_1024 = { "0.25": (512, 2048), "0.26": (512, 1984), "0.27": (512, 1920), "0.28": (512, 1856), "0.32": (576, 1792), "0.33": (576, 1728), "0.35": (576, 1664), "0.40": (640, 1600), "0.42": (640, 1536), "0.48": (704, 1472), "0.50": (704, 1408), "0.52": (704, 1344), "0.57": (768, 1344), "0.60": (768, 1280), "0.68": (832, 1216), "0.72": (832, 1152), "0.78": (896, 1152), "0.82": (896, 1088), "0.88": (960, 1088), "0.94": (960, 1024), "1.00": (1024, 1024), "1.07": (1024, 960), "1.13": (1088, 960), "1.21": (1088, 896), "1.29": (1152, 896), "1.38": (1152, 832), "1.46": (1216, 832), "1.67": (1280, 768), "1.75": (1344, 768), "2.00": (1408, 704), "2.09": (1472, 704), "2.40": (1536, 640), "2.50": (1600, 640), "2.89": (1664, 576), "3.00": (1728, 576), "3.11": (1792, 576), "3.62": (1856, 512), "3.75": (1920, 512), "3.88": (1984, 512), "4.00": (2048, 512), } # S = 262144 ASPECT_RATIO_512 = { "0.25": (256, 1024), "0.26": (256, 992), "0.27": 
def get_closest_ratio(height: float, width: float, ratios: dict):
    """Pick the aspect-ratio key that best matches ``height / width``.

    Args:
        height: Sample height.
        width: Sample width (must be non-zero).
        ratios: Mapping whose keys are aspect ratios written as strings
            (e.g. ``"0.56"``); the values are not inspected.

    Returns:
        The key of ``ratios`` whose float value is nearest to
        ``height / width``. On ties the key that comes first in the mapping
        wins.
    """
    target = height / width

    def _distance(candidate):
        # keys are strings such as "1.78"; compare them numerically
        return abs(float(candidate) - target)

    return min(ratios, key=_distance)
class Bucket:
    """Assign samples to (resolution, num_frames, aspect-ratio) buckets.

    ``bucket_config`` maps a resolution name (a key of ``ASPECT_RATIOS``) to a
    dict ``{num_frames: (keep_prob, batch_size)}``. ``keep_prob`` is either a
    number or a tuple ``(keep_prob, frame_prob)``; see ``get_bucket_id``.

    Attributes set by ``__init__``:
        bucket_probs: resolution -> {num_frames: keep_prob}; resolutions are
            ordered by pixel count (largest first) and frame counts are
            ordered largest first.
        bucket_bs: resolution -> {num_frames: batch_size}, same ordering.
        bucket_id: resolution -> {num_frames: running integer id}.
        hw_criteria: resolution -> number of pixels.
        t_criteria: resolution -> {num_frames: num_frames}.
        ar_criteria: resolution -> {num_frames: {aspect_ratio: (H, W)}}.
        num_bucket: total number of (resolution, num_frames, aspect_ratio)
            combinations.
    """

    def __init__(self, bucket_config):
        for key in bucket_config:
            assert key in ASPECT_RATIOS, f"Aspect ratio {key} not found."

        # wrap config with OrderedDict
        bucket_probs = OrderedDict()
        bucket_bs = OrderedDict()
        bucket_names = sorted(bucket_config.keys(), key=lambda x: ASPECT_RATIOS[x][0], reverse=True)
        for key in bucket_names:
            bucket_time_names = sorted(bucket_config[key].keys(), key=lambda x: x, reverse=True)
            bucket_probs[key] = OrderedDict({k: bucket_config[key][k][0] for k in bucket_time_names})
            bucket_bs[key] = OrderedDict({k: bucket_config[key][k][1] for k in bucket_time_names})

        # first level: HW
        num_bucket = 0
        hw_criteria = dict()
        t_criteria = dict()
        ar_criteria = dict()
        bucket_id = OrderedDict()
        bucket_id_cnt = 0
        for k1, v1 in bucket_probs.items():
            hw_criteria[k1] = ASPECT_RATIOS[k1][0]
            t_criteria[k1] = dict()
            ar_criteria[k1] = dict()
            bucket_id[k1] = dict()
            # second level: T
            for k2, _ in v1.items():
                t_criteria[k1][k2] = k2
                bucket_id[k1][k2] = bucket_id_cnt
                bucket_id_cnt += 1
                ar_criteria[k1][k2] = dict()
                # third level: aspect ratio
                for k3, v3 in ASPECT_RATIOS[k1][1].items():
                    ar_criteria[k1][k2][k3] = v3
                    num_bucket += 1

        self.bucket_probs = bucket_probs
        self.bucket_bs = bucket_bs
        self.bucket_id = bucket_id
        self.hw_criteria = hw_criteria
        self.t_criteria = t_criteria
        self.ar_criteria = ar_criteria
        self.num_bucket = num_bucket
        get_logger().info("Number of buckets: %s", num_bucket)

    def _make_rng(self, seed, hw_id, t_id):
        """Create the random generator used for one (resolution, frames) bucket.

        With an integer ``seed`` the generator is seeded with
        ``seed + bucket_id`` so the draw is reproducible per bucket. With
        ``seed=None`` (the default of ``get_bucket_id``) an unseeded generator
        is returned instead of failing on ``None + int``.
        """
        if seed is None:
            return np.random.default_rng()
        return np.random.default_rng(seed + self.bucket_id[hw_id][t_id])

    def get_bucket_id(self, T, H, W, frame_interval=1, seed=None):
        """Choose a bucket for a sample with ``T`` frames of size ``H`` x ``W``.

        Resolutions are tried from largest to smallest; one is eligible when
        ``H * W`` is at least 80% of its pixel count. Images (``T == 1``) can
        only go to the ``1``-frame bucket; videos go to the first frame count
        ``t`` (largest first) with ``T > t * frame_interval``. A candidate is
        kept with probability ``keep_prob``; when the configured probability
        is a tuple ``(keep_prob, frame_prob)``, the frame count is first
        skipped with probability ``1 - frame_prob``.

        Args:
            T: Number of frames of the sample (1 for an image).
            H: Sample height.
            W: Sample width.
            frame_interval: Sampling stride between frames.
            seed: Integer base seed for reproducible draws, or ``None`` for
                non-reproducible draws.

        Returns:
            ``(hw_id, t_id, ar_id)`` or ``None`` when no bucket accepts the
            sample.
        """
        resolution = H * W
        approx = 0.8

        fail = True
        for hw_id, t_criteria in self.bucket_probs.items():
            if resolution < self.hw_criteria[hw_id] * approx:
                continue

            # if sample is an image
            if T == 1:
                if 1 in t_criteria:
                    rng = self._make_rng(seed, hw_id, 1)
                    if rng.random() < t_criteria[1]:
                        fail = False
                        t_id = 1
                        break
                else:
                    continue

            # otherwise, find suitable t_id for video
            t_fail = True
            for t_id, prob in t_criteria.items():
                rng = self._make_rng(seed, hw_id, t_id)
                if isinstance(prob, tuple):
                    prob_t = prob[1]
                    if rng.random() > prob_t:
                        continue
                if T > t_id * frame_interval and t_id != 1:
                    t_fail = False
                    break
            if t_fail:
                continue

            # leave the loop if prob is high enough
            # (rng / prob are the ones of the frame count selected above)
            if isinstance(prob, tuple):
                prob = prob[0]
            if prob >= 1 or rng.random() < prob:
                fail = False
                break
        if fail:
            return None

        # get aspect ratio id
        ar_criteria = self.ar_criteria[hw_id][t_id]
        ar_id = get_closest_ratio(H, W, ar_criteria)
        return hw_id, t_id, ar_id

    def get_thw(self, bucket_id):
        """Return ``(T, H, W)`` for a full ``(hw_id, t_id, ar_id)`` bucket id."""
        assert len(bucket_id) == 3
        T = self.t_criteria[bucket_id[0]][bucket_id[1]]
        H, W = self.ar_criteria[bucket_id[0]][bucket_id[1]][bucket_id[2]]
        return T, H, W

    def get_prob(self, bucket_id):
        """Return the configured keep probability of ``(hw_id, t_id, ...)``."""
        return self.bucket_probs[bucket_id[0]][bucket_id[1]]

    def get_batch_size(self, bucket_id):
        """Return the configured batch size of ``(hw_id, t_id, ...)``."""
        return self.bucket_bs[bucket_id[0]][bucket_id[1]]

    def __len__(self):
        return self.num_bucket
def collate_fn_default(batch):
    """Collate a list of sample dicts, skipping samples that are ``None``.

    When the first remaining sample carries an integer ``"mask"`` (the
    pre-computed text-feature case), the ``"mask"`` and ``"text"`` entries are
    removed from every sample before the default collation and added back
    afterwards: masks as a plain list, texts concatenated along dim 1.

    Args:
        batch: List of sample dicts; entries may be ``None``.

    Returns:
        The output of ``torch.utils.data.default_collate`` on the remaining
        samples, plus the ``"mask"`` / ``"text"`` entries described above.
    """
    samples = [item for item in batch if item is not None]

    # HACK: for loading text features
    first = samples[0]
    has_int_mask = "mask" in first and isinstance(first["mask"], int)

    mask_list = None
    text_cat = None
    if has_int_mask:
        mask_list = [item.pop("mask") for item in samples]
        text_cat = torch.cat([item.pop("text") for item in samples], dim=1)

    collated = torch.utils.data.default_collate(samples)

    if has_int_mask:
        collated["mask"] = mask_list
        collated["text"] = text_cat
    return collated
@DATASETS.register_module()
class VideoTextDataset(torch.utils.data.Dataset):
    """Fixed-size video/image dataset described by a metadata table.

    The table is loaded with ``read_file(data_path)``. Every row needs a
    ``path`` column; a ``text`` column is returned with the sample when it
    exists.

    Args:
        data_path: Location of the metadata table.
        num_frames: Number of frames per returned clip (an image is repeated
            this many times).
        frame_interval: Stride passed to ``temporal_random_crop``.
        image_size: Target size passed to the transform builders.
        transform_name: Name passed to ``get_transforms_image`` /
            ``get_transforms_video``.
    """

    def __init__(
        self,
        data_path=None,
        num_frames=16,
        frame_interval=1,
        image_size=(256, 256),
        transform_name="center",
    ):
        self.data_path = data_path
        self.data = read_file(data_path)
        self.get_text = "text" in self.data.columns
        self.num_frames = num_frames
        self.frame_interval = frame_interval
        self.image_size = image_size
        image_transform = get_transforms_image(transform_name, image_size)
        video_transform = get_transforms_video(transform_name, image_size)
        self.transforms = {"image": image_transform, "video": video_transform}

    def _print_data_number(self):
        """Print how many rows are videos and how many are images."""
        kinds = [self.get_type(p) for p in self.data["path"]]
        num_videos = kinds.count("video")
        num_images = len(kinds) - num_videos
        print(f"Dataset contains {num_videos} videos and {num_images} images.")

    def get_type(self, path):
        """Classify ``path`` as ``"video"`` or ``"image"`` by file extension."""
        ext = os.path.splitext(path)[-1].lower()
        if ext in VID_EXTENSIONS:
            return "video"
        assert ext in IMG_EXTENSIONS, f"Unsupported file format: {ext}"
        return "image"

    def getitem(self, index):
        """Load, crop and transform the sample at row ``index``.

        Returns:
            Dict with ``"video"`` (tensor permuted from TCHW to CTHW),
            ``"fps"`` and, when the table has a text column, ``"text"``.
        """
        row = self.data.iloc[index]
        media_path = row["path"]

        if self.get_type(media_path) == "video":
            # loading
            frames, meta = read_video(media_path, backend="av")
            fps = meta.get("video_fps", 24)
            # sampling video frames
            clip = temporal_random_crop(frames, self.num_frames, self.frame_interval)
            # transform
            clip = self.transforms["video"](clip)  # T C H W
        else:
            # loading
            picture = pil_loader(media_path)
            fps = IMG_FPS
            # transform
            picture = self.transforms["image"](picture)
            # repeat the single frame to the requested clip length
            clip = picture.unsqueeze(0).repeat(self.num_frames, 1, 1, 1)

        # TCHW -> CTHW
        item = {"video": clip.permute(1, 0, 2, 3), "fps": fps}
        if self.get_text:
            item["text"] = row["text"]
        return item

    def __getitem__(self, index):
        """Return a sample, retrying with random indices on failure.

        Up to 10 attempts are made; each failure is printed and a new random
        index is drawn.

        Raises:
            RuntimeError: If all 10 attempts fail.
        """
        attempts_left = 10
        while attempts_left > 0:
            attempts_left -= 1
            try:
                return self.getitem(index)
            except Exception as err:
                bad_path = self.data.iloc[index]["path"]
                print(f"data {bad_path}: {err}")
                index = np.random.randint(len(self))
        raise RuntimeError("Too many bad data.")

    def __len__(self):
        return len(self.data)
@DATASETS.register_module()
class VariableVideoTextDataset(VideoTextDataset):
    """Variable-size variant of ``VideoTextDataset``.

    The clip length and the spatial size are not fixed per dataset; they are
    chosen per sample by the batch sampler and passed in through the index
    (see ``getitem``).

    Args:
        data_path: Location of the metadata table.
        num_frames: Forwarded to the base class.
        frame_interval: Stride passed to ``temporal_random_crop``.
        image_size: Forwarded to the base class.
        transform_name: Name passed to the transform builders, together with
            the per-sample ``(height, width)``.
        dummy_text_feature: If true, ``"text"`` is replaced by a zero tensor
            of shape ``(1, 50, 1152)`` and ``"mask"`` is set to ``50``.
    """

    def __init__(
        self,
        data_path=None,
        num_frames=None,
        frame_interval=1,
        image_size=(None, None),
        transform_name=None,
        dummy_text_feature=False,
    ):
        # the base class is always built with transform_name=None; the real
        # transform is created per sample in getitem()
        super().__init__(data_path, num_frames, frame_interval, image_size, transform_name=None)
        self.transform_name = transform_name
        self.data["id"] = np.arange(len(self.data))
        self.dummy_text_feature = dummy_text_feature

    def get_data_info(self, index):
        """Return ``(num_frames, height, width)`` stored in row ``index``."""
        row = self.data.iloc[index]
        return row["num_frames"], row["height"], row["width"]

    def getitem(self, index):
        """Load one sample.

        Args:
            index: String ``"<row>-<num_frames>-<height>-<width>"`` produced
                by the batch sampler.

        Returns:
            Dict with ``"video"`` (CTHW tensor), ``"num_frames"``,
            ``"height"``, ``"width"``, ``"ar"`` (height / width), ``"fps"``
            and optionally ``"text"`` / ``"mask"``.
        """
        # a hack to pass in the (time, height, width) info from sampler
        index, num_frames, height, width = [int(val) for val in index.split("-")]

        sample = self.data.iloc[index]
        path = sample["path"]
        file_type = self.get_type(path)
        ar = height / width

        video_fps = 24  # default fps
        if file_type == "video":
            # loading
            vframes, vinfo = read_video(path, backend="av")
            video_fps = vinfo["video_fps"] if "video_fps" in vinfo else 24

            # Sampling video frames
            video = temporal_random_crop(vframes, num_frames, self.frame_interval)
            # clone so the full decoded frame tensor can be released
            video = video.clone()
            del vframes

            video_fps = video_fps // self.frame_interval

            # transform
            transform = get_transforms_video(self.transform_name, (height, width))
            video = transform(video)  # T C H W
        else:
            # loading
            image = pil_loader(path)
            video_fps = IMG_FPS

            # transform
            transform = get_transforms_image(self.transform_name, (height, width))
            image = transform(image)

            # repeat
            video = image.unsqueeze(0)

        # TCHW -> CTHW
        video = video.permute(1, 0, 2, 3)
        ret = {
            "video": video,
            "num_frames": num_frames,
            "height": height,
            "width": width,
            "ar": ar,
            "fps": video_fps,
        }
        if self.get_text:
            ret["text"] = sample["text"]
        if self.dummy_text_feature:
            text_len = 50
            ret["text"] = torch.zeros((1, text_len, 1152))
            ret["mask"] = text_len
        return ret

    def __getitem__(self, index):
        """Return the sample for ``index`` or ``None`` when loading fails.

        Failures are reported on stdout instead of being dropped silently.
        Only ``Exception`` is caught, so ``KeyboardInterrupt`` and
        ``SystemExit`` still propagate.
        """
        try:
            return self.getitem(index)
        except Exception as e:
            print(f"data {index}: {e}")
            return None
@DATASETS.register_module()
class BatchFeatureDataset(torch.utils.data.Dataset):
    """Dataset of pre-computed batches stored in ``.bin`` files.

    The dataset is composed of multiple .bin files.
    Each .bin file is a list of batch data (like a buffer). All .bin files have the same length.
    In each training iteration, one batch is fetched from the current buffer.
    Once a buffer is consumed, load another one.
    Avoid loading the same .bin on two different GPUs, i.e., one .bin is assigned to one GPU only.

    Args:
        data_path: Directory searched with the glob pattern
            ``data_path + "/**/*.bin"``. At least one file must match.
    """

    def __init__(self, data_path=None):
        self.path_list = sorted(glob(data_path + "/**/*.bin"))

        # the length of the first file is taken as the length of every buffer
        # NOTE(review): torch.load unpickles the file; only use trusted data.
        self._len_buffer = len(torch.load(self.path_list[0]))
        self._num_buffers = len(self.path_list)
        self.num_samples = self.len_buffer * len(self.path_list)

        self.cur_file_idx = -1
        self.cur_buffer = None

    @property
    def num_buffers(self):
        """Number of ``.bin`` files."""
        return self._num_buffers

    @property
    def len_buffer(self):
        """Number of batches stored in one ``.bin`` file."""
        return self._len_buffer

    def _load_buffer(self, idx):
        """Make sure the file that contains sample ``idx`` is the current buffer."""
        file_idx = idx // self.len_buffer
        if file_idx != self.cur_file_idx:
            # load first and commit afterwards, so a failed load does not
            # leave a stale buffer marked as belonging to file_idx
            buffer = torch.load(self.path_list[file_idx])
            self.cur_buffer = buffer
            self.cur_file_idx = file_idx

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        """Return the stored batch ``idx`` with its keys renamed for training."""
        self._load_buffer(idx)

        batch = self.cur_buffer[idx % self.len_buffer]  # dict; keys are {'x', 'fps'} and text related

        ret = {
            "video": batch["x"],
            "text": batch["y"],
            "mask": batch["mask"],
            "fps": batch["fps"],
            "height": batch["height"],
            "width": batch["width"],
            "num_frames": batch["num_frames"],
        }
        return ret
output_format: str = "THWC", ) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, Any]]: """ Reads a video from a file, returning both the video frames and the audio frames This method is modified from torchvision.io.video.read_video, with the following changes: 1. will not extract audio frames and return empty for aframes 2. remove checks and only support pyav 3. add container.close() and gc.collect() to avoid thread leakage 4. try our best to avoid memory leak Args: filename (str): path to the video file start_pts (int if pts_unit = 'pts', float / Fraction if pts_unit = 'sec', optional): The start presentation time of the video end_pts (int if pts_unit = 'pts', float / Fraction if pts_unit = 'sec', optional): The end presentation time pts_unit (str, optional): unit in which start_pts and end_pts values will be interpreted, either 'pts' or 'sec'. Defaults to 'pts'. output_format (str, optional): The format of the output video tensors. Can be either "THWC" (default) or "TCHW". Returns: vframes (Tensor[T, H, W, C] or Tensor[T, C, H, W]): the `T` video frames aframes (Tensor[K, L]): the audio frames, where `K` is the number of channels and `L` is the number of points info (Dict): metadata for the video and audio. 
Can contain the fields video_fps (float) and audio_fps (int) """ # format output_format = output_format.upper() if output_format not in ("THWC", "TCHW"): raise ValueError(f"output_format should be either 'THWC' or 'TCHW', got {output_format}.") # file existence if not os.path.exists(filename): raise RuntimeError(f"File not found: {filename}") # backend check assert get_video_backend() == "pyav", "pyav backend is required for read_video_av" _check_av_available() # end_pts check if end_pts is None: end_pts = float("inf") if end_pts < start_pts: raise ValueError(f"end_pts should be larger than start_pts, got start_pts={start_pts} and end_pts={end_pts}") # == get video info == info = {} # TODO: creating an container leads to memory leak (1G for 8 workers 1 GPU) container = av.open(filename, metadata_errors="ignore") # fps video_fps = container.streams.video[0].average_rate # guard against potentially corrupted files if video_fps is not None: info["video_fps"] = float(video_fps) iter_video = container.decode(**{"video": 0}) frame = next(iter_video).to_rgb().to_ndarray() height, width = frame.shape[:2] total_frames = container.streams.video[0].frames if total_frames == 0: total_frames = MAX_NUM_FRAMES warnings.warn(f"total_frames is 0, using {MAX_NUM_FRAMES} as a fallback") container.close() del container # HACK: must create before iterating stream # use np.zeros will not actually allocate memory # use np.ones will lead to a little memory leak video_frames = np.zeros((total_frames, height, width, 3), dtype=np.uint8) # == read == try: # TODO: The reading has memory leak (4G for 8 workers 1 GPU) container = av.open(filename, metadata_errors="ignore") assert container.streams.video is not None video_frames = _read_from_stream( video_frames, container, start_pts, end_pts, pts_unit, container.streams.video[0], {"video": 0}, filename=filename, ) except av.AVError as e: print(f"[Warning] Error while reading video {filename}: {e}") vframes = 
torch.from_numpy(video_frames).clone() del video_frames if output_format == "TCHW": # [T,H,W,C] --> [T,C,H,W] vframes = vframes.permute(0, 3, 1, 2) aframes = torch.empty((1, 0), dtype=torch.float32) return vframes, aframes, info def _read_from_stream( video_frames, container: "av.container.Container", start_offset: float, end_offset: float, pts_unit: str, stream: "av.stream.Stream", stream_name: Dict[str, Optional[Union[int, Tuple[int, ...], List[int]]]], filename: Optional[str] = None, ) -> List["av.frame.Frame"]: if pts_unit == "sec": # TODO: we should change all of this from ground up to simply take # sec and convert to MS in C++ start_offset = int(math.floor(start_offset * (1 / stream.time_base))) if end_offset != float("inf"): end_offset = int(math.ceil(end_offset * (1 / stream.time_base))) else: warnings.warn("The pts_unit 'pts' gives wrong results. Please use pts_unit 'sec'.") should_buffer = True max_buffer_size = 5 if stream.type == "video": # DivX-style packed B-frames can have out-of-order pts (2 frames in a single pkt) # so need to buffer some extra frames to sort everything # properly extradata = stream.codec_context.extradata # overly complicated way of finding if `divx_packed` is set, following # https://github.com/FFmpeg/FFmpeg/commit/d5a21172283572af587b3d939eba0091484d3263 if extradata and b"DivX" in extradata: # can't use regex directly because of some weird characters sometimes... 
pos = extradata.find(b"DivX") d = extradata[pos:] o = re.search(rb"DivX(\d+)Build(\d+)(\w)", d) if o is None: o = re.search(rb"DivX(\d+)b(\d+)(\w)", d) if o is not None: should_buffer = o.group(3) == b"p" seek_offset = start_offset # some files don't seek to the right location, so better be safe here seek_offset = max(seek_offset - 1, 0) if should_buffer: # FIXME this is kind of a hack, but we will jump to the previous keyframe # so this will be safe seek_offset = max(seek_offset - max_buffer_size, 0) try: # TODO check if stream needs to always be the video stream here or not container.seek(seek_offset, any_frame=False, backward=True, stream=stream) except av.AVError as e: print(f"[Warning] Error while seeking video {filename}: {e}") return [] # == main == buffer_count = 0 frames_pts = [] cnt = 0 try: for _idx, frame in enumerate(container.decode(**stream_name)): frames_pts.append(frame.pts) video_frames[cnt] = frame.to_rgb().to_ndarray() cnt += 1 if cnt >= len(video_frames): break if frame.pts >= end_offset: if should_buffer and buffer_count < max_buffer_size: buffer_count += 1 continue break except av.AVError as e: print(f"[Warning] Error while reading video {filename}: {e}") # garbage collection for thread leakage container.close() del container # NOTE: manually garbage collect to close pyav threads gc.collect() # ensure that the results are sorted wrt the pts # NOTE: here we assert frames_pts is sorted start_ptr = 0 end_ptr = cnt while start_ptr < end_ptr and frames_pts[start_ptr] < start_offset: start_ptr += 1 while start_ptr < end_ptr and frames_pts[end_ptr - 1] > end_offset: end_ptr -= 1 if start_offset > 0 and start_offset not in frames_pts[start_ptr:end_ptr]: # if there is no frame that exactly matches the pts of start_offset # add the last frame smaller than start_offset, to guarantee that # we will have all the necessary data. 
This is most useful for audio if start_ptr > 0: start_ptr -= 1 result = video_frames[start_ptr:end_ptr].copy() return result def read_video_cv2(video_path): cap = cv2.VideoCapture(video_path) if not cap.isOpened(): # print("Error: Unable to open video") raise ValueError else: fps = cap.get(cv2.CAP_PROP_FPS) vinfo = { "video_fps": fps, } frames = [] while True: # Read a frame from the video ret, frame = cap.read() # If frame is not read correctly, break the loop if not ret: break frames.append(frame[:, :, ::-1]) # BGR to RGB # Exit if 'q' is pressed if cv2.waitKey(25) & 0xFF == ord("q"): break # Release the video capture object and close all windows cap.release() cv2.destroyAllWindows() frames = np.stack(frames) frames = torch.from_numpy(frames) # [T, H, W, C=3] frames = frames.permute(0, 3, 1, 2) return frames, vinfo def read_video(video_path, backend="av"): if backend == "cv2": vframes, vinfo = read_video_cv2(video_path) elif backend == "av": vframes, _, vinfo = read_video_av(filename=video_path, pts_unit="sec", output_format="TCHW") else: raise ValueError return vframes, vinfo ================================================ FILE: Open-Sora/build/lib/opensora/datasets/sampler.py ================================================ from collections import OrderedDict, defaultdict from pprint import pformat from typing import Iterator, List, Optional import numpy as np import torch import torch.distributed as dist from torch.utils.data import Dataset, DistributedSampler from opensora.utils.misc import format_numel_str, get_logger from .aspect import get_num_pixels from .bucket import Bucket from .datasets import VariableVideoTextDataset # use pandarallel to accelerate bucket processing # NOTE: pandarallel should only access local variables def apply(data, method=None, frame_interval=None, seed=None, num_bucket=None): return method( data["num_frames"], data["height"], data["width"], frame_interval, seed + data["id"] * num_bucket, ) class 
class StatefulDistributedSampler(DistributedSampler):
    """``DistributedSampler`` that can skip the first ``start_index`` samples.

    ``start_index`` is 0 after construction and after ``reset()``;
    ``load_state_dict`` restores it so iteration can resume mid-epoch.
    """

    def __init__(
        self,
        dataset: Dataset,
        num_replicas: Optional[int] = None,
        rank: Optional[int] = None,
        shuffle: bool = True,
        seed: int = 0,
        drop_last: bool = False,
    ) -> None:
        super().__init__(
            dataset,
            num_replicas=num_replicas,
            rank=rank,
            shuffle=shuffle,
            seed=seed,
            drop_last=drop_last,
        )
        self.start_index: int = 0

    def __iter__(self) -> Iterator:
        # materialize this rank's indices, then drop the consumed prefix
        rank_indices = list(super().__iter__())
        remaining = rank_indices[self.start_index :]
        return iter(remaining)

    def __len__(self) -> int:
        return self.num_samples - self.start_index

    def reset(self) -> None:
        """Forget the resume position."""
        self.start_index = 0

    def state_dict(self, step) -> dict:
        """Return the state to save after ``step`` consumed samples."""
        return dict(start_index=step)

    def load_state_dict(self, state_dict: dict) -> None:
        """Copy every entry of ``state_dict`` onto the instance attributes."""
        vars(self).update(state_dict)
handle droplast bs_per_gpu = self.bucket.get_batch_size(bucket_id) remainder = len(data_list) % bs_per_gpu if remainder > 0: if not self.drop_last: # if there is remainder, we pad to make it divisible data_list += data_list[: bs_per_gpu - remainder] else: # we just drop the remainder to make it divisible data_list = data_list[:-remainder] bucket_sample_dict[bucket_id] = data_list # handle shuffle if self.shuffle: data_indices = torch.randperm(len(data_list), generator=g).tolist() data_list = [data_list[i] for i in data_indices] bucket_sample_dict[bucket_id] = data_list # compute how many micro-batches each bucket has num_micro_batches = len(data_list) // bs_per_gpu bucket_micro_batch_count[bucket_id] = num_micro_batches # compute the bucket access order # each bucket may have more than one batch of data # thus bucket_id may appear more than 1 time bucket_id_access_order = [] for bucket_id, num_micro_batch in bucket_micro_batch_count.items(): bucket_id_access_order.extend([bucket_id] * num_micro_batch) # randomize the access order if self.shuffle: bucket_id_access_order_indices = torch.randperm(len(bucket_id_access_order), generator=g).tolist() bucket_id_access_order = [bucket_id_access_order[i] for i in bucket_id_access_order_indices] # make the number of bucket accesses divisible by dp size remainder = len(bucket_id_access_order) % self.num_replicas if remainder > 0: if self.drop_last: bucket_id_access_order = bucket_id_access_order[: len(bucket_id_access_order) - remainder] else: bucket_id_access_order += bucket_id_access_order[: self.num_replicas - remainder] # prepare each batch from its bucket # according to the predefined bucket access order num_iters = len(bucket_id_access_order) // self.num_replicas start_iter_idx = self.last_micro_batch_access_index // self.num_replicas # re-compute the micro-batch consumption # this is useful when resuming from a state dict with a different number of GPUs self.last_micro_batch_access_index = start_iter_idx * 
self.num_replicas for i in range(self.last_micro_batch_access_index): bucket_id = bucket_id_access_order[i] bucket_bs = self.bucket.get_batch_size(bucket_id) if bucket_id in bucket_last_consumed: bucket_last_consumed[bucket_id] += bucket_bs else: bucket_last_consumed[bucket_id] = bucket_bs for i in range(start_iter_idx, num_iters): bucket_access_list = bucket_id_access_order[i * self.num_replicas : (i + 1) * self.num_replicas] self.last_micro_batch_access_index += self.num_replicas # compute the data samples consumed by each access bucket_access_boundaries = [] for bucket_id in bucket_access_list: bucket_bs = self.bucket.get_batch_size(bucket_id) last_consumed_index = bucket_last_consumed.get(bucket_id, 0) bucket_access_boundaries.append([last_consumed_index, last_consumed_index + bucket_bs]) # update consumption if bucket_id in bucket_last_consumed: bucket_last_consumed[bucket_id] += bucket_bs else: bucket_last_consumed[bucket_id] = bucket_bs # compute the range of data accessed by each GPU bucket_id = bucket_access_list[self.rank] boundary = bucket_access_boundaries[self.rank] cur_micro_batch = bucket_sample_dict[bucket_id][boundary[0] : boundary[1]] # encode t, h, w into the sample index real_t, real_h, real_w = self.bucket.get_thw(bucket_id) cur_micro_batch = [f"{idx}-{real_t}-{real_h}-{real_w}" for idx in cur_micro_batch] yield cur_micro_batch self.reset() def __len__(self) -> int: return self.get_num_batch() // dist.get_world_size() def group_by_bucket(self) -> dict: bucket_sample_dict = OrderedDict() from pandarallel import pandarallel pandarallel.initialize(nb_workers=self.num_bucket_build_workers, progress_bar=False) get_logger().info("Building buckets...") bucket_ids = self.dataset.data.parallel_apply( apply, axis=1, method=self.bucket.get_bucket_id, frame_interval=self.dataset.frame_interval, seed=self.seed + self.epoch, num_bucket=self.bucket.num_bucket, ) # group by bucket # each data sample is put into a bucket with a similar image/video size for i in 
def get_num_batch(self) -> int:
    """Group the dataset into buckets and return the approximate batch count.

    The fresh grouping is stored on
    ``self._get_num_batch_cached_bucket_sample_dict``.

    ``self.approximate_num_batch`` is assigned only inside
    ``_print_bucket_info``. That helper already gates its logging on rank 0
    and ``self.verbose``, so it is called unconditionally here; skipping it
    when ``verbose`` is false would return an unset or stale count.

    Returns:
        int: ``self.approximate_num_batch`` refreshed for the current grouping.
    """
    bucket_sample_dict = self.group_by_bucket()
    self._get_num_batch_cached_bucket_sample_dict = bucket_sample_dict

    # Always refresh the statistics; the helper itself only logs when verbose.
    self._print_bucket_info(bucket_sample_dict)
    return self.approximate_num_batch
class BatchDistributedSampler(DistributedSampler):
    """
    Sampler meant to be used with BatchDataset.

    Each rank walks one contiguous range of sample indices. For example, with
    len_buffer == 5, num_buffers == 6 and 3 GPUs:

           | buffer {i}          | buffer {i+1}
    ------ | ------------------- | -------------------
    rank 0 |  0,  1,  2,  3,  4, |  5,  6,  7,  8,  9
    rank 1 | 10, 11, 12, 13, 14, | 15, 16, 17, 18, 19
    rank 2 | 20, 21, 22, 23, 24, | 25, 26, 27, 28, 29
    """

    def __init__(self, dataset: Dataset, **kwargs):
        super().__init__(dataset, **kwargs)
        # Offset inside this rank's range at which iteration starts.
        self.start_index = 0

    def __iter__(self):
        total_buffers = self.dataset.num_buffers
        buffer_len = self.dataset.len_buffer
        # Whole buffers per rank; a remainder that does not divide evenly is dropped.
        buffers_per_rank = total_buffers // self.num_replicas
        samples_per_rank = buffer_len * buffers_per_rank
        rank_offset = self.rank * samples_per_rank
        local_positions = np.arange(self.start_index, samples_per_rank)
        return iter((local_positions + rank_offset).tolist())

    def reset(self):
        """Restart iteration from the beginning of this rank's range."""
        self.start_index = 0

    def state_dict(self, step) -> dict:
        """Return a resumable state holding ``step`` as the start index."""
        return dict(start_index=step)

    def load_state_dict(self, state_dict: dict):
        """Resume one position after the saved ``start_index``."""
        saved_index = state_dict["start_index"]
        self.start_index = saved_index + 1
def download_url(input_path, timeout=30):
    """Download ``input_path`` into the local ``cache`` directory.

    Args:
        input_path (str): URL to fetch. Its basename becomes the file name
            inside ``cache``.
        timeout (float or tuple): timeout in seconds forwarded to
            ``requests.get`` so a stalled server cannot hang the caller.

    Returns:
        str: path of the downloaded file.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.RequestException: connection failure or timeout.
    """
    output_dir = "cache"
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, os.path.basename(input_path))

    response = requests.get(input_path, timeout=timeout)
    # Fail loudly instead of caching an HTML error page as if it were media.
    response.raise_for_status()

    with open(output_path, "wb") as handler:
        handler.write(response.content)
    print(f"URL {input_path} downloaded to {output_path}")
    return output_path
def get_transforms_image(name="center", image_size=(256, 256)):
    """Build the image preprocessing pipeline selected by ``name``.

    Args:
        name (str | None): ``"center"`` (square center crop),
            ``"resize_crop"`` (resize then crop to fill), or ``None``.
        image_size (tuple): target (height, width); must be square for
            ``"center"``.

    Returns:
        ``None`` when ``name`` is ``None``; otherwise a ``transforms.Compose``
        of the crop step, ``ToTensor`` and ``Normalize`` with mean and std 0.5.

    Raises:
        NotImplementedError: ``name`` is not one of the supported values.
    """
    if name is None:
        return None

    # Only the first (cropping) step differs between the supported pipelines.
    if name == "center":
        assert image_size[0] == image_size[1], "Image size must be square for center crop"
        crop_step = transforms.Lambda(lambda pil_image: center_crop_arr(pil_image, image_size[0]))
    elif name == "resize_crop":
        crop_step = transforms.Lambda(lambda pil_image: resize_crop_to_fill(pil_image, image_size))
    else:
        raise NotImplementedError(f"Transform {name} not implemented")

    pipeline = [
        crop_step,
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
    ]
    return transforms.Compose(pipeline)
def save_sample(x, save_path=None, fps=8, normalize=True, value_range=(-1, 1), force_video=False, verbose=True):
    """
    Save a sample as a PNG image (single frame) or as an H.264 MP4 video.

    Args:
        x (Tensor): shape [C, T, H, W]. The video branch works on a copy, so
            the caller's tensor is left untouched.
        save_path (str): output path WITHOUT extension; ".png" or ".mp4" is
            appended. Must be provided (the ``None`` default cannot be extended).
        fps (int): frame rate used for video output.
        normalize (bool): map ``value_range`` to [0, 1] before saving.
        value_range (tuple): (low, high) range of the values in ``x``.
        force_video (bool): write a video even when T == 1.
        verbose (bool): print the final path.

    Returns:
        str: the path that was written, including its extension.
    """
    assert x.ndim == 4

    if not force_video and x.shape[1] == 1:  # T = 1: save as image
        save_path += ".png"
        x = x.squeeze(1)
        save_image([x], save_path, normalize=normalize, value_range=value_range)
    else:
        save_path += ".mp4"
        if normalize:
            low, high = value_range
            # clamp() (not clamp_()) allocates a new tensor, so the in-place
            # ops chained after it never write into the caller's data.
            x = x.clamp(min=low, max=high).sub_(low).div_(max(high - low, 1e-5))

        # [C, T, H, W] in [0, 1] -> uint8 [T, H, W, C] on CPU for write_video
        x = x.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 3, 0).to("cpu", torch.uint8)
        write_video(save_path, x, fps=fps, video_codec="h264")
    if verbose:
        print(f"Saved to {save_path}")
    return save_path
def resize_crop_to_fill(pil_image, image_size):
    """Resize a PIL image to cover ``image_size`` and center-crop the overflow.

    The image is scaled (bicubic) by the larger of the two axis ratios, so one
    axis matches the target exactly and the other is at least as large; the
    surplus on that axis is then cropped around the center.

    Args:
        pil_image: PIL image; ``.size`` is read as (W, H).
        image_size: target size as (height, width).

    Returns:
        PIL image of exactly ``image_size``.
    """
    src_w, src_h = pil_image.size  # PIL is (W, H)
    dst_h, dst_w = image_size
    ratio_h = dst_h / src_h
    ratio_w = dst_w / src_w

    if ratio_h > ratio_w:
        # Height is the binding axis: match it and crop the extra width.
        new_h, new_w = dst_h, round(src_w * ratio_h)
        top, left = 0, int(round((new_w - dst_w) / 2.0))
    else:
        # Width is the binding axis: match it and crop the extra height.
        new_h, new_w = round(src_h * ratio_w), dst_w
        top, left = int(round((new_h - dst_h) / 2.0)), 0

    resized = pil_image.resize((new_w, new_h), Image.BICUBIC)
    pixels = np.array(resized)
    assert top + dst_h <= pixels.shape[0] and left + dst_w <= pixels.shape[1]
    return Image.fromarray(pixels[top : top + dst_h, left : left + dst_w])
# See the License for the specific language governing permissions and # limitations under the License.# Modified from Latte # - This file is adapted from https://github.com/Vchitect/Latte/blob/main/datasets/video_transforms.py import numbers import random import numpy as np import torch def _is_tensor_video_clip(clip): if not torch.is_tensor(clip): raise TypeError("clip should be Tensor. Got %s" % type(clip)) if not clip.ndimension() == 4: raise ValueError("clip should be 4D. Got %dD" % clip.dim()) return True def crop(clip, i, j, h, w): """ Args: clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W) """ if len(clip.size()) != 4: raise ValueError("clip should be a 4D tensor") return clip[..., i : i + h, j : j + w] def resize(clip, target_size, interpolation_mode): if len(target_size) != 2: raise ValueError(f"target size should be tuple (height, width), instead got {target_size}") return torch.nn.functional.interpolate(clip, size=target_size, mode=interpolation_mode, align_corners=False) def resize_scale(clip, target_size, interpolation_mode): if len(target_size) != 2: raise ValueError(f"target size should be tuple (height, width), instead got {target_size}") H, W = clip.size(-2), clip.size(-1) scale_ = target_size[0] / min(H, W) return torch.nn.functional.interpolate(clip, scale_factor=scale_, mode=interpolation_mode, align_corners=False) def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"): """ Do spatial cropping and resizing to the video clip Args: clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W) i (int): i in (i,j) i.e coordinates of the upper left corner. j (int): j in (i,j) i.e coordinates of the upper left corner. h (int): Height of the cropped region. w (int): Width of the cropped region. size (tuple(int, int)): height and width of resized clip Returns: clip (torch.tensor): Resized and cropped clip. 
def resize_crop_to_fill(clip, target_size):
    """Resize a video clip to cover ``target_size`` and center-crop the overflow.

    The clip is scaled (bilinear) by the larger of the two axis ratios, so one
    spatial axis matches the target exactly; the surplus on the other axis is
    cropped around the center.

    Args:
        clip (torch.tensor): 4D video clip whose last two dims are (H, W).
        target_size (tuple): target (height, width).

    Returns:
        torch.tensor: clip whose last two dims equal ``target_size``.
    """
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    h, w = clip.size(-2), clip.size(-1)
    th, tw = target_size[0], target_size[1]
    rh, rw = th / h, tw / w

    # The halving must happen INSIDE round(); the previous
    # ``int(round(sw - tw) / 2.0)`` truncated instead of rounding, which
    # disagreed with center_crop() and with the PIL resize_crop_to_fill.
    if rh > rw:
        sh, sw = th, round(w * rh)
        i, j = 0, int(round((sw - tw) / 2.0))
    else:
        sh, sw = round(h * rw), tw
        i, j = int(round((sh - th) / 2.0)), 0

    clip = resize(clip, (sh, sw), "bilinear")
    assert i + th <= clip.size(-2) and j + tw <= clip.size(-1)
    return crop(clip, i, j, th, tw)
def normalize(clip, mean, std, inplace=False):
    """
    Subtract ``mean`` and divide by ``std`` on a 4D video clip.

    NOTE(review): ``mean`` and ``std`` are indexed as ``[:, None, None, None]``,
    so they broadcast along dim 0 of ``clip``. That matches a channel-first
    (C, T, H, W) layout; confirm no caller passes (T, C, H, W) clips here.

    Args:
        clip (torch.tensor): 4D video clip to be normalized.
        mean (tuple): per-channel mean.
        std (tuple): per-channel standard deviation.
        inplace (bool): modify ``clip`` directly instead of a clone.
    Returns:
        normalized clip (torch.tensor): same shape as ``clip``.
    """
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")

    target = clip if inplace else clip.clone()
    mean_t = torch.as_tensor(mean, dtype=target.dtype, device=target.device)
    std_t = torch.as_tensor(std, dtype=target.dtype, device=target.device)
    target.sub_(mean_t[:, None, None, None])
    target.div_(std_t[:, None, None, None])
    return target
class CenterCropResizeVideo:
    """
    First use the short side for cropping length,
    center crop video, then resize to the specified size

    Args:
        size (int | tuple | list): output size. A single value is used for
            both height and width; a 2-sequence is taken as (height, width).
        interpolation_mode (str): mode forwarded to ``resize``.
    """

    def __init__(
        self,
        size,
        interpolation_mode="bilinear",
    ):
        # Lists are accepted as well as tuples; previously a list fell through
        # to the scalar branch and became ([h, w], [h, w]).
        if isinstance(size, (tuple, list)):
            if len(size) != 2:
                raise ValueError(f"size should be tuple (height, width), instead got {size}")
            self.size = tuple(size)
        else:
            self.size = (size, size)

        self.interpolation_mode = interpolation_mode

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
        Returns:
            torch.tensor: scale resized / center cropped video clip.
                size is (T, C, crop_size, crop_size)
        """
        clip_center_crop = center_crop_using_short_edge(clip)
        clip_center_crop_resize = resize(
            clip_center_crop, target_size=self.size, interpolation_mode=self.interpolation_mode
        )
        return clip_center_crop_resize

    def __repr__(self) -> str:
        # The closing parenthesis was missing from the original format string.
        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode})"
class KineticsRandomCropResizeVideo:
    """
    Slide along the long edge, with the short edge as crop size, and resize to the desired size.
    """

    def __init__(
        self,
        size,
        interpolation_mode="bilinear",
    ):
        if not isinstance(size, tuple):
            # A single value is used for both height and width.
            size = (size, size)
        elif len(size) != 2:
            raise ValueError(f"size should be tuple (height, width), instead got {size}")

        self.size = size
        self.interpolation_mode = interpolation_mode

    def __call__(self, clip):
        """Randomly crop ``clip`` via ``random_shift_crop``, then resize it to ``self.size``."""
        cropped = random_shift_crop(clip)
        return resize(cropped, self.size, self.interpolation_mode)
class NormalizeVideo:
    """
    Callable wrapper around ``normalize``: mean subtraction followed by
    division by the standard deviation.

    Args:
        mean (3-tuple): pixel RGB mean
        std (3-tuple): pixel RGB standard deviation
        inplace (boolean): whether do in-place normalization
    """

    def __init__(self, mean, std, inplace=False):
        self.mean, self.std, self.inplace = mean, std, inplace

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): video clip to be normalized. Size is (C, T, H, W)
        """
        return normalize(clip, mean=self.mean, std=self.std, inplace=self.inplace)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(mean={self.mean}, std={self.std}, inplace={self.inplace})"
class TemporalRandomCrop(object):
    """Pick a random temporal window of frame indices.

    Args:
        size (int): Desired number of frames in the window.
    """

    def __init__(self, size):
        self.size = size

    def __call__(self, total_frames):
        """Return ``(begin, end)`` with ``end - begin <= self.size``.

        NOTE(review): ``random.randint`` is inclusive on both ends and the
        upper bound is ``total_frames - size - 1``, so the last possible
        window (the one ending at ``total_frames``) is never drawn when the
        clip is longer than ``size``. Confirm whether that is intended.
        """
        last_start = total_frames - self.size - 1
        if last_start < 0:
            # Clip shorter than (or about as long as) the window: start at 0.
            last_start = 0
        start = random.randint(0, last_start)

        stop = start + self.size
        if stop > total_frames:
            stop = total_frames
        return start, stop
def cached_attention_forward(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_bias: Optional[Union[torch.Tensor, "BlockDiagonalMask"]] = None,
    p: float = 0.0,
    scale: Optional[float] = None,
) -> "tuple[torch.Tensor, torch.Tensor]":
    """Plain (materialized) scaled dot-product attention that also returns the attention map.

    Dims 1 and 2 of ``query``/``key``/``value`` are swapped before the matmul
    and swapped back on the output.
    NOTE(review): this presumably means inputs are laid out as
    (batch, tokens, heads, head_dim), the xformers convention; confirm with callers.

    Args:
        query, key, value: 4D attention inputs (see layout note above).
        attn_bias: additive bias for the attention logits. A tensor is added
            as-is; any other object is expected to provide
            ``materialize(shape=..., dtype=..., device=...)``.
        p: dropout probability applied to the attention weights. ``F.dropout``
            is called with its default ``training=True``, so dropout is active
            whenever ``p > 0``.
        scale: logit scale; defaults to ``1 / sqrt(query.shape[-1])``.
            (Previously this argument was silently overwritten.)

    Returns:
        tuple: ``(output, attn_map)`` where ``output`` has the layout of
        ``query`` and ``attn_map`` is the pre-dropout softmax averaged over
        dim 1 of the transposed layout.
    """
    if scale is None:
        scale = 1.0 / query.shape[-1] ** 0.5

    query = (query * scale).transpose(1, 2)
    key = key.transpose(1, 2)
    value = value.transpose(1, 2)

    attn = torch.matmul(query, key.transpose(-2, -1))
    if attn_bias is not None:
        # Tensors have no materialize(); only mask objects need expanding.
        if not isinstance(attn_bias, torch.Tensor):
            attn_bias = attn_bias.materialize(shape=attn.shape, dtype=attn.dtype, device=attn.device)
        attn = attn + attn_bias

    attn_map = attn.softmax(-1)
    attn = F.dropout(attn_map, p)
    out = torch.matmul(attn, value)
    return out.transpose(1, 2).contiguous(), attn_map.mean(dim=1)
#后1-fresh_ratio的token的index # (B, fresh_ratio *N) # stale tokens index + 1 in each ***module***, fresh tokens index = 0 cache_dic['cache_index'][current['flag']][layer][module] += 1 cache_dic['cache_index'][current['flag']][layer][module].scatter_(dim=1, index=fresh_indices, src = torch.zeros_like(fresh_indices, dtype=torch.int, device=fresh_indices.device)) cache_dic['cache_index']['layer_index'][module] += 1 cache_dic['cache_index']['layer_index'][module].scatter_(dim=1, index=fresh_indices, src = torch.zeros_like(fresh_indices, dtype=torch.int, device=fresh_indices.device)) # 0.08ms # select the fresh tokens out fresh_indices_expand = fresh_indices.unsqueeze(-1).expand(-1, -1, tokens.shape[-1]) #stale_indices_expand = stale_indices.unsqueeze(-1).expand(-1, -1, tokens.shape[-1]) #if cache_dic['merge_weight'] != 0: # token_merge(cache_dic, tokens, current, fresh_indices, stale_indices) if module in ['mlp', 'attn', 'cross-attn']: fresh_tokens = torch.gather(input = tokens, dim = 1, index = fresh_indices_expand) # 0.10ms #torch.cuda.synchronize() #print(tick1.elapsed_time(tick2)) return fresh_indices, fresh_tokens else: raise ValueError("Unrecognized module?", module) import torch from einops import rearrange def local_selection_with_space_time_bonus(cache_dic, score, bonus_ratio, grid_size=2, time_mean = False): # 从 cache_dic 中获取张量的形状 B, T, H, W = cache_dic['dynamic_size'] # 对 score 进行变形,将其重塑为 [B, T, H, W] 的形状 score = rearrange(score, "B (T H W) -> B T H W", T=T, H=H, W=W) # 计算补 0 的尺寸,使得 H 和 W 都能被 grid_size 整除 pad_h = (grid_size - H % grid_size) % grid_size # H 维度需要补充的 0 的数量 pad_w = (grid_size - W % grid_size) % grid_size # W 维度需要补充的 0 的数量 # 对 H 和 W 维度进行补 0 if pad_h > 0 or pad_w > 0: score = torch.nn.functional.pad(score, (0, pad_w, 0, pad_h)) # (W 左右补 pad_w, H 上下补 pad_h) # 更新补 0 后的 H 和 W H_padded, W_padded = score.shape[2], score.shape[3] # Step 1: 在 H*W 维度上进行归一化,使得不同时间步的信息权重相同 score = score.view(B, T, -1) # 将 H 和 W 合并为一个维度 [B, T, H*W] score = 
def cache_init(model_kwargs, num_steps, num_layers=28):
    """
    Build the empty cache dictionary and the ``current`` state dictionary.

    Args:
        model_kwargs: mapping that must contain the keys ``cache_type``,
            ``ratio_scheduler``, ``fresh_ratio``, ``fresh_threshold``,
            ``force_fresh`` and ``soft_fresh_weight``.
        num_steps: total number of sampling steps, stored in ``current``.
        num_layers: number of layers to allocate slots for (default 28, the
            value that was previously hard-coded).

    Returns:
        ``(cache_dic, current)``. Every per-flag container holds one empty dict
        per layer index under the flags ``-1`` and ``0``.
        NOTE(review): elsewhere in this package flag ``-1`` selects the
        'temp-attn' threshold and other flags 'spat-attn' - confirm the intended
        meaning with the model code.

    Raises:
        KeyError: if a required key is missing from ``model_kwargs``.
    """
    flags = (-1, 0)

    def per_layer_slots():
        # One independent empty dict per (flag, layer).
        return {flag: {layer: {} for layer in range(num_layers)} for flag in flags}

    cache = per_layer_slots()
    # Previously the loop reassigned indices_cache[flag] = {} on every iteration,
    # so no per-layer entries were ever created; allocate them like the others.
    indices_cache = per_layer_slots()
    cache_index = per_layer_slots()
    cache_index['layer_index'] = {}

    cache_dic = {}
    cache_dic['attn_map'] = per_layer_slots()
    cache_dic['cross_attn_map'] = per_layer_slots()
    cache_dic['cache_type'] = model_kwargs['cache_type']
    cache_dic['cache_index'] = cache_index
    cache_dic['cache'] = cache
    cache_dic['indices_cache'] = indices_cache
    cache_dic['fresh_ratio_schedule'] = model_kwargs['ratio_scheduler']
    cache_dic['fresh_ratio'] = model_kwargs['fresh_ratio']
    cache_dic['fresh_threshold'] = model_kwargs['fresh_threshold']
    cache_dic['force_fresh'] = model_kwargs['force_fresh']
    cache_dic['soft_fresh_weight'] = model_kwargs['soft_fresh_weight']

    current = {}
    current['num_steps'] = num_steps
    return cache_dic, current
def fresh_ratio_scheduler(cache_dic, current):
    """
    Compute the fraction of tokens to recompute for the current call.

    Reads ``fresh_ratio``, ``fresh_ratio_schedule`` and ``fresh_threshold``
    from ``cache_dic`` and ``step`` / ``num_steps`` (plus ``layer``,
    ``module`` and ``flag`` for the layer-aware schedules) from ``current``.

    The returned value is not clamped here; some schedules can produce values
    outside [0, 1] (e.g. a negative factor for 'cross-attn' under
    'linear-layerwise').

    Raises:
        ValueError: if the schedule name is not recognised.
    """
    base_ratio = cache_dic['fresh_ratio']
    schedule = cache_dic['fresh_ratio_schedule']
    step = current['step']
    num_steps = current['num_steps']
    threshold = cache_dic['fresh_threshold']
    weight = 0.9

    if schedule == 'constant':
        return base_ratio

    if schedule == 'linear':
        # Decays linearly from (1 + weight) to (1 - weight) times the base ratio.
        return base_ratio * (1 + weight - 2 * weight * step / num_steps)

    if schedule == 'exp':
        return base_ratio * (weight ** (step / num_steps))

    if schedule == 'linear-mode':
        # Linear decay plus a small sawtooth with period `threshold` steps.
        mode = (step % threshold) / threshold - 0.5
        mode_weight = 0.1
        return base_ratio * (1 + weight - 2 * weight * step / num_steps + mode_weight * mode)

    if schedule == 'layerwise':
        # 27 is the hard-coded last layer index used by this schedule.
        return base_ratio * (1 + weight - 2 * weight * current['layer'] / 27)

    if schedule == 'linear-layerwise':
        step_weight = 0.0
        step_factor = 1 + step_weight - 2 * step_weight * step / num_steps

        layer_weight = 0.0
        layer_factor = 1 + layer_weight - 2 * layer_weight * current['layer'] / 27

        module_weight = 1.5
        module_time_weight = 0.33
        if current['module'] == 'cross-attn':
            module_factor = 1 - (1 - module_time_weight) * module_weight
        else:
            module_factor = 1 + module_time_weight * module_weight

        type_weight = 0.0
        if current['flag'] == -1:
            type_factor = 1 + type_weight
        else:
            type_factor = 1 - type_weight

        return base_ratio * layer_factor * step_factor * module_factor * type_factor

    raise ValueError("unrecognized fresh ratio schedule", schedule)
def score_evaluate(cache_dic, tokens, current) -> torch.Tensor:
    """
    Return the per-token score tensor used to pick which tokens to refresh.

    Args:
        cache_dic: cache dictionary; reads ``cache_type``, ``force_fresh``,
            ``fresh_threshold``, ``soft_fresh_weight``, ``cache_index`` and,
            for 'compress', ``attn_map``.
        tokens: token tensor; dims 0 and 1 give the shape of the returned score.
            NOTE(review): presumably (B, N, D) with the batch holding the
            cond/uncond halves of classifier-free guidance - confirm.
        current: state dict with ``flag``, ``layer``, ``module`` and
            ``is_force_fresh``.

    Returns:
        Score tensor of shape ``(tokens.shape[0], tokens.shape[1])`` on
        ``tokens.device``.

    Raises:
        ValueError: if ``cache_dic['cache_type']`` is not a known strategy
            (previously this surfaced as an ``UnboundLocalError``).
    """
    # if/elif is used instead of match/case because match needs Python 3.10+.
    local_force = (not current['is_force_fresh']) and (cache_dic['force_fresh'] == 'local')

    if local_force:
        # Find the tokens whose staleness counter reached the threshold.
        # The factor 2 is because the threshold counts steps, not modules.
        stale_counter = cache_dic['cache_index'][current['flag']][current['layer']][current['module']]
        force_fresh_mask = (stale_counter >= 2 * cache_dic['fresh_threshold']).to(torch.int64)
        force_len = force_fresh_mask.sum(dim=1)
        # Keep the same number of forced tokens per row (the batch minimum).
        force_indices = force_fresh_mask.argsort(dim=-1, descending=True)[:, :int(force_len.min())]
        # Randomly permute along the last dimension.
        force_indices = force_indices[:, torch.randperm(force_indices.shape[1])]

    cache_type = cache_dic['cache_type']
    if cache_type == 'random':
        # Draw scores for half the batch and duplicate them for the other half.
        score = torch.rand(int(tokens.shape[0] * 0.5), tokens.shape[1], device=tokens.device)
        score = torch.cat([score, score], dim=0).to(tokens.device)
    elif cache_type == 'straight':
        score = torch.ones(tokens.shape[0], tokens.shape[1]).to(tokens.device)
    elif cache_type == 'attention':
        score = attn_score(cache_dic, current)
    elif cache_type == 'similarity':
        score = similarity_score(cache_dic, current, tokens)
    elif cache_type == 'norm':
        score = norm_score(cache_dic, current, tokens)
    elif cache_type == 'compress':
        score1 = torch.rand(int(tokens.shape[0] * 0.5), tokens.shape[1])
        score1 = torch.cat([score1, score1], dim=0).to(tokens.device)
        score2 = cache_dic['attn_map'][current['flag']][current['layer']].sum(dim=1)
        # Normalize each row by its maximum.
        score2 = score2 / score2.max(dim=1, keepdim=True)[0]
        score = 0.5 * score1 + 0.5 * score2
    else:
        raise ValueError("unrecognized cache type", cache_type)

    if local_force:
        # is_force_fresh is False here: when it is True no cut/refresh happens.
        # The source must match the score dtype or scatter_ rejects it; the old
        # hard-coded float32 failed for half-precision scores.
        forced_value = torch.ones_like(force_indices, dtype=score.dtype, device=force_indices.device)
        score.scatter_(dim=1, index=force_indices, src=forced_value)

    if cache_dic['force_fresh'] == 'global':
        # Soft bonus growing with the steps since each token was last refreshed.
        stale_counter = cache_dic['cache_index'][current['flag']][current['layer']][current['module']]
        soft_step_score = stale_counter.float() / (cache_dic['fresh_threshold'])
        score = score + cache_dic['soft_fresh_weight'] * soft_step_score

    return score.to(tokens.device)
def similarity_score(cache_dic, current, tokens):
    """
    Score tokens by how much they differ from their cached counterparts.

    Computes ``1 - cosine_similarity`` between ``tokens`` and the cached tensor
    stored under ``cache_dic['cache'][flag][layer][module]`` along the last
    dimension, then L2-normalizes the result along its last dimension.
    """
    cached_tokens = cache_dic['cache'][current['flag']][current['layer']][current['module']]
    dissimilarity = 1 - F.cosine_similarity(tokens, cached_tokens, dim=-1)
    return F.normalize(dissimilarity, dim=-1, p=2)
def update_cache(fresh_indices, fresh_tokens, cache_dic, current, fresh_attn_map=None):
    """
    Write freshly computed tokens (and attention-map rows) back into the cache.

    Args:
        fresh_indices (torch.Tensor): indices along dim 1 of the tokens that
            were recomputed. NOTE(review): presumably (B, K), ordered from
            highest to lowest score - confirm against the caller.
        fresh_tokens (torch.Tensor): recomputed tokens, scattered into
            ``cache_dic['cache'][flag][layer][module]`` along dim 1.
        cache_dic (dict): cache dictionary holding ``cache``, ``attn_map`` and
            ``cross_attn_map``.
        current (dict): state with ``flag``, ``layer`` and ``module``.
        fresh_attn_map (torch.Tensor): attention-map rows for the fresh tokens.
            Required for the 'attn' and 'cross-attn' modules; the rows are
            expected in the same order as ``fresh_indices``.

    Raises:
        ValueError: if ``current['module']`` is not 'attn', 'cross-attn' or
            'mlp' (previously this fell through to an unbound local).
    """
    layer = current['layer']
    module = current['module']
    flag = current['flag']

    # Pick which attention-map store (if any) accompanies this module.
    if module == 'attn':
        map_key = 'attn_map'
    elif module == 'cross-attn':
        map_key = 'cross_attn_map'
    elif module == 'mlp':
        map_key = None
    else:
        raise ValueError("Unrecognized module?", module)

    if map_key is not None:
        map_index = fresh_indices.unsqueeze(-1).expand(-1, -1, fresh_attn_map.shape[-1])
        cache_dic[map_key][flag][layer].scatter_(dim=1, index=map_index, src=fresh_attn_map)

    # Overwrite the cached tokens at the refreshed positions.
    token_index = fresh_indices.unsqueeze(-1).expand(-1, -1, fresh_tokens.shape[-1])
    cache_dic['cache'][flag][layer][module].scatter_(dim=1, index=token_index, src=fresh_tokens)
# -------------------------------------------------------- # References: # DiT: https://github.com/facebookresearch/DiT/tree/main # GLIDE: https://github.com/openai/glide-text2im # MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py # -------------------------------------------------------- import numpy as np import torch import torch.nn as nn import torch.utils.checkpoint from einops import rearrange from timm.models.vision_transformer import Mlp from opensora.acceleration.checkpoint import auto_grad_checkpoint from opensora.models.layers.blocks import ( Attention, CaptionEmbedder, FinalLayer, LabelEmbedder, PatchEmbed3D, TimestepEmbedder, approx_gelu, get_1d_sincos_pos_embed, get_2d_sincos_pos_embed, get_layernorm, modulate, ) from opensora.registry import MODELS from opensora.utils.ckpt_utils import load_checkpoint class DiTBlock(nn.Module): """ A DiT block with adaptive layer norm zero (adaLN-Zero) conditioning. """ def __init__( self, hidden_size, num_heads, mlp_ratio=4.0, enable_flash_attn=False, enable_layernorm_kernel=False, ): super().__init__() self.hidden_size = hidden_size self.num_heads = num_heads self.enable_flash_attn = enable_flash_attn mlp_hidden_dim = int(hidden_size * mlp_ratio) self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel) self.attn = Attention( hidden_size, num_heads=num_heads, qkv_bias=True, enable_flash_attn=enable_flash_attn, ) self.norm2 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel) self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0) self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True)) def forward(self, x, c): shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=1) x = x + gate_msa.unsqueeze(1) * self.attn(modulate(self.norm1, x, shift_msa, scale_msa)) x = x + gate_mlp.unsqueeze(1) 
@MODELS.register_module()
class DiT(nn.Module):
    """
    Diffusion model with a Transformer backbone.

    Inputs are patchified in 3D (time, height, width), given spatial and
    (optionally) temporal sin-cos position embeddings, conditioned on the sum of
    the timestep embedding and a label/caption embedding, and passed through
    ``depth`` DiT blocks before being unpatchified.
    """

    def __init__(
        self,
        input_size=(16, 32, 32),
        in_channels=4,
        patch_size=(1, 2, 2),
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        learn_sigma=True,
        condition="text",
        no_temporal_pos_emb=False,
        caption_channels=512,
        model_max_length=77,
        dtype=torch.float32,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
    ):
        # NOTE: model_max_length is accepted but not used by this class.
        super().__init__()
        self.learn_sigma = learn_sigma
        self.in_channels = in_channels
        # When sigma is learned, the output carries twice the input channels.
        self.out_channels = in_channels * 2 if learn_sigma else in_channels
        self.hidden_size = hidden_size
        self.patch_size = patch_size
        self.input_size = input_size
        num_patches = np.prod([input_size[i] // patch_size[i] for i in range(3)])
        self.num_patches = num_patches
        self.num_temporal = input_size[0] // patch_size[0]
        self.num_spatial = num_patches // self.num_temporal
        self.num_heads = num_heads
        self.dtype = dtype
        # Conditions starting with "label" (e.g. "label_<num_classes>") use a
        # class-label embedder; anything else uses the caption embedder.
        self.use_text_encoder = not condition.startswith("label")
        if enable_flash_attn:
            assert dtype in [
                torch.float16,
                torch.bfloat16,
            ], f"Flash attention only supports float16 and bfloat16, but got {self.dtype}"
        self.no_temporal_pos_emb = no_temporal_pos_emb
        self.mlp_ratio = mlp_ratio
        self.depth = depth
        assert enable_sequence_parallelism is False, "Sequence parallelism is not supported in DiT"

        self.register_buffer("pos_embed_spatial", self.get_spatial_pos_embed())
        self.register_buffer("pos_embed_temporal", self.get_temporal_pos_embed())

        self.x_embedder = PatchEmbed3D(patch_size, in_channels, embed_dim=hidden_size)
        if not self.use_text_encoder:
            num_classes = int(condition.split("_")[-1])
            self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)
        else:
            self.y_embedder = CaptionEmbedder(
                in_channels=caption_channels,
                hidden_size=hidden_size,
                uncond_prob=class_dropout_prob,
                act_layer=approx_gelu,
                token_num=1,  # pooled token
            )
        self.t_embedder = TimestepEmbedder(hidden_size)
        self.blocks = nn.ModuleList(
            [
                DiTBlock(
                    hidden_size,
                    num_heads,
                    mlp_ratio=mlp_ratio,
                    enable_flash_attn=enable_flash_attn,
                    enable_layernorm_kernel=enable_layernorm_kernel,
                )
                for _ in range(depth)
            ]
        )
        self.final_layer = FinalLayer(hidden_size, np.prod(self.patch_size), self.out_channels)

        self.initialize_weights()
        self.enable_flash_attn = enable_flash_attn
        self.enable_layernorm_kernel = enable_layernorm_kernel

    def get_spatial_pos_embed(self):
        """Return the fixed 2D sin-cos spatial position embedding with a leading batch dim.

        NOTE(review): only the height grid size is passed on, so this presumably
        assumes a square spatial grid - confirm with get_2d_sincos_pos_embed.
        """
        pos_embed = get_2d_sincos_pos_embed(
            self.hidden_size,
            self.input_size[1] // self.patch_size[1],
        )
        pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
        return pos_embed

    def get_temporal_pos_embed(self):
        """Return the fixed 1D sin-cos temporal position embedding with a leading batch dim."""
        pos_embed = get_1d_sincos_pos_embed(
            self.hidden_size,
            self.input_size[0] // self.patch_size[0],
        )
        pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
        return pos_embed

    def unpatchify(self, x):
        """Fold per-patch outputs back into a (N, C, T, H, W) tensor."""
        c = self.out_channels
        t, h, w = [self.input_size[i] // self.patch_size[i] for i in range(3)]
        pt, ph, pw = self.patch_size

        x = x.reshape(shape=(x.shape[0], t, h, w, pt, ph, pw, c))
        x = rearrange(x, "n t h w r p q c -> n c t r h p w q")
        imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw))
        return imgs

    def forward(self, x, t, y):
        """
        Forward pass of DiT.
        x: (B, C, T, H, W) tensor of inputs
        t: (B,) tensor of diffusion timesteps
        y: list of text
        """
        # origin inputs should be float32, cast to specified dtype
        x = x.to(self.dtype)
        if self.use_text_encoder:
            y = y.to(self.dtype)

        # embedding
        x = self.x_embedder(x)  # (B, N, D)
        x = rearrange(x, "b (t s) d -> b t s d", t=self.num_temporal, s=self.num_spatial)
        x = x + self.pos_embed_spatial
        if not self.no_temporal_pos_emb:
            # Move time next to the feature dim so the temporal embedding broadcasts.
            x = rearrange(x, "b t s d -> b s t d")
            x = x + self.pos_embed_temporal
            x = rearrange(x, "b s t d -> b (t s) d")
        else:
            x = rearrange(x, "b t s d -> b (t s) d")

        t = self.t_embedder(t, dtype=x.dtype)  # (N, D)
        y = self.y_embedder(y, self.training)  # (N, D)
        if self.use_text_encoder:
            y = y.squeeze(1).squeeze(1)
        condition = t + y

        # blocks
        for block in self.blocks:
            x = auto_grad_checkpoint(block, x, condition)  # (B, N, D)

        # final process
        x = self.final_layer(x, condition)  # (B, N, num_patches * out_channels)
        x = self.unpatchify(x)  # (B, out_channels, T, H, W)

        # cast to float32 for better accuracy
        x = x.to(torch.float32)
        return x

    def initialize_weights(self):
        """Initialize linear layers, then apply the DiT-specific overrides."""

        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                # `requires_grad` is the flag; `requires_grad_` is a method and
                # is always truthy, so the old check never skipped anything.
                if module.weight.requires_grad:
                    torch.nn.init.xavier_uniform_(module.weight)
                    if module.bias is not None:
                        nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
        nn.init.constant_(self.x_embedder.proj.bias, 0)

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers in DiT blocks:
        for block in self.blocks:
            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)

        # Initialize the caption-embedding MLP with a small normal (not zeroed):
        if self.use_text_encoder:
            nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02)
            nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02)
@MODELS.register_module()
class Latte(DiT):
    def forward(self, x, t, y):
        """
        Forward pass of Latte: DiT blocks applied alternately over space and time.

        x: (B, C, T, H, W) tensor of inputs
        t: (B,) tensor of diffusion timesteps
        y: list of text
        """
        num_t = self.num_temporal
        num_s = self.num_spatial

        # Inputs arrive as float32; cast to the model dtype.
        x = x.to(self.dtype)

        # Patch embedding plus the spatial position embedding.
        x = self.x_embedder(x)  # (B, N, D)
        x = rearrange(x, "b (t s) d -> b t s d", t=num_t, s=num_s)
        x = x + self.pos_embed_spatial
        x = rearrange(x, "b t s d -> b (t s) d")

        # Conditioning vector, replicated to match each folded batch layout.
        t = self.t_embedder(t, dtype=x.dtype)  # (N, D)
        y = self.y_embedder(y, self.training)  # (N, D)
        if self.use_text_encoder:
            y = y.squeeze(1).squeeze(1)
        condition = t + y
        condition_spatial = repeat(condition, "b d -> (b t) d", t=num_t)
        condition_temporal = repeat(condition, "b d -> (b s) d", s=num_s)

        # Even-indexed blocks attend over space, odd-indexed blocks over time.
        for i, block in enumerate(self.blocks):
            if i % 2 == 0:
                fold, unfold = "b (t s) d -> (b t) s d", "(b t) s d -> b (t s) d"
                c = condition_spatial
            else:
                fold, unfold = "b (t s) d -> (b s) t d", "(b s) t d -> b (t s) d"
                c = condition_temporal

            x = rearrange(x, fold, t=num_t, s=num_s)
            if i == 1:
                # The temporal embedding is added once, before the first temporal block.
                x = x + self.pos_embed_temporal

            x = auto_grad_checkpoint(block, x, c)  # (B, N, D)
            x = rearrange(x, unfold, t=num_t, s=num_s)

        # Output head and un-patchify.
        x = self.final_layer(x, condition)  # (B, N, num_patches * out_channels)
        x = self.unpatchify(x)  # (B, out_channels, T, H, W)

        # Return float32 for better accuracy.
        return x.to(torch.float32)
def approx_gelu():
    """Return a new tanh-approximated GELU module (usable as an ``act_layer`` factory)."""
    return nn.GELU(approximate="tanh")


class LlamaRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        LlamaRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Scale ``hidden_states`` by the reciprocal RMS of its last dimension."""
        input_dtype = hidden_states.dtype
        # statistics are computed in float32, then the result is cast back
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)


def get_layernorm(hidden_size: int, eps: float, affine: bool, use_kernel: bool):
    """Create a layer norm over the last ``hidden_size`` features.

    Args:
        hidden_size: size of the normalized (last) dimension.
        eps: numerical-stability epsilon.
        affine: whether the layer has learnable scale/shift.
        use_kernel: use apex's ``FusedLayerNorm`` instead of ``nn.LayerNorm``.

    Raises:
        RuntimeError: if ``use_kernel`` is set but apex cannot be imported;
            the original ``ImportError`` is attached as ``__cause__``.
    """
    if not use_kernel:
        return nn.LayerNorm(hidden_size, eps, elementwise_affine=affine)
    try:
        from apex.normalization import FusedLayerNorm

        # construction stays inside the try so an ImportError raised while
        # building the layer is reported the same way as a missing package
        return FusedLayerNorm(hidden_size, elementwise_affine=affine, eps=eps)
    except ImportError as err:
        raise RuntimeError("FusedLayerNorm not available. Please install apex.") from err


def modulate(norm_func, x, shift, scale):
    """Normalize ``x`` in float32, then apply ``x * (1 + scale) + shift``."""
    # Suppose x is (B, N, D), shift is (B, D), scale is (B, D)
    dtype = x.dtype
    x = norm_func(x.to(torch.float32)).to(dtype)
    x = x * (scale.unsqueeze(1) + 1) + shift.unsqueeze(1)
    # shift/scale may have promoted the result; return in the input dtype
    x = x.to(dtype)
    return x


def t2i_modulate(x, shift, scale):
    """Apply ``x * (1 + scale) + shift`` with ordinary broadcasting."""
    return x * (1 + scale) + shift


# ===============================================
# General-purpose Layers
# ===============================================


class PatchEmbed3D(nn.Module):
    """Video to Patch Embedding.

    Args:
        patch_size (tuple): Patch size along (T, H, W). Default: (2, 4, 4).
        in_chans (int): Number of input video channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Module, optional): Normalization layer. Default: None
        flatten (bool): Return (B, N, C) tokens instead of (B, C, T, H, W). Default: True
    """

    def __init__(
        self,
        patch_size=(2, 4, 4),
        in_chans=3,
        embed_dim=96,
        norm_layer=None,
        flatten=True,
    ):
        super().__init__()
        self.patch_size = patch_size
        self.flatten = flatten

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        # non-overlapping patches: kernel size equals stride
        self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        if norm_layer is not None:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = None

    def forward(self, x):
        """Pad the input up to a multiple of the patch size and embed it."""
        # padding (F.pad lists the last dimension first: W, then H, then D)
        _, _, D, H, W = x.size()
        if W % self.patch_size[2] != 0:
            x = F.pad(x, (0, self.patch_size[2] - W % self.patch_size[2]))
        if H % self.patch_size[1] != 0:
            x = F.pad(x, (0, 0, 0, self.patch_size[1] - H % self.patch_size[1]))
        if D % self.patch_size[0] != 0:
            x = F.pad(x, (0, 0, 0, 0, 0, self.patch_size[0] - D % self.patch_size[0]))

        x = self.proj(x)  # (B C T H W)
        if self.norm is not None:
            # the norm acts on the channel dim, so go through a token-major view
            D, Wh, Ww = x.size(2), x.size(3), x.size(4)
            x = x.flatten(2).transpose(1, 2)
            x = self.norm(x)
            x = x.transpose(1, 2).view(-1, self.embed_dim, D, Wh, Ww)
        if self.flatten:
            x = x.flatten(2).transpose(1, 2)  # BCTHW -> BNC
        return x


class Attention(nn.Module):
    """Multi-head self-attention with optional q/k normalization and rotary embedding."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        norm_layer: nn.Module = LlamaRMSNorm,
        enable_flash_attn: bool = False,
        rope=None,
        qk_norm_legacy: bool = False,
    ) -> None:
        super().__init__()
        assert dim % num_heads == 0, "dim should be divisible by num_heads"
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim**-0.5
        # NOTE(review): the `enable_flash_attn` argument is accepted but not
        # used; flash attention is forced off here, so forward() always takes
        # the explicit softmax path unless the attribute is changed afterwards.
        self.enable_flash_attn = False

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.qk_norm_legacy = qk_norm_legacy
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.rope = False
        if rope is not None:
            self.rope = True
            self.rotary_emb = rope

        self.is_causal = False

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply self-attention to ``x`` of shape (B, N, C); returns (B, N, C)."""
        B, N, C = x.shape
        # flash attn is not memory efficient for small sequences, this is empirical
        enable_flash_attn = self.enable_flash_attn and (N > B)
        qkv = self.qkv(x)
        qkv_shape = (B, N, 3, self.num_heads, self.head_dim)

        # (B, N, 3, #heads, #dim) -> (3, B, #heads, N, #dim)
        qkv = qkv.view(qkv_shape).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)
        if self.qk_norm_legacy:
            # WARNING: this may be a bug (rotary embedding is applied before the q/k norm)
            if self.rope:
                q = self.rotary_emb(q)
                k = self.rotary_emb(k)
            q, k = self.q_norm(q), self.k_norm(k)
        else:
            q, k = self.q_norm(q), self.k_norm(k)
            if self.rope:
                q = self.rotary_emb(q)
                k = self.rotary_emb(k)

        if enable_flash_attn:
            from flash_attn import flash_attn_func

            # (B, #heads, N, #dim) -> (B, N, #heads, #dim)
            q = q.permute(0, 2, 1, 3)
            k = k.permute(0, 2, 1, 3)
            v = v.permute(0, 2, 1, 3)
            x = flash_attn_func(
                q,
                k,
                v,
                dropout_p=self.attn_drop.p if self.training else 0.0,
                softmax_scale=self.scale,
                causal=self.is_causal,
            )
        else:
            dtype = q.dtype
            q = q * self.scale
            attn = torch.matmul(q, k.transpose(-2, -1))
            # the softmax is evaluated in float32
            attn = attn.to(torch.float32)
            if self.is_causal:
                # additive mask: 0 on/below the diagonal, -inf above it
                causal_mask = torch.tril(torch.ones_like(attn), diagonal=0)
                causal_mask = torch.where(causal_mask.bool(), 0, float('-inf'))
                attn += causal_mask
            attn = attn.softmax(dim=-1)
            attn = attn.to(dtype)  # cast back attn to original dtype
            attn = self.attn_drop(attn)
            x = torch.matmul(attn, v)

        x_output_shape = (B, N, C)
        if not enable_flash_attn:
            # (B, #heads, N, #dim) -> (B, N, #heads, #dim)
            x = x.transpose(1, 2)
        x = x.reshape(x_output_shape)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
class KVCompressAttention(nn.Module):
    """Self-attention whose keys/values can be spatially down-sampled before use."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        norm_layer: nn.Module = LlamaRMSNorm,
        enable_flash_attn: bool = False,
        sampling="conv",
        sr_ratio=1,
        mem_eff_attention=False,
        attn_half=False,
    ) -> None:
        super().__init__()
        assert dim % num_heads == 0, "dim should be divisible by num_heads"
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim**-0.5
        self.enable_flash_attn = enable_flash_attn

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)

        self.sr_ratio = sr_ratio
        self.sampling = sampling
        if sr_ratio > 1 and sampling == "conv":
            # Avg Conv Init: a depth-wise conv initialised as average pooling.
            self.sr = nn.Conv2d(dim, dim, groups=dim, kernel_size=sr_ratio, stride=sr_ratio)
            self.sr.weight.data.fill_(1 / sr_ratio**2)
            self.sr.bias.data.zero_()
            self.norm = nn.LayerNorm(dim)

        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.mem_eff_attention = mem_eff_attention
        self.attn_half = attn_half

    def downsample_2d(self, tensor, H, W, scale_factor, sampling=None):
        """Down-sample a (B, H*W, C) token tensor by ``scale_factor``.

        Returns ``(tokens, new_N)`` for every real sampling mode. When
        ``sampling`` is None or ``scale_factor == 1`` the input tensor alone is
        returned (kept for backward compatibility), so callers must not unpack
        the result in that case.

        Raises:
            ValueError: if ``sampling`` is not a supported mode.
        """
        if sampling is None or scale_factor == 1:
            return tensor
        B, N, C = tensor.shape

        if sampling == "uniform_every":
            kept = tensor[:, ::scale_factor]
            # report the number of tokens actually kept; this differs from
            # N // scale_factor when N is not a multiple of scale_factor
            return kept, kept.shape[1]

        tensor = tensor.reshape(B, H, W, C).permute(0, 3, 1, 2)
        new_H, new_W = int(H / scale_factor), int(W / scale_factor)
        new_N = new_H * new_W

        if sampling == "ave":
            tensor = F.interpolate(tensor, scale_factor=1 / scale_factor, mode="nearest").permute(0, 2, 3, 1)
        elif sampling == "uniform":
            tensor = tensor[:, :, ::scale_factor, ::scale_factor].permute(0, 2, 3, 1)
        elif sampling == "conv":
            tensor = self.sr(tensor).reshape(B, C, -1).permute(0, 2, 1)
            tensor = self.norm(tensor)
        else:
            raise ValueError(f"unsupported sampling mode: {sampling!r}")

        return tensor.reshape(B, new_N, C).contiguous(), new_N

    def forward(self, x: torch.Tensor, mask=None, HW=None, block_id=None, **kwargs) -> torch.Tensor:
        """Attend over ``x`` of shape (B, N, C); returns (B, N, C).

        ``HW`` (the token grid height and width) is only needed when keys and
        values are compressed, i.e. ``sr_ratio > 1`` with a sampling mode set.
        ``block_id`` and extra keyword arguments are accepted and ignored.
        """
        B, N, C = x.shape
        new_N = N
        # flash attn is not memory efficient for small sequences, this is empirical
        enable_flash_attn = self.enable_flash_attn and (N > B)

        qkv = self.qkv(x).reshape(B, N, 3, C)
        q, k, v = qkv.unbind(2)
        dtype = q.dtype

        # KV compression. Skipped when no sampling mode is configured, because
        # downsample_2d then returns a bare tensor that cannot be unpacked.
        if self.sr_ratio > 1 and self.sampling is not None:
            H, W = HW
            k, new_N = self.downsample_2d(k, H, W, self.sr_ratio, sampling=self.sampling)
            v, new_N = self.downsample_2d(v, H, W, self.sr_ratio, sampling=self.sampling)

        q = q.reshape(B, N, self.num_heads, C // self.num_heads).to(dtype)
        k = k.reshape(B, new_N, self.num_heads, C // self.num_heads).to(dtype)
        v = v.reshape(B, new_N, self.num_heads, C // self.num_heads).to(dtype)

        q, k = self.q_norm(q), self.k_norm(k)

        if enable_flash_attn:
            from flash_attn import flash_attn_func

            # token-major in, token-major out: (B, N, #heads, #dim)
            x = flash_attn_func(
                q,
                k,
                v,
                dropout_p=self.attn_drop.p if self.training else 0.0,
                softmax_scale=self.scale,
            )
        elif self.mem_eff_attention:
            attn_bias = None
            if mask is not None:
                # NOTE(review): bias is built as (B * #heads, N_q, N_kv); confirm
                # this layout against the installed xformers version.
                attn_bias = torch.zeros([B * self.num_heads, q.shape[1], k.shape[1]], dtype=q.dtype, device=q.device)
                attn_bias.masked_fill_(mask.squeeze(1).repeat(self.num_heads, 1, 1) == 0, float("-inf"))
            # xformers also works on (B, N, #heads, #dim) and returns the same
            # layout; dropout is disabled outside training like the flash path
            x = xformers.ops.memory_efficient_attention(
                q,
                k,
                v,
                p=self.attn_drop.p if self.training else 0.0,
                attn_bias=attn_bias,
            )
        else:
            # (B, N, #heads, #dim) -> (B, #heads, N, #dim)
            q = q.permute(0, 2, 1, 3)
            k = k.permute(0, 2, 1, 3)
            v = v.permute(0, 2, 1, 3)
            dtype = q.dtype
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)
            # softmax runs in float32 unless half-precision attention is requested
            if not self.attn_half:
                attn = attn.to(torch.float32)
            attn = attn.softmax(dim=-1)
            attn = attn.to(dtype)  # cast back attn to original dtype
            attn = self.attn_drop(attn)
            x = attn @ v
            # only this branch is head-major; bring it back to (B, N, #heads, #dim)
            x = x.transpose(1, 2)

        x = x.reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
class SeqParallelAttention(Attention):
    """Self-attention for sequence-parallel execution.

    Each rank receives a slice of the token sequence. An all-to-all exchange
    gathers the full sequence while splitting attention heads across ranks,
    attention is computed locally, and a second all-to-all restores the
    original sequence split.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        norm_layer: nn.Module = LlamaRMSNorm,
        enable_flash_attn: bool = False,
        rope=None,
    ) -> None:
        # rotary embeddings are rejected up front rather than silently ignored
        assert rope is None, "Rope is not supported in SeqParallelAttention"
        super().__init__(
            dim=dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_norm=qk_norm,
            attn_drop=attn_drop,
            proj_drop=proj_drop,
            norm_layer=norm_layer,
            enable_flash_attn=enable_flash_attn,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Attend over the local slice ``x`` of shape (B, SUB_N, C); returns the same shape."""
        B, N, C = x.shape  # for sequence parallel here, the N is a local sequence length
        qkv = self.qkv(x)
        qkv_shape = (B, N, 3, self.num_heads, self.head_dim)

        qkv = qkv.view(qkv_shape)

        sp_group = get_sequence_parallel_group()

        # apply all_to_all to gather sequence and split attention heads
        # [B, SUB_N, 3, NUM_HEAD, HEAD_DIM] -> [B, N, 3, NUM_HEAD_PER_DEVICE, HEAD_DIM]
        qkv = all_to_all(qkv, sp_group, scatter_dim=3, gather_dim=1)

        # the layout depends on the kernel: flash attention is token-major,
        # the explicit softmax path below is head-major
        if self.enable_flash_attn:
            qkv_permute_shape = (
                2,
                0,
                1,
                3,
                4,
            )  # [3, B, N, NUM_HEAD_PER_DEVICE, HEAD_DIM]
        else:
            qkv_permute_shape = (
                2,
                0,
                3,
                1,
                4,
            )  # [3, B, NUM_HEAD_PER_DEVICE, N, HEAD_DIM]
        qkv = qkv.permute(qkv_permute_shape)

        # ERROR: Should qk_norm first
        q, k, v = qkv.unbind(0)
        q, k = self.q_norm(q), self.k_norm(k)
        if self.enable_flash_attn:
            from flash_attn import flash_attn_func

            x = flash_attn_func(
                q,
                k,
                v,
                dropout_p=self.attn_drop.p if self.training else 0.0,
                softmax_scale=self.scale,
            )
        else:
            dtype = q.dtype
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)  # translate attn to float32
            attn = attn.to(torch.float32)
            attn = attn.softmax(dim=-1)
            attn = attn.to(dtype)  # cast back attn to original dtype
            attn = self.attn_drop(attn)
            x = attn @ v

        if not self.enable_flash_attn:
            # head-major result -> token-major, matching the flash layout
            x = x.transpose(1, 2)

        # apply all to all to gather back attention heads and split sequence
        # [B, N, NUM_HEAD_PER_DEVICE, HEAD_DIM]  -> [B, SUB_N, NUM_HEAD, HEAD_DIM]
        x = all_to_all(x, sp_group, scatter_dim=1, gather_dim=2)

        # reshape outputs back to [B, N, C]
        x_output_shape = (B, N, C)
        x = x.reshape(x_output_shape)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
class MultiHeadCrossAttention(nn.Module):
    """Cross-attention from image tokens to condition tokens.

    forward() returns both the attended tokens and the attention map produced
    by ``cached_attention_forward``.
    """

    def __init__(self, d_model, num_heads, attn_drop=0.0, proj_drop=0.0):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        # queries come from the image tokens; keys and values share one projection
        self.q_linear = nn.Linear(d_model, d_model)
        self.kv_linear = nn.Linear(d_model, 2 * d_model)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(d_model, d_model)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, cond, mask=None):
        """Return ``(tokens, cross_attn_map)`` for image tokens ``x`` and condition ``cond``."""
        # query/value: img tokens; key: condition; mask: if padding tokens
        batch, tokens_per_sample, channels = x.shape
        heads, head_dim = self.num_heads, self.head_dim

        # every sample is packed into a single batch entry; the block-diagonal
        # bias built below is what keeps the samples separate
        q = self.q_linear(x).view(1, -1, heads, head_dim)
        k, v = self.kv_linear(cond).view(1, -1, 2, heads, head_dim).unbind(2)

        if mask is None:
            attn_bias = None
        else:
            attn_bias = xformers.ops.fmha.BlockDiagonalMask.from_seqlens([tokens_per_sample] * batch, mask)

        out, cross_attn_map = cached_attention_forward(q, k, v, p=self.attn_drop.p, attn_bias=attn_bias)

        out = out.view(batch, -1, channels)
        cross_attn_map = cross_attn_map.view(batch, -1, cross_attn_map.shape[-1])

        out = self.proj_drop(self.proj(out))
        return out, cross_attn_map
class SeqParallelMultiHeadCrossAttention(MultiHeadCrossAttention):
    """Sequence-parallel cross-attention between image tokens and condition tokens.

    Queries are exchanged with an all-to-all so each rank sees the full token
    sequence for a subset of heads; keys and values are split along the head
    dimension.

    NOTE(review): forward() returns only the attended tokens, whereas the
    parent class's forward() returns ``(tokens, cross_attn_map)``; callers
    that swap the two classes need to handle both shapes.
    """

    def __init__(
        self,
        d_model,
        num_heads,
        attn_drop=0.0,
        proj_drop=0.0,
    ):
        super().__init__(
            d_model=d_model,
            num_heads=num_heads,
            attn_drop=attn_drop,
            proj_drop=proj_drop,
        )

    def forward(self, x, cond, mask=None):
        """Attend from the local token slice ``x`` (B, SUB_N, C) to ``cond``; returns (B, SUB_N, C)."""
        # query/value: img tokens; key: condition; mask: if padding tokens
        sp_group = get_sequence_parallel_group()
        sp_size = dist.get_world_size(sp_group)
        B, SUB_N, C = x.shape  # [B, TS/p, C]
        # full sequence length across all ranks of the group
        N = SUB_N * sp_size

        # shape:
        # q, k, v: [B, SUB_N, NUM_HEADS, HEAD_DIM]
        q = self.q_linear(x).view(B, -1, self.num_heads, self.head_dim)
        kv = self.kv_linear(cond).view(1, -1, 2, self.num_heads, self.head_dim)
        # dim=3 is the head dimension of kv: each rank keeps its share of heads
        kv = split_forward_gather_backward(kv, get_sequence_parallel_group(), dim=3, grad_scale="down")
        k, v = kv.unbind(2)

        # apply all_to_all to gather sequence and split attention heads
        q = all_to_all(q, sp_group, scatter_dim=2, gather_dim=1)

        # pack all samples into one batch entry; the block-diagonal bias below
        # is built from per-sample sequence lengths
        q = q.view(1, -1, self.num_heads // sp_size, self.head_dim)
        k = k.view(1, -1, self.num_heads // sp_size, self.head_dim)
        v = v.view(1, -1, self.num_heads // sp_size, self.head_dim)

        # compute attention
        attn_bias = None
        if mask is not None:
            attn_bias = xformers.ops.fmha.BlockDiagonalMask.from_seqlens([N] * B, mask)
        x = xformers.ops.memory_efficient_attention(q, k, v, p=self.attn_drop.p, attn_bias=attn_bias)

        # apply all to all to gather back attention heads and scatter sequence
        x = x.view(B, -1, self.num_heads // sp_size, self.head_dim)
        x = all_to_all(x, sp_group, scatter_dim=1, gather_dim=2)

        # apply output projection
        x = x.view(B, -1, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
class FinalLayer(nn.Module):
    """
    The final layer of DiT.
    """

    def __init__(self, hidden_size, num_patch, out_channels):
        super().__init__()
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, num_patch * out_channels, bias=True)
        # maps the conditioning vector to a (shift, scale) pair
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 2 * hidden_size, bias=True),
        )

    def forward(self, x, c):
        """Modulate the normalized tokens with ``c`` and project them to patch outputs."""
        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
        return self.linear(modulate(self.norm_final, x, shift, scale))


class T2IFinalLayer(nn.Module):
    """
    The final layer of PixArt.
    """

    def __init__(self, hidden_size, num_patch, out_channels, d_t=None, d_s=None):
        super().__init__()
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, num_patch * out_channels, bias=True)
        # learned (shift, scale) offsets that are added to the timestep embedding
        self.scale_shift_table = nn.Parameter(torch.randn(2, hidden_size) / hidden_size**0.5)
        self.out_channels = out_channels
        self.d_t = d_t
        self.d_s = d_s

    def t_mask_select(self, x_mask, x, masked_x, T, S):
        """Pick ``x`` where ``x_mask`` is true for a frame and ``masked_x`` elsewhere."""
        # x: [B, (T, S), C]
        # masked_x: [B, (T, S), C]
        # x_mask: [B, T]
        kept = rearrange(x, "B (T S) C -> B T S C", T=T, S=S)
        replaced = rearrange(masked_x, "B (T S) C -> B T S C", T=T, S=S)
        per_frame = torch.where(x_mask[:, :, None, None], kept, replaced)
        return rearrange(per_frame, "B T S C -> B (T S) C")

    def forward(self, x, t, x_mask=None, t0=None, T=None, S=None):
        """Modulate the tokens with ``t`` (and ``t0`` where ``x_mask`` is false), then project."""
        T = self.d_t if T is None else T
        S = self.d_s if S is None else S

        shift, scale = (self.scale_shift_table[None] + t[:, None]).chunk(2, dim=1)
        x = t2i_modulate(self.norm_final(x), shift, scale)

        if x_mask is not None:
            shift_zero, scale_zero = (self.scale_shift_table[None] + t0[:, None]).chunk(2, dim=1)
            # NOTE(review): x has already been modulated above, so the t0 branch
            # normalizes the modulated tokens rather than the raw input. Kept
            # as-is; existing checkpoints may depend on it.
            x_zero = t2i_modulate(self.norm_final(x), shift_zero, scale_zero)
            x = self.t_mask_select(x_mask, x, x_zero, T, S)

        return self.linear(x)
# ===============================================
# Embedding Layers for Timesteps and Class Labels
# ===============================================


class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000):
        """
        Create sinusoidal timestep embeddings.
        :param t: a 1-D Tensor of N indices, one per batch element.
                  These may be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half)
        freqs = freqs.to(device=t.device)
        args = t[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if dim % 2:
            # odd output size: pad one zero column so the width equals dim
            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
        return embedding

    def forward(self, t, dtype):
        """Embed timesteps ``t``; the sinusoidal features are cast to ``dtype`` before the MLP."""
        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
        if t_freq.dtype != dtype:
            t_freq = t_freq.to(dtype)
        t_emb = self.mlp(t_freq)
        return t_emb


class LabelEmbedder(nn.Module):
    """
    Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
    """

    def __init__(self, num_classes, hidden_size, dropout_prob):
        super().__init__()
        # one extra row holds the "no label" embedding when dropout is enabled
        use_cfg_embedding = dropout_prob > 0
        self.embedding_table = nn.Embedding(num_classes + int(use_cfg_embedding), hidden_size)
        self.num_classes = num_classes
        self.dropout_prob = dropout_prob

    def token_drop(self, labels, force_drop_ids=None):
        """
        Drops labels to enable classifier-free guidance.

        Dropped entries are replaced by ``num_classes`` (the index of the extra
        embedding row). With ``force_drop_ids`` the entries equal to 1 are
        dropped; otherwise each label is dropped with probability
        ``dropout_prob``.
        """
        if force_drop_ids is None:
            # sample on the CPU generator, then move the mask to wherever the
            # labels live (previously hard-coded to CUDA)
            drop_ids = torch.rand(labels.shape[0]).to(labels.device) < self.dropout_prob
        else:
            drop_ids = force_drop_ids == 1
        labels = torch.where(drop_ids, self.num_classes, labels)
        return labels

    def forward(self, labels, train, force_drop_ids=None):
        """Look up label embeddings, applying label dropout when training or when forced."""
        use_dropout = self.dropout_prob > 0
        if (train and use_dropout) or (force_drop_ids is not None):
            labels = self.token_drop(labels, force_drop_ids)
        return self.embedding_table(labels)


class SizeEmbedder(TimestepEmbedder):
    """
    Embeds scalar size values with the sinusoidal features and MLP of TimestepEmbedder.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__(hidden_size=hidden_size, frequency_embedding_size=frequency_embedding_size)
        # NOTE(review): this rebuilds the MLP the parent constructor already
        # created; kept so parameter initialisation order is unchanged.
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size
        self.outdim = hidden_size

    def forward(self, s, bs):
        """Embed ``s`` and return a (bs, dims * hidden_size) tensor.

        A 1-D ``s`` is treated as one value per sample. If ``s`` has fewer
        rows than ``bs`` it is tiled to ``bs`` rows.
        """
        if s.ndim == 1:
            s = s[:, None]
        assert s.ndim == 2
        if s.shape[0] != bs:
            s = s.repeat(bs // s.shape[0], 1)
            assert s.shape[0] == bs
        b, dims = s.shape[0], s.shape[1]
        # each scalar is embedded independently, then grouped back per sample
        s = rearrange(s, "b d -> (b d)")
        s_freq = self.timestep_embedding(s, self.frequency_embedding_size).to(self.dtype)
        s_emb = self.mlp(s_freq)
        s_emb = rearrange(s_emb, "(b d) d2 -> b (d d2)", b=b, d=dims, d2=self.outdim)
        return s_emb

    @property
    def dtype(self):
        """dtype of the module's first parameter."""
        return next(self.parameters()).dtype


class CaptionEmbedder(nn.Module):
    """
    Projects caption features into the model width. Also handles caption dropout for classifier-free guidance.
    """

    def __init__(
        self,
        in_channels,
        hidden_size,
        uncond_prob,
        act_layer=nn.GELU(approximate="tanh"),
        token_num=120,
    ):
        super().__init__()
        if isinstance(act_layer, nn.Module):
            # timm's Mlp instantiates `act_layer()` itself, so an already-built
            # module (including the default above) is wrapped in a factory
            act_module = act_layer

            def act_layer():
                return act_module

        self.y_proj = Mlp(
            in_features=in_channels,
            hidden_features=hidden_size,
            out_features=hidden_size,
            act_layer=act_layer,
            drop=0,
        )
        # embedding substituted for dropped captions; a buffer, not a parameter
        self.register_buffer(
            "y_embedding",
            torch.randn(token_num, in_channels) / in_channels**0.5,
        )
        self.uncond_prob = uncond_prob

    def token_drop(self, caption, force_drop_ids=None):
        """
        Drops captions to enable classifier-free guidance.

        Dropped samples are replaced by ``y_embedding``. With
        ``force_drop_ids`` the entries equal to 1 are dropped; otherwise each
        sample is dropped with probability ``uncond_prob``.
        """
        if force_drop_ids is None:
            # sample on the CPU generator, then follow the caption's device
            # (previously hard-coded to CUDA)
            drop_ids = torch.rand(caption.shape[0]).to(caption.device) < self.uncond_prob
        else:
            drop_ids = force_drop_ids == 1
        caption = torch.where(drop_ids[:, None, None, None], self.y_embedding, caption)
        return caption

    def forward(self, caption, train, force_drop_ids=None):
        """Project ``caption``, applying caption dropout when training or when forced."""
        if train:
            assert caption.shape[2:] == self.y_embedding.shape
        use_dropout = self.uncond_prob > 0
        if (train and use_dropout) or (force_drop_ids is not None):
            caption = self.token_drop(caption, force_drop_ids)
        caption = self.y_proj(caption)
        return caption
class PositionEmbedding2D(nn.Module):
    """Sinusoidal 2-D position embedding; results are cached per (device, dtype, h, w, scale, base_size)."""

    def __init__(self, dim: int) -> None:
        super().__init__()
        self.dim = dim
        assert dim % 4 == 0, "dim must be divisible by 4"
        # each axis gets dim // 2 channels, split evenly between sin and cos
        half_dim = dim // 2
        exponents = torch.arange(0, half_dim, 2).float() / half_dim
        inv_freq = 1.0 / (10000 ** exponents)
        self.register_buffer("inv_freq", inv_freq, persistent=False)

    def _get_sin_cos_emb(self, t: torch.Tensor):
        """Return ``[sin(t * f), cos(t * f)]`` concatenated along the last dim."""
        angles = torch.einsum("i,d->id", t, self.inv_freq)
        cos_part = torch.cos(angles)
        sin_part = torch.sin(angles)
        return torch.cat((sin_part, cos_part), dim=-1)

    @functools.lru_cache(maxsize=512)
    def _get_cached_emb(
        self,
        device: torch.device,
        dtype: torch.dtype,
        h: int,
        w: int,
        scale: float = 1.0,
        base_size: Optional[int] = None,
    ):
        """Build the (1, h * w, dim) embedding for an ``h`` x ``w`` grid."""
        rows = torch.arange(h, device=device) / scale
        cols = torch.arange(w, device=device) / scale
        if base_size is not None:
            # rescale the coordinates relative to base_size
            rows *= base_size / h
            cols *= base_size / w

        # here w goes first
        first, second = torch.meshgrid(cols, rows, indexing="ij")
        # transpose so that flattening walks the grid row by row
        first = first.t().reshape(-1)
        second = second.t().reshape(-1)

        parts = [self._get_sin_cos_emb(first), self._get_sin_cos_emb(second)]
        return torch.concat(parts, dim=-1).unsqueeze(0).to(dtype)

    def forward(
        self,
        x: torch.Tensor,
        h: int,
        w: int,
        scale: Optional[float] = 1.0,
        base_size: Optional[int] = None,
    ) -> torch.Tensor:
        """Return the cached embedding on ``x``'s device and in ``x``'s dtype."""
        device, dtype = x.device, x.dtype
        return self._get_cached_emb(device, dtype, h, w, scale, base_size)
# ===============================================
# Sine/Cosine Positional Embedding Functions
# ===============================================
# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py


def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, scale=1.0, base_size=None):
    """
    grid_size: int of the grid height and width, or a (height, width) tuple
    return:
    pos_embed: [grid_h*grid_w, embed_dim], with extra_tokens zero rows prepended
    when cls_token is set and extra_tokens > 0
    """
    if not isinstance(grid_size, tuple):
        grid_size = (grid_size, grid_size)
    n_rows, n_cols = grid_size[0], grid_size[1]

    row_coords = np.arange(n_rows, dtype=np.float32) / scale
    col_coords = np.arange(n_cols, dtype=np.float32) / scale
    if base_size is not None:
        # rescale the coordinates relative to base_size (in place)
        row_coords *= base_size / n_rows
        col_coords *= base_size / n_cols

    # here w goes first
    grid = np.stack(np.meshgrid(col_coords, row_coords), axis=0)
    grid = grid.reshape([2, 1, n_cols, n_rows])

    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token and extra_tokens > 0:
        padding = np.zeros([extra_tokens, embed_dim])
        pos_embed = np.concatenate([padding, pos_embed], axis=0)
    return pos_embed


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """Encode both planes of ``grid`` with half of ``embed_dim`` each and concatenate them."""
    assert embed_dim % 2 == 0

    per_axis = embed_dim // 2
    halves = [
        get_1d_sincos_pos_embed_from_grid(per_axis, grid[0]),  # (H*W, D/2)
        get_1d_sincos_pos_embed_from_grid(per_axis, grid[1]),  # (H*W, D/2)
    ]
    return np.concatenate(halves, axis=1)  # (H*W, D)


def get_1d_sincos_pos_embed(embed_dim, length, scale=1.0):
    """Embed the positions ``0 .. length - 1`` (divided by ``scale``)."""
    positions = np.arange(0, length)[..., None] / scale
    return get_1d_sincos_pos_embed_from_grid(embed_dim, positions)


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float64)
    omega /= embed_dim / 2.0
    omega = 1.0 / 10000**omega  # (D/2,)

    # outer product of positions and frequencies: (M, D/2)
    angles = np.einsum("m,d->md", pos.reshape(-1), omega)

    # sines first, cosines second
    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)
class PixArtBlock(nn.Module):
    """
    A PixArt block with adaptive layer norm (adaLN-single) conditioning.
    """

    def __init__(
        self,
        hidden_size,
        num_heads,
        mlp_ratio=4.0,
        drop_path=0.0,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.enable_flash_attn = enable_flash_attn
        self._enable_sequence_parallelism = enable_sequence_parallelism

        # pick the attention implementations matching the parallelism mode
        if enable_sequence_parallelism:
            self.attn_cls = SeqParallelAttention
            self.mha_cls = SeqParallelMultiHeadCrossAttention
        else:
            self.attn_cls = Attention
            self.mha_cls = MultiHeadCrossAttention

        self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        self.attn = self.attn_cls(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            enable_flash_attn=enable_flash_attn,
        )
        self.cross_attn = self.mha_cls(hidden_size, num_heads)
        self.norm2 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        self.mlp = Mlp(
            in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, drop=0
        )
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        # learned offsets for the six modulation vectors (shift/scale/gate x 2)
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5)

    def forward(self, x, y, t, mask=None):
        """Run self-attention, cross-attention on ``y`` and the MLP, each as a residual update.

        ``t`` must reshape to (B, 6, hidden_size): it carries the
        shift/scale/gate triplets for the self-attention and MLP branches.
        """
        B, N, C = x.shape

        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
            self.scale_shift_table[None] + t.reshape(B, 6, -1)
        ).chunk(6, dim=1)

        # gated, modulated self-attention
        x = x + self.drop_path(gate_msa * self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa)).reshape(B, N, C))

        # MultiHeadCrossAttention.forward returns (tokens, cross_attn_map) while
        # the sequence-parallel variant returns tokens only; adding the tuple
        # to x would raise TypeError, so keep just the tokens.
        cross_out = self.cross_attn(x, y, mask)
        if isinstance(cross_out, tuple):
            cross_out = cross_out[0]
        x = x + cross_out

        # gated, modulated MLP
        x = x + self.drop_path(gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)))

        return x
""" def __init__( self, input_size=(1, 32, 32), in_channels=4, patch_size=(1, 2, 2), hidden_size=1152, depth=28, num_heads=16, mlp_ratio=4.0, class_dropout_prob=0.1, pred_sigma=True, drop_path: float = 0.0, no_temporal_pos_emb=False, caption_channels=4096, model_max_length=120, dtype=torch.float32, freeze=None, space_scale=1.0, time_scale=1.0, enable_flash_attn=False, enable_layernorm_kernel=False, enable_sequence_parallelism=False, base_size=None, ): super().__init__() assert enable_sequence_parallelism is False, "Sequence parallelism is not supported in this version." self.pred_sigma = pred_sigma self.in_channels = in_channels self.out_channels = in_channels * 2 if pred_sigma else in_channels self.hidden_size = hidden_size self.patch_size = patch_size self.input_size = input_size num_patches = np.prod([input_size[i] // patch_size[i] for i in range(3)]) self.num_patches = num_patches self.num_temporal = input_size[0] // patch_size[0] self.num_spatial = num_patches // self.num_temporal if base_size is None: self.base_size = int(np.sqrt(self.num_spatial)) else: self.base_size = base_size // patch_size[1] self.num_heads = num_heads self.dtype = dtype self.no_temporal_pos_emb = no_temporal_pos_emb self.depth = depth self.mlp_ratio = mlp_ratio self.enable_flash_attn = enable_flash_attn self.enable_layernorm_kernel = enable_layernorm_kernel self.space_scale = space_scale self.time_scale = time_scale self.x_embedder = PatchEmbed3D(patch_size, in_channels, hidden_size) self.t_embedder = TimestepEmbedder(hidden_size) self.t_block = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True)) self.y_embedder = CaptionEmbedder( in_channels=caption_channels, hidden_size=hidden_size, uncond_prob=class_dropout_prob, act_layer=approx_gelu, token_num=model_max_length, ) self.register_buffer("pos_embed", self.get_spatial_pos_embed()) self.register_buffer("pos_embed_temporal", self.get_temporal_pos_embed()) drop_path = [x.item() for x in torch.linspace(0, drop_path, 
depth)] # stochastic depth decay rule self.blocks = nn.ModuleList( [ PixArtBlock( hidden_size, num_heads, mlp_ratio=mlp_ratio, drop_path=drop_path[i], enable_flash_attn=enable_flash_attn, enable_layernorm_kernel=enable_layernorm_kernel, ) for i in range(depth) ] ) self.final_layer = T2IFinalLayer(hidden_size, np.prod(self.patch_size), self.out_channels) self.initialize_weights() if freeze is not None: assert freeze in ["text"] if freeze == "text": self.freeze_text() def forward(self, x, timestep, y, mask=None, **kwargs): """ Forward pass of PixArt. x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images) t: (N,) tensor of diffusion timesteps y: (N, 1, 120, C) tensor of class labels """ dtype = self.x_embedder.proj.weight.dtype B = x.size(0) x = x.to(dtype) timestep = timestep.to(dtype) y = y.to(dtype) # embedding x = self.x_embedder(x) # (B, N, D) x = rearrange(x, "b (t s) d -> b t s d", t=self.num_temporal, s=self.num_spatial) x = x + self.pos_embed if not self.no_temporal_pos_emb: x = rearrange(x, "b t s d -> b s t d") x = x + self.pos_embed_temporal x = rearrange(x, "b s t d -> b (t s) d") else: x = rearrange(x, "b t s d -> b (t s) d") t = self.t_embedder(timestep, dtype=x.dtype) # (N, D) t0 = self.t_block(t) y = self.y_embedder(y, self.training) # (N, 1, L, D) if mask is not None: if mask.shape[0] != y.shape[0]: mask = mask.repeat(y.shape[0] // mask.shape[0], 1) mask = mask.squeeze(1).squeeze(1) y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1]) y_lens = mask.sum(dim=1).tolist() else: y_lens = [y.shape[2]] * y.shape[0] y = y.squeeze(1).view(1, -1, x.shape[-1]) # blocks for block in self.blocks: x = auto_grad_checkpoint(block, x, y, t0, y_lens) # final process x = self.final_layer(x, t) # (N, T, patch_size ** 2 * out_channels) x = self.unpatchify(x) # (N, out_channels, H, W) # cast to float32 for better accuracy x = x.to(torch.float32) return x def unpatchify(self, x): c = self.out_channels t, h, w = 
[self.input_size[i] // self.patch_size[i] for i in range(3)] pt, ph, pw = self.patch_size x = x.reshape(shape=(x.shape[0], t, h, w, pt, ph, pw, c)) x = rearrange(x, "n t h w r p q c -> n c t r h p w q") imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw)) return imgs def get_spatial_pos_embed(self, grid_size=None): if grid_size is None: grid_size = self.input_size[1:] pos_embed = get_2d_sincos_pos_embed( self.hidden_size, (grid_size[0] // self.patch_size[1], grid_size[1] // self.patch_size[2]), scale=self.space_scale, base_size=self.base_size, ) pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False) return pos_embed def get_temporal_pos_embed(self): pos_embed = get_1d_sincos_pos_embed( self.hidden_size, self.input_size[0] // self.patch_size[0], scale=self.time_scale, ) pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False) return pos_embed def freeze_text(self): for n, p in self.named_parameters(): if "cross_attn" in n: p.requires_grad = False def initialize_weights(self): # Initialize transformer layers: def _basic_init(module): if isinstance(module, nn.Linear): torch.nn.init.xavier_uniform_(module.weight) if module.bias is not None: nn.init.constant_(module.bias, 0) self.apply(_basic_init) # Initialize patch_embed like nn.Linear (instead of nn.Conv2d): w = self.x_embedder.proj.weight.data nn.init.xavier_uniform_(w.view([w.shape[0], -1])) # Initialize timestep embedding MLP: nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02) nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02) nn.init.normal_(self.t_block[1].weight, std=0.02) # Initialize caption embedding MLP: nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02) nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02) # Zero-out adaLN modulation layers in PixArt blocks: for block in self.blocks: nn.init.constant_(block.cross_attn.proj.weight, 0) nn.init.constant_(block.cross_attn.proj.bias, 0) # Zero-out output layers: 
@MODELS.register_module()
class PixArtMS(PixArt):
    """
    PixArt variant whose timestep embedding is additionally conditioned on
    size information passed in ``data_info`` (keys ``"hw"`` and ``"ar"``).

    Unlike the parent class, the spatial positional embedding is rebuilt on
    every forward call from the last two dimensions of the input instead of
    being read from the ``pos_embed`` buffer.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The two embeddings below are concatenated along dim=1 and added to the
        # timestep embedding in forward().
        # NOTE(review): presumably "hw" contributes 2 * (hidden_size // 3) features
        # and "ar" contributes hidden_size // 3, hence the divisibility check --
        # confirm against SizeEmbedder.
        assert self.hidden_size % 3 == 0, "hidden_size must be divisible by 3"
        self.csize_embedder = SizeEmbedder(self.hidden_size // 3)
        self.ar_embedder = SizeEmbedder(self.hidden_size // 3)

    def forward(self, x, timestep, y, mask=None, data_info=None):
        """
        Forward pass of PixArtMS.

        Args:
            x: latent input; its last two dimensions are taken as (H, W) when
                building the spatial positional embedding.
                NOTE(review): presumably (N, C, T, H, W) since the embedder is a
                PatchEmbed3D -- confirm.
            timestep: (N,) tensor of diffusion timesteps.
            y: caption features fed to ``y_embedder``.
                NOTE(review): presumably (N, 1, L, caption_channels) -- confirm.
            mask: optional mask selecting the valid caption tokens; when given,
                only the selected tokens are kept and packed into one sequence.
            data_info: mapping with the entries ``"hw"`` and ``"ar"``. Required,
                despite the ``None`` default kept for signature compatibility.

        Returns:
            The un-patchified prediction cast to ``torch.float32``.

        Raises:
            TypeError: if ``data_info`` is not provided.
        """
        if data_info is None:
            # Fail with an actionable message instead of the opaque
            # "'NoneType' object is not subscriptable" raised further down.
            raise TypeError("PixArtMS.forward() requires `data_info` with 'hw' and 'ar' entries")

        x = x.to(self.dtype)
        timestep = timestep.to(self.dtype)
        y = y.to(self.dtype)

        c_size = data_info["hw"]
        ar = data_info["ar"]
        # Positional embedding follows the actual input resolution.
        pos_embed = self.get_spatial_pos_embed((x.shape[-2], x.shape[-1])).to(x.dtype)

        # embedding
        x = self.x_embedder(x)  # (B, N, D)
        # NOTE(review): the token grid is still split with the construction-time
        # num_temporal / num_spatial, so inputs whose patch count differs from
        # `input_size` will fail here -- confirm this is intended.
        x = rearrange(x, "b (t s) d -> b t s d", t=self.num_temporal, s=self.num_spatial)
        x = x + pos_embed.to(x.device)
        if not self.no_temporal_pos_emb:
            x = rearrange(x, "b t s d -> b s t d")
            x = x + self.pos_embed_temporal
            x = rearrange(x, "b s t d -> b (t s) d")
        else:
            x = rearrange(x, "b t s d -> b (t s) d")

        t = self.t_embedder(timestep, dtype=x.dtype)  # (N, D)
        B = x.shape[0]
        csize = self.csize_embedder(c_size, B)
        ar = self.ar_embedder(ar, B)
        t = t + torch.cat([csize, ar], dim=1)

        t0 = self.t_block(t)  # (N, 6 * D) modulation input for the blocks
        y = self.y_embedder(y, self.training)  # (N, 1, L, D)
        if mask is not None:
            if mask.shape[0] != y.shape[0]:
                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
            mask = mask.squeeze(1).squeeze(1)
            # Keep only unmasked caption tokens, packed into a single sequence;
            # y_lens records how many tokens belong to each sample.
            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
            y_lens = mask.sum(dim=1).tolist()
        else:
            y_lens = [y.shape[2]] * y.shape[0]
            y = y.squeeze(1).view(1, -1, x.shape[-1])

        # blocks
        for block in self.blocks:
            x = block(x, y, t0, y_lens)

        # final process
        x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
        x = self.unpatchify(x)

        # cast to float32 for better accuracy
        x = x.to(torch.float32)
        return x
@MODELS.register_module("PixArt-XL/2")
def PixArt_XL_2(from_pretrained=None, **kwargs):
    """Build the PixArt-XL/2 configuration, optionally loading a checkpoint."""
    net = PixArt(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
    if from_pretrained is None:
        return net
    load_checkpoint(net, from_pretrained)
    return net


@MODELS.register_module("PixArt-1B/2")
def PixArt_1B_2(from_pretrained=None, **kwargs):
    """Build the PixArt-1B/2 configuration, optionally loading a checkpoint."""
    net = PixArt(depth=28, hidden_size=1872, patch_size=(1, 2, 2), num_heads=26, **kwargs)
    if from_pretrained is None:
        return net
    load_checkpoint(net, from_pretrained)
    return net


@MODELS.register_module("PixArtMS-XL/2")
def PixArtMS_XL_2(from_pretrained=None, **kwargs):
    """Build the PixArtMS-XL/2 configuration, optionally loading a checkpoint."""
    net = PixArtMS(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
    if from_pretrained is None:
        return net
    load_checkpoint(net, from_pretrained)
    return net
class PixArtBlock(nn.Module):
    """
    One PixArt transformer block: self-attention, cross-attention and an MLP,
    with the self-attention and MLP branches modulated by a learned
    scale/shift table plus the timestep signal (adaLN-single).
    """

    def __init__(
        self,
        hidden_size,
        num_heads,
        mlp_ratio=4.0,
        drop_path=0.0,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
        qk_norm=False,
        sampling="conv",
        sr_ratio=1,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.enable_flash_attn = enable_flash_attn
        self._enable_sequence_parallelism = enable_sequence_parallelism
        assert not enable_sequence_parallelism, "Sequence parallelism is not supported in this version."

        self.attn_cls = KVCompressAttention
        self.mha_cls = MultiHeadCrossAttention

        # Both layer norms share the same configuration.
        norm_cfg = dict(eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        mlp_hidden = int(hidden_size * mlp_ratio)

        # Sub-modules are created in a fixed order so parameter registration
        # (and therefore checkpoint layout / RNG use) stays unchanged.
        self.norm1 = get_layernorm(hidden_size, **norm_cfg)
        self.attn = self.attn_cls(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            enable_flash_attn=enable_flash_attn,
            qk_norm=qk_norm,
            sr_ratio=sr_ratio,
            sampling=sampling,
            attn_half=True,
        )
        self.cross_attn = self.mha_cls(hidden_size, num_heads)
        self.norm2 = get_layernorm(hidden_size, **norm_cfg)
        self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden, act_layer=approx_gelu, drop=0)
        if drop_path > 0.0:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5)
        self.sampling = sampling
        self.sr_ratio = sr_ratio

    def forward(self, x, y, t, hw, mask=None):
        """
        Apply the block to tokens ``x`` of shape (B, N, C).

        ``y`` and ``mask`` are forwarded to the cross-attention, ``t`` is
        reshaped to (B, 6, -1) to provide the six modulation terms, and ``hw``
        is handed to the self-attention as ``HW``.
        """
        batch, tokens, channels = x.shape
        modulation = self.scale_shift_table[None] + t.reshape(batch, 6, -1)
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = modulation.chunk(6, dim=1)

        # self-attention branch
        attn_in = t2i_modulate(self.norm1(x), shift_msa, scale_msa)
        attn_out = self.attn(attn_in, HW=hw).reshape(batch, tokens, channels)
        x = x + self.drop_path(gate_msa * attn_out)

        # cross-attention branch (no modulation, no drop-path)
        x = x + self.cross_attn(x, y, mask)

        # feed-forward branch
        mlp_in = t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)
        x = x + self.drop_path(gate_mlp * self.mlp(mlp_in))
        return x
@MODELS.register_module()
class PixArt_Sigma(nn.Module):
    """
    Diffusion model with a Transformer backbone.

    Input latents are patch-embedded, given spatial (and optionally temporal)
    sin-cos positional embeddings, processed by ``depth`` PixArtBlocks that are
    conditioned on the timestep and on caption embeddings, and finally
    un-patchified. Selected blocks can use key/value compression, configured
    through ``kv_compress_config``.
    """

    def __init__(
        self,
        input_size=(1, 32, 32),
        in_channels=4,
        patch_size=(1, 2, 2),
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        pred_sigma=True,
        drop_path: float = 0.0,
        no_temporal_pos_emb=False,
        caption_channels=4096,
        model_max_length=120,
        dtype=torch.float32,
        freeze=None,
        qk_norm=False,
        space_scale=1.0,
        time_scale=1.0,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
        kv_compress_config=None,
    ):
        super().__init__()
        assert enable_sequence_parallelism is False, "Sequence parallelism is not supported in this version."
        self.pred_sigma = pred_sigma
        self.in_channels = in_channels
        # When pred_sigma is set the output carries twice as many channels.
        self.out_channels = in_channels * 2 if pred_sigma else in_channels
        self.hidden_size = hidden_size
        self.patch_size = patch_size
        self.input_size = input_size
        # Token counts derived from the (T, H, W) input size and patch size.
        num_patches = np.prod([input_size[i] // patch_size[i] for i in range(3)])
        self.num_patches = num_patches
        self.num_temporal = input_size[0] // patch_size[0]
        self.num_spatial = num_patches // self.num_temporal
        self.base_size = int(np.sqrt(self.num_spatial))
        self.num_heads = num_heads
        self.dtype = dtype
        self.no_temporal_pos_emb = no_temporal_pos_emb
        self.depth = depth
        self.mlp_ratio = mlp_ratio
        self.enable_flash_attn = enable_flash_attn
        self.enable_layernorm_kernel = enable_layernorm_kernel
        self.space_scale = space_scale
        self.time_scale = time_scale

        self.x_embedder = PatchEmbed3D(patch_size, in_channels, hidden_size)
        self.t_embedder = TimestepEmbedder(hidden_size)
        # Produces the 6 * hidden_size modulation vector consumed by every block.
        self.t_block = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True))
        self.y_embedder = CaptionEmbedder(
            in_channels=caption_channels,
            hidden_size=hidden_size,
            uncond_prob=class_dropout_prob,
            act_layer=approx_gelu,
            token_num=model_max_length,
        )

        # Fixed (non-trainable) positional embeddings for the configured input size.
        self.register_buffer("pos_embed", self.get_spatial_pos_embed())
        self.register_buffer("pos_embed_temporal", self.get_temporal_pos_embed())

        drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)]  # stochastic depth decay rule

        # Default: no block uses key/value compression.
        self.kv_compress_config = kv_compress_config
        if kv_compress_config is None:
            self.kv_compress_config = {
                "sampling": None,
                "scale_factor": 1,
                "kv_compress_layer": [],
            }

        self.blocks = nn.ModuleList(
            [
                PixArtBlock(
                    hidden_size,
                    num_heads,
                    mlp_ratio=mlp_ratio,
                    drop_path=drop_path[i],
                    enable_flash_attn=enable_flash_attn,
                    enable_layernorm_kernel=enable_layernorm_kernel,
                    qk_norm=qk_norm,
                    # Only the listed layers get a compression ratio other than 1.
                    sr_ratio=(
                        int(self.kv_compress_config["scale_factor"])
                        if i in self.kv_compress_config["kv_compress_layer"]
                        else 1
                    ),
                    sampling=self.kv_compress_config["sampling"],
                )
                for i in range(depth)
            ]
        )
        self.final_layer = T2IFinalLayer(hidden_size, np.prod(self.patch_size), self.out_channels)

        self.initialize_weights()

        if freeze is not None:
            assert freeze in ["text"]
            if freeze == "text":
                self.freeze_text()

    def forward(self, x, timestep, y, mask=None):
        """
        Forward pass of PixArt-Sigma.

        x: latent input; its last two dimensions are used as (H, W) for the
           positional embedding and for the ``hw`` token grid passed to the blocks
           (presumably (N, C, T, H, W) given the 3D patch embedder -- TODO confirm)
        timestep: (N,) tensor of diffusion timesteps
        y: caption features fed to the caption embedder
           (presumably (N, 1, L, caption_channels) -- TODO confirm)
        mask: optional mask selecting valid caption tokens

        Returns the un-patchified prediction cast to float32.
        """
        x = x.to(self.dtype)
        timestep = timestep.to(self.dtype)
        y = y.to(self.dtype)

        # Positional embedding is rebuilt from the actual input resolution.
        pos_embed = self.get_spatial_pos_embed((x.shape[-2], x.shape[-1])).to(x.dtype)
        # Spatial token grid (height, width) after patchification.
        hw = (x.shape[-2] // self.patch_size[-2], x.shape[-1] // self.patch_size[-1])

        # embedding
        x = self.x_embedder(x)  # (B, N, D)
        x = rearrange(x, "b (t s) d -> b t s d", t=self.num_temporal, s=self.num_spatial)
        x = x + pos_embed.to(x.device)
        if not self.no_temporal_pos_emb:
            x = rearrange(x, "b t s d -> b s t d")
            x = x + self.pos_embed_temporal
            x = rearrange(x, "b s t d -> b (t s) d")
        else:
            x = rearrange(x, "b t s d -> b (t s) d")

        t = self.t_embedder(timestep, dtype=x.dtype)  # (N, D)
        t0 = self.t_block(t)  # (N, 6 * D)
        y = self.y_embedder(y, self.training)  # (N, 1, L, D)
        if mask is not None:
            if mask.shape[0] != y.shape[0]:
                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
            mask = mask.squeeze(1).squeeze(1)
            # Keep only unmasked caption tokens, packed into one sequence;
            # y_lens holds the per-sample token counts.
            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
            y_lens = mask.sum(dim=1).tolist()
        else:
            y_lens = [y.shape[2]] * y.shape[0]
            y = y.squeeze(1).view(1, -1, x.shape[-1])

        # blocks
        for block in self.blocks:
            x = auto_grad_checkpoint(block, x, y, t0, hw, y_lens)

        # final process
        x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
        x = self.unpatchify(x)  # (N, out_channels, T, H, W)

        # cast to float32 for better accuracy
        x = x.to(torch.float32)
        return x

    def unpatchify(self, x):
        """Fold per-token patch predictions back into a (N, C_out, T, H, W) tensor.

        The grid size is taken from the construction-time ``input_size``.
        """
        c = self.out_channels
        t, h, w = [self.input_size[i] // self.patch_size[i] for i in range(3)]
        pt, ph, pw = self.patch_size

        x = x.reshape(shape=(x.shape[0], t, h, w, pt, ph, pw, c))
        x = rearrange(x, "n t h w r p q c -> n c t r h p w q")
        imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw))
        return imgs

    def get_spatial_pos_embed(self, grid_size=None):
        """Build the 2D sin-cos positional embedding for ``grid_size`` (H, W).

        Defaults to the spatial part of ``input_size``; returns a float tensor
        with a leading batch dimension of 1 that does not require grad.
        """
        if grid_size is None:
            grid_size = self.input_size[1:]
        pos_embed = get_2d_sincos_pos_embed(
            self.hidden_size,
            (grid_size[0] // self.patch_size[1], grid_size[1] // self.patch_size[2]),
            scale=self.space_scale,
            base_size=self.base_size,
        )
        pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
        return pos_embed

    def get_temporal_pos_embed(self):
        """Build the 1D sin-cos positional embedding over the temporal patches."""
        pos_embed = get_1d_sincos_pos_embed(
            self.hidden_size,
            self.input_size[0] // self.patch_size[0],
            scale=self.time_scale,
        )
        pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
        return pos_embed

    def freeze_text(self):
        """Disable gradients for every parameter whose name contains "cross_attn"."""
        for n, p in self.named_parameters():
            if "cross_attn" in n:
                p.requires_grad = False

    def initialize_weights(self):
        """Initialise all weights; cross-attention and final projections start at zero."""

        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.t_block[1].weight, std=0.02)

        # Initialize caption embedding MLP:
        nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02)
        nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02)

        # Zero-out the cross-attention output projection in every block, so the
        # cross-attention branch contributes nothing at initialisation:
        for block in self.blocks:
            nn.init.constant_(block.cross_attn.proj.weight, 0)
            nn.init.constant_(block.cross_attn.proj.bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)
@MODELS.register_module("PixArt-Sigma-XL/2")
def PixArt_Sigma_XL_2(from_pretrained=None, **kwargs):
    """Build the PixArt-Sigma-XL/2 configuration, optionally loading a checkpoint."""
    net = PixArt_Sigma(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
    if from_pretrained is None:
        return net
    load_checkpoint(net, from_pretrained)
    return net
class STDiTBlock(nn.Module):
    """
    Spatial-temporal transformer block: spatial self-attention, temporal
    self-attention, cross-attention and an MLP, applied in that order.

    Tokens are laid out as (T S) with ``d_t`` temporal and ``d_s`` spatial
    positions. When ``x_mask`` is supplied, frames where the mask is False use
    the modulation derived from ``t0`` instead of ``t``.
    """

    def __init__(
        self,
        hidden_size,
        num_heads,
        d_s=None,
        d_t=None,
        mlp_ratio=4.0,
        drop_path=0.0,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.enable_flash_attn = enable_flash_attn
        self._enable_sequence_parallelism = enable_sequence_parallelism

        # Pick the attention implementations matching the parallelism mode.
        if enable_sequence_parallelism:
            self.attn_cls = SeqParallelAttention
            self.mha_cls = SeqParallelMultiHeadCrossAttention
        else:
            self.attn_cls = Attention
            self.mha_cls = MultiHeadCrossAttention

        self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        self.attn = self.attn_cls(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            enable_flash_attn=enable_flash_attn,
        )
        self.cross_attn = self.mha_cls(hidden_size, num_heads)
        self.norm2 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        self.mlp = Mlp(
            in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, drop=0
        )
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        # Learned base for the six modulation terms (shift/scale/gate x 2).
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5)

        # temporal attention
        self.d_s = d_s
        self.d_t = d_t

        if self._enable_sequence_parallelism:
            sp_size = dist.get_world_size(get_sequence_parallel_group())
            # make sure d_t is divisible by sp_size
            assert d_t % sp_size == 0
            # Each rank only handles its share of the temporal positions.
            self.d_t = d_t // sp_size

        self.attn_temp = self.attn_cls(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            enable_flash_attn=self.enable_flash_attn,
        )

    def t_mask_select(self, x, masked_x, x_mask):
        """Per frame, take ``x`` where ``x_mask`` is True and ``masked_x`` elsewhere."""
        # x: [B, (T, S), C]
        # masked_x: [B, (T, S), C]
        # x_mask: [B, T]
        x = rearrange(x, "B (T S) C -> B T S C", T=self.d_t, S=self.d_s)
        masked_x = rearrange(masked_x, "B (T S) C -> B T S C", T=self.d_t, S=self.d_s)
        x = torch.where(x_mask[:, :, None, None], x, masked_x)
        x = rearrange(x, "B T S C -> B (T S) C")
        return x

    def forward(self, x, y, t, mask=None, tpe=None, x_mask=None, t0=None):
        """
        Apply the block to tokens ``x`` of shape [B, (T S), C].

        ``y``/``mask`` go to the cross-attention, ``t`` supplies the modulation
        terms, ``tpe`` is an optional temporal positional embedding added before
        temporal attention, and ``x_mask``/``t0`` enable per-frame selection of
        the ``t0`` modulation (``t0`` must be given whenever ``x_mask`` is).
        """
        B, N, C = x.shape

        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
            self.scale_shift_table[None] + t.reshape(B, 6, -1)
        ).chunk(6, dim=1)
        x_m = t2i_modulate(self.norm1(x), shift_msa, scale_msa)
        if x_mask is not None:
            # Second set of modulation terms, derived from t0, for masked-out frames.
            shift_msa_zero, scale_msa_zero, gate_msa_zero, shift_mlp_zero, scale_mlp_zero, gate_mlp_zero = (
                self.scale_shift_table[None] + t0.reshape(B, 6, -1)
            ).chunk(6, dim=1)
            x_m_zero = t2i_modulate(self.norm1(x), shift_msa_zero, scale_msa_zero)
            x_m = self.t_mask_select(x_m, x_m_zero, x_mask)

        # spatial branch
        x_s = rearrange(x_m, "B (T S) C -> (B T) S C", T=self.d_t, S=self.d_s)
        x_s = self.attn(x_s)
        x_s = rearrange(x_s, "(B T) S C -> B (T S) C", T=self.d_t, S=self.d_s)
        if x_mask is not None:
            x_s_zero = gate_msa_zero * x_s
            x_s = gate_msa * x_s
            x_s = self.t_mask_select(x_s, x_s_zero, x_mask)
        else:
            x_s = gate_msa * x_s
        x = x + self.drop_path(x_s)

        # temporal branch
        # NOTE(review): this branch attends over the un-modulated residual stream
        # and reuses gate_msa without the x_mask selection applied in the spatial
        # and MLP branches -- confirm this asymmetry is intended.
        x_t = rearrange(x, "B (T S) C -> (B S) T C", T=self.d_t, S=self.d_s)
        if tpe is not None:
            x_t = x_t + tpe
        x_t = self.attn_temp(x_t)
        x_t = rearrange(x_t, "(B S) T C -> B (T S) C", T=self.d_t, S=self.d_s)
        x = x + self.drop_path(gate_msa * x_t)

        # cross attn
        x = x + self.cross_attn(x, y, mask)

        # mlp
        x_m = t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)
        if x_mask is not None:
            x_m_zero = t2i_modulate(self.norm2(x), shift_mlp_zero, scale_mlp_zero)
            x_m = self.t_mask_select(x_m, x_m_zero, x_mask)

        x_mlp = self.mlp(x_m)
        if x_mask is not None:
            x_mlp_zero = gate_mlp_zero * x_mlp
            x_mlp = gate_mlp * x_mlp
            x_mlp = self.t_mask_select(x_mlp, x_mlp_zero, x_mask)
        else:
            x_mlp = gate_mlp * x_mlp
        x = x + self.drop_path(x_mlp)

        return x
@MODELS.register_module()
class STDiT(nn.Module):
    """
    Spatial-temporal diffusion transformer built from STDiTBlocks.

    Video latents are patch-embedded, given a spatial positional embedding,
    processed by ``depth`` blocks (the temporal positional embedding is passed
    to the first block only) and un-patchified. Optionally shards the token
    sequence across a sequence-parallel process group.
    """

    def __init__(
        self,
        input_size=(1, 32, 32),
        in_channels=4,
        patch_size=(1, 2, 2),
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        pred_sigma=True,
        drop_path=0.0,
        no_temporal_pos_emb=False,
        caption_channels=4096,
        model_max_length=120,
        dtype=torch.float32,
        space_scale=1.0,
        time_scale=1.0,
        freeze=None,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
    ):
        super().__init__()
        self.pred_sigma = pred_sigma
        self.in_channels = in_channels
        # When pred_sigma is set the output carries twice as many channels.
        self.out_channels = in_channels * 2 if pred_sigma else in_channels
        self.hidden_size = hidden_size
        self.patch_size = patch_size
        self.input_size = input_size
        # Token counts derived from the (T, H, W) input size and patch size.
        num_patches = np.prod([input_size[i] // patch_size[i] for i in range(3)])
        self.num_patches = num_patches
        self.num_temporal = input_size[0] // patch_size[0]
        self.num_spatial = num_patches // self.num_temporal
        self.num_heads = num_heads
        self.dtype = dtype
        # NOTE(review): stored but not read by any method of this class.
        self.no_temporal_pos_emb = no_temporal_pos_emb
        self.depth = depth
        self.mlp_ratio = mlp_ratio
        self.enable_flash_attn = enable_flash_attn
        self.enable_layernorm_kernel = enable_layernorm_kernel
        self.space_scale = space_scale
        self.time_scale = time_scale

        # Fixed (non-trainable) positional embeddings for the configured input size.
        self.register_buffer("pos_embed", self.get_spatial_pos_embed())
        self.register_buffer("pos_embed_temporal", self.get_temporal_pos_embed())

        self.x_embedder = PatchEmbed3D(patch_size, in_channels, hidden_size)
        self.t_embedder = TimestepEmbedder(hidden_size)
        # Produces the 6 * hidden_size modulation vector consumed by every block.
        self.t_block = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True))
        self.y_embedder = CaptionEmbedder(
            in_channels=caption_channels,
            hidden_size=hidden_size,
            uncond_prob=class_dropout_prob,
            act_layer=approx_gelu,
            token_num=model_max_length,
        )

        # Stochastic-depth rate increases linearly from 0 to drop_path over the blocks.
        drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)]
        self.blocks = nn.ModuleList(
            [
                STDiTBlock(
                    self.hidden_size,
                    self.num_heads,
                    mlp_ratio=self.mlp_ratio,
                    drop_path=drop_path[i],
                    enable_flash_attn=self.enable_flash_attn,
                    enable_layernorm_kernel=self.enable_layernorm_kernel,
                    enable_sequence_parallelism=enable_sequence_parallelism,
                    d_t=self.num_temporal,
                    d_s=self.num_spatial,
                )
                for i in range(self.depth)
            ]
        )
        self.final_layer = T2IFinalLayer(
            hidden_size,
            np.prod(self.patch_size),
            self.out_channels,
            d_t=self.num_temporal,
            d_s=self.num_spatial,
        )

        # init model
        self.initialize_weights()
        self.initialize_temporal()
        if freeze is not None:
            assert freeze in ["not_temporal", "text"]
            if freeze == "not_temporal":
                self.freeze_not_temporal()
            elif freeze == "text":
                self.freeze_text()

        # sequence parallel related configs
        self.enable_sequence_parallelism = enable_sequence_parallelism
        if enable_sequence_parallelism:
            self.sp_rank = dist.get_rank(get_sequence_parallel_group())
        else:
            self.sp_rank = None

    def forward(self, x, timestep, y, mask=None, x_mask=None, **kwargs):
        """
        Forward pass of STDiT.
        Args:
            x (torch.Tensor): latent representation of video; of shape [B, C, T, H, W]
            timestep (torch.Tensor): diffusion time steps; of shape [B]
            y (torch.Tensor): representation of prompts; of shape [B, 1, N_token, C]
            mask (torch.Tensor): mask for selecting prompt tokens; of shape [B, N_token]
            x_mask (torch.Tensor): optional per-frame mask of shape [B, T]; frames where it
                is False are modulated with the embedding of timestep 0 instead
            **kwargs: accepted and ignored

        Returns:
            x (torch.Tensor): output latent representation; of shape [B, C_out, T, H, W], float32
        """

        # Run in whatever dtype the patch embedding weights currently use.
        dtype = self.x_embedder.proj.weight.dtype
        x = x.to(dtype)
        timestep = timestep.to(dtype)
        y = y.to(dtype)

        # embedding
        x = self.x_embedder(x)  # [B, N, C]
        x = rearrange(x, "B (T S) C -> B T S C", T=self.num_temporal, S=self.num_spatial)
        x = x + self.pos_embed
        x = rearrange(x, "B T S C -> B (T S) C")

        # shard over the sequence dim if sp is enabled
        if self.enable_sequence_parallelism:
            x = split_forward_gather_backward(x, get_sequence_parallel_group(), dim=1, grad_scale="down")

        t = self.t_embedder(timestep, dtype=x.dtype)  # [B, C]
        t_mlp = self.t_block(t)  # [B, 6 * C]
        if x_mask is not None:
            # Embedding of timestep 0, used for frames masked out by x_mask.
            t0_timestep = torch.zeros_like(timestep)
            t0 = self.t_embedder(t0_timestep, dtype=x.dtype)
            t0_mlp = self.t_block(t0)
        else:
            t0 = None
            t0_mlp = None
        y = self.y_embedder(y, self.training)  # [B, 1, N_token, C]

        if mask is not None:
            if mask.shape[0] != y.shape[0]:
                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
            mask = mask.squeeze(1).squeeze(1)
            # Keep only unmasked prompt tokens, packed into one sequence;
            # y_lens holds the per-sample token counts.
            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
            y_lens = mask.sum(dim=1).tolist()
        else:
            y_lens = [y.shape[2]] * y.shape[0]
            y = y.squeeze(1).view(1, -1, x.shape[-1])

        # blocks
        for i, block in enumerate(self.blocks):
            # Only the first block receives the temporal positional embedding.
            if i == 0:
                if self.enable_sequence_parallelism:
                    # Each rank takes the slice matching its shard of the sequence.
                    tpe = torch.chunk(
                        self.pos_embed_temporal, dist.get_world_size(get_sequence_parallel_group()), dim=1
                    )[self.sp_rank].contiguous()
                else:
                    tpe = self.pos_embed_temporal
            else:
                tpe = None
            x = auto_grad_checkpoint(block, x, y, t_mlp, y_lens, tpe, x_mask, t0_mlp)

        if self.enable_sequence_parallelism:
            x = gather_forward_split_backward(x, get_sequence_parallel_group(), dim=1, grad_scale="up")
        # x.shape: [B, N, C]

        # final process
        x = self.final_layer(x, t, x_mask, t0)  # [B, N, C=T_p * H_p * W_p * C_out]
        x = self.unpatchify(x)  # [B, C_out, T, H, W]

        # cast to float32 for better accuracy
        x = x.to(torch.float32)
        return x

    def unpatchify(self, x):
        """
        Args:
            x (torch.Tensor): of shape [B, N, C]

        Return:
            x (torch.Tensor): of shape [B, C_out, T, H, W]
        """

        # Grid size comes from the construction-time input_size.
        N_t, N_h, N_w = [self.input_size[i] // self.patch_size[i] for i in range(3)]
        T_p, H_p, W_p = self.patch_size
        x = rearrange(
            x,
            "B (N_t N_h N_w) (T_p H_p W_p C_out) -> B C_out (N_t T_p) (N_h H_p) (N_w W_p)",
            N_t=N_t,
            N_h=N_h,
            N_w=N_w,
            T_p=T_p,
            H_p=H_p,
            W_p=W_p,
            C_out=self.out_channels,
        )
        return x

    def unpatchify_old(self, x):
        """Reshape-based variant of unpatchify; not called from within this class."""
        c = self.out_channels
        t, h, w = [self.input_size[i] // self.patch_size[i] for i in range(3)]
        pt, ph, pw = self.patch_size

        x = x.reshape(shape=(x.shape[0], t, h, w, pt, ph, pw, c))
        x = rearrange(x, "n t h w r p q c -> n c t r h p w q")
        imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw))
        return imgs

    def get_spatial_pos_embed(self, grid_size=None):
        """Build the 2D sin-cos positional embedding for ``grid_size`` (H, W).

        Defaults to the spatial part of ``input_size``; returns a float tensor
        with a leading batch dimension of 1 that does not require grad.
        """
        if grid_size is None:
            grid_size = self.input_size[1:]
        pos_embed = get_2d_sincos_pos_embed(
            self.hidden_size,
            (grid_size[0] // self.patch_size[1], grid_size[1] // self.patch_size[2]),
            scale=self.space_scale,
        )
        pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
        return pos_embed

    def get_temporal_pos_embed(self):
        """Build the 1D sin-cos positional embedding over the temporal patches."""
        pos_embed = get_1d_sincos_pos_embed(
            self.hidden_size,
            self.input_size[0] // self.patch_size[0],
            scale=self.time_scale,
        )
        pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
        return pos_embed

    def freeze_not_temporal(self):
        """Disable gradients for every parameter whose name lacks "attn_temp"."""
        for n, p in self.named_parameters():
            if "attn_temp" not in n:
                p.requires_grad = False

    def freeze_text(self):
        """Disable gradients for every parameter whose name contains "cross_attn"."""
        for n, p in self.named_parameters():
            if "cross_attn" in n:
                p.requires_grad = False

    def initialize_temporal(self):
        """Zero the temporal-attention output projection of every block."""
        for block in self.blocks:
            nn.init.constant_(block.attn_temp.proj.weight, 0)
            nn.init.constant_(block.attn_temp.proj.bias, 0)

    def initialize_weights(self):
        """Initialise all weights; cross-attention and final projections start at zero."""

        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.t_block[1].weight, std=0.02)

        # Initialize caption embedding MLP:
        nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02)
        nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02)

        # Zero-out the cross-attention output projection in every block, so the
        # cross-attention branch contributes nothing at initialisation:
        for block in self.blocks:
            nn.init.constant_(block.cross_attn.proj.weight, 0)
            nn.init.constant_(block.cross_attn.proj.bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)
@MODELS.register_module("STDiT-XL/2")
def STDiT_XL_2(from_pretrained=None, **kwargs):
    """Build the STDiT-XL/2 configuration, optionally loading a checkpoint."""
    net = STDiT(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
    if from_pretrained is None:
        return net
    load_checkpoint(net, from_pretrained)
    return net
class STDiT2Block(nn.Module):
    """Transformer block with spatial attention, temporal attention, cross-attention and an MLP.

    The two self-attention branches and the MLP are modulated (shift/scale)
    before the sub-layer and gated afterwards, using per-sample conditioning
    added to learned tables.  When a frame mask ``x_mask`` is given, frames
    whose mask entry is False use the ``t0``/``t0_tmp`` conditioning instead of
    the ``t``/``t_tmp`` conditioning.
    """

    def __init__(
        self,
        hidden_size,
        num_heads,
        mlp_ratio=4.0,
        drop_path=0.0,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
        rope=None,
        qk_norm=False,
        qk_norm_legacy=False,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.enable_flash_attn = enable_flash_attn
        self._enable_sequence_parallelism = enable_sequence_parallelism

        # Settings shared by all three layer norms and by both self-attention layers.
        norm_kwargs = dict(eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        attn_kwargs = dict(
            num_heads=num_heads,
            qkv_bias=True,
            enable_flash_attn=enable_flash_attn,
            qk_norm=qk_norm,
            qk_norm_legacy=qk_norm_legacy,
        )
        table_scale = hidden_size**0.5

        # NOTE: construction order is kept as-is so that random initialisation
        # draws and parameter registration order stay the same.

        # spatial branch
        self.norm1 = get_layernorm(hidden_size, **norm_kwargs)
        self.attn = Attention(hidden_size, **attn_kwargs)
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / table_scale)

        # cross attention
        self.cross_attn = MultiHeadCrossAttention(hidden_size, num_heads)

        # MLP branch
        self.norm2 = get_layernorm(hidden_size, **norm_kwargs)
        self.mlp = Mlp(
            in_features=hidden_size,
            hidden_features=int(hidden_size * mlp_ratio),
            act_layer=approx_gelu,
            drop=0,
        )
        if drop_path > 0.0:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

        # temporal branch (the only attention layer that receives ``rope``)
        self.norm_temp = get_layernorm(hidden_size, **norm_kwargs)
        self.attn_temp = Attention(hidden_size, rope=rope, **attn_kwargs)
        self.scale_shift_table_temporal = nn.Parameter(torch.randn(3, hidden_size) / table_scale)

    def t_mask_select(self, x_mask, x, masked_x, T, S):
        """Pick ``x`` for frames where ``x_mask`` is True and ``masked_x`` elsewhere.

        x:        [B, (T S), C]
        masked_x: [B, (T S), C]
        x_mask:   [B, T]
        """
        kept = rearrange(x, "B (T S) C -> B T S C", T=T, S=S)
        replaced = rearrange(masked_x, "B (T S) C -> B T S C", T=T, S=S)
        frame_mask = x_mask[:, :, None, None]  # broadcast over the S and C axes
        merged = torch.where(frame_mask, kept, replaced)
        return rearrange(merged, "B T S C -> B (T S) C")

    def _modulate(self, norm, x, shift, scale, shift_zero, scale_zero, x_mask, T, S):
        """Normalise and shift/scale ``x``; with a frame mask, mix in the ``*_zero`` variant."""
        modulated = t2i_modulate(norm(x), shift, scale)
        if x_mask is None:
            return modulated
        modulated_zero = t2i_modulate(norm(x), shift_zero, scale_zero)
        return self.t_mask_select(x_mask, modulated, modulated_zero, T, S)

    def _gate(self, branch_out, gate, gate_zero, x_mask, T, S):
        """Scale a branch output by its gate; with a frame mask, mix in the ``gate_zero`` variant."""
        if x_mask is None:
            return gate * branch_out
        gated_zero = gate_zero * branch_out
        gated = gate * branch_out
        return self.t_mask_select(x_mask, gated, gated_zero, T, S)

    def forward(self, x, y, t, t_tmp, mask=None, x_mask=None, t0=None, t0_tmp=None, T=None, S=None):
        """Run the block on ``x`` of shape [B, (T S), C].

        Args:
            x: token sequence, [B, (T S), C].
            y: conditioning passed to the cross-attention layer.
            t: spatial/MLP conditioning, reshaped here to [B, 6, C].
            t_tmp: temporal conditioning, reshaped here to [B, 3, C].
            mask: forwarded unchanged to the cross-attention layer.
            x_mask: optional [B, T] boolean frame mask.
            t0, t0_tmp: conditioning used for frames where ``x_mask`` is False;
                only read when ``x_mask`` is given.
            T, S: number of frames and of tokens per frame.

        Returns:
            Tensor with the same layout as ``x``.
        """
        B, _, _ = x.shape
        spatial_table = self.scale_shift_table[None]
        temporal_table = self.scale_shift_table_temporal[None]

        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
            spatial_table + t.reshape(B, 6, -1)
        ).chunk(6, dim=1)
        shift_tmp, scale_tmp, gate_tmp = (temporal_table + t_tmp.reshape(B, 3, -1)).chunk(3, dim=1)

        if x_mask is not None:
            shift_msa_zero, scale_msa_zero, gate_msa_zero, shift_mlp_zero, scale_mlp_zero, gate_mlp_zero = (
                spatial_table + t0.reshape(B, 6, -1)
            ).chunk(6, dim=1)
            shift_tmp_zero, scale_tmp_zero, gate_tmp_zero = (
                temporal_table + t0_tmp.reshape(B, 3, -1)
            ).chunk(3, dim=1)
        else:
            # Placeholders only; the helpers never read them without a frame mask.
            shift_msa_zero = scale_msa_zero = gate_msa_zero = None
            shift_mlp_zero = scale_mlp_zero = gate_mlp_zero = None
            shift_tmp_zero = scale_tmp_zero = gate_tmp_zero = None

        # spatial branch: attention over the S axis, one sequence per (batch, frame)
        h = self._modulate(self.norm1, x, shift_msa, scale_msa, shift_msa_zero, scale_msa_zero, x_mask, T, S)
        h = rearrange(h, "B (T S) C -> (B T) S C", T=T, S=S)
        h = self.attn(h)
        h = rearrange(h, "(B T) S C -> B (T S) C", T=T, S=S)
        x = x + self.drop_path(self._gate(h, gate_msa, gate_msa_zero, x_mask, T, S))

        # temporal branch: attention over the T axis, one sequence per (batch, position)
        h = self._modulate(self.norm_temp, x, shift_tmp, scale_tmp, shift_tmp_zero, scale_tmp_zero, x_mask, T, S)
        h = rearrange(h, "B (T S) C -> (B S) T C", T=T, S=S)
        h = self.attn_temp(h)
        h = rearrange(h, "(B S) T C -> B (T S) C", T=T, S=S)
        x = x + self.drop_path(self._gate(h, gate_tmp, gate_tmp_zero, x_mask, T, S))

        # cross attention (no modulation, gate or drop-path)
        x = x + self.cross_attn(x, y, mask)

        # MLP branch
        h = self._modulate(self.norm2, x, shift_mlp, scale_mlp, shift_mlp_zero, scale_mlp_zero, x_mask, T, S)
        h = self.mlp(h)
        x = x + self.drop_path(self._gate(h, gate_mlp, gate_mlp_zero, x_mask, T, S))

        return x
self.enable_flash_attn = enable_flash_attn self.enable_layernorm_kernel = enable_layernorm_kernel super().__init__(**kwargs) @MODELS.register_module() class STDiT2(PreTrainedModel): config_class = STDiT2Config def __init__(self, config): super().__init__(config) self.pred_sigma = config.pred_sigma self.in_channels = config.in_channels self.out_channels = config.in_channels * 2 if config.pred_sigma else config.in_channels self.hidden_size = config.hidden_size self.num_heads = config.num_heads self.no_temporal_pos_emb = config.no_temporal_pos_emb self.depth = config.depth self.mlp_ratio = config.mlp_ratio self.enable_flash_attn = config.enable_flash_attn self.enable_layernorm_kernel = config.enable_layernorm_kernel # support dynamic input self.patch_size = config.patch_size self.input_size = config.input_size self.input_sq_size = config.input_sq_size self.pos_embed = PositionEmbedding2D(config.hidden_size) self.x_embedder = PatchEmbed3D(config.patch_size, config.in_channels, config.hidden_size) self.t_embedder = TimestepEmbedder(config.hidden_size) self.t_block = nn.Sequential(nn.SiLU(), nn.Linear(config.hidden_size, 6 * config.hidden_size, bias=True)) self.t_block_temp = nn.Sequential( nn.SiLU(), nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=True) ) # new self.y_embedder = CaptionEmbedder( in_channels=config.caption_channels, hidden_size=config.hidden_size, uncond_prob=config.class_dropout_prob, act_layer=approx_gelu, token_num=config.model_max_length, ) drop_path = [x.item() for x in torch.linspace(0, config.drop_path, config.depth)] self.rope = RotaryEmbedding(dim=self.hidden_size // self.num_heads) # new self.blocks = nn.ModuleList( [ STDiT2Block( self.hidden_size, self.num_heads, mlp_ratio=self.mlp_ratio, drop_path=drop_path[i], enable_flash_attn=self.enable_flash_attn, enable_layernorm_kernel=self.enable_layernorm_kernel, rope=self.rope.rotate_queries_or_keys, qk_norm=config.qk_norm, qk_norm_legacy=config.qk_norm_legacy, ) for i in 
def get_dynamic_size(self, x):
    """Return the patch-grid size ``(T, H, W)`` for a 5-D input ``x``.

    Each of the last three extents of ``x`` is divided by the matching entry of
    ``self.patch_size``, rounding up, i.e. the extent is treated as if it were
    padded to the next multiple of the patch size.
    """
    _, _, frames, height, width = x.size()
    extents = (frames, height, width)
    # -(-a // b) is ceiling division for non-negative a and positive b.
    return tuple(-(-extent // patch) for extent, patch in zip(extents, self.patch_size))
def unpatchify(self, x, N_t, N_h, N_w, R_t, R_h, R_w):
    """Fold a patch-token sequence back into a video tensor and crop it.

    Args:
        x (torch.Tensor): of shape [B, N, C], with N = N_t * N_h * N_w and
            C = T_p * H_p * W_p * C_out.
        N_t, N_h, N_w: number of patches along time, height and width.
        R_t, R_h, R_w: extents to keep along time, height and width; anything
            beyond them is sliced away.

    Return:
        x (torch.Tensor): of shape [B, C_out, T, H, W]
    """
    patch_t, patch_h, patch_w = self.patch_size
    axis_sizes = dict(
        N_t=N_t,
        N_h=N_h,
        N_w=N_w,
        T_p=patch_t,
        H_p=patch_h,
        W_p=patch_w,
        C_out=self.out_channels,
    )
    video = rearrange(
        x,
        "B (N_t N_h N_w) (T_p H_p W_p C_out) -> B C_out (N_t T_p) (N_h H_p) (N_w W_p)",
        **axis_sizes,
    )
    # unpad
    return video[:, :, :R_t, :R_h, :R_w]
@MODELS.register_module("STDiT2-XL/2")
def STDiT2_XL_2(from_pretrained=None, **kwargs):
    """Create the STDiT2-XL/2 model.

    * ``from_pretrained`` is ``None``: build a new model from the XL/2 config.
    * ``from_pretrained`` is an existing directory or file: build the model and
      load the checkpoint manually with ``load_checkpoint``.
    * otherwise: delegate to ``STDiT2.from_pretrained`` (Hugging Face hub);
      in this case ``kwargs`` are not used.
    """
    has_source = from_pretrained is not None
    is_local = has_source and (os.path.isdir(from_pretrained) or os.path.isfile(from_pretrained))

    if has_source and not is_local:
        # not a local path: load the model from the hugging face hub
        return STDiT2.from_pretrained(from_pretrained)

    config = STDiT2Config(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
    model = STDiT2(config)
    if is_local:
        load_checkpoint(model, from_pretrained)
    return model
class STDiT3Block(nn.Module):
    """STDiT3 transformer block (self-attention, cross-attention, MLP) with token caching.

    A block is either spatial (attention over the S axis) or temporal
    (attention over the T axis), selected by ``temporal``.  ``forward`` stores
    sub-layer outputs in ``cache_dic`` and, on steps that are not force-fresh,
    runs cross-attention and the MLP only on the tokens returned by
    ``cache_cutfresh``.
    """

    def __init__(
        self,
        hidden_size,
        num_heads,
        mlp_ratio=4.0,
        drop_path=0.0,
        rope=None,
        qk_norm=False,
        temporal=False,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
    ):
        super().__init__()
        self.temporal = temporal
        self.hidden_size = hidden_size
        self.enable_flash_attn = enable_flash_attn
        self.enable_sequence_parallelism = enable_sequence_parallelism

        # Sequence-parallel layer variants are only used for non-temporal blocks.
        if self.enable_sequence_parallelism and not temporal:
            attn_cls = SeqParallelAttention
            mha_cls = SeqParallelMultiHeadCrossAttention
        else:
            attn_cls = Attention
            mha_cls = MultiHeadCrossAttention

        self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        self.attn = attn_cls(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            qk_norm=qk_norm,
            rope=rope,
            enable_flash_attn=enable_flash_attn,
        )
        self.cross_attn = mha_cls(hidden_size, num_heads)
        self.norm2 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        self.mlp = Mlp(
            in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, drop=0
        )
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        # Learned shift/scale/gate table: 3 rows for attention, 3 rows for the MLP.
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5)

    def t_mask_select(self, x_mask, x, masked_x, T, S):
        """Return ``x`` for frames where ``x_mask`` is True and ``masked_x`` elsewhere."""
        # x: [B, (T, S), C]
        # masked_x: [B, (T, S), C]
        # x_mask: [B, T]
        x = rearrange(x, "B (T S) C -> B T S C", T=T, S=S)
        masked_x = rearrange(masked_x, "B (T S) C -> B T S C", T=T, S=S)
        x = torch.where(x_mask[:, :, None, None], x, masked_x)
        x = rearrange(x, "B T S C -> B (T S) C")
        return x

    def forward(
        self,
        x,
        y,
        t,
        current,
        cache_dic,
        mask=None,  # text mask
        x_mask=None,  # temporal mask
        t0=None,  # t with timestamp=0
        T=None,  # number of frames
        S=None,  # number of pixel patches
    ):
        """Apply the block to ``x`` ([B, (T S), C]), reading and updating the cache.

        Args:
            x: token sequence of shape [B, (T S), C].
            y: conditioning passed to the cross-attention layer.
            t: conditioning, reshaped here to [B, 6, C].
            current: dict describing the current position; this method reads
                ``current['layer']`` and writes ``current['flag']`` (-1 for
                temporal blocks, 0 otherwise), ``current['is_force_fresh']``
                and ``current['module']`` ('attn', 'cross-attn', 'mlp').
            cache_dic: dict holding ``cache_dic['cache'][flag][layer][module]``
                and ``cache_dic['cross_attn_map'][flag][layer]``.
            mask: forwarded to the cross-attention layer.
            x_mask: optional [B, T] frame mask; frames where it is False use
                the ``t0``-derived modulation.
            t0: conditioning used for masked frames; only read when ``x_mask``
                is given.
            T, S: number of frames and of tokens per frame.

        Returns:
            Tensor with the same layout as ``x``.
        """
        # prepare modulate parameters
        B, N, C = x.shape
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
            self.scale_shift_table[None] + t.reshape(B, 6, -1)
        ).chunk(6, dim=1)
        if x_mask is not None:
            shift_msa_zero, scale_msa_zero, gate_msa_zero, shift_mlp_zero, scale_mlp_zero, gate_mlp_zero = (
                self.scale_shift_table[None] + t0.reshape(B, 6, -1)
            ).chunk(6, dim=1)
        #attn_tick = torch.cuda.Event(enable_timing=True)
        #cross_attn_tick = torch.cuda.Event(enable_timing=True)
        #end_cross_attn_tick = torch.cuda.Event(enable_timing=True)
        #mlp_tick = torch.cuda.Event(enable_timing=True)
        #end = torch.cuda.Event(enable_timing=True)

        # The flag indexes the first level of the cache: -1 for temporal, 0 for spatial blocks.
        if self.temporal:
            current['flag'] = -1
        else:
            current['flag'] = 0

        # presumably decides whether this step recomputes every token; semantics
        # live in cache_functions.global_force_fresh -- TODO confirm
        is_force_fresh = global_force_fresh(cache_dic, current)
        current['is_force_fresh'] = is_force_fresh
        #print(is_force_fresh)
        if is_force_fresh:
            # ---- full computation path: every sub-layer runs on all tokens ----
            # modulate (attention)
            current['module'] = 'attn'
            #attn_tick.record()
            x_m = t2i_modulate(self.norm1(x), shift_msa, scale_msa)
            if x_mask is not None:
                x_m_zero = t2i_modulate(self.norm1(x), shift_msa_zero, scale_msa_zero)
                x_m = self.t_mask_select(x_mask, x_m, x_m_zero, T, S)

            # attention
            if self.temporal:
                x_m = rearrange(x_m, "B (T S) C -> (B S) T C", T=T, S=S)
                x_m = self.attn(x_m)
                x_m = rearrange(x_m, "(B S) T C -> B (T S) C", T=T, S=S)
            else:
                x_m = rearrange(x_m, "B (T S) C -> (B T) S C", T=T, S=S)
                x_m = self.attn(x_m)
                x_m = rearrange(x_m, "(B T) S C -> B (T S) C", T=T, S=S)
            # The ungated attention output is what gets cached.
            cache_dic['cache'][current['flag']][current['layer']][current['module']] = x_m
            force_init(cache_dic, current, x)
            # modulate (attention)
            x_m_s = gate_msa * x_m
            if x_mask is not None:
                x_m_s_zero = gate_msa_zero * x_m
                x_m_s = self.t_mask_select(x_mask, x_m_s, x_m_s_zero, T, S)

            # residual
            x = x + self.drop_path(x_m_s)

            # cross attention
            current['module'] = 'cross-attn'
            #cross_attn_tick.record()
            # Here cross_attn is unpacked as (output, attention map); both are stored.
            cache_dic['cache'][current['flag']][current['layer']][current['module']], cache_dic['cross_attn_map'][current['flag']][current['layer']] = self.cross_attn(x, y, mask)
            force_init(cache_dic, current, x)
            x = x + cache_dic['cache'][current['flag']][current['layer']][current['module']]

            # modulate (MLP)
            current['module'] = 'mlp'
            #mlp_tick.record()
            x_m = t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)
            if x_mask is not None:
                x_m_zero = t2i_modulate(self.norm2(x), shift_mlp_zero, scale_mlp_zero)
                x_m = self.t_mask_select(x_mask, x_m, x_m_zero, T, S)

            # MLP
            x_m = self.mlp(x_m)
            # The ungated MLP output is what gets cached.
            cache_dic['cache'][current['flag']][current['layer']][current['module']] = x_m
            # modulate (MLP)
            x_m_s = gate_mlp * x_m
            if x_mask is not None:
                x_m_s_zero = gate_mlp_zero * x_m
                x_m_s = self.t_mask_select(x_mask, x_m_s, x_m_s_zero, T, S)

            # residual
            force_init(cache_dic, current, x)
            x = x + self.drop_path(x_m_s)
            #end.record()
            #torch.cuda.synchronize()
            #print(attn_tick.elapsed_time(cross_attn_tick),cross_attn_tick.elapsed_time(mlp_tick),mlp_tick.elapsed_time(end))

        else:
            # ---- cached path: cross-attention and MLP run on a token subset ----
            # modulate (attention)
            current['module'] = 'attn'
            #attn_tick.record()
            #cal_attn = current['step'] % cache_dic['cal_threshold'] == 1
            # Hard-coded True: self-attention is always recomputed on this path.
            cal_attn = True
            if cal_attn:
                x_m = t2i_modulate(self.norm1(x), shift_msa, scale_msa)
                if x_mask is not None:
                    x_m_zero = t2i_modulate(self.norm1(x), shift_msa_zero, scale_msa_zero)
                    x_m = self.t_mask_select(x_mask, x_m, x_m_zero, T, S)

                # attention
                if self.temporal:
                    x_m = rearrange(x_m, "B (T S) C -> (B S) T C", T=T, S=S)
                    x_m = self.attn(x_m)
                    x_m = rearrange(x_m, "(B S) T C -> B (T S) C", T=T, S=S)
                else:
                    x_m = rearrange(x_m, "B (T S) C -> (B T) S C", T=T, S=S)
                    x_m = self.attn(x_m)
                    x_m = rearrange(x_m, "(B T) S C -> B (T S) C", T=T, S=S)
                cache_dic['cache'][current['flag']][current['layer']][current['module']] = x_m
            x_m = cache_dic['cache'][current['flag']][current['layer']][current['module']]
            # modulate (attention)
            x_m_s = gate_msa * x_m
            if x_mask is not None:
                x_m_s_zero = gate_msa_zero * x_m
                x_m_s = self.t_mask_select(x_mask, x_m_s, x_m_s_zero, T, S)

            # residual
            x = x + self.drop_path(x_m_s)

            # cross attention
            current['module'] = 'cross-attn'
            #cache_dic['cache'][flag][current['layer']][current['module']] = self.cross_attn(x, y, mask)
            #x = x + cache_dic['cache'][flag][current['layer']][current['module']]
            # presumably selects the tokens to recompute and their indices -- TODO confirm in cache_cutfresh
            fresh_indices, fresh_tokens = cache_cutfresh(cache_dic, x, current) # 0.6ms
            fresh_tokens, fresh_cross_attn_map = self.cross_attn(fresh_tokens, y, mask) # 0.45ms
            #cross_attn_tick.record()
            # presumably writes the recomputed tokens back into the cache entry read below -- TODO confirm
            update_cache(fresh_indices, fresh_tokens=fresh_tokens, cache_dic=cache_dic, current=current, fresh_attn_map=fresh_cross_attn_map) # 0.3ms
            #cache_dic['cache'][-1][current['layer']][current['module']] = self.cross_attn(x, y, mask)
            x = x + cache_dic['cache'][current['flag']][current['layer']][current['module']]

            # modulate (MLP)
            current['module'] = 'mlp'
            #mlp_tick.record()
            x_m = t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)
            if x_mask is not None:
                x_m_zero = t2i_modulate(self.norm2(x), shift_mlp_zero, scale_mlp_zero)
                x_m = self.t_mask_select(x_mask, x_m, x_m_zero, T, S)

            # MLP
            fresh_indices, fresh_tokens = cache_cutfresh(cache_dic, x_m, current)
            fresh_tokens = self.mlp(fresh_tokens)
            # NOTE(review): the attention map passed here is the one produced by the
            # cross-attention step above, not something derived from the MLP -- confirm intended.
            update_cache(fresh_indices, fresh_tokens=fresh_tokens, cache_dic=cache_dic, current=current, fresh_attn_map=fresh_cross_attn_map)
            x_m = cache_dic['cache'][current['flag']][current['layer']][current['module']]
            # modulate (MLP)
            x_m_s = gate_mlp * x_m
            if x_mask is not None:
                x_m_s_zero = gate_mlp_zero * x_m
                x_m_s = self.t_mask_select(x_mask, x_m_s, x_m_s_zero, T, S)

            # residual
            x = x + self.drop_path(x_m_s)
            #end.record()
            #torch.cuda.synchronize()
            #print("Cached:",attn_tick.elapsed_time(cross_attn_tick),cross_attn_tick.elapsed_time(mlp_tick),mlp_tick.elapsed_time(end))
            #print(cross_attn_tick.elapsed_time(end_cross_attn_tick))
        return x
self.enable_layernorm_kernel = config.enable_layernorm_kernel self.enable_sequence_parallelism = config.enable_sequence_parallelism # input size related self.patch_size = config.patch_size self.input_sq_size = config.input_sq_size self.pos_embed = PositionEmbedding2D(config.hidden_size) self.rope = RotaryEmbedding(dim=self.hidden_size // self.num_heads) # embedding self.x_embedder = PatchEmbed3D(config.patch_size, config.in_channels, config.hidden_size) self.t_embedder = TimestepEmbedder(config.hidden_size) self.fps_embedder = SizeEmbedder(self.hidden_size) self.t_block = nn.Sequential( nn.SiLU(), nn.Linear(config.hidden_size, 6 * config.hidden_size, bias=True), ) self.y_embedder = CaptionEmbedder( in_channels=config.caption_channels, hidden_size=config.hidden_size, uncond_prob=config.class_dropout_prob, act_layer=approx_gelu, token_num=config.model_max_length, ) # spatial blocks drop_path = [x.item() for x in torch.linspace(0, self.drop_path, config.depth)] self.spatial_blocks = nn.ModuleList( [ STDiT3Block( hidden_size=config.hidden_size, num_heads=config.num_heads, mlp_ratio=config.mlp_ratio, drop_path=drop_path[i], qk_norm=config.qk_norm, enable_flash_attn=config.enable_flash_attn, enable_layernorm_kernel=config.enable_layernorm_kernel, enable_sequence_parallelism=config.enable_sequence_parallelism, ) for i in range(config.depth) ] ) # temporal blocks drop_path = [x.item() for x in torch.linspace(0, self.drop_path, config.depth)] self.temporal_blocks = nn.ModuleList( [ STDiT3Block( hidden_size=config.hidden_size, num_heads=config.num_heads, mlp_ratio=config.mlp_ratio, drop_path=drop_path[i], qk_norm=config.qk_norm, enable_flash_attn=config.enable_flash_attn, enable_layernorm_kernel=config.enable_layernorm_kernel, enable_sequence_parallelism=config.enable_sequence_parallelism, # temporal temporal=True, rope=self.rope.rotate_queries_or_keys, ) for i in range(config.depth) ] ) # final layer self.final_layer = T2IFinalLayer(config.hidden_size, 
def encode_text(self, y, mask=None):
    """Embed the prompt and pack all samples into one token sequence.

    Args:
        y: prompt features passed to ``self.y_embedder`` (its output is
            indexed here as [B, 1, N_token, C]).
        mask: optional token mask; when its first dimension differs from the
            batch size of the embedded prompt it is tiled along dim 0 to match.

    Returns:
        A pair ``(y, y_lens)``: ``y`` has shape [1, total_tokens, hidden_size]
        and ``y_lens`` lists the number of tokens contributed by each sample
        (per-row mask sums with a mask, ``N_token`` for every sample without).
    """
    embedded = self.y_embedder(y, self.training)  # [B, 1, N_token, C]
    batch = embedded.shape[0]

    if mask is None:
        # No mask: every sample keeps all of its tokens.
        tokens_per_sample = embedded.shape[2]
        packed = embedded.squeeze(1).view(1, -1, self.hidden_size)
        return packed, [tokens_per_sample] * batch

    if mask.shape[0] != batch:
        mask = mask.repeat(batch // mask.shape[0], 1)
    mask = mask.squeeze(1).squeeze(1)

    keep = mask.unsqueeze(-1) != 0  # broadcast over the channel axis
    packed = embedded.squeeze(1).masked_select(keep).view(1, -1, self.hidden_size)
    return packed, mask.sum(dim=1).tolist()
width=None, cache_dic=None, current=None, **kwargs): dtype = self.x_embedder.proj.weight.dtype B = x.size(0) x = x.to(dtype) timestep = timestep.to(dtype) y = y.to(dtype) # === get pos embed === _, _, Tx, Hx, Wx = x.size() T, H, W = self.get_dynamic_size(x) cache_dic['dynamic_size'] = (B,T,H,W) # adjust for sequence parallelism # we need to ensure H * W is divisible by sequence parallel size # for simplicity, we can adjust the height to make it divisible if self.enable_sequence_parallelism: sp_size = dist.get_world_size(get_sequence_parallel_group()) if H % sp_size != 0: h_pad_size = sp_size - H % sp_size else: h_pad_size = 0 if h_pad_size > 0: hx_pad_size = h_pad_size * self.patch_size[1] # pad x along the H dimension H += h_pad_size x = F.pad(x, (0, 0, 0, hx_pad_size)) S = H * W base_size = round(S**0.5) resolution_sq = (height[0].item() * width[0].item()) ** 0.5 scale = resolution_sq / self.input_sq_size pos_emb = self.pos_embed(x, H, W, scale=scale, base_size=base_size) # === get timestep embed === t = self.t_embedder(timestep, dtype=x.dtype) # [B, C] fps = self.fps_embedder(fps.unsqueeze(1), B) t = t + fps t_mlp = self.t_block(t) t0 = t0_mlp = None if x_mask is not None: t0_timestep = torch.zeros_like(timestep) t0 = self.t_embedder(t0_timestep, dtype=x.dtype) t0 = t0 + fps t0_mlp = self.t_block(t0) # === get y embed === if self.config.skip_y_embedder: y_lens = mask if isinstance(y_lens, torch.Tensor): y_lens = y_lens.long().tolist() else: y, y_lens = self.encode_text(y, mask) # === get x embed === x = self.x_embedder(x) # [B, N, C] x = rearrange(x, "B (T S) C -> B T S C", T=T, S=S) x = x + pos_emb # shard over the sequence dim if sp is enabled if self.enable_sequence_parallelism: x = split_forward_gather_backward(x, get_sequence_parallel_group(), dim=2, grad_scale="down") S = S // dist.get_world_size(get_sequence_parallel_group()) x = rearrange(x, "B T S C -> B (T S) C", T=T, S=S) # === blocks === for i, (spatial_block, temporal_block) in 
@MODELS.register_module("STDiT3-XL/2")
def STDiT3_XL_2(from_pretrained=None, **kwargs):
    """Create the STDiT3-XL/2 model.

    ``force_huggingface`` (popped from ``kwargs``, default ``False``) forces
    loading through ``STDiT3.from_pretrained``; that path is also taken when
    ``from_pretrained`` is given but does not exist on the local filesystem.
    Otherwise the model is built from the XL/2 config and, if
    ``from_pretrained`` is given, the checkpoint is loaded with
    ``load_checkpoint``.
    """
    force_huggingface = kwargs.pop("force_huggingface", False)

    # ``or`` short-circuits, so the filesystem is not touched when forcing the hub.
    use_hub = force_huggingface or (from_pretrained is not None and not os.path.exists(from_pretrained))
    if use_hub:
        return STDiT3.from_pretrained(from_pretrained, **kwargs)

    model = STDiT3(STDiT3Config(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs))
    if from_pretrained is not None:
        load_checkpoint(model, from_pretrained)
    return model
class STDiT3Block(nn.Module):
    """One STDiT3 transformer block (spatial or temporal) with feature caching.

    The block runs self-attention, cross-attention and an MLP, each followed by
    a residual connection.  Every sub-module either recomputes its output and
    stores it in ``cache_dic['cache']`` (when ``global_force_fresh`` marks the
    module as fresh) or reuses / partially refreshes the stored output.

    Args:
        hidden_size: channel width of the token features.
        num_heads: number of attention heads.
        mlp_ratio: hidden width of the MLP as a multiple of ``hidden_size``.
        drop_path: stochastic-depth rate; ``0`` disables it.
        rope: optional rotary-embedding callable forwarded to the attention.
        qk_norm: forwarded to the attention layer.
        temporal: if True, attention runs over the T axis, otherwise over S.
        enable_flash_attn: forwarded to the attention layer.
        enable_layernorm_kernel: forwarded to ``get_layernorm``.
        enable_sequence_parallelism: use the sequence-parallel attention
            classes (spatial blocks only).
    """

    def __init__(
        self,
        hidden_size,
        num_heads,
        mlp_ratio=4.0,
        drop_path=0.0,
        rope=None,
        qk_norm=False,
        temporal=False,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
    ):
        super().__init__()
        self.temporal = temporal
        self.hidden_size = hidden_size
        self.enable_flash_attn = enable_flash_attn
        self.enable_sequence_parallelism = enable_sequence_parallelism

        # Sequence-parallel attention is only used by spatial blocks.
        if self.enable_sequence_parallelism and not temporal:
            attn_cls = SeqParallelAttention
            mha_cls = SeqParallelMultiHeadCrossAttention
        else:
            attn_cls = Attention
            mha_cls = MultiHeadCrossAttention

        self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        self.attn = attn_cls(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            qk_norm=qk_norm,
            rope=rope,
            enable_flash_attn=enable_flash_attn,
        )
        self.cross_attn = mha_cls(hidden_size, num_heads)
        self.norm2 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        self.mlp = Mlp(
            in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, drop=0
        )
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        # Learned base for the six modulation vectors (shift/scale/gate for attention and MLP).
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5)

    def t_mask_select(self, x_mask, x, masked_x, T, S):
        """Pick ``x`` for frames where ``x_mask`` is True and ``masked_x`` elsewhere.

        Args:
            x_mask: boolean tensor of shape [B, T].
            x: tensor of shape [B, (T S), C].
            masked_x: tensor of shape [B, (T S), C].
            T: number of frames.
            S: number of spatial tokens per frame.

        Returns:
            Tensor of shape [B, (T S), C].
        """
        x = rearrange(x, "B (T S) C -> B T S C", T=T, S=S)
        masked_x = rearrange(masked_x, "B (T S) C -> B T S C", T=T, S=S)
        x = torch.where(x_mask[:, :, None, None], x, masked_x)
        x = rearrange(x, "B T S C -> B (T S) C")
        return x

    def forward(
        self,
        x,
        y,
        t,
        current,
        cache_dic,
        mask=None,  # text mask
        x_mask=None,  # temporal mask
        t0=None,  # t with timestamp=0
        T=None,  # number of frames
        S=None,  # number of pixel patches
    ):
        """Run attention, cross-attention and MLP with cache reuse.

        Args:
            x: token features of shape [B, (T S), C].
            y: conditioning passed to the cross-attention layer.
            t: modulation input, reshaped to [B, 6, -1].
            current: mutable dict describing the current position; this method
                writes ``flag``, ``is_force_fresh`` and ``module`` and reads ``layer``.
            cache_dic: mutable cache state; ``cache_dic['cache']`` and
                ``cache_dic['cross_attn_map']`` are read and written here.
            mask: text mask forwarded to the cross-attention layer.
            x_mask: optional [B, T] boolean mask; frames where it is False use
                the modulation derived from ``t0`` instead of ``t``.
            t0: modulation input for timestep 0; required when ``x_mask`` is given.
            T: number of frames.
            S: number of spatial tokens per frame.

        Returns:
            Updated token features with the same shape as ``x``.
        """
        # prepare modulate parameters
        B, _, _ = x.shape
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
            self.scale_shift_table[None] + t.reshape(B, 6, -1)
        ).chunk(6, dim=1)
        if x_mask is not None:
            shift_msa_zero, scale_msa_zero, gate_msa_zero, shift_mlp_zero, scale_mlp_zero, gate_mlp_zero = (
                self.scale_shift_table[None] + t0.reshape(B, 6, -1)
            ).chunk(6, dim=1)

        # The cache is indexed by flag: -1 for temporal blocks, 0 for spatial blocks.
        if self.temporal:
            current['flag'] = -1
        else:
            current['flag'] = 0

        is_force_fresh = global_force_fresh(cache_dic, current)
        current['is_force_fresh'] = is_force_fresh

        # ---- self-attention ----
        current['module'] = 'attn'
        if is_force_fresh[current['module']]:
            # modulate (attention)
            x_m = t2i_modulate(self.norm1(x), shift_msa, scale_msa)
            if x_mask is not None:
                x_m_zero = t2i_modulate(self.norm1(x), shift_msa_zero, scale_msa_zero)
                x_m = self.t_mask_select(x_mask, x_m, x_m_zero, T, S)

            # attention over T (temporal block) or over S (spatial block)
            if self.temporal:
                x_m = rearrange(x_m, "B (T S) C -> (B S) T C", T=T, S=S)
                x_m = self.attn(x_m)
                x_m = rearrange(x_m, "(B S) T C -> B (T S) C", T=T, S=S)
            else:
                x_m = rearrange(x_m, "B (T S) C -> (B T) S C", T=T, S=S)
                x_m = self.attn(x_m)
                x_m = rearrange(x_m, "(B T) S C -> B (T S) C", T=T, S=S)

            cache_dic['cache'][current['flag']][current['layer']][current['module']] = x_m
            force_init(cache_dic, current, x)
        else:
            # Reuse the attention output stored on an earlier step.
            x_m = cache_dic['cache'][current['flag']][current['layer']][current['module']]

        # gate (attention)
        x_m_s = gate_msa * x_m
        if x_mask is not None:
            x_m_s_zero = gate_msa_zero * x_m
            x_m_s = self.t_mask_select(x_mask, x_m_s, x_m_s_zero, T, S)

        # residual
        x = x + self.drop_path(x_m_s)

        # ---- cross-attention ----
        current['module'] = 'cross-attn'
        if is_force_fresh[current['module']]:
            (
                cache_dic['cache'][current['flag']][current['layer']][current['module']],
                cache_dic['cross_attn_map'][current['flag']][current['layer']],
            ) = self.cross_attn(x, y, mask)
            force_init(cache_dic, current, x)
        else:
            # Recompute only the tokens selected by cache_cutfresh and write them back.
            fresh_indices, fresh_tokens = cache_cutfresh(cache_dic, x, current)
            fresh_tokens, fresh_cross_attn_map = self.cross_attn(fresh_tokens, y, mask)
            update_cache(
                fresh_indices,
                fresh_tokens=fresh_tokens,
                cache_dic=cache_dic,
                current=current,
                fresh_attn_map=fresh_cross_attn_map,
            )

        x = x + cache_dic['cache'][current['flag']][current['layer']][current['module']]

        # ---- MLP ----
        current['module'] = 'mlp'
        # modulate (MLP)
        x_m = t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)
        if x_mask is not None:
            x_m_zero = t2i_modulate(self.norm2(x), shift_mlp_zero, scale_mlp_zero)
            x_m = self.t_mask_select(x_mask, x_m, x_m_zero, T, S)

        if is_force_fresh[current['module']]:
            x_m = self.mlp(x_m)
            cache_dic['cache'][current['flag']][current['layer']][current['module']] = x_m
            force_init(cache_dic, current, x)
        else:
            fresh_indices, fresh_tokens = cache_cutfresh(cache_dic, x_m, current)
            fresh_tokens = self.mlp(fresh_tokens)
            update_cache(fresh_indices, fresh_tokens=fresh_tokens, cache_dic=cache_dic, current=current)

        # gate (MLP)
        # Both gates must act on the MLP *output*.  On the cached path ``x_m``
        # still holds the MLP input, so always read the output from the cache.
        mlp_out = cache_dic['cache'][current['flag']][current['layer']][current['module']]
        x_m_s = gate_mlp * mlp_out
        if x_mask is not None:
            x_m_s_zero = gate_mlp_zero * mlp_out
            x_m_s = self.t_mask_select(x_mask, x_m_s, x_m_s_zero, T, S)

        # residual
        x = x + self.drop_path(x_m_s)

        return x
def __init__(self, config):
    """Build the STDiT3 model described by ``config``.

    Creates the embedders, ``config.depth`` spatial blocks and the same number
    of temporal blocks, and the final layer; then initialises the weights and
    applies the freezing options from the config.

    Args:
        config: an ``STDiT3Config`` instance.
    """
    super().__init__(config)
    self.pred_sigma = config.pred_sigma
    self.in_channels = config.in_channels
    # Twice the input channels when the model also predicts sigma.
    self.out_channels = config.in_channels * 2 if config.pred_sigma else config.in_channels

    # model size related
    self.depth = config.depth
    self.mlp_ratio = config.mlp_ratio
    self.hidden_size = config.hidden_size
    self.num_heads = config.num_heads

    # computation related
    self.drop_path = config.drop_path
    self.enable_flash_attn = config.enable_flash_attn
    self.enable_layernorm_kernel = config.enable_layernorm_kernel
    self.enable_sequence_parallelism = config.enable_sequence_parallelism

    # input size related
    self.patch_size = config.patch_size
    self.input_sq_size = config.input_sq_size
    self.pos_embed = PositionEmbedding2D(config.hidden_size)
    self.rope = RotaryEmbedding(dim=self.hidden_size // self.num_heads)

    # embedding
    self.x_embedder = PatchEmbed3D(config.patch_size, config.in_channels, config.hidden_size)
    self.t_embedder = TimestepEmbedder(config.hidden_size)
    self.fps_embedder = SizeEmbedder(self.hidden_size)
    self.t_block = nn.Sequential(
        nn.SiLU(),
        nn.Linear(config.hidden_size, 6 * config.hidden_size, bias=True),
    )
    self.y_embedder = CaptionEmbedder(
        in_channels=config.caption_channels,
        hidden_size=config.hidden_size,
        uncond_prob=config.class_dropout_prob,
        act_layer=approx_gelu,
        token_num=config.model_max_length,
    )

    # Drop-path rate rises linearly from 0 to ``self.drop_path`` across the
    # depth; the spatial and temporal stacks use the same schedule.
    drop_rates = torch.linspace(0, self.drop_path, config.depth).tolist()
    shared_block_kwargs = dict(
        hidden_size=config.hidden_size,
        num_heads=config.num_heads,
        mlp_ratio=config.mlp_ratio,
        qk_norm=config.qk_norm,
        enable_flash_attn=config.enable_flash_attn,
        enable_layernorm_kernel=config.enable_layernorm_kernel,
        enable_sequence_parallelism=config.enable_sequence_parallelism,
    )

    # spatial blocks
    self.spatial_blocks = nn.ModuleList(
        [STDiT3Block(drop_path=rate, **shared_block_kwargs) for rate in drop_rates]
    )

    # temporal blocks (attend over frames and use the rotary embedding)
    self.temporal_blocks = nn.ModuleList(
        [
            STDiT3Block(
                drop_path=rate,
                temporal=True,
                rope=self.rope.rotate_queries_or_keys,
                **shared_block_kwargs,
            )
            for rate in drop_rates
        ]
    )

    # final layer
    self.final_layer = T2IFinalLayer(config.hidden_size, np.prod(self.patch_size), self.out_channels)

    self.initialize_weights()

    if config.only_train_temporal:
        # Freeze everything, then re-enable gradients for the temporal blocks only.
        for param in self.parameters():
            param.requires_grad = False
        for block in self.temporal_blocks:
            for param in block.parameters():
                param.requires_grad = True

    if config.freeze_y_embedder:
        for param in self.y_embedder.parameters():
            param.requires_grad = False
def get_dynamic_size(self, x):
    """Return the patch-grid size ``(T, H, W)`` for a 5-D input.

    Each of the last three extents of ``x`` is divided by the matching entry
    of ``self.patch_size`` and rounded up, i.e. the input is treated as if it
    were padded to a whole number of patches.

    Args:
        x (torch.Tensor): tensor with exactly five dimensions; only its size
            is read.

    Return:
        tuple: number of patches along time, height and width.
    """

    def _patch_count(extent, patch):
        # Ceiling division using integer arithmetic only.
        return -(-extent // patch)

    _, _, frames, height, width = x.size()
    patch = self.patch_size
    return (
        _patch_count(frames, patch[0]),
        _patch_count(height, patch[1]),
        _patch_count(width, patch[2]),
    )
round(S**0.5) resolution_sq = (height[0].item() * width[0].item()) ** 0.5 scale = resolution_sq / self.input_sq_size pos_emb = self.pos_embed(x, H, W, scale=scale, base_size=base_size) # === get timestep embed === t = self.t_embedder(timestep, dtype=x.dtype) # [B, C] fps = self.fps_embedder(fps.unsqueeze(1), B) t = t + fps t_mlp = self.t_block(t) t0 = t0_mlp = None if x_mask is not None: t0_timestep = torch.zeros_like(timestep) t0 = self.t_embedder(t0_timestep, dtype=x.dtype) t0 = t0 + fps t0_mlp = self.t_block(t0) # === get y embed === if self.config.skip_y_embedder: y_lens = mask if isinstance(y_lens, torch.Tensor): y_lens = y_lens.long().tolist() else: y, y_lens = self.encode_text(y, mask) # === get x embed === x = self.x_embedder(x) # [B, N, C] x = rearrange(x, "B (T S) C -> B T S C", T=T, S=S) x = x + pos_emb # shard over the sequence dim if sp is enabled if self.enable_sequence_parallelism: x = split_forward_gather_backward(x, get_sequence_parallel_group(), dim=2, grad_scale="down") S = S // dist.get_world_size(get_sequence_parallel_group()) x = rearrange(x, "B T S C -> B (T S) C", T=T, S=S) # === blocks === for i, (spatial_block, temporal_block) in enumerate(zip(self.spatial_blocks, self.temporal_blocks)): current['layer'] = i #x = auto_grad_checkpoint(spatial_block, x, y, t_mlp, current, cache_dic, y_lens, x_mask, t0_mlp, T, S) #x = auto_grad_checkpoint(temporal_block, x, y, t_mlp, current, cache_dic, y_lens, x_mask, t0_mlp, T, S) x = spatial_block(x, y, t_mlp, current, cache_dic, y_lens, x_mask, t0_mlp, T, S) x = temporal_block(x, y, t_mlp, current, cache_dic, y_lens, x_mask, t0_mlp, T, S) if self.enable_sequence_parallelism: x = rearrange(x, "B (T S) C -> B T S C", T=T, S=S) x = gather_forward_split_backward(x, get_sequence_parallel_group(), dim=2, grad_scale="up") S = S * dist.get_world_size(get_sequence_parallel_group()) x = rearrange(x, "B T S C -> B (T S) C", T=T, S=S) # === final layer === x = self.final_layer(x, t, x_mask, t0, T, S) x = 
@MODELS.register_module("STDiT3-XL/2")
def STDiT3_XL_2(from_pretrained=None, **kwargs):
    """Create the STDiT3-XL/2 model (depth 28, hidden size 1152, 16 heads).

    Args:
        from_pretrained: optional checkpoint path or identifier.
        **kwargs: forwarded to ``STDiT3.from_pretrained`` or ``STDiT3Config``.
            The key ``force_huggingface`` is consumed here.

    Returns:
        The constructed ``STDiT3`` model.
    """
    force_huggingface = kwargs.pop("force_huggingface", False)

    # Use ``STDiT3.from_pretrained`` when forced, or when a checkpoint was named
    # but does not exist as a local path.  The path is only probed when not forced.
    load_via_from_pretrained = bool(force_huggingface)
    if not load_via_from_pretrained:
        load_via_from_pretrained = from_pretrained is not None and not os.path.exists(from_pretrained)

    if load_via_from_pretrained:
        return STDiT3.from_pretrained(from_pretrained, **kwargs)

    config = STDiT3Config(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
    model = STDiT3(config)
    if from_pretrained is not None:
        load_checkpoint(model, from_pretrained)
    return model
@MODELS.register_module("classes")
class ClassEncoder:
    """Conditioning encoder that turns class ids into integer tensors.

    Args:
        num_classes: number of classes; also used as the id returned by ``null``.
        model_max_length: stored as-is.
        device: device the produced tensors are moved to.
        dtype: accepted for signature compatibility; not used by this class.
    """

    def __init__(self, num_classes, model_max_length=None, device="cuda", dtype=torch.float):
        self.device = device
        self.num_classes = num_classes
        self.model_max_length = model_max_length
        # Placeholders set to None at construction.
        self.y_embedder = None
        self.output_dim = None

    def encode(self, text):
        """Convert an iterable of class ids (anything ``int()`` accepts) to ``{"y": tensor}``."""
        class_ids = [int(label) for label in text]
        labels = torch.tensor(class_ids).to(self.device)
        return {"y": labels}

    def null(self, n):
        """Return ``n`` copies of ``num_classes`` as a tensor on ``self.device``."""
        null_ids = [self.num_classes] * n
        return torch.tensor(null_ids).to(self.device)
@MODELS.register_module("clip")
class ClipEncoder:
    """
    Embeds text prompts with a frozen CLIP text encoder and exposes the pooled
    embedding as the conditioning tensor.

    Args:
        from_pretrained: path or identifier of the CLIP checkpoint (required).
        model_max_length: maximum token length passed to the tokenizer.
        device: device the encoder and its token ids are placed on.
        dtype: dtype the encoder is cast to.
    """

    def __init__(
        self,
        from_pretrained,
        model_max_length=77,
        device="cuda",
        dtype=torch.float,
    ):
        super().__init__()
        assert from_pretrained is not None, "Please specify the path to the CLIP model"

        # Forward ``device`` so the embedder moves token ids to the same device
        # as its weights; otherwise it keeps its own default of "cuda".
        self.text_encoder = FrozenCLIPEmbedder(
            path=from_pretrained,
            device=device,
            max_length=model_max_length,
        ).to(device, dtype)
        # Not created here; ``null`` requires it to be assigned before use.
        self.y_embedder = None

        self.model_max_length = model_max_length
        self.output_dim = self.text_encoder.transformer.config.hidden_size

    def encode(self, text):
        """Return ``{"y": pooled}`` where two singleton axes are inserted after the batch axis."""
        _, pooled_embeddings = self.text_encoder.encode(text)
        y = pooled_embeddings.unsqueeze(1).unsqueeze(1)
        return dict(y=y)

    def null(self, n):
        """Return ``n`` copies of ``self.y_embedder.y_embedding`` with an extra axis at dim 1."""
        null_y = self.y_embedder.y_embedding[None].repeat(n, 1, 1)[:, None]
        return null_y

    def to(self, dtype):
        """Cast the wrapped text encoder to ``dtype`` and return ``self``."""
        self.text_encoder = self.text_encoder.to(dtype)
        return self
class T5Embedder:
    """Loads a T5 tokenizer and encoder and produces token-level text embeddings.

    Args:
        device: device for the inputs and, by default, the model weights.
        from_pretrained: checkpoint path or identifier for tokenizer and model.
        cache_dir: forwarded to both ``from_pretrained`` calls.
        hf_token: stored on the instance; not otherwise used here.
        use_text_preprocessing: stored on the instance; not otherwise used here.
        t5_model_kwargs: extra keyword arguments for ``T5EncoderModel.from_pretrained``.
            When None, defaults (including a ``device_map``) are built here.
        torch_dtype: model dtype; falls back to ``torch.bfloat16`` when falsy.
        use_offload_folder: if given (and ``t5_model_kwargs`` is None), encoder
            blocks 12-23, the final layer norm and dropout are mapped to disk.
        model_max_length: padding / truncation length used when tokenizing.
        local_files_only: forwarded to both ``from_pretrained`` calls.
    """

    def __init__(
        self,
        device,
        from_pretrained=None,
        *,
        cache_dir=None,
        hf_token=None,
        use_text_preprocessing=True,
        t5_model_kwargs=None,
        torch_dtype=None,
        use_offload_folder=None,
        model_max_length=120,
        local_files_only=False,
    ):
        self.device = torch.device(device)
        self.torch_dtype = torch_dtype or torch.bfloat16
        self.cache_dir = cache_dir

        if t5_model_kwargs is None:
            t5_model_kwargs = {
                "low_cpu_mem_usage": True,
                "torch_dtype": self.torch_dtype,
            }

            if use_offload_folder is not None:
                t5_model_kwargs["offload_folder"] = use_offload_folder
                # First half of the encoder stays on the device, the rest is offloaded.
                on_device = ["shared", "encoder.embed_tokens"]
                on_device += [f"encoder.block.{idx}" for idx in range(12)]
                on_disk = [f"encoder.block.{idx}" for idx in range(12, 24)]
                on_disk += ["encoder.final_layer_norm", "encoder.dropout"]

                device_map = {module_name: self.device for module_name in on_device}
                device_map.update({module_name: "disk" for module_name in on_disk})
                t5_model_kwargs["device_map"] = device_map
            else:
                t5_model_kwargs["device_map"] = {
                    "shared": self.device,
                    "encoder": self.device,
                }

        self.use_text_preprocessing = use_text_preprocessing
        self.hf_token = hf_token

        self.tokenizer = AutoTokenizer.from_pretrained(
            from_pretrained,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
        )
        encoder = T5EncoderModel.from_pretrained(
            from_pretrained,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
            **t5_model_kwargs,
        )
        self.model = encoder.eval()
        self.model_max_length = model_max_length

    def get_text_embeddings(self, texts):
        """Tokenize ``texts`` and return ``(last_hidden_state, attention_mask)``.

        Inputs are padded / truncated to ``self.model_max_length``; the encoder
        runs under ``torch.no_grad`` and its output is detached.
        """
        encoded = self.tokenizer(
            texts,
            max_length=self.model_max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )

        input_ids = encoded["input_ids"].to(self.device)
        attention_mask = encoded["attention_mask"].to(self.device)

        with torch.no_grad():
            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )
            text_encoder_embs = outputs["last_hidden_state"].detach()
        return text_encoder_embs, attention_mask
def clean_caption(caption):
    """Normalise a raw caption: lower-case it and strip URLs, markup and noise.

    The input is converted with ``str()``, URL-unquoted and lower-cased, then
    passed through a fixed sequence of regex substitutions (URLs, HTML, CJK
    ranges, dash / quote normalisation, ids, file names, marketing phrases,
    stray punctuation) and finally stripped.

    Args:
        caption: any object; ``str(caption)`` is cleaned.

    Returns:
        str: the cleaned caption.
    """
    import urllib.parse as ul

    from bs4 import BeautifulSoup

    caption = str(caption)
    caption = ul.unquote_plus(caption)
    caption = caption.strip().lower()
    # Literal "<person>" placeholder -> "person".  (An empty pattern here would
    # insert "person" between every character.)
    caption = re.sub("<person>", "person", caption)
    # urls:
    caption = re.sub(
        r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    caption = re.sub(
        r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    # html:
    caption = BeautifulSoup(caption, features="html.parser").text

    # @mentions
    caption = re.sub(r"@[\w\d]+\b", "", caption)

    # 31C0—31EF CJK Strokes
    # 31F0—31FF Katakana Phonetic Extensions
    # 3200—32FF Enclosed CJK Letters and Months
    # 3300—33FF CJK Compatibility
    # 3400—4DBF CJK Unified Ideographs Extension A
    # 4DC0—4DFF Yijing Hexagram Symbols
    # 4E00—9FFF CJK Unified Ideographs
    caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
    caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
    caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
    caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
    caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
    caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
    caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
    #######################################################

    # all types of dash --> "-"
    caption = re.sub(
        r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
        "-",
        caption,
    )

    # normalise quotation marks to one standard
    caption = re.sub(r"[`´«»“”¨]", '"', caption)
    caption = re.sub(r"[‘’]", "'", caption)

    # leftover HTML entity "&quot;" (with or without the semicolon)
    caption = re.sub(r"&quot;?", "", caption)
    # leftover HTML entity "&amp"
    caption = re.sub(r"&amp", "", caption)

    # ip adresses:
    caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)

    # article ids:
    caption = re.sub(r"\d:\d\d\s+$", "", caption)

    # \n
    caption = re.sub(r"\\n", " ", caption)

    # "#123"
    caption = re.sub(r"#\d{1,3}\b", "", caption)
    # "#12345.."
    caption = re.sub(r"#\d{5,}\b", "", caption)
    # "123456.."
    caption = re.sub(r"\b\d{6,}\b", "", caption)
    # filenames:
    caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)

    #
    caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
    caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""

    caption = re.sub(BAD_PUNCT_REGEX, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
    caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "

    # this-is-my-cute-cat / this_is_my_cute_cat
    regex2 = re.compile(r"(?:\-|\_)")
    if len(re.findall(regex2, caption)) > 3:
        caption = re.sub(regex2, " ", caption)

    caption = basic_clean(caption)

    caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
    caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
    caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231

    caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
    caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
    caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
    caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
    caption = re.sub(r"\bpage\s+\d+\b", "", caption)

    caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...

    caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)

    caption = re.sub(r"\b\s+\:\s+", r": ", caption)
    caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
    caption = re.sub(r"\s+", " ", caption)

    # NOTE: the original had a bare ``caption.strip()`` here whose result was
    # discarded.  It is left out (no behaviour change); the anchored patterns
    # below therefore still see any leading/trailing space, as before.

    caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
    caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
    caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
    caption = re.sub(r"^\.\S+$", "", caption)

    return caption.strip()
class LlavaLlamaPolicy(Policy):
    """Shardformer policy that tensor-parallelises the Llama decoder layers."""

    def config_sanity_check(self):
        """No checks are performed."""
        pass

    def preprocess(self):
        """Return the model unchanged.

        Under tensor parallelism the vocabulary size and tensor-parallel size
        are read but not used; no embedding resize happens here.
        """
        if self.shard_config.enable_tensor_parallelism:
            # Values are only read, in this order; nothing is resized.
            _ = self.model.config.vocab_size
            _ = self.shard_config.tensor_parallel_size
        return self.model

    def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
        """Build the replacement policy; empty unless tensor parallelism is enabled."""
        from transformers.models.llama.modeling_llama import LlamaDecoderLayer

        policy = {}
        if not self.shard_config.enable_tensor_parallelism:
            return policy

        model_config = self.model.config
        # Per-rank attention sizes: divide by the tensor-parallel size.
        attribute_overrides = {
            "self_attn.hidden_size": model_config.hidden_size // self.shard_config.tensor_parallel_size,
            "self_attn.num_heads": model_config.num_attention_heads // self.shard_config.tensor_parallel_size,
        }
        if getattr(model_config, "num_key_value_heads", False):
            attribute_overrides["self_attn.num_key_value_heads"] = (
                model_config.num_key_value_heads // self.shard_config.tensor_parallel_size
            )

        # (sub-module suffix, parallel linear class) in replacement order.
        linear_targets = (
            ("self_attn.q_proj", Linear1D_Col),
            ("self_attn.k_proj", Linear1D_Col),
            ("self_attn.v_proj", Linear1D_Col),
            ("self_attn.o_proj", Linear1D_Row),
            ("mlp.gate_proj", Linear1D_Col),
            ("mlp.up_proj", Linear1D_Col),
            ("mlp.down_proj", Linear1D_Row),
        )
        policy[LlamaDecoderLayer] = ModulePolicyDescription(
            attribute_replacement=attribute_overrides,
            sub_module_replacement=[
                SubModuleReplacementDescription(suffix=suffix, target_module=target)
                for suffix, target in linear_targets
            ],
        )
        return policy

    def postprocess(self):
        """Return the model unchanged."""
        return self.model
world_size self.model.resize_token_embeddings(new_vocab_size) return self.model def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]: from transformers.models.mistral.modeling_mistral import MistralDecoderLayer, MistralModel policy = {} if self.shard_config.enable_sequence_parallelism: self.shard_config.enable_sequence_parallelism = False warnings.warn( "Mistral doesn't support sequence parallelism now, will ignore the sequence parallelism flag." ) if self.shard_config.enable_tensor_parallelism: decoder_attribute_replacement = { "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size, "self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size, "self_attn.num_key_value_heads": self.model.config.num_key_value_heads // self.shard_config.tensor_parallel_size, } policy[MistralDecoderLayer] = ModulePolicyDescription( attribute_replacement=decoder_attribute_replacement, sub_module_replacement=[ SubModuleReplacementDescription( suffix="self_attn.q_proj", target_module=Linear1D_Col, ), SubModuleReplacementDescription( suffix="self_attn.k_proj", target_module=Linear1D_Col, ), SubModuleReplacementDescription( suffix="self_attn.v_proj", target_module=Linear1D_Col, ), SubModuleReplacementDescription( suffix="self_attn.o_proj", target_module=Linear1D_Row, ), SubModuleReplacementDescription( suffix="mlp.gate_proj", target_module=Linear1D_Col, ), SubModuleReplacementDescription( suffix="mlp.up_proj", target_module=Linear1D_Col, ), SubModuleReplacementDescription( suffix="mlp.down_proj", target_module=Linear1D_Row, ), ], ) self.append_or_create_submodule_replacement( description=SubModuleReplacementDescription( suffix="embed_tokens", target_module=VocabParallelEmbedding1D, ), policy=policy, target_key=MistralModel, ) return policy def postprocess(self): return self.model class LlavaMistralForCausalLMPolicy(LlavaMistralPolicy): def module_policy(self): from transformers 
class CameraPredict:
    """Classify camera motion from point tracks predicted on a video.

    A tracking model is loaded through ``torch.hub``. Its tracks on the first
    and last frame are compared along the four edges of a
    ``grid_size x grid_size`` grid, and the per-edge displacement directions
    are mapped to labels such as ``pan_left`` or ``zoom_in``.
    """

    # Direction pair seen on the (top, bottom) edges -> camera motion label.
    _TOP_DOWN_MAPPING = {
        "left_left": "pan_right",
        "right_right": "pan_left",
        "down_down": "tilt_up",
        "up_up": "tilt_down",
        "up_down": "zoom_in",
        "down_up": "zoom_out",
        "static_static": "static",
    }
    # Direction pair seen on the (left, right) edges -> camera motion label.
    _LEFT_RIGHT_MAPPING = {
        "left_left": "pan_right",
        "right_right": "pan_left",
        "down_down": "tilt_up",
        "up_up": "tilt_down",
        "left_right": "zoom_in",
        "right_left": "zoom_out",
        "static_static": "static",
    }

    def __init__(self, device, submodules_list, factor=0.25):
        """Load the tracking model.

        Args:
            device: Device the model and the video tensor are moved to.
            submodules_list: Mapping with the ``"repo"`` and ``"model"`` entries
                passed to ``torch.hub.load``.
            factor: Fraction of the shorter video side a displacement must
                exceed to count as motion.
        """
        self.device = device
        self.grid_size = 10
        self.factor = factor
        try:
            self.model = torch.hub.load(submodules_list["repo"], submodules_list["model"]).to(self.device)
        except Exception:
            # workaround for CERTIFICATE_VERIFY_FAILED (see: https://github.com/pytorch/pytorch/issues/33288#issuecomment-954160699)
            # NOTE(security): this disables HTTPS certificate verification for the
            # whole process, not just for this download.
            import ssl

            ssl._create_default_https_context = ssl._create_unverified_context
            self.model = torch.hub.load(submodules_list["repo"], submodules_list["model"]).to(self.device)

    def infer(self, video_path, save_video=False, save_dir="./saved_videos"):
        """Run the tracker on a video and return the tracks of the first batch item.

        Also stores ``min(height, width)`` of the video in ``self.scale``.

        Args:
            video_path: Path handed to ``load_video``.
            save_video: When true, render the tracks with ``Visualizer``.
            save_dir: Output directory used when ``save_video`` is true.

        Returns:
            Integer numpy array of the predicted tracks for batch item 0.
        """
        # load video
        video = load_video(video_path, return_tensor=False)
        # set scale
        height, width = video.shape[1], video.shape[2]
        self.scale = min(height, width)
        video = torch.from_numpy(video).permute(0, 3, 1, 2)[None].float().to(self.device)  # B T C H W
        pred_tracks, pred_visibility = self.model(video, grid_size=self.grid_size)  # B T N 2,  B T N 1

        if save_video:
            # splitext copes with extensions of any length (the old [:-4] assumed three characters).
            video_name = os.path.splitext(os.path.basename(video_path))[0]
            vis = Visualizer(save_dir=save_dir, pad_value=120, linewidth=3)
            vis.visualize(video, pred_tracks, pred_visibility, filename=video_name)

        return pred_tracks[0].long().detach().cpu().numpy()

    def transform_class(self, vector, min_reso):
        """Turn a mean ``(x, y)`` displacement into direction names.

        A component counts as motion only when its magnitude exceeds
        ``min_reso * self.factor``.

        Returns:
            A list with up to one horizontal (``"right"``/``"left"``) and one
            vertical (``"down"``/``"up"``) entry, or ``["static"]``.
        """
        scale = min_reso * self.factor
        x, y = vector
        direction = []

        if x > scale:
            direction.append("right")
        elif x < -scale:
            direction.append("left")

        if y > scale:
            direction.append("down")
        elif y < -scale:
            direction.append("up")

        return direction if direction else ["static"]

    def get_edge_point(self, track):
        """Pick the four central points on each edge of the track grid.

        Args:
            track: Array indexed as ``track[row, col, :]`` with at least
                ``grid_size`` rows and columns.

        Returns:
            ``(top, down, left, right)``, each a list of four coordinate lists.
        """
        middle = self.grid_size // 2
        last = self.grid_size - 1
        central = range(middle - 2, middle + 2)
        top = [list(track[0, i, :]) for i in central]
        down = [list(track[last, i, :]) for i in central]
        left = [list(track[i, 0, :]) for i in central]
        right = [list(track[i, last, :]) for i in central]
        return top, down, left, right

    def get_edge_direction(self, track1, track2):
        """Return the direction names of each edge's mean displacement from ``track1`` to ``track2``.

        Relies on ``self.scale``, which ``infer`` sets.
        """
        edge_points1 = self.get_edge_point(track1)
        edge_points2 = self.get_edge_point(track2)

        vector_results = []
        for points1, points2 in zip(edge_points1, edge_points2):
            vectors = [[end[0] - start[0], end[1] - start[1]] for start, end in zip(points1, points2)]
            vector_results.append(vectors)
        vector_results = list(map(transform, vector_results))
        class_results = [self.transform_class(vector, min_reso=self.scale) for vector in vector_results]

        return class_results

    @staticmethod
    def _classify_pairs(first, second, mapping):
        """Map every ``"<a>_<b>"`` pair of directions found in ``mapping``; ``["None"]`` if none match."""
        labels = [
            mapping[f"{item_a}_{item_b}"]
            for item_a in first
            for item_b in second
            if f"{item_a}_{item_b}" in mapping
        ]
        return labels if labels else ["None"]

    def classify_top_down(self, top, down):
        """Classify motion from the directions of the top and bottom edges."""
        return CameraPredict._classify_pairs(top, down, CameraPredict._TOP_DOWN_MAPPING)

    def classify_left_right(self, left, right):
        """Classify motion from the directions of the left and right edges."""
        return CameraPredict._classify_pairs(left, right, CameraPredict._LEFT_RIGHT_MAPPING)

    def camera_classify(self, track1, track2):
        """Combine both edge classifications into a list of unique motion labels.

        ``"None"`` and ``"static"`` are dropped when another label is present.
        A lone ``"None"`` becomes ``"Undetermined"``.
        """
        top, down, left, right = self.get_edge_direction(track1, track2)

        top_results = self.classify_top_down(top, down)
        left_results = self.classify_left_right(left, right)

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # label order no longer depends on string hash randomisation.
        results = list(dict.fromkeys(top_results + left_results))
        if "None" in results and len(results) > 1:
            results.remove("None")
        if "static" in results and len(results) > 1:
            results.remove("static")
        if len(results) == 1 and results[0] == "None":  # Tom added this to deal with edge cases
            results = ["Undetermined"]
        return results

    def predict(self, video_path):
        """Classify the camera motion between the first and last frame of a video."""
        pred_track = self.infer(video_path)
        grid_shape = (self.grid_size, self.grid_size, 2)
        track1 = pred_track[0].reshape(grid_shape)
        track2 = pred_track[-1].reshape(grid_shape)
        return self.camera_classify(track1, track2)
def get_frame_indices(num_frames, vlen, sample="rand", fix_start=None, input_fps=1, max_num_frames=-1):
    """Choose which frame indices to read from a video of ``vlen`` frames.

    Args:
        num_frames: Number of indices wanted in the ``"rand"``/``"middle"`` modes.
        vlen: Number of frames in the video.
        sample: ``"rand"`` (one random frame per interval), ``"middle"`` (the
            middle frame of each interval) or ``"fps<rate>"`` such as
            ``"fps0.5"`` (sequential sampling at ``<rate>`` frames per second).
        fix_start: In ``"middle"`` mode, when not ``None``, use each
            interval's start plus this offset instead of its middle.
        input_fps: Frame rate of the video; only used by the ``"fps"`` mode.
        max_num_frames: Upper bound on the result length in ``"fps"`` mode;
            values ``<= 0`` mean no bound.

    Returns:
        A list of frame indices. In the ``"rand"``/``"middle"`` modes a result
        shorter than ``num_frames`` is padded by repeating its last index.

    Raises:
        ValueError: If ``sample`` is not one of the supported modes.
    """
    # Local import: `random` was used here but never imported by this module, so
    # the "rand" branch always raised NameError and fell into the fallback.
    import random

    if sample in ["rand", "middle"]:  # uniform sampling
        acc_samples = min(num_frames, vlen)
        # split the video into `acc_samples` intervals, and sample from each interval.
        intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
        ranges = [(start, end - 1) for start, end in zip(intervals[:-1], intervals[1:])]
        if sample == "rand":
            try:
                frame_indices = [random.choice(range(lo, hi)) for lo, hi in ranges]
            except IndexError:
                # An interval too short to sample from leaves an empty range;
                # fall back to a sorted random subset of all frames.
                frame_indices = np.random.permutation(vlen)[:acc_samples]
                frame_indices.sort()
                frame_indices = list(frame_indices)
        elif fix_start is not None:
            frame_indices = [lo + fix_start for lo, _ in ranges]
        else:  # sample == "middle"
            frame_indices = [(lo + hi) // 2 for lo, hi in ranges]

        if len(frame_indices) < num_frames:  # padded with last frame
            padded_frame_indices = [frame_indices[-1]] * num_frames
            padded_frame_indices[: len(frame_indices)] = frame_indices
            frame_indices = padded_frame_indices
    elif "fps" in sample:  # fps0.5, sequentially sample frames at 0.5 fps
        output_fps = float(sample[3:])
        duration = float(vlen) / input_fps
        delta = 1 / output_fps  # gap between frames, this is also the clip length each frame represents
        frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
        frame_indices = np.around(frame_seconds * input_fps).astype(int)
        frame_indices = [e for e in frame_indices if e < vlen]
        if max_num_frames > 0 and len(frame_indices) > max_num_frames:
            frame_indices = frame_indices[:max_num_frames]
    else:
        raise ValueError
    return frame_indices
- data_transform (callable, optional): A function that applies transformations to the video data. Returns: - frames (torch.Tensor): A tensor containing the video frames with shape (T, C, H, W), where T is the number of frames, C is the number of channels, H is the height, and W is the width. Raises: - NotImplementedError: If the video format is not supported. The function first determines the format of the video file by its extension. For GIFs, it iterates over each frame and converts them to RGB. For PNGs, it reads the single frame, converts it to RGB. For MP4s, it reads the frames using the VideoReader class and converts them to NumPy arrays. If a data_transform is provided, it is applied to the buffer before converting it to a tensor. Finally, the tensor is permuted to match the expected (T, C, H, W) format. """ if video_path.endswith(".gif"): frame_ls = [] img = Image.open(video_path) for frame in ImageSequence.Iterator(img): frame = frame.convert("RGB") frame = np.array(frame).astype(np.uint8) frame_ls.append(frame) buffer = np.array(frame_ls).astype(np.uint8) elif video_path.endswith(".png"): frame = Image.open(video_path) frame = frame.convert("RGB") frame = np.array(frame).astype(np.uint8) frame_ls = [frame] buffer = np.array(frame_ls) elif video_path.endswith(".mp4"): import decord decord.bridge.set_bridge("native") if width: video_reader = VideoReader(video_path, width=width, height=height, num_threads=1) else: video_reader = VideoReader(video_path, num_threads=1) frames = video_reader.get_batch(range(len(video_reader))) # (T, H, W, C), torch.uint8 buffer = frames.asnumpy().astype(np.uint8) else: raise NotImplementedError frames = buffer if num_frames: frame_indices = get_frame_indices(num_frames, len(frames), sample="middle") frames = frames[frame_indices] if data_transform: frames = data_transform(frames) elif return_tensor: frames = torch.Tensor(frames) frames = frames.permute(0, 3, 1, 2) # (T, C, H, W), torch.uint8 return frames 
def add_weighted(rgb, alpha, original, beta, gamma):
    """Blend two images as ``rgb * alpha + original * beta + gamma`` and cast the result to uint8."""
    blended = rgb * alpha
    blended = blended + original * beta
    blended = blended + gamma
    return blended.astype("uint8")
cm.get_cmap(mode) self.show_first_frame = show_first_frame self.grayscale = grayscale self.tracks_leave_trace = tracks_leave_trace self.pad_value = pad_value self.linewidth = linewidth self.fps = fps def visualize( self, video: torch.Tensor, # (B,T,C,H,W) tracks: torch.Tensor, # (B,T,N,2) visibility: torch.Tensor = None, # (B, T, N, 1) bool gt_tracks: torch.Tensor = None, # (B,T,N,2) segm_mask: torch.Tensor = None, # (B,1,H,W) filename: str = "video", writer=None, # tensorboard Summary Writer, used for visualization during training step: int = 0, query_frame: int = 0, save_video: bool = True, compensate_for_camera_motion: bool = False, ): if compensate_for_camera_motion: assert segm_mask is not None if segm_mask is not None: coords = tracks[0, query_frame].round().long() segm_mask = segm_mask[0, query_frame][coords[:, 1], coords[:, 0]].long() video = F.pad( video, (self.pad_value, self.pad_value, self.pad_value, self.pad_value), "constant", 255, ) print("video shape after pad is: ", video.shape) tracks = tracks + self.pad_value print(tracks) print("tracks shape after pad is: ", tracks.shape) if self.grayscale: transform = transforms.Grayscale() video = transform(video) video = video.repeat(1, 1, 3, 1, 1) res_video = self.draw_tracks_on_video( video=video, tracks=tracks, visibility=visibility, segm_mask=segm_mask, gt_tracks=gt_tracks, query_frame=query_frame, compensate_for_camera_motion=compensate_for_camera_motion, ) if save_video: self.save_video(res_video, filename=filename, writer=writer, step=step) return res_video def save_video(self, video, filename, writer=None, step=0): if writer is not None: writer.add_video( filename, video.to(torch.uint8), global_step=step, fps=self.fps, ) else: os.makedirs(self.save_dir, exist_ok=True) wide_list = list(video.unbind(1)) wide_list = [wide[0].permute(1, 2, 0).cpu().numpy() for wide in wide_list] # Prepare the video file path save_path = os.path.join(self.save_dir, f"{filename}.mp4") # Create a writer object video_writer = 
imageio.get_writer(save_path, fps=self.fps) # Write frames to the video file for frame in wide_list[2:-1]: video_writer.append_data(frame) video_writer.close() print(f"Video saved to {save_path}") def draw_tracks_on_video( self, video: torch.Tensor, tracks: torch.Tensor, visibility: torch.Tensor = None, segm_mask: torch.Tensor = None, gt_tracks=None, query_frame: int = 0, compensate_for_camera_motion=False, ): B, T, C, H, W = video.shape _, _, N, D = tracks.shape assert D == 2 assert C == 3 video = video[0].permute(0, 2, 3, 1).byte().detach().cpu().numpy() # S, H, W, C tracks = tracks[0].long().detach().cpu().numpy() # S, N, 2 if gt_tracks is not None: gt_tracks = gt_tracks[0].detach().cpu().numpy() res_video = [] # process input video for rgb in video: res_video.append(rgb.copy()) vector_colors = np.zeros((T, N, 3)) if self.mode == "optical_flow": import flow_vis vector_colors = flow_vis.flow_to_color(tracks - tracks[query_frame][None]) elif segm_mask is None: if self.mode == "rainbow": y_min, y_max = ( tracks[query_frame, :, 1].min(), tracks[query_frame, :, 1].max(), ) norm = plt.Normalize(y_min, y_max) for n in range(N): color = self.color_map(norm(tracks[query_frame, n, 1])) color = np.array(color[:3])[None] * 255 vector_colors[:, n] = np.repeat(color, T, axis=0) else: # color changes with time for t in range(T): color = np.array(self.color_map(t / T)[:3])[None] * 255 vector_colors[t] = np.repeat(color, N, axis=0) else: if self.mode == "rainbow": vector_colors[:, segm_mask <= 0, :] = 255 y_min, y_max = ( tracks[0, segm_mask > 0, 1].min(), tracks[0, segm_mask > 0, 1].max(), ) norm = plt.Normalize(y_min, y_max) for n in range(N): if segm_mask[n] > 0: color = self.color_map(norm(tracks[0, n, 1])) color = np.array(color[:3])[None] * 255 vector_colors[:, n] = np.repeat(color, T, axis=0) else: # color changes with segm class segm_mask = segm_mask.cpu() color = np.zeros((segm_mask.shape[0], 3), dtype=np.float32) color[segm_mask > 0] = np.array(self.color_map(1.0)[:3]) 
* 255.0 color[segm_mask <= 0] = np.array(self.color_map(0.0)[:3]) * 255.0 vector_colors = np.repeat(color[None], T, axis=0) # draw tracks if self.tracks_leave_trace != 0: for t in range(query_frame + 1, T): first_ind = max(0, t - self.tracks_leave_trace) if self.tracks_leave_trace >= 0 else 0 curr_tracks = tracks[first_ind : t + 1] curr_colors = vector_colors[first_ind : t + 1] if compensate_for_camera_motion: diff = (tracks[first_ind : t + 1, segm_mask <= 0] - tracks[t : t + 1, segm_mask <= 0]).mean(1)[ :, None ] curr_tracks = curr_tracks - diff curr_tracks = curr_tracks[:, segm_mask > 0] curr_colors = curr_colors[:, segm_mask > 0] res_video[t] = self._draw_pred_tracks( res_video[t], curr_tracks, curr_colors, ) if gt_tracks is not None: res_video[t] = self._draw_gt_tracks(res_video[t], gt_tracks[first_ind : t + 1]) # draw points for t in range(query_frame, T): img = Image.fromarray(np.uint8(res_video[t])) for i in range(N): coord = (tracks[t, i, 0], tracks[t, i, 1]) visibile = True if visibility is not None: visibile = visibility[0, t, i] if coord[0] != 0 and coord[1] != 0: if not compensate_for_camera_motion or (compensate_for_camera_motion and segm_mask[i] > 0): img = draw_circle( img, coord=coord, radius=int(self.linewidth * 2), color=vector_colors[t, i].astype(int), visible=visibile, ) res_video[t] = np.array(img) # construct the final rgb sequence if self.show_first_frame > 0: res_video = [res_video[0]] * self.show_first_frame + res_video[1:] return torch.from_numpy(np.stack(res_video)).permute(0, 3, 1, 2)[None].byte() def _draw_pred_tracks( self, rgb: np.ndarray, # H x W x 3 tracks: np.ndarray, # T x 2 vector_colors: np.ndarray, alpha: float = 0.5, ): T, N, _ = tracks.shape rgb = Image.fromarray(np.uint8(rgb)) for s in range(T - 1): vector_color = vector_colors[s] original = rgb.copy() alpha = (s / T) ** 2 for i in range(N): coord_y = (int(tracks[s, i, 0]), int(tracks[s, i, 1])) coord_x = (int(tracks[s + 1, i, 0]), int(tracks[s + 1, i, 1])) if coord_y[0] != 
0 and coord_y[1] != 0: rgb = draw_line( rgb, coord_y, coord_x, vector_color[i].astype(int), self.linewidth, ) if self.tracks_leave_trace > 0: rgb = Image.fromarray(np.uint8(add_weighted(np.array(rgb), alpha, np.array(original), 1 - alpha, 0))) rgb = np.array(rgb) return rgb def _draw_gt_tracks( self, rgb: np.ndarray, # H x W x 3, gt_tracks: np.ndarray, # T x 2 ): T, N, _ = gt_tracks.shape color = np.array((211, 0, 0)) rgb = Image.fromarray(np.uint8(rgb)) for t in range(T): for i in range(N): gt_tracks = gt_tracks[t][i] # draw a red cross if gt_tracks[0] > 0 and gt_tracks[1] > 0: length = self.linewidth * 3 coord_y = (int(gt_tracks[0]) + length, int(gt_tracks[1]) + length) coord_x = (int(gt_tracks[0]) - length, int(gt_tracks[1]) - length) rgb = draw_line( rgb, coord_y, coord_x, color, self.linewidth, ) coord_y = (int(gt_tracks[0]) - length, int(gt_tracks[1]) + length) coord_x = (int(gt_tracks[0]) + length, int(gt_tracks[1]) - length) rgb = draw_line( rgb, coord_y, coord_x, color, self.linewidth, ) rgb = np.array(rgb) return rgb ================================================ FILE: Open-Sora/build/lib/tools/caption/camera_motion_detect.py ================================================ # ref: https://github.com/antiboredom/camera-motion-detector import argparse import cv2 import numpy as np import pandas as pd from tqdm import tqdm tqdm.pandas() def apply(df, func, **kwargs): if pandas_has_parallel: return df.parallel_apply(func, **kwargs) return df.progress_apply(func, **kwargs) try: from pandarallel import pandarallel pandarallel.initialize(progress_bar=True) pandas_has_parallel = True except ImportError: pandas_has_parallel = False def make_empty(new_w, new_h): empty = [] for y in range(new_h): xvals = [] for x in range(new_w): xvals.append([x, y]) empty.append(xvals) empty = np.array(empty) return empty def get_type(mag, ang, zoom_in, tau_static=1.0, tau_zoom=(0.4, 0.6)): if mag < tau_static: return "static" if zoom_in < tau_zoom[0]: return "zoom out" if 
def get_video_type(frame_types):
    """Reduce per-frame motion labels to one label for the whole video.

    A label held by a strict majority of frames is returned as is. Otherwise
    the result is ``"unknown"`` if any frame was ``"static"``, ``"dynamic"``
    if any frame was a zoom, and ``"pan/tilt"`` in the remaining cases.
    """
    half = len(frame_types) / 2

    # count the number of each type
    tally = {}
    for label in frame_types:
        tally[label] = tally.get(label, 0) + 1

    if tally:
        dominant = max(tally, key=tally.get)
        # A strict majority is unique, so tie-breaking in max() cannot matter here.
        if tally[dominant] > half:
            return dominant

    if "static" in tally:
        return "unknown"
    if "zoom in" in tally or "zoom out" in tally:
        return "dynamic"
    return "pan/tilt"
def get_caption(frame, prompt, api_key):
    """Ask the OpenAI chat-completions endpoint to caption three frames.

    Args:
        frame: Sequence whose first three items are base64-encoded JPEG strings.
        prompt: Text instruction sent together with the images.
        api_key: Bearer token for the API.

    Returns:
        The caption text with newlines replaced by spaces.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
    """
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
    image_entries = [
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame[index]}"}}
        for index in range(3)
    ]
    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt,
                    },
                    *image_entries,
                ],
            }
        ],
        "max_tokens": 300,
    }
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload, timeout=60)
    # Fail with the HTTP status (bad key, rate limit, ...) instead of an opaque
    # KeyError on "choices" when the API returns an error body.
    response.raise_for_status()
    caption = response.json()["choices"][0]["message"]["content"]
    return caption.replace("\n", " ")
class CSVTextDataset(Dataset):
    """Dataset over the rows of a CSV file that must contain a ``text`` column.

    ``set_rank_and_world_size`` narrows the dataset to the contiguous slice of
    rows owned by one rank.
    """

    def __init__(self, csv_path):
        """Read ``csv_path`` into a DataFrame and check that it has a ``text`` column."""
        self.df = pd.read_csv(csv_path)
        # assert text is in the columns
        assert "text" in self.df.columns, "text column not found in the csv file"
        # Until set_rank_and_world_size() is called this instance owns every row,
        # so write_to_csv() also works on an unsharded dataset.
        self.start_index = 0
        self.end_index = len(self.df)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return row ``idx`` (positional) as a Series; raise IndexError when out of range."""
        if idx < 0 or idx >= len(self.df):
            raise IndexError
        return self.df.iloc[idx]

    def set_rank_and_world_size(self, rank, world_size):
        """Keep only the rows belonging to ``rank`` out of ``world_size`` ranks.

        Rows are split into equal contiguous chunks; the last rank also takes
        the remainder.
        """
        self.rank = rank
        self.world_size = world_size
        self.data_per_gpu = len(self) // world_size
        self.start_index = rank * self.data_per_gpu
        self.end_index = (rank + 1) * self.data_per_gpu if rank != world_size - 1 else len(self)
        self.df = self.df.iloc[self.start_index : self.end_index]

    def write_to_csv(self, output_file, data, new_key):
        """Write this rank's rows to ``output_file`` with ``data`` appended as column ``new_key``.

        Args:
            output_file: Destination CSV path (overwritten).
            data: Sequence with one value per row of this rank, in row order.
            new_key: Header of the appended column.
        """
        # `Index + list` is element-wise string addition in pandas, so build the
        # header from a plain list instead.
        columns = list(self.df.columns) + [new_key]
        # csv.writer objects have no close(); the file handle is what must be
        # closed. newline="" is what the csv module documents for writing.
        with open(output_file, "w", newline="") as handle:
            writer = csv.writer(handle)
            writer.writerow(columns)
            for index, row in self.df.iterrows():
                # Row labels keep their position in the full file, so they are
                # shifted by start_index to address this rank's `data`.
                if index < self.start_index or index >= self.end_index:
                    continue
                writer.writerow([*row, data[index - self.start_index]])
init environment # ====================================================== dist.init_process_group(backend="nccl", timeout=timedelta(hours=24)) torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count()) # ====================================================== # 2. Prep rank-wise dataloader # ====================================================== dataframe = read_file(args.input) print("read data from {}".format(args.input)) dataset = CSVTextDataset(args.input) dataset.set_rank_and_world_size(dist.get_rank(), dist.get_world_size()) import os if os.getenv("DEBUG_ADDRESS") != None and dist.get_rank() == 2: import ptvsd print("waiting for debugger attachment") ptvsd.enable_attach(address=("localhost", int(os.getenv("DEBUG_ADDRESS"))), redirect_output=True) ptvsd.wait_for_attach() output_file = args.output_prefix + f"_rank{dist.get_rank()}" + f"_{args.key}.csv" output_file_handle = open(output_file, "w") writer = csv.writer(output_file_handle) columns = list(dataframe.columns) + [args.key] writer.writerow(columns) # add a new key named summary, write in csv file print("the processed data saved on this rank will be saved to {}".format(output_file)) def collate_fn(batch): return batch dataloader = torch.utils.data.DataLoader( dataset, # num_workers=2, batch_size=args.batch_size, collate_fn=collate_fn, shuffle=False, ) # ====================================================== # 2. 
process using llama3 and prompt # ====================================================== print("Using model with the id {}".format(args.model_id)) model_id = args.model_id tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left") model = AutoModelForCausalLM.from_pretrained( model_id, torch_dtype=torch.bfloat16, device_map=dist.get_rank() % torch.cuda.device_count(), ) # .to(dist.get_rank() % torch.cuda.device_count()) dist.barrier() print("======== Process data using LLAMA3 ========") def extract_batch(texts, prompt): input_ids_list = [ tokenizer.apply_chat_template( [{"role": "system", "content": prompt}, {"role": "user", "content": text}], add_generation_prompt=True, return_tensors="pt", ).to(model.device)[0] for text in texts ] attention_mask_list = [ torch.ones(input_ids.shape, dtype=torch.long, device=model.device) for input_ids in input_ids_list ] # input_ids_batch = pad_left( # input_ids_list, padding_value=tokenizer.eos_token_id # ) input_ids_batch = torch.nn.utils.rnn.pad_sequence( input_ids_list, batch_first=True, padding_value=tokenizer.eos_token_id ) attention_mask_batch = torch.nn.utils.rnn.pad_sequence(attention_mask_list, batch_first=True, padding_value=0) # attention_mask_batch = pad_left( # attention_mask_list, padding_value=0 # ) terminators = [ tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>"), ] outputs = model.generate( input_ids_batch, max_new_tokens=512, attention_mask=attention_mask_batch, pad_token_id=tokenizer.eos_token_id, eos_token_id=terminators, # do_sample=True, # temperature=0.6, # top_p=0.9, ) responses = [] for i in range(len(texts)): response = outputs[i][input_ids_list[i].shape[-1] :] response = tokenizer.decode(response, skip_special_tokens=True) responses.append(response) return responses print("Processing starting...") if args.prompt == "" and args.key == "objects": prompt = ( "You are a AI assistant to extract objects from user's text. 
" "For example: user: 'In this video a dog is running around. In addition, a person is laughing at the dog.', you produce a list of objects separated by ',' and wrapped by '[' and ']': '[dog, person]' " ) elif args.prompt == "" and args.key == "actions": prompt = ( "You are a AI assistant to extract actions from user's text. " "For example: user: 'In this video a dog is running around. In addition, a person is laughing at the dog.', you produce a list of actions separated by ',' and wrapped by '[' and ']': '[run, laugh]' " ) else: prompt = args.prompt print("Prompt: {}".format(prompt)) args.batch_size # for i in tqdm(range(0, len(dataframe), batch_size)): for _, batch in enumerate(tqdm(dataloader)): # get the text column from the batch texts = [batch[i]["text"] for i in range(len(batch))] list_keywords = extract_batch(texts, prompt) for idx, keywords in enumerate(list_keywords): try: keywords_start = keywords.find("[") keywords_end = keywords.find("]") keywords = keywords[keywords_start + 1 : keywords_end] if ( "\n" in keywords or len(keywords.strip()) == 0 ): # we empirically observe that it produces newlines when no keywords are found keywords = "NONE_FOUND" except: keywords = "NONE_FOUND" row = batch[idx] writer.writerow([*row, keywords]) output_file_handle.close() dist.barrier() if dist.get_rank() == 0: collated_file = args.output_prefix + f"_{args.key}.csv" print("All ranks are finished. 
Collating the processed data to {}".format(collated_file)) import pandas as pd csv_files = [args.output_prefix + f"_rank{i}" + f"_{args.key}.csv" for i in range(dist.get_world_size())] # List to hold DataFrames dataframes = [] # Read each CSV into a DataFrame and append to list for file in csv_files: df = pd.read_csv(file) # scan each line in the df, if the ``key`` column is NaN, replace it with "NONE_FOUND" df[args.key] = df[args.key].fillna("NONE_FOUND") dataframes.append(df) # Concatenate all DataFrames combined_df = pd.concat(dataframes, ignore_index=True) # Save the combined DataFrame to a new CSV file combined_df.to_csv(collated_file, index=False) print("Collated data saved to {}".format(collated_file)) # terminate distributed env dist.destroy_process_group() if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--model-id", default="meta-llama/Meta-Llama-3-8B-Instruct") parser.add_argument("input", type=str, help="Path to the input CSV file") parser.add_argument("--output_prefix", type=str, help="Path to the output CSV file") parser.add_argument("--prompt", type=str, default="") parser.add_argument("--batch_size", type=int, default=32) parser.add_argument("--key", type=str) args = parser.parse_args() main(args) ================================================ FILE: Open-Sora/build/lib/tools/caption/caption_llava.py ================================================ import argparse import csv import time import warnings from datetime import timedelta import torch import torch.distributed as dist from colossalai.cluster import DistCoordinator, ProcessGroupMesh from colossalai.shardformer import ShardConfig, ShardFormer from colossalai.utils import get_current_device, set_seed from llava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX from llava.conversation import conv_templates from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token from llava.model.builder import load_pretrained_model from 
class NoPaddingDistributedSampler(DistributedSampler):
    """Distributed sampler that shards a dataset without padding or dropping.

    Each rank receives the strided slice ``indices[rank::num_replicas]`` of
    the dataset, so when the dataset size is not divisible by
    ``num_replicas`` the first ``len(dataset) % num_replicas`` ranks hold one
    more sample than the others and no index is ever duplicated.

    Note:
        ``shuffle`` and ``drop_last`` are accepted for signature
        compatibility, but the parent sampler is always constructed with
        ``shuffle=False`` and ``drop_last=False``.
    """

    def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True, seed=0, drop_last=False):
        super().__init__(
            dataset=dataset,
            num_replicas=num_replicas,
            rank=rank,
            seed=seed,
            shuffle=False,
            drop_last=False,
        )
        dataset_len = len(self.dataset)
        per_rank, leftover = divmod(dataset_len, self.num_replicas)
        # The leftover items go to the lowest ranks, one apiece.
        self.num_samples = per_rank + 1 if self.rank < leftover else per_rank
        # Unlike the parent class, the total is the true dataset length.
        self.total_size = dataset_len

    def __iter__(self):
        """Return an iterator over the dataset indices owned by this rank."""
        dataset_len = len(self.dataset)
        if not self.shuffle:
            order = list(range(dataset_len))  # type: ignore[arg-type]
        else:
            # deterministic permutation derived from seed and epoch
            generator = torch.Generator()
            generator.manual_seed(self.seed + self.epoch)
            order = torch.randperm(dataset_len, generator=generator).tolist()  # type: ignore[arg-type]

        # total_size equals the dataset length, so nothing is cut off here.
        order = order[: self.total_size]
        shard = order[self.rank : self.total_size : self.num_replicas]
        assert len(shard) == self.num_samples
        return iter(shard)
init environment # ====================================================== # we set a very large timeout to avoid some processes exit early dist.init_process_group(backend="nccl", timeout=timedelta(hours=24)) torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count()) set_seed(1024) coordinator = DistCoordinator() # prepare the dp and tp groups assert ( args.dp_size * args.tp_size == coordinator.world_size ), f"DP size {args.dp_size} * TP size {args.tp_size} must equal to world size {coordinator.world_size}" mesh = ProcessGroupMesh(args.dp_size, args.tp_size) dp_group = mesh.get_group_along_axis(0) tp_group = mesh.get_group_along_axis(1) # ====================================================== # 2. load model # ====================================================== model_path = args.model_path with warnings.catch_warnings(): warnings.simplefilter("ignore") # Pytorch non-meta copying warning fills out the console tokenizer, model, image_processor, context_len = load_pretrained_model( model_path=model_path, model_base=None, model_name=get_model_name_from_path(model_path), device=get_current_device(), torch_dtype=torch.float16, attn_implementation="flash_attention_2" if args.flash_attention else "eager", ) dist.barrier() # ====================================================== # 3. 
Apply system optimization # ====================================================== tp_size = dist.get_world_size(tp_group) shard_config = ShardConfig( tensor_parallel_process_group=tp_group if tp_size > 1 else None, enable_tensor_parallelism=True if tp_size > 1 else False, ) shard_former = ShardFormer(shard_config=shard_config) # check the model type model_name = model.__class__.__name__ print(model_name) if model_name == "LlavaLlamaForCausalLM": model = shard_former.optimize(model, policy=LlavaLlamaForCausalLMPolicy())[0].cuda() elif model_name == "LlavaMistralForCausalLM": model = shard_former.optimize(model, policy=LlavaMistralForCausalLMPolicy())[0].cuda() else: print(f"The shardformer policy for {model_name} is not implemented, skip") torch.cuda.empty_cache() # ====================================================== # 4. Prepare dataloader # ====================================================== # prepare prompt query = PROMPTS[args.prompt]["text"] if dist.get_rank() == 0: print(f"Prompt: {query}") if "text" in args.prompt: def get_text_input_ids(text): conv = conv_templates["chatml_direct"].copy() query_text = query.format(text) conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\n" + query_text) prompt = conv.get_prompt() # add num_frames images t = prompt.split("") prompt = t[0] + "" * args.num_frames + t[1] input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt") input_ids = input_ids.unsqueeze(0) return input_ids else: conv = conv_templates["chatml_direct"].copy() conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\n" + query) prompt = conv.get_prompt() # add num_frames images t = prompt.split("") prompt = t[0] + "" * args.num_frames + t[1] input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt") input_ids = input_ids.unsqueeze(0) def get_text_input_ids(*args): return input_ids # build dataset def transform(imgs): imgs = process_images(imgs, image_processor, model.config) 
imgs = imgs.to(dtype=torch.float16) return imgs dataset = VideoTextDataset( args.input, transform=transform, num_frames=args.num_frames, get_text_input_ids=get_text_input_ids, resize=args.resize, ) # make sure that the prompt type matches the data type data_extension = "." + dataset.data["path"].iloc[0].split(".")[-1] prompt_type = PROMPTS[args.prompt]["type"] if prompt_type == "image": assert ( data_extension.lower() in IMG_EXTENSIONS ), f"The prompt is suitable for an image dataset but the data is not image. The first data is of format {data_extension}" elif prompt_type == "video": assert ( data_extension.lower() in VID_EXTENSIONS ), f"The prompt is suitable for a video dataset but the data is not video. The first data is of format {data_extension}" else: raise ValueError(f"Found invalid prompt type {prompt_type}") total_num_videos = len(dataset) # build sampler dp_rank = dist.get_rank(dp_group) dp_size = dist.get_world_size(dp_group) sampler = NoPaddingDistributedSampler(dataset, rank=dp_rank, num_replicas=dp_size) # build dataloader dataloader = torch.utils.data.DataLoader( dataset, batch_size=args.bs, shuffle=False, num_workers=args.num_workers, pin_memory=True, prefetch_factor=args.prefetch_factor, sampler=sampler, collate_fn=collate_fn, ) # prepare output file reader output_file = args.input.replace(".csv", "_caption.csv") # create csv writer has_dp_writter = dist.get_rank(tp_group) == 0 if has_dp_writter: # the dp writer takes care of the files processed on the current dp rank # so we use write mode output_file_split = output_file.replace(".csv", f"_part{dp_rank}.csv") dp_file = open(output_file_split, "w") dp_writer = csv.writer(dp_file) dp_writer.writerow(["path", "text", "num_frames"]) # ====================================================== # 5. 
generate captions # ====================================================== if dist.get_rank(tp_group) == 0: pbar = tqdm(dataloader, position=dp_rank, desc=f"Data Parallel Rank {dist.get_rank(dp_group)}") else: pbar = dataloader if args.profile: encode_time = [] generate_time = [] output_length = [] total_time = [] for i, batch in enumerate(pbar): # measure time if args.profile: torch.cuda.synchronize() start_time = time.time() video_files, frames, video_lengths, img_size_list, texts = batch # encode the batch of inputs with Timer() as encode_timer: samples = [] for imgs, imgs_size, input_ids in zip(frames, img_size_list, texts): imgs = imgs.cuda() input_ids = input_ids.cuda() _, _, _, _, inputs_embeds, _ = model.prepare_inputs_labels_for_multimodal( input_ids, None, None, None, None, images=imgs, image_sizes=imgs_size ) samples.append(inputs_embeds) # padding max_len = max([sample.shape[1] for sample in samples]) attention_mask = torch.tensor( [[0] * (max_len - samples[i].shape[1]) + [1] * samples[i].shape[1] for i in range(len(samples))] ).to(model.device) inputs_embeds = [ torch.cat( [ torch.zeros( (1, max_len - samples[i].shape[1], samples[i].shape[-1]), device=model.device, dtype=torch.float16, ), samples[i], ], dim=1, ) for i in range(len(samples)) ] inputs_embeds = torch.cat(inputs_embeds, dim=0) # generate outputs with Timer() as generate_timer: output_ids = super(type(model), model).generate( inputs_embeds=inputs_embeds, attention_mask=attention_mask, do_sample=False, # sampling is not deterministic and may cause TP to hang max_new_tokens=args.max_tokens, use_cache=True, ) # skip warmup and add profiling data if args.profile and i >= args.profile_warmup: output_length.append(output_ids.size(0) * output_ids.size(1)) outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True) outputs = [output.replace("\n", " ").strip() for output in outputs] # skip warmup and add profiling data if args.profile and i >= args.profile_warmup: # measure time 
torch.cuda.synchronize() time_taken = time.time() - start_time total_time.append(time_taken) encode_time.append(encode_timer.time_taken) generate_time.append(generate_timer.time_taken) # save results if has_dp_writter: result = list(zip(video_files, outputs, video_lengths)) for t in result: dp_writer.writerow(t) # display profiling info if args.profile: print(output_length) num_samples_after_warmup = total_num_videos - args.bs * args.profile_warmup * dp_size print(f"throughput (samples/s): {num_samples_after_warmup / sum(total_time)}") print(f"average encode time per sample: {sum(encode_time) / num_samples_after_warmup}") print(f"average generate time per sample: {sum(generate_time) / num_samples_after_warmup}") print(f"average number of tokens characters per sample: {sum(output_length) / num_samples_after_warmup}") print(f"Max GPU allocated / GB: {torch.cuda.max_memory_allocated() / 1024**3}") print(f"Max GPU reserved / GB: {torch.cuda.max_memory_reserved() / 1024**3}") # ====================================================== # 6. 
shutdown # ====================================================== # close file writing if has_dp_writter: dp_file.close() dist.barrier() # terminate distributed env dist.destroy_process_group() if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("input", type=str, help="Path to the input CSV file") parser.add_argument("--model-path", type=str, default="liuhaotian/llava-v1.6-34b") parser.add_argument("--prompt", type=str, default="video-f1-detail-3ex") parser.add_argument("--resize", type=int, default=336) parser.add_argument("--num-frames", type=int, default=1) parser.add_argument("--max-tokens", type=int, default=300) # speed related parser.add_argument("--bs", type=int, default=16) parser.add_argument("--tp-size", type=int, default=2) parser.add_argument("--dp-size", type=int, default=4) parser.add_argument("--num-workers", type=int, default=8) parser.add_argument("--prefetch-factor", type=int, default=8, help="Prefetch factor") parser.add_argument( "--flash-attention", action="store_true", help="Whether to use flash attention. You can turn on this flag for llama model and off for mistral model.", ) # debug related parser.add_argument("--profile", action="store_true") parser.add_argument("--profile-warmup", type=int, default=1) args = parser.parse_args() main(args) ================================================ FILE: Open-Sora/build/lib/tools/caption/utils.py ================================================ import time import pandas as pd import torch import torchvision.transforms as transforms from torchvision.datasets.folder import pil_loader from tools.datasets.utils import extract_frames, is_video IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp") VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv") PROMPTS = { "image": { "text": "Describe this image and its style to generate a succinct yet informative description. Pay attention to all objects in the image. 
The description should be useful for AI to re-generate the image. The description should be no more than five sentences. Remember do not exceed 5 sentences.", "type": "image", }, "image-text": { "text": "Describe this image and its style in a very detailed manner. Pay attention to all objects in the image. The description should be useful for AI to re-generate the image. The description should be no more than six sentences. Some information about the image is '{}'.", "type": "image", }, "image-3ex": { "text": "An image is given. Describe this image and its style to generate a succinct yet informative description. Pay attention to all objects in the image. The description should be useful for AI to re-generate the video. The description should be no more than five sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick and walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field. 3. Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. 
The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.", "type": "image", }, "video": { "text": "Describe this video and its style in a very detailed manner. Pay attention to all objects in the video. The description should be useful for AI to re-generate the video. The description should be no more than six sentences.", "type": "video", }, "video-text": { "text": "Describe this video and its style in a very detailed manner. Some information about the image is '{}'. Pay attention to all objects in the video. The description should be useful for AI to re-generate the video. The description should be no more than six sentences.", "type": "video", }, "video-f1-detail-3ex": { "text": "A video is given by providing the middle frame. Describe this video and its style to generate a description. Pay attention to all objects in the video. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field. 3. 
Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.", "type": "video", }, "video-f1-detail-2ex-text": { "text": "A video is given by providing the middle frame. Some information about the image is '{}'. Describe this video and its style to generate a description. Pay attention to all objects in the video. Do not describe each frame individually. Do not reply with words like 'first frame'. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field.", "type": "video", }, "video-f3-detail-3ex": { "text": "A video is given by providing three frames in chronological order. 
Describe this video and its style to generate a description. Pay attention to all objects in the video. Do not describe each frame individually. Do not reply with words like 'first frame'. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field. 3. Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.", "type": "video", }, "video-f3-detail-2ex-text": { "text": "A video is given by providing three frames in chronological order. Some information about the image is '{}'. Describe this video and its style to generate a description. Pay attention to all objects in the video. 
def read_file(input_path):
    """Load a tabular dataset from ``input_path`` into a pandas DataFrame.

    The format is chosen from the file extension, matched
    case-insensitively so that e.g. ``DATA.CSV`` is accepted as well.

    Args:
        input_path: Path string ending in ``.csv`` or ``.parquet``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` or ``pd.read_parquet``.

    Raises:
        NotImplementedError: If the extension is neither CSV nor Parquet.
    """
    lowered = input_path.lower()
    if lowered.endswith(".csv"):
        return pd.read_csv(input_path)
    if lowered.endswith(".parquet"):
        return pd.read_parquet(input_path)
    raise NotImplementedError(f"Unsupported file format: {input_path}")
def collate_fn(batch):
    """Transpose a list of sample dicts into per-field lists.

    Args:
        batch: Sequence of dicts, each holding the keys ``path``, ``image``,
            ``length``, ``img_size`` and ``text``.

    Returns:
        A 5-tuple ``(paths, images, lengths, img_sizes, texts)`` of lists,
        each in the same order as ``batch``.
    """
    field_names = ("path", "image", "length", "img_size", "text")
    return tuple([sample[field] for sample in batch] for field in field_names)


class Timer:
    """Context manager that records the wall-clock duration of its body.

    After the ``with`` block exits, ``time_taken`` holds
    ``end_time - start_time`` in seconds, as measured by ``time.time()``.
    Exceptions raised inside the block are not suppressed.
    """

    def __init__(self):
        self.time_taken = self.start_time = self.end_time = 0

    def __enter__(self):
        self.start_time = time.time()
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        finished_at = time.time()
        self.end_time = finished_at
        self.time_taken = finished_at - self.start_time
def plot_data(data, column, bins, name):
    """Save a histogram of ``data[column]`` to the image file ``name``.

    Args:
        data: DataFrame-like object providing ``hist(column=..., bins=...)``.
        column: Name of the column to plot.
        bins: Number of histogram bins.
        name: Output image path. Its parent directory is created if needed.
    """
    plt.clf()
    data.hist(column=column, bins=bins)
    out_dir = os.path.dirname(name)
    # dirname is "" for a bare file name, and os.makedirs("") raises.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    plt.savefig(name)
    print(f"Saved {name}")


def plot_categorical_data(data, column, name):
    """Save a bar chart of the value counts of ``data[column]`` to ``name``.

    Args:
        data: DataFrame-like object; ``data[column].value_counts()`` is plotted.
        column: Name of the categorical column.
        name: Output image path. Its parent directory is created if needed.
    """
    plt.clf()
    data[column].value_counts().plot(kind="bar")
    out_dir = os.path.dirname(name)
    # dirname is "" for a bare file name, and os.makedirs("") raises.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    plt.savefig(name)
    print(f"Saved {name}")
def scan_recursively(root):
    """Yield an ``os.DirEntry`` for every file found under ``root``.

    Sub-directories are descended into depth-first. A progress line is
    printed each time the number of sub-directories seen directly inside the
    directory being scanned reaches a multiple of 100.

    Args:
        root: Directory to walk.

    Yields:
        ``os.DirEntry`` objects for which ``is_file()`` is true.
    """
    num = 0
    # The context manager releases the directory handle even if the consumer
    # abandons the generator early or an error interrupts the walk.
    with os.scandir(root) as entries:
        for entry in entries:
            if entry.is_file():
                yield entry
            elif entry.is_dir():
                num += 1
                if num % 100 == 0:
                    print(f"Scanned {num} directories.")
                yield from scan_recursively(entry.path)


def get_filelist(file_path, exts=None):
    """Collect the paths of all files under ``file_path``.

    Args:
        file_path: Directory to scan recursively.
        exts: Optional collection of lowercase extensions including the dot
            (e.g. ``(".mp4", ".avi")``). A file is kept when its lowercased
            extension is in ``exts``; ``None`` keeps every file.

    Returns:
        List of matching file paths, in scan order.
    """
    time_start = time.time()
    # scan_recursively only yields files, so no further type check is needed.
    filelist = [
        entry.path
        for entry in scan_recursively(file_path)
        if exts is None or os.path.splitext(entry.name)[-1].lower() in exts
    ]
    time_end = time.time()
    print(f"Scanned {len(filelist)} files in {time_end - time_start:.2f} seconds.")
    return filelist


def split_by_capital(name):
    """Insert a space before every uppercase letter except the first character.

    Example: ``BoxingPunchingBag`` -> ``Boxing Punching Bag``.
    """
    return "".join(" " + char if index != 0 and char.isupper() else char for index, char in enumerate(name))
f"imagenet_{split}.csv" df = pd.DataFrame(samples, columns=["path", "text"]) df.to_csv(output, index=False) print(f"Saved {len(samples)} samples to {output}.") def process_ucf101(root, split): root = os.path.expanduser(root) video_lists = get_filelist(os.path.join(root, split)) classes = [x.split("/")[-2] for x in video_lists] classes = [split_by_capital(x) for x in classes] samples = list(zip(video_lists, classes)) output = f"ucf101_{split}.csv" df = pd.DataFrame(samples, columns=["path", "text"]) df.to_csv(output, index=False) print(f"Saved {len(samples)} samples to {output}.") def process_vidprom(root, info): root = os.path.expanduser(root) video_lists = get_filelist(root) video_set = set(video_lists) # read info csv infos = pd.read_csv(info) abs_path = infos["uuid"].apply(lambda x: os.path.join(root, f"pika-{x}.mp4")) is_exist = abs_path.apply(lambda x: x in video_set) df = pd.DataFrame(dict(path=abs_path[is_exist], text=infos["prompt"][is_exist])) df.to_csv("vidprom.csv", index=False) print(f"Saved {len(df)} samples to vidprom.csv.") def process_general_images(root, output): root = os.path.expanduser(root) if not os.path.exists(root): return path_list = get_filelist(root, IMG_EXTENSIONS) fname_list = [os.path.splitext(os.path.basename(x))[0] for x in path_list] df = pd.DataFrame(dict(id=fname_list, path=path_list)) os.makedirs(os.path.dirname(output), exist_ok=True) df.to_csv(output, index=False) print(f"Saved {len(df)} samples to {output}.") def process_general_videos(root, output): root = os.path.expanduser(root) if not os.path.exists(root): return path_list = get_filelist(root, VID_EXTENSIONS) path_list = list(set(path_list)) # remove duplicates fname_list = [os.path.splitext(os.path.basename(x))[0] for x in path_list] relpath_list = [os.path.relpath(x, root) for x in path_list] df = pd.DataFrame(dict(path=path_list, id=fname_list, relpath=relpath_list)) os.makedirs(os.path.dirname(output), exist_ok=True) df.to_csv(output, index=False) print(f"Saved 
def apply(df, func, **kwargs):
    """Apply ``func`` over ``df``, in parallel when ``PANDA_USE_PARALLEL`` is set.

    Uses ``df.parallel_apply`` when the flag is true and ``df.progress_apply``
    otherwise. Extra keyword arguments are forwarded unchanged.
    """
    method_name = "parallel_apply" if PANDA_USE_PARALLEL else "progress_apply"
    return getattr(df, method_name)(func, **kwargs)
# Sentinel returned by the info helpers when a file cannot be read.
_FAILED_INFO = (0, 0, 0, np.nan, np.nan, np.nan)


def _media_summary(num_frames, height, width, fps):
    """Pack raw measurements into ``(num_frames, height, width, aspect_ratio, fps, hw)``."""
    hw = height * width
    aspect_ratio = height / width if width > 0 else np.nan
    return num_frames, height, width, aspect_ratio, fps, hw


def get_info_old(path):
    """Return the info tuple of an image or video using OpenCV only.

    Returns:
        ``(num_frames, height, width, aspect_ratio, fps, resolution)``; the
        all-zero/NaN tuple when the file cannot be read.
    """
    try:
        ext = os.path.splitext(path)[1].lower()
        if ext in IMG_EXTENSIONS:
            im = cv2.imread(path)
            if im is None:
                return _FAILED_INFO
            height, width = im.shape[:2]
            return _media_summary(1, height, width, np.nan)

        cap = cv2.VideoCapture(path)
        try:
            return _media_summary(
                get_video_length(cap, method="header"),
                int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
                int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
                float(cap.get(cv2.CAP_PROP_FPS)),
            )
        finally:
            # Free the capture handle; this runs once per dataset row.
            cap.release()
    except Exception:  # best effort: unreadable files map to the failure tuple
        return _FAILED_INFO


def get_info(path):
    """Return the info tuple for ``path``, choosing the image or video reader by extension."""
    try:
        ext = os.path.splitext(path)[1].lower()
        if ext in IMG_EXTENSIONS:
            return get_image_info(path)
        return get_video_info(path)
    except Exception:  # keep Ctrl-C / SystemExit working during long scans
        return _FAILED_INFO


def get_image_info(path, backend="pillow"):
    """Return ``(1, height, width, aspect_ratio, nan, resolution)`` for an image.

    Args:
        path: Image file to inspect.
        backend: ``"pillow"`` (default) or ``"cv2"``.

    Returns:
        The info tuple, or the all-zero/NaN tuple when the image cannot be read.

    Raises:
        ValueError: If ``backend`` is not one of the supported values.
    """
    if backend not in ("pillow", "cv2"):
        raise ValueError(f"Unsupported image backend: {backend!r}")
    try:
        if backend == "pillow":
            with open(path, "rb") as f:
                img = Image.open(f)
                # Converting forces a full decode while the file is still open.
                img = img.convert("RGB")
            width, height = img.size
        else:
            im = cv2.imread(path)
            if im is None:
                return _FAILED_INFO
            height, width = im.shape[:2]
        return _media_summary(1, height, width, np.nan)
    except Exception:  # best effort: unreadable files map to the failure tuple
        return _FAILED_INFO
# Boilerplate openings produced by the captioning model; matched both as
# written and fully lower-cased.
LLAVA_PREFIX = [
    "The video shows",
    "The video captures",
    "The video features",
    "The video depicts",
    "The video presents",
    "The video features",
    "The video is ",
    "In the video,",
    "The image shows",
    "The image captures",
    "The image features",
    "The image depicts",
    "The image presents",
    "The image features",
    "The image is ",
    "The image portrays",
    "In the image,",
]


def remove_caption_prefix(caption):
    """Strip the first matching ``LLAVA_PREFIX`` opening from ``caption``.

    The remainder is whitespace-trimmed and its first letter capitalised.
    Captions without a known prefix are returned unchanged. A caption that
    consists only of a prefix yields an empty string.
    """
    for prefix in LLAVA_PREFIX:
        if caption.startswith((prefix, prefix.lower())):
            caption = caption[len(prefix) :].strip()
            # Guard against an empty remainder before indexing caption[0].
            if caption and caption[0].islower():
                caption = caption[0].upper() + caption[1:]
            return caption
    return caption
def clean_caption(caption):
    """Normalise a raw caption for T5 text conditioning.

    The caption is URL-unquoted, lower-cased and then passed through a fixed
    sequence of regex substitutions that remove URLs, HTML, @-mentions, CJK
    ranges, ids, file names, shop boilerplate and stray punctuation.

    Args:
        caption: Any object; it is converted with ``str`` first.

    Returns:
        The cleaned caption with surrounding whitespace removed.
    """
    import urllib.parse as ul

    from bs4 import BeautifulSoup

    caption = str(caption)
    caption = ul.unquote_plus(caption)
    caption = caption.strip().lower()
    # The placeholder tag must be matched literally: an empty pattern would
    # insert "person" between every character of the caption.
    caption = re.sub("<person>", "person", caption)
    # urls:
    caption = re.sub(
        r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    caption = re.sub(
        r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    # html:
    caption = BeautifulSoup(caption, features="html.parser").text

    # @nickname mentions
    caption = re.sub(r"@[\w\d]+\b", "", caption)

    # 31C0—31EF CJK Strokes
    # 31F0—31FF Katakana Phonetic Extensions
    # 3200—32FF Enclosed CJK Letters and Months
    # 3300—33FF CJK Compatibility
    # 3400—4DBF CJK Unified Ideographs Extension A
    # 4DC0—4DFF Yijing Hexagram Symbols
    # 4E00—9FFF CJK Unified Ideographs
    caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
    caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
    caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
    caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
    caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
    caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
    caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
    #######################################################

    # all types of dash --> "-"
    caption = re.sub(
        r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
        "-",
        caption,
    )

    # normalise quotation marks to one standard
    caption = re.sub(r"[`´«»“”¨]", '"', caption)
    caption = re.sub(r"[‘’]", "'", caption)

    # leftover HTML entity: &quot;
    caption = re.sub(r"&quot;?", "", caption)
    # leftover HTML entity: &amp
    caption = re.sub(r"&amp", "", caption)

    # ip addresses:
    caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)

    # article ids:
    caption = re.sub(r"\d:\d\d\s+$", "", caption)

    # literal "\n" sequences
    caption = re.sub(r"\\n", " ", caption)

    # "#123"
    caption = re.sub(r"#\d{1,3}\b", "", caption)
    # "#12345.."
    caption = re.sub(r"#\d{5,}\b", "", caption)
    # "123456.."
    caption = re.sub(r"\b\d{6,}\b", "", caption)
    # filenames:
    caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)

    # repeated quotes and dots
    caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
    caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""

    caption = re.sub(BAD_PUNCT_REGEX, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
    caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "

    # this-is-my-cute-cat / this_is_my_cute_cat
    regex2 = re.compile(r"(?:\-|\_)")
    if len(re.findall(regex2, caption)) > 3:
        caption = re.sub(regex2, " ", caption)

    caption = basic_clean(caption)

    caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
    caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
    caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231

    caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
    caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
    caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
    caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
    caption = re.sub(r"\bpage\s+\d+\b", "", caption)

    caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...

    caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)

    caption = re.sub(r"\b\s+\:\s+", r": ", caption)
    caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
    caption = re.sub(r"\s+", " ", caption)

    # NOTE(review): the original called caption.strip() here and discarded the
    # result. The caption is deliberately left unstripped at this point so the
    # output stays identical to the existing cleaning; confirm before changing.

    caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
    caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
    caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
    caption = re.sub(r"^\.\S+$", "", caption)

    return caption.strip()
def score_to_text(data):
    """Append the sample's scores to its caption, e.g. ``"... [aesthetic score: 5.3]"``.

    ``data`` is a mapping-like row with a ``"text"`` entry and optional
    ``"aes"`` / ``"flow"`` scores. Each present score is kept only when a
    fresh ``random.random()`` draw exceeds ``DROP_SCORE_PROB``.
    """
    text = data["text"]
    scores = []
    # Order matters: one random draw per present score, aesthetic first.
    for key, label in (("aes", "aesthetic score"), ("flow", "motion score")):
        if key in data:
            value = data[key]
            if random.random() > DROP_SCORE_PROB:
                scores.append(f"{label}: {value:.1f}")
    if scores:
        text = f"{text} [{', '.join(scores)}]"
    return text


# ======================================================
# read & write
# ======================================================


def read_file(input_path):
    """Load a ``.csv`` or ``.parquet`` file into a DataFrame.

    Raises:
        NotImplementedError: For any other file extension.
    """
    if input_path.endswith(".csv"):
        return pd.read_csv(input_path)
    if input_path.endswith(".parquet"):
        return pd.read_parquet(input_path)
    raise NotImplementedError(f"Unsupported file format: {input_path}")


def save_file(data, output_path):
    """Write ``data`` to ``output_path`` as csv or parquet, creating parent directories.

    Raises:
        NotImplementedError: For any other file extension.
    """
    output_dir = os.path.dirname(output_path)
    if output_dir:
        # exist_ok avoids the check-then-create race when several jobs write
        # into the same new directory.
        os.makedirs(output_dir, exist_ok=True)
    if output_path.endswith(".csv"):
        return data.to_csv(output_path, index=False)
    if output_path.endswith(".parquet"):
        return data.to_parquet(output_path, index=False)
    raise NotImplementedError(f"Unsupported file format: {output_path}")
def main(args):
    """Run the dataset processing steps selected on the command line.

    Stages run in a fixed order: load and combine the inputs, optional
    difference/intersection with other csv files, IO-derived columns, caption
    and path filtering, caption/path processing, sorting, score filtering,
    shuffle/head/column selection, then saving (optionally in shards).

    Args:
        args: Namespace produced by ``parse_args``.
    """
    # reading data
    data, input_name = read_data(args.input)

    # make difference
    if args.difference is not None:
        data_diff = pd.read_csv(args.difference)
        print(f"Difference csv contains {len(data_diff)} samples.")
        data = data[~data["path"].isin(data_diff["path"])]
        input_name += f"-{os.path.basename(args.difference).split('.')[0]}"
        print(f"Filtered number of samples: {len(data)}.")

    # make intersection
    if args.intersection is not None:
        data_new = pd.read_csv(args.intersection)
        print(f"Intersection csv contains {len(data_new)} samples.")
        # Only bring in columns the current data does not already have,
        # plus the join key.
        cols_to_use = data_new.columns.difference(data.columns)
        col_on = "path"
        cols_to_use = cols_to_use.insert(0, col_on)
        data = pd.merge(data, data_new[cols_to_use], on=col_on, how="inner")
        print(f"Intersection number of samples: {len(data)}.")

    # get output path
    output_path = get_output_path(args, input_name)

    # preparation
    if args.lang is not None:
        detect_lang = build_lang_detector(args.lang)
    if args.count_num_token == "t5":
        from transformers import AutoTokenizer

        tokenizer = AutoTokenizer.from_pretrained("DeepFloyd/t5-v1_1-xxl")

    # IO-related
    if args.load_caption is not None:
        assert "path" in data.columns
        data["text"] = apply(data["path"], load_caption, ext=args.load_caption)
    if args.info:
        info = apply(data["path"], get_info)
        (
            data["num_frames"],
            data["height"],
            data["width"],
            data["aspect_ratio"],
            data["fps"],
            data["resolution"],
        ) = zip(*info)
    if args.video_info:
        info = apply(data["path"], get_video_info)
        (
            data["num_frames"],
            data["height"],
            data["width"],
            data["aspect_ratio"],
            data["fps"],
            data["resolution"],
        ) = zip(*info)
    if args.ext:
        assert "path" in data.columns
        data = data[apply(data["path"], os.path.exists)]

    # filtering
    if args.remove_url:
        assert "text" in data.columns
        # A plain pattern is enough for a containment test. na=False keeps
        # rows without a caption from breaking the boolean mask.
        has_url = data["text"].str.contains(r"https?://[^\s]+", regex=True, na=False)
        data = data[~has_url]
    if args.lang is not None:
        assert "text" in data.columns
        data = data[data["text"].progress_apply(detect_lang)]  # cannot parallelize
    if args.remove_empty_path:
        assert "path" in data.columns
        data = data[data["path"].str.len() > 0]
        data = data[~data["path"].isna()]
    if args.remove_empty_caption:
        assert "text" in data.columns
        data = data[data["text"].str.len() > 0]
        data = data[~data["text"].isna()]
    if args.remove_path_duplication:
        assert "path" in data.columns
        data = data.drop_duplicates(subset=["path"])
    if args.path_subset:
        data = data[data["path"].str.contains(args.path_subset)]

    # processing
    if args.relpath is not None:
        data["path"] = apply(data["path"], lambda x: os.path.relpath(x, args.relpath))
    if args.abspath is not None:
        data["path"] = apply(data["path"], lambda x: os.path.join(args.abspath, x))
    if args.path_to_id:
        data["id"] = apply(data["path"], lambda x: os.path.splitext(os.path.basename(x))[0])
    if args.merge_cmotion:
        data["text"] = apply(data, lambda x: merge_cmotion(x["text"], x["cmotion"]), axis=1)
    if args.refine_llm_caption:
        assert "text" in data.columns
        data["text"] = apply(data["text"], remove_caption_prefix)
    if args.append_text is not None:
        assert "text" in data.columns
        data["text"] = data["text"] + args.append_text
    if args.score_to_text:
        data["text"] = apply(data, score_to_text, axis=1)
    if args.clean_caption:
        assert "text" in data.columns
        data["text"] = apply(
            data["text"],
            partial(text_preprocessing, use_text_preprocessing=True),
        )
    if args.count_num_token is not None:
        assert "text" in data.columns
        data["text_len"] = apply(data["text"], lambda x: len(tokenizer(x)["input_ids"]))
    if args.update_text is not None:
        data_new = pd.read_csv(args.update_text)
        num_updated = data.path.isin(data_new.path).sum()
        print(f"Number of updated samples: {num_updated}.")
        # Align both frames on "path" so update() overwrites matching captions.
        data = data.set_index("path")
        data_new = data_new[["path", "text"]].set_index("path")
        data.update(data_new)
        data = data.reset_index()

    # sort
    if args.sort is not None:
        data = data.sort_values(by=args.sort, ascending=False)
    if args.sort_ascending is not None:
        data = data.sort_values(by=args.sort_ascending, ascending=True)

    # filtering
    if args.filesize:
        assert "path" in data.columns
        data["filesize"] = apply(data["path"], lambda x: os.stat(x).st_size / 1024 / 1024)
    if args.fsmax is not None:
        assert "filesize" in data.columns
        data = data[data["filesize"] <= args.fsmax]
    if args.remove_empty_caption:
        # Repeated on purpose: caption processing above may have emptied captions.
        assert "text" in data.columns
        data = data[data["text"].str.len() > 0]
        data = data[~data["text"].isna()]
    if args.fmin is not None:
        assert "num_frames" in data.columns
        data = data[data["num_frames"] >= args.fmin]
    if args.fmax is not None:
        assert "num_frames" in data.columns
        data = data[data["num_frames"] <= args.fmax]
    if args.fpsmax is not None:
        assert "fps" in data.columns
        # Rows with NaN fps are kept.
        data = data[(data["fps"] <= args.fpsmax) | np.isnan(data["fps"])]
    if args.hwmax is not None:
        if "resolution" not in data.columns:
            height = data["height"]
            width = data["width"]
            data["resolution"] = height * width
        data = data[data["resolution"] <= args.hwmax]
    if args.aesmin is not None:
        assert "aes" in data.columns
        data = data[data["aes"] >= args.aesmin]
    if args.matchmin is not None:
        assert "match" in data.columns
        data = data[data["match"] >= args.matchmin]
    if args.flowmin is not None:
        assert "flow" in data.columns
        data = data[data["flow"] >= args.flowmin]
    if args.remove_text_duplication:
        data = data.drop_duplicates(subset=["text"], keep="first")
    if args.img_only:
        data = data[data["path"].str.lower().str.endswith(IMG_EXTENSIONS)]
    if args.vid_only:
        data = data[~data["path"].str.lower().str.endswith(IMG_EXTENSIONS)]

    # process data
    if args.shuffle:
        data = data.sample(frac=1).reset_index(drop=True)  # shuffle
    if args.head is not None:
        data = data.head(args.head)

    # train columns
    if args.train_column:
        all_columns = data.columns
        columns_to_drop = all_columns.difference(TRAIN_COLUMNS)
        data = data.drop(columns=columns_to_drop)

    print(f"Filtered number of samples: {len(data)}.")

    # shard data
    if args.shard is not None:
        # splitext only touches the file extension, so dots in directory
        # names cannot corrupt the shard file names.
        stem, ext = os.path.splitext(output_path)
        # Split row positions instead of the frame itself: np.array_split on a
        # DataFrame goes through the deprecated DataFrame.swapaxes.
        for i, rows in enumerate(np.array_split(np.arange(len(data)), args.shard)):
            shard = data.iloc[rows]
            output_path_s = f"{stem}_{i}{ext}"
            save_file(shard, output_path_s)
            print(f"Saved {len(shard)} samples to {output_path_s}.")
    else:
        save_file(data, output_path)
        print(f"Saved {len(data)} samples to {output_path}.")
action="store_true", help="get the basic information of each video and image") parser.add_argument("--video-info", action="store_true", help="get the basic information of each video") parser.add_argument("--ext", action="store_true", help="check if the file exists") parser.add_argument( "--load-caption", type=str, default=None, choices=["json", "txt"], help="load the caption from json or txt" ) # path processing parser.add_argument("--relpath", type=str, default=None, help="modify the path to relative path by root given") parser.add_argument("--abspath", type=str, default=None, help="modify the path to absolute path by root given") parser.add_argument("--path-to-id", action="store_true", help="add id based on path") parser.add_argument( "--path-subset", type=str, default=None, help="extract a subset data containing the given `path-subset` value" ) parser.add_argument( "--remove-empty-path", action="store_true", help="remove rows with empty path", # caused by transform, cannot read path ) # caption filtering parser.add_argument( "--remove-empty-caption", action="store_true", help="remove rows with empty caption", ) parser.add_argument("--remove-url", action="store_true", help="remove rows with url in caption") parser.add_argument("--lang", type=str, default=None, help="remove rows with other language") parser.add_argument("--remove-path-duplication", action="store_true", help="remove rows with duplicated path") parser.add_argument("--remove-text-duplication", action="store_true", help="remove rows with duplicated caption") # caption processing parser.add_argument("--refine-llm-caption", action="store_true", help="modify the caption generated by LLM") parser.add_argument( "--clean-caption", action="store_true", help="modify the caption according to T5 pipeline to suit training" ) parser.add_argument("--merge-cmotion", action="store_true", help="merge the camera motion to the caption") parser.add_argument( "--count-num-token", type=str, choices=["t5"], default=None, 
def get_output_path(args, input_name):
    """Return the output file path for this run.

    ``args.output`` is returned as-is when given. Otherwise the file name is
    ``input_name`` followed by one suffix per active option, in a fixed order,
    with ``args.format`` as extension. The file is placed in the directory of
    the first input path.
    """
    if args.output is not None:
        return args.output

    suffixes = []

    def add_if(enabled, suffix):
        # Record ``suffix`` when the corresponding option is active.
        if enabled:
            suffixes.append(suffix)

    # sort (the two sort options are mutually exclusive)
    if args.sort is not None:
        assert args.sort_ascending is None
        suffixes.append("_sort")
    if args.sort_ascending is not None:
        assert args.sort is None
        suffixes.append("_sort")

    # IO-related
    # for IO-related, the function must be wrapped in try-except
    add_if(args.info, "_info")
    add_if(args.video_info, "_vinfo")
    add_if(args.ext, "_ext")
    add_if(args.load_caption, f"_load{args.load_caption}")

    # path processing
    add_if(args.relpath is not None, "_relpath")
    add_if(args.abspath is not None, "_abspath")
    add_if(args.remove_empty_path, "_noemptypath")

    # caption filtering
    add_if(args.remove_empty_caption, "_noempty")
    add_if(args.remove_url, "_nourl")
    add_if(args.lang is not None, f"_{args.lang}")
    add_if(args.remove_path_duplication, "_noduppath")
    add_if(args.remove_text_duplication, "_noduptext")
    add_if(args.path_subset, "_subset")

    # caption processing
    add_if(args.refine_llm_caption, "_llm")
    add_if(args.clean_caption, "_clean")
    add_if(args.merge_cmotion, "_cmcaption")
    add_if(args.count_num_token, "_ntoken")
    add_if(args.append_text is not None, "_appendtext")
    add_if(args.score_to_text, "_score2text")
    add_if(args.update_text is not None, "_update")

    # score filtering
    add_if(args.filesize, "_filesize")
    add_if(args.fsmax is not None, f"_fsmax{args.fsmax}")
    add_if(args.fmin is not None, f"_fmin{args.fmin}")
    add_if(args.fmax is not None, f"_fmax{args.fmax}")
    add_if(args.fpsmax is not None, f"_fpsmax{args.fpsmax}")
    add_if(args.hwmax is not None, f"_hwmax{args.hwmax}")
    add_if(args.aesmin is not None, f"_aesmin{args.aesmin}")
    add_if(args.matchmin is not None, f"_matchmin{args.matchmin}")
    add_if(args.flowmin is not None, f"_flowmin{args.flowmin}")
    add_if(args.img_only, "_img")
    add_if(args.vid_only, "_vid")

    # processing
    add_if(args.shuffle, f"_shuffled_seed{args.seed}")
    add_if(args.head is not None, f"_first_{args.head}_data")

    file_name = f"{input_name}{''.join(suffixes)}.{args.format}"
    return os.path.join(os.path.dirname(args.input[0]), file_name)
def clean_caption(caption):
    """Normalise a raw caption for T5 text conditioning.

    The caption is URL-unquoted, lower-cased and then passed through a fixed
    sequence of regex substitutions that remove URLs, HTML, @-mentions, CJK
    ranges, ids, file names, shop boilerplate and stray punctuation.

    Args:
        caption: Any object; it is converted with ``str`` first.

    Returns:
        The cleaned caption with surrounding whitespace removed.
    """
    import urllib.parse as ul

    from bs4 import BeautifulSoup

    caption = str(caption)
    caption = ul.unquote_plus(caption)
    caption = caption.strip().lower()
    # The placeholder tag must be matched literally: an empty pattern would
    # insert "person" between every character of the caption.
    caption = re.sub("<person>", "person", caption)
    # urls:
    caption = re.sub(
        r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    caption = re.sub(
        r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    # html:
    caption = BeautifulSoup(caption, features="html.parser").text

    # @nickname mentions
    caption = re.sub(r"@[\w\d]+\b", "", caption)

    # 31C0—31EF CJK Strokes
    # 31F0—31FF Katakana Phonetic Extensions
    # 3200—32FF Enclosed CJK Letters and Months
    # 3300—33FF CJK Compatibility
    # 3400—4DBF CJK Unified Ideographs Extension A
    # 4DC0—4DFF Yijing Hexagram Symbols
    # 4E00—9FFF CJK Unified Ideographs
    caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
    caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
    caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
    caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
    caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
    caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
    caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
    #######################################################

    # all types of dash --> "-"
    caption = re.sub(
        r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
        "-",
        caption,
    )

    # normalise quotation marks to one standard
    caption = re.sub(r"[`´«»“”¨]", '"', caption)
    caption = re.sub(r"[‘’]", "'", caption)

    # leftover HTML entity: &quot;
    caption = re.sub(r"&quot;?", "", caption)
    # leftover HTML entity: &amp
    caption = re.sub(r"&amp", "", caption)

    # ip addresses:
    caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)

    # article ids:
    caption = re.sub(r"\d:\d\d\s+$", "", caption)

    # literal "\n" sequences
    caption = re.sub(r"\\n", " ", caption)

    # "#123"
    caption = re.sub(r"#\d{1,3}\b", "", caption)
    # "#12345.."
    caption = re.sub(r"#\d{5,}\b", "", caption)
    # "123456.."
    caption = re.sub(r"\b\d{6,}\b", "", caption)
    # filenames:
    caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)

    # repeated quotes and dots
    caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
    caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""

    caption = re.sub(BAD_PUNCT_REGEX, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
    caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "

    # this-is-my-cute-cat / this_is_my_cute_cat
    regex2 = re.compile(r"(?:\-|\_)")
    if len(re.findall(regex2, caption)) > 3:
        caption = re.sub(regex2, " ", caption)

    caption = basic_clean(caption)

    caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
    caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
    caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231

    caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
    caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
    caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
    caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
    caption = re.sub(r"\bpage\s+\d+\b", "", caption)

    caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...

    caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)

    caption = re.sub(r"\b\s+\:\s+", r": ", caption)
    caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
    caption = re.sub(r"\s+", " ", caption)

    # NOTE(review): the original called caption.strip() here and discarded the
    # result. The caption is deliberately left unstripped at this point so the
    # output stays identical to the existing cleaning; confirm before changing.

    caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
    caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
    caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
    caption = re.sub(r"^\.\S+$", "", caption)

    return caption.strip()
def filter_panda10m_timestamp(meta_path):
    """
    Keep only the rows of ``meta_path`` whose (video id, timestamp) pair also
    appears in the Panda-70M 10m training meta file.

    The video id of a row is the basename of its ``path`` column up to the last
    underscore. The filtered table is written next to the input file with the
    suffix ``_filter-10m`` inserted before the extension.

    params:
        meta_path: str, path of the CSV file to filter.
    """
    meta_path_10m = "/mnt/hdd/data/Panda-70M/raw/meta/train/panda70m_training_10m.csv"
    meta_10m = pd.read_csv(meta_path_10m)

    # Map every reference video id to its timestamps, each rendered as the string form of a tuple
    # so it can be compared directly with the "timestamp" cell of the file being filtered.
    # NOTE(review): eval() executes whatever the CSV cell contains; only use on trusted meta files.
    id2t = {}
    for _, ref_row in tqdm(meta_10m.iterrows(), total=len(meta_10m)):
        spans = eval(ref_row["timestamp"])
        id2t[ref_row["videoID"]] = [str(tuple(span)) for span in spans]

    print(f"==> Loaded meta_10m from '{meta_path_10m}'")

    def process_single_row(row):
        clip_path = row["path"]
        clip_time = row["timestamp"]
        clip_name = os.path.basename(clip_path)
        video_id = clip_name[: clip_name.rindex("_")]
        return video_id in id2t and clip_time in id2t[video_id]

    meta = pd.read_csv(meta_path)
    keep_mask = apply(meta, process_single_row, axis=1)
    meta = meta[keep_mask]

    stem, suffix = os.path.splitext(meta_path)
    out_path = f"{stem}_filter-10m{suffix}"
    meta.to_csv(out_path, index=False)
    print(f"New meta (shape={meta.shape}) saved to '{out_path}'.")
def split_by_bucket(
    bucket: "Bucket",
    input_files: List[str],
    output_path: str,
    limit: int,
    frame_interval: int,
):
    """
    Select at most ``limit`` rows per bucket from the input CSV files and save them.

    Rows are visited in file order. A row is kept when ``bucket.get_bucket_id``
    assigns it to a bucket that still has room; scanning stops as soon as
    ``len(bucket) * limit`` rows have been collected.

    Args:
        bucket: object exposing ``ar_criteria`` (nested mapping hw_id -> t_id -> ar_id),
            ``get_bucket_id(t, h, w, frame_interval)`` and ``len()``.
        input_files: CSV files with ``num_frames``, ``height`` and ``width`` columns.
        output_path: destination CSV path.
        limit: maximum number of rows kept per bucket.
        frame_interval: forwarded unchanged to ``bucket.get_bucket_id``.
    """
    print(f"Split {len(input_files)} files into {len(bucket)} buckets")
    total_limit = len(bucket) * limit

    # one counter per (hw_id, t_id, ar_id) bucket
    bucket_cnt = {
        (hw_id, t_id, ar_id): 0
        for hw_id, t_map in bucket.ar_criteria.items()
        for t_id, ar_map in t_map.items()
        for ar_id in ar_map
    }

    # Accepted rows are gathered per file and concatenated once at the end. Growing a
    # DataFrame with pd.concat for every row is quadratic, and selecting with iloc on the
    # whole frame keeps the original column dtypes instead of going through a row Series.
    kept_parts = []
    columns = None
    num_kept = 0
    for path in input_files:
        df = pd.read_csv(path)
        if columns is None:
            columns = df.columns

        keep_pos = []
        for pos, (t, h, w) in enumerate(zip(df["num_frames"], df["height"], df["width"])):
            if num_kept >= total_limit:
                break
            bucket_id = bucket.get_bucket_id(t, h, w, frame_interval)
            if bucket_id is None:
                continue
            if bucket_cnt[bucket_id] < limit:
                bucket_cnt[bucket_id] += 1
                keep_pos.append(pos)
                num_kept += 1

        if keep_pos:
            kept_parts.append(df.iloc[keep_pos])
        if num_kept >= total_limit:
            break

    if kept_parts:
        output_df = pd.concat(kept_parts, ignore_index=True)
    else:
        # nothing selected (or no input files at all): still emit a header-only table
        output_df = pd.DataFrame(columns=columns)

    assert len(output_df) <= total_limit
    if len(output_df) == total_limit:
        print(f"All buckets are full ({total_limit} samples)")
    else:
        print(f"Only {len(output_df)} files are used")
    output_df.to_csv(output_path, index=False)
def get_new_path(path, input_dir, output):
    """
    Mirror ``path`` (taken relative to ``input_dir``) under the ``output`` directory.

    The parent directory of the mirrored path is created if it does not exist yet.

    Returns:
        str: the mirrored path inside ``output``.
    """
    relative = os.path.relpath(path, input_dir)
    path_new = os.path.join(output, relative)
    parent_dir = os.path.dirname(path_new)
    os.makedirs(parent_dir, exist_ok=True)
    return path_new
def main(args):
    """
    Apply the selected transform to every row of the input CSV and save a new CSV.

    Args:
        args: parsed command-line namespace. Reads ``input``, ``method``, ``input_dir``,
            ``output`` and, depending on the method, ``length``, ``points`` and
            ``points_index``.

    Raises:
        ValueError: if ``method`` is unknown, or if ``vid_frame_extract`` is requested
            without any sampling point.
    """
    data = pd.read_csv(args.input)
    if args.method == "img_rand_crop":
        data["path"] = apply(data["path"], lambda x: rand_crop(x, args.input_dir, args.output))
        output_csv = args.input.replace(".csv", "_rand_crop.csv")
    elif args.method == "img_resize":
        data["path"] = apply(data["path"], lambda x: resize(x, args.length, args.input_dir, args.output))
        output_csv = args.input.replace(".csv", f"_resized{args.length}.csv")
    elif args.method == "vid_frame_extract":
        points = args.points if args.points is not None else args.points_index
        if not points:
            raise ValueError("vid_frame_extract requires --points or --points_index")
        num_points = len(points)
        # Duplicate every row once per sampling point. The copies of one row are adjacent,
        # so the i-th copy of each row sits at positions i, i + num_points, ...
        # (The repeat count used to be hard-coded to 3, which only matched 3 points.)
        data = pd.DataFrame(np.repeat(data.values, num_points, axis=0), columns=data.columns)
        data["point"] = np.nan
        for i, point in enumerate(points):
            if isinstance(point, int):
                # absolute frame index
                data.loc[i::num_points, "point"] = point
            else:
                # fraction of the clip length
                data.loc[i::num_points, "point"] = data.loc[i::num_points, "num_frames"] * point
        # NOTE(review): extract_frames as defined in tools/datasets/utils.py takes
        # (video_path, frame_inds, points, backend, ...); this positional call does not match
        # that signature. Confirm which helper is intended here.
        data["path"] = apply(data, lambda x: extract_frames(x["path"], args.input_dir, args.output, x["point"]), axis=1)
        output_csv = args.input.replace(".csv", "_vid_frame_extract.csv")
    else:
        raise ValueError(f"Unknown method: {args.method}")

    data.to_csv(output_csv, index=False)
    print(f"Saved to {output_csv}")
def extract_frames(
    video_path,
    frame_inds=None,
    points=None,
    backend="opencv",
    return_length=False,
    num_frames=None,
):
    """
    Decode selected frames of a video as PIL images.

    Args:
        video_path (str): path to video
        frame_inds (List[int]): indices of frames to extract
        points (List[float]): values within [0, 1); multiply #frames to get frame indices
        backend (str): one of "av", "opencv", "decord"
        return_length (bool): also return the total number of frames
        num_frames (int): if given, used as the total number of frames instead of
            querying the video
    Return:
        List[PIL.Image], or (List[PIL.Image], int) when ``return_length`` is set

    Only one of ``frame_inds`` and ``points`` may be given. Indices past the end are
    clamped to the last frame.
    """
    assert backend in ["av", "opencv", "decord"]
    assert (frame_inds is None) or (points is None)

    if backend == "av":
        import av

        container = av.open(video_path)
        # close the container even when seeking/decoding fails
        try:
            if num_frames is not None:
                total_frames = num_frames
            else:
                total_frames = container.streams.video[0].frames
            if points is not None:
                frame_inds = [int(p * total_frames) for p in points]

            frames = []
            for idx in frame_inds:
                if idx >= total_frames:
                    idx = total_frames - 1
                target_timestamp = int(idx * av.time_base / container.streams.video[0].average_rate)
                container.seek(target_timestamp)
                frame = next(container.decode(video=0)).to_image()
                frames.append(frame)
        finally:
            container.close()

        if return_length:
            return frames, total_frames
        return frames

    elif backend == "decord":
        import decord

        container = decord.VideoReader(video_path, num_threads=1)
        if num_frames is not None:
            total_frames = num_frames
        else:
            total_frames = len(container)
        if points is not None:
            frame_inds = [int(p * total_frames) for p in points]
        frame_inds = np.array(frame_inds).astype(np.int32)
        frame_inds[frame_inds >= total_frames] = total_frames - 1
        frames = container.get_batch(frame_inds).asnumpy()  # [N, H, W, C]
        frames = [Image.fromarray(x) for x in frames]

        if return_length:
            return frames, total_frames
        return frames

    elif backend == "opencv":
        cap = cv2.VideoCapture(video_path)
        # release the capture handle on every exit path
        try:
            if num_frames is not None:
                total_frames = num_frames
            else:
                total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if points is not None:
                frame_inds = [int(p * total_frames) for p in points]

            frames = []
            for idx in frame_inds:
                if idx >= total_frames:
                    idx = total_frames - 1

                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)

                # HACK: sometimes OpenCV fails to read frames, return a black frame instead
                try:
                    ret, frame = cap.read()
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frame = Image.fromarray(frame)
                except Exception as e:
                    print(f"[Warning] Error reading frame {idx} from {video_path}: {e}")
                    # First, try to read the first frame
                    try:
                        print("[Warning] Try reading first frame.")
                        cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
                        ret, frame = cap.read()
                        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                        frame = Image.fromarray(frame)
                    # If that fails, return a black frame
                    except Exception as first_err:
                        print(f"[Warning] Error in reading first frame from {video_path}: {first_err}")
                        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                        frame = Image.new("RGB", (width, height), (0, 0, 0))

                # HACK: if height or width is 0, return a black frame instead
                if frame.height == 0 or frame.width == 0:
                    height = width = 256
                    frame = Image.new("RGB", (width, height), (0, 0, 0))

                frames.append(frame)
        finally:
            cap.release()

        if return_length:
            return frames, total_frames
        return frames
    else:
        raise ValueError
def get_input_video_from_path(input_path):
    """
    Get the input video from the input_path.

    params:
        input_path: str, the path of the input video.
    returns:
        inputs: list, the list of the input frames (padded tensors on ``device``).
        scale: float, the scale of the input frames.
        padder: InputPadder, the padder to pad the input frames.
    raises:
        TypeError: if the extension of input_path is not in VID_EXT.
    """

    anchor_resolution, anchor_memory, anchor_memory_bias, vram_avail = init()

    if osp.splitext(input_path)[-1].lower() not in VID_EXT:
        raise TypeError("Input should be a video.")

    vcap = cv2.VideoCapture(input_path)
    # make sure the capture handle is released even if decoding or padding fails
    try:
        inputs = []
        w = int(vcap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(vcap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # shrink the working resolution when the estimated memory budget is too small
        scale = anchor_resolution / (h * w) * np.sqrt((vram_avail - anchor_memory_bias) / anchor_memory)
        scale = 1 if scale > 1 else scale
        scale = 1 / np.floor(1 / np.sqrt(scale) * 16) * 16
        if scale < 1:
            print(f"Due to the limited VRAM, the video will be scaled by {scale:.2f}")
        padding = int(16 / scale)
        padder = InputPadder((h, w), padding)
        while True:
            ret, frame = vcap.read()
            if ret is False:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_t = img2tensor(frame).to(device)
            frame_t = padder.pad(frame_t)
            inputs.append(frame_t)
    finally:
        vcap.release()
    print(f"Loading the [video] from {input_path}, the number of frames [{len(inputs)}]")

    return inputs, scale, padder
def write(outputs, input_path, output_path, fps=30):
    """
    Encode the frames in ``outputs`` as an mp4 video inside ``output_path``.

    The file is named ``fps{fps}_{name}.mp4`` where ``name`` is the basename of
    ``input_path`` without its extension. ``output_path`` is created when missing.
    """
    if not osp.exists(output_path):
        os.makedirs(output_path)

    source_name = os.path.split(input_path)[1]
    file_name = os.path.splitext(source_name)[0]
    save_video_path = f"{output_path}/fps{fps}_{file_name}.mp4"

    # the last two dims of the first frame, reversed, are handed to OpenCV as the frame size
    frame_size = outputs[0].shape[2:][::-1]
    writer = cv2.VideoWriter(save_video_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, frame_size)

    for frame_tensor in outputs:
        rgb_frame = tensor2img(frame_tensor)
        writer.write(cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR))
    print(f"Demo video is saved to [{save_video_path}]")

    writer.release()
class Model(nn.Module):
    """
    Frame interpolation network: predicts an intermediate frame from two input frames.

    The forward pass encodes both frames, then runs four decoders from the coarsest
    level to the finest. Between decoders, the flows and features are refined by
    update blocks driven by a bidirectional correlation lookup.
    """

    def __init__(self, corr_radius=3, corr_lvls=4, num_flows=5, channels=[84, 96, 112, 128], skip_channels=84):
        """
        params:
            corr_radius: int, radius passed to the correlation block and the update blocks.
            corr_lvls: int, number of correlation levels.
            num_flows: int, number of flow pairs produced by the last decoder.
            channels: list, channel width of each of the four encoder pyramid levels.
            skip_channels: int, forwarded to every decoder.
        """
        super(Model, self).__init__()
        self.radius = corr_radius
        self.corr_levels = corr_lvls
        self.num_flows = num_flows

        self.feat_encoder = LargeEncoder(output_dim=128, norm_fn="instance", dropout=0.0)
        self.encoder = Encoder(channels, large=True)
        self.decoder4 = InitDecoder(channels[3], channels[2], skip_channels)
        self.decoder3 = IntermediateDecoder(channels[2], channels[1], skip_channels)
        self.decoder2 = IntermediateDecoder(channels[1], channels[0], skip_channels)
        self.decoder1 = MultiFlowDecoder(channels[0], skip_channels, num_flows)

        # NOTE(review): 112 / 96 / 84 equal the default channels[2] / channels[1] / channels[0];
        # they are not derived from `channels`, so non-default widths presumably need matching
        # changes here. Confirm before passing a custom `channels`.
        self.update4 = self._get_updateblock(112, None)
        self.update3_low = self._get_updateblock(96, 2.0)
        self.update2_low = self._get_updateblock(84, 4.0)

        self.update3_high = self._get_updateblock(96, None)
        self.update2_high = self._get_updateblock(84, None)

        # fuses the num_flows candidate predictions (3 channels each) into one 3-channel output
        self.comb_block = nn.Sequential(
            nn.Conv2d(3 * self.num_flows, 6 * self.num_flows, 7, 1, 3),
            nn.PReLU(6 * self.num_flows),
            nn.Conv2d(6 * self.num_flows, 3, 7, 1, 3),
        )

    def _get_updateblock(self, cdim, scale_factor=None):
        """Build a BasicUpdateBlock for features with ``cdim`` channels; other sizes are fixed."""
        return BasicUpdateBlock(
            cdim=cdim,
            hidden_dim=192,
            flow_dim=64,
            corr_dim=256,
            corr_dim2=192,
            fc_dim=188,
            scale_factor=scale_factor,
            corr_levels=self.corr_levels,
            radius=self.radius,
        )

    def _corr_scale_lookup(self, corr_fn, coord, flow0, flow1, embt, downsample=1):
        """
        Look up bidirectional correlation at coordinates displaced by the rescaled flows.

        When ``downsample`` is not 1, the flows are first resized by ``1 / downsample``
        and their magnitudes scaled by the same factor.

        returns:
            corr: both correlation lookups concatenated along dim 1.
            flow: flow0 and flow1 (after the optional downsampling) concatenated along dim 1.
        """
        # convert t -> 0 to 0 -> 1 | convert t -> 1 to 1 -> 0
        # based on linear assumption
        t1_scale = 1.0 / embt
        t0_scale = 1.0 / (1.0 - embt)
        if downsample != 1:
            inv = 1 / downsample
            flow0 = inv * resize(flow0, scale_factor=inv)
            flow1 = inv * resize(flow1, scale_factor=inv)

        corr0, corr1 = corr_fn(coord + flow1 * t1_scale, coord + flow0 * t0_scale)
        corr = torch.cat([corr0, corr1], dim=1)
        flow = torch.cat([flow0, flow1], dim=1)
        return corr, flow

    def forward(self, img0, img1, embt, scale_factor=1.0, eval=False, **kwargs):
        """
        Predict the frame between ``img0`` and ``img1``.

        params:
            img0, img1: the two input frames (4-D tensors; unpacked as b, _, h, w below).
            embt: time embedding tensor, passed to the first decoder and the correlation lookup.
            scale_factor: float, the frames are resized by this factor before encoding and
                the final flows, mask and residual are resized back by its inverse.
            eval: bool, if True only the predicted frame is returned.
        returns:
            dict with "imgt_pred" (clamped to [0, 1]); when ``eval`` is False also
            "flow0_pred", "flow1_pred" and "ft_pred" holding the per-level flows and features.
        """
        # per-sample mean over channels, height and width of both frames stacked along dim 2
        mean_ = torch.cat([img0, img1], 2).mean(1, keepdim=True).mean(2, keepdim=True).mean(3, keepdim=True)
        img0 = img0 - mean_
        img1 = img1 - mean_
        img0_ = resize(img0, scale_factor) if scale_factor != 1.0 else img0
        img1_ = resize(img1, scale_factor) if scale_factor != 1.0 else img1
        b, _, h, w = img0_.shape
        coord = coords_grid(b, h // 8, w // 8, img0.device)

        fmap0, fmap1 = self.feat_encoder([img0_, img1_])  # [1, 128, H//8, W//8]
        corr_fn = BidirCorrBlock(fmap0, fmap1, radius=self.radius, num_levels=self.corr_levels)

        # f0_1: [1, c0, H//2, W//2] | f0_2: [1, c1, H//4, W//4]
        # f0_3: [1, c2, H//8, W//8] | f0_4: [1, c3, H//16, W//16]
        f0_1, f0_2, f0_3, f0_4 = self.encoder(img0_)
        f1_1, f1_2, f1_3, f1_4 = self.encoder(img1_)

        ######################################### the 4th decoder #########################################
        up_flow0_4, up_flow1_4, ft_3_ = self.decoder4(f0_4, f1_4, embt)
        corr_4, flow_4 = self._corr_scale_lookup(corr_fn, coord, up_flow0_4, up_flow1_4, embt, downsample=1)

        # residue update with lookup corr
        delta_ft_3_, delta_flow_4 = self.update4(ft_3_, flow_4, corr_4)
        delta_flow0_4, delta_flow1_4 = torch.chunk(delta_flow_4, 2, 1)
        up_flow0_4 = up_flow0_4 + delta_flow0_4
        up_flow1_4 = up_flow1_4 + delta_flow1_4
        ft_3_ = ft_3_ + delta_ft_3_

        ######################################### the 3rd decoder #########################################
        up_flow0_3, up_flow1_3, ft_2_ = self.decoder3(ft_3_, f0_3, f1_3, up_flow0_4, up_flow1_4)
        corr_3, flow_3 = self._corr_scale_lookup(corr_fn, coord, up_flow0_3, up_flow1_3, embt, downsample=2)

        # residue update with lookup corr
        delta_ft_2_, delta_flow_3 = self.update3_low(ft_2_, flow_3, corr_3)
        delta_flow0_3, delta_flow1_3 = torch.chunk(delta_flow_3, 2, 1)
        up_flow0_3 = up_flow0_3 + delta_flow0_3
        up_flow1_3 = up_flow1_3 + delta_flow1_3
        ft_2_ = ft_2_ + delta_ft_2_

        # residue update with lookup corr (hr)
        corr_3 = resize(corr_3, scale_factor=2.0)
        up_flow_3 = torch.cat([up_flow0_3, up_flow1_3], dim=1)
        delta_ft_2_, delta_up_flow_3 = self.update3_high(ft_2_, up_flow_3, corr_3)
        # in-place updates from here on: first two channels refine flow0, next two refine flow1
        ft_2_ += delta_ft_2_
        up_flow0_3 += delta_up_flow_3[:, 0:2]
        up_flow1_3 += delta_up_flow_3[:, 2:4]

        ######################################### the 2nd decoder #########################################
        up_flow0_2, up_flow1_2, ft_1_ = self.decoder2(ft_2_, f0_2, f1_2, up_flow0_3, up_flow1_3)
        corr_2, flow_2 = self._corr_scale_lookup(corr_fn, coord, up_flow0_2, up_flow1_2, embt, downsample=4)

        # residue update with lookup corr
        delta_ft_1_, delta_flow_2 = self.update2_low(ft_1_, flow_2, corr_2)
        delta_flow0_2, delta_flow1_2 = torch.chunk(delta_flow_2, 2, 1)
        up_flow0_2 = up_flow0_2 + delta_flow0_2
        up_flow1_2 = up_flow1_2 + delta_flow1_2
        ft_1_ = ft_1_ + delta_ft_1_

        # residue update with lookup corr (hr)
        corr_2 = resize(corr_2, scale_factor=4.0)
        up_flow_2 = torch.cat([up_flow0_2, up_flow1_2], dim=1)
        delta_ft_1_, delta_up_flow_2 = self.update2_high(ft_1_, up_flow_2, corr_2)
        ft_1_ += delta_ft_1_
        up_flow0_2 += delta_up_flow_2[:, 0:2]
        up_flow1_2 += delta_up_flow_2[:, 2:4]

        ######################################### the 1st decoder #########################################
        up_flow0_1, up_flow1_1, mask, img_res = self.decoder1(ft_1_, f0_1, f1_1, up_flow0_2, up_flow1_2)

        if scale_factor != 1.0:
            # undo the input resize; flow magnitudes are rescaled together with the resolution
            up_flow0_1 = resize(up_flow0_1, scale_factor=(1.0 / scale_factor)) * (1.0 / scale_factor)
            up_flow1_1 = resize(up_flow1_1, scale_factor=(1.0 / scale_factor)) * (1.0 / scale_factor)
            mask = resize(mask, scale_factor=(1.0 / scale_factor))
            img_res = resize(img_res, scale_factor=(1.0 / scale_factor))

        # Merge multiple predictions
        imgt_pred = multi_flow_combine(self.comb_block, img0, img1, up_flow0_1, up_flow1_1, mask, img_res, mean_)
        imgt_pred = torch.clamp(imgt_pred, 0, 1)

        if eval:
            return {
                "imgt_pred": imgt_pred,
            }
        else:
            # NOTE(review): h and w come from the resized inputs, while the flows were resized
            # back above when scale_factor != 1.0; confirm this reshape for that case.
            up_flow0_1 = up_flow0_1.reshape(b, self.num_flows, 2, h, w)
            up_flow1_1 = up_flow1_1.reshape(b, self.num_flows, 2, h, w)
            return {
                "imgt_pred": imgt_pred,
                "flow0_pred": [up_flow0_1, up_flow0_2, up_flow0_3, up_flow0_4],
                "flow1_pred": [up_flow1_1, up_flow1_2, up_flow1_3, up_flow1_4],
                "ft_pred": [ft_1_, ft_2_, ft_3_],
            }
class BottleneckBlock(nn.Module):
    """
    Residual bottleneck: 1x1 reduce -> 3x3 (optionally strided) -> 1x1 expand.

    Every convolution is followed by the selected normalisation and a ReLU. When
    ``stride`` is not 1 the shortcut is a strided 1x1 convolution followed by its
    own normalisation layer (``norm4``).
    """

    def __init__(self, in_planes, planes, norm_fn="group", stride=1):
        super().__init__()
        squeezed = planes // 4
        strided = stride != 1

        self.conv1 = nn.Conv2d(in_planes, squeezed, kernel_size=1, padding=0)
        self.conv2 = nn.Conv2d(squeezed, squeezed, kernel_size=3, padding=1, stride=stride)
        self.conv3 = nn.Conv2d(squeezed, planes, kernel_size=1, padding=0)
        self.relu = nn.ReLU(inplace=True)

        num_groups = planes // 8
        norm_factories = {
            "group": lambda width: nn.GroupNorm(num_groups=num_groups, num_channels=width),
            "batch": nn.BatchNorm2d,
            "instance": nn.InstanceNorm2d,
            "none": lambda width: nn.Sequential(),
        }
        # an unrecognised norm_fn leaves the norm attributes undefined, as before
        build_norm = norm_factories.get(norm_fn)
        if build_norm is not None:
            self.norm1 = build_norm(squeezed)
            self.norm2 = build_norm(squeezed)
            self.norm3 = build_norm(planes)
            if strided:
                self.norm4 = build_norm(planes)

        if strided:
            self.downsample = nn.Sequential(nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm4)
        else:
            self.downsample = None

    def forward(self, x):
        branch = self.relu(self.norm1(self.conv1(x)))
        branch = self.relu(self.norm2(self.conv2(branch)))
        branch = self.relu(self.norm3(self.conv3(branch)))

        shortcut = x if self.downsample is None else self.downsample(x)
        return self.relu(shortcut + branch)
class BasicEncoder(nn.Module):
    """
    Convolutional feature encoder built from ResidualBlock stages.

    A strided 7x7 stem is followed by three two-block stages (64, 72 and 128
    channels, the last two strided) and a 1x1 output convolution to
    ``output_dim`` channels. A list or tuple of two tensors is processed as one
    batch and split back into two outputs.
    """

    def __init__(self, output_dim=128, norm_fn="batch", dropout=0.0):
        super().__init__()
        self.norm_fn = norm_fn

        stem_norms = {
            "group": lambda: nn.GroupNorm(num_groups=8, num_channels=64),
            "batch": lambda: nn.BatchNorm2d(64),
            "instance": lambda: nn.InstanceNorm2d(64),
            "none": lambda: nn.Sequential(),
        }
        # an unrecognised norm_fn leaves norm1 undefined, as before
        if self.norm_fn in stem_norms:
            self.norm1 = stem_norms[self.norm_fn]()

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        # _make_layer reads and updates in_planes, so the stages must be built in order
        self.in_planes = 64
        self.layer1 = self._make_layer(64, stride=1)
        self.layer2 = self._make_layer(72, stride=2)
        self.layer3 = self._make_layer(128, stride=2)

        # output convolution
        self.conv2 = nn.Conv2d(128, output_dim, kernel_size=1)

        self.dropout = nn.Dropout2d(p=dropout) if dropout > 0 else None

        norm_types = (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
                continue
            if not isinstance(module, norm_types):
                continue
            if module.weight is not None:
                nn.init.constant_(module.weight, 1)
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)

    def _make_layer(self, dim, stride=1):
        """Stack two ResidualBlocks; only the first one may change width and stride."""
        first = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
        second = ResidualBlock(dim, dim, self.norm_fn, stride=1)
        self.in_planes = dim
        return nn.Sequential(first, second)

    def forward(self, x):
        # if input is list, combine batch dimension
        paired = isinstance(x, (tuple, list))
        if paired:
            batch_dim = x[0].shape[0]
            x = torch.cat(x, dim=0)

        x = self.relu1(self.norm1(self.conv1(x)))
        x = self.layer3(self.layer2(self.layer1(x)))
        x = self.conv2(x)

        if self.training and self.dropout is not None:
            x = self.dropout(x)

        if not paired:
            return x
        return torch.split(x, [batch_dim, batch_dim], dim=0)
def resize(x, scale_factor):
    """Bilinearly rescale ``x`` by ``scale_factor`` (``align_corners=False``)."""
    return F.interpolate(x, scale_factor=scale_factor, mode="bilinear", align_corners=False)


def convrelu(in_channels, out_channels, kernel_size=3, stride=1, padding=1, dilation=1, groups=1, bias=True):
    """Return a ``Conv2d`` followed by a per-channel ``PReLU``."""
    conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias=bias)
    activation = nn.PReLU(out_channels)
    return nn.Sequential(conv, activation)


class ResBlock(nn.Module):
    """Residual block whose last ``side_channels`` channels get extra convolutions.

    Args:
        in_channels: Channel count of the input (and of the output).
        side_channels: Number of trailing channels routed through the
            additional side convolutions ``conv2`` and ``conv4``.
        bias: Whether the convolutions carry a bias term.
    """

    def __init__(self, in_channels, side_channels, bias=True):
        super(ResBlock, self).__init__()
        self.side_channels = side_channels

        def conv_prelu(width):
            # 3x3 same-size convolution followed by a per-channel PReLU.
            return nn.Sequential(
                nn.Conv2d(width, width, kernel_size=3, stride=1, padding=1, bias=bias),
                nn.PReLU(width),
            )

        # Registration order (conv1..conv5, prelu) is kept so that parameter
        # names and initialisation order stay the same.
        self.conv1 = conv_prelu(in_channels)
        self.conv2 = conv_prelu(side_channels)
        self.conv3 = conv_prelu(in_channels)
        self.conv4 = conv_prelu(side_channels)
        self.conv5 = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1, bias=bias)
        self.prelu = nn.PReLU(in_channels)

    def _refine_side(self, feat, side_conv):
        """Apply ``side_conv`` to the trailing side channels and re-attach them."""
        main_part = feat[:, : -self.side_channels, ...]
        side_part = side_conv(feat[:, -self.side_channels :, :, :])
        return torch.cat([main_part, side_part], 1)

    def forward(self, x):
        hidden = self.conv1(x)
        hidden = self.conv3(self._refine_side(hidden, self.conv2))
        hidden = self.conv5(self._refine_side(hidden, self.conv4))
        # Residual connection, then the final activation.
        return self.prelu(x + hidden)


class Encoder(nn.Module):
    """Feature pyramid: one stride-2 stage per entry of ``channels``.

    Args:
        channels: Output channel count of each pyramid level, finest first.
        large: When true, the first level uses a 7x7 kernel instead of 3x3.
    """

    def __init__(self, channels, large=False):
        super(Encoder, self).__init__()
        self.channels = channels
        in_ch = 3
        for level, out_ch in enumerate(channels, 1):
            if large and level == 1:
                kernel, pad = 7, 3
            else:
                kernel, pad = 3, 1
            stage = nn.Sequential(convrelu(in_ch, out_ch, kernel, 2, pad), convrelu(out_ch, out_ch, 3, 1, 1))
            self.register_module(f"pyramid{level}", stage)
            in_ch = out_ch

    def forward(self, in_x):
        """Return the list of feature maps, one per pyramid level."""
        features = []
        current = in_x
        for level in range(1, len(self.channels) + 1):
            current = getattr(self, f"pyramid{level}")(current)
            features.append(current)
        return features
def multi_flow_combine(comb_block, img0, img1, flow0, flow1, mask=None, img_res=None, mean=None):
    """Warp both images with several flow fields in parallel and fuse the results.

    Args:
        comb_block: Module (an ``nn.Sequential`` in this project) applied to
            the stacked warped images; its output is added to their mean.
        img0, img1: Images of shape [b, 3, h, w].
        flow0, flow1: Flows of shape [b, 2*num_flows, h, w].
        mask: Optional blending weights of shape [b, num_flows, h, w]. When
            ``None`` the two warped images are averaged with equal weight.
        img_res: Optional residual of shape [b, 3*num_flows, h, w]; zero is
            added when ``None``.
        mean: Optional per-sample offset; zero is added when ``None``.

    Returns:
        The predicted image of shape [b, 3, h, w].
    """
    b, c, h, w = flow0.shape
    num_flows = c // 2

    # Fold the flow index into the batch dimension so one warp call handles all flows.
    flow0 = flow0.reshape(b, num_flows, 2, h, w).reshape(-1, 2, h, w)
    flow1 = flow1.reshape(b, num_flows, 2, h, w).reshape(-1, 2, h, w)

    if mask is None:
        # Documented fallback: simple average. The old code kept ``None`` here
        # and then failed on ``None * tensor`` in the blend below.
        mask = 0.5
    else:
        mask = mask.reshape(b, num_flows, 1, h, w).reshape(-1, 1, h, w)
    img_res = img_res.reshape(b, num_flows, 3, h, w).reshape(-1, 3, h, w) if img_res is not None else 0

    img0 = torch.stack([img0] * num_flows, 1).reshape(-1, 3, h, w)
    img1 = torch.stack([img1] * num_flows, 1).reshape(-1, 3, h, w)
    mean = torch.stack([mean] * num_flows, 1).reshape(-1, 1, 1, 1) if mean is not None else 0

    img0_warp = warp(img0, flow0)
    img1_warp = warp(img1, flow1)
    img_warps = mask * img0_warp + (1 - mask) * img1_warp + mean + img_res
    img_warps = img_warps.reshape(b, num_flows, 3, h, w)

    # Mean over the flow hypotheses plus a learned correction from all of them.
    imgt_pred = img_warps.mean(1) + comb_block(img_warps.view(b, -1, h, w))
    return imgt_pred


class MultiFlowDecoder(nn.Module):
    """Decoder that predicts ``num_flows`` flow pairs, masks and image residuals.

    Args:
        in_ch: Channel count of each input feature map.
        skip_ch: Side-channel count forwarded to the internal ``ResBlock``.
        num_flows: Number of flow hypotheses predicted per direction.
    """

    def __init__(self, in_ch, skip_ch, num_flows=3):
        super(MultiFlowDecoder, self).__init__()
        self.num_flows = num_flows
        self.convblock = nn.Sequential(
            convrelu(in_ch * 3 + 4, in_ch * 3),
            ResBlock(in_ch * 3, skip_ch),
            # 8 channels per hypothesis: 2 + 2 flow, 1 mask, 3 image residual.
            nn.ConvTranspose2d(in_ch * 3, 8 * num_flows, 4, 2, 1, bias=True),
        )

    def forward(self, ft_, f0, f1, flow0, flow1):
        """Return ``(flow0, flow1, mask, img_res)`` at twice the input resolution."""
        n = self.num_flows
        f0_warp = warp(f0, flow0)
        f1_warp = warp(f1, flow1)
        out = self.convblock(torch.cat([ft_, f0_warp, f1_warp, flow0, flow1], 1))
        delta_flow0, delta_flow1, mask, img_res = torch.split(out, [2 * n, 2 * n, n, 3 * n], 1)
        mask = torch.sigmoid(mask)

        # Upsampled flows are doubled because displacements scale with resolution,
        # then repeated so that every hypothesis starts from the same base flow.
        flow0 = delta_flow0 + 2.0 * resize(flow0, scale_factor=2.0).repeat(1, n, 1, 1)
        flow1 = delta_flow1 + 2.0 * resize(flow1, scale_factor=2.0).repeat(1, n, 1, 1)

        return flow0, flow1, mask, img_res
def bilinear_sampler(img, coords, mask=False):
    """Wrapper for grid_sample, uses pixel coordinates.

    Args:
        img: Tensor of shape [N, C, H, W] to sample from.
        coords: Tensor whose last dimension holds ``(x, y)`` pixel coordinates.
        mask: When true, also return a float mask that is 1 where the
            normalised coordinates lie strictly inside ``(-1, 1)``.

    Returns:
        The sampled tensor, or ``(sampled, mask)`` when ``mask`` is true.
    """
    H, W = img.shape[-2:]
    xgrid, ygrid = coords.split([1, 1], dim=-1)
    # Map pixel coordinates to the [-1, 1] range expected by grid_sample.
    xgrid = 2 * xgrid / (W - 1) - 1
    ygrid = 2 * ygrid / (H - 1) - 1

    grid = torch.cat([xgrid, ygrid], dim=-1)
    img = F.grid_sample(img, grid, align_corners=True)

    if mask:
        valid = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1)
        return img, valid.float()

    return img


def coords_grid(batch, ht, wd, device):
    """Return a [batch, 2, ht, wd] float grid; channel 0 is x, channel 1 is y."""
    coords = torch.meshgrid(torch.arange(ht, device=device), torch.arange(wd, device=device), indexing="ij")
    # meshgrid yields (y, x); reverse so that x comes first.
    coords = torch.stack(coords[::-1], dim=0).float()
    return coords[None].repeat(batch, 1, 1, 1)


class SmallUpdateBlock(nn.Module):
    """Update block producing feature and flow increments from correlation features.

    Args:
        cdim: Channel count of the ``net`` feature passed to ``forward``.
        hidden_dim: Width of the convolutional update unit.
        flow_dim: Width of the encoded flow features.
        corr_dim: Width of the encoded correlation features.
        fc_dim: Width of the fused flow/correlation features.
        corr_levels: Number of correlation pyramid levels.
        radius: Lookup radius of the correlation window.
        scale_factor: When given, ``net`` is downscaled by it before the
            update and both outputs are upscaled by it afterwards.
    """

    def __init__(self, cdim, hidden_dim, flow_dim, corr_dim, fc_dim, corr_levels=4, radius=3, scale_factor=None):
        super(SmallUpdateBlock, self).__init__()
        cor_planes = corr_levels * (2 * radius + 1) ** 2
        self.scale_factor = scale_factor

        # The factor 2 accounts for the forward and backward correlation lookups.
        self.convc1 = nn.Conv2d(2 * cor_planes, corr_dim, 1, padding=0)
        self.convf1 = nn.Conv2d(4, flow_dim * 2, 7, padding=3)
        self.convf2 = nn.Conv2d(flow_dim * 2, flow_dim, 3, padding=1)
        self.conv = nn.Conv2d(corr_dim + flow_dim, fc_dim, 3, padding=1)

        self.gru = nn.Sequential(
            nn.Conv2d(fc_dim + 4 + cdim, hidden_dim, 3, padding=1),
            nn.LeakyReLU(negative_slope=0.1, inplace=True),
            nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1),
        )

        self.feat_head = nn.Sequential(
            nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1),
            nn.LeakyReLU(negative_slope=0.1, inplace=True),
            nn.Conv2d(hidden_dim, cdim, 3, padding=1),
        )

        self.flow_head = nn.Sequential(
            nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1),
            nn.LeakyReLU(negative_slope=0.1, inplace=True),
            nn.Conv2d(hidden_dim, 4, 3, padding=1),
        )

        self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)

    def forward(self, net, flow, corr):
        """Return ``(delta_net, delta_flow)`` for the given state, flow and correlation."""
        if self.scale_factor is not None:
            net = resize(net, 1 / self.scale_factor)
        cor = self.lrelu(self.convc1(corr))
        flo = self.lrelu(self.convf1(flow))
        flo = self.lrelu(self.convf2(flo))
        cor_flo = torch.cat([cor, flo], dim=1)
        inp = self.lrelu(self.conv(cor_flo))
        inp = torch.cat([inp, flow, net], dim=1)

        out = self.gru(inp)
        delta_net = self.feat_head(out)
        delta_flow = self.flow_head(out)

        if self.scale_factor is not None:
            delta_net = resize(delta_net, scale_factor=self.scale_factor)
            # Flow magnitudes scale with resolution, hence the extra factor.
            delta_flow = self.scale_factor * resize(delta_flow, scale_factor=self.scale_factor)

        return delta_net, delta_flow


class BasicUpdateBlock(nn.Module):
    """Larger update block with a second correlation convolution and ``out_num`` flow sets.

    Args:
        cdim: Channel count of the ``net`` feature passed to ``forward``.
        hidden_dim: Width of the convolutional update unit.
        flow_dim: Width of the encoded flow features.
        corr_dim: Width of the first correlation encoding.
        corr_dim2: Width of the second correlation encoding.
        fc_dim: Width of the fused flow/correlation features.
        corr_levels: Number of correlation pyramid levels.
        radius: Lookup radius of the correlation window.
        scale_factor: When given, ``net`` is downscaled by it before the
            update and both outputs are upscaled by it afterwards.
        out_num: Number of 4-channel flow increments predicted.
    """

    def __init__(
        self,
        cdim,
        hidden_dim,
        flow_dim,
        corr_dim,
        corr_dim2,
        fc_dim,
        corr_levels=4,
        radius=3,
        scale_factor=None,
        out_num=1,
    ):
        super(BasicUpdateBlock, self).__init__()
        cor_planes = corr_levels * (2 * radius + 1) ** 2

        self.scale_factor = scale_factor
        self.convc1 = nn.Conv2d(2 * cor_planes, corr_dim, 1, padding=0)
        self.convc2 = nn.Conv2d(corr_dim, corr_dim2, 3, padding=1)
        self.convf1 = nn.Conv2d(4, flow_dim * 2, 7, padding=3)
        self.convf2 = nn.Conv2d(flow_dim * 2, flow_dim, 3, padding=1)
        self.conv = nn.Conv2d(flow_dim + corr_dim2, fc_dim, 3, padding=1)

        self.gru = nn.Sequential(
            nn.Conv2d(fc_dim + 4 + cdim, hidden_dim, 3, padding=1),
            nn.LeakyReLU(negative_slope=0.1, inplace=True),
            nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1),
        )

        self.feat_head = nn.Sequential(
            nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1),
            nn.LeakyReLU(negative_slope=0.1, inplace=True),
            nn.Conv2d(hidden_dim, cdim, 3, padding=1),
        )

        self.flow_head = nn.Sequential(
            nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1),
            nn.LeakyReLU(negative_slope=0.1, inplace=True),
            nn.Conv2d(hidden_dim, 4 * out_num, 3, padding=1),
        )

        self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)

    def forward(self, net, flow, corr):
        """Return ``(delta_net, delta_flow)`` for the given state, flow and correlation."""
        if self.scale_factor is not None:
            net = resize(net, 1 / self.scale_factor)
        cor = self.lrelu(self.convc1(corr))
        cor = self.lrelu(self.convc2(cor))
        flo = self.lrelu(self.convf1(flow))
        flo = self.lrelu(self.convf2(flo))
        cor_flo = torch.cat([cor, flo], dim=1)
        inp = self.lrelu(self.conv(cor_flo))
        inp = torch.cat([inp, flow, net], dim=1)

        out = self.gru(inp)
        delta_net = self.feat_head(out)
        delta_flow = self.flow_head(out)

        if self.scale_factor is not None:
            delta_net = resize(delta_net, scale_factor=self.scale_factor)
            # Flow magnitudes scale with resolution, hence the extra factor.
            delta_flow = self.scale_factor * resize(delta_flow, scale_factor=self.scale_factor)
        return delta_net, delta_flow


class BidirCorrBlock:
    """All-pairs correlation pyramid that can be queried in both directions.

    Args:
        fmap1, fmap2: Feature maps of identical shape [batch, dim, ht, wd].
        num_levels: Number of pyramid levels (each one 2x average-pooled).
        radius: Lookup radius; every query reads a (2r+1) x (2r+1) window.
    """

    def __init__(self, fmap1, fmap2, num_levels=4, radius=4):
        self.num_levels = num_levels
        self.radius = radius
        self.corr_pyramid = []
        self.corr_pyramid_T = []

        corr = BidirCorrBlock.corr(fmap1, fmap2)
        batch, h1, w1, dim, h2, w2 = corr.shape
        # No clone needed: the tensors are only read afterwards, and the old
        # clone duplicated the whole (h*w)^2 volume before reshaping.
        corr_T = corr.permute(0, 4, 5, 3, 1, 2)

        corr = corr.reshape(batch * h1 * w1, dim, h2, w2)
        corr_T = corr_T.reshape(batch * h2 * w2, dim, h1, w1)

        self.corr_pyramid.append(corr)
        self.corr_pyramid_T.append(corr_T)

        for _ in range(self.num_levels - 1):
            corr = F.avg_pool2d(corr, 2, stride=2)
            corr_T = F.avg_pool2d(corr_T, 2, stride=2)
            self.corr_pyramid.append(corr)
            self.corr_pyramid_T.append(corr_T)

    def __call__(self, coords0, coords1):
        """Sample both pyramids around the given coordinates.

        Args:
            coords0: Lookup centres for the forward volume, [batch, 2, h, w].
            coords1: Lookup centres for the transposed volume, same shape.

        Returns:
            Two float tensors of shape
            [batch, num_levels * (2r+1)^2, h, w] (forward, transposed).
        """
        r = self.radius
        coords0 = coords0.permute(0, 2, 3, 1)
        coords1 = coords1.permute(0, 2, 3, 1)
        assert coords0.shape == coords1.shape, f"coords0 shape: [{coords0.shape}] is not equal to [{coords1.shape}]"
        batch, h1, w1, _ = coords0.shape

        # The offset window is identical on every level, so build it once.
        offsets = torch.linspace(-r, r, 2 * r + 1, device=coords0.device)
        delta = torch.stack(torch.meshgrid(offsets, offsets, indexing="ij"), dim=-1)
        delta_lvl = delta.view(1, 2 * r + 1, 2 * r + 1, 2)

        out_pyramid = []
        out_pyramid_T = []
        for i in range(self.num_levels):
            corr = self.corr_pyramid[i]
            corr_T = self.corr_pyramid_T[i]

            # Coordinates shrink by 2 per level to match the pooled volumes.
            centroid_lvl_0 = coords0.reshape(batch * h1 * w1, 1, 1, 2) / 2**i
            centroid_lvl_1 = coords1.reshape(batch * h1 * w1, 1, 1, 2) / 2**i
            coords_lvl_0 = centroid_lvl_0 + delta_lvl
            coords_lvl_1 = centroid_lvl_1 + delta_lvl

            corr = bilinear_sampler(corr, coords_lvl_0)
            corr_T = bilinear_sampler(corr_T, coords_lvl_1)
            out_pyramid.append(corr.view(batch, h1, w1, -1))
            out_pyramid_T.append(corr_T.view(batch, h1, w1, -1))

        out = torch.cat(out_pyramid, dim=-1)
        out_T = torch.cat(out_pyramid_T, dim=-1)
        return out.permute(0, 3, 1, 2).contiguous().float(), out_T.permute(0, 3, 1, 2).contiguous().float()

    @staticmethod
    def corr(fmap1, fmap2):
        """Return the scaled dot-product volume of shape [batch, ht, wd, 1, ht, wd]."""
        batch, dim, ht, wd = fmap1.shape
        fmap1 = fmap1.view(batch, dim, ht * wd)
        fmap2 = fmap2.view(batch, dim, ht * wd)

        corr = torch.matmul(fmap1.transpose(1, 2), fmap2)
        corr = corr.view(batch, ht, wd, 1, ht, wd)
        return corr / torch.sqrt(torch.tensor(dim).float())
def warp(img, flow):
    """Backward-warp ``img`` with a per-pixel displacement field.

    Args:
        img: Tensor of shape [B, C, H, W] to sample from.
        flow: Tensor of shape [B, 2, H, W]; channel 0 is the horizontal and
            channel 1 the vertical displacement, both in pixels.

    Returns:
        The warped tensor, bilinearly sampled with border padding.
    """
    B, _, H, W = flow.shape
    xx = torch.linspace(-1.0, 1.0, W).view(1, 1, 1, W).expand(B, -1, H, -1)
    yy = torch.linspace(-1.0, 1.0, H).view(1, 1, H, 1).expand(B, -1, -1, W)
    grid = torch.cat([xx, yy], 1).to(img)
    # Convert pixel displacements to the normalised [-1, 1] grid units.
    flow_ = torch.cat([flow[:, 0:1, :, :] / ((W - 1.0) / 2.0), flow[:, 1:2, :, :] / ((H - 1.0) / 2.0)], 1)
    grid_ = (grid + flow_).permute(0, 2, 3, 1)
    output = F.grid_sample(input=img, grid=grid_, mode="bilinear", padding_mode="border", align_corners=True)
    return output


def make_colorwheel():
    """
    Generates a color wheel for optical flow visualization as presented in:
        Baker et al. "A Database and Evaluation Methodology for Optical Flow" (ICCV, 2007)
        URL: http://vision.middlebury.edu/flow/flowEval-iccv07.pdf

    Code follows the original C++ source code of Daniel Scharstein.
    Code follows the Matlab source code of Deqing Sun.

    Returns:
        np.ndarray: Color wheel of shape [55, 3] with values in [0, 255].
    """
    RY = 15
    YG = 6
    GC = 4
    CB = 11
    BM = 13
    MR = 6

    ncols = RY + YG + GC + CB + BM + MR
    colorwheel = np.zeros((ncols, 3))
    col = 0

    # RY
    colorwheel[0:RY, 0] = 255
    colorwheel[0:RY, 1] = np.floor(255 * np.arange(0, RY) / RY)
    col = col + RY
    # YG
    colorwheel[col : col + YG, 0] = 255 - np.floor(255 * np.arange(0, YG) / YG)
    colorwheel[col : col + YG, 1] = 255
    col = col + YG
    # GC
    colorwheel[col : col + GC, 1] = 255
    colorwheel[col : col + GC, 2] = np.floor(255 * np.arange(0, GC) / GC)
    col = col + GC
    # CB
    colorwheel[col : col + CB, 1] = 255 - np.floor(255 * np.arange(CB) / CB)
    colorwheel[col : col + CB, 2] = 255
    col = col + CB
    # BM
    colorwheel[col : col + BM, 2] = 255
    colorwheel[col : col + BM, 0] = np.floor(255 * np.arange(0, BM) / BM)
    col = col + BM
    # MR
    colorwheel[col : col + MR, 2] = 255 - np.floor(255 * np.arange(MR) / MR)
    colorwheel[col : col + MR, 0] = 255
    return colorwheel


def flow_uv_to_colors(u, v, convert_to_bgr=False):
    """
    Applies the flow color wheel to (possibly clipped) flow components u and v.

    According to the C++ source code of Daniel Scharstein
    According to the Matlab source code of Deqing Sun

    Args:
        u (np.ndarray): Input horizontal flow of shape [H,W]
        v (np.ndarray): Input vertical flow of shape [H,W]
        convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.

    Returns:
        np.ndarray: Flow visualization image of shape [H,W,3]
    """
    flow_image = np.zeros((u.shape[0], u.shape[1], 3), np.uint8)
    colorwheel = make_colorwheel()  # shape [55x3]
    ncols = colorwheel.shape[0]
    rad = np.sqrt(np.square(u) + np.square(v))
    # Direction mapped to [-1, 1], then to a fractional colour-wheel index.
    a = np.arctan2(-v, -u) / np.pi
    fk = (a + 1) / 2 * (ncols - 1)
    k0 = np.floor(fk).astype(np.int32)
    k1 = k0 + 1
    k1[k1 == ncols] = 0  # wrap around the wheel
    f = fk - k0
    for i in range(colorwheel.shape[1]):
        tmp = colorwheel[:, i]
        col0 = tmp[k0] / 255.0
        col1 = tmp[k1] / 255.0
        col = (1 - f) * col0 + f * col1
        idx = rad <= 1
        # Small magnitudes fade towards white; out-of-range ones are dimmed.
        col[idx] = 1 - rad[idx] * (1 - col[idx])
        col[~idx] = col[~idx] * 0.75  # out of range
        # Note the 2-i => BGR instead of RGB
        ch_idx = 2 - i if convert_to_bgr else i
        flow_image[:, :, ch_idx] = np.floor(255 * col)
    return flow_image


def flow_to_image(flow_uv, clip_flow=None, convert_to_bgr=False):
    """
    Converts a two-channel flow field to a colour visualization.

    Args:
        flow_uv (np.ndarray): Flow UV image of shape [H,W,2]
        clip_flow (float, optional): Maximum absolute value kept for each flow
            component; values are clipped to [-clip_flow, clip_flow]. Defaults to None.
        convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.

    Returns:
        np.ndarray: Flow visualization image of shape [H,W,3]
    """
    assert flow_uv.ndim == 3, "input flow must have three dimensions"
    assert flow_uv.shape[2] == 2, "input flow must have shape [H,W,2]"
    if clip_flow is not None:
        # Clip symmetrically: a lower bound of 0 discarded every leftward or
        # upward motion and rendered it as "no flow".
        flow_uv = np.clip(flow_uv, -clip_flow, clip_flow)
    u = flow_uv[:, :, 0]
    v = flow_uv[:, :, 1]
    rad = np.sqrt(np.square(u) + np.square(v))
    rad_max = np.max(rad)
    epsilon = 1e-5  # avoids division by zero for an all-zero flow field
    u = u / (rad_max + epsilon)
    v = v / (rad_max + epsilon)
    return flow_uv_to_colors(u, v, convert_to_bgr)
_IMAGE_SUFFIXES = (".ppm", ".pgm", ".png", ".jpg")


def img2tensor(img):
    """Convert an [H, W, C] image array to a [1, C, H, W] tensor divided by 255.

    Channels beyond the third (e.g. alpha) are dropped.
    """
    if img.shape[-1] > 3:
        img = img[:, :, :3]
    return torch.tensor(img).permute(2, 0, 1).unsqueeze(0) / 255.0


def tensor2img(img_t):
    """Convert a [1, C, H, W] tensor in the 0..1 range to an [H, W, C] uint8 array."""
    return (img_t * 255.0).detach().squeeze(0).permute(1, 2, 0).cpu().numpy().clip(0, 255).astype(np.uint8)


def seed_all(seed):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def read(file):
    """Read ``file`` with the reader selected by its extension.

    Raises:
        Exception: If the extension is not supported.
    """
    if file.endswith(".float3"):
        return readFloat(file)
    if file.endswith(".flo"):
        return readFlow(file)
    if file.endswith(_IMAGE_SUFFIXES):
        return readImage(file)
    if file.endswith(".pfm"):
        return readPFM(file)[0]
    raise Exception("don't know how to read %s" % file)


def write(file, data):
    """Write ``data`` to ``file`` with the writer selected by its extension.

    Raises:
        Exception: If the extension is not supported.
    """
    if file.endswith(".float3"):
        return writeFloat(file, data)
    if file.endswith(".flo"):
        return writeFlow(file, data)
    if file.endswith(_IMAGE_SUFFIXES):
        return writeImage(file, data)
    if file.endswith(".pfm"):
        return writePFM(file, data)
    raise Exception("don't know how to write %s" % file)


def readPFM(file):
    """Read a PFM image.

    Args:
        file: Path of the PFM file.

    Returns:
        ``(data, scale)`` where ``data`` is a float array of shape [H, W, 3]
        (colour, "PF") or [H, W] (grayscale, "Pf") and ``scale`` is the
        absolute value of the header's scale field.

    Raises:
        Exception: If the magic line or the dimension line is invalid.
    """
    with open(file, "rb") as f:
        header = f.readline().rstrip().decode("ascii")
        if header == "PF":
            color = True
        elif header == "Pf":
            color = False
        else:
            raise Exception("Not a PFM file.")

        dim_match = re.match(r"^(\d+)\s(\d+)\s$", f.readline().decode("ascii"))
        if not dim_match:
            raise Exception("Malformed PFM header.")
        width, height = map(int, dim_match.groups())

        scale = float(f.readline().decode("ascii").rstrip())
        if scale < 0:
            # A negative scale marks little-endian sample data.
            endian = "<"
            scale = -scale
        else:
            endian = ">"

        data = np.fromfile(f, endian + "f")

    shape = (height, width, 3) if color else (height, width)
    data = np.reshape(data, shape)
    # PFM stores rows bottom-to-top.
    data = np.flipud(data)
    return data, scale


def writePFM(file, image, scale=1):
    """Write a float32 image as PFM.

    Args:
        file: Destination path.
        image: float32 array of shape [H, W, 3], [H, W, 1] or [H, W].
        scale: Scale written to the header; its sign is set from the byte order.

    Raises:
        Exception: If the dtype is not float32 or the shape is unsupported.
    """
    # Validate before opening so a bad input does not leave an empty file behind.
    if image.dtype.name != "float32":
        raise Exception("Image dtype must be float32.")

    if len(image.shape) == 3 and image.shape[2] == 3:
        color = True
    elif len(image.shape) == 2 or (len(image.shape) == 3 and image.shape[2] == 1):
        color = False
    else:
        raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.")

    image = np.flipud(image)  # PFM stores rows bottom-to-top

    endian = image.dtype.byteorder
    if endian == "<" or (endian == "=" and sys.byteorder == "little"):
        scale = -scale

    with open(file, "wb") as f:
        # Both magic strings must be bytes; the old conditional only encoded
        # the grayscale one, so colour images raised TypeError.
        f.write(b"PF\n" if color else b"Pf\n")
        f.write(b"%d %d\n" % (image.shape[1], image.shape[0]))
        f.write(b"%f\n" % scale)
        image.tofile(f)


def readFlow(name):
    """Read an optical-flow field from a ``.flo`` (or PFM) file as [H, W, 2] float32.

    Raises:
        Exception: If a ``.flo`` file does not start with the "PIEH" tag.
    """
    if name.endswith((".pfm", ".PFM")):
        return readPFM(name)[0][:, :, 0:2]

    with open(name, "rb") as f:
        header = f.read(4)
        if header.decode("utf-8") != "PIEH":
            raise Exception("Flow file header does not contain PIEH")

        # Plain ints keep width * height * 2 from overflowing int32.
        width = int(np.fromfile(f, np.int32, 1).squeeze())
        height = int(np.fromfile(f, np.int32, 1).squeeze())

        flow = np.fromfile(f, np.float32, width * height * 2).reshape((height, width, 2))

    return flow.astype(np.float32)


def readImage(name):
    """Read an image; PFM files yield at most their first three channels."""
    if name.endswith((".pfm", ".PFM")):
        data = readPFM(name)[0]
        if len(data.shape) == 3:
            return data[:, :, 0:3]
        return data
    return imread(name)


def writeImage(name, data):
    """Write an image, using the PFM writer for ``.pfm`` paths."""
    if name.endswith((".pfm", ".PFM")):
        return writePFM(name, data, 1)
    return imwrite(name, data)


def writeFlow(name, flow):
    """Write an [H, W, 2] flow field as a ``.flo`` file ("PIEH", width, height, data)."""
    with open(name, "wb") as f:
        f.write("PIEH".encode("utf-8"))
        np.array([flow.shape[1], flow.shape[0]], dtype=np.int32).tofile(f)
        flow.astype(np.float32).tofile(f)


def readFloat(name):
    """Read an array from a ``.float3`` file.

    Raises:
        Exception: If the file does not start with the "float" keyword line.
    """
    with open(name, "rb") as f:
        if f.readline().decode("utf-8") != "float\n":
            raise Exception("float file %s did not contain keyword" % name)

        dim = int(f.readline())
        dims = [int(f.readline()) for _ in range(dim)]
        count = 1
        for d in dims:
            count *= d

        # Sizes are stored fastest-varying first, so reverse them for reshape.
        data = np.fromfile(f, np.float32, count).reshape(dims[::-1])

    if dim > 2:
        data = np.transpose(data, (2, 1, 0))
        data = np.transpose(data, (1, 0, 2))

    return data


def writeFloat(name, data):
    """Write a 1-D, 2-D or 3-D array as a ``.float3`` file.

    Raises:
        Exception: If the array does not have between one and three dimensions.
    """
    dim = len(data.shape)
    if dim == 0 or dim > 3:
        raise Exception("bad float file dimension: %d" % dim)

    with open(name, "wb") as f:
        f.write(("float\n").encode("ascii"))
        f.write(("%d\n" % dim).encode("ascii"))

        if dim == 1:
            f.write(("%d\n" % data.shape[0]).encode("ascii"))
        else:
            f.write(("%d\n" % data.shape[1]).encode("ascii"))
            f.write(("%d\n" % data.shape[0]).encode("ascii"))
            for i in range(2, dim):
                f.write(("%d\n" % data.shape[i]).encode("ascii"))

        data = data.astype(np.float32)
        if dim <= 2:
            # 1-D data used to reach the 3-axis transpose below and crash.
            data.tofile(f)
        else:
            np.transpose(data, (2, 0, 1)).tofile(f)
def is_intact_video(video_path, mode="moviepy", verbose=False, logger=None):
    """Check whether a video file exists and can be opened.

    Args:
        video_path: Path of the video file.
        mode: Backend used for the check, "moviepy" or "cv2".
        verbose: When true, report the outcome through ``print_log``.
        logger: Logger forwarded to ``print_log``.

    Returns:
        True if the file could be opened, False otherwise (including when the
        file does not exist).

    Raises:
        ValueError: If ``mode`` is not supported and the file exists.
    """
    if not os.path.exists(video_path):
        if verbose:
            print_log(f"Could not find '{video_path}'", logger=logger)
        return False

    if mode == "moviepy":
        try:
            clip = VideoFileClip(video_path)
            # Only the successful open matters; release the reader right away
            # so that checking many files does not accumulate open handles.
            clip.close()
            if verbose:
                print_log(f"The video file '{video_path}' is intact.", logger=logger)
            return True
        except Exception as e:
            if verbose:
                print_log(f"Error: {e}", logger=logger)
                print_log(f"The video file '{video_path}' is not intact.", logger=logger)
            return False
    elif mode == "cv2":
        cap = None
        try:
            cap = cv2.VideoCapture(video_path)
            if cap.isOpened():
                if verbose:
                    print_log(f"The video file '{video_path}' is intact.", logger=logger)
                return True
        except Exception as e:
            if verbose:
                print_log(f"Error: {e}", logger=logger)
                print_log(f"The video file '{video_path}' is not intact.", logger=logger)
            return False
        finally:
            if cap is not None:
                cap.release()
        # The capture object was created but could not open the file. The old
        # code fell off the end here and returned None instead of False.
        if verbose:
            print_log(f"The video file '{video_path}' is not intact.", logger=logger)
        return False
    else:
        raise ValueError(f"Unsupported mode {mode!r}; expected 'moviepy' or 'cv2'")


def has_downloaded_success(json_path):
    """Return True only if ``json_path`` holds a JSON object with ``"success": true``.

    A missing file, unreadable or invalid JSON, a non-object payload, or a
    ``success`` value that is not the boolean ``True`` all yield False.
    """
    if not os.path.exists(json_path):
        return False

    try:
        with open(json_path, "r") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # ValueError covers both malformed JSON and undecodable bytes.
        return False

    return isinstance(data, dict) and data.get("success") is True
def parse_args():
    """Parse the command-line options of the id-to-path conversion script.

    Returns:
        argparse.Namespace with ``meta_path``, ``folder_path``, ``mode`` and
        ``num_workers``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("meta_path", type=str)
    parser.add_argument("--folder_path", type=str, required=True)
    # Restrict --mode to the values main() handles, so a typo is rejected
    # here instead of raising ValueError inside a worker, once per row.
    parser.add_argument("--mode", type=str, default=None, choices=[".mp4", ".json"])
    parser.add_argument("--num_workers", type=int, default=None, help="#workers for pandarallel")
    return parser.parse_args()


def main():
    """Attach a video path and an ``intact`` flag to every row of the meta CSV.

    Two files are written next to the input: ``<name>_path_intact.csv`` (all
    rows, with ``intact`` and ``path`` columns) and
    ``<name>_path-filtered.csv`` (intact rows only, without the flag).
    """
    args = parse_args()
    meta_path = args.meta_path
    folder_path = args.folder_path
    mode = args.mode

    def is_intact(row, mode=None):
        """Return ``(ok, video_path)`` for one meta row."""
        video_id = row["id"]
        video_path = os.path.join(folder_path, f"{video_id}.mp4")

        if mode == ".mp4":
            return bool(is_intact_video(video_path)), video_path
        if mode == ".json":
            json_path = os.path.join(folder_path, f"{video_id}.json")
            return bool(has_downloaded_success(json_path)), video_path
        if mode is None:
            return True, video_path
        raise ValueError

    meta_dirpath = os.path.dirname(meta_path)
    wo_ext = os.path.splitext(os.path.basename(meta_path))[0]

    if args.num_workers is not None:
        pandarallel.initialize(progress_bar=True, nb_workers=args.num_workers)
    else:
        pandarallel.initialize(progress_bar=True)
    is_intact_partial = partial(is_intact, mode=mode)

    meta = pd.read_csv(meta_path)
    if meta.empty:
        # zip(*ret) yields nothing for an empty frame, which would break the
        # two-way unpacking below.
        intact, paths = [], []
    else:
        ret = meta.parallel_apply(is_intact_partial, axis=1)
        intact, paths = (list(column) for column in zip(*ret))
    meta["intact"] = intact
    meta["path"] = paths
    out_path = os.path.join(meta_dirpath, f"{wo_ext}_path_intact.csv")
    meta.to_csv(out_path, index=False)
    print(f"New meta (shape={meta.shape}) with intact info saved to '{out_path}'")

    # drop() returns a new frame; the in-place variant on a mask-selected
    # frame triggers pandas' SettingWithCopyWarning.
    meta_format = meta[np.array(intact, dtype=bool)].drop("intact", axis=1)
    out_path = os.path.join(meta_dirpath, f"{wo_ext}_path-filtered.csv")
    meta_format.to_csv(out_path, index=False)
    print(f"New meta (shape={meta_format.shape}) with format info saved to '{out_path}'")


if __name__ == "__main__":
    main()
================================================ FILE: Open-Sora/build/lib/tools/scene_cut/cut.py ================================================ import cv2 # isort:skip import argparse import os import subprocess from functools import partial import pandas as pd from imageio_ffmpeg import get_ffmpeg_exe from pandarallel import pandarallel from scenedetect import FrameTimecode from tqdm import tqdm tqdm.pandas() def print_log(s, logger=None): if logger is not None: logger.info(s) else: print(s) def process_single_row(row, args): video_path = row["path"] logger = None # check mp4 integrity # if not is_intact_video(video_path, logger=logger): # return False try: if "timestamp" in row: timestamp = row["timestamp"] if not (timestamp.startswith("[") and timestamp.endswith("]")): return False scene_list = eval(timestamp) scene_list = [(FrameTimecode(s, fps=100), FrameTimecode(t, fps=100)) for s, t in scene_list] else: scene_list = [None] if args.drop_invalid_timestamps: return True except Exception as e: if args.drop_invalid_timestamps: return False if "relpath" in row: save_dir = os.path.dirname(os.path.join(args.save_dir, row["relpath"])) os.makedirs(save_dir, exist_ok=True) else: save_dir = args.save_dir shorter_size = args.shorter_size if (shorter_size is not None) and ("height" in row) and ("width" in row): min_size = min(row["height"], row["width"]) if min_size <= shorter_size: shorter_size = None split_video( video_path, scene_list, save_dir=save_dir, min_seconds=args.min_seconds, max_seconds=args.max_seconds, target_fps=args.target_fps, shorter_size=shorter_size, logger=logger, ) return True def split_video( video_path, scene_list, save_dir, min_seconds=2, max_seconds=15, target_fps=30, shorter_size=None, verbose=False, logger=None, ): """ scenes shorter than min_seconds will be ignored; scenes longer than max_seconds will be cut to save the beginning max_seconds. 
Currently, the saved file name pattern is f'{fname}_scene-{idx}'.mp4 Args: scene_list (List[Tuple[FrameTimecode, FrameTimecode]]): each element is (s, t): start and end of a scene. min_seconds (float | None) max_seconds (float | None) target_fps (int | None) shorter_size (int | None) """ FFMPEG_PATH = get_ffmpeg_exe() save_path_list = [] for idx, scene in enumerate(scene_list): if scene is not None: s, t = scene # FrameTimecode if min_seconds is not None: if (t - s).get_seconds() < min_seconds: continue duration = t - s if max_seconds is not None: fps = s.framerate max_duration = FrameTimecode(max_seconds, fps=fps) duration = min(max_duration, duration) # save path fname = os.path.basename(video_path) fname_wo_ext = os.path.splitext(fname)[0] # TODO: fname pattern save_path = os.path.join(save_dir, f"{fname_wo_ext}_scene-{idx}.mp4") if os.path.exists(save_path): # print_log(f"File '{save_path}' already exists. Skip.", logger=logger) continue # ffmpeg cmd cmd = [FFMPEG_PATH] # Only show ffmpeg output for the first call, which will display any # errors if it fails, and then break the loop. We only show error messages # for the remaining calls. # cmd += ['-v', 'error'] # clip to cut # Note: -ss after -i is very slow; put -ss before -i !!! 
if scene is None: cmd += ["-nostdin", "-y", "-i", video_path] else: cmd += ["-nostdin", "-y", "-ss", str(s.get_seconds()), "-i", video_path, "-t", str(duration.get_seconds())] # target fps if target_fps is not None: cmd += ["-r", f"{target_fps}"] # aspect ratio if shorter_size is not None: cmd += ["-vf", f"scale='if(gt(iw,ih),-2,{shorter_size})':'if(gt(iw,ih),{shorter_size},-2)'"] # cmd += ['-vf', f"scale='if(gt(iw,ih),{shorter_size},trunc(ow/a/2)*2)':-2"] cmd += ["-map", "0:v", save_path] # print(cmd) proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) stdout, stderr = proc.communicate() # stdout = stdout.decode("utf-8") # print_log(stdout, logger=logger) save_path_list.append(video_path) if verbose: print_log(f"Video clip saved to '{save_path}'", logger=logger) return save_path_list def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("meta_path", type=str) parser.add_argument("--save_dir", type=str) parser.add_argument( "--min_seconds", type=float, default=None, help="if not None, clip shorter than min_seconds is ignored" ) parser.add_argument( "--max_seconds", type=float, default=None, help="if not None, clip longer than max_seconds is truncated" ) parser.add_argument("--target_fps", type=int, default=None, help="target fps of clips") parser.add_argument( "--shorter_size", type=int, default=None, help="resize the shorter size by keeping ratio; will not do upscale" ) parser.add_argument("--num_workers", type=int, default=None, help="#workers for pandarallel") parser.add_argument("--disable_parallel", action="store_true", help="disable parallel processing") parser.add_argument("--drop_invalid_timestamps", action="store_true", help="drop rows with invalid timestamps") args = parser.parse_args() return args def main(): args = parse_args() meta_path = args.meta_path if not os.path.exists(meta_path): print(f"Meta file '{meta_path}' not found. 
Exit.") exit() # create save_dir os.makedirs(args.save_dir, exist_ok=True) # initialize pandarallel if not args.disable_parallel: if args.num_workers is not None: pandarallel.initialize(progress_bar=True, nb_workers=args.num_workers) else: pandarallel.initialize(progress_bar=True) process_single_row_partial = partial(process_single_row, args=args) # process meta = pd.read_csv(args.meta_path) if not args.disable_parallel: results = meta.parallel_apply(process_single_row_partial, axis=1) else: results = meta.apply(process_single_row_partial, axis=1) if args.drop_invalid_timestamps: meta = meta[results] assert args.meta_path.endswith("timestamp.csv"), "Only support *timestamp.csv" meta.to_csv(args.meta_path.replace("timestamp.csv", "correct_timestamp.csv"), index=False) print(f"Corrected timestamp file saved to '{args.meta_path.replace('timestamp.csv', 'correct_timestamp.csv')}'") if __name__ == "__main__": main() ================================================ FILE: Open-Sora/build/lib/tools/scene_cut/scene_detect.py ================================================ import argparse import os import numpy as np import pandas as pd from pandarallel import pandarallel from scenedetect import AdaptiveDetector, detect from tqdm import tqdm tqdm.pandas() def process_single_row(row): # windows # from scenedetect import detect, ContentDetector, AdaptiveDetector video_path = row["path"] detector = AdaptiveDetector( adaptive_threshold=3.0, # luma_only=True, ) # detector = ContentDetector() # TODO: catch error here try: scene_list = detect(video_path, detector, start_in_scene=True) timestamp = [(s.get_timecode(), t.get_timecode()) for s, t in scene_list] return True, str(timestamp) except Exception as e: print(f"Video '{video_path}' with error {e}") return False, "" def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("meta_path", type=str) parser.add_argument("--num_workers", type=int, default=None, help="#workers for pandarallel") args = 
parser.parse_args() return args def main(): args = parse_args() meta_path = args.meta_path if not os.path.exists(meta_path): print(f"Meta file '{meta_path}' not found. Exit.") exit() if args.num_workers is not None: pandarallel.initialize(progress_bar=True, nb_workers=args.num_workers) else: pandarallel.initialize(progress_bar=True) meta = pd.read_csv(meta_path) ret = meta.parallel_apply(process_single_row, axis=1) succ, timestamps = list(zip(*ret)) meta["timestamp"] = timestamps meta = meta[np.array(succ)] wo_ext, ext = os.path.splitext(meta_path) out_path = f"{wo_ext}_timestamp{ext}" meta.to_csv(out_path, index=False) print(f"New meta (shape={meta.shape}) with timestamp saved to '{out_path}'.") if __name__ == "__main__": main() ================================================ FILE: Open-Sora/build/lib/tools/scoring/aesthetic/__init__.py ================================================ ================================================ FILE: Open-Sora/build/lib/tools/scoring/aesthetic/inference.py ================================================ # adapted from https://github.com/christophschuhmann/improved-aesthetic-predictor/blob/main/simple_inference.py import cv2 # isort:skip import argparse import gc import os from datetime import timedelta import clip import numpy as np import pandas as pd import torch import torch.distributed as dist import torch.nn as nn import torch.nn.functional as F from einops import rearrange from torch.utils.data import DataLoader, DistributedSampler from torchvision.datasets.folder import pil_loader from tqdm import tqdm from tools.datasets.utils import extract_frames, is_video NUM_FRAMES_POINTS = { 1: (0.5,), 2: (0.25, 0.5), 3: (0.1, 0.5, 0.9), } def merge_scores(gathered_list: list, meta: pd.DataFrame, column): # reorder indices_list = list(map(lambda x: x[0], gathered_list)) scores_list = list(map(lambda x: x[1], gathered_list)) flat_indices = [] for x in zip(*indices_list): flat_indices.extend(x) flat_scores = [] for x in 
zip(*scores_list): flat_scores.extend(x) flat_indices = np.array(flat_indices) flat_scores = np.array(flat_scores) # filter duplicates unique_indices, unique_indices_idx = np.unique(flat_indices, return_index=True) meta.loc[unique_indices, column] = flat_scores[unique_indices_idx] # drop indices in meta not in unique_indices meta = meta.loc[unique_indices] return meta class VideoTextDataset(torch.utils.data.Dataset): def __init__(self, meta_path, transform=None, num_frames=3): self.meta_path = meta_path self.meta = pd.read_csv(meta_path) self.transform = transform self.points = NUM_FRAMES_POINTS[num_frames] def __getitem__(self, index): sample = self.meta.iloc[index] path = sample["path"] # extract frames if not is_video(path): images = [pil_loader(path)] else: num_frames = sample["num_frames"] if "num_frames" in sample else None images = extract_frames(sample["path"], points=self.points, backend="opencv", num_frames=num_frames) # transform images = [self.transform(img) for img in images] # stack images = torch.stack(images) ret = dict(index=index, images=images) return ret def __len__(self): return len(self.meta) class MLP(nn.Module): def __init__(self, input_size): super().__init__() self.input_size = input_size self.layers = nn.Sequential( nn.Linear(self.input_size, 1024), nn.Dropout(0.2), nn.Linear(1024, 128), nn.Dropout(0.2), nn.Linear(128, 64), nn.Dropout(0.1), nn.Linear(64, 16), nn.Linear(16, 1), ) def forward(self, x): return self.layers(x) class AestheticScorer(nn.Module): def __init__(self, input_size, device): super().__init__() self.mlp = MLP(input_size) self.clip, self.preprocess = clip.load("ViT-L/14", device=device) self.eval() self.to(device) def forward(self, x): image_features = self.clip.encode_image(x) image_features = F.normalize(image_features, p=2, dim=-1).float() return self.mlp(image_features) def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("meta_path", type=str, help="Path to the input CSV file") 
parser.add_argument("--bs", type=int, default=1024, help="Batch size") parser.add_argument("--num_workers", type=int, default=16, help="Number of workers") parser.add_argument("--prefetch_factor", type=int, default=3, help="Prefetch factor") parser.add_argument("--num_frames", type=int, default=3, help="Number of frames to extract") parser.add_argument("--skip_if_existing", action="store_true") args = parser.parse_args() return args def main(): args = parse_args() meta_path = args.meta_path if not os.path.exists(meta_path): print(f"Meta file '{meta_path}' not found. Exit.") exit() wo_ext, ext = os.path.splitext(meta_path) out_path = f"{wo_ext}_aes{ext}" if args.skip_if_existing and os.path.exists(out_path): print(f"Output meta file '{out_path}' already exists. Exit.") exit() dist.init_process_group(backend="nccl", timeout=timedelta(hours=24)) torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count()) # build model device = "cuda" if torch.cuda.is_available() else "cpu" model = AestheticScorer(768, device) model.mlp.load_state_dict(torch.load("pretrained_models/aesthetic.pth", map_location=device)) preprocess = model.preprocess # build dataset dataset = VideoTextDataset(args.meta_path, transform=preprocess, num_frames=args.num_frames) dataloader = DataLoader( dataset, batch_size=args.bs, num_workers=args.num_workers, sampler=DistributedSampler( dataset, num_replicas=dist.get_world_size(), rank=dist.get_rank(), shuffle=False, drop_last=False, ), ) # compute aesthetic scores indices_list = [] scores_list = [] model.eval() for batch in tqdm(dataloader, disable=dist.get_rank() != 0): indices = batch["index"] images = batch["images"].to(device, non_blocking=True) B = images.shape[0] images = rearrange(images, "B N C H W -> (B N) C H W") # compute score with torch.no_grad(): scores = model(images) scores = rearrange(scores, "(B N) 1 -> B N", B=B) scores = scores.mean(dim=1) scores_np = scores.to(torch.float32).cpu().numpy() indices_list.extend(indices.tolist()) 
scores_list.extend(scores_np.tolist()) # save local results meta_local = merge_scores([(indices_list, scores_list)], dataset.meta, column="aes") save_dir_local = os.path.join(os.path.dirname(out_path), "parts") os.makedirs(save_dir_local, exist_ok=True) out_path_local = os.path.join( save_dir_local, os.path.basename(out_path).replace(".csv", f"_part_{dist.get_rank()}.csv") ) meta_local.to_csv(out_path_local, index=False) # wait for all ranks to finish data processing dist.barrier() torch.cuda.empty_cache() gc.collect() gathered_list = [None] * dist.get_world_size() dist.all_gather_object(gathered_list, (indices_list, scores_list)) if dist.get_rank() == 0: meta_new = merge_scores(gathered_list, dataset.meta, column="aes") meta_new.to_csv(out_path, index=False) print(f"New meta with aesthetic scores saved to '{out_path}'.") if __name__ == "__main__": main() ================================================ FILE: Open-Sora/build/lib/tools/scoring/matching/__init__.py ================================================ ================================================ FILE: Open-Sora/build/lib/tools/scoring/matching/inference.py ================================================ import argparse import os import clip import colossalai import numpy as np import pandas as pd import torch import torch.distributed as dist import torch.nn.functional as F from torch.utils.data import DataLoader, DistributedSampler from torchvision.datasets.folder import pil_loader from tqdm import tqdm from tools.datasets.utils import extract_frames, is_video def merge_scores(gathered_list: list, meta: pd.DataFrame, column): # reorder indices_list = list(map(lambda x: x[0], gathered_list)) scores_list = list(map(lambda x: x[1], gathered_list)) flat_indices = [] for x in zip(*indices_list): flat_indices.extend(x) flat_scores = [] for x in zip(*scores_list): flat_scores.extend(x) flat_indices = np.array(flat_indices) flat_scores = np.array(flat_scores) # filter duplicates unique_indices, 
unique_indices_idx = np.unique(flat_indices, return_index=True) meta.loc[unique_indices, column] = flat_scores[unique_indices_idx] return meta class VideoTextDataset(torch.utils.data.Dataset): def __init__(self, meta_path, transform): self.meta_path = meta_path self.meta = pd.read_csv(meta_path) self.transform = transform def __getitem__(self, index): row = self.meta.iloc[index] path = row["path"] if is_video(path): img = extract_frames(path, points=[0.5], backend="opencv")[0] else: img = pil_loader(path) img = self.transform(img) text = row["text"] text = clip.tokenize(text, truncate=True).squeeze() return img, text, index def __len__(self): return len(self.meta) def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("meta_path", type=str, help="Path to the input CSV file") parser.add_argument("--bs", type=int, default=16, help="Batch size") parser.add_argument("--num_workers", type=int, default=16, help="Number of workers") parser.add_argument("--skip_if_existing", action="store_true") args = parser.parse_args() return args def main(): args = parse_args() meta_path = args.meta_path if not os.path.exists(meta_path): print(f"Meta file '{meta_path}' not found. Exit.") exit() wo_ext, ext = os.path.splitext(meta_path) out_path = f"{wo_ext}_match{ext}" if args.skip_if_existing and os.path.exists(out_path): print(f"Output meta file '{out_path}' already exists. 
Exit.") exit() colossalai.launch_from_torch({}) # build model device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") model, preprocess = clip.load("ViT-L/14", device=device) logit_scale = model.logit_scale.exp().item() # build dataset dataset = VideoTextDataset(meta_path=meta_path, transform=preprocess) dataloader = DataLoader( dataset, batch_size=args.bs, num_workers=args.num_workers, sampler=DistributedSampler( dataset, num_replicas=dist.get_world_size(), rank=dist.get_rank(), shuffle=False, drop_last=False, ), ) # compute scores indices_list = [] scores_list = [] model.eval() for imgs, text, indices in tqdm(dataloader, disable=dist.get_rank() != 0): imgs = imgs.to(device) text = text.to(device) with torch.no_grad(): feat_img = model.encode_image(imgs) feat_text = model.encode_text(text) feat_img = F.normalize(feat_img, dim=1) feat_text = F.normalize(feat_text, dim=1) clip_scores = logit_scale * (feat_img * feat_text).sum(dim=1) clip_scores = clip_scores.cpu().tolist() indices_list.extend(indices) scores_list.extend(clip_scores) gathered_list = [None] * dist.get_world_size() dist.all_gather_object(gathered_list, (indices_list, scores_list)) if dist.get_rank() == 0: meta_new = merge_scores(gathered_list, dataset.meta, column="match") meta_new.to_csv(out_path, index=False) print(f"New meta with matching scores saved to '{out_path}'.") if __name__ == "__main__": main() ================================================ FILE: Open-Sora/build/lib/vbench/__init__.py ================================================ import os from .utils import get_prompt_from_filename, init_submodules, save_json, load_json import importlib from itertools import chain from pathlib import Path class VBench(object): def __init__(self, device, full_info_dir, output_path): self.device = device # cuda or cpu self.full_info_dir = full_info_dir # full json file that VBench originally provides self.output_path = output_path # output directory to save VBench results if 
not os.path.exists(self.output_path): os.makedirs(self.output_path, exist_ok=False) def build_full_dimension_list(self, ): return ["subject_consistency", "background_consistency", "aesthetic_quality", "imaging_quality", "object_class", "multiple_objects", "color", "spatial_relationship", "scene", "temporal_style", 'overall_consistency', "human_action", "temporal_flickering", "motion_smoothness", "dynamic_degree", "appearance_style"] def check_dimension_requires_extra_info(self, dimension_list): dim_custom_not_supported = set(dimension_list) & set([ 'background_consistency', 'object_class', 'multiple_objects', 'scene', 'appearance_style', 'color', 'spatial_relationship' ]) assert len(dim_custom_not_supported) == 0, f"dimensions : {dim_custom_not_supported} not supported for custom input" def build_full_info_json(self, videos_path, name, dimension_list, prompt_list=[], special_str='', verbose=False, mode='vbench_standard', **kwargs): cur_full_info_list=[] # to save the prompt and video path info for the current dimensions if mode=='custom_input': self.check_dimension_requires_extra_info(dimension_list) if os.path.isfile(videos_path): cur_full_info_list = [{"prompt_en": get_prompt_from_filename(videos_path), "dimension": dimension_list, "video_list": [videos_path]}] if len(prompt_list) == 1: cur_full_info_list[0]["prompt_en"] = prompt_list[0] else: video_names = os.listdir(videos_path) cur_full_info_list = [] for filename in video_names: postfix = Path(os.path.join(videos_path, filename)).suffix if postfix.lower() not in ['.mp4', '.gif', '.jpg', '.png']: continue cur_full_info_list.append({ "prompt_en": get_prompt_from_filename(filename), "dimension": dimension_list, "video_list": [os.path.join(videos_path, filename)] }) if len(prompt_list) > 0: prompt_list = {os.path.join(videos_path, path): prompt_list[path] for path in prompt_list} assert len(prompt_list) >= len(cur_full_info_list), """ Number of prompts should match with number of videos.\n Got 
{len(prompt_list)=}, {len(cur_full_info_list)=}\n To read the prompt from filename, delete --prompt_file and --prompt_list """ all_video_path = [os.path.abspath(file) for file in list(chain.from_iterable(vid["video_list"] for vid in cur_full_info_list))] backslash = "\n" assert len(set(all_video_path) - set([os.path.abspath(path_key) for path_key in prompt_list])) == 0, f""" The prompts for the following videos are not found in the prompt file: \n {backslash.join(set(all_video_path) - set([os.path.abspath(path_key) for path_key in prompt_list]))} """ video_map = {} for prompt_key in prompt_list: video_map[os.path.abspath(prompt_key)] = prompt_list[prompt_key] for video_info in cur_full_info_list: video_info["prompt_en"] = video_map[os.path.abspath(video_info["video_list"][0])] elif mode=='vbench_category': self.check_dimension_requires_extra_info(dimension_list) CUR_DIR = os.path.dirname(os.path.abspath(__file__)) category_supported = [ Path(category).stem for category in os.listdir(f'prompts/prompts_per_category') ]# TODO: probably need refactoring again if 'category' not in kwargs: category = category_supported else: category = kwargs['category'] assert category is not None, "Please specify the category to be evaluated with --category" assert category in category_supported, f''' The following category is not supported, {category}. 
''' video_names = os.listdir(videos_path) postfix = Path(video_names[0]).suffix with open(f'{CUR_DIR}/prompts_per_category/{category}.txt', 'r') as f: video_prompts = [line.strip() for line in f.readlines()] for prompt in video_prompts: video_list = [] for filename in video_names: if (not Path(filename).stem.startswith(prompt)): continue postfix = Path(os.path.join(videos_path, filename)).suffix if postfix.lower() not in ['.mp4', '.gif', '.jpg', '.png']: continue video_list.append(os.path.join(videos_path, filename)) cur_full_info_list.append({ "prompt_en": prompt, "dimension": dimension_list, "video_list": video_list }) else: full_info_list = load_json(self.full_info_dir) video_names = os.listdir(videos_path) postfix = Path(video_names[0]).suffix for prompt_dict in full_info_list: # if the prompt belongs to any dimension we want to evaluate if set(dimension_list) & set(prompt_dict["dimension"]): prompt = prompt_dict['prompt_en'] prompt_dict['video_list'] = [] for i in range(5): # video index for the same prompt intended_video_name = f'{prompt}{special_str}-{str(i)}{postfix}' if intended_video_name in video_names: # if the video exists intended_video_path = os.path.join(videos_path, intended_video_name) prompt_dict['video_list'].append(intended_video_path) if verbose: print(f'Successfully found video: {intended_video_name}') else: print(f'WARNING!!! This required video is not found! Missing benchmark videos can lead to unfair evaluation result. 
The missing video is: {intended_video_name}') cur_full_info_list.append(prompt_dict) cur_full_info_path = os.path.join(self.output_path, name+'_full_info.json') save_json(cur_full_info_list, cur_full_info_path) print(f'Evaluation meta data saved to {cur_full_info_path}') return cur_full_info_path def evaluate(self, videos_path, name, prompt_list=[], dimension_list=None, local=False, read_frame=False, mode='vbench_standard', **kwargs): results_dict = {} if dimension_list is None: dimension_list = self.build_full_dimension_list() submodules_dict = init_submodules(dimension_list, local=local, read_frame=read_frame) cur_full_info_path = self.build_full_info_json(videos_path, name, dimension_list, prompt_list, mode=mode, **kwargs) for dimension in dimension_list: try: dimension_module = importlib.import_module(f'vbench.{dimension}') evaluate_func = getattr(dimension_module, f'compute_{dimension}') except Exception as e: raise NotImplementedError(f'UnImplemented dimension {dimension}!, {e}') submodules_list = submodules_dict[dimension] print(f'cur_full_info_path: {cur_full_info_path}') # TODO: to delete results = evaluate_func(cur_full_info_path, self.device, submodules_list, **kwargs) results_dict[dimension] = results output_name = os.path.join(self.output_path, name+'_eval_results.json') save_json(results_dict, output_name) print(f'Evaluation results saved to {output_name}') ================================================ FILE: Open-Sora/build/lib/vbench/aesthetic_quality.py ================================================ import os import clip import torch import torch.nn as nn import torch.nn.functional as F import subprocess from urllib.request import urlretrieve from vbench.utils import load_video, load_dimension_info, clip_transform from tqdm import tqdm def get_aesthetic_model(cache_folder): """load the aethetic model""" path_to_model = cache_folder + "/sa_0_4_vit_l_14_linear.pth" if not os.path.exists(path_to_model): os.makedirs(cache_folder, exist_ok=True) 
url_model = ( "https://github.com/LAION-AI/aesthetic-predictor/blob/main/sa_0_4_vit_l_14_linear.pth?raw=true" ) # download aesthetic predictor if not os.path.isfile(path_to_model): try: print(f'trying urlretrieve to download {url_model} to {path_to_model}') urlretrieve(url_model, path_to_model) # unable to download https://github.com/LAION-AI/aesthetic-predictor/blob/main/sa_0_4_vit_l_14_linear.pth?raw=true to pretrained/aesthetic_model/emb_reader/sa_0_4_vit_l_14_linear.pth except: print(f'unable to download {url_model} to {path_to_model} using urlretrieve, trying wget') wget_command = ['wget', url_model, '-P', os.path.dirname(path_to_model)] subprocess.run(wget_command) m = nn.Linear(768, 1) s = torch.load(path_to_model) m.load_state_dict(s) m.eval() return m def laion_aesthetic(aesthetic_model, clip_model, video_list, device): aesthetic_model.eval() clip_model.eval() aesthetic_avg = 0.0 num = 0 video_results = [] for video_path in tqdm(video_list): images = load_video(video_path) image_transform = clip_transform(224) images = image_transform(images) images = images.to(device) image_feats = clip_model.encode_image(images).to(torch.float32) image_feats = F.normalize(image_feats, dim=-1, p=2) aesthetic_scores = aesthetic_model(image_feats).squeeze() normalized_aesthetic_scores = aesthetic_scores/10 cur_avg = torch.mean(normalized_aesthetic_scores, dim=0, keepdim=True) aesthetic_avg += cur_avg.item() num += 1 video_results.append({'video_path': video_path, 'video_results': cur_avg.item()}) aesthetic_avg /= num return aesthetic_avg, video_results def compute_aesthetic_quality(json_dir, device, submodules_list, **kwargs): vit_path = submodules_list[0] aes_path = submodules_list[1] aesthetic_model = get_aesthetic_model(aes_path).to(device) clip_model, preprocess = clip.load(vit_path, device=device) video_list, _ = load_dimension_info(json_dir, dimension='aesthetic_quality', lang='en') all_results, video_results = laion_aesthetic(aesthetic_model, clip_model, video_list, 
device) return all_results, video_results ================================================ FILE: Open-Sora/build/lib/vbench/appearance_style.py ================================================ import os import json import numpy as np from tqdm import tqdm import torch import clip from PIL import Image from vbench.utils import load_video, load_dimension_info, clip_transform, read_frames_decord_by_fps, clip_transform_Image def get_text_features(model, input_text, tokenizer, text_feature_dict={}): if input_text in text_feature_dict: return text_feature_dict[input_text] text_template= f"{input_text}" with torch.no_grad(): text_features = model.encode_text(text_template).float() text_features /= text_features.norm(dim=-1, keepdim=True) text_feature_dict[input_text] = text_features return text_features def get_vid_features(model, input_frames): with torch.no_grad(): clip_feat = model.encode_vision(input_frames,test=True).float() clip_feat /= clip_feat.norm(dim=-1, keepdim=True) return clip_feat def get_predict_label(clip_feature, text_feats_tensor, top=5): label_probs = (100.0 * clip_feature @ text_feats_tensor.T).softmax(dim=-1) top_probs, top_labels = label_probs.cpu().topk(top, dim=-1) return top_probs, top_labels def appearance_style(clip_model, video_dict, device, sample="rand"): sim = 0.0 cnt = 0 video_results = [] image_transform = clip_transform_Image(224) for info in tqdm(video_dict): if 'auxiliary_info' not in info: raise "Auxiliary info is not in json, please check your json." 
query = info['auxiliary_info']['appearance_style'] text = clip.tokenize([query]).to(device) video_list = info['video_list'] for video_path in video_list: cur_video = [] with torch.no_grad(): video_arrays = load_video(video_path, return_tensor=False) images = [Image.fromarray(i) for i in video_arrays] for image in images: image = image_transform(image) image = image.to(device) logits_per_image, logits_per_text = clip_model(image.unsqueeze(0), text) cur_sim = float(logits_per_text[0][0].cpu()) cur_sim = cur_sim / 100 cur_video.append(cur_sim) sim += cur_sim cnt +=1 video_sim = np.mean(cur_video) video_results.append({'video_path': video_path, 'video_results': video_sim, 'frame_results':cur_video}) sim_per_frame = sim / cnt return sim_per_frame, video_results def compute_appearance_style(json_dir, device, submodules_list, **kwargs): clip_model, preprocess = clip.load(device=device, **submodules_list) _, video_dict = load_dimension_info(json_dir, dimension='appearance_style', lang='en') all_results, video_results = appearance_style(clip_model, video_dict, device) return all_results, video_results ================================================ FILE: Open-Sora/build/lib/vbench/background_consistency.py ================================================ import os import json import logging import numpy as np import clip from PIL import Image import torch import torch.nn as nn import torch.nn.functional as F from vbench.utils import load_video, load_dimension_info, clip_transform from tqdm import tqdm def background_consistency(clip_model, preprocess, video_list, device, read_frame): sim = 0.0 cnt = 0 video_results = [] image_transform = clip_transform(224) for video_path in tqdm(video_list): video_sim = 0.0 if read_frame: video_path = video_path[:-4].replace('videos', 'frames').replace(' ', '_') tmp_paths = [os.path.join(video_path, f) for f in sorted(os.listdir(video_path))] images = [] for tmp_path in tmp_paths: images.append(preprocess(Image.open(tmp_path))) images = 
def _str2bool(value):
    """Convert a command-line token to a bool, honouring explicit "false" spellings."""
    if isinstance(value, bool):
        return value
    # bool("False") is True, so the usual negative spellings are handled
    # explicitly; every other value keeps the truthiness it had under type=bool.
    if value.strip().lower() in ('false', 'f', 'no', 'n', '0'):
        return False
    return bool(value)


def register_subparsers(subparser):
    """Register the ``evaluate`` sub-command and its options on ``subparser``.

    Args:
        subparser: the object returned by ``ArgumentParser.add_subparsers()``.

    The created parser dispatches to ``evaluate`` through ``func``.
    """
    parser = subparser.add_parser('evaluate', formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        "--output_path",
        type=str,
        default='./evaluation_results/',
        help="output path to save the evaluation results",
    )
    parser.add_argument(
        "--full_json_dir",
        type=str,
        default=f'{CUR_DIR}/../VBench_full_info.json',
        help="path to save the json file that contains the prompt and dimension information",
    )
    parser.add_argument(
        "--videos_path",
        type=str,
        required=True,
        help="folder that contains the sampled videos",
    )
    parser.add_argument(
        "--dimension",
        nargs='+',
        required=True,
        help="list of evaluation dimensions, usage: --dimension ",
    )
    parser.add_argument(
        "--load_ckpt_from_local",
        # type=bool would turn "--load_ckpt_from_local False" into True.
        type=_str2bool,
        required=False,
        help="whether load checkpoints from local default paths (assuming you have downloaded the checkpoints locally",
    )
    parser.add_argument(
        "--read_frame",
        # Same pitfall as above: parse the text instead of calling bool() on it.
        type=_str2bool,
        required=False,
        help="whether directly read frames, or directly read videos",
    )
    parser.add_argument(
        "--mode",
        choices=['custom_input', 'vbench_standard', 'vbench_category'],
        default='vbench_standard',
        help="""This flags determine the mode of evaluations, choose one of the following:
        1. "custom_input": receive input prompt from either --prompt/--prompt_file flags or the filename
        2. "vbench_standard": evaluate on standard prompt suite of VBench
        3. "vbench_category": evaluate on specific category
        """,
    )
    parser.add_argument(
        "--custom_input",
        action="store_true",
        required=False,
        help="(deprecated) use --mode=\"custom_input\" instead",
    )
    parser.add_argument(
        "--prompt",
        type=str,
        default="",
        help="""Specify the input prompt
        If not specified, filenames will be used as input prompts
        * Mutually exclusive to --prompt_file.
        ** This option must be used with --custom_input flag
        """
    )
    parser.add_argument(
        "--prompt_file",
        type=str,
        required=False,
        help="""Specify the path of the file that contains prompt lists
        If not specified, filenames will be used as input prompts
        * Mutually exclusive to --prompt.
        ** This option must be used with --custom_input flag
        """
    )
    parser.add_argument(
        "--category",
        type=str,
        required=False,
        help="""This is for mode=='vbench_category'
        The category to evaluate on, usage: --category=animal.
        """,
    )

    ## for dimension specific params ###
    parser.add_argument(
        "--imaging_quality_preprocessing_mode",
        type=str,
        required=False,
        default='longer',
        help="""This is for setting preprocessing in imaging_quality
        1. 'shorter': if the shorter side is more than 512, the image is resized so that the shorter side is 512.
        2. 'longer': if the longer side is more than 512, the image is resized so that the longer side is 512.
        3. 'shorter_centercrop': if the shorter side is more than 512, the image is resized so that the shorter side is 512.
        Then the center 512 x 512 after resized is used for evaluation.
        4. 'None': no preprocessing
        """,
    )
    parser.set_defaults(func=evaluate)
class StaticFilter:
    """Decide whether a video is (nearly) static using RAFT optical flow."""

    def __init__(self, args, device):
        """Store the RAFT arguments and target device, then load the model."""
        self.args = args
        self.device = device
        self.load_model()

    def load_model(self):
        """Build RAFT, load the checkpoint at ``args.model`` and switch to eval mode."""
        # The model is wrapped in DataParallel before load_state_dict and
        # unwrapped again afterwards.
        self.model = torch.nn.DataParallel(RAFT(self.args))
        self.model.load_state_dict(torch.load(self.args.model))
        self.model = self.model.module
        self.model.to(self.device)
        self.model.eval()

    def get_score(self, img, flo):
        """Return the mean flow magnitude of the top 2% fastest pixels.

        Args:
            img: first frame of the pair; unused, kept for interface compatibility.
            flo: flow tensor indexed as ``flo[0]`` -> (2, H, W).
        """
        flo = flo[0].permute(1, 2, 0).cpu().numpy()
        u = flo[:, :, 0]
        v = flo[:, :, 1]
        rad = np.sqrt(np.square(u) + np.square(v))
        h, w = rad.shape
        rad_flat = rad.flatten()
        cut_index = int(h * w * 0.02)
        # Sorting the negated magnitudes yields descending order.
        max_rad = np.mean(abs(np.sort(-rad_flat))[:cut_index])
        return max_rad

    def check_static(self, score_list):
        """Return True when the per-pair scores describe a static video.

        All scores but the last two are counted against ``thres``; the last
        two are checked against the looser limit ``thres * count_num * 2``.
        """
        thres = self.params["thres"]
        count_num = self.params["count_num"]
        count = 0
        for score in score_list[:-2]:
            if score > thres:
                count += 1
            if count > count_num:
                return False
        for score in score_list[-2:]:
            if score > thres * count_num * 2:
                return False
        return True

    def set_params(self, frame, count):
        """Scale the thresholds with frame size (shorter side) and frame count."""
        scale = min(list(frame.shape)[-2:])
        self.params = {"thres": 3.0 * (scale / 256.0), "count_num": round(2 * (count / 16.0))}

    def infer(self, path):
        """Return True if the video at ``path`` is judged static."""
        with torch.no_grad():
            frames = self.get_frames(path)
            self.set_params(frame=frames[0], count=len(frames))
            static_score = []
            # Consecutive pairs plus first->last and last->first, whose
            # scores end up as the final two entries of static_score.
            firsts = frames[:-1] + [frames[0], frames[-1]]
            seconds = frames[1:] + [frames[-1], frames[0]]
            for image1, image2 in zip(firsts, seconds):
                padder = InputPadder(image1.shape)
                image1, image2 = padder.pad(image1, image2)
                _, flow_up = self.model(image1, image2, iters=20, test_mode=True)
                max_rad = self.get_score(image1, flow_up)
                static_score.append(max_rad)
            whether_static = self.check_static(static_score)
            return whether_static

    def get_frames(self, video_path):
        """Decode every frame of ``video_path`` into a (1, 3, H, W) float tensor.

        Frames are placed on ``self.device`` (not on a module-level default),
        so the filter honours the device it was constructed with.
        """
        frame_list = []
        video = cv2.VideoCapture(video_path)
        try:
            while video.isOpened():
                success, frame = video.read()
                if not success:
                    break
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # convert to rgb
                frame = torch.from_numpy(frame.astype(np.uint8)).permute(2, 0, 1).float()
                frame_list.append(frame[None].to(self.device))
        finally:
            # Release the capture even if decoding raised.
            video.release()
        assert frame_list != []
        return frame_list
def register_subparsers(subparser):
    """Attach the ``static_filter`` sub-command and its options to ``subparser``.

    The created parser dispatches to ``static_filter`` through ``func``.
    """
    filter_scope_help = f'''For specifying the scope for filtering videos
        1. 'temporal_flickering' (default): filter videos based on matches with temporal_flickering dimension of VBench.
        2. 'all': filter all video in the current directory.
        3. '$filename': if a filepath to a JSON file is provided, only the filename exists in JSON file will be filtered.
        > usage: --filter_scope example.json
        '''
    # (flag, keyword arguments) pairs, registered in this order.
    option_table = [
        ('--model', dict(type=str, default=f"{CACHE_DIR}/raft_model/models/raft-things.pth",
                         help="restore checkpoint")),
        ('--videos_path', dict(default="", required=True, help="video path for filtering")),
        ('--result_path', dict(type=str, default="./filter_results", help='result save path')),
        ('--store_name', dict(type=str, default="filtered_static_video.json", help='result file name')),
        ('--small', dict(action='store_true', help='use small model')),
        ('--mixed_precision', dict(action='store_true', help='use mixed precision')),
        ('--alternate_corr', dict(action='store_true', help='use efficent correlation implementation')),
        ('--filter_scope', dict(default='temporal_flickering', help=filter_scope_help)),
    ]
    parser = subparser.add_parser('static_filter')
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    parser.set_defaults(func=static_filter)
def check_generate(color_key, object_key, predictions):
    """Count frames showing the object with any colour, and with the target colour.

    Args:
        color_key: colour word expected in the caption of the object.
        object_key: label a detection must carry to be considered.
        predictions: per-frame lists of ``[caption, label]`` pairs.

    Returns:
        ``(cur_object, cur_object_color)``: number of frames in which a
        matching detection's caption names any known colour, and number of
        frames in which such a caption contains ``color_key``.
    """
    known_colors = ("white", "red", "pink", "blue", "silver", "purple",
                    "orange", "green", "gray", "yellow", "black", "grey")
    frames_with_colored_object = 0
    frames_with_target_color = 0
    for detections in predictions:
        # Captions of the detections whose label is the requested object.
        captions = [det[0] for det in detections if object_key == det[1]]
        if any(color_key in caption for caption in captions):
            frames_with_target_color += 1
        if any(name in caption for caption in captions for name in known_colors):
            frames_with_colored_object += 1
    return frames_with_colored_object, frames_with_target_color
class DynamicDegree:
    """Decide whether a video contains noticeable motion using RAFT optical flow."""

    def __init__(self, args, device):
        """Store the RAFT arguments and target device, then load the model."""
        self.args = args
        self.device = device
        self.load_model()

    def load_model(self):
        """Build RAFT, load the checkpoint at ``args.model`` and switch to eval mode."""
        # The model is wrapped in DataParallel before load_state_dict and
        # unwrapped again afterwards.
        self.model = torch.nn.DataParallel(RAFT(self.args))
        self.model.load_state_dict(torch.load(self.args.model))
        self.model = self.model.module
        self.model.to(self.device)
        self.model.eval()

    def get_score(self, img, flo):
        """Return the mean flow magnitude of the top 5% fastest pixels as a float.

        Args:
            img: first frame of the pair; unused, kept for interface compatibility.
            flo: flow tensor indexed as ``flo[0]`` -> (2, H, W).
        """
        flo = flo[0].permute(1, 2, 0).cpu().numpy()
        u = flo[:, :, 0]
        v = flo[:, :, 1]
        rad = np.sqrt(np.square(u) + np.square(v))
        h, w = rad.shape
        rad_flat = rad.flatten()
        cut_index = int(h * w * 0.05)
        # Sorting the negated magnitudes yields descending order.
        max_rad = np.mean(abs(np.sort(-rad_flat))[:cut_index])
        return max_rad.item()

    def set_params(self, frame, count):
        """Scale the thresholds with frame size (shorter side) and frame count."""
        scale = min(list(frame.shape)[-2:])
        self.params = {"thres": 6.0 * (scale / 256.0), "count_num": round(4 * (count / 16.0))}

    def infer(self, video_path):
        """Return True if the video (``.mp4`` file or image folder) is dynamic.

        Raises:
            NotImplementedError: for any other kind of path.
        """
        with torch.no_grad():
            if video_path.endswith('.mp4'):
                frames = self.get_frames(video_path)
            elif os.path.isdir(video_path):
                frames = self.get_frames_from_img_folder(video_path)
            else:
                raise NotImplementedError
            self.set_params(frame=frames[0], count=len(frames))
            static_score = []
            for image1, image2 in zip(frames[:-1], frames[1:]):
                padder = InputPadder(image1.shape)
                image1, image2 = padder.pad(image1, image2)
                _, flow_up = self.model(image1, image2, iters=20, test_mode=True)
                max_rad = self.get_score(image1, flow_up)
                static_score.append(max_rad)
            whether_move = self.check_move(static_score)
            return whether_move

    def check_move(self, score_list):
        """Return True once at least ``count_num`` scores exceed ``thres``."""
        thres = self.params["thres"]
        count_num = self.params["count_num"]
        count = 0
        for score in score_list:
            if score > thres:
                count += 1
            if count >= count_num:
                return True
        return False

    def get_frames(self, video_path):
        """Decode ``video_path`` and subsample it to roughly 8 frames per second."""
        frame_list = []
        video = cv2.VideoCapture(video_path)
        fps = video.get(cv2.CAP_PROP_FPS)  # get fps
        # round(fps / 8) is 0 for fps < 4 (or when fps is reported as 0),
        # which would make the sampling step invalid; never go below 1.
        interval = max(1, round(fps / 8))
        while video.isOpened():
            success, frame = video.read()
            if success:
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # convert to rgb
                frame = torch.from_numpy(frame.astype(np.uint8)).permute(2, 0, 1).float()
                frame = frame[None].to(self.device)
                frame_list.append(frame)
            else:
                break
        video.release()
        assert frame_list != []
        frame_list = self.extract_frame(frame_list, interval)
        return frame_list

    def extract_frame(self, frame_list, interval=1):
        """Return every ``interval``-th frame, starting with the first one.

        An ``interval`` below 1 is treated as 1 instead of raising.
        """
        step = max(1, int(interval))
        return [frame_list[i] for i in range(0, len(frame_list), step)]

    def get_frames_from_img_folder(self, img_folder):
        """Load all images of ``img_folder`` (sorted by path) as (1, 3, H, W) tensors."""
        exts = ['jpg', 'png', 'jpeg', 'bmp', 'tif', 'tiff',
                'JPG', 'PNG', 'JPEG', 'BMP', 'TIF', 'TIFF']
        frame_list = []
        imgs = sorted([p for p in glob.glob(os.path.join(img_folder, "*"))
                       if os.path.splitext(p)[1][1:] in exts])
        for img in imgs:
            frame = cv2.imread(img, cv2.IMREAD_COLOR)
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = torch.from_numpy(frame.astype(np.uint8)).permute(2, 0, 1).float()
            frame = frame[None].to(self.device)
            frame_list.append(frame)
        assert frame_list != []
        return frame_list
def build_dict():
    """Load the Kinetics-400 category file into an ``{index: name}`` mapping.

    The file ``third_party/umt/kinetics_400_categories.txt`` next to this
    module holds one ``<category>\\t<number>`` pair per line. Keys are the
    number as a string, values the lower-cased category name.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    categories_file = f'{module_dir}/third_party/umt/kinetics_400_categories.txt'
    with open(categories_file, 'r') as handle:
        stripped_lines = [raw.strip() for raw in handle.readlines()]
    mapping = {}
    for name, index in (entry.split('\t') for entry in stripped_lines):
        mapping[index] = name.lower()
    return mapping
def transform(images, preprocess_mode='shorter'):
    """Optionally downscale a 4-D image batch, then divide the values by 255.

    Args:
        images: tensor whose ``size()`` unpacks to ``(_, _, h, w)``.
        preprocess_mode: ``'shorter'`` / ``'shorter_centercrop'`` resize so the
            shorter side becomes 512 when it exceeds 512 (the latter then
            centre-crops to 512); ``'longer'`` does the same for the longer
            side; ``'None'`` skips resizing.

    Raises:
        ValueError: for any other mode.
    """
    uses_shorter_side = preprocess_mode.startswith('shorter')
    if not uses_shorter_side and preprocess_mode not in ('longer', 'None'):
        raise ValueError("Please recheck imaging_quality_mode")

    if preprocess_mode != 'None':
        _, _, height, width = images.size()
        reference = min(height, width) if uses_shorter_side else max(height, width)
        if reference > 512:
            ratio = 512. / reference
            target_size = (int(ratio * height), int(ratio * width))
            images = transforms.Resize(size=target_size)(images)
            # The crop only happens when a resize took place.
            if preprocess_mode == 'shorter_centercrop':
                images = transforms.CenterCrop(512)(images)
    return images / 255.
class FrameProcess:
    """Helpers that read frames as RGB arrays and pick every second one."""

    def __init__(self):
        pass

    def get_frames(self, video_path):
        """Decode all frames of ``video_path`` and return them as RGB arrays."""
        capture = cv2.VideoCapture(video_path)
        rgb_frames = []
        while capture.isOpened():
            ok, bgr = capture.read()
            if not ok:
                break
            rgb_frames.append(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))  # convert to rgb
        capture.release()
        assert rgb_frames != []
        return rgb_frames

    def get_frames_from_img_folder(self, img_folder):
        """Read every image of ``img_folder`` (sorted by path) as an RGB array."""
        exts = ['jpg', 'png', 'jpeg', 'bmp', 'tif', 'tiff',
                'JPG', 'PNG', 'JPEG', 'BMP', 'TIF', 'TIFF']
        candidates = glob.glob(os.path.join(img_folder, "*"))
        image_paths = sorted(
            path for path in candidates
            if os.path.splitext(path)[1][1:] in exts
        )
        rgb_frames = [
            cv2.cvtColor(cv2.imread(path, cv2.IMREAD_COLOR), cv2.COLOR_BGR2RGB)
            for path in image_paths
        ]
        assert rgb_frames != []
        return rgb_frames

    def extract_frame(self, frame_list, start_from=0):
        """Return the frames at indices ``start_from, start_from + 2, ...``."""
        return [frame_list[idx] for idx in range(start_from, len(frame_list), 2)]
def motion_smoothness(motion, video_list):
    """Score every video with ``motion.motion_score`` and average the scores.

    Returns:
        ``(avg_score, video_results)`` where ``video_results`` holds one
        ``{'video_path', 'video_results'}`` dict per input video.
    """
    video_results = []
    for path in tqdm(video_list):
        video_results.append({'video_path': path, 'video_results': motion.motion_score(path)})
    per_video_scores = [entry['video_results'] for entry in video_results]
    return np.mean(per_video_scores), video_results
def check_generate(key_info, predictions):
    """Count the frames whose detections contain both requested objects.

    Args:
        key_info: text of the form ``"<object a> and <object b>"``.
        predictions: one collection of detected labels per frame.

    Raises:
        ValueError: if ``key_info`` does not split into exactly two parts
            around ``' and '``.
    """
    first, second = (part.strip() for part in key_info.split(' and '))
    return sum(1 for labels in predictions if first in labels and second in labels)
def get_dect_from_grit(model, image_arrays):
    """Run the captioning/detection model on every frame and collect label sets.

    Args:
        model: object exposing ``run_caption_tensor(frame)``.
        image_arrays: either a ``list`` of frames or an object with a
            ``.numpy()`` method (e.g. a tensor) that is converted before
            iteration.

    Returns:
        A list with one ``set`` per frame. The set is built from
        ``run_caption_tensor(frame)[0][0][2]`` (presumably the label list of
        the first detection entry; confirm against ``DenseCaptioning``). A
        frame for which the model fails or returns nothing yields an empty set.
    """
    if not isinstance(image_arrays, list):
        image_arrays = image_arrays.numpy()
    pred = []
    with torch.no_grad():
        for frame in image_arrays:
            try:
                pred.append(set(model.run_caption_tensor(frame)[0][0][2]))
            except Exception:
                # Best-effort per frame: no detections (IndexError) or a model
                # failure counts as "nothing detected". Unlike a bare
                # ``except:``, this lets KeyboardInterrupt / SystemExit through
                # so a long evaluation run can still be interrupted.
                pred.append(set())
    return pred
def compute_object_class(json_dir, device, submodules_dict, **kwargs):
    """Entry point for the ``object_class`` dimension.

    Builds a ``DenseCaptioning`` detector on ``device``, initialises it with
    ``submodules_dict``, loads the prompt entries for the dimension from
    ``json_dir`` and scores them with ``object_class``.

    Returns:
        ``(all_results, video_results)`` exactly as returned by ``object_class``.
    """
    detector = DenseCaptioning(device)
    detector.initialize_model_det(**submodules_dict)
    logger.info("Initialize detection model success")

    # Only the second element (the per-prompt entries) is needed here.
    _, prompt_entries = load_dimension_info(
        json_dir, dimension='object_class', lang='en'
    )
    overall_score, per_video = object_class(detector, prompt_entries, device)
    return overall_score, per_video
def get_predict_label(clip_feature, text_feats_tensor, top=5):
    """Return the ``top`` most probable text entries for each visual feature.

    The similarity ``clip_feature @ text_feats_tensor.T`` is scaled by 100,
    turned into probabilities with a softmax over the last dimension, moved to
    the CPU and reduced with ``topk``.

    Returns:
        ``(top_probs, top_labels)``: the probabilities and the indices of the
        selected text entries.
    """
    scaled_logits = 100.0 * clip_feature @ text_feats_tensor.T
    probabilities = scaled_logits.softmax(dim=-1).cpu()
    best_probs, best_labels = torch.topk(probabilities, top, dim=-1)
    return best_probs, best_labels
def scene(model, video_dict, device):
    """Score how often the captioned frames match the requested scene.

    For every entry in ``video_dict`` each listed video is loaded as 16 frames
    at 384x384, transformed with ``tag2text_transform(384)``, captioned with
    ``get_caption`` and checked against the entry's scene info with
    ``check_generate``.

    Args:
        model: caption model passed through to ``get_caption``.
        video_dict: iterable of dicts with the keys ``'auxiliary_info'``
            (containing ``'scene'``) and ``'video_list'``.
        device: device the frame tensors are moved to.

    Returns:
        ``(success_rate, video_results)`` where ``success_rate`` is the number
        of matching frames divided by the number of captioned frames over all
        videos, and ``video_results`` holds one
        ``{'video_path', 'video_results'}`` dict per video with that video's
        own rate.

    Raises:
        ValueError: if an entry has no ``'auxiliary_info'`` key.
        ZeroDivisionError: if no frame was captioned at all (e.g. empty input).
    """
    success_frame_count, frame_count = 0, 0
    video_results = []
    transform = tag2text_transform(384)
    for info in tqdm(video_dict):
        if 'auxiliary_info' not in info:
            # Raising a plain string is itself a TypeError in Python 3, which
            # hid this message; raise a real exception instead.
            raise ValueError("Auxiliary info is not in json, please check your json.")
        scene_info = info['auxiliary_info']['scene']
        for video_path in info['video_list']:
            video_array = load_video(video_path, num_frames=16, return_tensor=False, width=384, height=384)
            # One (1, ...) tensor per frame, concatenated into a single batch.
            video_tensor = torch.cat(
                [transform(frame).to(device).unsqueeze(0) for frame in video_array]
            )
            cur_video_pred = get_caption(model, video_tensor)
            cur_success_frame_count = check_generate(scene_info, cur_video_pred)
            cur_success_frame_rate = cur_success_frame_count / len(cur_video_pred)
            success_frame_count += cur_success_frame_count
            frame_count += len(cur_video_pred)
            video_results.append({'video_path': video_path, 'video_results': cur_success_frame_rate})
    success_rate = success_frame_count / frame_count
    return success_rate, video_results
def get_position_score(locality, obj1, obj2, iou_threshold=0.1):
    """Score how well two boxes are separated along the axis named by ``locality``.

    Args:
        locality: relation text. It is matched with ``in`` against the phrases
            ``'on the right of'`` / ``'on the left of'`` (horizontal) and
            ``'on the bottom of'`` / ``'on the top of'`` (vertical), so any
            substring of those phrases selects the corresponding axis.
        obj1, obj2: boxes given as ``[x0, y0, x1, y1]``.
        iou_threshold: overlap below which the boxes count as fully separated.

    Returns:
        ``0`` when the locality is not recognised or the centre offset along
        the requested axis is not the dominant one; ``1`` when it is dominant
        and the IoU is below ``iou_threshold``; ``iou_threshold / iou``
        otherwise.

    Note:
        Only the axis is checked. The sign of the offset (left vs. right,
        top vs. bottom) does not influence the score.
    """
    ax0, ay0, ax1, ay1 = obj1[0], obj1[1], obj1[2], obj1[3]
    bx0, by0, bx1, by1 = obj2[0], obj2[1], obj2[2], obj2[3]

    # Offsets between the box centres.
    x_distance = (bx0 + bx1) / 2 - (ax0 + ax1) / 2
    y_distance = (by0 + by1) / 2 - (ay0 + ay1) / 2

    # Intersection over union of the two boxes.
    x_overlap = max(0, min(ax1, bx1) - max(ax0, bx0))
    y_overlap = max(0, min(ay1, by1) - max(ay0, by0))
    intersection = x_overlap * y_overlap
    union = (ax1 - ax0) * (ay1 - ay0) + (bx1 - bx0) * (by1 - by0) - intersection
    # Two zero-area boxes give union == 0; treat that as "no overlap" instead
    # of failing with ZeroDivisionError.
    iou = intersection / union if union != 0 else 0.0

    if locality in 'on the right of' or locality in 'on the left of':
        axis_dominant = abs(x_distance) > abs(y_distance)
    elif locality in 'on the bottom of' or locality in 'on the top of':
        axis_dominant = abs(y_distance) > abs(x_distance)
    else:
        return 0

    if not axis_dominant:
        return 0
    if iou < iou_threshold:
        return 1
    return iou_threshold / iou
def subject_consistency(model, video_list, device, read_frame):
    """Measure how stable the model's image features stay across each video.

    For every frame after the first, the L2-normalised feature is compared by
    cosine similarity (clamped at 0) with the previous frame and with the first
    frame; the two values are averaged into a per-frame score.

    Args:
        model: callable applied to a single-image batch, returning its features.
        video_list: iterable of video paths.
        device: device each frame is moved to before the forward pass.
        read_frame: when truthy, frames are read as image files from a
            directory derived from the video path (last four characters
            dropped, ``'videos'`` -> ``'frames'``, spaces -> ``'_'``);
            otherwise the video is decoded with ``load_video``.

    Returns:
        ``(sim_per_frame, video_results)``. ``sim_per_frame`` is the sum of all
        per-frame scores divided by the number of scored frames.
        ``video_results`` has one ``{'video_path', 'video_results'}`` dict per
        video, where ``'video_results'`` is that video's summed score and
        ``'video_path'`` is the derived frame directory in ``read_frame`` mode.

    Raises:
        ZeroDivisionError: if no frame pair was scored (no video has more than
            one frame).
    """
    sim = 0.0
    cnt = 0
    video_results = []
    image_transform = dino_transform_Image(224) if read_frame else dino_transform(224)
    for video_path in tqdm(video_list):
        video_sim = 0.0
        if read_frame:
            video_path = video_path[:-4].replace('videos', 'frames').replace(' ', '_')
            images = []
            for frame_name in sorted(os.listdir(video_path)):
                # Close each file handle once the frame has been transformed.
                with Image.open(os.path.join(video_path, frame_name)) as frame:
                    images.append(image_transform(frame))
        else:
            images = image_transform(load_video(video_path))

        first_image_features = None
        former_image_features = None
        with torch.no_grad():
            for i in range(len(images)):
                image = images[i].unsqueeze(0).to(device)
                image_features = F.normalize(model(image), dim=-1, p=2)
                if i == 0:
                    first_image_features = image_features
                else:
                    sim_pre = max(0.0, F.cosine_similarity(former_image_features, image_features).item())
                    sim_fir = max(0.0, F.cosine_similarity(first_image_features, image_features).item())
                    video_sim += (sim_pre + sim_fir) / 2
                    cnt += 1
                former_image_features = image_features
        sim += video_sim
        video_results.append({'video_path': video_path, 'video_results': video_sim})

    # The former ``sim / (len(video_list) - 1)`` value was never used and
    # raised ZeroDivisionError for a single-video list, so it is gone.
    sim_per_frame = sim / cnt
    return sim_per_frame, video_results
def calculate_mae(img1, img2):
    """Compute the mean absolute error (MAE) between two images.

    Both inputs are converted to ``float32`` arrays before the absolute
    difference is taken, so the subtraction cannot wrap around for unsigned
    integer images.

    Args:
        img1, img2: array-like images exposing ``.shape``; the shapes must match.

    Returns:
        The mean of the element-wise absolute difference.

    Raises:
        ValueError: if the two images differ in shape. The old code printed a
            message and returned ``None``, which only surfaced later as an
            unrelated ``TypeError`` when the scores were averaged.
    """
    if img1.shape != img2.shape:
        raise ValueError(
            "Images don't have the same shape: %s vs %s" % (img1.shape, img2.shape)
        )
    first = np.array(img1, dtype=np.float32)
    second = np.array(img2, dtype=np.float32)
    return np.mean(cv2.absdiff(first, second))
def temporal_style(clip_model, video_dict, tokenizer, device, sample="middle"):
    """Score the agreement between each video and its prompt.

    For every entry in ``video_dict`` and every listed video, 8 frames are read
    with ``read_frames_decord_by_fps``, transformed with ``clip_transform(224)``
    and encoded with ``get_vid_features``; the prompt is encoded with
    ``get_text_features``. The score is the product of the two feature vectors.

    Args:
        clip_model: model passed to the feature helpers.
        video_dict: iterable of dicts with the keys ``'prompt'`` and
            ``'video_list'``.
        tokenizer: forwarded to ``get_text_features``.
        device: device the frame tensor is moved to.
        sample: frame sampling mode forwarded to ``read_frames_decord_by_fps``.

    Returns:
        ``(avg_score, video_results)``: the mean score over all videos and one
        ``{'video_path', 'video_results'}`` dict per video.
    """
    sim = []
    video_results = []
    image_transform = clip_transform(224)
    for info in tqdm(video_dict):
        query = info['prompt']
        # The earlier ``clip.tokenize([query])`` call was removed. Its result
        # was never used, and it can raise for prompts longer than CLIP's
        # context length.
        for video_path in info['video_list']:
            with torch.no_grad():
                images = read_frames_decord_by_fps(video_path, num_frames=8, sample=sample)
                images = image_transform(images)
                images = images.to(device)
                clip_feat = get_vid_features(clip_model, images.unsqueeze(0))
                text_feat = get_text_features(clip_model, query, tokenizer)
                logit_per_text = clip_feat @ text_feat.T
                score_per_video = float(logit_per_text[0][0].cpu())
                sim.append(score_per_video)
                video_results.append({'video_path': video_path, 'video_results': score_per_video})
    avg_score = np.mean(sim)
    return avg_score, video_results
class CorrBlock:
    """All-pairs correlation volume with an average-pooled pyramid.

    The constructor correlates every position of ``fmap1`` with every position
    of ``fmap2`` and stores ``num_levels`` versions of that volume, each pooled
    by a factor of two relative to the previous one. Calling the block with a
    coordinate tensor samples a ``(2 * radius + 1)``-wide window around each
    coordinate at every level via ``bilinear_sampler``.
    """

    def __init__(self, fmap1, fmap2, num_levels=4, radius=4):
        self.num_levels = num_levels
        self.radius = radius

        # all pairs correlation, flattened so each source pixel is a batch item
        volume = CorrBlock.corr(fmap1, fmap2)
        batch, h1, w1, dim, h2, w2 = volume.shape
        level = volume.reshape(batch * h1 * w1, dim, h2, w2)

        self.corr_pyramid = [level]
        for _ in range(1, self.num_levels):
            level = F.avg_pool2d(level, 2, stride=2)
            self.corr_pyramid.append(level)

    def __call__(self, coords):
        r = self.radius
        window = 2 * r + 1
        coords = coords.permute(0, 2, 3, 1)
        batch, h1, w1, _ = coords.shape

        # The sampling window is the same at every level, so build it once.
        dx = torch.linspace(-r, r, window, device=coords.device)
        dy = torch.linspace(-r, r, window, device=coords.device)
        delta = torch.stack(torch.meshgrid(dy, dx), dim=-1)
        delta_lvl = delta.view(1, window, window, 2)
        centroids = coords.reshape(batch * h1 * w1, 1, 1, 2)

        out_pyramid = []
        for lvl in range(self.num_levels):
            # Coordinates shrink by two per level, matching the pooling.
            coords_lvl = centroids / 2 ** lvl + delta_lvl
            sampled = bilinear_sampler(self.corr_pyramid[lvl], coords_lvl)
            out_pyramid.append(sampled.view(batch, h1, w1, -1))

        out = torch.cat(out_pyramid, dim=-1)
        return out.permute(0, 3, 1, 2).contiguous().float()

    @staticmethod
    def corr(fmap1, fmap2):
        """Dot product between all position pairs, scaled by ``1/sqrt(dim)``."""
        batch, dim, ht, wd = fmap1.shape
        flat1 = fmap1.view(batch, dim, ht * wd)
        flat2 = fmap2.view(batch, dim, ht * wd)

        scores = torch.matmul(flat1.transpose(1, 2), flat2)
        scores = scores.view(batch, ht, wd, 1, ht, wd)
        return scores / torch.sqrt(torch.tensor(dim).float())
np.array(img1).astype(np.uint8)[..., :3] img2 = np.array(img2).astype(np.uint8)[..., :3] img1 = torch.from_numpy(img1).permute(2, 0, 1).float() img2 = torch.from_numpy(img2).permute(2, 0, 1).float() return img1, img2, self.extra_info[index] if not self.init_seed: worker_info = torch.utils.data.get_worker_info() if worker_info is not None: torch.manual_seed(worker_info.id) np.random.seed(worker_info.id) random.seed(worker_info.id) self.init_seed = True index = index % len(self.image_list) valid = None if self.sparse: flow, valid = frame_utils.readFlowKITTI(self.flow_list[index]) else: flow = frame_utils.read_gen(self.flow_list[index]) img1 = frame_utils.read_gen(self.image_list[index][0]) img2 = frame_utils.read_gen(self.image_list[index][1]) flow = np.array(flow).astype(np.float32) img1 = np.array(img1).astype(np.uint8) img2 = np.array(img2).astype(np.uint8) # grayscale images if len(img1.shape) == 2: img1 = np.tile(img1[...,None], (1, 1, 3)) img2 = np.tile(img2[...,None], (1, 1, 3)) else: img1 = img1[..., :3] img2 = img2[..., :3] if self.augmentor is not None: if self.sparse: img1, img2, flow, valid = self.augmentor(img1, img2, flow, valid) else: img1, img2, flow = self.augmentor(img1, img2, flow) img1 = torch.from_numpy(img1).permute(2, 0, 1).float() img2 = torch.from_numpy(img2).permute(2, 0, 1).float() flow = torch.from_numpy(flow).permute(2, 0, 1).float() if valid is not None: valid = torch.from_numpy(valid) else: valid = (flow[0].abs() < 1000) & (flow[1].abs() < 1000) return img1, img2, flow, valid.float() def __rmul__(self, v): self.flow_list = v * self.flow_list self.image_list = v * self.image_list return self def __len__(self): return len(self.image_list) class MpiSintel(FlowDataset): def __init__(self, aug_params=None, split='training', root='datasets/Sintel', dstype='clean'): super(MpiSintel, self).__init__(aug_params) flow_root = osp.join(root, split, 'flow') image_root = osp.join(root, split, dstype) if split == 'test': self.is_test = True for scene 
class FlyingChairs(FlowDataset):
    """FlyingChairs image pairs with their ``.flo`` flow files.

    Samples are selected with ``chairs_split.txt`` (read from the current
    working directory), which holds one id per flow file: ``1`` marks a
    training sample and ``2`` a validation sample.
    """

    def __init__(self, aug_params=None, split='train', root='datasets/FlyingChairs_release/data'):
        """Collect the image pairs and flow files belonging to ``split``.

        Args:
            aug_params: augmentation settings forwarded to ``FlowDataset``.
            split: ``'training'`` (or its alias ``'train'``) or ``'validation'``.
            root: directory containing the ``*.ppm`` images and ``*.flo`` flows.

        Raises:
            ValueError: if ``split`` is not one of the supported names.
        """
        super(FlyingChairs, self).__init__(aug_params)

        # The default 'train' used to match neither branch of the split test,
        # so the default-constructed dataset was silently empty. Accept it as
        # an alias of 'training' and reject unknown names explicitly.
        split_ids = {'train': 1, 'training': 1, 'validation': 2}
        if split not in split_ids:
            raise ValueError(
                "Unknown FlyingChairs split %r; expected one of %s"
                % (split, sorted(split_ids))
            )
        wanted_id = split_ids[split]

        images = sorted(glob(osp.join(root, '*.ppm')))
        flows = sorted(glob(osp.join(root, '*.flo')))
        assert (len(images)//2 == len(flows))

        split_list = np.loadtxt('chairs_split.txt', dtype=np.int32)
        for i in range(len(flows)):
            if split_list[i] == wanted_id:
                self.flow_list += [ flows[i] ]
                # Images are stored as consecutive (first, second) pairs.
                self.image_list += [ [images[2*i], images[2*i+1]] ]
def fetch_dataloader(args, TRAIN_DS='C+T+K+S+H'):
    """ Create the data loader for the corresponding training set

    Args:
        args: namespace providing ``stage`` (``'chairs'``, ``'things'``,
            ``'sintel'`` or ``'kitti'``), ``image_size`` (used as crop size)
            and ``batch_size``.
        TRAIN_DS: dataset mix used in the ``'sintel'`` stage, either
            ``'C+T+K+S+H'`` or ``'C+T+K/S'``.

    Returns:
        A shuffling ``DataLoader`` over the selected training dataset
        (4 workers, incomplete last batch dropped).

    Raises:
        ValueError: for an unknown ``args.stage`` or, in the ``'sintel'``
            stage, an unknown ``TRAIN_DS``. These cases used to fall through
            to an ``UnboundLocalError`` on ``train_dataset``.
    """
    if args.stage == 'chairs':
        aug_params = {'crop_size': args.image_size, 'min_scale': -0.1, 'max_scale': 1.0, 'do_flip': True}
        train_dataset = FlyingChairs(aug_params, split='training')

    elif args.stage == 'things':
        aug_params = {'crop_size': args.image_size, 'min_scale': -0.4, 'max_scale': 0.8, 'do_flip': True}
        clean_dataset = FlyingThings3D(aug_params, dstype='frames_cleanpass')
        final_dataset = FlyingThings3D(aug_params, dstype='frames_finalpass')
        train_dataset = clean_dataset + final_dataset

    elif args.stage == 'sintel':
        # Validate before the datasets are built, so a typo fails fast.
        if TRAIN_DS not in ('C+T+K+S+H', 'C+T+K/S'):
            raise ValueError("Unknown TRAIN_DS for the sintel stage: %r" % (TRAIN_DS,))

        aug_params = {'crop_size': args.image_size, 'min_scale': -0.2, 'max_scale': 0.6, 'do_flip': True}
        things = FlyingThings3D(aug_params, dstype='frames_cleanpass')
        sintel_clean = MpiSintel(aug_params, split='training', dstype='clean')
        sintel_final = MpiSintel(aug_params, split='training', dstype='final')

        if TRAIN_DS == 'C+T+K+S+H':
            kitti = KITTI({'crop_size': args.image_size, 'min_scale': -0.3, 'max_scale': 0.5, 'do_flip': True})
            hd1k = HD1K({'crop_size': args.image_size, 'min_scale': -0.5, 'max_scale': 0.2, 'do_flip': True})
            # Integer factors repeat a dataset's samples to re-weight the mix.
            train_dataset = 100*sintel_clean + 100*sintel_final + 200*kitti + 5*hd1k + things
        else:
            train_dataset = 100*sintel_clean + 100*sintel_final + things

    elif args.stage == 'kitti':
        aug_params = {'crop_size': args.image_size, 'min_scale': -0.2, 'max_scale': 0.4, 'do_flip': False}
        train_dataset = KITTI(aug_params, split='training')

    else:
        raise ValueError("Unknown training stage: %r" % (args.stage,))

    train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size,
        pin_memory=False, shuffle=True, num_workers=4, drop_last=True)

    print('Training with %d image pairs' % len(train_dataset))
    return train_loader
class BottleneckBlock(nn.Module):
    """Residual bottleneck: 1x1 reduce -> 3x3 (optionally strided) -> 1x1 expand.

    The two inner convolutions work on ``planes // 4`` channels. When
    ``stride != 1`` the shortcut is a strided 1x1 convolution followed by its
    own normalisation layer; otherwise the input is added unchanged.
    ``norm_fn`` selects the normalisation: ``'group'``, ``'batch'``,
    ``'instance'`` or ``'none'``.
    """

    def __init__(self, in_planes, planes, norm_fn='group', stride=1):
        super(BottleneckBlock, self).__init__()

        inner = planes // 4
        self.conv1 = nn.Conv2d(in_planes, inner, kernel_size=1, padding=0)
        self.conv2 = nn.Conv2d(inner, inner, kernel_size=3, padding=1, stride=stride)
        self.conv3 = nn.Conv2d(inner, planes, kernel_size=1, padding=0)
        self.relu = nn.ReLU(inplace=True)

        num_groups = planes // 8
        strided = stride != 1

        # One factory per supported normalisation, keyed by ``norm_fn``.
        norm_factories = {
            'group': lambda channels: nn.GroupNorm(num_groups=num_groups, num_channels=channels),
            'batch': lambda channels: nn.BatchNorm2d(channels),
            'instance': lambda channels: nn.InstanceNorm2d(channels),
            'none': lambda channels: nn.Sequential(),
        }
        make_norm = norm_factories.get(norm_fn)
        if make_norm is not None:
            self.norm1 = make_norm(inner)
            self.norm2 = make_norm(inner)
            self.norm3 = make_norm(planes)
            if strided:
                self.norm4 = make_norm(planes)

        if strided:
            # norm4 is shared between the attribute and the shortcut branch.
            self.downsample = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm4)
        else:
            self.downsample = None

    def forward(self, x):
        stages = (
            (self.conv1, self.norm1),
            (self.conv2, self.norm2),
            (self.conv3, self.norm3),
        )
        y = x
        for conv, norm in stages:
            y = self.relu(norm(conv(y)))

        if self.downsample is not None:
            x = self.downsample(x)

        return self.relu(x + y)
class SmallEncoder(nn.Module):
    """Lightweight convolutional encoder built from ``BottleneckBlock`` pairs.

    Layout: 7x7 stride-2 stem (32 channels) -> three stages of two bottleneck
    blocks each (32, 64, 96 channels with strides 1, 2, 2) -> 1x1 convolution
    to ``output_dim`` channels -> optional ``Dropout2d``.

    Args:
        output_dim: Number of channels produced by the final 1x1 convolution.
        norm_fn: One of ``'group'``, ``'batch'``, ``'instance'`` or ``'none'``;
            used for the stem norm and passed to every bottleneck block.
        dropout: Dropout probability; a ``Dropout2d`` layer is only created
            when this is greater than 0.
    """

    def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0):
        super(SmallEncoder, self).__init__()
        self.norm_fn = norm_fn

        if self.norm_fn == 'group':
            self.norm1 = nn.GroupNorm(num_groups=8, num_channels=32)

        elif self.norm_fn == 'batch':
            self.norm1 = nn.BatchNorm2d(32)

        elif self.norm_fn == 'instance':
            self.norm1 = nn.InstanceNorm2d(32)

        elif self.norm_fn == 'none':
            self.norm1 = nn.Sequential()

        self.conv1 = nn.Conv2d(3, 32, kernel_size=7, stride=2, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        # _make_layer reads and updates in_planes, so the stages chain.
        self.in_planes = 32
        self.layer1 = self._make_layer(32, stride=1)
        self.layer2 = self._make_layer(64, stride=2)
        self.layer3 = self._make_layer(96, stride=2)

        self.dropout = None
        if dropout > 0:
            self.dropout = nn.Dropout2d(p=dropout)

        self.conv2 = nn.Conv2d(96, output_dim, kernel_size=1)

        # Kaiming init for convolutions; affine norm layers start as identity.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _make_layer(self, dim, stride=1):
        """Build one stage: a (possibly strided) block followed by a stride-1 block."""
        layer1 = BottleneckBlock(self.in_planes, dim, self.norm_fn, stride=stride)
        layer2 = BottleneckBlock(dim, dim, self.norm_fn, stride=1)
        layers = (layer1, layer2)

        self.in_planes = dim
        return nn.Sequential(*layers)

    def forward(self, x):
        """Encode a tensor, or a list/tuple of tensors in one batched pass.

        Args:
            x: A single image batch, or a list/tuple of image batches. A
                list/tuple is concatenated along the batch dimension, encoded
                once, and split back.

        Returns:
            The feature tensor, or (for list/tuple input) a tuple with one
            feature tensor per input, in the same order.
        """
        is_list = isinstance(x, (tuple, list))
        if is_list:
            # Remember every input's batch size so the output can be split
            # back correctly for any number of inputs, not only for exactly
            # two equally sized ones.
            batch_sizes = [item.shape[0] for item in x]
            x = torch.cat(x, dim=0)

        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu1(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.conv2(x)

        if self.training and self.dropout is not None:
            x = self.dropout(x)

        if is_list:
            x = torch.split(x, batch_sizes, dim=0)

        return x
class RAFT(nn.Module):
    """Iterative optical-flow estimator built from encoders, a correlation block and an update block.

    ``args`` is read for ``small``, ``mixed_precision`` and, if present,
    ``dropout`` and ``alternate_corr``. The constructor also WRITES to
    ``args``: it sets ``corr_levels`` and ``corr_radius`` and fills in
    ``dropout`` / ``alternate_corr`` defaults when they are missing.
    """

    def __init__(self, args):
        super(RAFT, self).__init__()
        self.args = args

        # Hidden/context widths and correlation radius depend on the model size.
        if args.small:
            self.hidden_dim = hdim = 96
            self.context_dim = cdim = 64
            args.corr_levels = 4
            args.corr_radius = 3

        else:
            self.hidden_dim = hdim = 128
            self.context_dim = cdim = 128
            args.corr_levels = 4
            args.corr_radius = 4

        # `in` is used on args, so args must support membership tests
        # (e.g. argparse.Namespace) — NOTE(review): confirm against callers.
        if 'dropout' not in self.args:
            self.args.dropout = 0

        if 'alternate_corr' not in self.args:
            self.args.alternate_corr = False

        # feature network, context network, and update block
        if args.small:
            self.fnet = SmallEncoder(output_dim=128, norm_fn='instance', dropout=args.dropout)
            self.cnet = SmallEncoder(output_dim=hdim+cdim, norm_fn='none', dropout=args.dropout)
            self.update_block = SmallUpdateBlock(self.args, hidden_dim=hdim)

        else:
            self.fnet = BasicEncoder(output_dim=256, norm_fn='instance', dropout=args.dropout)
            self.cnet = BasicEncoder(output_dim=hdim+cdim, norm_fn='batch', dropout=args.dropout)
            self.update_block = BasicUpdateBlock(self.args, hidden_dim=hdim)

    def freeze_bn(self):
        """Put every BatchNorm2d submodule into eval mode."""
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()

    def initialize_flow(self, img):
        """Return two identical coordinate grids at 1/8 of the input resolution.

        Flow is represented as the difference between the two grids:
        ``flow = coords1 - coords0``. Only ``img.shape`` and ``img.device``
        are read.
        """
        N, C, H, W = img.shape
        coords0 = coords_grid(N, H//8, W//8, device=img.device)
        coords1 = coords_grid(N, H//8, W//8, device=img.device)

        # optical flow computed as difference: flow = coords1 - coords0
        return coords0, coords1

    def upsample_flow(self, flow, mask):
        """Upsample a flow field by 8x using a convex combination.

        ``flow`` is unpacked as ``(N, _, H, W)`` and the result is reshaped to
        ``(N, 2, 8*H, 8*W)``. ``mask`` is viewed as ``(N, 1, 9, 8, 8, H, W)``
        and soft-maxed over the 9-element axis, which weights the 3x3
        neighbourhood of each coarse pixel.
        """
        N, _, H, W = flow.shape
        mask = mask.view(N, 1, 9, 8, 8, H, W)
        mask = torch.softmax(mask, dim=2)

        # Flow values are scaled by 8 to match the upsampled resolution, then
        # the 3x3 neighbourhood of every coarse pixel is gathered.
        up_flow = F.unfold(8 * flow, [3,3], padding=1)
        up_flow = up_flow.view(N, 2, 9, 1, 1, H, W)

        up_flow = torch.sum(mask * up_flow, dim=2)
        # Interleave each 8x8 sub-grid with its coarse position: (N, 2, H, 8, W, 8).
        up_flow = up_flow.permute(0, 1, 4, 2, 5, 3)
        return up_flow.reshape(N, 2, 8*H, 8*W)

    def forward(self, image1, image2, iters=12, flow_init=None, upsample=True, test_mode=False):
        """Estimate optical flow between a pair of frames.

        Args:
            image1, image2: Input frames; they are rescaled with
                ``2 * (x / 255.0) - 1.0`` before encoding.
            iters: Number of refinement iterations.
            flow_init: Optional initial flow added to the target grid.
            upsample: Not read anywhere in this method.
            test_mode: Selects the return value (see below).

        Returns:
            If ``test_mode`` is true: ``(coords1 - coords0, flow_up)``, i.e.
            the final low-resolution flow and the last upsampled flow.
            Otherwise: the list of upsampled flows, one per iteration.

        NOTE(review): ``flow_up`` is only assigned inside the loop, so
        ``iters=0`` with ``test_mode=True`` would fail on the return.
        """

        image1 = 2 * (image1 / 255.0) - 1.0
        image2 = 2 * (image2 / 255.0) - 1.0

        image1 = image1.contiguous()
        image2 = image2.contiguous()

        hdim = self.hidden_dim
        cdim = self.context_dim

        # run the feature network
        with autocast(enabled=self.args.mixed_precision):
            fmap1, fmap2 = self.fnet([image1, image2])

        # Correlation is built from float32 features regardless of autocast.
        fmap1 = fmap1.float()
        fmap2 = fmap2.float()
        if self.args.alternate_corr:
            corr_fn = AlternateCorrBlock(fmap1, fmap2, radius=self.args.corr_radius)
        else:
            corr_fn = CorrBlock(fmap1, fmap2, radius=self.args.corr_radius)

        # run the context network
        with autocast(enabled=self.args.mixed_precision):
            cnet = self.cnet(image1)
            # First hdim channels seed the recurrent state, the remaining
            # cdim channels are the per-iteration context input.
            net, inp = torch.split(cnet, [hdim, cdim], dim=1)
            net = torch.tanh(net)
            inp = torch.relu(inp)

        coords0, coords1 = self.initialize_flow(image1)

        if flow_init is not None:
            coords1 = coords1 + flow_init

        flow_predictions = []
        for itr in range(iters):
            # Detach so gradients do not flow through the coordinates of
            # earlier iterations.
            coords1 = coords1.detach()
            corr = corr_fn(coords1) # index correlation volume

            flow = coords1 - coords0
            with autocast(enabled=self.args.mixed_precision):
                net, up_mask, delta_flow = self.update_block(net, inp, corr, flow)

            # F(t+1) = F(t) + \Delta(t)
            coords1 = coords1 + delta_flow

            # upsample predictions
            if up_mask is None:
                # No learned mask (SmallUpdateBlock returns None): use upflow8.
                flow_up = upflow8(coords1 - coords0)
            else:
                flow_up = self.upsample_flow(coords1 - coords0, up_mask)

            flow_predictions.append(flow_up)

        if test_mode:
            return coords1 - coords0, flow_up

        return flow_predictions
class SepConvGRU(nn.Module):
    """Convolutional GRU with separable kernels: a 1x5 pass followed by a 5x1 pass.

    Args:
        hidden_dim: Channels of the hidden state ``h``.
        input_dim: Channels of the input ``x``.
    """

    def __init__(self, hidden_dim=128, input_dim=192+128):
        super().__init__()
        joint_dim = hidden_dim + input_dim
        row_kernel = dict(kernel_size=(1, 5), padding=(0, 2))
        col_kernel = dict(kernel_size=(5, 1), padding=(2, 0))

        # Gates of the horizontal (1x5) pass.
        self.convz1 = nn.Conv2d(joint_dim, hidden_dim, **row_kernel)
        self.convr1 = nn.Conv2d(joint_dim, hidden_dim, **row_kernel)
        self.convq1 = nn.Conv2d(joint_dim, hidden_dim, **row_kernel)

        # Gates of the vertical (5x1) pass.
        self.convz2 = nn.Conv2d(joint_dim, hidden_dim, **col_kernel)
        self.convr2 = nn.Conv2d(joint_dim, hidden_dim, **col_kernel)
        self.convq2 = nn.Conv2d(joint_dim, hidden_dim, **col_kernel)

    @staticmethod
    def _gru_pass(h, x, update_conv, reset_conv, cand_conv):
        """Apply one GRU update using the given gate convolutions."""
        stacked = torch.cat([h, x], dim=1)
        z = torch.sigmoid(update_conv(stacked))
        r = torch.sigmoid(reset_conv(stacked))
        q = torch.tanh(cand_conv(torch.cat([r*h, x], dim=1)))
        return (1-z) * h + z * q

    def forward(self, h, x):
        """Return the hidden state after the horizontal and then the vertical pass."""
        h = self._gru_pass(h, x, self.convz1, self.convr1, self.convq1)
        return self._gru_pass(h, x, self.convz2, self.convr2, self.convq2)
class BasicMotionEncoder(nn.Module):
    """Fuse correlation features and the current flow into motion features.

    ``args.corr_levels`` and ``args.corr_radius`` determine the number of
    input correlation channels. The output has 128 channels: 126 fused
    feature channels followed by the 2 raw flow channels.
    """

    def __init__(self, args):
        super().__init__()
        window = 2*args.corr_radius + 1
        cor_planes = args.corr_levels * window**2

        # Correlation branch.
        self.convc1 = nn.Conv2d(cor_planes, 256, 1, padding=0)
        self.convc2 = nn.Conv2d(256, 192, 3, padding=1)
        # Flow branch.
        self.convf1 = nn.Conv2d(2, 128, 7, padding=3)
        self.convf2 = nn.Conv2d(128, 64, 3, padding=1)
        # Fusion; two channels are left free for the raw flow appended in forward.
        self.conv = nn.Conv2d(64+192, 128-2, 3, padding=1)

    def forward(self, flow, corr):
        """Return fused motion features concatenated with ``flow`` along dim 1."""
        corr_feat = F.relu(self.convc2(F.relu(self.convc1(corr))))
        flow_feat = F.relu(self.convf2(F.relu(self.convf1(flow))))

        fused = F.relu(self.conv(torch.cat([corr_feat, flow_feat], dim=1)))
        return torch.cat([fused, flow], dim=1)
self.encoder(flow, corr) inp = torch.cat([inp, motion_features], dim=1) net = self.gru(net, inp) delta_flow = self.flow_head(net) # scale mask to balence gradients mask = .25 * self.mask(net) return net, mask, delta_flow ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/RAFT/core/utils_core/__init__.py ================================================ ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/RAFT/core/utils_core/augmentor.py ================================================ import numpy as np import random import math from PIL import Image import cv2 cv2.setNumThreads(0) cv2.ocl.setUseOpenCL(False) import torch from torchvision.transforms import ColorJitter import torch.nn.functional as F class FlowAugmentor: def __init__(self, crop_size, min_scale=-0.2, max_scale=0.5, do_flip=True): # spatial augmentation params self.crop_size = crop_size self.min_scale = min_scale self.max_scale = max_scale self.spatial_aug_prob = 0.8 self.stretch_prob = 0.8 self.max_stretch = 0.2 # flip augmentation params self.do_flip = do_flip self.h_flip_prob = 0.5 self.v_flip_prob = 0.1 # photometric augmentation params self.photo_aug = ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.5/3.14) self.asymmetric_color_aug_prob = 0.2 self.eraser_aug_prob = 0.5 def color_transform(self, img1, img2): """ Photometric augmentation """ # asymmetric if np.random.rand() < self.asymmetric_color_aug_prob: img1 = np.array(self.photo_aug(Image.fromarray(img1)), dtype=np.uint8) img2 = np.array(self.photo_aug(Image.fromarray(img2)), dtype=np.uint8) # symmetric else: image_stack = np.concatenate([img1, img2], axis=0) image_stack = np.array(self.photo_aug(Image.fromarray(image_stack)), dtype=np.uint8) img1, img2 = np.split(image_stack, 2, axis=0) return img1, img2 def eraser_transform(self, img1, img2, bounds=[50, 100]): """ Occlusion augmentation """ ht, wd = img1.shape[:2] if np.random.rand() < 
self.eraser_aug_prob: mean_color = np.mean(img2.reshape(-1, 3), axis=0) for _ in range(np.random.randint(1, 3)): x0 = np.random.randint(0, wd) y0 = np.random.randint(0, ht) dx = np.random.randint(bounds[0], bounds[1]) dy = np.random.randint(bounds[0], bounds[1]) img2[y0:y0+dy, x0:x0+dx, :] = mean_color return img1, img2 def spatial_transform(self, img1, img2, flow): # randomly sample scale ht, wd = img1.shape[:2] min_scale = np.maximum( (self.crop_size[0] + 8) / float(ht), (self.crop_size[1] + 8) / float(wd)) scale = 2 ** np.random.uniform(self.min_scale, self.max_scale) scale_x = scale scale_y = scale if np.random.rand() < self.stretch_prob: scale_x *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch) scale_y *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch) scale_x = np.clip(scale_x, min_scale, None) scale_y = np.clip(scale_y, min_scale, None) if np.random.rand() < self.spatial_aug_prob: # rescale the images img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) flow = cv2.resize(flow, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) flow = flow * [scale_x, scale_y] if self.do_flip: if np.random.rand() < self.h_flip_prob: # h-flip img1 = img1[:, ::-1] img2 = img2[:, ::-1] flow = flow[:, ::-1] * [-1.0, 1.0] if np.random.rand() < self.v_flip_prob: # v-flip img1 = img1[::-1, :] img2 = img2[::-1, :] flow = flow[::-1, :] * [1.0, -1.0] y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0]) x0 = np.random.randint(0, img1.shape[1] - self.crop_size[1]) img1 = img1[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] img2 = img2[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] flow = flow[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] return img1, img2, flow def __call__(self, img1, img2, flow): img1, img2 = self.color_transform(img1, img2) img1, img2 = self.eraser_transform(img1, img2) img1, img2, flow = 
class SparseFlowAugmentor:
    """Photometric, occlusion and spatial augmentation for sparse flow data.

    Unlike dense flow, the flow map comes with a ``valid`` mask, so resizing
    scatters the valid samples to their new positions instead of
    interpolating.

    Args:
        crop_size: ``(height, width)`` of the final crop.
        min_scale, max_scale: Bounds of the exponent; the sampled scale is
            ``2 ** uniform(min_scale, max_scale)``.
        do_flip: Enable random horizontal flipping.
    """

    def __init__(self, crop_size, min_scale=-0.2, max_scale=0.5, do_flip=False):
        # spatial augmentation params
        self.crop_size = crop_size
        self.min_scale = min_scale
        self.max_scale = max_scale
        self.spatial_aug_prob = 0.8
        self.stretch_prob = 0.8
        self.max_stretch = 0.2

        # flip augmentation params
        self.do_flip = do_flip
        self.h_flip_prob = 0.5
        self.v_flip_prob = 0.1

        # photometric augmentation params
        self.photo_aug = ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.3/3.14)
        self.asymmetric_color_aug_prob = 0.2
        self.eraser_aug_prob = 0.5

    def color_transform(self, img1, img2):
        """Apply the same colour jitter to both images.

        The images are stacked along axis 0 so that one ``photo_aug`` call
        perturbs both identically, then split back into two halves.
        """
        image_stack = np.concatenate([img1, img2], axis=0)
        image_stack = np.array(self.photo_aug(Image.fromarray(image_stack)), dtype=np.uint8)
        img1, img2 = np.split(image_stack, 2, axis=0)
        return img1, img2

    def eraser_transform(self, img1, img2):
        """With probability ``eraser_aug_prob``, paint 1-2 rectangles of ``img2`` with its mean colour.

        ``img2`` is modified in place; ``img1`` is returned untouched.
        """
        ht, wd = img1.shape[:2]
        if np.random.rand() < self.eraser_aug_prob:
            mean_color = np.mean(img2.reshape(-1, 3), axis=0)
            for _ in range(np.random.randint(1, 3)):
                x0 = np.random.randint(0, wd)
                y0 = np.random.randint(0, ht)
                dx = np.random.randint(50, 100)
                dy = np.random.randint(50, 100)
                img2[y0:y0+dy, x0:x0+dx, :] = mean_color

        return img1, img2

    def resize_sparse_flow_map(self, flow, valid, fx=1.0, fy=1.0):
        """Rescale a sparse flow map by scattering its valid samples.

        Args:
            flow: Array of shape ``(ht, wd, 2)``.
            valid: Array of shape ``(ht, wd)``; entries ``>= 1`` mark valid flow.
            fx, fy: Horizontal / vertical scale factors.

        Returns:
            ``(flow_img, valid_img)``: a float32 flow map of shape
            ``(round(ht*fy), round(wd*fx), 2)`` and an int32 mask of the same
            spatial size, set to 1 where a sample landed.
        """
        ht, wd = flow.shape[:2]
        coords = np.meshgrid(np.arange(wd), np.arange(ht))
        coords = np.stack(coords, axis=-1)

        coords = coords.reshape(-1, 2).astype(np.float32)
        flow = flow.reshape(-1, 2).astype(np.float32)
        valid = valid.reshape(-1).astype(np.float32)

        coords0 = coords[valid>=1]
        flow0 = flow[valid>=1]

        ht1 = int(round(ht * fy))
        wd1 = int(round(wd * fx))

        # Both positions and flow vectors scale with the resize factors.
        coords1 = coords0 * [fx, fy]
        flow1 = flow0 * [fx, fy]

        xx = np.round(coords1[:,0]).astype(np.int32)
        yy = np.round(coords1[:,1]).astype(np.int32)

        # Keep every sample that lands inside the resized image. Index 0 is a
        # valid row/column, so the lower bound is inclusive; a strict `> 0`
        # silently drops all samples of the first row and first column.
        v = (xx >= 0) & (xx < wd1) & (yy >= 0) & (yy < ht1)
        xx = xx[v]
        yy = yy[v]
        flow1 = flow1[v]

        flow_img = np.zeros([ht1, wd1, 2], dtype=np.float32)
        valid_img = np.zeros([ht1, wd1], dtype=np.int32)

        flow_img[yy, xx] = flow1
        valid_img[yy, xx] = 1

        return flow_img, valid_img

    def spatial_transform(self, img1, img2, flow, valid):
        """Randomly rescale, optionally h-flip, and crop to ``crop_size``."""
        # randomly sample scale

        ht, wd = img1.shape[:2]
        # Smallest scale that keeps the image at least one pixel larger than the crop.
        min_scale = np.maximum(
            (self.crop_size[0] + 1) / float(ht),
            (self.crop_size[1] + 1) / float(wd))

        scale = 2 ** np.random.uniform(self.min_scale, self.max_scale)
        scale_x = np.clip(scale, min_scale, None)
        scale_y = np.clip(scale, min_scale, None)

        if np.random.rand() < self.spatial_aug_prob:
            # rescale the images
            img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
            img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
            flow, valid = self.resize_sparse_flow_map(flow, valid, fx=scale_x, fy=scale_y)

        if self.do_flip:
            if np.random.rand() < 0.5: # h-flip
                img1 = img1[:, ::-1]
                img2 = img2[:, ::-1]
                # Mirroring the image also negates the horizontal flow component.
                flow = flow[:, ::-1] * [-1.0, 1.0]
                valid = valid[:, ::-1]

        margin_y = 20
        margin_x = 50

        # Sample the crop origin from a range widened by the margins, then
        # clamp it back into the image.
        y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0] + margin_y)
        x0 = np.random.randint(-margin_x, img1.shape[1] - self.crop_size[1] + margin_x)

        y0 = np.clip(y0, 0, img1.shape[0] - self.crop_size[0])
        x0 = np.clip(x0, 0, img1.shape[1] - self.crop_size[1])

        img1 = img1[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]]
        img2 = img2[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]]
        flow = flow[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]]
        valid = valid[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]]
        return img1, img2, flow, valid

    def __call__(self, img1, img2, flow, valid):
        """Run colour, eraser and spatial augmentation; return contiguous arrays."""
        img1, img2 = self.color_transform(img1, img2)
        img1, img2 = self.eraser_transform(img1, img2)
        img1, img2, flow, valid = self.spatial_transform(img1, img2, flow, valid)

        img1 = np.ascontiguousarray(img1)
        img2 = np.ascontiguousarray(img2)
        flow = np.ascontiguousarray(flow)
        valid = np.ascontiguousarray(valid)

        return img1, img2, flow, valid
def make_colorwheel():
    """
    Build the colour wheel used for optical flow visualisation, as presented in:
        Baker et al. "A Database and Evaluation Methodology for Optical Flow" (ICCV, 2007)
        URL: http://vision.middlebury.edu/flow/flowEval-iccv07.pdf

    The wheel is the concatenation of six colour transitions
    (RY, YG, GC, CB, BM, MR). Within each transition one channel is held at
    255 while a second channel ramps up or down in equal floor-rounded steps.

    Returns:
        np.ndarray: Color wheel of shape [55, 3] with values in [0, 255]
    """
    # (length, channel held at 255, ramping channel, ramp goes upwards?)
    transitions = (
        (15, 0, 1, True),    # RY
        (6, 1, 0, False),    # YG
        (4, 1, 2, True),     # GC
        (11, 2, 1, False),   # CB
        (13, 2, 0, True),    # BM
        (6, 0, 2, False),    # MR
    )

    ncols = sum(length for length, _, _, _ in transitions)
    colorwheel = np.zeros((ncols, 3))

    begin = 0
    for length, held, ramped, rising in transitions:
        end = begin + length
        ramp = np.floor(255*np.arange(0, length)/length)
        colorwheel[begin:end, held] = 255
        colorwheel[begin:end, ramped] = ramp if rising else 255 - ramp
        begin = end
    return colorwheel
def flow_to_image(flow_uv, clip_flow=None, convert_to_bgr=False):
    """
    Convert a two-channel flow field into a colour image.

    The flow is optionally clipped, normalised by its largest magnitude and
    then mapped to colours by ``flow_uv_to_colors``.

    Args:
        flow_uv (np.ndarray): Flow UV image of shape [H,W,2]
        clip_flow (float, optional): Largest absolute value kept per flow
            component; values are clipped to [-clip_flow, clip_flow].
            Defaults to None (no clipping).
        convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.

    Returns:
        np.ndarray: Flow visualization image of shape [H,W,3]
    """
    assert flow_uv.ndim == 3, 'input flow must have three dimensions'
    assert flow_uv.shape[2] == 2, 'input flow must have shape [H,W,2]'
    if clip_flow is not None:
        # Clip symmetrically: flow components are signed, so a lower bound of
        # 0 would erase every leftward/upward motion instead of limiting it.
        limit = abs(clip_flow)
        flow_uv = np.clip(flow_uv, -limit, limit)
    u = flow_uv[:,:,0]
    v = flow_uv[:,:,1]
    rad = np.sqrt(np.square(u) + np.square(v))
    rad_max = np.max(rad)
    # epsilon keeps the division finite for an all-zero flow field.
    epsilon = 1e-5
    u = u / (rad_max + epsilon)
    v = v / (rad_max + epsilon)
    return flow_uv_to_colors(u, v, convert_to_bgr)
def readPFM(file):
    """Read a PFM (portable float map) image.

    The header consists of three text lines: the type (``PF`` for colour,
    ``Pf`` for greyscale), ``"<width> <height>"``, and a scale whose sign
    encodes the byte order (negative means little-endian). The rest of the
    file is raw float32 data.

    Args:
        file: Path of the file to read.

    Returns:
        np.ndarray of shape ``(height, width, 3)`` for colour files or
        ``(height, width)`` for greyscale files, flipped vertically relative
        to the order in which rows are stored in the file.

    Raises:
        Exception: If the type line is neither ``PF`` nor ``Pf`` or the
            dimension line is malformed.
    """
    # The context manager closes the handle on every path, including the
    # header-validation errors raised below.
    with open(file, 'rb') as stream:
        header = stream.readline().rstrip()
        if header == b'PF':
            color = True
        elif header == b'Pf':
            color = False
        else:
            raise Exception('Not a PFM file.')

        dim_match = re.match(rb'^(\d+)\s(\d+)\s$', stream.readline())
        if dim_match:
            width, height = map(int, dim_match.groups())
        else:
            raise Exception('Malformed PFM header.')

        # Only the sign of the scale line is used: it selects the byte order.
        scale = float(stream.readline().rstrip())
        endian = '<' if scale < 0 else '>'

        data = np.fromfile(stream, endian + 'f')

    shape = (height, width, 3) if color else (height, width)

    data = np.reshape(data, shape)
    data = np.flipud(data)
    return data
""" nBands = 2 if v is None: assert(uv.ndim == 3) assert(uv.shape[2] == 2) u = uv[:,:,0] v = uv[:,:,1] else: u = uv assert(u.shape == v.shape) height,width = u.shape f = open(filename,'wb') # write the header f.write(TAG_CHAR) np.array(width).astype(np.int32).tofile(f) np.array(height).astype(np.int32).tofile(f) # arrange into matrix form tmp = np.zeros((height, width*nBands)) tmp[:,np.arange(width)*2] = u tmp[:,np.arange(width)*2 + 1] = v tmp.astype(np.float32).tofile(f) f.close() def readFlowKITTI(filename): flow = cv2.imread(filename, cv2.IMREAD_ANYDEPTH|cv2.IMREAD_COLOR) flow = flow[:,:,::-1].astype(np.float32) flow, valid = flow[:, :, :2], flow[:, :, 2] flow = (flow - 2**15) / 64.0 return flow, valid def readDispKITTI(filename): disp = cv2.imread(filename, cv2.IMREAD_ANYDEPTH) / 256.0 valid = disp > 0.0 flow = np.stack([-disp, np.zeros_like(disp)], -1) return flow, valid def writeFlowKITTI(filename, uv): uv = 64.0 * uv + 2**15 valid = np.ones([uv.shape[0], uv.shape[1], 1]) uv = np.concatenate([uv, valid], axis=-1).astype(np.uint16) cv2.imwrite(filename, uv[..., ::-1]) def read_gen(file_name, pil=False): ext = splitext(file_name)[-1] if ext == '.png' or ext == '.jpeg' or ext == '.ppm' or ext == '.jpg': return Image.open(file_name) elif ext == '.bin' or ext == '.raw': return np.load(file_name) elif ext == '.flo': return readFlow(file_name).astype(np.float32) elif ext == '.pfm': flow = readPFM(file_name).astype(np.float32) if len(flow.shape) == 2: return flow else: return flow[:, :, :-1] return [] ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/RAFT/core/utils_core/utils.py ================================================ import torch import torch.nn.functional as F import numpy as np from scipy import interpolate class InputPadder: """ Pads images such that dimensions are divisible by 8 """ def __init__(self, dims, mode='sintel'): self.ht, self.wd = dims[-2:] pad_ht = (((self.ht // 8) + 1) * 8 - self.ht) % 8 pad_wd = 
def bilinear_sampler(img, coords, mode='bilinear', mask=False):
    """Wrapper for ``grid_sample`` that takes pixel coordinates.

    Args:
        img: Tensor whose last two dimensions are ``(H, W)``.
        coords: Tensor whose last dimension holds ``(x, y)`` pixel
            coordinates; they are mapped to ``[-1, 1]`` so that 0 and
            ``W-1`` / ``H-1`` hit the corner pixel centres
            (``align_corners=True``).
        mode: Interpolation mode forwarded to ``F.grid_sample``.
        mask: When true, also return a float mask that is 1 where the
            normalised coordinate lies strictly inside ``(-1, 1)`` on both axes.

    Returns:
        The sampled tensor, or ``(sampled, mask)`` when ``mask`` is true.
    """
    H, W = img.shape[-2:]
    xgrid, ygrid = coords.split([1,1], dim=-1)
    xgrid = 2*xgrid/(W-1) - 1
    ygrid = 2*ygrid/(H-1) - 1

    grid = torch.cat([xgrid, ygrid], dim=-1)
    # Forward the caller's interpolation mode; without it the argument would
    # be accepted but have no effect.
    img = F.grid_sample(img, grid, mode=mode, align_corners=True)

    if mask:
        in_bounds = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1)
        return img, in_bounds.float()

    return img
@lru_cache()
def bytes_to_unicode():
    """
    Returns a dict mapping every byte value (0-255) to a unicode character.

    The reversible bpe codes work on unicode strings, so each byte needs a
    character of its own, and the bpe code barfs on whitespace/control
    characters. Bytes that already are printable characters map to
    themselves; every other byte is assigned the next free code point
    starting at 256.
    """
    printable = (
        list(range(ord("!"), ord("~")+1))
        + list(range(ord("¡"), ord("¬")+1))
        + list(range(ord("®"), ord("ÿ")+1))
    )
    byte_values = list(printable)
    code_points = list(printable)

    shifted = 0
    for byte in range(2**8):
        if byte in printable:
            continue
        # Remap this byte to a code point above the byte range.
        byte_values.append(byte)
        code_points.append(2**8 + shifted)
        shifted += 1

    return dict(zip(byte_values, map(chr, code_points)))
""" pairs = set() prev_char = word[0] for char in word[1:]: pairs.add((prev_char, char)) prev_char = char return pairs def basic_clean(text): text = ftfy.fix_text(text) text = html.unescape(html.unescape(text)) return text.strip() def whitespace_clean(text): text = re.sub(r'\s+', ' ', text) text = text.strip() return text class SimpleTokenizer(object): def __init__(self, bpe_path: str = default_bpe()): self.byte_encoder = bytes_to_unicode() self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} merges = gzip.open(bpe_path).read().decode("utf-8").split('\n') merges = merges[1:49152-256-2+1] merges = [tuple(merge.split()) for merge in merges] vocab = list(bytes_to_unicode().values()) vocab = vocab + [v+'' for v in vocab] for merge in merges: vocab.append(''.join(merge)) vocab.extend(['<|startoftext|>', '<|endoftext|>']) self.encoder = dict(zip(vocab, range(len(vocab)))) self.decoder = {v: k for k, v in self.encoder.items()} self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) def bpe(self, token): if token in self.cache: return self.cache[token] word = tuple(token[:-1]) + ( token[-1] + '',) pairs = get_pairs(word) if not pairs: return token+'' while True: bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) if bigram not in self.bpe_ranks: break first, second = bigram new_word = [] i = 0 while i < len(word): try: j = word.index(first, i) new_word.extend(word[i:j]) i = j except: new_word.extend(word[i:]) break if word[i] == first and i < len(word)-1 and word[i+1] == second: new_word.append(first+second) i += 2 else: new_word.append(word[i]) i += 1 new_word = tuple(new_word) word = new_word if len(word) == 1: break else: pairs = get_pairs(word) word = ' '.join(word) self.cache[token] = word return word def 
encode(self, text): bpe_tokens = [] text = whitespace_clean(basic_clean(text)).lower() for token in re.findall(self.pat, text): token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) return bpe_tokens def decode(self, tokens): text = ''.join([self.decoder[token] for token in tokens]) text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('', ' ') return text ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/ViCLIP/viclip.py ================================================ import os import logging import torch from einops import rearrange from torch import nn import math from .simple_tokenizer import SimpleTokenizer as _Tokenizer from .viclip_vision import clip_joint_l14 from .viclip_text import clip_text_l14 logger = logging.getLogger(__name__) class ViCLIP(nn.Module): """docstring for ViCLIP""" def __init__(self, tokenizer=None, pretrain=os.path.join(os.path.dirname(os.path.abspath(__file__)), "ViClip-InternVid-10M-FLT.pth"), freeze_text=True): super(ViCLIP, self).__init__() if tokenizer: self.tokenizer = tokenizer else: self.tokenizer = _Tokenizer() self.max_txt_l = 32 self.vision_encoder_name = 'vit_l14' self.vision_encoder_pretrained = False self.inputs_image_res = 224 self.vision_encoder_kernel_size = 1 self.vision_encoder_center = True self.video_input_num_frames = 8 self.vision_encoder_drop_path_rate = 0.1 self.vision_encoder_checkpoint_num = 24 self.is_pretrain = pretrain self.vision_width = 1024 self.text_width = 768 self.embed_dim = 768 self.masking_prob = 0.9 self.text_encoder_name = 'vit_l14' self.text_encoder_pretrained = False#'bert-base-uncased' self.text_encoder_d_model = 768 self.text_encoder_vocab_size = 49408 # create modules. 
self.vision_encoder = self.build_vision_encoder() self.text_encoder = self.build_text_encoder() self.temp = nn.parameter.Parameter(torch.ones([]) * 1 / 100.0) self.temp_min = 1 / 100.0 if pretrain: logger.info(f"Load pretrained weights from {pretrain}") state_dict = torch.load(pretrain, map_location='cpu')['model'] self.load_state_dict(state_dict) # Freeze weights if freeze_text: self.freeze_text() def freeze_text(self): """freeze text encoder""" for p in self.text_encoder.parameters(): p.requires_grad = False def no_weight_decay(self): ret = {"temp"} ret.update( {"vision_encoder." + k for k in self.vision_encoder.no_weight_decay()} ) ret.update( {"text_encoder." + k for k in self.text_encoder.no_weight_decay()} ) return ret def forward(self, image, text, raw_text, idx, log_generation=None, return_sims=False): """forward and calculate loss. Args: image (torch.Tensor): The input images. Shape: [B,T,C,H,W]. text (dict): TODO idx (torch.Tensor): TODO Returns: TODO """ self.clip_contrastive_temperature() vision_embeds = self.encode_vision(image) text_embeds = self.encode_text(raw_text) if return_sims: sims = torch.nn.functional.normalize(vision_embeds, dim=-1) @ \ torch.nn.functional.normalize(text_embeds, dim=-1).transpose(0, 1) return sims # calculate loss ## VTC loss loss_vtc = self.clip_loss.vtc_loss( vision_embeds, text_embeds, idx, self.temp, all_gather=True ) return dict( loss_vtc=loss_vtc, ) def encode_vision(self, image, test=False): """encode image / videos as features. Args: image (torch.Tensor): The input images. test (bool): Whether testing. Returns: tuple. - vision_embeds (torch.Tensor): The features of all patches. Shape: [B,T,L,C]. - pooled_vision_embeds (torch.Tensor): The pooled features. Shape: [B,T,C]. 
""" if image.ndim == 5: image = image.permute(0, 2, 1, 3, 4).contiguous() else: image = image.unsqueeze(2) if not test and self.masking_prob > 0.0: return self.vision_encoder( image, masking_prob=self.masking_prob ) return self.vision_encoder(image) def encode_text(self, text): """encode text. Args: text (dict): The output of huggingface's `PreTrainedTokenizer`. contains keys: - input_ids (torch.Tensor): Token ids to be fed to a model. Shape: [B,L]. - attention_mask (torch.Tensor): The mask indicate padded tokens. Shape: [B,L]. 0 is padded token. - other keys refer to "https://huggingface.co/docs/transformers/v4.21.2/en/main_classes/tokenizer#transformers.PreTrainedTokenizer.__call__". Returns: tuple. - text_embeds (torch.Tensor): The features of all tokens. Shape: [B,L,C]. - pooled_text_embeds (torch.Tensor): The pooled features. Shape: [B,C]. """ device = next(self.text_encoder.parameters()).device text = self.text_encoder.tokenize( text, context_length=self.max_txt_l ).to(device) text_embeds = self.text_encoder(text) return text_embeds @torch.no_grad() def clip_contrastive_temperature(self, min_val=0.001, max_val=0.5): """Seems only used during pre-training""" self.temp.clamp_(min=self.temp_min) def build_vision_encoder(self): """build vision encoder Returns: (vision_encoder, vision_layernorm). Each is a `nn.Module`. """ encoder_name = self.vision_encoder_name if encoder_name != "vit_l14": raise ValueError(f"Not implemented: {encoder_name}") vision_encoder = clip_joint_l14( pretrained=self.vision_encoder_pretrained, input_resolution=self.inputs_image_res, kernel_size=self.vision_encoder_kernel_size, center=self.vision_encoder_center, num_frames=self.video_input_num_frames, drop_path=self.vision_encoder_drop_path_rate, checkpoint_num=self.vision_encoder_checkpoint_num, ) return vision_encoder def build_text_encoder(self): """build text_encoder and possiblly video-to-text multimodal fusion encoder. Returns: nn.Module. 
The text encoder """ encoder_name = self.text_encoder_name if encoder_name != "vit_l14": raise ValueError(f"Not implemented: {encoder_name}") text_encoder = clip_text_l14( pretrained=self.text_encoder_pretrained, embed_dim=self.text_encoder_d_model, context_length=self.max_txt_l, vocab_size=self.text_encoder_vocab_size, checkpoint_num=0, ) return text_encoder def get_text_encoder(self): """get text encoder, used for text and cross-modal encoding""" encoder = self.text_encoder return encoder.bert if hasattr(encoder, "bert") else encoder def get_text_features(self, input_text, tokenizer, text_feature_dict={}): if input_text in text_feature_dict: return text_feature_dict[input_text] text_template= f"{input_text}" with torch.no_grad(): # text_token = tokenizer.encode(text_template).cuda() text_features = self.encode_text(text_template).float() text_features /= text_features.norm(dim=-1, keepdim=True) text_feature_dict[input_text] = text_features return text_features def get_vid_features(self, input_frames): with torch.no_grad(): clip_feat = self.encode_vision(input_frames,test=True).float() clip_feat /= clip_feat.norm(dim=-1, keepdim=True) return clip_feat def get_predict_label(self, clip_feature, text_feats_tensor, top=5): label_probs = (100.0 * clip_feature @ text_feats_tensor.T).softmax(dim=-1) top_probs, top_labels = label_probs.cpu().topk(top, dim=-1) return top_probs, top_labels ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/ViCLIP/viclip_text.py ================================================ import os import logging from collections import OrderedDict from pkg_resources import packaging from .simple_tokenizer import SimpleTokenizer as _Tokenizer import numpy as np import torch import torch.nn.functional as F from torch import nn import torch.utils.checkpoint as checkpoint import functools logger = logging.getLogger(__name__) MODEL_PATH = 'https://huggingface.co/laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K' _MODELS = { 
"ViT-L/14": os.path.join(MODEL_PATH, "vit_l14_text.pth"), } class LayerNorm(nn.LayerNorm): """Subclass torch's LayerNorm to handle fp16.""" def forward(self, x: torch.Tensor): orig_type = x.dtype ret = super().forward(x.type(torch.float32)) return ret.type(orig_type) class QuickGELU(nn.Module): def forward(self, x: torch.Tensor): return x * torch.sigmoid(1.702 * x) class ResidualAttentionBlock(nn.Module): def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None): super().__init__() self.attn = nn.MultiheadAttention(d_model, n_head) self.ln_1 = LayerNorm(d_model) self.mlp = nn.Sequential(OrderedDict([ ("c_fc", nn.Linear(d_model, d_model * 4)), ("gelu", QuickGELU()), ("c_proj", nn.Linear(d_model * 4, d_model)) ])) self.ln_2 = LayerNorm(d_model) self.attn_mask = attn_mask def attention(self, x: torch.Tensor): self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] def forward(self, x: torch.Tensor): x = x + self.attention(self.ln_1(x)) x = x + self.mlp(self.ln_2(x)) return x class Transformer(nn.Module): def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None, checkpoint_num: int = 0): super().__init__() self.width = width self.layers = layers self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)]) self.checkpoint_num = checkpoint_num def forward(self, x: torch.Tensor): if self.checkpoint_num > 0: segments = min(self.checkpoint_num, len(self.resblocks)) return checkpoint.checkpoint_sequential(self.resblocks, segments, x) else: return self.resblocks(x) class CLIP_TEXT(nn.Module): def __init__( self, embed_dim: int, context_length: int, vocab_size: int, transformer_width: int, transformer_heads: int, transformer_layers: int, checkpoint_num: int, ): super().__init__() self.context_length = context_length self._tokenizer = _Tokenizer() 
self.transformer = Transformer( width=transformer_width, layers=transformer_layers, heads=transformer_heads, attn_mask=self.build_attention_mask(), checkpoint_num=checkpoint_num, ) self.vocab_size = vocab_size self.token_embedding = nn.Embedding(vocab_size, transformer_width) self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width)) self.ln_final = LayerNorm(transformer_width) self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim)) def no_weight_decay(self): return {'token_embedding', 'positional_embedding'} @functools.lru_cache(maxsize=None) def build_attention_mask(self): # lazily create causal attention mask, with full attention between the vision tokens # pytorch uses additive attention mask; fill with -inf mask = torch.empty(self.context_length, self.context_length) mask.fill_(float("-inf")) mask.triu_(1) # zero out the lower diagonal return mask def tokenize(self, texts, context_length=77, truncate=True): """ Returns the tokenized representation of given input string(s) Parameters ---------- texts : Union[str, List[str]] An input string or a list of input strings to tokenize context_length : int The context length to use; all CLIP models use 77 as the context length truncate: bool Whether to truncate the text in case its encoding is longer than the context length Returns ------- A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]. We return LongTensor when torch version is <1.8.0, since older index_select requires indices to be long. 
""" if isinstance(texts, str): texts = [texts] sot_token = self._tokenizer.encoder["<|startoftext|>"] eot_token = self._tokenizer.encoder["<|endoftext|>"] all_tokens = [[sot_token] + self._tokenizer.encode(text) + [eot_token] for text in texts] if packaging.version.parse(torch.__version__) < packaging.version.parse("1.8.0"): result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) else: result = torch.zeros(len(all_tokens), context_length, dtype=torch.int) for i, tokens in enumerate(all_tokens): if len(tokens) > context_length: if truncate: tokens = tokens[:context_length] tokens[-1] = eot_token else: raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}") result[i, :len(tokens)] = torch.tensor(tokens) return result def forward(self, text): x = self.token_embedding(text) # [batch_size, n_ctx, d_model] x = x + self.positional_embedding x = x.permute(1, 0, 2) # NLD -> LND x = self.transformer(x) x = x.permute(1, 0, 2) # LND -> NLD x = self.ln_final(x) # x.shape = [batch_size, n_ctx, transformer.width] # take features from the eot embedding (eot_token is the highest number in each sequence) x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection return x def clip_text_b16( embed_dim=512, context_length=77, vocab_size=49408, transformer_width=512, transformer_heads=8, transformer_layers=12, ): raise NotImplementedError model = CLIP_TEXT( embed_dim, context_length, vocab_size, transformer_width, transformer_heads, transformer_layers ) pretrained = _MODELS["ViT-B/16"] logger.info(f"Load pretrained weights from {pretrained}") state_dict = torch.load(pretrained, map_location='cpu') model.load_state_dict(state_dict, strict=False) return model.eval() def clip_text_l14( embed_dim=768, context_length=77, vocab_size=49408, transformer_width=768, transformer_heads=12, transformer_layers=12, checkpoint_num=0, pretrained=True, ): model = CLIP_TEXT( embed_dim, context_length, vocab_size, transformer_width, 
transformer_heads, transformer_layers, checkpoint_num, ) if pretrained: if isinstance(pretrained, str) and pretrained != "bert-base-uncased": pretrained = _MODELS[pretrained] else: pretrained = _MODELS["ViT-L/14"] logger.info(f"Load pretrained weights from {pretrained}") state_dict = torch.load(pretrained, map_location='cpu') if context_length != state_dict["positional_embedding"].size(0): # assert context_length < state_dict["positional_embedding"].size(0), "Cannot increase context length." print(f"Resize positional embedding from {state_dict['positional_embedding'].size(0)} to {context_length}") if context_length < state_dict["positional_embedding"].size(0): state_dict["positional_embedding"] = state_dict["positional_embedding"][:context_length] else: state_dict["positional_embedding"] = F.pad( state_dict["positional_embedding"], (0, 0, 0, context_length - state_dict["positional_embedding"].size(0)), value=0, ) message = model.load_state_dict(state_dict, strict=False) print(f"Load pretrained weights from {pretrained}: {message}") return model.eval() def clip_text_l14_336( embed_dim=768, context_length=77, vocab_size=49408, transformer_width=768, transformer_heads=12, transformer_layers=12, ): raise NotImplementedError model = CLIP_TEXT( embed_dim, context_length, vocab_size, transformer_width, transformer_heads, transformer_layers ) pretrained = _MODELS["ViT-L/14_336"] logger.info(f"Load pretrained weights from {pretrained}") state_dict = torch.load(pretrained, map_location='cpu') model.load_state_dict(state_dict, strict=False) return model.eval() def build_clip(config): model_cls = config.text_encoder.clip_teacher model = eval(model_cls)() return model ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/ViCLIP/viclip_vision.py ================================================ #!/usr/bin/env python import os import logging from collections import OrderedDict import torch from torch import nn from einops import rearrange 
from timm.models.layers import DropPath from timm.models.registry import register_model import torch.utils.checkpoint as checkpoint logger = logging.getLogger(__name__) def load_temp_embed_with_mismatch(temp_embed_old, temp_embed_new, add_zero=True): """ Add/Remove extra temporal_embeddings as needed. https://arxiv.org/abs/2104.00650 shows adding zero paddings works. temp_embed_old: (1, num_frames_old, 1, d) temp_embed_new: (1, num_frames_new, 1, d) add_zero: bool, if True, add zero, else, interpolate trained embeddings. """ # TODO zero pad num_frms_new = temp_embed_new.shape[1] num_frms_old = temp_embed_old.shape[1] logger.info(f"Load temporal_embeddings, lengths: {num_frms_old}-->{num_frms_new}") if num_frms_new > num_frms_old: if add_zero: temp_embed_new[ :, :num_frms_old ] = temp_embed_old # untrained embeddings are zeros. else: temp_embed_new = interpolate_temporal_pos_embed(temp_embed_old, num_frms_new) elif num_frms_new < num_frms_old: temp_embed_new = temp_embed_old[:, :num_frms_new] else: # = temp_embed_new = temp_embed_old return temp_embed_new MODEL_PATH = 'https://pjlab-gvm-data.oss-cn-shanghai.aliyuncs.com/internvideo/viclip/' _MODELS = { "ViT-L/14": os.path.join(MODEL_PATH, "ViClip-InternVid-10M-FLT.pth"), } class QuickGELU(nn.Module): def forward(self, x): return x * torch.sigmoid(1.702 * x) class ResidualAttentionBlock(nn.Module): def __init__(self, d_model, n_head, drop_path=0., attn_mask=None, dropout=0.): super().__init__() self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity() self.drop_path2 = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() self.attn = nn.MultiheadAttention(d_model, n_head, dropout=dropout) self.ln_1 = nn.LayerNorm(d_model) self.mlp = nn.Sequential(OrderedDict([ ("c_fc", nn.Linear(d_model, d_model * 4)), ("gelu", QuickGELU()), ("drop1", nn.Dropout(dropout)), ("c_proj", nn.Linear(d_model * 4, d_model)), ("drop2", nn.Dropout(dropout)), ])) self.ln_2 = nn.LayerNorm(d_model) self.attn_mask = attn_mask def attention(self, x): self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] def forward(self, x): x = x + self.drop_path1(self.attention(self.ln_1(x))) x = x + self.drop_path2(self.mlp(self.ln_2(x))) return x class Transformer(nn.Module): def __init__(self, width, layers, heads, drop_path=0., checkpoint_num=0, dropout=0.): super().__init__() dpr = [x.item() for x in torch.linspace(0, drop_path, layers)] self.resblocks = nn.ModuleList() for idx in range(layers): self.resblocks.append(ResidualAttentionBlock(width, heads, drop_path=dpr[idx], dropout=dropout)) self.checkpoint_num = checkpoint_num def forward(self, x): for idx, blk in enumerate(self.resblocks): if idx < self.checkpoint_num: x = checkpoint.checkpoint(blk, x) else: x = blk(x) return x class VisionTransformer(nn.Module): def __init__( self, input_resolution, patch_size, width, layers, heads, output_dim=None, kernel_size=1, num_frames=8, drop_path=0, checkpoint_num=0, dropout=0., temp_embed=True, ): super().__init__() self.output_dim = output_dim self.conv1 = nn.Conv3d( 3, width, (kernel_size, patch_size, patch_size), (kernel_size, patch_size, patch_size), (0, 0, 0), bias=False ) scale = width ** -0.5 self.class_embedding = nn.Parameter(scale * torch.randn(width)) self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width)) self.ln_pre = nn.LayerNorm(width) if temp_embed: self.temporal_positional_embedding = nn.Parameter(torch.zeros(1, 
num_frames, width)) self.transformer = Transformer( width, layers, heads, drop_path=drop_path, checkpoint_num=checkpoint_num, dropout=dropout) self.ln_post = nn.LayerNorm(width) if output_dim is not None: self.proj = nn.Parameter(torch.empty(width, output_dim)) else: self.proj = None self.dropout = nn.Dropout(dropout) def get_num_layers(self): return len(self.transformer.resblocks) @torch.jit.ignore def no_weight_decay(self): return {'positional_embedding', 'class_embedding', 'temporal_positional_embedding'} def mask_tokens(self, inputs, masking_prob=0.0): B, L, _ = inputs.shape # This is different from text as we are masking a fix number of tokens Lm = int(masking_prob * L) masked_indices = torch.zeros(B, L) indices = torch.argsort(torch.rand_like(masked_indices), dim=-1)[:, :Lm] batch_indices = ( torch.arange(masked_indices.shape[0]).unsqueeze(-1).expand_as(indices) ) masked_indices[batch_indices, indices] = 1 masked_indices = masked_indices.bool() return inputs[~masked_indices].reshape(B, -1, inputs.shape[-1]) def forward(self, x, masking_prob=0.0): x = self.conv1(x) # shape = [*, width, grid, grid] B, C, T, H, W = x.shape x = x.permute(0, 2, 3, 4, 1).reshape(B * T, H * W, C) x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width] x = x + self.positional_embedding.to(x.dtype) # temporal pos cls_tokens = x[:B, :1, :] x = x[:, 1:] x = rearrange(x, '(b t) n m -> (b n) t m', b=B, t=T) if hasattr(self, 'temporal_positional_embedding'): if x.size(1) == 1: # This is a workaround for unused parameter issue x = x + self.temporal_positional_embedding.mean(1) else: x = x + self.temporal_positional_embedding x = rearrange(x, '(b n) t m -> b (n t) m', b=B, t=T) if masking_prob > 0.0: x = self.mask_tokens(x, masking_prob) x = torch.cat((cls_tokens, x), dim=1) x = self.ln_pre(x) x = x.permute(1, 0, 2) #BND -> NBD x = self.transformer(x) x = self.ln_post(x) if 
self.proj is not None: x = self.dropout(x[0]) @ self.proj else: x = x.permute(1, 0, 2) #NBD -> BND return x def inflate_weight(weight_2d, time_dim, center=True): logger.info(f'Init center: {center}') if center: weight_3d = torch.zeros(*weight_2d.shape) weight_3d = weight_3d.unsqueeze(2).repeat(1, 1, time_dim, 1, 1) middle_idx = time_dim // 2 weight_3d[:, :, middle_idx, :, :] = weight_2d else: weight_3d = weight_2d.unsqueeze(2).repeat(1, 1, time_dim, 1, 1) weight_3d = weight_3d / time_dim return weight_3d def load_state_dict(model, state_dict, input_resolution=224, patch_size=16, center=True): state_dict_3d = model.state_dict() for k in state_dict.keys(): if k in state_dict_3d.keys() and state_dict[k].shape != state_dict_3d[k].shape: if len(state_dict_3d[k].shape) <= 2: logger.info(f'Ignore: {k}') continue logger.info(f'Inflate: {k}, {state_dict[k].shape} => {state_dict_3d[k].shape}') time_dim = state_dict_3d[k].shape[2] state_dict[k] = inflate_weight(state_dict[k], time_dim, center=center) pos_embed_checkpoint = state_dict['positional_embedding'] embedding_size = pos_embed_checkpoint.shape[-1] num_patches = (input_resolution // patch_size) ** 2 orig_size = int((pos_embed_checkpoint.shape[-2] - 1) ** 0.5) new_size = int(num_patches ** 0.5) if orig_size != new_size: logger.info(f'Pos_emb from {orig_size} to {new_size}') extra_tokens = pos_embed_checkpoint[:1] pos_tokens = pos_embed_checkpoint[1:] pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2) pos_tokens = torch.nn.functional.interpolate( pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False) pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(0, 2) new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=0) state_dict['positional_embedding'] = new_pos_embed message = model.load_state_dict(state_dict, strict=False) logger.info(f"Load pretrained weights: {message}") @register_model def clip_joint_b16( pretrained=True, input_resolution=224, 
kernel_size=1, center=True, num_frames=8, drop_path=0. ): model = VisionTransformer( input_resolution=input_resolution, patch_size=16, width=768, layers=12, heads=12, output_dim=512, kernel_size=kernel_size, num_frames=num_frames, drop_path=drop_path, ) raise NotImplementedError if pretrained: logger.info('load pretrained weights') state_dict = torch.load(_MODELS["ViT-B/16"], map_location='cpu') load_state_dict(model, state_dict, input_resolution=input_resolution, patch_size=16, center=center) return model.eval() @register_model def clip_joint_l14( pretrained=False, input_resolution=224, kernel_size=1, center=True, num_frames=8, drop_path=0., checkpoint_num=0, dropout=0., ): model = VisionTransformer( input_resolution=input_resolution, patch_size=14, width=1024, layers=24, heads=16, output_dim=768, kernel_size=kernel_size, num_frames=num_frames, drop_path=drop_path, checkpoint_num=checkpoint_num, dropout=dropout, ) if pretrained: if isinstance(pretrained, str): model_name = pretrained else: model_name = "ViT-L/14" logger.info('load pretrained weights') state_dict = torch.load(_MODELS[model_name], map_location='cpu') load_state_dict(model, state_dict, input_resolution=input_resolution, patch_size=14, center=center) return model.eval() @register_model def clip_joint_l14_336( pretrained=True, input_resolution=336, kernel_size=1, center=True, num_frames=8, drop_path=0. 
): raise NotImplementedError model = VisionTransformer( input_resolution=input_resolution, patch_size=14, width=1024, layers=24, heads=16, output_dim=768, kernel_size=kernel_size, num_frames=num_frames, drop_path=drop_path, ) if pretrained: logger.info('load pretrained weights') state_dict = torch.load(_MODELS["ViT-L/14_336"], map_location='cpu') load_state_dict(model, state_dict, input_resolution=input_resolution, patch_size=14, center=center) return model.eval() def interpolate_pos_embed_vit(state_dict, new_model): key = "vision_encoder.temporal_positional_embedding" if key in state_dict: vision_temp_embed_new = new_model.state_dict()[key] vision_temp_embed_new = vision_temp_embed_new.unsqueeze(2) # [1, n, d] -> [1, n, 1, d] vision_temp_embed_old = state_dict[key] vision_temp_embed_old = vision_temp_embed_old.unsqueeze(2) state_dict[key] = load_temp_embed_with_mismatch( vision_temp_embed_old, vision_temp_embed_new, add_zero=False ).squeeze(2) key = "text_encoder.positional_embedding" if key in state_dict: text_temp_embed_new = new_model.state_dict()[key] text_temp_embed_new = text_temp_embed_new.unsqueeze(0).unsqueeze(2) # [n, d] -> [1, n, 1, d] text_temp_embed_old = state_dict[key] text_temp_embed_old = text_temp_embed_old.unsqueeze(0).unsqueeze(2) state_dict[key] = load_temp_embed_with_mismatch( text_temp_embed_old, text_temp_embed_new, add_zero=False ).squeeze(2).squeeze(0) return state_dict ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/__init__.py ================================================ ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/amt/benchmarks/__init__.py ================================================ ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/amt/benchmarks/adobe240.py ================================================ import sys import tqdm import torch import argparse import numpy as np from omegaconf 
# GOPRO benchmark: loads an AMT checkpoint and reports running PSNR/SSIM over
# the GoPro test split in the progress-bar description.
parser = argparse.ArgumentParser(
    prog='AMT',
    description='GOPRO evaluation',
)
parser.add_argument('-c', '--config', default='cfgs/AMT-S_gopro.yaml')
parser.add_argument('-p', '--ckpt', default='pretrained/gopro_amt-s.pth',)
parser.add_argument('-r', '--root', default='data/GOPRO',)
args = parser.parse_args()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cfg_path = args.config
ckpt_path = args.ckpt
root = args.root

network_cfg = OmegaConf.load(cfg_path).network
network_name = network_cfg.name
model = build_from_cfg(network_cfg)
# map_location keeps the CPU fallback above usable: without it a checkpoint
# saved from a GPU process cannot be deserialized on a CUDA-less machine.
ckpt = torch.load(ckpt_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])
model = model.to(device)
model.eval()

dataset = GoPro_Test_Dataset(dataset_dir=root)

psnr_list = []
ssim_list = []
pbar = tqdm.tqdm(dataset, total=len(dataset))
for data in pbar:
    # Move every tensor of the sample to the device and add a batch axis.
    input_dict = {k: v.to(device).unsqueeze(0) for k, v in data.items()}
    with torch.no_grad():
        imgt_pred = model(**input_dict)['imgt_pred']
        psnr = calculate_psnr(imgt_pred, input_dict['imgt'])
        ssim = calculate_ssim(imgt_pred, input_dict['imgt'])
    psnr_list.append(psnr)
    ssim_list.append(ssim)
    avg_psnr = np.mean(psnr_list)
    avg_ssim = np.mean(ssim_list)
    desc_str = f'[{network_name}/GOPRO] psnr: {avg_psnr:.02f}, ssim: {avg_ssim:.04f}'
    pbar.set_description_str(desc_str)
# Speed / parameter-count benchmark for an AMT network built from a config.
# Requires CUDA: the model and inputs are moved with .cuda() unconditionally.
parser = argparse.ArgumentParser(
    prog='AMT',
    # The original text was mojibake ("Speed¶meter"): "&para" had been
    # decoded as the pilcrow entity.
    description='Speed&parameter benchmark',
)
parser.add_argument('-c', '--config', default='cfgs/AMT-S.yaml')
args = parser.parse_args()

cfg_path = args.config
network_cfg = OmegaConf.load(cfg_path).network
model = build_from_cfg(network_cfg)
model = model.cuda()
model.eval()

img0 = torch.randn(1, 3, 256, 448).cuda()
img1 = torch.randn(1, 3, 256, 448).cuda()
embt = torch.tensor(1/2).float().view(1, 1, 1, 1).cuda()

with torch.no_grad():
    # Warm-up passes, excluded from the measurement.
    for _ in range(100):
        out = model(img0, img1, embt, eval=True)
    torch.cuda.synchronize()
    time_stamp = time.time()
    for _ in range(1000):
        out = model(img0, img1, embt, eval=True)
    # Wait for queued GPU work so the wall-clock delta covers all 1000 passes.
    torch.cuda.synchronize()
    # NOTE(review): the divisor is 1, so this prints the total time of the
    # 1000 timed passes (numerically the per-pass time in ms) — confirm that
    # this is the intended unit before changing it.
    print('Time: {:.5f}s'.format((time.time() - time_stamp) / 1))

total = sum(param.nelement() for param in model.parameters())
print('Parameters: {:.2f}M'.format(total / 1e6))
# UCF101 evaluation loop: every sub-directory of `root` holds one triplet
# (frame_00, frame_01_gt, frame_02); the middle frame is predicted and scored.
dirs = sorted(os.listdir(root))
psnr_list = []
ssim_list = []
# The time embedding is the same for every triplet, so build it once.
embt = torch.tensor(1/2).float().view(1, 1, 1, 1).to(device)
pbar = tqdm.tqdm(dirs, total=len(dirs))
for d in pbar:
    dir_path = osp.join(root, d)
    I0 = img2tensor(read(osp.join(dir_path, 'frame_00.png'))).to(device)
    I1 = img2tensor(read(osp.join(dir_path, 'frame_01_gt.png'))).to(device)
    I2 = img2tensor(read(osp.join(dir_path, 'frame_02.png'))).to(device)

    # Evaluation only: skip autograd bookkeeping, as the adobe240/gopro/xiph
    # benchmark scripts already do.
    with torch.no_grad():
        I1_pred = model(I0, I2, embt, eval=True)['imgt_pred']

    psnr = calculate_psnr(I1_pred, I1).detach().cpu().numpy()
    ssim = calculate_ssim(I1_pred, I1).detach().cpu().numpy()

    psnr_list.append(psnr)
    ssim_list.append(ssim)
    avg_psnr = np.mean(psnr_list)
    avg_ssim = np.mean(ssim_list)
    desc_str = f'[{network_name}/UCF101] psnr: {avg_psnr:.02f}, ssim: {avg_ssim:.04f}'
    pbar.set_description_str(desc_str)
# Vimeo90K benchmark with test-time augmentation: each triplet is predicted
# twice (as-is and flipped along dim 2) and the two predictions are averaged.
parser = argparse.ArgumentParser(
    prog='AMT',
    description='Vimeo90K evaluation (with Test-Time Augmentation)',
)
parser.add_argument('-c', '--config', default='cfgs/AMT-S.yaml')
# Was 'p': argparse rejects an option string without a leading '-' when it is
# mixed with '--ckpt', so the script raised ValueError before doing any work.
parser.add_argument('-p', '--ckpt', default='pretrained/amt-s.pth',)
parser.add_argument('-r', '--root', default='data/vimeo_triplet',)
args = parser.parse_args()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cfg_path = args.config
ckpt_path = args.ckpt
root = args.root

network_cfg = OmegaConf.load(cfg_path).network
network_name = network_cfg.name
model = build_from_cfg(network_cfg)
# map_location lets a GPU-saved checkpoint load when `device` is the CPU.
ckpt = torch.load(ckpt_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])
model = model.to(device)
model.eval()

with open(osp.join(root, 'tri_testlist.txt'), 'r') as fr:
    file_list = fr.readlines()

psnr_list = []
ssim_list = []
# The time embedding is identical for every triplet.
embt = torch.tensor(1/2).float().view(1, 1, 1, 1).to(device)

pbar = tqdm.tqdm(file_list, total=len(file_list))
for name in pbar:
    name = str(name).strip()
    if len(name) <= 1:
        continue
    dir_path = osp.join(root, 'sequences', name)
    I0 = img2tensor(read(osp.join(dir_path, 'im1.png'))).to(device)
    I1 = img2tensor(read(osp.join(dir_path, 'im2.png'))).to(device)
    I2 = img2tensor(read(osp.join(dir_path, 'im3.png'))).to(device)

    # Evaluation only: no autograd graph is needed for either pass.
    with torch.no_grad():
        I1_pred1 = model(I0, I2, embt,
                         scale_factor=1.0, eval=True)['imgt_pred']
        I1_pred2 = model(torch.flip(I0, [2]), torch.flip(I2, [2]), embt,
                         scale_factor=1.0, eval=True)['imgt_pred']

    # Undo the flip on the second prediction before averaging.
    I1_pred = I1_pred1 / 2 + torch.flip(I1_pred2, [2]) / 2

    psnr = calculate_psnr(I1_pred, I1).detach().cpu().numpy()
    ssim = calculate_ssim(I1_pred, I1).detach().cpu().numpy()

    psnr_list.append(psnr)
    ssim_list.append(ssim)
    avg_psnr = np.mean(psnr_list)
    avg_ssim = np.mean(ssim_list)
    desc_str = f'[{network_name}/Vimeo90K] psnr: {avg_psnr:.02f}, ssim: {avg_ssim:.04f}'
    pbar.set_description_str(desc_str)
# Xiph evaluation loop: for both protocols ('resized-2k' and 'cropped-4k')
# every even frame 2..98 of each clip is interpolated from its two neighbours
# and scored against the real frame.
divisor = 32
scale_factor = 0.5

for category in ['resized-2k', 'cropped-4k']:
    psnr_list = []
    ssim_list = []
    pbar = tqdm.tqdm(file_list, total=len(file_list))
    for file_name in pbar:
        dir_name = osp.join(root, file_name)
        for intFrame in range(2, 99, 2):
            img0 = read(f'{dir_name}/{intFrame - 1:03d}.png')
            img1 = read(f'{dir_name}/{intFrame + 1:03d}.png')
            imgt = read(f'{dir_name}/{intFrame:03d}.png')

            if category == 'resized-2k':
                img0, img1, imgt = (
                    cv2.resize(src=frame, dsize=(2048, 1080), fx=0.0, fy=0.0,
                               interpolation=cv2.INTER_AREA)
                    for frame in (img0, img1, imgt)
                )
            elif category == 'cropped-4k':
                img0 = img0[540:-540, 1024:-1024, :]
                img1 = img1[540:-540, 1024:-1024, :]
                imgt = imgt[540:-540, 1024:-1024, :]

            img0 = img2tensor(img0).to(device)
            imgt = img2tensor(imgt).to(device)
            img1 = img2tensor(img1).to(device)
            embt = torch.tensor(1/2).float().view(1, 1, 1, 1).to(device)

            # Pad the inputs to a multiple of `divisor`; the prediction is
            # unpadded again before scoring.
            padder = InputPadder(img0.shape, divisor)
            img0, img1 = padder.pad(img0, img1)

            with torch.no_grad():
                imgt_pred = model(img0, img1, embt, scale_factor=scale_factor, eval=True)['imgt_pred']
                imgt_pred = padder.unpad(imgt_pred)

            psnr = calculate_psnr(imgt_pred, imgt)
            ssim = calculate_ssim(imgt_pred, imgt)

            # Record the sample BEFORE averaging. The previous order took the
            # mean first, which averaged an empty list on the first frame
            # (nan plus a RuntimeWarning) and left the display one sample behind.
            psnr_list.append(psnr)
            ssim_list.append(ssim)
            avg_psnr = np.mean(psnr_list)
            avg_ssim = np.mean(ssim_list)
            desc_str = f'[{network_name}/Xiph] [{category}/{file_name}] psnr: {avg_psnr:.02f}, ssim: {avg_ssim:.04f}'
            pbar.set_description_str(desc_str)
class Adobe240_Dataset(Dataset):
    """Frame-interpolation samples built from Adobe240 frame folders.

    Every tenth video directory (indices 9, 19, ...) of ``dataset_dir`` is
    used. Each video is cut into overlapping windows of ``interFrames + 2``
    consecutive frames; a window yields ``interFrames`` samples, one per
    intermediate frame, each with the window's first and last frame as inputs.
    """

    def __init__(self, dataset_dir='data/adobe240/test_frames', interFrames=7, augment=True):
        """Index the frame windows; no image is read here.

        Args:
            dataset_dir: directory containing one sub-directory per video.
            interFrames: number of intermediate frames between the two inputs.
            augment: apply the random training augmentations in
                ``__getitem__``; otherwise a 512x512 center crop is taken.
        """
        super().__init__()
        self.augment = augment
        self.interFrames = interFrames
        self.setLength = interFrames + 2
        self.dataset_dir = os.path.join(dataset_dir)
        # os.listdir returns entries in arbitrary order; sort first so the
        # every-10th-video subset is the same on every machine and run.
        video_list = sorted(os.listdir(self.dataset_dir))[9::10]
        self.frames_list = []
        self.file_list = []
        step = interFrames + 1  # consecutive windows share their boundary frame
        for video in video_list:
            frames = sorted(os.listdir(os.path.join(self.dataset_dir, video)))
            n_sets = (len(frames) - self.setLength) // step + 1
            for i in range(n_sets):
                window = frames[step * i: step * i + self.setLength]
                self.file_list.append([os.path.join(video, f) for f in window])

    def __getitem__(self, idx):
        """Return ``{'img0', 'imgt', 'img1', 'embt'}`` for flat sample ``idx``.

        ``idx // interFrames`` selects the window and ``idx % interFrames``
        the intermediate frame; ``embt`` is that frame's relative position
        ``(k + 1) / (interFrames + 1)`` as a float32 tensor of shape (1, 1, 1).
        """
        clip_idx = idx // self.interFrames
        embt_idx = idx % self.interFrames
        imgpaths = [os.path.join(self.dataset_dir, fp) for fp in self.file_list[clip_idx]]
        # First and last frame of the window are the two inputs.
        pick_idxs = list(range(0, self.setLength, self.interFrames + 1))
        imgt_beg = self.setLength // 2 - self.interFrames // 2
        imgt_end = self.setLength // 2 + self.interFrames // 2 + self.interFrames % 2
        input_paths = [imgpaths[i] for i in pick_idxs]
        imgt_paths = [imgpaths[i] for i in range(imgt_beg, imgt_end)]

        img0 = np.array(read(input_paths[0]))
        imgt = np.array(read(imgt_paths[embt_idx]))
        img1 = np.array(read(input_paths[1]))
        embt = torch.from_numpy(np.array((embt_idx + 1) / (self.interFrames + 1)
                                         ).reshape(1, 1, 1).astype(np.float32))

        if self.augment:
            img0, imgt, img1 = random_resize_woflow(img0, imgt, img1, p=0.1)
            img0, imgt, img1 = random_crop_woflow(img0, imgt, img1, crop_size=(224, 224))
            img0, imgt, img1 = random_reverse_channel_woflow(img0, imgt, img1, p=0.5)
            img0, imgt, img1 = random_vertical_flip_woflow(img0, imgt, img1, p=0.3)
            img0, imgt, img1 = random_horizontal_flip_woflow(img0, imgt, img1, p=0.5)
            img0, imgt, img1 = random_rotate_woflow(img0, imgt, img1, p=0.05)
            img0, imgt, img1, embt = random_reverse_time_woflow(img0, imgt, img1,
                                                                embt=embt, p=0.5)
        else:
            img0, imgt, img1 = center_crop_woflow(img0, imgt, img1, crop_size=(512, 512))

        # The flips above return negative-stride views; copy them into
        # contiguous arrays before tensor conversion, as GoPro_Train_Dataset
        # does. NOTE(review): presumably img2tensor cannot take negative
        # strides (torch rejects them) — confirm against utils.utils.
        img0 = img2tensor(img0.copy()).squeeze(0)
        imgt = img2tensor(imgt.copy()).squeeze(0)
        img1 = img2tensor(img1.copy()).squeeze(0)

        return {'img0': img0.float(),
                'imgt': imgt.float(),
                'img1': img1.float(),
                'embt': embt}

    def __len__(self):
        """Number of samples: one per intermediate frame of every window."""
        return len(self.file_list) * self.interFrames
def random_resize_woflow(img0, imgt, img1, p=0.1):
    """With probability ``p``, upscale all three frames 2x (bilinear)."""
    if not random.uniform(0, 1) < p:
        return img0, imgt, img1
    img0, imgt, img1 = (
        cv2.resize(frame, dsize=None, fx=2.0, fy=2.0, interpolation=cv2.INTER_LINEAR)
        for frame in (img0, imgt, img1)
    )
    return img0, imgt, img1


def random_crop_woflow(img0, imgt, img1, crop_size=(224, 224)):
    """Cut the same randomly placed ``crop_size`` window out of all frames."""
    crop_h, crop_w = crop_size[0], crop_size[1]
    full_h, full_w, _ = img0.shape
    top = np.random.randint(0, full_h - crop_h + 1)
    left = np.random.randint(0, full_w - crop_w + 1)
    rows = slice(top, top + crop_h)
    cols = slice(left, left + crop_w)
    return img0[rows, cols, :], imgt[rows, cols, :], img1[rows, cols, :]


def center_crop_woflow(img0, imgt, img1, crop_size=(512, 512)):
    """Cut the window centered on ``img0``'s midpoint out of all frames."""
    crop_h, crop_w = crop_size[0], crop_size[1]
    full_h, full_w, _ = img0.shape
    rows = slice(full_h // 2 - crop_h // 2, full_h // 2 + crop_h // 2)
    cols = slice(full_w // 2 - crop_w // 2, full_w // 2 + crop_w // 2)
    return img0[rows, cols, :], imgt[rows, cols, :], img1[rows, cols, :]


def random_reverse_channel_woflow(img0, imgt, img1, p=0.5):
    """With probability ``p``, reverse the last (channel) axis of each frame."""
    if random.uniform(0, 1) < p:
        return img0[:, :, ::-1], imgt[:, :, ::-1], img1[:, :, ::-1]
    return img0, imgt, img1


def random_vertical_flip_woflow(img0, imgt, img1, p=0.3):
    """With probability ``p``, reverse the first (row) axis of each frame."""
    if random.uniform(0, 1) < p:
        return img0[::-1], imgt[::-1], img1[::-1]
    return img0, imgt, img1


def random_horizontal_flip_woflow(img0, imgt, img1, p=0.5):
    """With probability ``p``, reverse the second (column) axis of each frame."""
    if random.uniform(0, 1) < p:
        return img0[:, ::-1], imgt[:, ::-1], img1[:, ::-1]
    return img0, imgt, img1


def random_rotate_woflow(img0, imgt, img1, p=0.05):
    """With probability ``p``, swap the first two axes of each frame."""
    if random.uniform(0, 1) < p:
        swap = (1, 0, 2)
        return img0.transpose(swap), imgt.transpose(swap), img1.transpose(swap)
    return img0, imgt, img1
class GoPro_Train_Dataset(Dataset):
    """GoPro training samples for multi-frame interpolation.

    Frames of each listed training video are grouped into overlapping windows
    of ``interFrames + 2`` frames. A window contributes ``interFrames``
    samples: its first and last frame as inputs plus one intermediate target.
    """

    def __init__(self, dataset_dir='data/GOPRO', interFrames=7, augment=True):
        """Index all frame windows found under ``dataset_dir + '/train'``."""
        self.dataset_dir = dataset_dir + '/train'
        self.interFrames = interFrames
        self.augment = augment
        self.setLength = interFrames + 2
        video_list = [
            'GOPR0372_07_00', 'GOPR0374_11_01', 'GOPR0378_13_00', 'GOPR0384_11_01',
            'GOPR0384_11_04', 'GOPR0477_11_00', 'GOPR0868_11_02', 'GOPR0884_11_00',
            'GOPR0372_07_01', 'GOPR0374_11_02', 'GOPR0379_11_00', 'GOPR0384_11_02',
            'GOPR0385_11_00', 'GOPR0857_11_00', 'GOPR0871_11_01', 'GOPR0374_11_00',
            'GOPR0374_11_03', 'GOPR0380_11_00', 'GOPR0384_11_03', 'GOPR0386_11_00',
            'GOPR0868_11_01', 'GOPR0881_11_00']
        self.frames_list = []
        self.file_list = []
        stride = interFrames + 1  # neighbouring windows share one boundary frame
        for video in video_list:
            frames = sorted(os.listdir(os.path.join(self.dataset_dir, video)))
            n_sets = (len(frames) - self.setLength) // stride + 1
            for i in range(n_sets):
                window = frames[stride * i: stride * i + self.setLength]
                self.file_list.append([os.path.join(video, f) for f in window])

    def __len__(self):
        """One sample per intermediate frame of every window."""
        return len(self.file_list) * self.interFrames

    def __getitem__(self, idx):
        """Return ``{'img0', 'imgt', 'img1', 'embt'}`` for flat sample ``idx``."""
        clip_idx, embt_idx = idx // self.interFrames, idx % self.interFrames
        clip_paths = [os.path.join(self.dataset_dir, rel)
                      for rel in self.file_list[clip_idx]]

        # Inputs: first and last frame of the window.
        endpoint_ids = list(range(0, self.setLength, self.interFrames + 1))
        # Targets: the frames strictly between them.
        half = self.setLength // 2
        target_ids = list(range(half - self.interFrames // 2,
                                half + self.interFrames // 2 + self.interFrames % 2))
        input_paths = [clip_paths[i] for i in endpoint_ids]
        imgt_paths = [clip_paths[i] for i in target_ids]

        # Relative time of the target inside the window, shape (1, 1, 1).
        t = np.array((embt_idx + 1) / (self.interFrames + 1))
        embt = torch.from_numpy(t.reshape(1, 1, 1).astype(np.float32))

        img0 = np.array(read(input_paths[0]))
        imgt = np.array(read(imgt_paths[embt_idx]))
        img1 = np.array(read(input_paths[1]))

        # Equality test (not truthiness) is kept on purpose: only values equal
        # to True enable the augmentation pipeline.
        if not (self.augment == True):  # noqa: E712
            img0, imgt, img1 = center_crop_woflow(img0, imgt, img1, crop_size=(512, 512))
        else:
            img0, imgt, img1 = random_resize_woflow(img0, imgt, img1, p=0.1)
            img0, imgt, img1 = random_crop_woflow(img0, imgt, img1, crop_size=(224, 224))
            img0, imgt, img1 = random_reverse_channel_woflow(img0, imgt, img1, p=0.5)
            img0, imgt, img1 = random_vertical_flip_woflow(img0, imgt, img1, p=0.3)
            img0, imgt, img1 = random_horizontal_flip_woflow(img0, imgt, img1, p=0.5)
            img0, imgt, img1 = random_rotate_woflow(img0, imgt, img1, p=0.05)
            img0, imgt, img1, embt = random_reverse_time_woflow(
                img0, imgt, img1, embt=embt, p=0.5)

        # Copy before conversion: the flips/crops above return array views.
        sample = {
            'img0': img2tensor(img0.copy()).squeeze(0).float(),
            'imgt': img2tensor(imgt.copy()).squeeze(0).float(),
            'img1': img2tensor(img1.copy()).squeeze(0).float(),
            'embt': embt,
        }
        return sample
self.interFrames + 1)) imgt_beg = self.setLength // 2 - self.interFrames // 2 imgt_end = self.setLength // 2 + self.interFrames // 2 + self.interFrames % 2 imgt_idx = list(range(imgt_beg, imgt_end)) input_paths = [imgpaths[idx] for idx in pick_idxs] imgt_paths = [imgpaths[idx] for idx in imgt_idx] img0 = np.array(read(input_paths[0])) imgt = np.array(read(imgt_paths[embt_idx])) img1 = np.array(read(input_paths[1])) img0, imgt, img1 = center_crop_woflow(img0, imgt, img1, crop_size=(512, 512)) img0 = img2tensor(img0).squeeze(0) imgt = img2tensor(imgt).squeeze(0) img1 = img2tensor(img1).squeeze(0) embt = torch.from_numpy(np.array((embt_idx + 1) / (self.interFrames + 1) ).reshape(1, 1, 1).astype(np.float32)) return {'img0': img0.float(), 'imgt': imgt.float(), 'img1': img1.float(), 'embt': embt} ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/amt/datasets/vimeo_datasets.py ================================================ ''' This code is partially borrowed from IFRNet (https://github.com/ltkong218/IFRNet). 
def random_resize(img0, imgt, img1, flow, p=0.1):
    """With probability ``p``, upscale frames and flow 2x (bilinear).

    The flow values are doubled as well, matching the doubled resolution.
    """
    if not random.uniform(0, 1) < p:
        return img0, imgt, img1, flow

    def upscale(arr):
        return cv2.resize(arr, dsize=None, fx=2.0, fy=2.0, interpolation=cv2.INTER_LINEAR)

    return upscale(img0), upscale(imgt), upscale(img1), upscale(flow) * 2.0


def random_crop(img0, imgt, img1, flow, crop_size=(224, 224)):
    """Cut the same randomly placed ``crop_size`` window out of all inputs."""
    crop_h, crop_w = crop_size[0], crop_size[1]
    full_h, full_w, _ = img0.shape
    top = np.random.randint(0, full_h - crop_h + 1)
    left = np.random.randint(0, full_w - crop_w + 1)
    rows = slice(top, top + crop_h)
    cols = slice(left, left + crop_w)
    return (img0[rows, cols, :], imgt[rows, cols, :],
            img1[rows, cols, :], flow[rows, cols, :])


def random_reverse_channel(img0, imgt, img1, flow, p=0.5):
    """With probability ``p``, reverse the channel axis of the frames only."""
    if random.uniform(0, 1) < p:
        return img0[:, :, ::-1], imgt[:, :, ::-1], img1[:, :, ::-1], flow
    return img0, imgt, img1, flow


def random_vertical_flip(img0, imgt, img1, flow, p=0.3):
    """With probability ``p``, reverse the row axis of frames and flow.

    Flow channels 1 and 3 are negated after the flip.
    """
    if not random.uniform(0, 1) < p:
        return img0, imgt, img1, flow
    mirrored = flow[::-1]
    flow = np.concatenate((mirrored[:, :, 0:1], -mirrored[:, :, 1:2],
                           mirrored[:, :, 2:3], -mirrored[:, :, 3:4]), 2)
    return img0[::-1], imgt[::-1], img1[::-1], flow


def random_horizontal_flip(img0, imgt, img1, flow, p=0.5):
    """With probability ``p``, reverse the column axis of frames and flow.

    Flow channels 0 and 2 are negated after the flip.
    """
    if not random.uniform(0, 1) < p:
        return img0, imgt, img1, flow
    mirrored = flow[:, ::-1]
    flow = np.concatenate((-mirrored[:, :, 0:1], mirrored[:, :, 1:2],
                           -mirrored[:, :, 2:3], mirrored[:, :, 3:4]), 2)
    return img0[:, ::-1], imgt[:, ::-1], img1[:, ::-1], flow
class Vimeo90K_Train_Dataset(Dataset):
    """Vimeo90K triplets (im1, im2, im3) with two flow files per triplet.

    Sample names come from ``tri_trainlist.txt``; frames are looked up under
    ``sequences/<name>`` and flows under ``<flow_dir>/<name>``.
    """

    def __init__(self, dataset_dir='data/vimeo_triplet', flow_dir=None, augment=True, crop_size=(224, 224)):
        """Collect file paths for every listed triplet; nothing is read yet.

        ``flow_dir`` defaults to ``'flow'`` when None. List entries of one
        character or fewer (after stripping) are ignored.
        """
        self.dataset_dir = dataset_dir
        self.augment = augment
        self.crop_size = crop_size
        self.img0_list = []
        self.imgt_list = []
        self.img1_list = []
        self.flow_t0_list = []
        self.flow_t1_list = []
        if flow_dir is None:
            flow_dir = 'flow'
        with open(os.path.join(dataset_dir, 'tri_trainlist.txt'), 'r') as listing:
            for line in listing:
                name = str(line).strip()
                if len(name) <= 1:
                    continue
                seq_dir = os.path.join(dataset_dir, 'sequences', name)
                flo_dir = os.path.join(dataset_dir, flow_dir, name)
                self.img0_list.append(os.path.join(seq_dir, 'im1.png'))
                self.imgt_list.append(os.path.join(seq_dir, 'im2.png'))
                self.img1_list.append(os.path.join(seq_dir, 'im3.png'))
                self.flow_t0_list.append(os.path.join(flo_dir, 'flow_t0.flo'))
                self.flow_t1_list.append(os.path.join(flo_dir, 'flow_t1.flo'))

    def __len__(self):
        """Number of indexed triplets."""
        return len(self.imgt_list)

    def __getitem__(self, idx):
        """Load triplet ``idx`` and return frames, stacked flow and ``embt``.

        Frames are scaled by 1/255 and moved to channel-first layout; the two
        flows are concatenated along the last axis before augmentation.
        ``embt`` is the constant 0.5 as a float32 tensor of shape (1, 1, 1).
        """
        img0 = read(self.img0_list[idx])
        imgt = read(self.imgt_list[idx])
        img1 = read(self.img1_list[idx])
        flow_t0 = read(self.flow_t0_list[idx])
        flow_t1 = read(self.flow_t1_list[idx])
        flow = np.concatenate((flow_t0, flow_t1), 2).astype(np.float64)

        # Equality test (not truthiness) is kept on purpose: only values equal
        # to True enable the augmentation pipeline.
        if self.augment == True:  # noqa: E712
            img0, imgt, img1, flow = random_resize(img0, imgt, img1, flow, p=0.1)
            img0, imgt, img1, flow = random_crop(img0, imgt, img1, flow, crop_size=self.crop_size)
            img0, imgt, img1, flow = random_reverse_channel(img0, imgt, img1, flow, p=0.5)
            img0, imgt, img1, flow = random_vertical_flip(img0, imgt, img1, flow, p=0.3)
            img0, imgt, img1, flow = random_horizontal_flip(img0, imgt, img1, flow, p=0.5)
            img0, imgt, img1, flow = random_rotate(img0, imgt, img1, flow, p=0.05)
            img0, imgt, img1, flow = random_reverse_time(img0, imgt, img1, flow, p=0.5)

        def frame_to_tensor(arr):
            # HWC -> CHW, float32, scaled by 1/255.
            return torch.from_numpy(arr.transpose((2, 0, 1)).astype(np.float32) / 255.0)

        img0 = frame_to_tensor(img0)
        imgt = frame_to_tensor(imgt)
        img1 = frame_to_tensor(img1)
        flow = torch.from_numpy(flow.transpose((2, 0, 1)).astype(np.float32))
        embt = torch.from_numpy(np.array(1/2).reshape(1, 1, 1).astype(np.float32))

        return {'img0': img0.float(),
                'imgt': imgt.float(),
                'img1': img1.float(),
                'flow': flow.float(),
                'embt': embt}
img0.float(), 'imgt': imgt.float(), 'img1': img1.float(), 'flow': flow.float(), 'embt': embt} ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/amt/flow_generation/__init__.py ================================================ ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/amt/flow_generation/gen_flow.py ================================================ import os import sys import torch import argparse import numpy as np import os.path as osp import torch.nn.functional as F sys.path.append('.') from utils.utils import read, write from flow_generation.liteflownet.run import estimate parser = argparse.ArgumentParser( prog = 'AMT', description = 'Flow generation', ) parser.add_argument('-r', '--root', default='data/vimeo_triplet') args = parser.parse_args() vimeo90k_dir = args.root vimeo90k_sequences_dir = osp.join(vimeo90k_dir, 'sequences') vimeo90k_flow_dir = osp.join(vimeo90k_dir, 'flow') def pred_flow(img1, img2): img1 = torch.from_numpy(img1).float().permute(2, 0, 1) / 255.0 img2 = torch.from_numpy(img2).float().permute(2, 0, 1) / 255.0 flow = estimate(img1, img2) flow = flow.permute(1, 2, 0).cpu().numpy() return flow print('Built Flow Path') if not osp.exists(vimeo90k_flow_dir): os.makedirs(vimeo90k_flow_dir) for sequences_path in sorted(os.listdir(vimeo90k_sequences_dir)): vimeo90k_sequences_path_dir = osp.join(vimeo90k_sequences_dir, sequences_path) vimeo90k_flow_path_dir = osp.join(vimeo90k_flow_dir, sequences_path) if not osp.exists(vimeo90k_flow_path_dir): os.mkdir(vimeo90k_flow_path_dir) for sequences_id in sorted(os.listdir(vimeo90k_sequences_path_dir)): vimeo90k_flow_id_dir = osp.join(vimeo90k_flow_path_dir, sequences_id) if not osp.exists(vimeo90k_flow_id_dir): os.mkdir(vimeo90k_flow_id_dir) for sequences_path in sorted(os.listdir(vimeo90k_sequences_dir)): vimeo90k_sequences_path_dir = os.path.join(vimeo90k_sequences_dir, sequences_path) vimeo90k_flow_path_dir = 
os.path.join(vimeo90k_flow_dir, sequences_path) for sequences_id in sorted(os.listdir(vimeo90k_sequences_path_dir)): vimeo90k_sequences_id_dir = os.path.join(vimeo90k_sequences_path_dir, sequences_id) vimeo90k_flow_id_dir = os.path.join(vimeo90k_flow_path_dir, sequences_id) img0_path = vimeo90k_sequences_id_dir + '/im1.png' imgt_path = vimeo90k_sequences_id_dir + '/im2.png' img1_path = vimeo90k_sequences_id_dir + '/im3.png' flow_t0_path = vimeo90k_flow_id_dir + '/flow_t0.flo' flow_t1_path = vimeo90k_flow_id_dir + '/flow_t1.flo' img0 = read(img0_path) imgt = read(imgt_path) img1 = read(img1_path) flow_t0 = pred_flow(imgt, img0) flow_t1 = pred_flow(imgt, img1) write(flow_t0_path, flow_t0) write(flow_t1_path, flow_t1) print('Written Sequences {}'.format(sequences_path)) ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/amt/flow_generation/liteflownet/__init__.py ================================================ ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/amt/flow_generation/liteflownet/run.py ================================================ #!/usr/bin/env python import getopt import math import numpy import PIL import PIL.Image import sys import torch try: from .correlation import correlation # the custom cost volume layer except: sys.path.insert(0, './correlation'); import correlation # you should consider upgrading python # end ########################################################## assert(int(str('').join(torch.__version__.split('.')[0:2])) >= 13) # requires at least pytorch version 1.3.0 torch.set_grad_enabled(False) # make sure to not compute gradients for computational performance torch.backends.cudnn.enabled = True # make sure to use cudnn for computational performance ########################################################## arguments_strModel = 'default' # 'default', or 'kitti', or 'sintel' arguments_strOne = './images/one.png' arguments_strTwo = './images/two.png' 
# Cache of base sampling grids, keyed by (flow shape, flow device).
backwarp_tenGrid = {}


def backwarp(tenInput, tenFlow):
    """Warp ``tenInput`` backwards along ``tenFlow`` with bilinear sampling.

    A base grid of normalized pixel-centre coordinates is built once per
    flow shape and device and cached in ``backwarp_tenGrid``. The flow is
    normalized by half the input extent, added to the grid, and the result
    is passed to ``grid_sample``.

    The grid used to be forced onto the current CUDA device with ``.cuda()``
    and cached by shape alone, which failed for CPU tensors and for tensors
    on a non-default GPU. It now follows the device of ``tenFlow``.

    Args:
        tenInput: tensor indexed as (N, C, H, W) to sample from.
        tenFlow: tensor indexed as (N, 2, H, W); channel 0 is the horizontal
            displacement and channel 1 the vertical one.

    Returns:
        The sampled tensor; locations outside the input are filled with zeros.
    """
    intHeight = tenFlow.shape[2]
    intWidth = tenFlow.shape[3]
    objKey = (str(tenFlow.shape), str(tenFlow.device))

    if objKey not in backwarp_tenGrid:
        tenHor = torch.linspace(-1.0 + (1.0 / intWidth), 1.0 - (1.0 / intWidth), intWidth).view(1, 1, 1, -1).repeat(1, 1, intHeight, 1)
        tenVer = torch.linspace(-1.0 + (1.0 / intHeight), 1.0 - (1.0 / intHeight), intHeight).view(1, 1, -1, 1).repeat(1, 1, 1, intWidth)

        backwarp_tenGrid[objKey] = torch.cat([ tenHor, tenVer ], 1).to(tenFlow.device)
    # end

    # Convert pixel displacements into the normalized [-1, 1] grid units.
    tenFlow = torch.cat([ tenFlow[:, 0:1, :, :] / ((tenInput.shape[3] - 1.0) / 2.0), tenFlow[:, 1:2, :, :] / ((tenInput.shape[2] - 1.0) / 2.0) ], 1)

    return torch.nn.functional.grid_sample(input=tenInput, grid=(backwarp_tenGrid[objKey] + tenFlow).permute(0, 2, 3, 1), mode='bilinear', padding_mode='zeros', align_corners=False)
# end
        class Matching(torch.nn.Module):
            # Per-level matching unit: correlates the two feature maps and
            # predicts a 2-channel flow that is added to the upsampled flow
            # handed in from the previous level (if any).
            def __init__(self, intLevel):
                super().__init__()

                # Level-indexed factor applied to the flow before warping.
                self.fltBackwarp = [ 0.0, 0.0, 10.0, 5.0, 2.5, 1.25, 0.625 ][intLevel]

                # Only level 2 projects its 32-channel features to 64
                # channels; every other level uses an empty Sequential.
                if intLevel != 2:
                    self.netFeat = torch.nn.Sequential()

                elif intLevel == 2:
                    self.netFeat = torch.nn.Sequential(
                        torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=1, stride=1, padding=0),
                        torch.nn.LeakyReLU(inplace=False, negative_slope=0.1)
                    )

                # end

                # Level 6 has no flow upsampler; the other levels upsample
                # the incoming flow by 2 with a grouped transposed conv.
                if intLevel == 6:
                    self.netUpflow = None

                elif intLevel != 6:
                    self.netUpflow = torch.nn.ConvTranspose2d(in_channels=2, out_channels=2, kernel_size=4, stride=2, padding=1, bias=False, groups=2)

                # end

                # Levels below 4 upsample the correlation by 2 (they compute
                # it with stride 2 in forward); levels >= 4 have no upsampler.
                if intLevel >= 4:
                    self.netUpcorr = None

                elif intLevel < 4:
                    self.netUpcorr = torch.nn.ConvTranspose2d(in_channels=49, out_channels=49, kernel_size=4, stride=2, padding=1, bias=False, groups=49)

                # end

                # Maps the 49-channel correlation to a 2-channel flow; the
                # last kernel size and padding are chosen per level.
                self.netMain = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=49, out_channels=128, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=128, out_channels=64, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=64, out_channels=32, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=32, out_channels=2, kernel_size=[ 0, 0, 7, 5, 5, 3, 3 ][intLevel], stride=1, padding=[ 0, 0, 3, 2, 2, 1, 1 ][intLevel])
                )
            # end

            def forward(self, tenOne, tenTwo, tenFeaturesOne, tenFeaturesTwo, tenFlow):
                # tenOne / tenTwo are accepted but not used in this method.
                # tenFlow may be None, in which case no upsampling or warping
                # happens and the predicted flow is returned on its own.
                tenFeaturesOne = self.netFeat(tenFeaturesOne)
                tenFeaturesTwo = self.netFeat(tenFeaturesTwo)

                if tenFlow is not None:
                    tenFlow = self.netUpflow(tenFlow)
                # end

                # Warp the second feature map by the scaled upsampled flow.
                if tenFlow is not None:
                    tenFeaturesTwo = backwarp(tenInput=tenFeaturesTwo, tenFlow=tenFlow * self.fltBackwarp)
                # end

                # Stride-1 correlation when there is no correlation upsampler,
                # otherwise stride-2 correlation followed by netUpcorr.
                if self.netUpcorr is None:
                    tenCorrelation = torch.nn.functional.leaky_relu(input=correlation.FunctionCorrelation(tenOne=tenFeaturesOne, tenTwo=tenFeaturesTwo, intStride=1), negative_slope=0.1, inplace=False)

                elif self.netUpcorr is not None:
                    tenCorrelation = self.netUpcorr(torch.nn.functional.leaky_relu(input=correlation.FunctionCorrelation(tenOne=tenFeaturesOne, tenTwo=tenFeaturesTwo, intStride=2), negative_slope=0.1, inplace=False))

                # end

                # Residual update: incoming flow (or 0.0) plus the prediction.
                return (tenFlow if tenFlow is not None else 0.0) + self.netMain(tenCorrelation)
            # end
        # end
__init__(self, intLevel): super().__init__() self.fltBackward = [ 0.0, 0.0, 10.0, 5.0, 2.5, 1.25, 0.625 ][intLevel] if intLevel != 2: self.netFeat = torch.nn.Sequential() elif intLevel == 2: self.netFeat = torch.nn.Sequential( torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=1, stride=1, padding=0), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1) ) # end self.netMain = torch.nn.Sequential( torch.nn.Conv2d(in_channels=[ 0, 0, 130, 130, 194, 258, 386 ][intLevel], out_channels=128, kernel_size=3, stride=1, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), torch.nn.Conv2d(in_channels=128, out_channels=64, kernel_size=3, stride=1, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), torch.nn.Conv2d(in_channels=64, out_channels=32, kernel_size=3, stride=1, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), torch.nn.Conv2d(in_channels=32, out_channels=2, kernel_size=[ 0, 0, 7, 5, 5, 3, 3 ][intLevel], stride=1, padding=[ 0, 0, 3, 2, 2, 1, 1 ][intLevel]) ) # end def forward(self, tenOne, tenTwo, tenFeaturesOne, tenFeaturesTwo, tenFlow): tenFeaturesOne = self.netFeat(tenFeaturesOne) tenFeaturesTwo = self.netFeat(tenFeaturesTwo) if tenFlow is not None: tenFeaturesTwo = backwarp(tenInput=tenFeaturesTwo, tenFlow=tenFlow * self.fltBackward) # end return (tenFlow if tenFlow is not None else 0.0) + self.netMain(torch.cat([ tenFeaturesOne, tenFeaturesTwo, tenFlow ], 1)) # end # end class Regularization(torch.nn.Module): def __init__(self, intLevel): super().__init__() self.fltBackward = [ 0.0, 0.0, 10.0, 5.0, 2.5, 1.25, 0.625 ][intLevel] self.intUnfold = [ 0, 0, 7, 5, 5, 3, 3 ][intLevel] if intLevel >= 5: self.netFeat = torch.nn.Sequential() elif intLevel < 5: self.netFeat = torch.nn.Sequential( torch.nn.Conv2d(in_channels=[ 0, 0, 32, 64, 96, 128, 192 ][intLevel], out_channels=128, kernel_size=1, stride=1, padding=0), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1) ) # end self.netMain = torch.nn.Sequential( 
torch.nn.Conv2d(in_channels=[ 0, 0, 131, 131, 131, 131, 195 ][intLevel], out_channels=128, kernel_size=3, stride=1, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), torch.nn.Conv2d(in_channels=128, out_channels=64, kernel_size=3, stride=1, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), torch.nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), torch.nn.Conv2d(in_channels=64, out_channels=32, kernel_size=3, stride=1, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), torch.nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1) ) if intLevel >= 5: self.netDist = torch.nn.Sequential( torch.nn.Conv2d(in_channels=32, out_channels=[ 0, 0, 49, 25, 25, 9, 9 ][intLevel], kernel_size=[ 0, 0, 7, 5, 5, 3, 3 ][intLevel], stride=1, padding=[ 0, 0, 3, 2, 2, 1, 1 ][intLevel]) ) elif intLevel < 5: self.netDist = torch.nn.Sequential( torch.nn.Conv2d(in_channels=32, out_channels=[ 0, 0, 49, 25, 25, 9, 9 ][intLevel], kernel_size=([ 0, 0, 7, 5, 5, 3, 3 ][intLevel], 1), stride=1, padding=([ 0, 0, 3, 2, 2, 1, 1 ][intLevel], 0)), torch.nn.Conv2d(in_channels=[ 0, 0, 49, 25, 25, 9, 9 ][intLevel], out_channels=[ 0, 0, 49, 25, 25, 9, 9 ][intLevel], kernel_size=(1, [ 0, 0, 7, 5, 5, 3, 3 ][intLevel]), stride=1, padding=(0, [ 0, 0, 3, 2, 2, 1, 1 ][intLevel])) ) # end self.netScaleX = torch.nn.Conv2d(in_channels=[ 0, 0, 49, 25, 25, 9, 9 ][intLevel], out_channels=1, kernel_size=1, stride=1, padding=0) self.netScaleY = torch.nn.Conv2d(in_channels=[ 0, 0, 49, 25, 25, 9, 9 ][intLevel], out_channels=1, kernel_size=1, stride=1, padding=0) # eny def forward(self, tenOne, tenTwo, tenFeaturesOne, tenFeaturesTwo, tenFlow): tenDifference = ((tenOne - 
backwarp(tenInput=tenTwo, tenFlow=tenFlow * self.fltBackward)) ** 2).sum(1, True).sqrt().detach() tenDist = self.netDist(self.netMain(torch.cat([ tenDifference, tenFlow - tenFlow.view(tenFlow.shape[0], 2, -1).mean(2, True).view(tenFlow.shape[0], 2, 1, 1), self.netFeat(tenFeaturesOne) ], 1))) tenDist = (tenDist ** 2).neg() tenDist = (tenDist - tenDist.max(1, True)[0]).exp() tenDivisor = tenDist.sum(1, True).reciprocal() tenScaleX = self.netScaleX(tenDist * torch.nn.functional.unfold(input=tenFlow[:, 0:1, :, :], kernel_size=self.intUnfold, stride=1, padding=int((self.intUnfold - 1) / 2)).view_as(tenDist)) * tenDivisor tenScaleY = self.netScaleY(tenDist * torch.nn.functional.unfold(input=tenFlow[:, 1:2, :, :], kernel_size=self.intUnfold, stride=1, padding=int((self.intUnfold - 1) / 2)).view_as(tenDist)) * tenDivisor return torch.cat([ tenScaleX, tenScaleY ], 1) # end # end self.netFeatures = Features() self.netMatching = torch.nn.ModuleList([ Matching(intLevel) for intLevel in [ 2, 3, 4, 5, 6 ] ]) self.netSubpixel = torch.nn.ModuleList([ Subpixel(intLevel) for intLevel in [ 2, 3, 4, 5, 6 ] ]) self.netRegularization = torch.nn.ModuleList([ Regularization(intLevel) for intLevel in [ 2, 3, 4, 5, 6 ] ]) self.load_state_dict({ strKey.replace('module', 'net'): tenWeight for strKey, tenWeight in torch.hub.load_state_dict_from_url(url='http://content.sniklaus.com/github/pytorch-liteflownet/network-' + arguments_strModel + '.pytorch').items() }) # self.load_state_dict(torch.load('./liteflownet/network-default.pth')) # end def forward(self, tenOne, tenTwo): tenOne[:, 0, :, :] = tenOne[:, 0, :, :] - 0.411618 tenOne[:, 1, :, :] = tenOne[:, 1, :, :] - 0.434631 tenOne[:, 2, :, :] = tenOne[:, 2, :, :] - 0.454253 tenTwo[:, 0, :, :] = tenTwo[:, 0, :, :] - 0.410782 tenTwo[:, 1, :, :] = tenTwo[:, 1, :, :] - 0.433645 tenTwo[:, 2, :, :] = tenTwo[:, 2, :, :] - 0.452793 tenFeaturesOne = self.netFeatures(tenOne) tenFeaturesTwo = self.netFeatures(tenTwo) tenOne = [ tenOne ] tenTwo = [ tenTwo 
] for intLevel in [ 1, 2, 3, 4, 5 ]: tenOne.append(torch.nn.functional.interpolate(input=tenOne[-1], size=(tenFeaturesOne[intLevel].shape[2], tenFeaturesOne[intLevel].shape[3]), mode='bilinear', align_corners=False)) tenTwo.append(torch.nn.functional.interpolate(input=tenTwo[-1], size=(tenFeaturesTwo[intLevel].shape[2], tenFeaturesTwo[intLevel].shape[3]), mode='bilinear', align_corners=False)) # end tenFlow = None for intLevel in [ -1, -2, -3, -4, -5 ]: tenFlow = self.netMatching[intLevel](tenOne[intLevel], tenTwo[intLevel], tenFeaturesOne[intLevel], tenFeaturesTwo[intLevel], tenFlow) tenFlow = self.netSubpixel[intLevel](tenOne[intLevel], tenTwo[intLevel], tenFeaturesOne[intLevel], tenFeaturesTwo[intLevel], tenFlow) tenFlow = self.netRegularization[intLevel](tenOne[intLevel], tenTwo[intLevel], tenFeaturesOne[intLevel], tenFeaturesTwo[intLevel], tenFlow) # end return tenFlow * 20.0 # end # end netNetwork = None ########################################################## def estimate(tenOne, tenTwo): global netNetwork if netNetwork is None: netNetwork = Network().cuda().eval() # end assert(tenOne.shape[1] == tenTwo.shape[1]) assert(tenOne.shape[2] == tenTwo.shape[2]) intWidth = tenOne.shape[2] intHeight = tenOne.shape[1] # assert(intWidth == 1024) # remember that there is no guarantee for correctness, comment this line out if you acknowledge this and want to continue # assert(intHeight == 436) # remember that there is no guarantee for correctness, comment this line out if you acknowledge this and want to continue tenPreprocessedOne = tenOne.cuda().view(1, 3, intHeight, intWidth) tenPreprocessedTwo = tenTwo.cuda().view(1, 3, intHeight, intWidth) intPreprocessedWidth = int(math.floor(math.ceil(intWidth / 32.0) * 32.0)) intPreprocessedHeight = int(math.floor(math.ceil(intHeight / 32.0) * 32.0)) tenPreprocessedOne = torch.nn.functional.interpolate(input=tenPreprocessedOne, size=(intPreprocessedHeight, intPreprocessedWidth), mode='bilinear', align_corners=False) 
class Loss(nn.Module):
    '''
    Base class for weighted losses that pick their inputs out of keyword
    arguments. Subclasses implement `_forward`.
    '''

    def __init__(self, loss_weight, keys, mapping=None) -> None:
        '''
        loss_weight: factor multiplied onto the value returned by `_forward`.
        keys: names of the keyword arguments forwarded to `_forward`.
        mapping: map the kwargs keys into desired ones. When a dict is given,
            only the entries whose target name is in `keys` are kept.
        '''
        super().__init__()
        self.loss_weight = loss_weight
        self.keys = keys
        self.mapping = mapping
        if isinstance(mapping, dict):
            # Iterate over (source, target) pairs. Iterating the dict itself
            # yields only its keys, which made the unpacking raise ValueError
            # (or split two-character keys into bogus pairs).
            self.mapping = {k: v for k, v in mapping.items() if v in keys}

    def forward(self, **kwargs):
        '''
        Select the kwargs named in `keys`, add the renamed ones described by
        `mapping`, and return `_forward(**selected) * loss_weight`.
        '''
        params = {k: v for k, v in kwargs.items() if k in self.keys}
        if self.mapping is not None:
            for k, v in kwargs.items():
                target = self.mapping.get(k)
                if target is not None:
                    # A mapped kwarg overrides a directly supplied one.
                    params[target] = v

        return self._forward(**params) * self.loss_weight

    def _forward(self, **kwargs):
        # Placeholder; subclasses override this with the actual loss.
        pass
class GeometryLoss(Loss):
    """Patch-difference loss accumulated over paired feature maps.

    Each map is turned into normalized differences between every pixel and
    the values in its ``patch_size`` x ``patch_size`` neighbourhood; the two
    descriptions are compared with a bounded squared distance.
    """

    def __init__(self, loss_weight, keys, patch_size=3):
        super().__init__(loss_weight, keys)
        self.patch_size = patch_size
        n_taps = patch_size * patch_size
        # One-hot kernels: output channel i copies the i-th patch position.
        kernels = np.eye(n_taps).reshape((patch_size, patch_size, 1, n_taps))
        kernels = np.transpose(kernels, (3, 2, 0, 1))
        self.w = torch.tensor(kernels).float()

    def transform(self, tensor):
        """Return the normalized neighbourhood differences of ``tensor``."""
        b, c, h, w = tensor.size()
        self.w = self.w.to(tensor.device)
        # Treat every channel as its own single-channel image.
        flat = tensor.reshape(b*c, 1, h, w)
        neighbours = F.conv2d(flat, self.w, padding=self.patch_size // 2, bias=None)
        delta = (neighbours - flat).reshape(b, c*(self.patch_size ** 2), h, w)
        return delta / torch.sqrt(0.81 + delta ** 2)

    def valid_mask(self, tensor):
        """Return a mask that is 1 inside and 0 on a border of half a patch."""
        border = self.patch_size // 2
        b, c, h, w = tensor.size()
        interior = torch.ones(b, 1, h - 2 * border, w - 2 * border).type_as(tensor)
        return F.pad(interior, [border] * 4)

    def _forward(self, ft_pred, ft_gt):
        total = 0.
        for pred, gt in zip(ft_pred, ft_gt):
            diff = self.transform(pred) - self.transform(gt)
            sq = diff ** 2
            dist = (sq / (0.1 + sq)).mean(dim=1, keepdim=True)
            total = total + (dist * self.valid_mask(pred)).mean()
        return total
def ssim(img1, img2, window_size=11, window=None, size_average=True, full=False, val_range=None):
    """Compute the structural similarity (SSIM) of two image batches.

    The local statistics are taken with ``window`` after replicate-padding
    the inputs. The padding is half the window's kernel size; it used to be
    hard-coded to 5, which is only right for an 11-tap window and produced a
    wrongly sized map for any other window.

    Args:
        img1, img2: tensors indexed as (N, C, H, W).
        window_size: requested window size when ``window`` is not given; it
            is clipped to the image height and width.
        window: optional convolution kernel used with ``groups=C``.
        size_average: return one scalar if true, otherwise one value per
            batch element.
        full: also return the mean contrast-structure term.
        val_range: dynamic range of the data; guessed from ``img1`` if None.

    Returns:
        ``ret`` or ``(ret, cs)`` when ``full`` is true.
    """
    if val_range is None:
        # Guess the dynamic range from the values found in img1.
        max_val = 255 if torch.max(img1) > 128 else 1
        min_val = -1 if torch.min(img1) < -0.5 else 0
        L = max_val - min_val
    else:
        L = val_range

    (_, channel, height, width) = img1.size()
    if window is None:
        real_size = min(window_size, height, width)
        window = create_window(real_size, channel=channel).to(img1.device)

    # Half-kernel padding per axis, in F.pad order (left, right, top, bottom).
    pad_h = window.shape[-2] // 2
    pad_w = window.shape[-1] // 2

    def _local_mean(x):
        if pad_h or pad_w:
            x = F.pad(x, (pad_w, pad_w, pad_h, pad_h), mode='replicate')
        return F.conv2d(x, window, padding=0, groups=channel)

    mu1 = _local_mean(img1)
    mu2 = _local_mean(img2)

    mu1_sq = mu1.pow(2)
    mu2_sq = mu2.pow(2)
    mu1_mu2 = mu1 * mu2

    sigma1_sq = _local_mean(img1 * img1) - mu1_sq
    sigma2_sq = _local_mean(img2 * img2) - mu2_sq
    sigma12 = _local_mean(img1 * img2) - mu1_mu2

    C1 = (0.01 * L) ** 2
    C2 = (0.03 * L) ** 2

    v1 = 2.0 * sigma12 + C2
    v2 = sigma1_sq + sigma2_sq + C2
    cs = torch.mean(v1 / v2)

    ssim_map = ((2 * mu1_mu2 + C1) * v1) / ((mu1_sq + mu2_sq + C1) * v2)

    if size_average:
        ret = ssim_map.mean()
    else:
        ret = ssim_map.mean(1).mean(1).mean(1)

    if full:
        return ret, cs
    return ret
def calculate_psnr(img1, img2):
    """Return ``-10 * log10(MSE)`` between two tensors as a NumPy value.

    The mean squared error is taken over all elements. No peak term is
    added, so this matches the usual PSNR only for data with a peak of 1.
    """
    residual = img1 - img2
    mse = (residual * residual).mean()
    psnr = -10 * torch.log10(mse)
    return psnr.detach().cpu().numpy()
self.encoder = Encoder(channels, large=True) self.decoder4 = InitDecoder(channels[3], channels[2], skip_channels) self.decoder3 = IntermediateDecoder(channels[2], channels[1], skip_channels) self.decoder2 = IntermediateDecoder(channels[1], channels[0], skip_channels) self.decoder1 = MultiFlowDecoder(channels[0], skip_channels, num_flows) self.update4 = self._get_updateblock(112, None) self.update3_low = self._get_updateblock(96, 2.0) self.update2_low = self._get_updateblock(84, 4.0) self.update3_high = self._get_updateblock(96, None) self.update2_high = self._get_updateblock(84, None) self.comb_block = nn.Sequential( nn.Conv2d(3*self.num_flows, 6*self.num_flows, 7, 1, 3), nn.PReLU(6*self.num_flows), nn.Conv2d(6*self.num_flows, 3, 7, 1, 3), ) def _get_updateblock(self, cdim, scale_factor=None): return BasicUpdateBlock(cdim=cdim, hidden_dim=192, flow_dim=64, corr_dim=256, corr_dim2=192, fc_dim=188, scale_factor=scale_factor, corr_levels=self.corr_levels, radius=self.radius) def _corr_scale_lookup(self, corr_fn, coord, flow0, flow1, embt, downsample=1): # convert t -> 0 to 0 -> 1 | convert t -> 1 to 1 -> 0 # based on linear assumption t1_scale = 1. / embt t0_scale = 1. / (1. 
- embt) if downsample != 1: inv = 1 / downsample flow0 = inv * resize(flow0, scale_factor=inv) flow1 = inv * resize(flow1, scale_factor=inv) corr0, corr1 = corr_fn(coord + flow1 * t1_scale, coord + flow0 * t0_scale) corr = torch.cat([corr0, corr1], dim=1) flow = torch.cat([flow0, flow1], dim=1) return corr, flow def forward(self, img0, img1, embt, scale_factor=1.0, eval=False, **kwargs): mean_ = torch.cat([img0, img1], 2).mean(1, keepdim=True).mean(2, keepdim=True).mean(3, keepdim=True) img0 = img0 - mean_ img1 = img1 - mean_ img0_ = resize(img0, scale_factor) if scale_factor != 1.0 else img0 img1_ = resize(img1, scale_factor) if scale_factor != 1.0 else img1 b, _, h, w = img0_.shape coord = coords_grid(b, h // 8, w // 8, img0.device) fmap0, fmap1 = self.feat_encoder([img0_, img1_]) # [1, 128, H//8, W//8] corr_fn = BidirCorrBlock(fmap0, fmap1, radius=self.radius, num_levels=self.corr_levels) # f0_1: [1, c0, H//2, W//2] | f0_2: [1, c1, H//4, W//4] # f0_3: [1, c2, H//8, W//8] | f0_4: [1, c3, H//16, W//16] f0_1, f0_2, f0_3, f0_4 = self.encoder(img0_) f1_1, f1_2, f1_3, f1_4 = self.encoder(img1_) ######################################### the 4th decoder ######################################### up_flow0_4, up_flow1_4, ft_3_ = self.decoder4(f0_4, f1_4, embt) corr_4, flow_4 = self._corr_scale_lookup(corr_fn, coord, up_flow0_4, up_flow1_4, embt, downsample=1) # residue update with lookup corr delta_ft_3_, delta_flow_4 = self.update4(ft_3_, flow_4, corr_4) delta_flow0_4, delta_flow1_4 = torch.chunk(delta_flow_4, 2, 1) up_flow0_4 = up_flow0_4 + delta_flow0_4 up_flow1_4 = up_flow1_4 + delta_flow1_4 ft_3_ = ft_3_ + delta_ft_3_ ######################################### the 3rd decoder ######################################### up_flow0_3, up_flow1_3, ft_2_ = self.decoder3(ft_3_, f0_3, f1_3, up_flow0_4, up_flow1_4) corr_3, flow_3 = self._corr_scale_lookup(corr_fn, coord, up_flow0_3, up_flow1_3, embt, downsample=2) # residue update with lookup corr delta_ft_2_, delta_flow_3 = 
self.update3_low(ft_2_, flow_3, corr_3) delta_flow0_3, delta_flow1_3 = torch.chunk(delta_flow_3, 2, 1) up_flow0_3 = up_flow0_3 + delta_flow0_3 up_flow1_3 = up_flow1_3 + delta_flow1_3 ft_2_ = ft_2_ + delta_ft_2_ # residue update with lookup corr (hr) corr_3 = resize(corr_3, scale_factor=2.0) up_flow_3 = torch.cat([up_flow0_3, up_flow1_3], dim=1) delta_ft_2_, delta_up_flow_3 = self.update3_high(ft_2_, up_flow_3, corr_3) ft_2_ += delta_ft_2_ up_flow0_3 += delta_up_flow_3[:, 0:2] up_flow1_3 += delta_up_flow_3[:, 2:4] ######################################### the 2nd decoder ######################################### up_flow0_2, up_flow1_2, ft_1_ = self.decoder2(ft_2_, f0_2, f1_2, up_flow0_3, up_flow1_3) corr_2, flow_2 = self._corr_scale_lookup(corr_fn, coord, up_flow0_2, up_flow1_2, embt, downsample=4) # residue update with lookup corr delta_ft_1_, delta_flow_2 = self.update2_low(ft_1_, flow_2, corr_2) delta_flow0_2, delta_flow1_2 = torch.chunk(delta_flow_2, 2, 1) up_flow0_2 = up_flow0_2 + delta_flow0_2 up_flow1_2 = up_flow1_2 + delta_flow1_2 ft_1_ = ft_1_ + delta_ft_1_ # residue update with lookup corr (hr) corr_2 = resize(corr_2, scale_factor=4.0) up_flow_2 = torch.cat([up_flow0_2, up_flow1_2], dim=1) delta_ft_1_, delta_up_flow_2 = self.update2_high(ft_1_, up_flow_2, corr_2) ft_1_ += delta_ft_1_ up_flow0_2 += delta_up_flow_2[:, 0:2] up_flow1_2 += delta_up_flow_2[:, 2:4] ######################################### the 1st decoder ######################################### up_flow0_1, up_flow1_1, mask, img_res = self.decoder1(ft_1_, f0_1, f1_1, up_flow0_2, up_flow1_2) if scale_factor != 1.0: up_flow0_1 = resize(up_flow0_1, scale_factor=(1.0/scale_factor)) * (1.0/scale_factor) up_flow1_1 = resize(up_flow1_1, scale_factor=(1.0/scale_factor)) * (1.0/scale_factor) mask = resize(mask, scale_factor=(1.0/scale_factor)) img_res = resize(img_res, scale_factor=(1.0/scale_factor)) # Merge multiple predictions imgt_pred = multi_flow_combine(self.comb_block, img0, img1, up_flow0_1, 
class Model(nn.Module):
    """AMT-L frame interpolation model.

    Combines a correlation feature encoder (``BasicEncoder``), a four-level
    pyramid ``Encoder``, three coarse-to-fine decoders with correlation-based
    residual updates, and a multi-flow decoder whose warped candidates are
    merged by ``comb_block``.

    Args:
        corr_radius: lookup radius handed to the correlation block.
        corr_lvls: number of correlation pyramid levels.
        num_flows: number of flow pairs predicted by the last decoder.
        channels: channel widths of the four pyramid levels.
        skip_channels: side-channel width used by the decoders' ResBlocks.
    """

    def __init__(self,
                 corr_radius=3,
                 corr_lvls=4,
                 num_flows=5,
                 channels=[48, 64, 72, 128],
                 skip_channels=48):
        super(Model, self).__init__()
        self.radius = corr_radius
        self.corr_levels = corr_lvls
        self.num_flows = num_flows

        self.feat_encoder = BasicEncoder(output_dim=128, norm_fn='instance', dropout=0.)
        # Built from `channels` (not a hard-coded copy of the default) so the
        # encoder always matches the decoders configured below.
        self.encoder = Encoder(channels, large=True)

        self.decoder4 = InitDecoder(channels[3], channels[2], skip_channels)
        self.decoder3 = IntermediateDecoder(channels[2], channels[1], skip_channels)
        self.decoder2 = IntermediateDecoder(channels[1], channels[0], skip_channels)
        self.decoder1 = MultiFlowDecoder(channels[0], skip_channels, num_flows)

        # Each update block refines the feature produced by the decoder above
        # it, so its context width is that decoder's output width.
        self.update4 = self._get_updateblock(channels[2], None)
        self.update3 = self._get_updateblock(channels[1], 2.0)
        self.update2 = self._get_updateblock(channels[0], 4.0)

        self.comb_block = nn.Sequential(
            nn.Conv2d(3*self.num_flows, 6*self.num_flows, 7, 1, 3),
            nn.PReLU(6*self.num_flows),
            nn.Conv2d(6*self.num_flows, 3, 7, 1, 3),
        )

    def _get_updateblock(self, cdim, scale_factor=None):
        """Create a ``BasicUpdateBlock`` for a context feature of width ``cdim``."""
        return BasicUpdateBlock(cdim=cdim, hidden_dim=128, flow_dim=48,
                                corr_dim=256, corr_dim2=160, fc_dim=124,
                                scale_factor=scale_factor, corr_levels=self.corr_levels,
                                radius=self.radius)

    def _corr_scale_lookup(self, corr_fn, coord, flow0, flow1, embt, downsample=1):
        """Look up correlation features at positions displaced by the flows.

        Returns ``(corr, flow)``: both correlation lookups and both flows,
        each concatenated along dim 1.
        """
        # convert t -> 0 to 0 -> 1 | convert t -> 1 to 1 -> 0
        # based on linear assumption
        t1_scale = 1. / embt
        t0_scale = 1. / (1. - embt)
        if downsample != 1:
            inv = 1 / downsample
            flow0 = inv * resize(flow0, scale_factor=inv)
            flow1 = inv * resize(flow1, scale_factor=inv)

        corr0, corr1 = corr_fn(coord + flow1 * t1_scale, coord + flow0 * t0_scale)
        corr = torch.cat([corr0, corr1], dim=1)
        flow = torch.cat([flow0, flow1], dim=1)
        return corr, flow

    def forward(self, img0, img1, embt, scale_factor=1.0, eval=False, **kwargs):
        """Predict the intermediate frame between ``img0`` and ``img1``.

        Args:
            img0, img1: the two input frames.
            embt: time embedding of the target frame (used as ``1 / embt`` and
                ``1 / (1 - embt)``, so values 0 and 1 give infinite scales).
            scale_factor: optional resize factor applied to the inputs before
                estimation; predictions are resized back afterwards.
            eval: when true, only the predicted frame is returned.

        Returns:
            A dict with ``'imgt_pred'``; when ``eval`` is false it also holds
            ``'flow0_pred'``, ``'flow1_pred'`` and ``'ft_pred'`` lists.
        """
        # Per-sample mean over both frames, removed here and re-added on merge.
        mean_ = torch.cat([img0, img1], 2).mean(1, keepdim=True).mean(2, keepdim=True).mean(3, keepdim=True)
        img0 = img0 - mean_
        img1 = img1 - mean_
        img0_ = resize(img0, scale_factor) if scale_factor != 1.0 else img0
        img1_ = resize(img1, scale_factor) if scale_factor != 1.0 else img1
        b, _, h, w = img0_.shape
        coord = coords_grid(b, h // 8, w // 8, img0.device)

        fmap0, fmap1 = self.feat_encoder([img0_, img1_])  # [1, 128, H//8, W//8]
        corr_fn = BidirCorrBlock(fmap0, fmap1, radius=self.radius, num_levels=self.corr_levels)

        # f0_1: [1, c0, H//2, W//2] | f0_2: [1, c1, H//4, W//4]
        # f0_3: [1, c2, H//8, W//8] | f0_4: [1, c3, H//16, W//16]
        f0_1, f0_2, f0_3, f0_4 = self.encoder(img0_)
        f1_1, f1_2, f1_3, f1_4 = self.encoder(img1_)

        ######################################### the 4th decoder #########################################
        up_flow0_4, up_flow1_4, ft_3_ = self.decoder4(f0_4, f1_4, embt)
        corr_4, flow_4 = self._corr_scale_lookup(corr_fn, coord,
                                                 up_flow0_4, up_flow1_4,
                                                 embt, downsample=1)

        # residue update with lookup corr
        delta_ft_3_, delta_flow_4 = self.update4(ft_3_, flow_4, corr_4)
        delta_flow0_4, delta_flow1_4 = torch.chunk(delta_flow_4, 2, 1)
        up_flow0_4 = up_flow0_4 + delta_flow0_4
        up_flow1_4 = up_flow1_4 + delta_flow1_4
        ft_3_ = ft_3_ + delta_ft_3_

        ######################################### the 3rd decoder #########################################
        up_flow0_3, up_flow1_3, ft_2_ = self.decoder3(ft_3_, f0_3, f1_3, up_flow0_4, up_flow1_4)
        corr_3, flow_3 = self._corr_scale_lookup(corr_fn, coord,
                                                 up_flow0_3, up_flow1_3,
                                                 embt, downsample=2)

        # residue update with lookup corr
        delta_ft_2_, delta_flow_3 = self.update3(ft_2_, flow_3, corr_3)
        delta_flow0_3, delta_flow1_3 = torch.chunk(delta_flow_3, 2, 1)
        up_flow0_3 = up_flow0_3 + delta_flow0_3
        up_flow1_3 = up_flow1_3 + delta_flow1_3
        ft_2_ = ft_2_ + delta_ft_2_

        ######################################### the 2nd decoder #########################################
        up_flow0_2, up_flow1_2, ft_1_ = self.decoder2(ft_2_, f0_2, f1_2, up_flow0_3, up_flow1_3)
        corr_2, flow_2 = self._corr_scale_lookup(corr_fn, coord,
                                                 up_flow0_2, up_flow1_2,
                                                 embt, downsample=4)

        # residue update with lookup corr
        delta_ft_1_, delta_flow_2 = self.update2(ft_1_, flow_2, corr_2)
        delta_flow0_2, delta_flow1_2 = torch.chunk(delta_flow_2, 2, 1)
        up_flow0_2 = up_flow0_2 + delta_flow0_2
        up_flow1_2 = up_flow1_2 + delta_flow1_2
        ft_1_ = ft_1_ + delta_ft_1_

        ######################################### the 1st decoder #########################################
        up_flow0_1, up_flow1_1, mask, img_res = self.decoder1(ft_1_, f0_1, f1_1, up_flow0_2, up_flow1_2)

        if scale_factor != 1.0:
            # Undo the input resize: upsample the maps and rescale flow magnitudes.
            up_flow0_1 = resize(up_flow0_1, scale_factor=(1.0/scale_factor)) * (1.0/scale_factor)
            up_flow1_1 = resize(up_flow1_1, scale_factor=(1.0/scale_factor)) * (1.0/scale_factor)
            mask = resize(mask, scale_factor=(1.0/scale_factor))
            img_res = resize(img_res, scale_factor=(1.0/scale_factor))

        # Merge multiple predictions
        imgt_pred = multi_flow_combine(self.comb_block, img0, img1, up_flow0_1, up_flow1_1,
                                       mask, img_res, mean_)
        imgt_pred = torch.clamp(imgt_pred, 0, 1)

        if eval:
            return {'imgt_pred': imgt_pred, }

        # Use the flows' own spatial size: after the resize above it differs
        # from the scaled (h, w) whenever scale_factor != 1.
        flow_h, flow_w = up_flow0_1.shape[-2:]
        up_flow0_1 = up_flow0_1.reshape(b, self.num_flows, 2, flow_h, flow_w)
        up_flow1_1 = up_flow1_1.reshape(b, self.num_flows, 2, flow_h, flow_w)
        return {
            'imgt_pred': imgt_pred,
            'flow0_pred': [up_flow0_1, up_flow0_2, up_flow0_3, up_flow0_4],
            'flow1_pred': [up_flow1_1, up_flow1_2, up_flow1_3, up_flow1_4],
            'ft_pred': [ft_1_, ft_2_, ft_3_],
        }
class Model(nn.Module):
    """AMT-S frame interpolation model.

    Combines a correlation feature encoder (``SmallEncoder``), a four-level
    pyramid ``Encoder``, three coarse-to-fine decoders with correlation-based
    residual updates, and a multi-flow decoder whose warped candidates are
    merged by ``comb_block``.

    Args:
        corr_radius: lookup radius handed to the correlation block.
        corr_lvls: number of correlation pyramid levels.
        num_flows: number of flow pairs predicted by the last decoder.
        channels: channel widths of the four pyramid levels.
        skip_channels: side-channel width used by the decoders' ResBlocks.
    """

    def __init__(self,
                 corr_radius=3,
                 corr_lvls=4,
                 num_flows=3,
                 channels=[20, 32, 44, 56],
                 skip_channels=20):
        super(Model, self).__init__()
        self.radius = corr_radius
        self.corr_levels = corr_lvls
        self.num_flows = num_flows
        self.channels = channels
        self.skip_channels = skip_channels

        self.feat_encoder = SmallEncoder(output_dim=84, norm_fn='instance', dropout=0.)
        self.encoder = Encoder(channels)

        self.decoder4 = InitDecoder(channels[3], channels[2], skip_channels)
        self.decoder3 = IntermediateDecoder(channels[2], channels[1], skip_channels)
        self.decoder2 = IntermediateDecoder(channels[1], channels[0], skip_channels)
        self.decoder1 = MultiFlowDecoder(channels[0], skip_channels, num_flows)

        # Context widths follow `channels` (previously hard-coded 44/32/20) so
        # a non-default `channels` keeps update blocks and decoders in sync.
        self.update4 = self._get_updateblock(channels[2])
        self.update3 = self._get_updateblock(channels[1], 2)
        self.update2 = self._get_updateblock(channels[0], 4)

        self.comb_block = nn.Sequential(
            nn.Conv2d(3*num_flows, 6*num_flows, 3, 1, 1),
            nn.PReLU(6*num_flows),
            nn.Conv2d(6*num_flows, 3, 3, 1, 1),
        )

    def _get_updateblock(self, cdim, scale_factor=None):
        """Create a ``SmallUpdateBlock`` for a context feature of width ``cdim``."""
        return SmallUpdateBlock(cdim=cdim, hidden_dim=76, flow_dim=20, corr_dim=64,
                                fc_dim=68, scale_factor=scale_factor,
                                corr_levels=self.corr_levels, radius=self.radius)

    def _corr_scale_lookup(self, corr_fn, coord, flow0, flow1, embt, downsample=1):
        """Look up correlation features at positions displaced by the flows.

        Returns ``(corr, flow)``: both correlation lookups and both flows,
        each concatenated along dim 1.
        """
        # convert t -> 0 to 0 -> 1 | convert t -> 1 to 1 -> 0
        # based on linear assumption
        t1_scale = 1. / embt
        t0_scale = 1. / (1. - embt)
        if downsample != 1:
            inv = 1 / downsample
            flow0 = inv * resize(flow0, scale_factor=inv)
            flow1 = inv * resize(flow1, scale_factor=inv)

        corr0, corr1 = corr_fn(coord + flow1 * t1_scale, coord + flow0 * t0_scale)
        corr = torch.cat([corr0, corr1], dim=1)
        flow = torch.cat([flow0, flow1], dim=1)
        return corr, flow

    def forward(self, img0, img1, embt, scale_factor=1.0, eval=False, **kwargs):
        """Predict the intermediate frame between ``img0`` and ``img1``.

        Args:
            img0, img1: the two input frames.
            embt: time embedding of the target frame (used as ``1 / embt`` and
                ``1 / (1 - embt)``, so values 0 and 1 give infinite scales).
            scale_factor: optional resize factor applied to the inputs before
                estimation; predictions are resized back afterwards.
            eval: when true, only the predicted frame is returned.

        Returns:
            A dict with ``'imgt_pred'``; when ``eval`` is false it also holds
            ``'flow0_pred'``, ``'flow1_pred'`` and ``'ft_pred'`` lists.
        """
        # Per-sample mean over both frames, removed here and re-added on merge.
        mean_ = torch.cat([img0, img1], 2).mean(1, keepdim=True).mean(2, keepdim=True).mean(3, keepdim=True)
        img0 = img0 - mean_
        img1 = img1 - mean_
        img0_ = resize(img0, scale_factor) if scale_factor != 1.0 else img0
        img1_ = resize(img1, scale_factor) if scale_factor != 1.0 else img1
        b, _, h, w = img0_.shape
        coord = coords_grid(b, h // 8, w // 8, img0.device)

        # The feature encoder is built with output_dim=84.
        fmap0, fmap1 = self.feat_encoder([img0_, img1_])  # [1, 84, H//8, W//8]
        corr_fn = BidirCorrBlock(fmap0, fmap1, radius=self.radius, num_levels=self.corr_levels)

        # f0_1: [1, c0, H//2, W//2] | f0_2: [1, c1, H//4, W//4]
        # f0_3: [1, c2, H//8, W//8] | f0_4: [1, c3, H//16, W//16]
        f0_1, f0_2, f0_3, f0_4 = self.encoder(img0_)
        f1_1, f1_2, f1_3, f1_4 = self.encoder(img1_)

        ######################################### the 4th decoder #########################################
        up_flow0_4, up_flow1_4, ft_3_ = self.decoder4(f0_4, f1_4, embt)
        corr_4, flow_4 = self._corr_scale_lookup(corr_fn, coord,
                                                 up_flow0_4, up_flow1_4,
                                                 embt, downsample=1)

        # residue update with lookup corr
        delta_ft_3_, delta_flow_4 = self.update4(ft_3_, flow_4, corr_4)
        delta_flow0_4, delta_flow1_4 = torch.chunk(delta_flow_4, 2, 1)
        up_flow0_4 = up_flow0_4 + delta_flow0_4
        up_flow1_4 = up_flow1_4 + delta_flow1_4
        ft_3_ = ft_3_ + delta_ft_3_

        ######################################### the 3rd decoder #########################################
        up_flow0_3, up_flow1_3, ft_2_ = self.decoder3(ft_3_, f0_3, f1_3, up_flow0_4, up_flow1_4)
        corr_3, flow_3 = self._corr_scale_lookup(corr_fn, coord,
                                                 up_flow0_3, up_flow1_3,
                                                 embt, downsample=2)

        # residue update with lookup corr
        delta_ft_2_, delta_flow_3 = self.update3(ft_2_, flow_3, corr_3)
        delta_flow0_3, delta_flow1_3 = torch.chunk(delta_flow_3, 2, 1)
        up_flow0_3 = up_flow0_3 + delta_flow0_3
        up_flow1_3 = up_flow1_3 + delta_flow1_3
        ft_2_ = ft_2_ + delta_ft_2_

        ######################################### the 2nd decoder #########################################
        up_flow0_2, up_flow1_2, ft_1_ = self.decoder2(ft_2_, f0_2, f1_2, up_flow0_3, up_flow1_3)
        corr_2, flow_2 = self._corr_scale_lookup(corr_fn, coord,
                                                 up_flow0_2, up_flow1_2,
                                                 embt, downsample=4)

        # residue update with lookup corr
        delta_ft_1_, delta_flow_2 = self.update2(ft_1_, flow_2, corr_2)
        delta_flow0_2, delta_flow1_2 = torch.chunk(delta_flow_2, 2, 1)
        up_flow0_2 = up_flow0_2 + delta_flow0_2
        up_flow1_2 = up_flow1_2 + delta_flow1_2
        ft_1_ = ft_1_ + delta_ft_1_

        ######################################### the 1st decoder #########################################
        up_flow0_1, up_flow1_1, mask, img_res = self.decoder1(ft_1_, f0_1, f1_1, up_flow0_2, up_flow1_2)

        if scale_factor != 1.0:
            # Undo the input resize: upsample the maps and rescale flow magnitudes.
            up_flow0_1 = resize(up_flow0_1, scale_factor=(1.0/scale_factor)) * (1.0/scale_factor)
            up_flow1_1 = resize(up_flow1_1, scale_factor=(1.0/scale_factor)) * (1.0/scale_factor)
            mask = resize(mask, scale_factor=(1.0/scale_factor))
            img_res = resize(img_res, scale_factor=(1.0/scale_factor))

        # Merge multiple predictions
        imgt_pred = multi_flow_combine(self.comb_block, img0, img1, up_flow0_1, up_flow1_1,
                                       mask, img_res, mean_)
        imgt_pred = torch.clamp(imgt_pred, 0, 1)

        if eval:
            return {'imgt_pred': imgt_pred, }

        # Use the flows' own spatial size: after the resize above it differs
        # from the scaled (h, w) whenever scale_factor != 1.
        flow_h, flow_w = up_flow0_1.shape[-2:]
        up_flow0_1 = up_flow0_1.reshape(b, self.num_flows, 2, flow_h, flow_w)
        up_flow1_1 = up_flow1_1.reshape(b, self.num_flows, 2, flow_h, flow_w)
        return {
            'imgt_pred': imgt_pred,
            'flow0_pred': [up_flow0_1, up_flow0_2, up_flow0_3, up_flow0_4],
            'flow1_pred': [up_flow1_1, up_flow1_2, up_flow1_3, up_flow1_4],
            'ft_pred': [ft_1_, ft_2_, ft_3_],
        }
class ResidualBlock(nn.Module):
    """Two 3x3 convolutions with a residual connection.

    Args:
        in_planes: number of input channels.
        planes: number of output channels.
        norm_fn: one of ``'group'``, ``'batch'``, ``'instance'`` or ``'none'``.
        stride: stride of the first convolution; when it is not 1 the skip
            path is projected by a strided 1x1 convolution plus a norm layer.

    Raises:
        ValueError: if ``norm_fn`` is not one of the supported names.
    """

    def __init__(self, in_planes, planes, norm_fn='group', stride=1):
        super(ResidualBlock, self).__init__()

        norm_factories = {
            'group': lambda: nn.GroupNorm(num_groups=planes // 8, num_channels=planes),
            'batch': lambda: nn.BatchNorm2d(planes),
            'instance': lambda: nn.InstanceNorm2d(planes),
            'none': nn.Sequential,
        }
        # Fail early with a clear message instead of leaving the norm layers
        # undefined and hitting an AttributeError later.
        if norm_fn not in norm_factories:
            raise ValueError(
                f"Unsupported norm_fn {norm_fn!r}; expected one of {sorted(norm_factories)}"
            )
        make_norm = norm_factories[norm_fn]

        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, padding=1, stride=stride)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1)
        self.relu = nn.ReLU(inplace=True)

        self.norm1 = make_norm()
        self.norm2 = make_norm()

        if stride == 1:
            self.downsample = None
        else:
            # norm3 stays registered both as an attribute and inside
            # `downsample`, which keeps the state-dict keys unchanged.
            self.norm3 = make_norm()
            self.downsample = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3)

    def forward(self, x):
        """Return ``relu(skip(x) + body(x))``."""
        y = x
        y = self.relu(self.norm1(self.conv1(y)))
        y = self.relu(self.norm2(self.conv2(y)))

        if self.downsample is not None:
            x = self.downsample(x)

        return self.relu(x + y)
class BasicEncoder(nn.Module):
    """Residual feature encoder that reduces spatial size by 8.

    A strided 7x7 stem is followed by three residual stages (the last two
    strided) and a 1x1 output convolution producing ``output_dim`` channels.

    Args:
        output_dim: number of output channels.
        norm_fn: one of ``'group'``, ``'batch'``, ``'instance'`` or ``'none'``.
        dropout: ``Dropout2d`` probability applied in training mode; disabled
            when not greater than 0.

    Raises:
        ValueError: if ``norm_fn`` is not one of the supported names.
    """

    def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0):
        super(BasicEncoder, self).__init__()
        self.norm_fn = norm_fn

        if self.norm_fn == 'group':
            self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64)
        elif self.norm_fn == 'batch':
            self.norm1 = nn.BatchNorm2d(64)
        elif self.norm_fn == 'instance':
            self.norm1 = nn.InstanceNorm2d(64)
        elif self.norm_fn == 'none':
            self.norm1 = nn.Sequential()
        else:
            # Previously fell through silently and failed later with an
            # unrelated AttributeError.
            raise ValueError(
                f"Unsupported norm_fn {norm_fn!r}; expected "
                "'group', 'batch', 'instance' or 'none'"
            )

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        self.in_planes = 64
        self.layer1 = self._make_layer(64, stride=1)
        self.layer2 = self._make_layer(72, stride=2)
        self.layer3 = self._make_layer(128, stride=2)

        # output convolution
        self.conv2 = nn.Conv2d(128, output_dim, kernel_size=1)

        self.dropout = None
        if dropout > 0:
            self.dropout = nn.Dropout2d(p=dropout)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                # Non-affine norm layers have no weight/bias to initialise.
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _make_layer(self, dim, stride=1):
        """Build a stage of two ``ResidualBlock``s and advance ``self.in_planes``."""
        layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
        layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
        layers = (layer1, layer2)

        self.in_planes = dim
        return nn.Sequential(*layers)

    def forward(self, x):
        """Encode a tensor, or a list/tuple of tensors in one batched pass.

        For a list/tuple input the tensors are concatenated along the batch
        dimension, encoded together, and returned as a tuple split back to
        each input's own batch size.
        """
        # if input is list, combine batch dimension
        is_list = isinstance(x, (tuple, list))
        if is_list:
            # Remember every input's batch size so any number of inputs,
            # including unequal batch sizes, can be split back correctly.
            split_sizes = [item.shape[0] for item in x]
            x = torch.cat(x, dim=0)

        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu1(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)

        x = self.conv2(x)

        if self.training and self.dropout is not None:
            x = self.dropout(x)

        if is_list:
            x = torch.split(x, split_sizes, dim=0)

        return x
def resize(x, scale_factor):
    """Bilinearly rescale ``x`` by ``scale_factor`` (``align_corners=False``)."""
    return F.interpolate(
        x,
        scale_factor=scale_factor,
        mode="bilinear",
        align_corners=False,
    )


def convrelu(in_channels, out_channels, kernel_size=3, stride=1, padding=1,
             dilation=1, groups=1, bias=True):
    """Return a ``Conv2d`` followed by a per-channel ``PReLU``.

    All convolution arguments are forwarded unchanged; the activation has
    ``out_channels`` learnable slopes.
    """
    conv = nn.Conv2d(
        in_channels,
        out_channels,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        dilation=dilation,
        groups=groups,
        bias=bias,
    )
    activation = nn.PReLU(out_channels)
    return nn.Sequential(conv, activation)
class Encoder(nn.Module):
    """Feature pyramid made of one strided stage per entry in ``channels``.

    Stage ``i`` is registered as ``pyramid{i}`` and consists of a stride-2
    ``convrelu`` followed by a stride-1 ``convrelu``. With ``large=True`` the
    first stage uses a 7x7 kernel (padding 3) instead of 3x3 (padding 1).
    The first stage takes a 3-channel input.
    """

    def __init__(self, channels, large=False):
        super(Encoder, self).__init__()
        self.channels = channels

        in_ch = 3
        for level, out_ch in enumerate(channels, start=1):
            if large and level == 1:
                kernel, pad = 7, 3
            else:
                kernel, pad = 3, 1
            stage = nn.Sequential(
                convrelu(in_ch, out_ch, kernel, 2, pad),
                convrelu(out_ch, out_ch, 3, 1, 1),
            )
            self.register_module(f'pyramid{level}', stage)
            in_ch = out_ch

    def forward(self, in_x):
        """Return the list of stage outputs, from the first stage to the last."""
        features = []
        current = in_x
        for level in range(1, len(self.channels) + 1):
            current = getattr(self, f'pyramid{level}')(current)
            features.append(current)
        return features
def multi_flow_combine(comb_block, img0, img1, flow0, flow1,
                       mask=None, img_res=None, mean=None):
    '''
        A parallel implementation of multiple flow field warping.

        comb_block: An nn.Sequential object applied to the stacked candidates.
        img shape: [b, c, h, w] (the reshapes below assume c == 3)
        flow shape: [b, 2*num_flows, h, w]
        mask (opt): If 'mask' is None, the function conducts a simple average
                    of the two warped images.
        img_res (opt): If 'img_res' is None, the function adds zero instead.
        mean (opt): If 'mean' is None, the function adds zero instead.

        Returns the merged prediction: the mean over the per-flow candidates
        plus the output of comb_block.
    '''
    b, c, h, w = flow0.shape
    num_flows = c // 2
    # Fold the flow index into the batch dimension so every flow is warped
    # in a single call.
    flow0 = flow0.reshape(b, num_flows, 2, h, w).reshape(-1, 2, h, w)
    flow1 = flow1.reshape(b, num_flows, 2, h, w).reshape(-1, 2, h, w)

    if mask is None:
        # Equal weights give the documented simple average; the previous code
        # kept None here and failed on `mask * img0_warp`.
        mask = 0.5
    else:
        mask = mask.reshape(b, num_flows, 1, h, w).reshape(-1, 1, h, w)
    img_res = img_res.reshape(b, num_flows, 3, h, w
                              ).reshape(-1, 3, h, w) if img_res is not None else 0
    img0 = torch.stack([img0] * num_flows, 1).reshape(-1, 3, h, w)
    img1 = torch.stack([img1] * num_flows, 1).reshape(-1, 3, h, w)
    mean = torch.stack([mean] * num_flows, 1).reshape(-1, 1, 1, 1
                                                      ) if mean is not None else 0

    img0_warp = warp(img0, flow0)
    img1_warp = warp(img1, flow1)
    img_warps = mask * img0_warp + (1 - mask) * img1_warp + mean + img_res
    img_warps = img_warps.reshape(b, num_flows, 3, h, w)
    imgt_pred = img_warps.mean(1) + comb_block(img_warps.view(b, -1, h, w))
    return imgt_pred
def bilinear_sampler(img, coords, mask=False):
    """Sample ``img`` at pixel coordinates via ``F.grid_sample``.

    ``coords`` holds (x, y) pixel positions in its last dimension; they are
    mapped to the [-1, 1] range expected by ``grid_sample`` with
    ``align_corners=True``. When ``mask`` is truthy, a float mask of the
    positions lying strictly inside that range is returned as well.
    """
    height, width = img.shape[-2:]
    x_pix, y_pix = coords.split([1, 1], dim=-1)
    x_norm = 2 * x_pix / (width - 1) - 1
    y_norm = 2 * y_pix / (height - 1) - 1

    sampled = F.grid_sample(img, torch.cat([x_norm, y_norm], dim=-1),
                            align_corners=True)
    if not mask:
        return sampled

    inside = (x_norm > -1) & (y_norm > -1) & (x_norm < 1) & (y_norm < 1)
    return sampled, inside.float()


def coords_grid(batch, ht, wd, device):
    """Return a float pixel-coordinate grid of shape ``[batch, 2, ht, wd]``.

    Channel 0 holds the x (column) index and channel 1 the y (row) index.
    """
    rows = torch.arange(ht, device=device)
    cols = torch.arange(wd, device=device)
    yy, xx = torch.meshgrid(rows, cols, indexing='ij')
    grid = torch.stack((xx, yy), dim=0).float()
    return grid[None].repeat(batch, 1, 1, 1)
class BasicUpdateBlock(nn.Module):
    """Update block producing feature and flow deltas from correlation input.

    The correlation input is expected to carry
    ``2 * corr_levels * (2 * radius + 1) ** 2`` channels and the flow input
    4 channels (the input widths of ``convc1`` and ``convf1``).  The flow head
    emits ``4 * out_num`` channels and the feature head ``cdim`` channels.
    When ``scale_factor`` is set, ``net`` is resized by ``1 / scale_factor``
    on entry and both outputs are resized back by ``scale_factor`` on exit.
    """

    def __init__(self, cdim, hidden_dim, flow_dim, corr_dim, corr_dim2,
                 fc_dim, corr_levels=4, radius=3, scale_factor=None, out_num=1):
        super().__init__()
        window = 2 * radius + 1
        cor_planes = corr_levels * window ** 2

        self.scale_factor = scale_factor
        # Correlation encoder.
        self.convc1 = nn.Conv2d(2 * cor_planes, corr_dim, 1, padding=0)
        self.convc2 = nn.Conv2d(corr_dim, corr_dim2, 3, padding=1)
        # Flow encoder.
        self.convf1 = nn.Conv2d(4, flow_dim * 2, 7, padding=3)
        self.convf2 = nn.Conv2d(flow_dim * 2, flow_dim, 3, padding=1)
        # Fusion of the two encodings.
        self.conv = nn.Conv2d(flow_dim + corr_dim2, fc_dim, 3, padding=1)

        # Despite its name, ``gru`` is a plain conv -> LeakyReLU -> conv stack.
        self.gru = self._conv_stack(fc_dim + 4 + cdim, hidden_dim, hidden_dim)
        self.feat_head = self._conv_stack(hidden_dim, hidden_dim, cdim)
        self.flow_head = self._conv_stack(hidden_dim, hidden_dim, 4 * out_num)

        self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)

    @staticmethod
    def _conv_stack(in_ch, mid_ch, out_ch):
        """Build a 3x3 conv -> LeakyReLU(0.1) -> 3x3 conv sequence."""
        return nn.Sequential(
            nn.Conv2d(in_ch, mid_ch, 3, padding=1),
            nn.LeakyReLU(negative_slope=0.1, inplace=True),
            nn.Conv2d(mid_ch, out_ch, 3, padding=1),
        )

    def forward(self, net, flow, corr):
        """Return ``(delta_net, delta_flow)`` for the given state, flow and correlation."""
        factor = self.scale_factor
        if factor is not None:
            net = resize(net, 1 / factor)

        corr_feat = self.lrelu(self.convc1(corr))
        corr_feat = self.lrelu(self.convc2(corr_feat))
        flow_feat = self.lrelu(self.convf1(flow))
        flow_feat = self.lrelu(self.convf2(flow_feat))

        fused = self.lrelu(self.conv(torch.cat([corr_feat, flow_feat], dim=1)))
        hidden = self.gru(torch.cat([fused, flow, net], dim=1))

        delta_net = self.feat_head(hidden)
        delta_flow = self.flow_head(hidden)
        if factor is None:
            return delta_net, delta_flow

        # Resizing the flow field also rescales its magnitudes.
        delta_net = resize(delta_net, scale_factor=factor)
        delta_flow = factor * resize(delta_flow, scale_factor=factor)
        return delta_net, delta_flow
class DenseCaptioning:
    """Thin wrapper holding a device and a lazily created GRiT demo object.

    ``demo`` stays ``None`` until ``initialize_model`` or
    ``initialize_model_det`` is called; the ``run_*`` methods require it.
    """

    def __init__(self, device):
        self.device = device
        self.demo = None

    def initialize_model(self, model_weight):
        """Create the demo object from ``model_weight`` with the default task."""
        self.demo = init_demo(self.device, model_weight=model_weight)

    def initialize_model_det(self, model_weight):
        """Create the demo object from ``model_weight`` with ``task="ObjectDet"``."""
        self.demo = init_demo(
            self.device, model_weight=model_weight, task="ObjectDet")

    def image_dense_caption(self, image_src):
        """Caption ``image_src`` via ``image_caption_api``, echo it, and return it."""
        dense_caption = image_caption_api(image_src, self.device)
        banner = '\033[1;35m' + '*' * 100 + '\033[0m'
        print(banner)
        print("Step2, Dense Caption:\n")
        print(dense_caption)
        print(banner)
        return dense_caption

    def run_caption_api(self, image_src):
        """Read ``image_src`` as BGR, run the demo, and return name-only captions."""
        image = read_image(image_src, format="BGR")
        print(image.shape)
        predictions, _ = self.demo.run_on_image(image)
        return dense_pred_to_caption_only_name(predictions)

    def run_caption_tensor(self, img):
        """Run the demo on ``img``; return ``(caption tuples, visualized output)``."""
        predictions, visualized_output = self.demo.run_on_image(img)
        return dense_pred_to_caption_tuple(predictions), visualized_output

    def run_det_tensor(self, img):
        """Run the demo on ``img``; return ``(predictions, visualized output)``."""
        predictions, visualized_output = self.demo.run_on_image(img)
        return predictions, visualized_output
_C.MODEL.CENTERNET.MIN_RADIUS = 4 _C.MODEL.CENTERNET.SOI = [[0, 80], [64, 160], [128, 320], [256, 640], [512, 10000000]] _C.MODEL.CENTERNET.POS_WEIGHT = 1. _C.MODEL.CENTERNET.NEG_WEIGHT = 1. _C.MODEL.CENTERNET.REG_WEIGHT = 2. _C.MODEL.CENTERNET.HM_FOCAL_BETA = 4 _C.MODEL.CENTERNET.HM_FOCAL_ALPHA = 0.25 _C.MODEL.CENTERNET.LOSS_GAMMA = 2.0 _C.MODEL.CENTERNET.WITH_AGN_HM = False _C.MODEL.CENTERNET.ONLY_PROPOSAL = False _C.MODEL.CENTERNET.AS_PROPOSAL = False _C.MODEL.CENTERNET.IGNORE_HIGH_FP = -1. _C.MODEL.CENTERNET.MORE_POS = False _C.MODEL.CENTERNET.MORE_POS_THRESH = 0.2 _C.MODEL.CENTERNET.MORE_POS_TOPK = 9 _C.MODEL.CENTERNET.NOT_NORM_REG = True _C.MODEL.CENTERNET.NOT_NMS = False _C.MODEL.CENTERNET.NO_REDUCE = False _C.MODEL.ROI_BOX_HEAD.USE_SIGMOID_CE = False _C.MODEL.ROI_BOX_HEAD.PRIOR_PROB = 0.01 _C.MODEL.ROI_BOX_HEAD.USE_EQL_LOSS = False _C.MODEL.ROI_BOX_HEAD.CAT_FREQ_PATH = \ 'datasets/lvis/lvis_v1_train_cat_info.json' _C.MODEL.ROI_BOX_HEAD.EQL_FREQ_CAT = 200 _C.MODEL.ROI_BOX_HEAD.USE_FED_LOSS = False _C.MODEL.ROI_BOX_HEAD.FED_LOSS_NUM_CAT = 50 _C.MODEL.ROI_BOX_HEAD.FED_LOSS_FREQ_WEIGHT = 0.5 _C.MODEL.ROI_BOX_HEAD.MULT_PROPOSAL_SCORE = False _C.MODEL.BIFPN = CN() _C.MODEL.BIFPN.NUM_LEVELS = 5 _C.MODEL.BIFPN.NUM_BIFPN = 6 _C.MODEL.BIFPN.NORM = 'GN' _C.MODEL.BIFPN.OUT_CHANNELS = 160 _C.MODEL.BIFPN.SEPARABLE_CONV = False _C.MODEL.DLA = CN() _C.MODEL.DLA.OUT_FEATURES = ['dla2'] _C.MODEL.DLA.USE_DLA_UP = True _C.MODEL.DLA.NUM_LAYERS = 34 _C.MODEL.DLA.MS_OUTPUT = False _C.MODEL.DLA.NORM = 'BN' _C.MODEL.DLA.DLAUP_IN_FEATURES = ['dla3', 'dla4', 'dla5'] _C.MODEL.DLA.DLAUP_NODE = 'conv' _C.SOLVER.RESET_ITER = False _C.SOLVER.TRAIN_ITER = -1 _C.INPUT.CUSTOM_AUG = '' _C.INPUT.TRAIN_SIZE = 640 _C.INPUT.TEST_SIZE = 640 _C.INPUT.SCALE_RANGE = (0.1, 2.) 
# 'default' for fixed short/ long edge, 'square' for max size=INPUT.SIZE _C.INPUT.TEST_INPUT_TYPE = 'default' _C.DEBUG = False _C.SAVE_DEBUG = False _C.SAVE_PTH = False _C.VIS_THRESH = 0.3 _C.DEBUG_SHOW_NAME = False ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/__init__.py ================================================ ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/backbone/__init__.py ================================================ ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/backbone/bifpn.py ================================================ # Modified from https://github.com/rwightman/efficientdet-pytorch/blob/master/effdet/efficientdet.py # The original file is under Apache-2.0 License import math from os.path import join import numpy as np from collections import OrderedDict from typing import List import torch from torch import nn import torch.utils.model_zoo as model_zoo import torch.nn.functional as F import fvcore.nn.weight_init as weight_init from detectron2.layers import ShapeSpec, Conv2d from detectron2.modeling.backbone.resnet import build_resnet_backbone from detectron2.modeling.backbone.build import BACKBONE_REGISTRY from detectron2.layers.batch_norm import get_norm from detectron2.modeling.backbone import Backbone from .dlafpn import dla34 def get_fpn_config(base_reduction=8): """BiFPN config with sum.""" p = { 'nodes': [ {'reduction': base_reduction << 3, 'inputs_offsets': [3, 4]}, {'reduction': base_reduction << 2, 'inputs_offsets': [2, 5]}, {'reduction': base_reduction << 1, 'inputs_offsets': [1, 6]}, {'reduction': base_reduction, 'inputs_offsets': [0, 7]}, {'reduction': base_reduction << 1, 'inputs_offsets': [1, 7, 8]}, {'reduction': base_reduction << 2, 'inputs_offsets': [2, 6, 9]}, 
class SeparableConv2d(nn.Module):
    """Depthwise-separable convolution: depthwise conv -> pointwise conv -> norm -> act.

    Args:
        in_channels: channels of the input tensor.
        out_channels: channels produced by the pointwise convolution.
        kernel_size: depthwise kernel size; padding is ``kernel_size // 2``.
        stride: stride of the depthwise convolution.
        dilation: accepted for signature compatibility; not used here.
        padding: accepted for signature compatibility; not used here (padding
            is always derived from the kernel sizes).
        bias: whether the depthwise convolution has a bias term.
        channel_multiplier: the depthwise stage outputs
            ``int(in_channels * channel_multiplier)`` channels.
        pw_kernel_size: pointwise kernel size; padding is ``pw_kernel_size // 2``.
        act_layer: activation class instantiated with ``inplace=True``, or
            ``None`` for no activation.
        norm: normalization spec handed to ``get_norm``; the pointwise conv
            only carries a bias when this is ``''``.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, dilation=1, padding='', bias=False,
                 channel_multiplier=1.0, pw_kernel_size=1, act_layer=Swish, norm=''):
        super(SeparableConv2d, self).__init__()
        mid_channels = int(in_channels * channel_multiplier)

        # Depthwise stage: one group per *input* channel.  The group count
        # used to be ``out_channels``, which is only depthwise when
        # in_channels == out_channels and fails or silently becomes a grouped
        # convolution otherwise.  Parameter shapes are unchanged whenever the
        # two channel counts are equal.
        self.conv_dw = Conv2d(
            in_channels, mid_channels,
            kernel_size=kernel_size, stride=stride, padding=kernel_size // 2, bias=bias,
            groups=in_channels)

        # Pointwise stage mixes channels.
        self.conv_pw = Conv2d(
            mid_channels, out_channels,
            kernel_size=pw_kernel_size, padding=pw_kernel_size // 2, bias=(norm == ''))

        self.bn = get_norm(norm, out_channels)
        self.act = None if act_layer is None else act_layer(inplace=True)

    def forward(self, x):
        """Apply depthwise conv, pointwise conv, then norm and activation if present."""
        x = self.conv_dw(x)
        x = self.conv_pw(x)
        if self.bn is not None:
            x = self.bn(x)
        if self.act is not None:
            x = self.act(x)
        return x
class FpnCombine(nn.Module):
    """Resample the inputs of one BiFPN node and fuse them into one tensor.

    Offsets below ``len(feature_info)`` refer to entries of ``feature_info``;
    larger offsets refer to earlier nodes of ``fpn_config['nodes']``.  Each
    input gets its own ``ResampleFeatureMap`` keyed by ``str(offset)``.
    Fusion is selected by ``weight_method``: ``'attn'`` (softmax weights),
    ``'fastattn'`` (ReLU weights normalized by their sum) or ``'sum'``.
    """

    def __init__(self, feature_info, fpn_config, fpn_channels, inputs_offsets, target_reduction, pad_type='',
                 pooling_type='max', norm='', apply_bn_for_resampling=False,
                 conv_after_downsample=False, redundant_bias=False, weight_method='attn'):
        super().__init__()
        self.inputs_offsets = inputs_offsets
        self.weight_method = weight_method

        num_inputs = len(feature_info)
        self.resample = nn.ModuleDict()
        for offset in inputs_offsets:
            if offset < num_inputs:
                source = feature_info[offset]
                in_channels = source['num_chs']
                input_reduction = source['reduction']
            else:
                # Outputs of earlier nodes always have ``fpn_channels`` channels.
                in_channels = fpn_channels
                input_reduction = fpn_config['nodes'][offset - num_inputs]['reduction']
            self.resample[str(offset)] = ResampleFeatureMap(
                in_channels, fpn_channels,
                reduction_ratio=target_reduction / input_reduction,
                pad_type=pad_type, pooling_type=pooling_type, norm=norm,
                apply_bn=apply_bn_for_resampling,
                conv_after_downsample=conv_after_downsample,
                redundant_bias=redundant_bias)

        if weight_method in ('attn', 'fastattn'):
            # One learnable weight per incoming edge.
            self.edge_weights = nn.Parameter(torch.ones(len(inputs_offsets)), requires_grad=True)
        else:
            self.edge_weights = None

    def forward(self, x):
        """Fuse ``x[offset]`` for every configured offset into one tensor.

        Raises:
            ValueError: if ``weight_method`` is not one of the supported names.
        """
        dtype = x[0].dtype
        nodes = [self.resample[str(offset)](x[offset]) for offset in self.inputs_offsets]

        method = self.weight_method
        if method == 'attn':
            weights = torch.softmax(self.edge_weights.type(dtype), dim=0)
            stacked = torch.stack(nodes, dim=-1) * weights
        elif method == 'fastattn':
            weights = nn.functional.relu(self.edge_weights.type(dtype))
            # The small constant keeps the division finite when all weights are 0.
            denom = torch.sum(weights) + 0.0001
            stacked = torch.stack(
                [(node * weight) / denom for node, weight in zip(nodes, weights)], dim=-1)
        elif method == 'sum':
            stacked = torch.stack(nodes, dim=-1)
        else:
            raise ValueError('unknown weight_method {}'.format(self.weight_method))
        return torch.sum(stacked, dim=-1)
nn.Sequential(fnode_layers)) self.feature_info = self.feature_info[-num_levels::] def forward(self, x): x = self.fnode(x) return x[-self.num_levels::] class BiFPN(Backbone): def __init__( self, cfg, bottom_up, in_features, out_channels, norm='', num_levels=5, num_bifpn=4, separable_conv=False, ): super(BiFPN, self).__init__() assert isinstance(bottom_up, Backbone) # Feature map strides and channels from the bottom up network (e.g. ResNet) input_shapes = bottom_up.output_shape() in_strides = [input_shapes[f].stride for f in in_features] in_channels = [input_shapes[f].channels for f in in_features] self.num_levels = num_levels self.num_bifpn = num_bifpn self.bottom_up = bottom_up self.in_features = in_features self._size_divisibility = 128 levels = [int(math.log2(s)) for s in in_strides] self._out_feature_strides = { "p{}".format(int(math.log2(s))): s for s in in_strides} if len(in_features) < num_levels: for l in range(num_levels - len(in_features)): s = l + levels[-1] self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1) self._out_features = list(sorted(self._out_feature_strides.keys())) self._out_feature_channels = {k: out_channels for k in self._out_features} # print('self._out_feature_strides', self._out_feature_strides) # print('self._out_feature_channels', self._out_feature_channels) feature_info = [ {'num_chs': in_channels[level], 'reduction': in_strides[level]} \ for level in range(len(self.in_features)) ] # self.config = config fpn_config = get_fpn_config() self.resample = SequentialAppendLast() for level in range(num_levels): if level < len(feature_info): in_chs = in_channels[level] # feature_info[level]['num_chs'] reduction = in_strides[level] # feature_info[level]['reduction'] else: # Adds a coarser level by downsampling the last feature map reduction_ratio = 2 self.resample.add_module(str(level), ResampleFeatureMap( in_channels=in_chs, out_channels=out_channels, pad_type='same', pooling_type=None, norm=norm, reduction_ratio=reduction_ratio, 
@BACKBONE_REGISTRY.register()
def build_resnet_bifpn_backbone(cfg, input_shape: ShapeSpec):
    """
    Build a BiFPN backbone on top of a ResNet bottom-up network.

    Args:
        cfg: a detectron2 CfgNode; ``MODEL.FPN.IN_FEATURES`` and the
            ``MODEL.BIFPN`` options are read from it.
        input_shape: forwarded to ``build_resnet_backbone``.

    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    resnet = build_resnet_backbone(cfg, input_shape)
    feature_names = cfg.MODEL.FPN.IN_FEATURES
    bifpn_cfg = cfg.MODEL.BIFPN
    return BiFPN(
        cfg=cfg,
        bottom_up=resnet,
        in_features=feature_names,
        out_channels=bifpn_cfg.OUT_CHANNELS,
        norm=bifpn_cfg.NORM,
        num_levels=bifpn_cfg.NUM_LEVELS,
        num_bifpn=bifpn_cfg.NUM_BIFPN,
        separable_conv=bifpn_cfg.SEPARABLE_CONV,
    )
def split_name(name):
    """Split a feature name such as ``"res5"`` into ``("res", 5)``.

    The prefix is the leading run of alphabetic characters; everything from
    the first non-alphabetic character onwards is parsed with ``int``.

    Args:
        name: feature name consisting of a prefix followed by an integer.

    Returns:
        A ``(prefix, level)`` tuple of ``str`` and ``int``.

    Raises:
        ValueError: if ``name`` has no non-alphabetic suffix, or the suffix is
            not a valid integer.
    """
    for idx, char in enumerate(name):
        if char.isalpha():
            continue
        prefix, suffix = name[:idx], name[idx:]
        try:
            return prefix, int(suffix)
        except ValueError:
            raise ValueError(
                "feature name {!r} must end in an integer level, got suffix {!r}".format(name, suffix)
            ) from None
    # Reached for empty or purely alphabetic names.
    raise ValueError("feature name {!r} has no numeric level suffix".format(name))
class SingleBiFPN(Backbone):
    """
    One BiFPN layer operating on a list of feature maps.

    The layer is a fixed sequence of fusion nodes (``self.nodes``).  Each node
    reads earlier entries of the running feature list via ``inputs_offsets``,
    resizes them to the resolution of level ``feat_level``, fuses them with
    learned non-negative weights, and appends its output to the list.
    """

    def __init__(
        self, in_channels_list, out_channels, norm=""
    ):
        """
        Args:
            in_channels_list (list[int]): channel count of each input feature
                map; only 3 or 5 levels are supported.
            out_channels (int): number of channels of every node output.
            norm (str): the normalization to use.

        Raises:
            NotImplementedError: if ``in_channels_list`` does not have 3 or 5
                entries.
        """
        super(SingleBiFPN, self).__init__()

        self.out_channels = out_channels
        num_inputs = len(in_channels_list)
        if num_inputs == 5:
            # 5-level topology
            self.nodes = [
                {'feat_level': 3, 'inputs_offsets': [3, 4]},
                {'feat_level': 2, 'inputs_offsets': [2, 5]},
                {'feat_level': 1, 'inputs_offsets': [1, 6]},
                {'feat_level': 0, 'inputs_offsets': [0, 7]},
                {'feat_level': 1, 'inputs_offsets': [1, 7, 8]},
                {'feat_level': 2, 'inputs_offsets': [2, 6, 9]},
                {'feat_level': 3, 'inputs_offsets': [3, 5, 10]},
                {'feat_level': 4, 'inputs_offsets': [4, 11]},
            ]
        elif num_inputs == 3:
            # 3-level topology
            self.nodes = [
                {'feat_level': 1, 'inputs_offsets': [1, 2]},
                {'feat_level': 0, 'inputs_offsets': [0, 3]},
                {'feat_level': 1, 'inputs_offsets': [1, 3, 4]},
                {'feat_level': 2, 'inputs_offsets': [2, 5]},
            ]
        else:
            raise NotImplementedError(
                "SingleBiFPN supports 3 or 5 input levels, got {}".format(num_inputs))

        # Channel count of every entry of the running feature list: the
        # inputs first, then one entry per node as it is created.
        node_info = list(in_channels_list)

        for fnode in self.nodes:
            feat_level = fnode["feat_level"]
            inputs_offsets = fnode["inputs_offsets"]
            inputs_offsets_str = "_".join(map(str, inputs_offsets))
            for input_offset in inputs_offsets:
                in_channels = node_info[input_offset]
                if in_channels != out_channels:
                    # 1x1 lateral conv to bring this input to out_channels.
                    lateral_conv = Conv2d(
                        in_channels,
                        out_channels,
                        kernel_size=1,
                        norm=get_norm(norm, out_channels)
                    )
                    self.add_module(
                        "lateral_{}_f{}".format(input_offset, feat_level), lateral_conv
                    )
            node_info.append(out_channels)

            # generate attention weights (one per input edge)
            name = "weights_f{}_{}".format(feat_level, inputs_offsets_str)
            setattr(self, name, nn.Parameter(
                torch.ones(len(inputs_offsets), dtype=torch.float32),
                requires_grad=True
            ))

            # generate convolutions after combination
            name = "outputs_f{}_{}".format(feat_level, inputs_offsets_str)
            self.add_module(name, Conv2d(
                out_channels,
                out_channels,
                kernel_size=3,
                padding=1,
                norm=get_norm(norm, out_channels),
                bias=(norm == "")
            ))

    def forward(self, feats):
        """
        Args:
            feats (list[Tensor]): one feature map per input level, indexed the
                same way as ``in_channels_list`` at construction time.  The
                list is copied; the caller's list is not modified.

        Returns:
            list[Tensor]: one tensor per input level, in input order.  For
            each level this is the output of the last node in ``self.nodes``
            whose ``feat_level`` equals that level.

        Raises:
            NotImplementedError: if an input is larger than the target along
                one spatial axis and smaller along the other.
            ValueError: if some level is not the target of any node.
        """
        feats = list(feats)
        num_levels = len(feats)
        for fnode in self.nodes:
            feat_level = fnode["feat_level"]
            inputs_offsets = fnode["inputs_offsets"]
            inputs_offsets_str = "_".join(map(str, inputs_offsets))
            input_nodes = []
            _, _, target_h, target_w = feats[feat_level].size()
            for input_offset in inputs_offsets:
                input_node = feats[input_offset]

                # reduction
                if input_node.size(1) != self.out_channels:
                    name = "lateral_{}_f{}".format(input_offset, feat_level)
                    input_node = getattr(self, name)(input_node)

                # maybe downsample
                _, _, h, w = input_node.size()
                if h > target_h and w > target_w:
                    height_stride_size = int((h - 1) // target_h + 1)
                    width_stride_size = int((w - 1) // target_w + 1)
                    # Only a single 2x step down is supported.
                    assert height_stride_size == width_stride_size == 2
                    input_node = F.max_pool2d(
                        input_node, kernel_size=(height_stride_size + 1, width_stride_size + 1),
                        stride=(height_stride_size, width_stride_size), padding=1
                    )
                elif h <= target_h and w <= target_w:
                    if h < target_h or w < target_w:
                        input_node = F.interpolate(
                            input_node,
                            size=(target_h, target_w),
                            mode="nearest"
                        )
                else:
                    raise NotImplementedError(
                        "cannot resize {}x{} input to {}x{} target".format(h, w, target_h, target_w))
                input_nodes.append(input_node)

            # attention: ReLU keeps weights non-negative, the small constant
            # keeps the normalization finite when they are all zero.
            name = "weights_f{}_{}".format(feat_level, inputs_offsets_str)
            weights = F.relu(getattr(self, name))
            norm_weights = weights / (weights.sum() + 0.0001)

            new_node = torch.stack(input_nodes, dim=-1)
            new_node = (norm_weights * new_node).sum(dim=-1)
            new_node = swish(new_node)

            name = "outputs_f{}_{}".format(feat_level, inputs_offsets_str)
            feats.append(getattr(self, name)(new_node))

        output_feats = []
        for idx in range(num_levels):
            # Walk the nodes from last to first so the final node for each
            # level wins; feats[-1 - i] is the output of that node.
            for i, fnode in enumerate(reversed(self.nodes)):
                if fnode['feat_level'] == idx:
                    output_feats.append(feats[-1 - i])
                    break
            else:
                raise ValueError("no BiFPN node produces feature level {}".format(idx))
        return output_feats
""" super(BiFPN, self).__init__() assert isinstance(bottom_up, Backbone) # add extra feature levels (i.e., 6 and 7) self.bottom_up = BackboneWithTopLevels( bottom_up, out_channels, num_top_levels, norm ) bottom_up_output_shapes = self.bottom_up.output_shape() in_features = sorted(in_features, key=lambda x: split_name(x)[1]) self._size_divisibility = 128 #bottom_up_output_shapes[in_features[-1]].stride self.out_channels = out_channels self.min_level = split_name(in_features[0])[1] # add the names for top blocks prefix, last_suffix = split_name(in_features[-1]) for i in range(num_top_levels): in_features.append(prefix + str(last_suffix + i + 1)) self.in_features = in_features # generate output features self._out_features = ["p{}".format(split_name(name)[1]) for name in in_features] self._out_feature_strides = { out_name: bottom_up_output_shapes[in_name].stride for out_name, in_name in zip(self._out_features, in_features) } self._out_feature_channels = {k: out_channels for k in self._out_features} # build bifpn self.repeated_bifpn = nn.ModuleList() for i in range(num_repeats): if i == 0: in_channels_list = [ bottom_up_output_shapes[name].channels for name in in_features ] else: in_channels_list = [ self._out_feature_channels[name] for name in self._out_features ] self.repeated_bifpn.append(SingleBiFPN( in_channels_list, out_channels, norm )) @property def size_divisibility(self): return self._size_divisibility def forward(self, x): """ Args: input (dict[str->Tensor]): mapping feature map name (e.g., "p5") to feature map tensor for each feature level in high to low resolution order. Returns: dict[str->Tensor]: mapping from feature map name to FPN feature map tensor in high to low resolution order. Returned feature names follow the FPN paper convention: "p", where stage has stride = 2 ** stage e.g., ["n2", "n3", ..., "n6"]. 
def _assert_strides_are_log2_contiguous(strides):
    """
    Assert that every stride is exactly twice the stride before it,
    i.e. the strides are "contiguous in log2".

    Args:
        strides: sliceable sequence of strides, ordered from first to last.

    Raises:
        AssertionError: on the first stride that is not double its predecessor.
    """
    # Walk neighbouring (previous, current) pairs instead of indexing.
    for previous, current in zip(strides, strides[1:]):
        assert current == 2 * previous, "Strides {} {} are not log2 contiguous".format(
            current, previous
        )
def get_model_url(data, name, hash):
    """
    Build the download URL of a pretrained DLA checkpoint.

    Args:
        data (str): dataset sub-directory on the server, e.g. ``'imagenet'``.
        name (str): model name, e.g. ``'dla34'``.
        hash (str): hash suffix that is part of the checkpoint file name.

    Returns:
        str: ``http://dl.yf.io/dla/models/<data>/<name>-<hash>.pth``.
    """
    # URLs always use forward slashes. os.path.join inserts the host
    # platform's separator (a backslash on Windows), so join with posixpath,
    # which behaves the same on every platform.
    import posixpath

    return posixpath.join(
        'http://dl.yf.io/dla/models', data, '{}-{}.pth'.format(name, hash))
class Tree(nn.Module):
    """
    Recursive aggregation node of DLA.

    With ``levels == 1`` the node owns two ``block`` instances plus a
    :class:`Root` that fuses their outputs. Otherwise it owns two sub-trees of
    depth ``levels - 1``, and the second sub-tree receives the accumulated
    ``children`` so that the deepest root can fuse them.
    """

    def __init__(self, levels, block, in_channels, out_channels, stride=1,
                 level_root=False, root_dim=0, root_kernel_size=1,
                 dilation=1, root_residual=False, norm='BN'):
        """
        Args:
            levels (int): depth of the recursion; 1 builds the leaf blocks.
            block: callable building a residual block, called as
                ``block(in, out, stride, dilation=..., norm=...)``.
            in_channels (int): channels of the input tensor.
            out_channels (int): channels produced by this node.
            stride (int): stride of the first branch; values above 1 also
                enable max-pool downsampling of the shortcut.
            level_root (bool): if True the (downsampled) input is fed to the
                root as an additional child.
            root_dim (int): input channels of the root; 0 means
                ``2 * out_channels``.
            root_kernel_size (int): forwarded to :class:`Root`.
            dilation (int): dilation forwarded to the blocks.
            root_residual (bool): forwarded to :class:`Root`.
            norm: normalization spec passed to the blocks and to ``get_norm``.
        """
        super(Tree, self).__init__()
        if root_dim == 0:
            root_dim = 2 * out_channels
        if level_root:
            root_dim += in_channels

        if levels == 1:
            # Leaf: two plain blocks whose outputs are fused by a root.
            self.tree1 = block(in_channels, out_channels, stride,
                               dilation=dilation, norm=norm)
            self.tree2 = block(out_channels, out_channels, 1,
                               dilation=dilation, norm=norm)
            self.root = Root(root_dim, out_channels, root_kernel_size,
                             root_residual, norm=norm)
        else:
            # Inner node: recurse; only the second sub-tree carries the
            # enlarged root dimension.
            shared = dict(root_kernel_size=root_kernel_size,
                          dilation=dilation,
                          root_residual=root_residual,
                          norm=norm)
            self.tree1 = Tree(levels - 1, block, in_channels, out_channels,
                              stride, root_dim=0, **shared)
            self.tree2 = Tree(levels - 1, block, out_channels, out_channels,
                              root_dim=root_dim + out_channels, **shared)

        self.level_root = level_root
        self.root_dim = root_dim
        self.levels = levels

        # Shortcut path: optional pooling, then optional 1x1 projection.
        self.downsample = (
            nn.MaxPool2d(stride, stride=stride) if stride > 1 else None)
        if in_channels != out_channels:
            self.project = nn.Sequential(
                nn.Conv2d(in_channels, out_channels,
                          kernel_size=1, stride=1, bias=False),
                get_norm(norm, out_channels)
            )
        else:
            self.project = None

    def forward(self, x, residual=None, children=None):
        """
        Run the node.

        Args:
            x: input tensor.
            residual: ignored; the shortcut is recomputed from ``x``.
            children: list of tensors collected by enclosing nodes; it is
                extended in place.

        Returns:
            The output of the root (leaf) or of the second sub-tree.
        """
        if children is None:
            children = []

        bottom = x
        if self.downsample:
            bottom = self.downsample(x)
        residual = bottom
        if self.project:
            residual = self.project(bottom)

        if self.level_root:
            children.append(bottom)

        x1 = self.tree1(x, residual)
        if self.levels != 1:
            children.append(x1)
            return self.tree2(x1, children=children)

        x2 = self.tree2(x1)
        return self.root(x2, x1, *children)
def fill_up_weights(up):
    """
    Overwrite the weights of a transposed convolution in place.

    Every output channel ``k`` gets the same separable kernel in
    ``weight[k, 0]``: the product of two "tent" profiles whose scale and
    centre are derived from the kernel height.

    Args:
        up: module whose ``weight.data`` is a 4-D tensor; it is modified
            in place.
    """
    weight = up.weight.data
    kernel_h, kernel_w = weight.size(2), weight.size(3)
    factor = math.ceil(kernel_h / 2)
    center = (2 * factor - 1 - factor % 2) / (2. * factor)

    def tent(position):
        # 1-D profile, shared by rows and columns.
        return 1 - math.fabs(position / factor - center)

    row_profile = [tent(i) for i in range(kernel_h)]
    col_profile = [tent(j) for j in range(kernel_w)]
    for i, row_value in enumerate(row_profile):
        for j, col_value in enumerate(col_profile):
            weight[0, 0, i, j] = row_value * col_value

    # Copy the kernel of channel 0 into every other output channel.
    for channel in range(1, weight.size(0)):
        weight[channel, 0, :, :] = weight[0, 0, :, :]
__init__(self, startp, channels, scales, in_channels=None, norm='BN'): super(DLAUp, self).__init__() self.startp = startp if in_channels is None: in_channels = channels self.channels = channels channels = list(channels) scales = np.array(scales, dtype=int) for i in range(len(channels) - 1): j = -i - 2 setattr(self, 'ida_{}'.format(i), IDAUp(channels[j], in_channels[j:], scales[j:] // scales[j], norm=norm)) scales[j + 1:] = scales[j] in_channels[j + 1:] = [channels[j] for _ in channels[j + 1:]] def forward(self, layers): out = [layers[-1]] # start with 32 for i in range(len(layers) - self.startp - 1): ida = getattr(self, 'ida_{}'.format(i)) ida(layers, len(layers) -i - 2, len(layers)) out.insert(0, layers[-1]) return out DLA_CONFIGS = { 34: ([1, 1, 1, 2, 2, 1], [16, 32, 64, 128, 256, 512], BasicBlock), 60: ([1, 1, 1, 2, 3, 1], [16, 32, 128, 256, 512, 1024], Bottleneck) } class DLASeg(Backbone): def __init__(self, num_layers, out_features, use_dla_up=True, ms_output=False, norm='BN'): super(DLASeg, self).__init__() # depth = 34 levels, channels, Block = DLA_CONFIGS[num_layers] self.base = DLA(num_layers=num_layers, levels=levels, channels=channels, block=Block, norm=norm) down_ratio = 4 self.first_level = int(np.log2(down_ratio)) self.ms_output = ms_output self.last_level = 5 if not self.ms_output else 6 channels = self.base.channels scales = [2 ** i for i in range(len(channels[self.first_level:]))] self.use_dla_up = use_dla_up if self.use_dla_up: self.dla_up = DLAUp( self.first_level, channels[self.first_level:], scales, norm=norm) out_channel = channels[self.first_level] if not self.ms_output: # stride 4 DLA self.ida_up = IDAUp( out_channel, channels[self.first_level:self.last_level], [2 ** i for i in range(self.last_level - self.first_level)], norm=norm) self._out_features = out_features self._out_feature_channels = { 'dla{}'.format(i): channels[i] for i in range(6)} self._out_feature_strides = { 'dla{}'.format(i): 2 ** i for i in range(6)} self._size_divisibility 
@BACKBONE_REGISTRY.register()
def build_dla_backbone(cfg, input_shape):
    """
    Create a :class:`DLASeg` backbone from config.

    Args:
        cfg: config node; the ``cfg.MODEL.DLA`` fields ``OUT_FEATURES``,
            ``NUM_LAYERS``, ``USE_DLA_UP``, ``MS_OUTPUT`` and ``NORM`` are
            read and passed to :class:`DLASeg`.
        input_shape: not used by this function.

    Returns:
        DLASeg: a :class:`DLASeg` instance.
    """
    return DLASeg(
        out_features=cfg.MODEL.DLA.OUT_FEATURES,
        num_layers=cfg.MODEL.DLA.NUM_LAYERS,
        use_dla_up=cfg.MODEL.DLA.USE_DLA_UP,
        ms_output=cfg.MODEL.DLA.MS_OUTPUT,
        norm=cfg.MODEL.DLA.NORM)
def get_model_url(data, name, hash):
    """
    Build the download URL of a pretrained DLA checkpoint.

    Args:
        data (str): dataset sub-directory on the server, e.g. ``'imagenet'``.
        name (str): model name, e.g. ``'dla34'``.
        hash (str): hash suffix that is part of the checkpoint file name.

    Returns:
        str: ``http://dl.yf.io/dla/models/<data>/<name>-<hash>.pth``.
    """
    # A URL must be joined with forward slashes on every platform;
    # os.path.join would emit backslashes on Windows. posixpath.join gives
    # the POSIX behaviour everywhere.
    import posixpath

    return posixpath.join(
        'http://dl.yf.io/dla/models', data, '{}-{}.pth'.format(name, hash))
class Tree(nn.Module):
    """
    Recursive aggregation node of DLA (cfg-driven variant).

    With ``levels == 1`` the node holds two ``block`` instances and a
    :class:`Root` that fuses their outputs; otherwise it holds two sub-trees
    of depth ``levels - 1``.
    """

    def __init__(self, cfg, levels, block, in_channels, out_channels,
                 stride=1, level_root=False, root_dim=0, root_kernel_size=1,
                 dilation=1, root_residual=False):
        """
        Args:
            cfg: config node; passed to the blocks and to :class:`Root`, and
                ``cfg.MODEL.DLA.NORM`` selects the norm of the projection.
            levels (int): recursion depth; 1 builds the leaf blocks and root.
            block: callable invoked as
                ``block(cfg, in, out, stride, dilation=...)``.
            in_channels (int): channels of the input.
            out_channels (int): channels produced by this node.
            stride (int): stride of the first branch; values above 1 also
                add a max-pool on the shortcut path.
            level_root (bool): if True, ``in_channels`` is added to the root
                dimension and the shortcut input is appended to ``children``.
            root_dim (int): input channels of the root; 0 means
                ``2 * out_channels``.
            root_kernel_size (int): forwarded to :class:`Root`.
            dilation (int): forwarded to the blocks.
            root_residual (bool): forwarded to :class:`Root`.
        """
        super(Tree, self).__init__()
        if root_dim == 0:
            root_dim = 2 * out_channels
        if level_root:
            root_dim += in_channels
        if levels == 1:
            self.tree1 = block(cfg, in_channels, out_channels, stride,
                               dilation=dilation)
            self.tree2 = block(cfg, out_channels, out_channels, 1,
                               dilation=dilation)
        else:
            self.tree1 = Tree(cfg, levels - 1, block, in_channels, out_channels,
                              stride, root_dim=0,
                              root_kernel_size=root_kernel_size,
                              dilation=dilation, root_residual=root_residual)
            # The second sub-tree's root additionally receives this level's
            # first-branch output, hence the larger root dimension.
            self.tree2 = Tree(cfg, levels - 1, block, out_channels, out_channels,
                              root_dim=root_dim + out_channels,
                              root_kernel_size=root_kernel_size,
                              dilation=dilation, root_residual=root_residual)
        if levels == 1:
            self.root = Root(cfg, root_dim, out_channels, root_kernel_size,
                             root_residual)
        self.level_root = level_root
        self.root_dim = root_dim
        self.downsample = None
        self.project = None
        self.levels = levels
        if stride > 1:
            self.downsample = nn.MaxPool2d(stride, stride=stride)
        if in_channels != out_channels:
            # 1x1 conv + norm so the shortcut matches out_channels.
            self.project = nn.Sequential(
                nn.Conv2d(in_channels, out_channels,
                          kernel_size=1, stride=1, bias=False),
                get_norm(cfg.MODEL.DLA.NORM, out_channels)
            )

    def forward(self, x, residual=None, children=None):
        """
        Args:
            x: input tensor.
            residual: optional tensor from the caller; its values are not
                used, because the shortcut is recomputed from ``x`` below.
            children: list of tensors collected by enclosing nodes; extended
                in place.

        Returns:
            Output of the root (``levels == 1``) or of the second sub-tree.
        """
        if self.training and residual is not None:
            # Adds exactly zero to x while still referencing `residual`.
            # NOTE(review): presumably keeps the incoming residual in the
            # autograd graph during training (e.g. to avoid unused-tensor
            # issues) -- confirm the intent.
            x = x + residual.sum() * 0.0
        children = [] if children is None else children
        # Shortcut input: pooled when this node downsamples.
        bottom = self.downsample(x) if self.downsample else x
        # The caller-supplied residual is replaced here.
        residual = self.project(bottom) if self.project else bottom
        if self.level_root:
            children.append(bottom)
        x1 = self.tree1(x, residual)
        if self.levels == 1:
            x2 = self.tree2(x1)
            # Root fuses both branch outputs with all collected children.
            x = self.root(x2, x1, *children)
        else:
            children.append(x1)
            x = self.tree2(x1, children=children)
        return x
def fill_up_weights(up):
    """
    Overwrite the weights of a transposed convolution in place.

    ``weight[0, 0]`` is filled with the product of two "tent" profiles whose
    scale and centre come from the kernel height; that kernel is then copied
    to ``weight[k, 0]`` for every other output channel ``k``.

    Args:
        up: module whose ``weight.data`` is a 4-D tensor; modified in place.
    """
    weight = up.weight.data
    n_rows, n_cols = weight.size(2), weight.size(3)
    half = math.ceil(n_rows / 2)
    center = (2 * half - 1 - half % 2) / (2. * half)

    # Per-axis profiles; their outer product is the 2-D kernel.
    along_rows = [1 - math.fabs(i / half - center) for i in range(n_rows)]
    along_cols = [1 - math.fabs(j / half - center) for j in range(n_cols)]
    for i, row_value in enumerate(along_rows):
        for j, col_value in enumerate(along_cols):
            weight[0, 0, i, j] = row_value * col_value

    # Replicate channel 0 across the remaining output channels.
    for channel in range(1, weight.size(0)):
        weight[channel, 0, :, :] = weight[0, 0, :, :]
def dla34(cfg, pretrained=None):  # DLA-34
    """
    Build the DLA-34 backbone.

    Args:
        cfg: config node passed through to :class:`DLA`.
        pretrained: not used by this function.

    Returns:
        DLA: model built with per-level depths ``[1, 1, 1, 2, 2, 1]``,
        channel widths ``[16, 32, 64, 128, 256, 512]`` and
        :class:`BasicBlock` as the residual block.
    """
    model = DLA(cfg, [1, 1, 1, 2, 2, 1],
                [16, 32, 64, 128, 256, 512],
                block=BasicBlock)
    return model
@BACKBONE_REGISTRY.register()
def build_dlaup_backbone(cfg, input_shape: ShapeSpec):
    """
    Build a DLA bottom-up network wrapped by :class:`DLAUP`.

    Args:
        cfg: a detectron2 CfgNode; ``cfg.MODEL.DLA`` supplies ``NUM_LAYERS``,
            ``DLAUP_IN_FEATURES``, ``NORM`` and ``DLAUP_NODE``.
        input_shape: not used by this function.

    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.

    Raises:
        KeyError: if ``NUM_LAYERS`` does not name a registered DLA depth.
    """
    dla_cfg = cfg.MODEL.DLA
    creators = {"dla34": dla34}
    make_bottom_up = creators['dla{}'.format(dla_cfg.NUM_LAYERS)]
    return DLAUP(
        bottom_up=make_bottom_up(cfg),
        in_features=dla_cfg.DLAUP_IN_FEATURES,
        norm=dla_cfg.NORM,
        dlaup_node=dla_cfg.DLAUP_NODE,
    )
""" def __init__(self, in_channels, out_channels): super().__init__() self.num_levels = 2 self.in_feature = "p5" self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1) self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1) for module in [self.p6, self.p7]: weight_init.c2_xavier_fill(module) def forward(self, c5): p6 = self.p6(c5) p7 = self.p7(F.relu(p6)) return [p6, p7] @BACKBONE_REGISTRY.register() def build_p67_resnet_fpn_backbone(cfg, input_shape: ShapeSpec): """ Args: cfg: a detectron2 CfgNode Returns: backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. """ bottom_up = build_resnet_backbone(cfg, input_shape) in_features = cfg.MODEL.FPN.IN_FEATURES out_channels = cfg.MODEL.FPN.OUT_CHANNELS backbone = FPN( bottom_up=bottom_up, in_features=in_features, out_channels=out_channels, norm=cfg.MODEL.FPN.NORM, top_block=LastLevelP6P7_P5(out_channels, out_channels), fuse_type=cfg.MODEL.FPN.FUSE_TYPE, ) return backbone @BACKBONE_REGISTRY.register() def build_p35_resnet_fpn_backbone(cfg, input_shape: ShapeSpec): """ Args: cfg: a detectron2 CfgNode Returns: backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. """ bottom_up = build_resnet_backbone(cfg, input_shape) in_features = cfg.MODEL.FPN.IN_FEATURES out_channels = cfg.MODEL.FPN.OUT_CHANNELS backbone = FPN( bottom_up=bottom_up, in_features=in_features, out_channels=out_channels, norm=cfg.MODEL.FPN.NORM, top_block=None, fuse_type=cfg.MODEL.FPN.FUSE_TYPE, ) return backbone ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/backbone/res2net.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved # This file is modified from https://github.com/Res2Net/Res2Net-detectron2/blob/master/detectron2/modeling/backbone/resnet.py # The original file is under Apache-2.0 License import numpy as np import fvcore.nn.weight_init as weight_init import torch import torch.nn.functional as F from torch import nn from detectron2.layers import ( CNNBlockBase, Conv2d, DeformConv, ModulatedDeformConv, ShapeSpec, get_norm, ) from detectron2.modeling.backbone import Backbone from detectron2.modeling.backbone.fpn import FPN from detectron2.modeling.backbone.build import BACKBONE_REGISTRY from .fpn_p5 import LastLevelP6P7_P5 from .bifpn import BiFPN __all__ = [ "ResNetBlockBase", "BasicBlock", "BottleneckBlock", "DeformBottleneckBlock", "BasicStem", "ResNet", "make_stage", "build_res2net_backbone", ] ResNetBlockBase = CNNBlockBase """ Alias for backward compatibiltiy. """ class BasicBlock(CNNBlockBase): """ The basic residual block for ResNet-18 and ResNet-34, with two 3x3 conv layers and a projection shortcut if needed. """ def __init__(self, in_channels, out_channels, *, stride=1, norm="BN"): """ Args: in_channels (int): Number of input channels. out_channels (int): Number of output channels. stride (int): Stride for the first conv. norm (str or callable): normalization for all conv layers. See :func:`layers.get_norm` for supported format. 
""" super().__init__(in_channels, out_channels, stride) if in_channels != out_channels: self.shortcut = Conv2d( in_channels, out_channels, kernel_size=1, stride=stride, bias=False, norm=get_norm(norm, out_channels), ) else: self.shortcut = None self.conv1 = Conv2d( in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False, norm=get_norm(norm, out_channels), ) self.conv2 = Conv2d( out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False, norm=get_norm(norm, out_channels), ) for layer in [self.conv1, self.conv2, self.shortcut]: if layer is not None: # shortcut can be None weight_init.c2_msra_fill(layer) def forward(self, x): out = self.conv1(x) out = F.relu_(out) out = self.conv2(out) if self.shortcut is not None: shortcut = self.shortcut(x) else: shortcut = x out += shortcut out = F.relu_(out) return out class BottleneckBlock(CNNBlockBase): """ The standard bottle2neck residual block used by Res2Net-50, 101 and 152. """ def __init__( self, in_channels, out_channels, *, bottleneck_channels, stride=1, num_groups=1, norm="BN", stride_in_1x1=False, dilation=1, basewidth=26, scale=4, ): """ Args: bottleneck_channels (int): number of output channels for the 3x3 "bottleneck" conv layers. num_groups (int): number of groups for the 3x3 conv layer. norm (str or callable): normalization for all conv layers. See :func:`layers.get_norm` for supported format. stride_in_1x1 (bool): when stride>1, whether to put stride in the first 1x1 convolution or the bottleneck 3x3 convolution. dilation (int): the dilation rate of the 3x3 conv layer. 
""" super().__init__(in_channels, out_channels, stride) if in_channels != out_channels: self.shortcut = nn.Sequential( nn.AvgPool2d(kernel_size=stride, stride=stride, ceil_mode=True, count_include_pad=False), Conv2d( in_channels, out_channels, kernel_size=1, stride=1, bias=False, norm=get_norm(norm, out_channels), ) ) else: self.shortcut = None # The original MSRA ResNet models have stride in the first 1x1 conv # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have # stride in the 3x3 conv stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) width = bottleneck_channels//scale self.conv1 = Conv2d( in_channels, bottleneck_channels, kernel_size=1, stride=stride_1x1, bias=False, norm=get_norm(norm, bottleneck_channels), ) if scale == 1: self.nums = 1 else: self.nums = scale -1 if self.in_channels!=self.out_channels and stride_3x3!=2: self.pool = nn.AvgPool2d(kernel_size=3, stride = stride_3x3, padding=1) convs = [] bns = [] for i in range(self.nums): convs.append(nn.Conv2d( width, width, kernel_size=3, stride=stride_3x3, padding=1 * dilation, bias=False, groups=num_groups, dilation=dilation, )) bns.append(get_norm(norm, width)) self.convs = nn.ModuleList(convs) self.bns = nn.ModuleList(bns) self.conv3 = Conv2d( bottleneck_channels, out_channels, kernel_size=1, bias=False, norm=get_norm(norm, out_channels), ) self.scale = scale self.width = width self.in_channels = in_channels self.out_channels = out_channels self.stride_3x3 = stride_3x3 for layer in [self.conv1, self.conv3]: if layer is not None: # shortcut can be None weight_init.c2_msra_fill(layer) if self.shortcut is not None: for layer in self.shortcut.modules(): if isinstance(layer, Conv2d): weight_init.c2_msra_fill(layer) for layer in self.convs: if layer is not None: # shortcut can be None weight_init.c2_msra_fill(layer) # Zero-initialize the last normalization in each residual branch, # so that at the beginning, the residual branch starts with zeros, # and each residual 
block behaves like an identity. # See Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour": # "For BN layers, the learnable scaling coefficient γ is initialized # to be 1, except for each residual block's last BN # where γ is initialized to be 0." # nn.init.constant_(self.conv3.norm.weight, 0) # TODO this somehow hurts performance when training GN models from scratch. # Add it as an option when we need to use this code to train a backbone. def forward(self, x): out = self.conv1(x) out = F.relu_(out) spx = torch.split(out, self.width, 1) for i in range(self.nums): if i==0 or self.in_channels!=self.out_channels: sp = spx[i] else: sp = sp + spx[i] sp = self.convs[i](sp) sp = F.relu_(self.bns[i](sp)) if i==0: out = sp else: out = torch.cat((out, sp), 1) if self.scale!=1 and self.stride_3x3==1: out = torch.cat((out, spx[self.nums]), 1) elif self.scale != 1 and self.stride_3x3==2: out = torch.cat((out, self.pool(spx[self.nums])), 1) out = self.conv3(out) if self.shortcut is not None: shortcut = self.shortcut(x) else: shortcut = x out += shortcut out = F.relu_(out) return out class DeformBottleneckBlock(ResNetBlockBase): """ Not implemented for res2net yet. Similar to :class:`BottleneckBlock`, but with deformable conv in the 3x3 convolution. 
""" def __init__( self, in_channels, out_channels, *, bottleneck_channels, stride=1, num_groups=1, norm="BN", stride_in_1x1=False, dilation=1, deform_modulated=False, deform_num_groups=1, basewidth=26, scale=4, ): super().__init__(in_channels, out_channels, stride) self.deform_modulated = deform_modulated if in_channels != out_channels: # self.shortcut = Conv2d( # in_channels, # out_channels, # kernel_size=1, # stride=stride, # bias=False, # norm=get_norm(norm, out_channels), # ) self.shortcut = nn.Sequential( nn.AvgPool2d(kernel_size=stride, stride=stride, ceil_mode=True, count_include_pad=False), Conv2d( in_channels, out_channels, kernel_size=1, stride=1, bias=False, norm=get_norm(norm, out_channels), ) ) else: self.shortcut = None stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) width = bottleneck_channels//scale self.conv1 = Conv2d( in_channels, bottleneck_channels, kernel_size=1, stride=stride_1x1, bias=False, norm=get_norm(norm, bottleneck_channels), ) if scale == 1: self.nums = 1 else: self.nums = scale -1 if self.in_channels!=self.out_channels and stride_3x3!=2: self.pool = nn.AvgPool2d(kernel_size=3, stride = stride_3x3, padding=1) if deform_modulated: deform_conv_op = ModulatedDeformConv # offset channels are 2 or 3 (if with modulated) * kernel_size * kernel_size offset_channels = 27 else: deform_conv_op = DeformConv offset_channels = 18 # self.conv2_offset = Conv2d( # bottleneck_channels, # offset_channels * deform_num_groups, # kernel_size=3, # stride=stride_3x3, # padding=1 * dilation, # dilation=dilation, # ) # self.conv2 = deform_conv_op( # bottleneck_channels, # bottleneck_channels, # kernel_size=3, # stride=stride_3x3, # padding=1 * dilation, # bias=False, # groups=num_groups, # dilation=dilation, # deformable_groups=deform_num_groups, # norm=get_norm(norm, bottleneck_channels), # ) conv2_offsets = [] convs = [] bns = [] for i in range(self.nums): conv2_offsets.append(Conv2d( width, offset_channels * deform_num_groups, 
kernel_size=3, stride=stride_3x3, padding=1 * dilation, bias=False, groups=num_groups, dilation=dilation, )) convs.append(deform_conv_op( width, width, kernel_size=3, stride=stride_3x3, padding=1 * dilation, bias=False, groups=num_groups, dilation=dilation, deformable_groups=deform_num_groups, )) bns.append(get_norm(norm, width)) self.conv2_offsets = nn.ModuleList(conv2_offsets) self.convs = nn.ModuleList(convs) self.bns = nn.ModuleList(bns) self.conv3 = Conv2d( bottleneck_channels, out_channels, kernel_size=1, bias=False, norm=get_norm(norm, out_channels), ) self.scale = scale self.width = width self.in_channels = in_channels self.out_channels = out_channels self.stride_3x3 = stride_3x3 # for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]: # if layer is not None: # shortcut can be None # weight_init.c2_msra_fill(layer) # nn.init.constant_(self.conv2_offset.weight, 0) # nn.init.constant_(self.conv2_offset.bias, 0) for layer in [self.conv1, self.conv3]: if layer is not None: # shortcut can be None weight_init.c2_msra_fill(layer) if self.shortcut is not None: for layer in self.shortcut.modules(): if isinstance(layer, Conv2d): weight_init.c2_msra_fill(layer) for layer in self.convs: if layer is not None: # shortcut can be None weight_init.c2_msra_fill(layer) for layer in self.conv2_offsets: if layer.weight is not None: nn.init.constant_(layer.weight, 0) if layer.bias is not None: nn.init.constant_(layer.bias, 0) def forward(self, x): out = self.conv1(x) out = F.relu_(out) # if self.deform_modulated: # offset_mask = self.conv2_offset(out) # offset_x, offset_y, mask = torch.chunk(offset_mask, 3, dim=1) # offset = torch.cat((offset_x, offset_y), dim=1) # mask = mask.sigmoid() # out = self.conv2(out, offset, mask) # else: # offset = self.conv2_offset(out) # out = self.conv2(out, offset) # out = F.relu_(out) spx = torch.split(out, self.width, 1) for i in range(self.nums): if i==0 or self.in_channels!=self.out_channels: sp = spx[i].contiguous() else: sp = sp + 
def make_stage(block_class, num_blocks, first_stride, *, in_channels, out_channels, **kwargs):
    """
    Build the list of blocks that make up one ResNet-style stage.

    Only the first block sees the stage's input channel count and
    ``first_stride``; every later block maps ``out_channels`` to
    ``out_channels`` with stride 1.

    Args:
        block_class (type): a subclass of ResNetBlockBase
        num_blocks (int): number of blocks to create.
        first_stride (int): the stride of the first block. The other blocks will have stride=1.
        in_channels (int): input channels of the entire stage.
        out_channels (int): output channels of **every block** in the stage.
        kwargs: other arguments passed to the constructor of every block.

    Returns:
        list[nn.Module]: a list of block module.
    """
    assert "stride" not in kwargs, "Stride of blocks in make_stage cannot be changed."
    stage = []
    for position in range(num_blocks):
        is_first = position == 0
        stage.append(
            block_class(
                in_channels=in_channels if is_first else out_channels,
                out_channels=out_channels,
                stride=first_stride if is_first else 1,
                **kwargs,
            )
        )
    return stage
""" super().__init__(in_channels, out_channels, 4) self.in_channels = in_channels self.conv1 = nn.Sequential( Conv2d( in_channels, 32, kernel_size=3, stride=2, padding=1, bias=False, ), get_norm(norm, 32), nn.ReLU(inplace=True), Conv2d( 32, 32, kernel_size=3, stride=1, padding=1, bias=False, ), get_norm(norm, 32), nn.ReLU(inplace=True), Conv2d( 32, out_channels, kernel_size=3, stride=1, padding=1, bias=False, ), ) self.bn1 = get_norm(norm, out_channels) for layer in self.conv1: if isinstance(layer, Conv2d): weight_init.c2_msra_fill(layer) def forward(self, x): x = self.conv1(x) x = self.bn1(x) x = F.relu_(x) x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) return x class ResNet(Backbone): def __init__(self, stem, stages, num_classes=None, out_features=None): """ Args: stem (nn.Module): a stem module stages (list[list[CNNBlockBase]]): several (typically 4) stages, each contains multiple :class:`CNNBlockBase`. num_classes (None or int): if None, will not perform classification. Otherwise, will create a linear layer. out_features (list[str]): name of the layers whose outputs should be returned in forward. Can be anything in "stem", "linear", or "res2" ... If None, will return the output of the last layer. 
""" super(ResNet, self).__init__() self.stem = stem self.num_classes = num_classes current_stride = self.stem.stride self._out_feature_strides = {"stem": current_stride} self._out_feature_channels = {"stem": self.stem.out_channels} self.stages_and_names = [] for i, blocks in enumerate(stages): assert len(blocks) > 0, len(blocks) for block in blocks: assert isinstance(block, CNNBlockBase), block name = "res" + str(i + 2) stage = nn.Sequential(*blocks) self.add_module(name, stage) self.stages_and_names.append((stage, name)) self._out_feature_strides[name] = current_stride = int( current_stride * np.prod([k.stride for k in blocks]) ) self._out_feature_channels[name] = curr_channels = blocks[-1].out_channels if num_classes is not None: self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) self.linear = nn.Linear(curr_channels, num_classes) # Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour": # "The 1000-way fully-connected layer is initialized by # drawing weights from a zero-mean Gaussian with standard deviation of 0.01." 
nn.init.normal_(self.linear.weight, std=0.01) name = "linear" if out_features is None: out_features = [name] self._out_features = out_features assert len(self._out_features) children = [x[0] for x in self.named_children()] for out_feature in self._out_features: assert out_feature in children, "Available children: {}".format(", ".join(children)) def forward(self, x): outputs = {} x = self.stem(x) if "stem" in self._out_features: outputs["stem"] = x for stage, name in self.stages_and_names: x = stage(x) if name in self._out_features: outputs[name] = x if self.num_classes is not None: x = self.avgpool(x) x = torch.flatten(x, 1) x = self.linear(x) if "linear" in self._out_features: outputs["linear"] = x return outputs def output_shape(self): return { name: ShapeSpec( channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] ) for name in self._out_features } def freeze(self, freeze_at=0): """ Freeze the first several stages of the ResNet. Commonly used in fine-tuning. Args: freeze_at (int): number of stem and stages to freeze. `1` means freezing the stem. `2` means freezing the stem and the first stage, etc. Returns: nn.Module: this ResNet itself """ if freeze_at >= 1: self.stem.freeze() for idx, (stage, _) in enumerate(self.stages_and_names, start=2): if freeze_at >= idx: for block in stage.children(): block.freeze() return self @BACKBONE_REGISTRY.register() def build_res2net_backbone(cfg, input_shape): """ Create a Res2Net instance from config. Returns: ResNet: a :class:`ResNet` instance. """ # need registration of new blocks/stems? 
norm = cfg.MODEL.RESNETS.NORM stem = BasicStem( in_channels=input_shape.channels, out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS, norm=norm, ) # fmt: off freeze_at = cfg.MODEL.BACKBONE.FREEZE_AT out_features = cfg.MODEL.RESNETS.OUT_FEATURES depth = cfg.MODEL.RESNETS.DEPTH num_groups = cfg.MODEL.RESNETS.NUM_GROUPS width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP scale = 4 bottleneck_channels = num_groups * width_per_group * scale in_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS stride_in_1x1 = cfg.MODEL.RESNETS.STRIDE_IN_1X1 res5_dilation = cfg.MODEL.RESNETS.RES5_DILATION deform_on_per_stage = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE deform_modulated = cfg.MODEL.RESNETS.DEFORM_MODULATED deform_num_groups = cfg.MODEL.RESNETS.DEFORM_NUM_GROUPS # fmt: on assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation) num_blocks_per_stage = { 18: [2, 2, 2, 2], 34: [3, 4, 6, 3], 50: [3, 4, 6, 3], 101: [3, 4, 23, 3], 152: [3, 8, 36, 3], }[depth] if depth in [18, 34]: assert out_channels == 64, "Must set MODEL.RESNETS.RES2_OUT_CHANNELS = 64 for R18/R34" assert not any( deform_on_per_stage ), "MODEL.RESNETS.DEFORM_ON_PER_STAGE unsupported for R18/R34" assert res5_dilation == 1, "Must set MODEL.RESNETS.RES5_DILATION = 1 for R18/R34" assert num_groups == 1, "Must set MODEL.RESNETS.NUM_GROUPS = 1 for R18/R34" stages = [] # Avoid creating variables without gradients # It consumes extra memory and may cause allreduce to fail out_stage_idx = [{"res2": 2, "res3": 3, "res4": 4, "res5": 5}[f] for f in out_features] max_stage_idx = max(out_stage_idx) for idx, stage_idx in enumerate(range(2, max_stage_idx + 1)): dilation = res5_dilation if stage_idx == 5 else 1 first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2 stage_kargs = { "num_blocks": num_blocks_per_stage[idx], "first_stride": first_stride, "in_channels": in_channels, "out_channels": out_channels, "norm": norm, } # Use BasicBlock for 
@BACKBONE_REGISTRY.register()
def build_p67_res2net_fpn_backbone(cfg, input_shape: ShapeSpec):
    """
    Build a Res2Net + FPN backbone with the extra P6/P7 levels on top.

    Args:
        cfg: a detectron2 CfgNode.  Reads ``MODEL.FPN.IN_FEATURES``,
            ``MODEL.FPN.OUT_CHANNELS``, ``MODEL.FPN.NORM`` and
            ``MODEL.FPN.FUSE_TYPE``.
        input_shape (ShapeSpec): forwarded to ``build_res2net_backbone``.

    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    res2net = build_res2net_backbone(cfg, input_shape)
    fpn_channels = cfg.MODEL.FPN.OUT_CHANNELS
    return FPN(
        bottom_up=res2net,
        in_features=cfg.MODEL.FPN.IN_FEATURES,
        out_channels=fpn_channels,
        norm=cfg.MODEL.FPN.NORM,
        # The top block reads the FPN's "p5" output, so both of its channel
        # arguments are the FPN width.
        top_block=LastLevelP6P7_P5(fpn_channels, fpn_channels),
        fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
    )
""" bottom_up = build_res2net_backbone(cfg, input_shape) in_features = cfg.MODEL.FPN.IN_FEATURES backbone = BiFPN( cfg=cfg, bottom_up=bottom_up, in_features=in_features, out_channels=cfg.MODEL.BIFPN.OUT_CHANNELS, norm=cfg.MODEL.BIFPN.NORM, num_levels=cfg.MODEL.BIFPN.NUM_LEVELS, num_bifpn=cfg.MODEL.BIFPN.NUM_BIFPN, separable_conv=cfg.MODEL.BIFPN.SEPARABLE_CONV, ) return backbone ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/debug.py ================================================ import cv2 import numpy as np import torch import torch.nn.functional as F COLORS = ((np.random.rand(1300, 3) * 0.4 + 0.6) * 255).astype( np.uint8).reshape(1300, 1, 1, 3) def _get_color_image(heatmap): heatmap = heatmap.reshape( heatmap.shape[0], heatmap.shape[1], heatmap.shape[2], 1) if heatmap.shape[0] == 1: color_map = (heatmap * np.ones((1, 1, 1, 3), np.uint8) * 255).max( axis=0).astype(np.uint8) # H, W, 3 else: color_map = (heatmap * COLORS[:heatmap.shape[0]]).max(axis=0).astype(np.uint8) # H, W, 3 return color_map def _blend_image(image, color_map, a=0.7): color_map = cv2.resize(color_map, (image.shape[1], image.shape[0])) ret = np.clip(image * (1 - a) + color_map * a, 0, 255).astype(np.uint8) return ret def _blend_image_heatmaps(image, color_maps, a=0.7): merges = np.zeros((image.shape[0], image.shape[1], 3), np.float32) for color_map in color_maps: color_map = cv2.resize(color_map, (image.shape[1], image.shape[0])) merges = np.maximum(merges, color_map) ret = np.clip(image * (1 - a) + merges * a, 0, 255).astype(np.uint8) return ret def _decompose_level(x, shapes_per_level, N): ''' x: LNHiWi x C ''' x = x.view(x.shape[0], -1) ret = [] st = 0 for l in range(len(shapes_per_level)): ret.append([]) h = shapes_per_level[l][0].int().item() w = shapes_per_level[l][1].int().item() for i in range(N): ret[l].append(x[st + h * w * i:st + h * w * (i + 1)].view( h, w, -1).permute(2, 0, 1)) st += h * w * N 
return ret def _imagelist_to_tensor(images): images = [x for x in images] image_sizes = [x.shape[-2:] for x in images] h = max([size[0] for size in image_sizes]) w = max([size[1] for size in image_sizes]) S = 32 h, w = ((h - 1) // S + 1) * S, ((w - 1) // S + 1) * S images = [F.pad(x, (0, w - x.shape[2], 0, h - x.shape[1], 0, 0)) \ for x in images] images = torch.stack(images) return images def _ind2il(ind, shapes_per_level, N): r = ind l = 0 S = 0 while r - S >= N * shapes_per_level[l][0] * shapes_per_level[l][1]: S += N * shapes_per_level[l][0] * shapes_per_level[l][1] l += 1 i = (r - S) // (shapes_per_level[l][0] * shapes_per_level[l][1]) return i, l def debug_train( images, gt_instances, flattened_hms, reg_targets, labels, pos_inds, shapes_per_level, locations, strides): ''' images: N x 3 x H x W flattened_hms: LNHiWi x C shapes_per_level: L x 2 [(H_i, W_i)] locations: LNHiWi x 2 ''' reg_inds = torch.nonzero( reg_targets.max(dim=1)[0] > 0).squeeze(1) N = len(images) images = _imagelist_to_tensor(images) repeated_locations = [torch.cat([loc] * N, dim=0) \ for loc in locations] locations = torch.cat(repeated_locations, dim=0) gt_hms = _decompose_level(flattened_hms, shapes_per_level, N) masks = flattened_hms.new_zeros((flattened_hms.shape[0], 1)) masks[pos_inds] = 1 masks = _decompose_level(masks, shapes_per_level, N) for i in range(len(images)): image = images[i].detach().cpu().numpy().transpose(1, 2, 0) color_maps = [] for l in range(len(gt_hms)): color_map = _get_color_image( gt_hms[l][i].detach().cpu().numpy()) color_maps.append(color_map) cv2.imshow('gthm_{}'.format(l), color_map) blend = _blend_image_heatmaps(image.copy(), color_maps) if gt_instances is not None: bboxes = gt_instances[i].gt_boxes.tensor for j in range(len(bboxes)): bbox = bboxes[j] cv2.rectangle( blend, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (0, 0, 255), 3, cv2.LINE_AA) for j in range(len(pos_inds)): image_id, l = _ind2il(pos_inds[j], shapes_per_level, N) if image_id != 
def debug_test(
        images, logits_pred, reg_pred, agn_hm_pred=None, preds=None,
        vis_thresh=0.3, debug_show_name=False, mult_agn=False):
    '''
    Show test-time heatmaps and predicted boxes in OpenCV windows, one image
    at a time (blocks on cv2.waitKey() after every image).

    images: N x 3 x H x W
    logits_pred: per-level class heatmaps, indexed as logits_pred[l][i];
        logits_pred[0] may be None when there is no class heatmap.
    reg_pred: unused; kept for call compatibility.
    agn_hm_pred: optional per-level agnostic heatmaps, indexed as
        agn_hm_pred[l][i]; entries may be None.  None means "no agnostic
        heatmaps".
    preds: optional per-image predictions (objects exposing has(), scores
        and proposal_boxes / pred_boxes).  None means "no predictions".
    vis_thresh: only predictions scoring above this value are drawn.
    debug_show_name: also draw the LVIS category name and score per box.
    mult_agn: multiply each class heatmap by the agnostic heatmap of the same
        level before rendering (requires agn_hm_pred).

    The inputs are not modified.
    '''
    # None replaces the former mutable [] defaults; missing entries are
    # treated as "nothing to draw" instead of raising IndexError.
    agn_hm_pred = [] if agn_hm_pred is None else agn_hm_pred
    preds = [] if preds is None else preds

    cat2name = None
    if debug_show_name:
        # Imported lazily so the LVIS table is only needed when names are drawn.
        from detectron2.data.datasets.lvis_v1_categories import LVIS_CATEGORIES
        cat2name = [x['name'] for x in LVIS_CATEGORIES]

    has_class_hm = len(logits_pred) > 0 and logits_pred[0] is not None
    for i in range(len(images)):
        image = images[i].detach().cpu().numpy().transpose(1, 2, 0)
        pred_image = image.copy().astype(np.uint8)
        color_maps = []
        image_preds = preds[i] if i < len(preds) else None
        for l in range(len(logits_pred)):
            level_agn = agn_hm_pred[l] if l < len(agn_hm_pred) else None
            if has_class_hm:
                level_hm = logits_pred[l][i]
                if mult_agn:
                    # Work on a local product rather than writing back into
                    # the caller's logits_pred.
                    level_hm = level_hm * level_agn[i]
                color_map = _get_color_image(level_hm.detach().cpu().numpy())
                color_maps.append(color_map)
                cv2.imshow('predhm_{}'.format(l), color_map)

            # NOTE(review): boxes are redrawn for every level, as in the
            # original control flow; the drawing itself is idempotent.
            num_preds = len(image_preds.scores) if image_preds is not None else 0
            for j in range(num_preds):
                if not (image_preds.scores[j] > vis_thresh):
                    continue
                if image_preds.has('proposal_boxes'):
                    bbox = image_preds.proposal_boxes[j]
                else:
                    bbox = image_preds.pred_boxes[j]
                bbox = bbox.tensor[0].detach().cpu().numpy().astype(np.int32)
                cat = int(image_preds.pred_classes[j]) \
                    if image_preds.has('pred_classes') else 0
                cl = COLORS[cat, 0, 0]
                color = (int(cl[0]), int(cl[1]), int(cl[2]))
                cv2.rectangle(
                    pred_image,
                    (int(bbox[0]), int(bbox[1])),
                    (int(bbox[2]), int(bbox[3])),
                    color, 2, cv2.LINE_AA)
                if debug_show_name:
                    txt = '{}{:.1f}'.format(
                        cat2name[cat] if cat > 0 else '',
                        image_preds.scores[j])
                    font = cv2.FONT_HERSHEY_SIMPLEX
                    cat_size = cv2.getTextSize(txt, font, 0.5, 2)[0]
                    # Filled label background just above the box corner.
                    cv2.rectangle(
                        pred_image,
                        (int(bbox[0]), int(bbox[1] - cat_size[1] - 2)),
                        (int(bbox[0] + cat_size[0]), int(bbox[1] - 2)),
                        color, -1)
                    cv2.putText(
                        pred_image, txt, (int(bbox[0]), int(bbox[1] - 2)),
                        font, 0.5, (0, 0, 0), thickness=1, lineType=cv2.LINE_AA)

            if level_agn is not None:
                agn_hm_ = level_agn[i, 0, :, :, None].detach().cpu().numpy()
                agn_hm_ = (agn_hm_ * np.array([255, 255, 255]).reshape(
                    1, 1, 3)).astype(np.uint8)
                cv2.imshow('agn_hm_{}'.format(l), agn_hm_)
        blend = _blend_image_heatmaps(image.copy(), color_maps)
        cv2.imshow('blend', blend)
        cv2.imshow('preds', pred_image)
        cv2.waitKey()
(int(cl[0]), int(cl[1]), int(cl[2])) cv2.rectangle( image, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), cl, 2, cv2.LINE_AA) if debug_show_name: cat = cats[j] txt = '{}{:.1f}'.format( cat2name[cat] if cat > 0 else '', scores[j]) font = cv2.FONT_HERSHEY_SIMPLEX cat_size = cv2.getTextSize(txt, font, 0.5, 2)[0] cv2.rectangle( image, (int(bbox[0]), int(bbox[1] - cat_size[1] - 2)), (int(bbox[0] + cat_size[0]), int(bbox[1] - 2)), (int(cl[0]), int(cl[1]), int(cl[2])), -1) cv2.putText( image, txt, (int(bbox[0]), int(bbox[1] - 2)), font, 0.5, (0, 0, 0), thickness=1, lineType=cv2.LINE_AA) if proposals is not None: proposal_image = images[i].detach().cpu().numpy().transpose(1, 2, 0).astype(np.uint8).copy() bboxes = proposals[i].proposal_boxes.tensor.cpu().numpy() if proposals[i].has('scores'): scores = proposals[i].scores.cpu().numpy() else: scores = proposals[i].objectness_logits.sigmoid().cpu().numpy() for j in range(len(bboxes)): if scores[j] > vis_thresh: bbox = bboxes[j] cl = (209, 159, 83) cv2.rectangle( proposal_image, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), cl, 2, cv2.LINE_AA) cv2.imshow('image', image) if proposals is not None: cv2.imshow('proposals', proposal_image) if save_debug: global cnt cnt += 1 cv2.imwrite('output/save_debug/{}.jpg'.format(cnt), proposal_image) cv2.waitKey() ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/dense_heads/__init__.py ================================================ ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/dense_heads/centernet.py ================================================ import math import json import copy from typing import List, Dict import numpy as np import torch from torch import nn from torch.nn import functional as F from detectron2.modeling.proposal_generator.build import PROPOSAL_GENERATOR_REGISTRY from 
@configurable
def __init__(self,
    # input_shape: Dict[str, ShapeSpec],
    in_channels=256,
    *,
    num_classes=80,
    in_features=("p3", "p4", "p5", "p6", "p7"),
    strides=(8, 16, 32, 64, 128),
    score_thresh=0.05,
    hm_min_overlap=0.8,
    loc_loss_type='giou',
    min_radius=4,
    hm_focal_alpha=0.25,
    hm_focal_beta=4,
    loss_gamma=2.0,
    reg_weight=2.0,
    not_norm_reg=True,
    with_agn_hm=False,
    only_proposal=False,
    as_proposal=False,
    not_nms=False,
    pos_weight=1.,
    neg_weight=1.,
    sigmoid_clamp=1e-4,
    ignore_high_fp=-1.,
    center_nms=False,
    sizes_of_interest=((0, 80), (64, 160), (128, 320), (256, 640), (512, 10000000)),
    more_pos=False,
    more_pos_thresh=0.2,
    more_pos_topk=9,
    pre_nms_topk_train=1000,
    pre_nms_topk_test=1000,
    post_nms_topk_train=100,
    post_nms_topk_test=100,
    nms_thresh_train=0.6,
    nms_thresh_test=0.6,
    no_reduce=False,
    debug=False,
    vis_thresh=0.5,
    pixel_mean=(103.530, 116.280, 123.675),
    pixel_std=(1.0, 1.0, 1.0),
    device='cuda',
    centernet_head=None,
):
    '''
    Store the CenterNet hyper-parameters and set up the prediction head.

    Most keyword arguments are copied verbatim onto `self` under the same
    name. The exceptions are:
      hm_min_overlap: only used to derive `self.delta`, the factor used when
        rendering heatmaps
      loc_loss_type: only used to build `self.iou_loss`
      center_nms: stored, and when true it also forces `self.not_nms = True`
      in_channels: only used to build a default CenterNetHead when
        `centernet_head` is None
      pixel_mean, pixel_std, device: only used when `debug` is true, to build
        `self.denormalizer`

    The sequence-valued defaults are tuples rather than lists so that no
    mutable object is shared between instances built from the defaults.

    Raises:
      AssertionError: if `only_proposal` is set without `with_agn_hm`
    '''
    super().__init__()
    self.num_classes = num_classes
    self.in_features = in_features
    self.strides = strides
    self.score_thresh = score_thresh
    self.min_radius = min_radius
    self.hm_focal_alpha = hm_focal_alpha
    self.hm_focal_beta = hm_focal_beta
    self.loss_gamma = loss_gamma
    self.reg_weight = reg_weight
    self.not_norm_reg = not_norm_reg
    self.with_agn_hm = with_agn_hm
    self.only_proposal = only_proposal
    self.as_proposal = as_proposal
    self.not_nms = not_nms
    self.pos_weight = pos_weight
    self.neg_weight = neg_weight
    self.sigmoid_clamp = sigmoid_clamp
    self.ignore_high_fp = ignore_high_fp
    self.center_nms = center_nms
    self.sizes_of_interest = sizes_of_interest
    self.more_pos = more_pos
    self.more_pos_thresh = more_pos_thresh
    self.more_pos_topk = more_pos_topk
    self.pre_nms_topk_train = pre_nms_topk_train
    self.pre_nms_topk_test = pre_nms_topk_test
    self.post_nms_topk_train = post_nms_topk_train
    self.post_nms_topk_test = post_nms_topk_test
    self.nms_thresh_train = nms_thresh_train
    self.nms_thresh_test = nms_thresh_test
    self.no_reduce = no_reduce
    self.debug = debug
    self.vis_thresh = vis_thresh
    if self.center_nms:
        # center NMS replaces box NMS, so box NMS is switched off
        self.not_nms = True
    self.iou_loss = IOULoss(loc_loss_type)
    # the proposal-only mode scores boxes with the agnostic heatmap alone
    assert (not self.only_proposal) or self.with_agn_hm
    # delta for rendering heatmap
    self.delta = (1 - hm_min_overlap) / (1 + hm_min_overlap)
    if centernet_head is None:
        self.centernet_head = CenterNetHead(
            in_channels=in_channels,
            num_levels=len(in_features),
            with_agn_hm=with_agn_hm,
            only_proposal=only_proposal)
    else:
        self.centernet_head = centernet_head
    if self.debug:
        pixel_mean = torch.Tensor(pixel_mean).to(
            torch.device(device)).view(3, 1, 1)
        pixel_std = torch.Tensor(pixel_std).to(
            torch.device(device)).view(3, 1, 1)
        # undoes the input normalization so debug images can be displayed
        self.denormalizer = lambda x: x * pixel_std + pixel_mean
cfg.MODEL.CENTERNET.LOSS_GAMMA, 'reg_weight': cfg.MODEL.CENTERNET.REG_WEIGHT, 'not_norm_reg': cfg.MODEL.CENTERNET.NOT_NORM_REG, 'with_agn_hm': cfg.MODEL.CENTERNET.WITH_AGN_HM, 'only_proposal': cfg.MODEL.CENTERNET.ONLY_PROPOSAL, 'as_proposal': cfg.MODEL.CENTERNET.AS_PROPOSAL, 'not_nms': cfg.MODEL.CENTERNET.NOT_NMS, 'pos_weight': cfg.MODEL.CENTERNET.POS_WEIGHT, 'neg_weight': cfg.MODEL.CENTERNET.NEG_WEIGHT, 'sigmoid_clamp': cfg.MODEL.CENTERNET.SIGMOID_CLAMP, 'ignore_high_fp': cfg.MODEL.CENTERNET.IGNORE_HIGH_FP, 'center_nms': cfg.MODEL.CENTERNET.CENTER_NMS, 'sizes_of_interest': cfg.MODEL.CENTERNET.SOI, 'more_pos': cfg.MODEL.CENTERNET.MORE_POS, 'more_pos_thresh': cfg.MODEL.CENTERNET.MORE_POS_THRESH, 'more_pos_topk': cfg.MODEL.CENTERNET.MORE_POS_TOPK, 'pre_nms_topk_train': cfg.MODEL.CENTERNET.PRE_NMS_TOPK_TRAIN, 'pre_nms_topk_test': cfg.MODEL.CENTERNET.PRE_NMS_TOPK_TEST, 'post_nms_topk_train': cfg.MODEL.CENTERNET.POST_NMS_TOPK_TRAIN, 'post_nms_topk_test': cfg.MODEL.CENTERNET.POST_NMS_TOPK_TEST, 'nms_thresh_train': cfg.MODEL.CENTERNET.NMS_TH_TRAIN, 'nms_thresh_test': cfg.MODEL.CENTERNET.NMS_TH_TEST, 'no_reduce': cfg.MODEL.CENTERNET.NO_REDUCE, 'debug': cfg.DEBUG, 'vis_thresh': cfg.VIS_THRESH, 'pixel_mean': cfg.MODEL.PIXEL_MEAN, 'pixel_std': cfg.MODEL.PIXEL_STD, 'device': cfg.MODEL.DEVICE, 'centernet_head': CenterNetHead( cfg, [input_shape[f] for f in cfg.MODEL.CENTERNET.IN_FEATURES]), } return ret def forward(self, images, features_dict, gt_instances): features = [features_dict[f] for f in self.in_features] clss_per_level, reg_pred_per_level, agn_hm_pred_per_level = \ self.centernet_head(features) grids = self.compute_grids(features) shapes_per_level = grids[0].new_tensor( [(x.shape[2], x.shape[3]) for x in reg_pred_per_level]) if not self.training: return self.inference( images, clss_per_level, reg_pred_per_level, agn_hm_pred_per_level, grids) else: pos_inds, labels, reg_targets, flattened_hms = \ self._get_ground_truth( grids, shapes_per_level, gt_instances) # 
def compute_grids(self, features):
    '''
    Build, for every FPN level, the image-space coordinates of the center of
    each feature-map location.

    Locations are enumerated row by row; the location at (row, col) of a
    level with stride s maps to (col * s + s // 2, row * s + s // 2).

    Inputs:
        features: list of L feature maps; only the last two dims (h_l, w_l)
            and the device of each map are read
    Return:
        grids: list of L float32 tensors [(h_l x w_l, 2)]_l in (x, y) order
    '''
    grids = []
    for level, feature in enumerate(features):
        h, w = feature.size()[-2:]
        stride = self.strides[level]
        shifts_x = torch.arange(
            0, w * stride,
            step=stride,
            dtype=torch.float32, device=feature.device)
        shifts_y = torch.arange(
            0, h * stride,
            step=stride,
            dtype=torch.float32, device=feature.device)
        # Same values as meshgrid(shifts_y, shifts_x) in 'ij' order followed
        # by reshape(-1), but spelled out explicitly so the result does not
        # depend on torch.meshgrid's default indexing (which newer PyTorch
        # warns about when the argument is omitted).
        shift_x = shifts_x.repeat(h)             # x cycles fastest
        shift_y = shifts_y.repeat_interleave(w)  # y is constant along a row
        grids_per_level = torch.stack((shift_x, shift_y), dim=1) + \
            stride // 2
        grids.append(grids_per_level)
    return grids
def _get_label_inds(self, gt_instances, shapes_per_level):
    '''
    Compute, for every ground-truth object, the flat index of the location
    holding its center on each FPN level that is responsible for it.

    The flat index follows a level-first layout: all locations of level 0
    for every image, then level 1, and so on. Within a level the images are
    stored one after another, and within an image the locations are
    row-major.

    Inputs:
        gt_instances: [n_i], sum n_i = N; each item exposes
            `gt_boxes.tensor` (n_i x 4) and `gt_classes` (n_i)
        shapes_per_level: L x 2 [(h_l, w_l)]_L
    Returns:
        pos_inds: N'
        labels: N'
    '''
    num_levels = len(self.strides)
    num_images = len(gt_instances)
    shapes_per_level = shapes_per_level.long()
    level_widths = shapes_per_level[:, 1]  # L
    loc_per_level = (shapes_per_level[:, 0] * level_widths).long()  # L

    # offset of the first location of each level in the flat layout
    level_bases = []
    running = 0
    for level in range(num_levels):
        level_bases.append(running)
        running = running + num_images * loc_per_level[level]
    level_bases = shapes_per_level.new_tensor(level_bases).long()  # L
    level_strides = shapes_per_level.new_tensor(self.strides).float()  # L

    pos_inds, labels = [], []
    for im_i, targets_per_im in enumerate(gt_instances):
        bboxes = targets_per_im.gt_boxes.tensor  # n x 4
        n = bboxes.shape[0]
        centers = (bboxes[:, [0, 1]] + bboxes[:, [2, 3]]) / 2  # n x 2
        # (column, row) of the cell containing the center, per level
        cells = (centers[:, None, :] /
                 level_strides[None, :, None]).long()  # n x L x 2
        flat_ind = level_bases[None, :] + \
            im_i * loc_per_level[None, :] + \
            cells[:, :, 1] * level_widths[None, :] + \
            cells[:, :, 0]  # n x L
        in_level = self.assign_fpn_level(bboxes)  # n x L
        pos_inds.append(flat_ind[in_level].view(-1))  # n'
        labels.append(
            targets_per_im.gt_classes.view(n, 1).expand(
                n, num_levels)[in_level].view(-1))  # n'
    return torch.cat(pos_inds, dim=0).long(), torch.cat(labels, dim=0)  # N, N


def assign_fpn_level(self, boxes):
    '''
    Decide which FPN levels are responsible for each box, by comparing half
    of the box diagonal with the `self.sizes_of_interest` range of each level
    (both bounds inclusive).

    Inputs:
        boxes: n x 4
    Return:
        is_cared_in_the_level: n x L
    '''
    num_levels = len(self.sizes_of_interest)
    size_ranges = boxes.new_tensor(
        self.sizes_of_interest).view(num_levels, 2)  # L x 2
    extent = boxes[:, 2:] - boxes[:, :2]  # n x 2
    half_diag = (extent ** 2).sum(dim=1) ** 0.5 / 2  # n
    half_diag = half_diag[:, None]  # n x 1
    lower = size_ranges[None, :, 0]  # 1 x L
    upper = size_ranges[None, :, 1]  # 1 x L
    return (half_diag >= lower) & (half_diag <= upper)
def _get_reg_targets(self, reg_targets, dist, mask, area):
    '''
    Pick, for every location, the regression target of the closest object
    among those allowed by `mask`.

    `dist` is modified in place (masked-out entries are overwritten with
    INF). `area` is accepted but not used.

    Inputs:
        reg_targets: M x N x 4
        dist: M x N
        mask: M x N, zero where the (location, object) pair is not allowed
    Return:
        M x 4; rows with no allowed object are filled with -INF
    '''
    dist[mask == 0] = INF * 1.0
    min_dist, min_inds = dist.min(dim=1)  # M
    rows = torch.arange(len(reg_targets), device=min_inds.device)
    picked = reg_targets[rows, min_inds]  # M x N x 4 --> M x 4
    picked[min_dist == INF] = - INF
    return picked


def _create_heatmaps_from_dist(self, dist, labels, channels):
    '''
    Render one heatmap per class as exp(-d), where d is the smallest entry of
    `dist` over the objects of that class; responses below 1e-4 are set to 0
    and classes without any object stay all-zero.

    Inputs:
        dist: M x N
        labels: N
        channels: number of heatmap channels C
    Return:
        heatmaps: M x C
    '''
    heatmaps = dist.new_zeros((dist.shape[0], channels))
    for cls_id in range(channels):
        of_class = (labels == cls_id)  # N
        if not of_class.any():
            continue
        nearest = dist[:, of_class].min(dim=1)[0]  # M
        response = torch.exp(-nearest)
        heatmaps[:, cls_id] = torch.where(
            response < 1e-4, torch.zeros_like(response), response)
    return heatmaps


def _create_agn_heatmaps_from_dist(self, dist):
    '''
    Class-agnostic variant of the heatmap rendering: exp(-d) with d the
    smallest entry of `dist` over all objects, thresholded at 1e-4.

    TODO (Xingyi): merge it with _create_heatmaps_from_dist

    Inputs:
        dist: M x N
    Return:
        heatmaps: M x 1
    '''
    response = torch.exp(-dist.min(dim=1)[0])  # M
    response = torch.where(
        response < 1e-4, torch.zeros_like(response), response)
    heatmaps = dist.new_zeros((dist.shape[0], 1))
    heatmaps[:, 0] = response
    return heatmaps


def _flatten_outputs(self, clss, reg_pred, agn_hm_pred):
    '''
    Flatten the per-level head outputs and concatenate them over levels.

    Each level goes (N, F, Hl, Wl) -> (N, Hl, Wl, F) -> (N*Hl*Wl, F).
    `clss` becomes None when its first level is None, and `agn_hm_pred`
    becomes None when `self.with_agn_hm` is false.
    '''
    if clss[0] is not None:
        clss = cat(
            [level.permute(0, 2, 3, 1).reshape(-1, level.shape[1])
             for level in clss], dim=0)
    else:
        clss = None
    reg_pred = cat(
        [level.permute(0, 2, 3, 1).reshape(-1, 4) for level in reg_pred],
        dim=0)
    if self.with_agn_hm:
        agn_hm_pred = cat(
            [level.permute(0, 2, 3, 1).reshape(-1) for level in agn_hm_pred],
            dim=0)
    else:
        agn_hm_pred = None
    return clss, reg_pred, agn_hm_pred
def inference(self, images, clss_per_level, reg_pred_per_level,
              agn_hm_pred_per_level, grids):
    '''
    Turn the raw head outputs into per-image detections.

    Class logits and agnostic heatmaps go through a sigmoid (None levels are
    kept as None). In proposal-only mode the agnostic heatmap takes the place
    of the class scores. When the result is used as proposals, `pred_boxes`
    is moved to `proposal_boxes` and `scores` is copied to
    `objectness_logits`.

    Return:
        (proposals, {}): the per-image results and an empty dict
    '''
    def _sigmoid_or_none(levels):
        return [None if x is None else x.sigmoid() for x in levels]

    logits_pred = _sigmoid_or_none(clss_per_level)
    agn_hm_pred_per_level = _sigmoid_or_none(agn_hm_pred_per_level)

    if self.only_proposal:
        scores_per_level = agn_hm_pred_per_level
        agn_per_level = [None] * len(agn_hm_pred_per_level)
    else:
        scores_per_level = logits_pred
        agn_per_level = agn_hm_pred_per_level
    proposals = self.predict_instances(
        grids, scores_per_level, reg_pred_per_level,
        images.image_sizes, agn_per_level)

    if self.as_proposal or self.only_proposal:
        for proposal in proposals:
            proposal.proposal_boxes = proposal.get('pred_boxes')
            proposal.objectness_logits = proposal.get('scores')
            proposal.remove('pred_boxes')

    if self.debug:
        debug_test(
            [self.denormalizer(x) for x in images],
            logits_pred, reg_pred_per_level,
            agn_hm_pred_per_level, preds=proposals,
            vis_thresh=self.vis_thresh,
            debug_show_name=False)
    return proposals, {}


def predict_instances(
    self, grids, logits_pred, reg_pred, image_sizes, agn_hm_pred,
    is_proposal=False):
    '''
    Decode every FPN level, merge the levels per image and post-process.

    The regression output of each level is multiplied by the level stride
    before decoding. The per-level results are regrouped per image,
    concatenated with `Instances.cat`, and passed to `nms_and_topK` (NMS is
    applied unless `self.not_nms` is set).

    Return:
        list with one merged result per image
    '''
    per_level = [
        self.predict_single_level(
            level_grids, logits_pred[level],
            reg_pred[level] * self.strides[level],
            image_sizes, agn_hm_pred[level], level,
            is_proposal=is_proposal)
        for level, level_grids in enumerate(grids)
    ]
    # level-first -> image-first
    per_image = [Instances.cat(boxlist) for boxlist in zip(*per_level)]
    return self.nms_and_topK(per_image, nms=not self.not_nms)
nn.functional.max_pool2d( heatmap, (3, 3), stride=1, padding=1) heatmap = heatmap * (heatmap_nms == heatmap).float() heatmap = heatmap.permute(0, 2, 3, 1) # N x H x W x C heatmap = heatmap.reshape(N, -1, C) # N x HW x C box_regression = reg_pred.view(N, 4, H, W).permute(0, 2, 3, 1) # N x H x W x 4 box_regression = box_regression.reshape(N, -1, 4) candidate_inds = heatmap > self.score_thresh # 0.05 pre_nms_top_n = candidate_inds.view(N, -1).sum(1) # N pre_nms_topk = self.pre_nms_topk_train if self.training else self.pre_nms_topk_test pre_nms_top_n = pre_nms_top_n.clamp(max=pre_nms_topk) # N if agn_hm is not None: agn_hm = agn_hm.view(N, 1, H, W).permute(0, 2, 3, 1) agn_hm = agn_hm.reshape(N, -1) heatmap = heatmap * agn_hm[:, :, None] results = [] for i in range(N): per_box_cls = heatmap[i] # HW x C per_candidate_inds = candidate_inds[i] # n per_box_cls = per_box_cls[per_candidate_inds] # n per_candidate_nonzeros = per_candidate_inds.nonzero() # n per_box_loc = per_candidate_nonzeros[:, 0] # n per_class = per_candidate_nonzeros[:, 1] # n per_box_regression = box_regression[i] # HW x 4 per_box_regression = per_box_regression[per_box_loc] # n x 4 per_grids = grids[per_box_loc] # n x 2 per_pre_nms_top_n = pre_nms_top_n[i] # 1 if per_candidate_inds.sum().item() > per_pre_nms_top_n.item(): per_box_cls, top_k_indices = \ per_box_cls.topk(per_pre_nms_top_n, sorted=False) per_class = per_class[top_k_indices] per_box_regression = per_box_regression[top_k_indices] per_grids = per_grids[top_k_indices] detections = torch.stack([ per_grids[:, 0] - per_box_regression[:, 0], per_grids[:, 1] - per_box_regression[:, 1], per_grids[:, 0] + per_box_regression[:, 2], per_grids[:, 1] + per_box_regression[:, 3], ], dim=1) # n x 4 # avoid invalid boxes in RoI heads detections[:, 2] = torch.max(detections[:, 2], detections[:, 0] + 0.01) detections[:, 3] = torch.max(detections[:, 3], detections[:, 1] + 0.01) boxlist = Instances(image_sizes[i]) boxlist.scores = torch.sqrt(per_box_cls) \ if 
def nms_and_topK(self, boxlists, nms=True):
    '''
    Optionally run NMS on every image and cap the number of results.

    The NMS threshold and the cap come from the `*_train` attributes when
    `self.training` is set and from the `*_test` ones otherwise. When an
    image has more results than the cap, the score of the cap-th best result
    becomes a threshold and every result scoring at least that much is kept
    (so ties at the threshold are all kept).

    Inputs:
        boxlists: one result container per image; it must support len(),
            `.scores` and indexing by an index tensor
        nms: whether to apply `ml_nms` first
    Return:
        list with one filtered container per image
    '''
    if self.training:
        nms_thresh = self.nms_thresh_train
        post_nms_topk = self.post_nms_topk_train
    else:
        nms_thresh = self.nms_thresh_test
        post_nms_topk = self.post_nms_topk_test

    results = []
    for boxlist in boxlists:
        result = ml_nms(boxlist, nms_thresh) if nms else boxlist
        if self.debug:
            print('#proposals before nms', len(boxlist))
            print('#proposals after nms', len(result))
        num_dets = len(result)
        if num_dets > post_nms_topk:
            scores = result.scores
            # k-th smallest score == post_nms_topk-th largest score
            kth_smallest = num_dets - post_nms_topk + 1
            image_thresh = torch.kthvalue(
                scores.float().cpu(), kth_smallest)[0]
            keep = torch.nonzero(
                scores >= image_thresh.item()).squeeze(1)
            result = result[keep]
        if self.debug:
            print('#proposals after filter', len(result))
        results.append(result)
    return results
self.more_pos_thresh # N new_pos = c33_reg_loss.view(N, L, K) < \ loss_thresh.view(N, 1, 1).expand(N, L, K) pos_inds = c33_inds[new_pos].view(-1) # P labels = labels.view(N, 1, 1).expand(N, L, K)[new_pos].view(-1) return pos_inds, labels def _get_c33_inds(self, gt_instances, shapes_per_level): ''' TODO (Xingyi): The current implementation is ugly. Refactor. Get the center (and the 3x3 region near center) locations of each objects Inputs: gt_instances: [n_i], sum n_i = N shapes_per_level: L x 2 [(h_l, w_l)]_L ''' labels = [] level_masks = [] c33_inds = [] c33_masks = [] c33_regs = [] L = len(self.strides) B = len(gt_instances) shapes_per_level = shapes_per_level.long() loc_per_level = (shapes_per_level[:, 0] * shapes_per_level[:, 1]).long() # L level_bases = [] s = 0 for l in range(L): level_bases.append(s) s = s + B * loc_per_level[l] level_bases = shapes_per_level.new_tensor(level_bases).long() # L strides_default = shapes_per_level.new_tensor(self.strides).float() # L K = 9 dx = shapes_per_level.new_tensor([-1, 0, 1, -1, 0, 1, -1, 0, 1]).long() dy = shapes_per_level.new_tensor([-1, -1, -1, 0, 0, 0, 1, 1, 1]).long() for im_i in range(B): targets_per_im = gt_instances[im_i] bboxes = targets_per_im.gt_boxes.tensor # n x 4 n = bboxes.shape[0] if n == 0: continue centers = ((bboxes[:, [0, 1]] + bboxes[:, [2, 3]]) / 2) # n x 2 centers = centers.view(n, 1, 2).expand(n, L, 2) strides = strides_default.view(1, L, 1).expand(n, L, 2) # centers_inds = (centers / strides).long() # n x L x 2 center_grids = centers_inds * strides + strides // 2# n x L x 2 l = center_grids[:, :, 0] - bboxes[:, 0].view(n, 1).expand(n, L) t = center_grids[:, :, 1] - bboxes[:, 1].view(n, 1).expand(n, L) r = bboxes[:, 2].view(n, 1).expand(n, L) - center_grids[:, :, 0] b = bboxes[:, 3].view(n, 1).expand(n, L) - center_grids[:, :, 1] # n x L reg = torch.stack([l, t, r, b], dim=2) # n x L x 4 reg = reg / strides_default.view(1, L, 1).expand(n, L, 4).float() Ws = shapes_per_level[:, 1].view(1, 
L).expand(n, L) Hs = shapes_per_level[:, 0].view(1, L).expand(n, L) expand_Ws = Ws.view(n, L, 1).expand(n, L, K) expand_Hs = Hs.view(n, L, 1).expand(n, L, K) label = targets_per_im.gt_classes.view(n).clone() mask = reg.min(dim=2)[0] >= 0 # n x L mask = mask & self.assign_fpn_level(bboxes) labels.append(label) # n level_masks.append(mask) # n x L Dy = dy.view(1, 1, K).expand(n, L, K) Dx = dx.view(1, 1, K).expand(n, L, K) c33_ind = level_bases.view(1, L, 1).expand(n, L, K) + \ im_i * loc_per_level.view(1, L, 1).expand(n, L, K) + \ (centers_inds[:, :, 1:2].expand(n, L, K) + Dy) * expand_Ws + \ (centers_inds[:, :, 0:1].expand(n, L, K) + Dx) # n x L x K c33_mask = \ ((centers_inds[:, :, 1:2].expand(n, L, K) + dy) < expand_Hs) & \ ((centers_inds[:, :, 1:2].expand(n, L, K) + dy) >= 0) & \ ((centers_inds[:, :, 0:1].expand(n, L, K) + dx) < expand_Ws) & \ ((centers_inds[:, :, 0:1].expand(n, L, K) + dx) >= 0) # TODO (Xingyi): think about better way to implement this # Currently it hard codes the 3x3 region c33_reg = reg.view(n, L, 1, 4).expand(n, L, K, 4).clone() c33_reg[:, :, [0, 3, 6], 0] -= 1 c33_reg[:, :, [0, 3, 6], 2] += 1 c33_reg[:, :, [2, 5, 8], 0] += 1 c33_reg[:, :, [2, 5, 8], 2] -= 1 c33_reg[:, :, [0, 1, 2], 1] -= 1 c33_reg[:, :, [0, 1, 2], 3] += 1 c33_reg[:, :, [6, 7, 8], 1] += 1 c33_reg[:, :, [6, 7, 8], 3] -= 1 c33_mask = c33_mask & (c33_reg.min(dim=3)[0] >= 0) # n x L x K c33_inds.append(c33_ind) c33_masks.append(c33_mask) c33_regs.append(c33_reg) if len(level_masks) > 0: labels = torch.cat(labels, dim=0) level_masks = torch.cat(level_masks, dim=0) c33_inds = torch.cat(c33_inds, dim=0).long() c33_regs = torch.cat(c33_regs, dim=0) c33_masks = torch.cat(c33_masks, dim=0) else: labels = shapes_per_level.new_zeros((0)).long() level_masks = shapes_per_level.new_zeros((0, L)).bool() c33_inds = shapes_per_level.new_zeros((0, L, K)).long() c33_regs = shapes_per_level.new_zeros((0, L, K, 4)).float() c33_masks = shapes_per_level.new_zeros((0, L, K)).bool() return labels, 
class Scale(nn.Module):
    """Multiply the input by a single learnable scalar."""

    def __init__(self, init_value=1.0):
        """Create the one-element float32 parameter, set to `init_value`."""
        super().__init__()
        initial = torch.tensor([init_value], dtype=torch.float32)
        self.scale = nn.Parameter(initial)

    def forward(self, input):
        """Return `input` scaled by the learnable factor."""
        scaled = input * self.scale
        return scaled
# in_channels = in_channels[0] channels = { 'cls': in_channels, 'bbox': in_channels, 'share': in_channels, } for head in head_configs: tower = [] num_convs, use_deformable = head_configs[head] channel = channels[head] for i in range(num_convs): if use_deformable and i == num_convs - 1: conv_func = DFConv2d else: conv_func = nn.Conv2d tower.append(conv_func( in_channels if i == 0 else channel, channel, kernel_size=3, stride=1, padding=1, bias=True )) if norm == 'GN' and channel % 32 != 0: tower.append(nn.GroupNorm(25, channel)) elif norm != '': tower.append(get_norm(norm, channel)) tower.append(nn.ReLU()) self.add_module('{}_tower'.format(head), nn.Sequential(*tower)) self.bbox_pred = nn.Conv2d( in_channels, 4, kernel_size=self.out_kernel, stride=1, padding=self.out_kernel // 2 ) self.scales = nn.ModuleList( [Scale(init_value=1.0) for _ in range(num_levels)]) for modules in [ self.cls_tower, self.bbox_tower, self.share_tower, self.bbox_pred, ]: for l in modules.modules(): if isinstance(l, nn.Conv2d): torch.nn.init.normal_(l.weight, std=0.01) torch.nn.init.constant_(l.bias, 0) torch.nn.init.constant_(self.bbox_pred.bias, 8.) 
@classmethod
def from_config(cls, cfg, input_shape):
    '''
    Collect the constructor keyword arguments from a config node.

    `in_channels` is the channel count of the first entry of `input_shape`
    and `num_levels` is the number of entries; everything else is read from
    `cfg.MODEL.CENTERNET`.
    '''
    head_cfg = cfg.MODEL.CENTERNET
    channels_per_level = [shape.channels for shape in input_shape]
    return {
        # 'input_shape': input_shape,
        'in_channels': channels_per_level[0],
        'num_levels': len(input_shape),
        'num_classes': head_cfg.NUM_CLASSES,
        'with_agn_hm': head_cfg.WITH_AGN_HM,
        'only_proposal': head_cfg.ONLY_PROPOSAL,
        'norm': head_cfg.NORM,
        'num_cls_convs': head_cfg.NUM_CLS_CONVS,
        'num_box_convs': head_cfg.NUM_BOX_CONVS,
        'num_share_convs': head_cfg.NUM_SHARE_CONVS,
        'use_deformable': head_cfg.USE_DEFORMABLE,
        'prior_prob': head_cfg.PRIOR_PROB,
    }


def forward(self, x):
    '''
    Run the head on every feature level.

    Inputs:
        x: list of per-level feature maps
    Return:
        clss: per-level class logits, or None entries in proposal-only mode
        bbox_reg: per-level box regression, scaled per level and passed
            through ReLU (so it is non-negative)
        agn_hms: per-level agnostic heatmap logits, or None entries when
            `self.with_agn_hm` is false
    '''
    clss, bbox_reg, agn_hms = [], [], []
    for level, feature in enumerate(x):
        shared = self.share_tower(feature)
        cls_feat = self.cls_tower(shared)
        box_feat = self.bbox_tower(shared)

        clss.append(
            None if self.only_proposal else self.cls_logits(cls_feat))
        agn_hms.append(
            self.agn_hm(box_feat) if self.with_agn_hm else None)

        scaled_reg = self.scales[level](self.bbox_pred(box_feat))
        bbox_reg.append(F.relu(scaled_reg))
    return clss, bbox_reg, agn_hms
class _NewEmptyTensorOp(torch.autograd.Function):
    """Autograd function that returns an uninitialised tensor of a given shape.

    ``DFConv2d`` uses it for zero-element inputs, where the convolution itself
    is skipped but the result still has to go through ``autograd``.
    """

    @staticmethod
    def forward(ctx, x, new_shape):
        # Remember the input shape so backward can produce a matching gradient.
        ctx.shape = x.shape
        return x.new_empty(new_shape)

    @staticmethod
    def backward(ctx, grad):
        shape = ctx.shape
        # One gradient slot per forward input; ``new_shape`` gets none.
        return _NewEmptyTensorOp.apply(grad, shape), None


class DFConv2d(nn.Module):
    """Deformable convolutional layer.

    A regular convolution (``self.offset``) predicts the sampling offsets
    (and, for the modulated variant, the modulation mask) that are fed to the
    deformable convolution (``self.conv``).
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        with_modulated_dcn=True,
        kernel_size=3,
        stride=1,
        groups=1,
        dilation=1,
        deformable_groups=1,
        bias=False,
        padding=None
    ):
        """Build the offset predictor and the deformable convolution.

        Args:
            in_channels: number of input channels.
            out_channels: number of output channels.
            with_modulated_dcn: use ``ModulatedDeformConv`` (offsets + mask)
                instead of ``DeformConv`` (offsets only).
            kernel_size, stride, dilation: either all ints, or (when
                ``kernel_size`` is a list/tuple) all 2-element sequences.
            groups: groups of the deformable convolution.
            deformable_groups: number of offset groups.
            bias: whether the deformable convolution has a bias.
            padding: accepted for interface compatibility only; the value is
                always recomputed from ``dilation`` and ``kernel_size``.
        """
        super().__init__()
        if isinstance(kernel_size, (list, tuple)):
            assert isinstance(stride, (list, tuple))
            assert isinstance(dilation, (list, tuple))
            assert len(kernel_size) == 2
            assert len(stride) == 2
            assert len(dilation) == 2
            padding = (
                dilation[0] * (kernel_size[0] - 1) // 2,
                dilation[1] * (kernel_size[1] - 1) // 2
            )
            offset_base_channels = kernel_size[0] * kernel_size[1]
        else:
            padding = dilation * (kernel_size - 1) // 2
            offset_base_channels = kernel_size * kernel_size
        if with_modulated_dcn:
            from detectron2.layers.deform_conv import ModulatedDeformConv
            offset_channels = offset_base_channels * 3  # default: 27
            conv_block = ModulatedDeformConv
        else:
            from detectron2.layers.deform_conv import DeformConv
            offset_channels = offset_base_channels * 2  # default: 18
            conv_block = DeformConv
        self.offset = Conv2d(
            in_channels,
            deformable_groups * offset_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=1,
            dilation=dilation
        )
        # Zero-initialised, so the predicted offsets start at zero.
        nn.init.constant_(self.offset.weight, 0)
        nn.init.constant_(self.offset.bias, 0)
        self.conv = conv_block(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            deformable_groups=deformable_groups,
            bias=bias
        )
        self.with_modulated_dcn = with_modulated_dcn
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        # Channels [0, offset_split) of the predictor output are offsets;
        # the remainder (modulated variant only) is the mask.
        self.offset_split = offset_base_channels * deformable_groups * 2

    @staticmethod
    def _pair(value):
        """Return ``value`` as a 2-tuple, duplicating it when it is a scalar."""
        if isinstance(value, (list, tuple)):
            return tuple(value)
        return (value, value)

    def forward(self, x, return_offset=False):
        """Apply the deformable convolution.

        Args:
            x: input tensor; its last two dimensions are treated as spatial.
            return_offset: also return the raw output of the offset predictor.
                Ignored when ``x`` has no elements.

        Returns:
            The convolved tensor, or ``(tensor, offset_mask)`` when
            ``return_offset`` is set and ``x`` is non-empty. For an empty
            ``x`` an uninitialised tensor of the expected output shape.
        """
        if x.numel() > 0:
            offset_mask = self.offset(x)
            if self.with_modulated_dcn:
                offset = offset_mask[:, :self.offset_split, :, :]
                mask = offset_mask[:, self.offset_split:, :, :].sigmoid()
                x = self.conv(x, offset, mask)
            else:
                x = self.conv(x, offset_mask)
            if return_offset:
                return x, offset_mask
            return x
        # Empty input: compute the output shape by hand. The hyper-parameters
        # are stored exactly as passed in (ints by default), so they must be
        # expanded to (h, w) pairs before being zipped with the spatial size.
        output_shape = [
            (i + 2 * p - (di * (k - 1) + 1)) // d + 1
            for i, p, di, k, d in zip(
                x.shape[-2:],
                self._pair(self.padding),
                self._pair(self.dilation),
                self._pair(self.kernel_size),
                self._pair(self.stride)
            )
        ]
        output_shape = [x.shape[0], self.conv.weight.shape[0]] + output_shape
        return _NewEmptyTensorOp.apply(x, output_shape)
def binary_heatmap_focal_loss(
    inputs,
    targets,
    pos_inds,
    alpha: float = -1,
    beta: float = 4,
    gamma: float = 2,
    sigmoid_clamp: float = 1e-4,
    ignore_high_fp: float = -1.,
):
    """Penalty-reduced focal loss on a single-channel (class-agnostic) heatmap.

    Args:
        inputs: (sum_l N*Hl*Wl,) logits. NOTE: converted to probabilities
            in place via ``sigmoid_()``.
        targets: (sum_l N*Hl*Wl,) target heatmap values.
        pos_inds: N, index tensor of the positive locations in ``inputs``.
            Indices past the end are clamped, in place, to the last valid
            index and a warning is emitted.
        alpha: when >= 0, weights the positive term by ``alpha`` and the
            negative term by ``1 - alpha``.
        beta: exponent of the ``(1 - targets)`` down-weighting of negatives.
        gamma: focusing exponent.
        sigmoid_clamp: probabilities are clamped to
            ``[sigmoid_clamp, 1 - sigmoid_clamp]`` before taking logs.
        ignore_high_fp: when > 0, locations whose predicted probability is
            not below this value contribute nothing to the negative term.

    Returns:
        Tuple ``(pos_loss, neg_loss)`` of summed scalar losses.
    """
    pred = torch.clamp(inputs.sigmoid_(), min=sigmoid_clamp, max=1 - sigmoid_clamp)
    neg_weights = torch.pow(1 - targets, beta)

    # Clamp out-of-range indices with one vectorised comparison instead of a
    # Python loop over every index.
    last_index = pred.shape[0] - 1
    out_of_range = pos_inds > last_index
    if bool(out_of_range.any()):
        import warnings
        warnings.warn(
            'binary_heatmap_focal_loss: {} positive index(es) exceed the '
            'prediction length {}; clamping to {}'.format(
                int(out_of_range.sum()), pred.shape[0], last_index))
        pos_inds[out_of_range] = last_index

    pos_pred = pred[pos_inds]  # N
    pos_loss = torch.log(pos_pred) * torch.pow(1 - pos_pred, gamma)
    neg_loss = torch.log(1 - pred) * torch.pow(pred, gamma) * neg_weights
    if ignore_high_fp > 0:
        not_high_fp = (pred < ignore_high_fp).float()
        neg_loss = not_high_fp * neg_loss

    pos_loss = - pos_loss.sum()
    neg_loss = - neg_loss.sum()

    if alpha >= 0:
        pos_loss = alpha * pos_loss
        neg_loss = (1 - alpha) * neg_loss

    return pos_loss, neg_loss
def giou_loss(
    boxes1: torch.Tensor,
    boxes2: torch.Tensor,
    reduction: str = "none",
    eps: float = 1e-7,
) -> torch.Tensor:
    """
    Generalized IoU loss (Rezatofighi et al., https://arxiv.org/abs/1902.09630).

    On top of ``1 - IoU`` the loss adds the fraction of the smallest enclosing
    box that is not covered by the union, so it stays informative for boxes
    that do not overlap.

    Args:
        boxes1, boxes2 (Tensor): boxes in XYXY format, shape (N, 4) or (4,).
            Only ``boxes1`` is checked for ``x2 >= x1`` and ``y2 >= y1``.
        reduction: 'none' | 'mean' | 'sum'. Any other value behaves like
            'none'.
        eps (float): small number added to denominators.

    Returns:
        Tensor: per-box loss, or its mean / sum.
    """
    left_a, top_a, right_a, bottom_a = boxes1.unbind(dim=-1)
    left_b, top_b, right_b, bottom_b = boxes2.unbind(dim=-1)

    assert (right_a >= left_a).all(), "bad box: x1 larger than x2"
    assert (bottom_a >= top_a).all(), "bad box: y1 larger than y2"

    # Corners of the overlap rectangle (degenerate when the boxes are disjoint).
    inter_left = torch.max(left_a, left_b)
    inter_top = torch.max(top_a, top_b)
    inter_right = torch.min(right_a, right_b)
    inter_bottom = torch.min(bottom_a, bottom_b)

    # Only pairs with strictly positive overlap get a non-zero intersection.
    overlapping = (inter_bottom > inter_top) & (inter_right > inter_left)
    inter_area = torch.zeros_like(left_a)
    inter_area[overlapping] = (
        (inter_right[overlapping] - inter_left[overlapping])
        * (inter_bottom[overlapping] - inter_top[overlapping])
    )

    area_a = (right_a - left_a) * (bottom_a - top_a)
    area_b = (right_b - left_b) * (bottom_b - top_b)
    union_area = area_a + area_b - inter_area
    iou = inter_area / (union_area + eps)

    # Smallest axis-aligned box enclosing both inputs.
    hull_left = torch.min(left_a, left_b)
    hull_top = torch.min(top_a, top_b)
    hull_right = torch.max(right_a, right_b)
    hull_bottom = torch.max(bottom_a, bottom_b)
    hull_area = (hull_right - hull_left) * (hull_bottom - hull_top)

    giou = iou - ((hull_area - union_area) / (hull_area + eps))
    loss = 1 - giou

    if reduction == "mean":
        return loss.mean()
    if reduction == "sum":
        return loss.sum()
    return loss
@META_ARCH_REGISTRY.register()
class CenterNetDetector(nn.Module):
    """Single-stage detector: a backbone followed by a proposal generator.

    In training mode ``forward`` returns the proposal generator's losses; in
    evaluation mode it returns per-image predictions (see ``inference``).
    """

    def __init__(self, cfg):
        super().__init__()
        pixel_mean = cfg.MODEL.PIXEL_MEAN
        pixel_std = cfg.MODEL.PIXEL_STD
        self.mean, self.std = pixel_mean, pixel_std
        # Stored as (C, 1, 1) buffers so they broadcast over (C, H, W) images.
        self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1))
        self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1))

        self.backbone = build_backbone(cfg)
        self.proposal_generator = build_proposal_generator(
            cfg, self.backbone.output_shape()
        )  # TODO: change to a more precise name

    def forward(self, batched_inputs):
        """Return the training losses, or delegate to ``inference`` in eval mode.

        Args:
            batched_inputs: list of per-image dicts with an ``"image"`` entry
                and, for training, an ``"instances"`` entry.
        """
        if not self.training:
            return self.inference(batched_inputs)

        images = self.preprocess_image(batched_inputs)
        features = self.backbone(images.tensor)
        gt_instances = [
            sample["instances"].to(self.device) for sample in batched_inputs
        ]
        _, proposal_losses = self.proposal_generator(
            images, features, gt_instances)
        return proposal_losses

    @property
    def device(self):
        """Device of the module, taken from the ``pixel_mean`` buffer."""
        return self.pixel_mean.device

    @torch.no_grad()
    def inference(self, batched_inputs, do_postprocess=True):
        """Run detection on a batch.

        Args:
            batched_inputs: list of per-image dicts with an ``"image"`` entry;
                optional ``"height"`` / ``"width"`` entries give the size the
                results are post-processed to.
            do_postprocess: when true, each result is passed through
                ``detector_postprocess`` and wrapped as ``{"instances": r}``;
                otherwise the raw per-image results are returned.

        Returns:
            list with one entry per image.
        """
        images = self.preprocess_image(batched_inputs)
        features = self.backbone(images.tensor)
        proposals, _ = self.proposal_generator(images, features, None)

        outputs = []
        for result, sample, size in zip(
                proposals, batched_inputs, images.image_sizes):
            if not do_postprocess:
                outputs.append(result)
                continue
            # Fall back to the padded-batch image size when none was requested.
            height = sample.get("height", size[0])
            width = sample.get("width", size[1])
            outputs.append(
                {"instances": detector_postprocess(result, height, width)})
        return outputs

    def preprocess_image(self, batched_inputs):
        """
        Normalize, pad and batch the input images.
        """
        normalized = [
            (sample["image"].to(self.device) - self.pixel_mean) / self.pixel_std
            for sample in batched_inputs
        ]
        return ImageList.from_tensors(
            normalized, self.backbone.size_divisibility)
class CustomFastRCNNOutputLayers(FastRCNNOutputLayers):
    """Fast R-CNN output layers with customised losses and inference scoring.

    The config is kept on the instance because ``inference`` reads
    ``MODEL.ROI_BOX_HEAD.MULT_PROPOSAL_SCORE`` from it.
    """

    def __init__(
        self,
        cfg,
        input_shape: ShapeSpec,
        **kwargs
    ):
        super().__init__(cfg, input_shape, **kwargs)
        self.cfg = cfg

    def losses(self, predictions, proposals):
        """
        Compute the classification and box-regression losses.

        Args:
            predictions: ``(scores, proposal_deltas)`` pair.
            proposals: per-image proposal objects carrying ``gt_classes``,
                ``proposal_boxes`` and optionally ``gt_boxes``.

        Returns:
            dict with keys ``"loss_cls"`` and ``"loss_box_reg"``.
        """
        scores, proposal_deltas = predictions
        if len(proposals):
            gt_classes = cat([p.gt_classes for p in proposals], dim=0)
        else:
            gt_classes = torch.empty(0)
        _log_classification_stats(scores, gt_classes)

        if len(proposals):
            proposal_boxes = cat(
                [p.proposal_boxes.tensor for p in proposals], dim=0)  # Nx4
            assert not proposal_boxes.requires_grad, "Proposals should not require gradients!"
            gt_box_tensors = []
            for p in proposals:
                # Proposals without ground truth fall back to their own boxes.
                boxes = p.gt_boxes if p.has("gt_boxes") else p.proposal_boxes
                gt_box_tensors.append(boxes.tensor)
            gt_boxes = cat(gt_box_tensors, dim=0)
        else:
            proposal_boxes = gt_boxes = torch.empty(
                (0, 4), device=proposal_deltas.device)

        loss_cls = self.softmax_cross_entropy_loss(scores, gt_classes)
        loss_box_reg = self.box_reg_loss(
            proposal_boxes, gt_boxes, proposal_deltas, gt_classes)
        return {"loss_cls": loss_cls, "loss_box_reg": loss_box_reg}

    def sigmoid_cross_entropy_loss(self, pred_class_logits, gt_classes):
        """Per-class binary cross entropy over all but the last logit column.

        The last column of ``pred_class_logits`` is excluded from both the
        logits and the one-hot targets. The summed loss is divided by the
        number of rows.
        """
        if pred_class_logits.numel() == 0:
            return pred_class_logits.new_zeros([1])[0]  # This is more robust than .sum() * 0.

        num_rows = pred_class_logits.shape[0]
        num_cols = pred_class_logits.shape[1]
        one_hot = pred_class_logits.new_zeros(num_rows, num_cols)
        one_hot[range(len(gt_classes)), gt_classes] = 1  # B x (C + 1)
        fg_targets = one_hot[:, :num_cols - 1]  # B x C

        per_entry = F.binary_cross_entropy_with_logits(
            pred_class_logits[:, :-1], fg_targets, reduction='none')  # B x C
        return per_entry.sum() / num_rows

    def softmax_cross_entropy_loss(self, pred_class_logits, gt_classes):
        """
        Mean softmax cross entropy; returns a zero scalar for empty logits.
        """
        if pred_class_logits.numel() == 0:
            return pred_class_logits.new_zeros([1])[0]

        return F.cross_entropy(
            pred_class_logits, gt_classes, reduction="mean")

    def inference(self, predictions, proposals):
        """
        Turn raw predictions into final per-image detections.

        When ``MODEL.ROI_BOX_HEAD.MULT_PROPOSAL_SCORE`` is set, each class
        score is replaced by the square root of its product with the
        proposal's ``objectness_logits`` value.
        """
        boxes = self.predict_boxes(predictions, proposals)
        scores = self.predict_probs(predictions, proposals)
        if self.cfg.MODEL.ROI_BOX_HEAD.MULT_PROPOSAL_SCORE:
            scores = [
                (s * p.get('objectness_logits')[:, None]) ** 0.5
                for s, p in zip(scores, proposals)
            ]
        image_shapes = [p.image_size for p in proposals]
        return fast_rcnn_inference(
            boxes,
            scores,
            image_shapes,
            self.test_score_thresh,
            self.test_nms_thresh,
            self.test_topk_per_image,
        )

    def predict_probs(self, predictions, proposals):
        """
        Softmax class probabilities, split into one tensor per image.
        """
        scores, _ = predictions
        per_image_counts = [len(p) for p in proposals]
        return F.softmax(scores, dim=-1).split(per_image_counts, dim=0)
@ROI_HEADS_REGISTRY.register()
class CustomROIHeads(StandardROIHeads):
    """Standard ROI heads using ``CustomFastRCNNOutputLayers`` plus debug output."""

    @classmethod
    def _init_box_head(cls, cfg, input_shape):
        """Build the box-head kwargs with the custom predictor swapped in.

        Being a classmethod, this stores the debug settings on the class
        itself, not on an instance.
        """
        ret = super()._init_box_head(cfg, input_shape)
        # Drop and re-add so the predictor entry is replaced by the custom one.
        ret.pop('box_predictor')
        ret['box_predictor'] = CustomFastRCNNOutputLayers(
            cfg, ret['box_head'].output_shape)

        cls.debug = cfg.DEBUG
        if cls.debug:
            cls.debug_show_name = cfg.DEBUG_SHOW_NAME
            cls.save_debug = cfg.SAVE_DEBUG
            cls.vis_thresh = cfg.VIS_THRESH
            device = torch.device(cfg.MODEL.DEVICE)
            cls.pixel_mean = torch.Tensor(
                cfg.MODEL.PIXEL_MEAN).to(device).view(3, 1, 1)
            cls.pixel_std = torch.Tensor(
                cfg.MODEL.PIXEL_STD).to(device).view(3, 1, 1)
        return ret

    def forward(self, images, features, proposals, targets=None):
        """
        Run the ROI heads; optionally visualise predictions in debug mode.

        Returns:
            ``(proposals, losses)`` in training mode, ``(pred_instances, {})``
            otherwise.
        """
        if not self.debug:
            # Images are only needed for the debug visualisation.
            del images

        if self.training:
            assert targets
            proposals = self.label_and_sample_proposals(proposals, targets)
            del targets
            losses = self._forward_box(features, proposals)
            losses.update(self._forward_mask(features, proposals))
            losses.update(self._forward_keypoint(features, proposals))
            return proposals, losses

        del targets
        pred_instances = self._forward_box(features, proposals)
        pred_instances = self.forward_with_given_boxes(features, pred_instances)
        if self.debug:
            from ..debug import debug_second_stage
            # Undo the input normalisation on the first image before drawing.
            restored = images[0].clone() * self.pixel_std + self.pixel_mean
            debug_second_stage(
                [restored],
                pred_instances, proposals=proposals,
                debug_show_name=self.debug_show_name)
        return pred_instances, {}
prev_pred_boxes = None image_sizes = [x.image_size for x in proposals] for k in range(self.num_cascade_stages): if k > 0: proposals = self._create_proposals_from_boxes(prev_pred_boxes, image_sizes) if self.training: proposals = self._match_and_label_boxes(proposals, k, targets) predictions = self._run_stage(features, proposals, k) prev_pred_boxes = self.box_predictor[k].predict_boxes(predictions, proposals) head_outputs.append((self.box_predictor[k], predictions, proposals)) if self.training: losses = {} storage = get_event_storage() for stage, (predictor, predictions, proposals) in enumerate(head_outputs): with storage.name_scope("stage{}".format(stage)): stage_losses = predictor.losses(predictions, proposals) losses.update({k + "_stage{}".format(stage): v for k, v in stage_losses.items()}) return losses else: # Each is a list[Tensor] of length #image. Each tensor is Ri x (K+1) scores_per_stage = [h[0].predict_probs(h[1], h[2]) for h in head_outputs] scores = [ sum(list(scores_per_image)) * (1.0 / self.num_cascade_stages) for scores_per_image in zip(*scores_per_stage) ] if self.mult_proposal_score: scores = [(s * ps[:, None]) ** 0.5 \ for s, ps in zip(scores, proposal_scores)] predictor, predictions, proposals = head_outputs[-1] boxes = predictor.predict_boxes(predictions, proposals) pred_instances, _ = fast_rcnn_inference( boxes, scores, image_sizes, predictor.test_score_thresh, predictor.test_nms_thresh, predictor.test_topk_per_image, ) return pred_instances def forward(self, images, features, proposals, targets=None): ''' enable debug ''' if not self.debug: del images if self.training: proposals = self.label_and_sample_proposals(proposals, targets) if self.training: losses = self._forward_box(features, proposals, targets) losses.update(self._forward_mask(features, proposals)) losses.update(self._forward_keypoint(features, proposals)) return proposals, losses else: # import pdb; pdb.set_trace() pred_instances = self._forward_box(features, proposals) 
def load_class_freq(
    path='datasets/lvis/lvis_v1_train_cat_info.json', freq_weight=0.5):
    """Load per-category image counts and raise them to a power.

    Args:
        path: JSON file holding a list of category records, each with at
            least the keys ``'id'`` and ``'image_count'``.
        freq_weight: exponent applied to the image counts.

    Returns:
        Float tensor of ``image_count ** freq_weight``, ordered by category id.
    """
    # Context manager so the file handle is closed deterministically.
    with open(path, 'r') as cat_file:
        cat_info = json.load(cat_file)
    image_counts = torch.tensor(
        [c['image_count'] for c in sorted(cat_info, key=lambda x: x['id'])])
    return image_counts.float() ** freq_weight


def get_fed_loss_inds(
    gt_classes, num_sample_cats=50, C=1203,
    weight=None, fed_cls_inds=-1):
    """Pick the categories used by the federated loss.

    The result always starts with the unique classes in ``gt_classes``. When
    there are fewer than ``num_sample_cats`` of them, additional distinct
    categories are drawn at random to fill up.

    Args:
        gt_classes: integer tensor of ground-truth class ids.
        num_sample_cats: minimum number of categories to return.
        C: number of foreground categories; index ``C`` is never sampled.
        weight: optional sampling weights for categories ``0 .. C-1``
            (uniform when ``None``).
        fed_cls_inds: when > 0, categories with index >= this value are
            never sampled.

    Returns:
        1-D tensor: the unique ground-truth classes followed by any sampled
        extras.
    """
    appeared = torch.unique(gt_classes)  # C'
    num_missing = num_sample_cats - len(appeared)
    if num_missing <= 0:
        return appeared

    prob = appeared.new_ones(C + 1).float()
    prob[-1] = 0
    if weight is not None:
        prob[:C] = weight.float().clone()
    # Classes already present must not be drawn again.
    prob[appeared] = 0
    if fed_cls_inds > 0:
        prob[fed_cls_inds:] = 0
    more_appeared = torch.multinomial(
        prob, num_missing, replacement=False)
    return torch.cat([appeared, more_appeared])
class BertEmbeddings_nopos(nn.Module):
    """Construct the embeddings from word embeddings only.

    No position embedding is added: the output is the word embedding (or the
    caller-supplied ``inputs_embeds``) passed through LayerNorm and dropout.
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(
            config.vocab_size,
            config.hidden_size,
            padding_idx=config.pad_token_id,
        )

        # self.LayerNorm is not snake-cased to stick with TensorFlow model
        # variable name and be able to load any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.config = config

    def forward(
        self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        """Embed the tokens, then normalise and apply dropout.

        Args:
            input_ids: token ids; looked up when ``inputs_embeds`` is ``None``.
            position_ids: accepted but not used.
            inputs_embeds: precomputed embeddings, used as-is when given.
            past_key_values_length: accepted but not used.

        Returns:
            Tensor with the same shape as the (looked-up) embeddings.
        """
        if input_ids is None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            input_shape = input_ids.size()

        # Not used below; the lookup is kept so inputs without a second
        # dimension still fail here.
        seq_length = input_shape[1]

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        normalized = self.LayerNorm(inputs_embeds)
        return self.dropout(normalized)
class BertSelfAttention(nn.Module):
    """Multi-head attention, usable as self-attention or cross-attention.

    When built with ``is_cross_attention=True`` the key/value projections
    read inputs of width ``config.encoder_width`` instead of
    ``config.hidden_size``.
    """

    def __init__(self, config, is_cross_attention):
        super().__init__()
        self.config = config
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        kv_width = config.encoder_width if is_cross_attention else config.hidden_size
        self.key = nn.Linear(kv_width, self.all_head_size)
        self.value = nn.Linear(kv_width, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        if self.position_embedding_type in ("relative_key", "relative_key_query"):
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
        # When true, cross-attention maps and their gradients are recorded.
        self.save_attention = False

    def save_attn_gradients(self, attn_gradients):
        """Store the gradient of the attention map (used as a tensor hook)."""
        self.attn_gradients = attn_gradients

    def get_attn_gradients(self):
        """Return the gradient stored by ``save_attn_gradients``."""
        return self.attn_gradients

    def save_attention_map(self, attention_map):
        """Store the most recent attention map."""
        self.attention_map = attention_map

    def get_attention_map(self):
        """Return the map stored by ``save_attention_map``."""
        return self.attention_map

    def transpose_for_scores(self, x):
        """Reshape ``(B, L, all_head_size)`` to ``(B, heads, L, head_size)``."""
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        """Compute attention.

        Args:
            hidden_states: query source, ``(B, L, hidden_size)``.
            attention_mask: additive mask applied to the raw scores.
            head_mask: optional multiplicative mask on the attention
                probabilities.
            encoder_hidden_states: when given, keys and values come from it
                (cross-attention) and ``encoder_attention_mask`` replaces
                ``attention_mask``.
            encoder_attention_mask: additive mask for cross-attention.
            past_key_value: cached ``(key, value)`` pair that is prepended
                along the sequence axis (self-attention only).
            output_attentions: also return the attention probabilities.

        Returns:
            ``(context,)`` or ``(context, attention_probs)``, followed by the
            ``(key, value)`` pair used.
        """
        mixed_query_layer = self.query(hidden_states)

        # If this is instantiated as a cross-attention module, the keys
        # and values come from an encoder; the attention mask needs to be
        # such that the encoder's padding tokens are not attended to.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        elif past_key_value is not None:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))

        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Keys/values may carry a larger batch than the queries; keep only
        # the leading entries. The mask is optional, so guard before slicing.
        query_batch = query_layer.shape[0]
        if key_layer.shape[0] > query_batch:
            key_layer = key_layer[:query_batch]
            value_layer = value_layer[:query_batch]
            if attention_mask is not None:
                attention_mask = attention_mask[:query_batch]

        past_key_value = (key_layer, value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            seq_length = hidden_states.size()[1]
            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # The mask is additive on the raw scores.
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        if is_cross_attention and self.save_attention:
            self.save_attention_map(attention_probs)
            attention_probs.register_hook(self.save_attn_gradients)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs_dropped = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs_dropped = attention_probs_dropped * head_mask

        context_layer = torch.matmul(attention_probs_dropped, value_layer)

        # (B, heads, L, head_size) -> (B, L, all_head_size)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        outputs = outputs + (past_key_value,)
        return outputs


class BertSelfOutput(nn.Module):
    """Output projection of the attention block: dense, dropout, residual LayerNorm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Project ``hidden_states`` and normalise its sum with ``input_tensor``."""
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
class BertIntermediate(nn.Module):
    """Feed-forward expansion: linear projection followed by an activation."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # A string selects an activation from ACT2FN; anything else is used as the callable.
        self.intermediate_act_fn = (
            ACT2FN[config.hidden_act] if isinstance(config.hidden_act, str) else config.hidden_act
        )

    def forward(self, hidden_states):
        """Return ``act(dense(hidden_states))``."""
        return self.intermediate_act_fn(self.dense(hidden_states))


class BertOutput(nn.Module):
    """Feed-forward contraction with dropout and a residual LayerNorm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Project ``hidden_states`` back and normalize its sum with ``input_tensor``."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
encoder_hidden_states=None, encoder_attention_mask=None, past_key_value=None, output_attentions=False, mode=None, ): if mode == 'mlr': assert encoder_hidden_states is not None, "encoder_hidden_states must be given for cross-attention layers" # print('attention_output.shape',attention_output.shape) # print('encoder_hidden_states.shape',encoder_hidden_states.shape) cross_attention_outputs = self.crossattention( hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, output_attentions=output_attentions, ) attention_output = cross_attention_outputs[0] outputs = cross_attention_outputs[1:-1] # add cross attentions if we output attention weights present_key_value = cross_attention_outputs[-1] else: # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None self_attention_outputs = self.attention( hidden_states, attention_mask, head_mask, output_attentions=output_attentions, past_key_value=self_attn_past_key_value, ) attention_output = self_attention_outputs[0] outputs = self_attention_outputs[1:-1] present_key_value = self_attention_outputs[-1] if mode=='multimodal': assert encoder_hidden_states is not None, "encoder_hidden_states must be given for cross-attention layers" cross_attention_outputs = self.crossattention( attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, output_attentions=output_attentions, ) attention_output = cross_attention_outputs[0] outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights layer_output = apply_chunking_to_forward( self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output ) outputs = (layer_output,) + outputs outputs = outputs + (present_key_value,) return outputs def feed_forward_chunk(self, attention_output): intermediate_output = self.intermediate(attention_output) 
class BertEncoder(nn.Module):
    """Stack of ``config.num_hidden_layers`` :class:`BertLayer` modules."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([BertLayer(config, i) for i in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        mode='multimodal',
    ):
        """Run ``hidden_states`` through every layer in order.

        Args:
            hidden_states: Input to the first layer.
            attention_mask: Passed unchanged to every layer.
            head_mask: Indexable per layer (``head_mask[i]``) or ``None``.
            encoder_hidden_states: Passed unchanged to every layer.
            encoder_attention_mask: Passed unchanged to every layer.
            past_key_values: Indexable per layer or ``None``.
            use_cache: Collect the last element of each layer's output.
                Forced to ``False`` while gradient checkpointing is active.
            output_attentions: Collect ``layer_outputs[1]`` of every layer.
            output_hidden_states: Collect the input of every layer plus the
                final output.
            return_dict: Return a ``BaseModelOutputWithPastAndCrossAttentions``
                instead of a tuple of the non-``None`` results.
            mode: Forwarded to every layer as the ``mode`` keyword.
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        # NOTE(review): this tuple is created but never filled below, so
        # ``cross_attentions`` is always empty — confirm whether that is intended.
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        next_decoder_cache = () if use_cache else None

        for i in range(self.config.num_hidden_layers):
            layer_module = self.layer[i]
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:

                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                    )
                    use_cache = False

                def create_custom_forward(module, layer_past, with_attentions, layer_mode):
                    # Bind the per-layer values now: the checkpoint may re-run this
                    # function during backward, after the loop variables have moved on.
                    def custom_forward(*inputs):
                        return module(*inputs, layer_past, with_attentions, mode=layer_mode)

                    return custom_forward

                # ``mode`` travels through the closure; it must not be handed to
                # ``checkpoint`` as a keyword because ``custom_forward`` only
                # accepts positional inputs.
                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer_module, past_key_value, output_attentions, mode),
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                    mode=mode,
                )

            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
class BertPredictionHeadTransform(nn.Module):
    """Dense layer, activation and LayerNorm applied before the vocabulary decoder."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # A string selects an activation from ACT2FN; anything else is used as the callable.
        self.transform_act_fn = (
            ACT2FN[config.hidden_act] if isinstance(config.hidden_act, str) else config.hidden_act
        )
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        """Return ``LayerNorm(act(dense(hidden_states)))``."""
        activated = self.transform_act_fn(self.dense(hidden_states))
        return self.LayerNorm(activated)


class BertLMPredictionHead(nn.Module):
    """Transform followed by a projection onto ``config.vocab_size`` logits."""

    def __init__(self, config):
        super().__init__()
        self.transform = BertPredictionHeadTransform(config)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Need a link between the two variables so that the bias is correctly
        # resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        """Return vocabulary logits for ``hidden_states``."""
        return self.decoder(self.transform(hidden_states))


class BertOnlyMLMHead(nn.Module):
    """Thin wrapper exposing the LM prediction head as ``predictions``."""

    def __init__(self, config):
        super().__init__()
        self.predictions = BertLMPredictionHead(config)

    def forward(self, sequence_output):
        """Return prediction scores for ``sequence_output``."""
        return self.predictions(sequence_output)
class BertModel(BertPreTrainedModel):
    """
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in
    "Attention is all you need" by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
    Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    When the configuration has :obj:`add_cross_attention` set to :obj:`True`, an :obj:`encoder_hidden_states`
    is expected as an input to the forward pass.
    """

    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        self.embeddings = BertEmbeddings(config)

        self.encoder = BertEncoder(config)

        self.pooler = BertPooler(config) if add_pooling_layer else None

        self.init_weights()

    def get_input_embeddings(self):
        """Return the word-embedding module."""
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        """Replace the word-embedding module with ``value``."""
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple[int], device: device, is_decoder: bool) -> Tensor:
        """
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (:obj:`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (:obj:`Tuple[int]`):
                The shape of the input to the model.
            device: (:obj:`torch.device`):
                The device of the input to the model.
            is_decoder (:obj:`bool`):
                If true, a causal mask is combined with a 2D padding mask.

        Returns:
            :obj:`torch.Tensor` The extended attention mask, cast to :obj:`self.dtype`, holding 0.0 at attended
            positions and -10000.0 at masked positions.

        Raises:
            ValueError: If :obj:`attention_mask` is neither 2D nor 3D.
        """
        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        if attention_mask.dim() == 3:
            extended_attention_mask = attention_mask[:, None, :, :]
        elif attention_mask.dim() == 2:
            # Provided a padding mask of dimensions [batch_size, seq_length]
            # - if the model is a decoder, apply a causal mask in addition to the padding mask
            # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
            if is_decoder:
                batch_size, seq_length = input_shape

                seq_ids = torch.arange(seq_length, device=device)
                causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
                # in case past_key_values are used we need to add a prefix ones mask to the causal mask
                # causal and attention masks must have same type with pytorch version < 1.3
                causal_mask = causal_mask.to(attention_mask.dtype)

                if causal_mask.shape[1] < attention_mask.shape[1]:
                    prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
                    causal_mask = torch.cat(
                        [
                            torch.ones((batch_size, seq_length, prefix_seq_len), device=device, dtype=causal_mask.dtype),
                            causal_mask,
                        ],
                        dim=-1,
                    )

                extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
            else:
                extended_attention_mask = attention_mask[:, None, None, :]
        else:
            raise ValueError(
                "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
                    input_shape, attention_mask.shape
                )
            )

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        return extended_attention_mask

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        is_decoder=False,
        mode='multimodal',
    ):
        r"""
        encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`). Ignored (forced to :obj:`False`) unless :obj:`is_decoder` is true.
        encoder_embeds (`optional`):
            If given (and neither :obj:`input_ids` nor :obj:`inputs_embeds` is), it is fed to the encoder directly,
            bypassing the embedding layer.
        mode (:obj:`str`):
            Forwarded unchanged to the encoder.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            batch_size, seq_length = input_shape
            device = input_ids.device
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size, seq_length = input_shape
            device = inputs_embeds.device
        elif encoder_embeds is not None:
            input_shape = encoder_embeds.size()[:-1]
            batch_size, seq_length = input_shape
            device = encoder_embeds.device
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds or encoder_embeds")

        # past_key_values_length
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        if attention_mask is None:
            attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape,
                                                                                 device, is_decoder)

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if encoder_hidden_states is not None:
            if isinstance(encoder_hidden_states, list):
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
            else:
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)

            if isinstance(encoder_attention_mask, list):
                encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
            elif encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
            else:
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        if encoder_embeds is None:
            embedding_output = self.embeddings(
                input_ids=input_ids,
                position_ids=position_ids,
                inputs_embeds=inputs_embeds,
                past_key_values_length=past_key_values_length,
            )
        else:
            embedding_output = encoder_embeds

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            mode=mode,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. use_cache (:obj:`bool`, `optional`): If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up decoding (see :obj:`past_key_values`). 
Returns: Example:: >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig >>> import torch >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') >>> config = BertConfig.from_pretrained("bert-base-cased") >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) >>> prediction_logits = outputs.logits """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False outputs = self.bert( input_ids, attention_mask=attention_mask, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, past_key_values=past_key_values, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, is_decoder=is_decoder, mode=mode, ) sequence_output = outputs[0] prediction_scores = self.cls(sequence_output) # sequence_output.shape torch.Size([85, 30, 768]) # prediction_scores.shape torch.Size([85, 30, 30524]) # labels.shape torch.Size([85, 30]) if return_logits: return prediction_scores[:, :-1, :].contiguous() lm_loss = None if labels is not None: # we are doing next-token prediction; shift prediction scores and input ids by one shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() labels = labels[:, 1:].contiguous() loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1) lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) if reduction=='none': lm_loss = lm_loss.view(prediction_scores.size(0),-1).sum(1) if not return_dict: output = (prediction_scores,) + outputs[2:] return ((lm_loss,) + output) if lm_loss is not None else output return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, 
past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, cross_attentions=outputs.cross_attentions, ) def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs): input_shape = input_ids.shape # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly if attention_mask is None: attention_mask = input_ids.new_ones(input_shape) # cut decoder_input_ids if past is used if past is not None: input_ids = input_ids[:, -1:] return { "input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past, "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None), "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None), "is_decoder": True, } def _reorder_cache(self, past, beam_idx): reordered_past = () for layer_past in past: reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) return reordered_past ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/tag2Text/swin_transformer.py ================================================ # -------------------------------------------------------- # Swin Transformer # Copyright (c) 2021 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ze Liu # -------------------------------------------------------- import numpy as np from scipy import interpolate import torch import torch.nn as nn import torch.utils.checkpoint as checkpoint from timm.models.layers import DropPath, to_2tuple, trunc_normal_ class Mlp(nn.Module): def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) self.act = act_layer() self.fc2 = nn.Linear(hidden_features, out_features) 
def window_partition(x, window_size):
    """
    Split a feature map into non-overlapping square windows.

    Args:
        x: (B, H, W, C)
        window_size (int): window size

    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    B, H, W, C = x.shape
    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    return windows


def window_reverse(windows, window_size, H, W):
    """
    Reassemble windows produced by :func:`window_partition` into a feature map.

    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image

    Returns:
        x: (B, H, W, C)
    """
    # Exact integer arithmetic; the window count per image is (H/ws) * (W/ws).
    windows_per_image = (H // window_size) * (W // window_size)
    B = windows.shape[0] // windows_per_image
    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x


class WindowAttention(nn.Module):
    r""" Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):

        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        # define a parameter table of relative position bias
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))  # 2*Wh-1 * 2*Ww-1, nH

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        # "ij" is the historical default; passing it explicitly avoids the
        # deprecation warning on current torch without changing the result.
        coords = torch.stack(torch.meshgrid([coords_h, coords_w], indexing="ij"))  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        self.register_buffer("relative_position_index", relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table, std=.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        """
        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        """
        B_, N, C = x.shape
        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            # Broadcast the per-window mask over the batch and head axes.
            nW = mask.shape[0]
            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)
            attn = self.softmax(attn)
        else:
            attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

    def extra_repr(self) -> str:
        return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}'

    def flops(self, N):
        """Return the multiply count for one window holding ``N`` tokens."""
        # calculate flops for 1 window with token length of N
        flops = 0
        # qkv = self.qkv(x)
        flops += N * self.dim * 3 * self.dim
        # attn = (q @ k.transpose(-2, -1))
        flops += self.num_heads * N * (self.dim // self.num_heads) * N
        #  x = (attn @ v)
        flops += self.num_heads * N * N * (self.dim // self.num_heads)
        # x = self.proj(x)
        flops += N * self.dim * self.dim
        return flops
Default: nn.GELU norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm """ def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): super().__init__() self.dim = dim self.input_resolution = input_resolution self.num_heads = num_heads self.window_size = window_size self.shift_size = shift_size self.mlp_ratio = mlp_ratio if min(self.input_resolution) <= self.window_size: # if window size is larger than input resolution, we don't partition windows self.shift_size = 0 self.window_size = min(self.input_resolution) assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" self.norm1 = norm_layer(dim) self.attn = WindowAttention( dim, window_size=to_2tuple(self.window_size), num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) if self.shift_size > 0: # calculate attention mask for SW-MSA H, W = self.input_resolution img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1 h_slices = (slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None)) w_slices = (slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None)) cnt = 0 for h in h_slices: for w in w_slices: img_mask[:, h, w, :] = cnt cnt += 1 mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1 mask_windows = mask_windows.view(-1, self.window_size * self.window_size) attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) else: attn_mask = None self.register_buffer("attn_mask", attn_mask) def forward(self, x): H, W = self.input_resolution B, L, C = x.shape assert L == H * W, "input feature has wrong size" shortcut = x x = self.norm1(x) x = x.view(B, H, W, C) # cyclic shift if self.shift_size > 0: shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) else: shifted_x = x # partition windows x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C # W-MSA/SW-MSA attn_windows = self.attn(x_windows, mask=self.attn_mask) # nW*B, window_size*window_size, C # merge windows attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C # reverse cyclic shift if self.shift_size > 0: x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) else: x = 
class PatchMerging(nn.Module):
    r""" Patch Merging Layer.

    Concatenates every 2x2 neighbourhood of tokens along the channel axis
    (C -> 4*C), normalizes, and projects to 2*C, halving each spatial side.

    Args:
        input_resolution (tuple[int]): Resolution of input feature.
        dim (int): Number of input channels.
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
    """

    def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
        super().__init__()
        self.input_resolution = input_resolution
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        self.norm = norm_layer(4 * dim)

    def forward(self, x):
        """
        x: B, H*W, C
        """
        H, W = self.input_resolution
        B, L, C = x.shape
        assert L == H * W, "input feature has wrong size"
        assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even."

        x = x.view(B, H, W, C)

        # The four members of each 2x2 neighbourhood, in the order
        # (even row, even col), (odd, even), (even, odd), (odd, odd).
        corners = [
            x[:, 0::2, 0::2, :],  # B H/2 W/2 C
            x[:, 1::2, 0::2, :],  # B H/2 W/2 C
            x[:, 0::2, 1::2, :],  # B H/2 W/2 C
            x[:, 1::2, 1::2, :],  # B H/2 W/2 C
        ]
        x = torch.cat(corners, -1)  # B H/2 W/2 4*C
        x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C

        x = self.norm(x)
        x = self.reduction(x)

        return x

    def extra_repr(self) -> str:
        return f"input_resolution={self.input_resolution}, dim={self.dim}"

    def flops(self):
        H, W = self.input_resolution
        flops = H * W * self.dim  # norm
        flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim  # reduction
        return flops


class BasicLayer(nn.Module):
    """ A basic Swin Transformer layer for one stage.

    Args:
        dim (int): Number of input channels.
        input_resolution (tuple[int]): Input resolution.
        depth (int): Number of blocks.
        num_heads (int): Number of attention heads.
        window_size (int): Local window size.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | list[float] | tuple[float], optional): Stochastic depth rate; a sequence
            supplies one rate per block. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
    """

    def __init__(self, dim, input_resolution, depth, num_heads, window_size,
                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False):

        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.depth = depth
        self.use_checkpoint = use_checkpoint

        # The docstring allows a tuple of per-block rates; previously only a
        # list was indexed and a tuple was handed whole to every block.
        per_block_rates = isinstance(drop_path, (list, tuple))

        # build blocks: even blocks use regular windows, odd blocks shifted ones
        self.blocks = nn.ModuleList([
            SwinTransformerBlock(dim=dim, input_resolution=input_resolution,
                                 num_heads=num_heads, window_size=window_size,
                                 shift_size=0 if (i % 2 == 0) else window_size // 2,
                                 mlp_ratio=mlp_ratio,
                                 qkv_bias=qkv_bias, qk_scale=qk_scale,
                                 drop=drop, attn_drop=attn_drop,
                                 drop_path=drop_path[i] if per_block_rates else drop_path,
                                 norm_layer=norm_layer)
            for i in range(depth)])

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
        else:
            self.downsample = None

    def forward(self, x):
        for blk in self.blocks:
            if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x)
            else:
                x = blk(x)
        if self.downsample is not None:
            x = self.downsample(x)
        return x

    def extra_repr(self) -> str:
        return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"

    def flops(self):
        flops = sum(blk.flops() for blk in self.blocks)
        if self.downsample is not None:
            flops += self.downsample.flops()
        return flops


class PatchEmbed(nn.Module):
    r""" Image to Patch Embedding

    Args:
        img_size (int): Image size.  Default: 224.
        patch_size (int): Patch token size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Module, optional): Normalization layer. Default: None
    """

    def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
        self.img_size = img_size
        self.patch_size = patch_size
        self.patches_resolution = patches_resolution
        self.num_patches = patches_resolution[0] * patches_resolution[1]

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        # A strided convolution with kernel == stride embeds each patch independently.
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        if norm_layer is not None:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = None

    def forward(self, x):
        B, C, H, W = x.shape
        # FIXME look at relaxing size constraints
        assert H == self.img_size[0] and W == self.img_size[1], \
            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
        x = self.proj(x).flatten(2).transpose(1, 2)  # B Ph*Pw C
        if self.norm is not None:
            x = self.norm(x)
        return x

    def flops(self):
        Ho, Wo = self.patches_resolution
        flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
        if self.norm is not None:
            flops += Ho * Wo * self.embed_dim
        return flops
Default: 4 qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None drop_rate (float): Dropout rate. Default: 0 attn_drop_rate (float): Attention dropout rate. Default: 0 drop_path_rate (float): Stochastic depth rate. Default: 0.1 norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. ape (bool): If True, add absolute position embedding to the patch embedding. Default: False patch_norm (bool): If True, add normalization after patch embedding. Default: True use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False """ def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, norm_layer=nn.LayerNorm, ape=False, patch_norm=True, use_checkpoint=False, **kwargs): super().__init__() self.num_classes = num_classes self.num_layers = len(depths) self.embed_dim = embed_dim self.ape = ape self.patch_norm = patch_norm self.num_features = int(embed_dim * 2 ** (self.num_layers - 1)) self.mlp_ratio = mlp_ratio # split image into non-overlapping patches self.patch_embed = PatchEmbed( img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, norm_layer=norm_layer if self.patch_norm else None) num_patches = self.patch_embed.num_patches patches_resolution = self.patch_embed.patches_resolution self.patches_resolution = patches_resolution # absolute position embedding if self.ape: self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) trunc_normal_(self.absolute_pos_embed, std=.02) self.pos_drop = nn.Dropout(p=drop_rate) # stochastic depth dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule # build layers self.layers = nn.ModuleList() for i_layer in 
range(self.num_layers): layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer), input_resolution=(patches_resolution[0] // (2 ** i_layer), patches_resolution[1] // (2 ** i_layer)), depth=depths[i_layer], num_heads=num_heads[i_layer], window_size=window_size, mlp_ratio=self.mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], norm_layer=norm_layer, downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, use_checkpoint=use_checkpoint) self.layers.append(layer) self.norm = norm_layer(self.num_features) self.avgpool = nn.AdaptiveAvgPool1d(1) # self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() self.apply(self._init_weights) def _init_weights(self, m): if isinstance(m, nn.Linear): trunc_normal_(m.weight, std=.02) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) @torch.jit.ignore def no_weight_decay(self): return {'absolute_pos_embed'} @torch.jit.ignore def no_weight_decay_keywords(self): return {'relative_position_bias_table'} def forward(self, x, idx_to_group_img=None, image_atts=None, **kwargs): x = self.patch_embed(x) if self.ape: x = x + self.absolute_pos_embed x = self.pos_drop(x) for layer in self.layers: x = layer(x) x = self.norm(x) # B L C x_cls = self.avgpool(x.transpose(1, 2)) # B C 1 if idx_to_group_img is None: return torch.cat([x_cls.transpose(1, 2), x], dim=1) else: x_bs = torch.gather(x, dim=0, index=idx_to_group_img.view(-1, 1, 1).expand(-1, x.shape[1], x.shape[2])) weights = image_atts[:, 1:].unsqueeze(2) # B L 1 x_bs_cls = torch.sum((weights * x_bs).transpose(1, 2), dim=-1, keepdim=True) # B C 1 x_bs_cls = x_bs_cls / torch.sum(weights.transpose(1, 2), dim=-1, keepdim=True) # avgpool return torch.cat([x_bs_cls.transpose(1, 2), x_bs], dim=1), \ torch.cat([x_cls.transpose(1, 
def interpolate_relative_pos_embed(rel_pos_bias, dst_num_pos, param_name=''):
    """Resize a relative position bias table to a different window size.

    Adapted from:
    https://github.com/microsoft/unilm/blob/8a0a1c1f4e7326938ea7580a00d56d7f17d65612/beit/run_class_finetuning.py#L348

    The source table is treated as a square grid whose sample positions are
    spaced geometrically around the centre, and is resampled with a bicubic
    spline onto an evenly spaced grid of the target size.

    Args:
        rel_pos_bias: relative_position_bias_table, a 2-D tensor of shape
            (src_num_pos, num_attn_heads) with src_num_pos a perfect square.
        dst_num_pos (int): number of positions wanted in the output table
            (also a perfect square).
        param_name (str): parameter name, only used in the progress message.

    Returns:
        ``rel_pos_bias`` itself when the sizes already match; otherwise a new
        float32 tensor of shape (dst_num_pos, num_attn_heads) on the input's
        device.
    """
    src_num_pos, num_attn_heads = rel_pos_bias.size()
    num_extra_tokens = 0
    src_size = int((src_num_pos - num_extra_tokens) ** 0.5)
    dst_size = int((dst_num_pos - num_extra_tokens) ** 0.5)
    if src_size == dst_size:
        return rel_pos_bias

    print("Position interpolate %s from %dx%d to %dx%d" % (param_name, src_size, src_size, dst_size, dst_size))

    def geometric_progression(a, r, n):
        return a * (1.0 - r ** n) / (1.0 - r)

    # Bisect for the ratio q whose geometric series over half the source grid
    # spans half the destination grid.
    left, right = 1.01, 1.5
    while right - left > 1e-6:
        q = (left + right) / 2.0
        gp = geometric_progression(1, q, src_size // 2)
        if gp > dst_size // 2:
            right = q
        else:
            left = q

    # Distances of the source samples from the centre: 1, 1+q, 1+q+q^2, ...
    dis = []
    cur = 1
    for i in range(src_size // 2):
        dis.append(cur)
        cur += q ** (i + 1)

    r_ids = [-d for d in reversed(dis)]
    x = r_ids + [0] + dis
    y = r_ids + [0] + dis

    t = dst_size // 2.0
    dx = np.arange(-t, t + 0.1, 1.0)
    dy = np.arange(-t, t + 0.1, 1.0)

    device = rel_pos_bias.device
    # .numpy() needs a CPU tensor that does not track gradients.
    table = rel_pos_bias.detach().cpu().float()

    all_rel_pos_bias = []
    for i in range(num_attn_heads):
        z = table[:, i].reshape(src_size, src_size).numpy()
        # interp2d was removed in SciPy 1.14. For data on a regular grid its
        # documented replacement is RectBivariateSpline with z transposed on
        # the way in and the result transposed on the way out.
        spline = interpolate.RectBivariateSpline(x, y, z.T, kx=3, ky=3)
        resized = np.ascontiguousarray(spline(dx, dy).T)
        all_rel_pos_bias.append(
            torch.as_tensor(resized, dtype=torch.float32).view(-1, 1).to(device))

    return torch.cat(all_rel_pos_bias, dim=-1)
Open-Sora/build/lib/vbench/third_pary/tag2Text/tag2text.py ================================================ ''' * Tag2Text * Written by Xinyu Huang ''' import warnings warnings.filterwarnings("ignore") from .vit import VisionTransformer, interpolate_pos_embed from .swin_transformer import SwinTransformer, interpolate_relative_pos_embed from .med import BertConfig, BertModel, BertLMHeadModel from transformers import BertTokenizer import torch from torch import nn import torch.nn.functional as F import os CUR_DIR = os.path.dirname(os.path.abspath(__file__)) from urllib.parse import urlparse from timm.models.hub import download_cached_file from .tag_class import tra_array import json import math import numpy as np def read_json(rpath): with open(rpath, 'r') as f: return json.load(f) delete_tag_index = [127, 3351, 3265, 3338, 3355, 3359] class Tag2Text_Caption(nn.Module): def __init__(self, med_config = f'{CUR_DIR}/med_config.json', image_size = 384, vit = 'base', vit_grad_ckpt = False, vit_ckpt_layer = 0, prompt = 'a picture of ', threshold = 0.7, ): """ Args: med_config (str): path for the mixture of encoder-decoder model's configuration file image_size (int): input image size vit (str): model size of vision transformer """ super().__init__() if vit=='swin_b': if image_size == 224: vision_config_path = 'configs/swin/config_swinB_224.json' elif image_size == 384: vision_config_path = f'{CUR_DIR}/config_swinB_384.json' vision_config = read_json(vision_config_path) assert image_size == vision_config['image_res'] vision_width = vision_config['vision_width'] self.visual_encoder = SwinTransformer(img_size=vision_config['image_res'], patch_size=4, in_chans=3, embed_dim=vision_config['embed_dim'], depths=vision_config['depths'], num_heads=vision_config['num_heads'], window_size=vision_config['window_size'], mlp_ratio=4., qkv_bias=True, drop_rate=0.0, drop_path_rate=0.1, ape=False, patch_norm=True, use_checkpoint=False) else: self.visual_encoder, vision_width = 
create_vit(vit,image_size, vit_grad_ckpt, vit_ckpt_layer) self.tokenizer = init_tokenizer() # create the decoder decoder_config = BertConfig.from_json_file(med_config) decoder_config.encoder_width = 768 self.text_decoder = BertLMHeadModel(config=decoder_config) # create encoder encoder_config = BertConfig.from_json_file(med_config) encoder_config.encoder_width = vision_width self.tag_encoder = BertModel(config=encoder_config, add_pooling_layer=False) self.prompt = prompt self.prompt_length = len(self.tokenizer(self.prompt).input_ids)-1 self.threshold = threshold num_features = 768 self.num_class = 3429 q2l_config = BertConfig.from_json_file(f'{CUR_DIR}/q2l_config.json') q2l_config.encoder_width = vision_width self.vision_multi = BertModel.from_pretrained('bert-base-uncased',config=q2l_config, add_pooling_layer=False) self.vision_multi.resize_token_embeddings(len(self.tokenizer)) self.label_embed = nn.Embedding(self.num_class, q2l_config.hidden_size) self.fc = GroupWiseLinear(self.num_class, num_features, bias=True) self.del_selfattention() tie_encoder_decoder_weights(self.tag_encoder,self.vision_multi,'',' ') self.tag_array = tra_array def del_selfattention(self): del self.vision_multi.embeddings for layer in self.vision_multi.encoder.layer: del layer.attention def generate(self, image, sample=False, num_beams=3, max_length=30, min_length=10, top_p=0.9, repetition_penalty=1.0, tag_input = None, return_tag_predict = False): image_embeds = self.visual_encoder(image) image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(image.device) #==============generate tag==============# if tag_input == None: image_spatial_embeds = image_embeds[:,1:,:] image_cls_embeds = image_embeds[:,0,:] bs = image_spatial_embeds.shape[0] label_embed = self.label_embed.weight.unsqueeze(0).repeat(bs,1,1) mlr_tagembedding = self.vision_multi(encoder_embeds = label_embed, encoder_hidden_states = image_embeds, encoder_attention_mask = image_atts, return_dict = False, mode = 'mlr', 
) logits = self.fc(mlr_tagembedding[0]) targets = torch.where(torch.sigmoid(logits) > self.threshold , torch.tensor(1.0).to(image.device), torch.zeros(self.num_class).to(image.device)) tag = targets.cpu().numpy() tag[:,delete_tag_index] = 0 bs = image.size(0) tag_input = [] for b in range(bs): index = np.argwhere(tag[b] == 1) token = self.tag_array[index].squeeze(axis = 1) tag_input.append(' | '.join(token)) #========================================# if not sample: image_embeds = image_embeds.repeat_interleave(num_beams,dim=0) image_atts = image_atts.repeat_interleave(num_beams,dim=0) tag_input_temp = [] for tag in tag_input: for i in range(num_beams): tag_input_temp.append(tag) tag_input = tag_input_temp tag_input_tokenzier = self.tokenizer(tag_input, padding='max_length', truncation=True, max_length=40, return_tensors="pt").to(image.device) encoder_input_ids = tag_input_tokenzier.input_ids encoder_input_ids[:,0] = self.tokenizer.enc_token_id # print(encoder_input_ids.size(), tag_input_tokenzier.attention_mask.size(),image_embeds.size(), image_atts.size()) # import pdb # pdb.set_trace() output_tagembedding = self.tag_encoder(encoder_input_ids, attention_mask = tag_input_tokenzier.attention_mask, encoder_hidden_states = image_embeds, encoder_attention_mask = image_atts, return_dict = True, ) prompt = [self.prompt] * image.size(0) input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(image.device) input_ids[:,0] = self.tokenizer.bos_token_id input_ids = input_ids[:, :-1] if sample: #nucleus sampling model_kwargs = {"encoder_hidden_states": output_tagembedding.last_hidden_state, "encoder_attention_mask":None} outputs = self.text_decoder.generate(input_ids=input_ids, max_length=max_length, min_length=min_length, do_sample=True, top_p=top_p, num_return_sequences=1, eos_token_id=self.tokenizer.sep_token_id, pad_token_id=self.tokenizer.pad_token_id, repetition_penalty=1.1, **model_kwargs) else: #beam search model_kwargs = {"encoder_hidden_states": 
def tag2text_caption(pretrained='', **kwargs):
    """Build a ``Tag2Text_Caption`` model and optionally load pretrained weights.

    Args:
        pretrained (str): checkpoint URL or file path; an empty string skips loading.
        **kwargs: forwarded unchanged to ``Tag2Text_Caption``.

    Returns:
        The constructed model.
    """
    model = Tag2Text_Caption(**kwargs)
    if pretrained:
        # 'vit' is optional for the model (its default is 'base'), so it must
        # not be a required key here either.
        if kwargs.get('vit', 'base') == 'swin_b':
            model, msg = load_checkpoint_swinbase(model, pretrained, kwargs)
        else:
            model, msg = load_checkpoint(model, pretrained)
        # print('vit:',kwargs['vit'])
        # print('msg_v2',msg)
    return model


from typing import List


def tie_encoder_decoder_weights(encoder: nn.Module, decoder: nn.Module, base_model_prefix: str, skip_key: str):
    """Make ``encoder`` share the ``decoder``'s parameters, module by module.

    Both module trees are walked in parallel by child name. Wherever the
    decoder module has a ``weight`` (and ``skip_key`` does not occur in the
    module path) the encoder's ``weight`` and ``bias`` are rebound to the
    decoder's parameter objects, so afterwards both modules train the same
    tensors.

    Args:
        encoder: module whose parameters are replaced.
        decoder: module providing the shared parameters.
        base_model_prefix: prefix used as the root of the module paths.
        skip_key: sub-string; module paths containing it are not tied.

    Raises:
        ValueError: if the recursion goes deeper than 500 levels.
    """
    uninitialized_encoder_weights: List[str] = []
    if decoder.__class__ != encoder.__class__:
        # This module defines no module-level `logger`, so obtain one here
        # rather than failing with a NameError.
        import logging

        logging.getLogger(__name__).info(
            f"{decoder.__class__} and {encoder.__class__} are not equal. In this case make sure that all encoder weights are correctly initialized."
        )

    def tie_encoder_to_decoder_recursively(
        decoder_pointer: nn.Module,
        encoder_pointer: nn.Module,
        module_name: str,
        uninitialized_encoder_weights: List[str],
        skip_key: str,
        depth=0,
    ):
        assert isinstance(decoder_pointer, nn.Module) and isinstance(
            encoder_pointer, nn.Module
        ), f"{decoder_pointer} and {encoder_pointer} have to be of type torch.nn.Module"
        if hasattr(decoder_pointer, "weight") and skip_key not in module_name:
            assert hasattr(encoder_pointer, "weight")
            encoder_pointer.weight = decoder_pointer.weight
            if hasattr(decoder_pointer, "bias"):
                assert hasattr(encoder_pointer, "bias")
                encoder_pointer.bias = decoder_pointer.bias
            # print(module_name+' is tied')
            return

        encoder_modules = encoder_pointer._modules
        decoder_modules = decoder_pointer._modules
        if len(decoder_modules) > 0:
            assert (
                len(encoder_modules) > 0
            ), f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}"

            all_encoder_weights = set([module_name + "/" + sub_name for sub_name in encoder_modules.keys()])
            encoder_layer_pos = 0
            for name, module in decoder_modules.items():
                if name.isdigit():
                    encoder_name = str(int(name) + encoder_layer_pos)
                    decoder_name = name
                    if not isinstance(decoder_modules[decoder_name], type(encoder_modules[encoder_name])) and len(
                        encoder_modules
                    ) != len(decoder_modules):
                        # this can happen if the name corresponds to the position in a list module list of layers
                        # in this case the decoder has added a cross-attention that the encoder does not have
                        # thus skip this step and subtract one layer pos from encoder
                        encoder_layer_pos -= 1
                        continue
                elif name not in encoder_modules:
                    continue
                elif depth > 500:
                    raise ValueError(
                        "Max depth of recursive function `tie_encoder_to_decoder` reached. It seems that there is a circular dependency between two or more `nn.Modules` of your model."
                    )
                else:
                    decoder_name = encoder_name = name
                tie_encoder_to_decoder_recursively(
                    decoder_modules[decoder_name],
                    encoder_modules[encoder_name],
                    module_name + "/" + name,
                    uninitialized_encoder_weights,
                    skip_key,
                    depth=depth + 1,
                )
                all_encoder_weights.remove(module_name + "/" + encoder_name)

            # Encoder children with no decoder counterpart stay untied.
            uninitialized_encoder_weights += list(all_encoder_weights)

    # tie weights recursively
    tie_encoder_to_decoder_recursively(decoder, encoder, base_model_prefix, uninitialized_encoder_weights, skip_key)


class GroupWiseLinear(nn.Module):
    """Per-class linear scorer: one weight vector (and optional bias) per class.

    For input ``x`` of shape (B, num_class, hidden_dim) it returns
    ``(W * x).sum(-1) [+ b]`` of shape (B, num_class).
    """
    # could be changed to:
    # output = torch.einsum('ijk,zjk->ij', x, self.W)
    # or output = torch.einsum('ijk,jk->ij', x, self.W[0])

    def __init__(self, num_class, hidden_dim, bias=True):
        super().__init__()
        self.num_class = num_class
        self.hidden_dim = hidden_dim
        self.bias = bias

        # Allocated uninitialized; reset_parameters() fills the values.
        self.W = nn.Parameter(torch.empty(1, num_class, hidden_dim))
        if bias:
            self.b = nn.Parameter(torch.empty(1, num_class))
        self.reset_parameters()

    def reset_parameters(self):
        """Draw W (and b) uniformly from [-1/sqrt(hidden_dim), 1/sqrt(hidden_dim)]."""
        stdv = 1. / math.sqrt(self.W.size(2))
        # One in-place fill per parameter instead of one Python-level call per class.
        with torch.no_grad():
            self.W.uniform_(-stdv, stdv)
            if self.bias:
                self.b.uniform_(-stdv, stdv)

    def forward(self, x):
        # x: B,K,d
        x = (self.W * x).sum(-1)
        if self.bias:
            x = x + self.b
        return x


def init_tokenizer():
    """Return the ``bert-base-uncased`` tokenizer extended with ``[DEC]`` and ``[ENC]``.

    ``[DEC]`` is registered as the BOS token and ``[ENC]`` as an additional
    special token whose id is exposed as ``tokenizer.enc_token_id``.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tokenizer.add_special_tokens({'bos_token': '[DEC]'})
    tokenizer.add_special_tokens({'additional_special_tokens': ['[ENC]']})
    tokenizer.enc_token_id = tokenizer.additional_special_tokens_ids[0]
    return tokenizer
def is_url(url_or_filename):
    """Return True when the argument parses as an ``http`` or ``https`` URL."""
    parsed = urlparse(url_or_filename)
    return parsed.scheme in ("http", "https")


def _read_checkpoint_file(url_or_filename):
    """Load a checkpoint onto the CPU from a URL (downloaded and cached) or a local file.

    Raises:
        RuntimeError: if the argument is neither an http(s) URL nor an existing file.
    """
    if is_url(url_or_filename):
        cached_file = download_cached_file(url_or_filename, check_hash=False, progress=True)
        return torch.load(cached_file, map_location='cpu')
    if os.path.isfile(url_or_filename):
        return torch.load(url_or_filename, map_location='cpu')
    raise RuntimeError('checkpoint url or path is invalid')


def load_checkpoint(model, url_or_filename):
    """Load ViT-backbone weights into ``model`` (non-strict).

    The checkpoint's ``'model'`` entry is used as the state dict. Position
    embeddings are passed through ``interpolate_pos_embed`` for the model's
    visual encoder(s), and checkpoint tensors whose shape differs from the
    model's are dropped before loading.

    Returns:
        (model, msg) where ``msg`` is the result of ``load_state_dict``.

    Raises:
        RuntimeError: if ``url_or_filename`` is neither a URL nor an existing file.
    """
    checkpoint = _read_checkpoint_file(url_or_filename)
    state_dict = checkpoint['model']

    # Build the model's state dict once; every state_dict() call re-collects
    # all parameters, so calling it inside the loop below was quadratic.
    model_state = model.state_dict()

    state_dict['visual_encoder.pos_embed'] = interpolate_pos_embed(
        state_dict['visual_encoder.pos_embed'], model.visual_encoder)
    if 'visual_encoder_m.pos_embed' in model_state:
        state_dict['visual_encoder_m.pos_embed'] = interpolate_pos_embed(
            state_dict['visual_encoder_m.pos_embed'], model.visual_encoder_m)

    for key, target in model_state.items():
        if key in state_dict and state_dict[key].shape != target.shape:
            del state_dict[key]

    msg = model.load_state_dict(state_dict, strict=False)
    # print('load checkpoint from %s'%url_or_filename)
    return model, msg


def load_checkpoint_swinbase(model, url_or_filename, kwargs):
    """Load Swin-B backbone weights into ``model`` (non-strict).

    Relative position bias tables are resized to the window size named in
    the Swin config for the requested image size; the derived
    ``relative_position_index`` and ``attn_mask`` buffers are dropped from
    the checkpoint so the model keeps its own.

    Args:
        model: model to load into.
        url_or_filename (str): checkpoint URL or local path.
        kwargs (dict): model construction kwargs; ``image_size`` selects the
            config file and defaults to 384, the model's own default.

    Returns:
        (model, msg) where ``msg`` is the result of ``load_state_dict``.

    Raises:
        ValueError: if ``image_size`` has no known Swin-B config.
        RuntimeError: if ``url_or_filename`` is neither a URL nor an existing file.
    """
    image_size = kwargs.get('image_size', 384)
    if image_size == 384:
        # Only the 384 config is referenced relative to this package.
        vision_config_path = f'{CUR_DIR}/config_swinB_384.json'
    elif image_size in (224, 480, 576, 608):
        vision_config_path = f'configs/swin/config_swinB_{image_size}.json'
    else:
        # Previously this fell through and failed later on an unbound variable.
        raise ValueError(
            f"unsupported image_size {image_size!r} for swin_b; expected one of 224, 384, 480, 576, 608")

    window_size = read_json(vision_config_path)['window_size']
    # print('--------------')
    # print(url_or_filename)
    # print('--------------')
    checkpoint = _read_checkpoint_file(url_or_filename)
    state_dict = checkpoint['model']

    dst_num_pos = (2 * window_size - 1) ** 2
    for k in list(state_dict.keys()):
        if 'relative_position_bias_table' in k:
            state_dict[k] = interpolate_relative_pos_embed(state_dict[k], dst_num_pos, param_name=k)
        elif ('relative_position_index' in k) or ('attn_mask' in k):
            del state_dict[k]

    msg = model.load_state_dict(state_dict, strict=False)
    print('load checkpoint from %s' % url_or_filename)
    return model, msg


if __name__ == "__main__":
    # Debug entry point: build the default model and drop into the debugger.
    model = Tag2Text_Caption()
    import pdb
    pdb.set_trace()
illustration', 'sandwich', 'mountain village', 'shape', 'charm', 'fiction', 'knot', 'greenhouse', 'sushi', 'text', 'disaster', 'trophy', 'gang', 'strap', 'soccer game', 'cardinal', 'tee', 'turtle', 'water surface', 'grassland', 'dolphin', 'store', 'dirt', 'iceberg', 'pergola', 'farmer market', 'publicity portrait', 'tote bag', 'teenage girl', 'view mirror', 'session', 'commuter', 'dressing room', 'tricycle', 'christmas ball', 'headlight', 'police', 'armchair', 'chart', 'yacht', 'saw', 'printer', 'rock band', 'gingerbread house', 'tag', 'table lamp', 'hockey game', 'slope', 'font', 'wicker basket', 'jewelry', 'quarter', 'software', 'weapon', 'pin', 'worship', 'painter', 'goal', 'morning light', 'bike', 'baseball bat', 'elevator', 'cuisine', 'sausage', 'stunt', 'wrestler', 'statue', 'landing', 'pillar', 'willow tree', 'sea wave', 'chicken', 'peanut', 'muscle', 'bob', 'tv genre', 'bathroom window', 'radish', 'textile', 'pelican', 'marketplace', 'crest', 'elevation map', 'gift', 'parish', 'traffic light', 'campfire', 'fog', 'award winner', 'beach ball', 'mat', 'white house', 'plaster', 'moped', 'football team', 'solution', 'bicyclist', 'bit', 'playground', 'darkness', 'cake', 'maple leave', 'mold', 'cracker', 'blueberry', 'rubble', 'container ship', 'pedestrian bridge', 'snail', 'parrot', 'form', 'circuit', 'highlight', 'pickup truck', 'koala', 'rain', 'system', 'weather', 'raincoat', 'soccer team', 'windshield', 'thunderstorm', 'mike', 'bird house', 'bridge', 'grandfather', 'restroom', 'animation', 'wilderness', 'clown', 'banana', 'brown', 'braid', 'dining room', 'kindergarten', 'launch event', 'purple', 'school', 'stairwell', 'brooch', 'movie poster image', 'mountain river', 'shelf', 'wicket', 'headboard', 'buddha', 'flower field', 'dugout', 'cd', 'bald eagle', 'lagoon', 'seaweed', 'agriculture', 'emergency service', 'maple tree', 'parachute', 'continent', 'amusement park', 'remote', 'bun', 'tackle', 'hospital', 'garage door', 'birthday party', 'friendship', 'go', 
'mausoleum', 'jeep', 'raccoon', 'step', 'ice hockey team', 'cigarette', 'lace dress', 'forest floor', 'mall', 'captain', 'milk', 'golf course', 'meal', 'picnic table', 'sail', 'volleyball', 'canal', 'terrace', 'computer desk', 'caravan', 'hotel', 'cheerleader', 'nurse', 'museum', 'marsh', 'fox', 'plateau', 'night', 'twin', 'letter logo', 'autumn tree', 'powder', 'convention', 'creature', 'lighthouse', 'shop window', 'jacket', 'stork', 'taxi', 'trade', 'blackboard', 'olive', 'road sign', 'resort', 'snowflake', 'cemetery', 'travel', 'evening dress', 'picnic', 'drink', 'winter morning', 'football player', 'snack', 'boxing glove', 'dinner party', 'airline', 'swing', 'port', 'wheelbarrow', 'bathroom sink', 'sweater', 'ambulance', 'gear', 'oil', 'wii controller', 'array', 'home office', 'car show', 'mixture', 'profession', 'tree frog', 'square', 'facility', 'coral reef', 'sea wall', 'pizza', 'exhibit', 'demolition', 'trout', 'ring', 'coffee shop', 'bracelet', 'bean', 'lip', 'fencing', 'landscape', 'sitting', 'package', 'metal', 'bust', 'king', 'hair', 'window seat', 'wildlife', 'trunk', 'greenery', 'stencil', 'fire hydrant', 'bridesmaid', 'plaza', 'alps', 'tower bridge', 'crop top', 'crossing', 'cinema', 'pedestrian crossing', 'family', 'shopping cart', 'stomach', 'church building', 'screen door', 'skater', 'soccer field', 'kettle', 'mussel', 'raindrop', 'candy cane', 'water lily', 'flower girl', 'desert', 'enclosure', 'christmas light', 'kitchen', 'caterpillar', 'plaid', 'bath', 'bush', 'mud', 'ballet', 'knee', 'adult', 'raft', 'sea view', 'cactus', 'office chair', 'overall', 'rim', 'scaffolding', 'pig', 'cover', 'poster page', 'sprinkle', 'chandelier', 'algae', 'traffic', 'surfboard', 'book', 'filming', 'flash', 'mansion', 'camouflage', 'trouser', 'ticket', 'weed', 'cab', 'trench', 'elephant', 'huddle', 'sphere', 'christmas decoration', 'city', 'launch', 'doll', 'christmas ornament', 'fabric', 'bikini', 'biplane', 'breakfast', 'neighbourhood', 'race track', 'foliage', 
'avocado', 'school bus', 'footwear', 'highway', 'ocean view', 'art vector illustration', 'wall clock', 'curtain', 'teenager', 'kitchen area', 'robot', 'tusk', 'lounge chair', 'beam', 'paddle', 'camel', 'lid', 'world map', 'city view', 'newlywed', 'cargo ship', 'yellow', 'exhibition', 'bend', 'novel', 'wool', 'ontario', 'bread', 'campus', 'coastline', 'cutting board', 'booth', 'table top', 'carpet', 'beach chair', 'workout', 'street food', 'fun', 'costumer film designer', 'gadget', 'artist', 'fishing village', 'builder', 'violinist', 'iphone', 'spider web', 'traffic sign', 'ruin', 'rescue', 'clipboard', 'seal', 'film director', 'paw', 'nursery', 'intersection', 'tomato sauce', 'taste', 'paddy field', 'christmas tree', 'wave', 'stool', 'watering can', 'rug', 'daytime', 'subway station', 'craft', 'pine forest', 'black', 'planet', 'motif', 'christmas market', 'glass window', 'college', 'wheat', 'damage', 'rectangle', 'picture frame', 'chess', 'guest room', 'street corner', 'religion', 'seed', 'puzzle', 'freeway', 'beauty', 'ocean', 'watch', 'mother', 'garage', 'quote', 'dj', 'supporter', 'hip hop artist', 'muffin', 'eiffel tower', 'cash', 'firefighter', 'cauliflower', 'bunker', 'sled', 'manicure', 'shark', 'stall', 'jungle', 'family home', 'tour bus', 'chimney', 'touchdown', 'roundabout', 'coyote', 'street scene', 'tank', 'wedding dress', 'mantle', 'bedroom window', 'coconut', 'chapel', 'goat', 'living space', 'rock wall', 'polka dot', 'railway', 'mandala', 'mango', 'lesson', 'mountain landscape', 'team photo', 'bookshelf', 'meter', 'bulldog', 'evening sun', 'stick', 'card', 'pink', 'fish pond', 'paint', 'pill', 'cart', 'pea', 'van', 'album', 'football college game', 'mountain pass', 'doughnut', 'ski slope', 'match', 'official', 'shadow', 'organ', 'celebration', 'coin', 'log cabin', 'firework display', 'present', 'twig', 'chef', 'confetti', 'footpath', 'tour', 'ponytail', 'artwork', 'race car', 'club', 'season', 'hose', 'pencil', 'aircraft', 'rock formation', 
'wardrobe', 'participant', 'politician', 'engineer', 'peace', 'filter', 'sailing boat', 'water bottle', 'service dog', 'poodle', 'loki', 'statesman', 'sleeping bag', 'outskirt', 'clock', 'factory', 'oak tree', 'physician', 'color', 'room', 'stairway', 'company', 'lady', 'graph', 'faucet', 'tablecloth', 'subway train', 'chocolate chip cookie', 'headquarters', 'screw', 'goggle', 'halloween', 'city street', 'swirl', 'cord', 'forward', 'bone', 'bedding', 'archway', 'wig', 'lobby', 'mask', 'attic', 'kitchen table', 'skylight', 'fire', 'exit', 'oil painting', 'passenger', 'meditation', 'salmon', 'fedora', 'rubber stamp', 'orange juice', 'arch', 'scientist', 'stroll', 'manhattan', 'float', 'baseball uniform', 'circle', 'church', 'decker bus', 'competitor', 'zoo', 'basketball team', 'tourist', 'daughter', 'silverware', 'ceiling fan', 'birth', 'vase', 'jack', 'mushroom', 'spiral', 'cage', 'limb', 'salad', 'ad', 'control', 'earth', 'party', 'bolt', 'tractor', 'barley', 'wedding photo', 'hawk', 'warehouse', 'vegetable garden', 'chocolate cake', 'cabbage', 'floor window', 'baby shower', 'magnifying glass', 'table', 'stethoscope', 'reading', 'mission', 'croissant', 'gift box', 'rocket', 'forest road', 'cooking', 'suite', 'hill country', 'motorcycle', 'baseball player', 'angle', 'drug', 'sport association', 'championship', 'family portrait', 'florist', 'softball', 'egret', 'office', 'plywood', 'jockey', 'mosque', 'brunch', 'beanie', 'office building', 'pattern', 'calendar', 'indoor', 'pepper', 'ledge', 'trail', 'fuel', 'laptop computer', 'tennis shoe', 'deck chair', 'guitarist', 'barn', 'surgery', 'cartoon illustration', 'nebula', 'railroad', 'mountain goat', 'goose', 'car door', 'cheer', 'liquid', 'hardwood floor', 'pathway', 'acorn', 'gull', 'airliner', 'couch', 'lake house', 'spaghetti', 'promenade', 'collection', 'garden', 'bank', 'robin', 'tennis ball', 'peony', 'gymnast', 'lavender', 'deck', 'test', 'riverside', 'rapper', 'domino', 'bride', 'mouse', 'basil', 'wedding 
couple', 'ocean wave', 'arm', 'kitchen floor', 'grove', 'family member', 'backyard', 'raspberry', 'forest fire', 'officer', 'hibiscus', 'canyon', 'composer', 'signature', 'olive oil', 'hibiscus flower', 'rose', 'vector icon', 'sunrise', 'horseback', 'motor scooter', 'office worker', 'tradition', 'ingredient', 'washing machine', 'lighting', 'bagel', 'sailboat', 'policeman', 'mare', 'graphic', 'halloween pumpkin', 'stock', 'pilot', 'education', 'team', 'body', 'horse', 'kimono', 'bazaar', 'bag', 'recording studio', 'parsley', 'entrance', 'denim', 'vet', 'horse farm', 'charcoal', 'architecture', 'glass vase', 'puppy', 'estuary', 'television show host', 'city bus', 'shoulder', 'beast', 'balance', 'golfer', 'roadside', 'denim jacket', 'stone wall', 'counter top', 'app icon', 'toast', 'head coach', 'ham', 'warrior', 'gem', 'refrigerator', 'snowman', 'construction worker', 'coal', 'website', 'morning fog', 'mustard', 'human', 'owl', 'puppy dog', 'piggy bank', 'vegetation', 'pirate', 'action film', 'marshmallow', 'thanksgiving', 'business', 'disease', 'signage', 'greeting', 'skate park', 'tile', 'mouth', 'spinach', 'vacation', 'leader', 'shrine', 'walker', 'science fiction film', 'bill', 'rabbit', 'motor boat', 'bar', 'radio', 'barge', 'tail', 'chainsaw', 'gallery', 'rainbow', 'pasta', 'padlock', 'web', 'pastry', 'ink', 'reef', 'school uniform', 'shawl', 'treasure', 'peach', 'dinner table', 'injury', 'harbor', 'witch', 'car dealership', 'litter', 'gesture', 'documentary', 'marriage', 'sea shell', 'priest', 'dome', 'kit', 'icon', 'seaside', 'bucket', 'entertainment', 'stable', 'hat', 'puddle', 'sock', 'shopper', 'technology', 'harbour', 'orbit', 'antler', 'tube', 'flag waving', 'cook', 'tight', 'commander', 'farmland', 'switch', 'hiker', 'wedding ceremony', 'award ceremony', 'champion', 'chopstick', 'farmhouse', 'performer', 'spike', 'accident', 'cruise ship', 'passenger train', 'attraction', 'entertainer', 'rear view', 'sidewalk', 'parade', 'racing', 'plane', 'ritual', 
'peacock', 'pocket', 'plum', 'drop', 'carrot', 'floor', 'sunset', 'troop', 'architect', 'coffee table', 'dust', 'outline', 'leather', 'charity event', 'heat', 'whale', 'laundry', 'coconut tree', 'crosswalk', 'pony', 'ant', 'pipe', 'string', 'coat', 'angel', 'beef', 'church tower', 'dish', 'pitch', 'cupboard', 'thermometer', 'dirt field', 'fireworks', 'minute', 'cane', 'pajama', 'flower garden', 'autumn', 'trash can', 'dachshund', 'banana tree', 'tray', 'moose', 'roadway', 'carnival', 'antenna', 'pole', 'castle wall', 'ram', 'cattle', 'hay', 'cookie', 'swimmer', 'baseball team', 'strait', 'hedge', 'jet', 'fire pit', 'octopus', 'calf', 'cube', 'opera', 'cardboard box', 'tiara', 'kitchen sink', 'prairie', 'bowl', 'galaxy', 'straw hat', 'linen', 'ski resort', 'stitch', 'street lamp', 'motorist', 'icicle', 'stain', 'flora', 'drain', 'kitchen cabinet', 'decor', 'bouquet', 'pound', 'interior design', 'nail polish', 'figurine', 'tomb', 'disc', 'twist', 'blouse', 'ribbon', 'figure', 'burger', 'cork', 'soccer goalkeeper', 'train bridge', 'drinking water', 'dew', 'baker', 'storm cloud', 'tarmac', 'tv drama', 'sponge', 'magnet', 'sailor', 'entry', 'swan', 'exercise', 'sloth', 'jewel', 'scuba diver', 'bite', 'cat tree', 'tent', 'can', 'tennis match', 'ecosystem', 'picket fence', 'palm', 'train car', 'frying pan', 'rally', 'tablet pc', 'reindeer', 'image', 'wolf', 'chin', 'conservatory', 'flood water', 'cityscape', 'beach sand', 'car park', 'pavement', 'farm field', 'swimming', 'winter storm', 'stem', 'pillow', 'inning', 'gorilla', 'desk', 'avenue', 'fern', 'money', 'pearl', 'train station', 'skillet', 'nap', 'barber', 'library', 'freezer', 'label', 'rainforest', 'parking sign', 'mirror', 'wing', 'noodle', 'press room', 'sculpture', 'tablet', 'viewer', 'prayer', 'mini', 'mechanic', 'laugh', 'rice field', 'hand', 'mustache', 'mountain road', 'catwalk', 'conference', 'cape', 'installation', 'musician', 'stream', 'machine', 'speech', 'crocodile', 'soccer match', 'town square', 
'passport', 'post box', 'point', 'stone building', 'motorway', 'mix', 'dentist', 'businessperson', 'happiness', 'boat', 'vineyard', 'treadmill', 'glass wall', 'water droplet', 'coffee mug', 'graduate', 'sunflower', 'parliament', 'shepherd', 'movie', 'wine', 'orchard', 'tulip', 'motherboard', 'cup', 'broom', 'spot', 'drawing', 'polo shirt', 'graduation', 'film producer', 'moonlight', 'glow', 'film format', 't shirt', 'rock face', 'sword', 'clinic', 'festival day', 'meadow', 'staple', 'pupil', 'training ground', 'rider', 'flower', 'foal', 'wharf', 'foot bridge', 'shooting', 'top', 'mast', 'police car', 'robe', 'wedding bouquet', 'stop sign', 'birthday cake', 'glitter', 'butter', 'scooter', 'tundra', 'superhero', 'pocket watch', 'inscription', 'youngster', 'fruit tree', 'movie poster', 'engine', 'foundation', 'motorcyclist', 'take', 'woman', 'antelope', 'country artist', 'road trip', 'typewriter', 'tuxedo', 'brand', 'pine', 'bathroom', 'paradise', 'texture', 'balloon', 'dining table', 'home', 'computer screen', 'actor', 'clip', 'tv tower', 'panorama', 'summit', 'cat', 'plot', 'eagle', 'dancer', 'pup', 'studio shot', 'tear', 'bird bath', 'classroom', 'bookstore', 'city wall', 'tv programme', 'blade', 'easel', 'buttercream', 'sweet', 'designer', 'diamond', 'handshake', 'herb', 'corn field', 'seafront', 'concrete', 'street artist', 'gas', 'stamp', 'window display', 'paper', 'note', 'pint', 'quarry', 'research', 'fixture', 'manager', 'soil', 'leopard', 'board game', 'ladder', 'stop light', 'island', 'ramp', 'football match', 'icing', 'drill', 'currency', 'summer evening', 'topping', 'pyramid', 'pomegranate', 'cell', 'ivy', 'squad', 'scenery', 'computer', 'locomotive', 'surf', 'mascot', 'dune', 'path', 'duck', 'twilight', 'wire', 'bow tie', 'strike', 'cormorant', 'car wash', 'crane', 'market', 'philosopher', 'alarm clock', 'camera', 'birch', 'greeting card', 'plain', 'clay', 'donut', 'lock', 'moth', 'laboratory', 'fan', 'violin', 'jazz fusion artist', 'mountain biker', 
'terrain', 'magazine', 'pickup', 'comedy film', 'smartphone', 'film', 'bed', 'microwave oven', 'tournament', 'lawn', 'car window', 'alligator', 'screen', 'jetty', 'shopping bag', 'landscape view', 'cabinetry', 'friendly match', 'thing', 'petal', 'shopping center', 'transport', 'ballet dancer', 'shoreline', 'princess', 'car seat', 'parking meter', 'green', 'vodka', 'band', 'rock', 'costume', 'warning sign', 'strip', 'plaque', 'wheelchair', 'headband', 'ginger', 'dice', 'media', 'hairdresser', 'press', 'living room', 'stove', 'player', 'cherry', 'workshop', 'carving', 'embroidery', 'doodle', 'adventure', 'rugby player', 'monument', 'brush', 'marker', 'loft', 'postcard', 'collage', 'ball', 'professor', 'dresser', 'gig', 'festival', 'blackbird', 'makeup artist', 'video camera', 'sticker', 'peak', 'wildflower', 'santa hat', 'rodeo', 'wedding photographer', 'guy', 'staff', 'waterfall', 'operation', 'defender', 'falcon', 'haze', 'individual', 'gentleman', 'greyhound', 'rocking chair', 'rice', 'garbage', 'platter', 'chocolate', 'splash', 'business suit', 'cheetah', 'valley', 'maze', 'trampoline', 'garland', 'slalom', 'unicorn', 'tree stump', 'painting', 'romance', 'fight', 'alcohol', 'ghost', 'fondant', 'spa', 'shutter', 'death', 'demonstration', 'cotton', 'pier', 'flea market', 'history', 'savannah', 'fist', 'aisle', 'crew', 'jug', 'pose', 'anchor', 'teapot', 'boat house', 'business team', 'tripod', 'bee', 'pebble', 'mattress', 'canvas', 'hallway', 'campaign', 'pod', 'lake district', 'article', 'white', 'sofa', 'honey', 'marathon', 'pancake', 'tourist attraction', 'wedding gown', 'battle', 'shelving', 'sea', 'sheet music', 'pie', 'yarn', 'construction site', 'flyer', 'tie', 'star', 'lettuce', 'martial artist', 'dart', 'straw', 'reflection', 'conference room', 'temperature', 'rugby', 'mosquito', 'physicist', 'rock climber', 'crash', 'backdrop', 'toilet seat', 'sand castle', 'water park', 'toy car', 'waste', 'luxury', 'hangar', 'rv', 'tree trunk', 'board', 'gold', 'project 
picture', 'cap', 'cottage', 'relief', 'attire', 'microscope', 'battery', 'roll', 'line', 'parking garage', 'crystal', 'broadcasting', 'brick wall', 'lab', 'flooring', 'meeting', '3d cg rendering', 'desktop computer', 'cowboy', 'sailing ship', 'junction', 'hairstyle', 'homework', 'profile', 'model', 'flower pot', 'street light', 'salt lake', 'maple', 'space', 'blizzard', 'throw', 'zebras', 'brochure', 'constellation', 'beak', 'kilt', 'pond', 'blue sky', 'sneaker', 'sand dune', 'morning sun', 'almond', 'grill', 'curl', 'basketball girl game', 'chameleon', 'toilet bowl', 'prince', 'keyboard', 'queen', 'computer monitor', 'writing', 'crown', 'basilica', 'kiss', 'house', 'parking', 'football competition', 'shell', 'sport equipment', 'comedy', 'baboon', 'vendor', 'rise building', 'wrap', 'food truck', 'cat bed', 'rickshaw', 'flare', 'teal', 'nectar', 'eclipse', 'vehicle', 'steam locomotive', 'gorge', 'cow', 'christmas card', 'demonstrator', 'memorial', 'towel', 'jewellery', 'train', 'frisbee', 'baseball game', 'fur', 'afternoon sun', 'community', 'sparkler', 'bandage', 'firework', 'dollar', 'pasture', 'video', 'bus', 'tree house', 'seashore', 'field', 'hamburger', 'souvenir', 'hedgehog', 'worm', 'pine cone', 'osprey', 'dinosaur', 'vegetable', 'junk', 'poster', 'army', 'winger', 'bundle', 'stage', 'growth', 'wedding party', 'service', 'blanket', 'ruler', 'eye', 'credit card', 'castle', 'diner', 'hut', 'elk', 'hard rock artist', 'nun', 'dog breed', 'nest', 'drama film', 'number icon', 'water tank', 'giraffe', 'altar', 'pavilion', 'tv personality', 'suv', 'street vendor', 'street sign', 'ditch', 'debris', 'foam', 'takeoff', 'spice', 'mountain lake', 'tea', 'orchestra', 'spacecraft', 'counter', 'abbey', 'mountain', 'hydrangea', 'racer', 'orange tree', 'tide', 'cowboy hat', 'rapid', 'town', 'wild', 'herd', 'vein', 'driveway', 'jar', 'bark', 'illustration', 'horror film', 'corn', 'stroller', 'industry', 'mountain stream', 'gym', 'neckline', 'pan', 'client', 'spectator', 
'eggplant', 'camper', 'fawn', 'hoodie', 'meat', 'lemonade', 'food market', 'slum', 'comic book character', 'flower market', 'love', 'palace', 'gun', 'heel', 'shopping street', 'shooting basketball guard', 'family photo', 'rooftop', 'laundry basket', 'airport runway', 'horn', 'face mask', 'flight', 'appetizer', 'violet', 'country lane', 'cement', 'instrument', 'tv actor', 'spark', 'celebrity', 'award', 'country house', 'standing', 'auction', 'date', 'engagement', 'puck', 'advertisement', 'chair', 'zebra', 'driftwood', 'bumblebee', 'maple leaf', 'bonnet', 'orange', 'water tower', 'door', 'singer', 'floor plan', 'discussion', 'theatre', 'pilgrim', 'mug', 'branch', 'window sill', 'baseball pitcher', 'bakery', 'lollipop', 'basketball player', 'toilet paper', 'chalkboard', 'cabin', 'sign', 'night sky', 'cannon', 'fishing net', 'submarine', 'suit', 'fur coat', 'wine bottle', 'folder', 'street art', 'suspension bridge', 'evening sky', 'billboard', 'postage stamp', 'newspaper', 'transportation', 'surgeon', 'light', 'park', 'horizon', 'road', 'sand bar', 'trumpet', 'lounge', 'cloud forest', 'birthday celebration', 'balcony', 'anime', 'beehive', 'umbrella', 'goldfish', 'baseball cap', 'waterhole', 'ceiling', 'carousel', 'backpack', 'plant pot', 'atmosphere', 'sunflower field', 'spire', 'vision', 'woodpecker', 'chip', 'pool table', 'lotus flower', 'cone', 'humpback whale', 'reservoir', 'hunt', 'piano', 'plate', 'dining area', 'luggage', 'skier', 'dance floor', 'crow', 'stair', 'overpass', 'opera house', 'bear', 'jazz artist', 'water', 'vessel', 'cast', 'yard', 'cathedral', 'basketball hoop', 'graveyard', 'sound', 'berry', 'onlooker', 'fauna', 'birch tree', 'retail', 'hill', 'skeleton', 'journalist', 'frost', 'basket', 'nail', 'dusk', 'trash', 'dawn', 'clover', 'hen', 'volcano', 'basketball coach', 'home decor', 'charge', 'haircut', 'sense', 'university', 'lizard', 'daisy', 'tablet computer', 'grass field', 'prison', 'metal artist', 'bathroom mirror', 'window frame', 'chest', 
'flavor', 'pop country artist', 'market square', 'monkey', 'blog', 'deer', 'speech bubble', 'dog', 'independence day', 'girl', 'boy', 'tartan', 'furniture', 'appliance', 'office window', 'fish boat', 'sand box', 'tv sitcom', 'drama', 'sleigh', 'depression', 'paper towel', 'baseball', 'protestor', 'grape', 'wedding cake', 'invitation', 'accessory', 'pick', 'grandparent', 'racket', 'tea plantation', 'outdoors', 'egg', 'glass bowl', 'sun', 'organization', 'lion', 'panel', 'station', 'wallpaper', 'helicopter', 'salt', 'vanity', 'patio', 'lunch', 'street performer', 'mountain range', 'soup', 'bacon', 'power station', 'cantilever bridge', 'hummingbird', 'shirt', 'rope', 'hip', 'chalk', 'pendant', 'choir', 'tv', 'lichen', 'railway bridge', 'art gallery', 'bartender', 'wagon', 'baby elephant', 'accordion', 'horseshoe', 'building site', 'clutch', 'harvest', 'savanna', 'geranium', 'business woman', 'paddock', 'patch', 'beech tree', 'war', 'suburbs', 'hospital bed', 'motorcycle racer', 'moss', 'gravel', 'government agency', 'dollar bill', 'father', 'fjord', 'concert', 'nut', 'wedding photography', 'finish line', 'home plate', 'food', 'nose', 'thumb', 'village', 'dining room table', 'bumper', 'monster', 'blackberry', 'lime', 'conflict', 'gala', 'wallet', 'wrist', 'hug', 'mermaid', 'lava', 'lawyer', 'folk rock artist', 'arena', 'onion', 'toothbrush', 'fashion', 'perfume', 'flip', 'triangle', 'woodland', 'mail', 'grasshopper', 'studio', 'wood floor', 'den', 'racquet', 'cello', 'lemur', 'astronaut', 'glass table', 'blood', 'dvd', 'planter', 'silver', 'leash', 'master bedroom', 'forest', 'batter', 'shoe', 'engraving', 'opening', 'product', 'toe', 'cocktail', 'mallard duck', 'bike ride', 'oasis', 'wedding ring', 'cinematographer', 'holly', 'autograph', 'fence', 'ice cube', 'cove', 'pineapple', 'aurora', 'glass bead', 'produce', 'apartment building', 'cob', 'miniature', 'cockpit', 'flashlight', 'frog', 'sheep', 'groom', 'steel', 'watermelon', 'clip art', 'paper plate', 'ostrich', 
'contour', 'mural', 'cub', 'paisley bandanna', 'winery', 'turn', 'handle', 'satellite', 'post', 'pork', 'child', 'asphalt', 'grocery store', 'vulture', 'trolley', 'nightclub', 'brick', 'trailer', 'compass', 'cereal', 'cafe', 'cartoon character', 'sugar', 'fiction book', 'glass floor', 'umpire', 'guitar', 'hamster', 'protester', 'airplane', 'garment', 'blazer', 'railway line', 'wedding', 'shoe box', 'parking lot', 'construction', 'graduation ceremony', 'tram', 'telescope', 'copper', 'pain', 'autumn forest', 'guest house', 'partner', 'crayon', 'dip', 'boot', 'corridor', 'computer keyboard', 'hockey player', 'chicken coop', 'bus station', 'gathering', 'ankle', 'bunk bed', 'wood table', 'football coach', 'monarch', 'pharmacy', 'legging', 'mannequin', 'female', 'train track', 'stack', 'canopy', 'design element', 'grandmother', 'symbol', 'beach hut', 'zucchini', 'bomb', 'businessman', 'skyscraper', 'tongue', 'case', 'sparkle', 'highland', 'ballroom', 'prom', 'estate', 'customer', 'archipelago', 'cheese', 'debate', 'carriage', 'bulldozer', 'pumpkin', 'sitting room', 'gas station', 'wedding reception', 'camp', 'dog bed', 'tower', 'property', 'river bed', 'pop latin artist', 'fridge', 'wine glass', 'coast', 'beer', 'tow truck', 'fire truck', 'mountain bike', 'thigh', 'heron', 'boat ride', 'gondola', 'turquoise', 'lake', 'llama', 'kitty', 'tin', 'waiting room', 'coffee cup', 'socialite', 'guard', 'tap', 'waterway', 'forehead', 'list', 'erosion', 'box', 'sea lion', 'pollen', 'dam', 'wasp', 'salon', 'tennis tournament', 'flower box', 'aquarium', 'rain cloud', 'clothing store', 'lead singer', 'cupcake', 'tortoise', 'lettering', 'sport facility', 'dance', 'dog house', 'nature', 'football', 'rooster', 'footballer', 'railway track', 'crowd', 'fishing rod', 'silhouette', 'wind turbine', 'sari', 'bus window', 'cloud', 'charity', 'medal', 'yoga', 'event', 'veil', 'fashion menswear milan week', 'news', 'knife', 'print', 'screen tv', 'walnut', 'fungus', 'ice cream', 'computer mouse', 
'play', 'tribe', 'picture', 'video game', 'business card', 'music festival', 'rack', 'envelope', 'shower', 'dirt road', 'mine', 'oyster', 'monarch butterfly', 'dude', 'fruit salad', 'podium', 'fork', 'lace', 'test match', 'boulder', 'cricket player', 'staircase', 'peninsula', 'shopping', 'popcorn', 'oak', 'market stall', 'pine tree', 'mountaineer', 'student', 'closet', 'hood', 'handstand', 'centerpiece', 'insect', 'patient', 'makeover', 'tennis player', 'sheet', 'park bench', 'apple', 'organism', 'hook', 'turkey', 'tangerine', 'sibling', 'shopping mall', 'bird', 'scarf', 'smoothie', 'net', 'grass', 'napkin', 'ray', 'eyebrow', 'laptop keyboard', 'motorbike', 'woman hand', 'oven', 'book cover', 'easter egg', 'microwave', 'sand', 'snapshot', 'soccer ball', 'makeup', 'knight', 'bowling ball', 'shower curtain', 'flame', 'lightning', 'running', 'power plant', 'crib', 'cartoon', 'moat', 'fashion girl', 'wedding invitation', 'bottle', 'cliff', 'monastery', 'file photo', 'apartment', 'casino', 'cream', 'sweatshirt', 'storm', 'cruise', 'teddy bear', 'shovel', 'wind farm', 'writer', 'dock', 'professional', 'hotel room', 'job', 'monitor', 'donkey', 'pass', 'interview', 'duchess', 'mark', 'plank', 'beard', 'zombie', 'trio', 'channel', 'cricket team', 'windmill', 'vest', 'diagram', 'cable', 'winter scene', 'golden gate bridge', 'buffalo', 'studio portrait', 'pagoda', 'whiskey', 'freight train', 'kite', 'future', 'steam train', 'phone box', 'headset', 'wood', 'snowboarder', 'paper bag', 'slide', 'grapefruit', 'seating', 'morning', 'bronze sculpture', 'theatre actor', 'stump', 'jean', 'landmark', 'jam', 'waist', 'watercolor', 'hammock', 'light fixture', 'ice', 'basin', 'beverage', 'shelter', 'premiere', 'mound', 'ear', 'bronze', 'sunlight', 'street', 'energy', 'barn door', 'hike', 'fleet', 'claw', 'beach', 'pepperoni', 'bin', 'trainer', 'buffet', 'archive', 'toddler', 'referee', 'bay window', 'dove', 'production company', 'evening light', 'gate', 'farm', 'reed', 'fruit stand', 
'explorer', 'snow storm', 'throw pillow', 'button', 'display case', 'bookcase', 'lead', 'lipstick', 'basketball court', 'cargo', 'ensemble', 'pope', 'clock tower', 'teen', 'speaker', 'rat', 'laptop', 'ski', 'mess', 'stadium', 'ferry boat', 'bunny', 'waterfront', 'downtown', 'sink', 'press conference', 'dinner', 'condiment', 'thread', 'audience', 'grid', 'car', 'plastic', 'people', 'barbecue', 'pigeon', 'urinal', 'seagull', 'volunteer', 'hockey', 'fir tree', 'pollution', 'trial', 'collar', 'area', 'meeting room', 'circus', 'yogurt', 'orangutan', 'viaduct', 'comedian', 'drone', 'scissor', 'pop rock artist', 'biscuit', 'panda', 'water feature', 'air balloon', 'remote control', 'watercolor painting', 'show', 'walk', 'post office', 'bike path', 'rap gangsta artist', 'microphone', 'crack', 'sunset sky', 'glass', 'tv show', 'cartoon style', 'stripe', 'foyer', 'signal', 'calligraphy', 'bulb', 'gardener', 'coffee bean', 'spider', 'tapestry', 'city skyline', 'necklace', 'kitten', 'traveler', 'veteran', 'frosting', 'fry', 'tennis court', 'tank top', 'butterfly house', 'mist', 'drummer', 'water level', 'scale', 'baseball glove', 'music video performer', 'champagne', 'camping', 'clothing', 'water drop', 'telephone box', 'pen', 'morning mist', 'fire engine', 'porch', 'opening ceremony', 'style', 'palm tree', 'fashion show', 'universe', 'scratch', 'axe', 'ottoman', 'explosion', 'rib', 'boutique', 'game', 'cucumber', 'fruit', 'stone bridge', 'nature reserve', 'track', 'train window', 'punch', 'telephone pole', 'velvet', 'sauce', 'moon', 'contrast', 'flamingo', 'bat', 'vending machine', 'ship', 'equestrian', 'shade', 'comforter', 'pallet', 'sparrow', 'wii', 'glaze', 'grocery', 'steeple', 'soccer player', 'contract', 'advertising', 'runner', 'chimpanzee', 'world', 'seat', 'project', 'chihuahua', 'bubble', 'willow', 'pedestal', 'soul hip hop artist', 'curb', 'drawer', 'leaf', 'banner', 'launch party', 'coach', 'government', 'snowball', 'toy', 'portrait', 'doctor', 'whiteboard', 
'electronic', 'tiger', 'graffiti', 'column', 'nightstand', 'whistle', 'maxi dress', 'bench', 'wetsuit', 'bird feeder', 'football game', 'basketball', 'class', 'bathroom door', 'store window', 'text message', 'wreath', 'street view', 'binocular', 'pet', 'facade', 'drought', 'lemon', 'new year', 'night view', 'airplane window', 'specie', 'rule', 'jaw', 'wheat field', 'diet', 'pop artist', 'habitat', 'screenshot', 'scoreboard', 'shore', 'mane', 'quilt', 'ski lift', 'orchid', 'turban', 'christmas', 'airport', 'marina', 'glass door', 'glass bottle', 'restaurant', 'conductor', 'logo', 'sleep', 'tape', 'tomato', 'river bank', 'lilac', 'tooth', 'training', 'pottery', 'shop', 'steam engine', 'mason jar', 'base', 'procession', 'border', 'shoot', 'footprint', 'hotdog', 'bull', 'stocking', 'recreation', 'automobile model', 'design', 'country pop artist', 'river', 'retriever', 'department store', 'auditorium', 'sport car', 'supermarket', 'belt', 'cricket', 'window box', 'dress shirt', 'letter', 'residence', 'megaphone', 'pant', 'wildfire', 'bird nest', 'crab', 'swimsuit', 'candle', 'funeral', 'mill', 'national park', 'plant', 'cop', 'power line', 'perch', 'blue', 'finger', 'ferris wheel', 'globe', 'skateboard', 'helmet', 'movie theater', 'uniform', 'hammer', 'material', 'kid', 'well', 'butterfly', 'sideline', 'fashion fall show', 'planet earth', 'lift', 'male', 'sauna', 'gray', 'flour', 'sand sculpture', 'program', 'cabinet', 'infant', 'wheel', 'aircraft model', 'dough', 'garlic', 'skate', 'arrow', 'wrapping paper', 'ripple', 'lamp', 'iron', 'banknote', 'beaver', 'ferry', 'courtyard', 'bassist', 'countryside', 'steak', 'comfort', 'boxer', 'laundry room', 'campsite', 'brick building', 'golf', 'subway', 'headphone', 'fort', 'handbag', 'drum', 'flood', 'saddle', 'bass', 'labyrinth', 'needle', 'sun ray', 'app', 'menu', 'president', 'cardigan', 'dandelion', 'wetland', 'ice hockey player', 'number', 'city hall', 'fishing', 'portrait session', 'pug', 'key', 'art print', 'minister', 
'hurdle', 'emergency', 'painting artist', 'flag pole', 'evening', 'purse', 'recipe', 'golf ball', 'coloring book', 'mountain peak', 'senior', 'holiday', 'bud', 'cousin', 'pantry', 'lap', 'skin', 'flag', 'tissue paper', 'ridge', 'wire fence', 'surfer', 'climber', 'photograph', 'sewing machine', 'cooler', 'actress', 'apple tree', 'cancer', 'starfish', 'automobile make', 'dumbbell', 'brace', 'tunnel', 'window', 'paint artist', 'composition', 'school student', 'condo', 'convertible', 'cushion', 'selfie', 'territory', 'guide', 'tree', 'court', 'shrimp', 'stone house', 'dress', 'eyelash', 'juice', 'broccoli', 'chain', 'tourism', 'mountain top', 'concept car', 'film premiere', 'light bulb', 'cafeteria', 'badge', 'flower bed', 'theater', 'root', 'racecar driver', 'basketball boy game', 'glove', 'skyline', 'wall', 'glacier', 'airport terminal', 'bug', 'trim', 'railway station', 'briefcase', 'flat', 'fountain', 'person', 'lane', 'asparagus', 'art', 'lantern', 'dishwasher', 'director', 'snake', 'lecture', 'game controller', 'tree branch', 'pub', 'bathing suit', 'queue', 'belly', 'poppy', 'bow', 'pitcher', 'ice cream cone', 'cave', 'candy', 'road bridge', 'host', 'traffic jam', 'earring', 'file', 'foot', 'watermark overlay stamp', 'mailbox', 'supercar', 'railing', 'bedroom', 'seafood', 'waffle', 'bronze statue', 'plan', 'flow', 'marble', 'basketball game', 'automobile', 'scene', 'cypress tree', 'soldier', 'skateboarder', 'glass building', 'cherry tree', 'pump', 'grain', 'wildebeest', 'loop', 'frame', 'bathtub', 'saxophone', 'diver', 'stalk', 'lily', 'bead', 'alley', 'flock', 'family room', 'manufacturing', 'pointer', 'worker', 'navy', 'potato', 'teacher', 'photography', 'dolly', 'boardwalk', 'water fountain', 'athlete', 'side dish', 'bay', 'ice hockey', 'phone', 'hero', 'face', 'gold medal', 'blind', 'swamp', 'researcher', 'swim', 'meatball', 'iguana', 'leather jacket', 'jellyfish', 'site', 'smoke', 'traffic signal', 'melon', 'beetle', 'calculator', 'skirt', 'plantation', 
'sculptor', 'barrier', 'catcher', 'security guard', 'sketch', 'awning', 'steering wheel', 'mountain view', 'bus stop', 'pool', 'leg', 'spotlight', 'apron', 'mineral', 'inlet', 'sleeve', 'torch', 'emotion', 'march', 'police officer', 'performance', 'lamp post', 'fishing boat', 'summer', 'presentation', 'saucer', 'suitcase', 'supermodel', 'goalkeeper', 'shrub', 'rock artist', 'document', 'beach house', 'man', 'blue artist', 'cigar', 'railroad track', 'gown', 'mosaic', 'bungalow', 'alphabet', 'baseball field', 'shed', 'pedestrian', 'rail', 'soap', 'kitchen counter', 'dessert', 'dunk', 'blossom', 'conversation', 'fruit market', 'glass jar', 'military', 'beer bottle', 'photographer', 'tennis racket', 'competition', 'escalator', 'bell tower', 'stilt', 'ballerina', 'television', 'feather', 'fence post', 'rear', 'dahlia', 'red carpet', 'tub', 'hole', 'fortress', 'pack', 'telephone', 'cardboard', 'city park', 'platform', 'college student', 'arch bridge', 'wind', 'blender', 'bloom', 'ice rink', 'birthday', 'raven', 'fairy', 'embankment', 'hall', 'flower shop', 'suburb', 'barrel', 'biker', 'steam', 'dragonfly', 'formation', 'electricity', 'business people', 'symmetry', 'walkway', 'fisherman', 'gas mask', 'loch', 'youth', 'hanger', 'dot', 'fish', 'street market', 'animation film', 'crime fiction film', 'boar', 'emblem', 'halloween costume', 'kangaroo', 'couple', 'spoon', 'squirrel', 'neon sign', 'sky', 'office desk', 'beauty salon', 'breakwater', 'fashion look', 'toaster', 'author', 'news conference', 'outdoor', 'canoe', 'dragon', 'tool', 'shopping centre', 'ladybug', 'swimming pool', 'landscaping', 'ski pole', 'red', 'truck', 'fly', 'temple', 'level', 'sunday', 'railroad bridge', 'car mirror', 'lawn mower', 'flute', 'aircraft carrier', 'fashion menswear london week', 'sunshine', 'tile floor', 'skull', 'fossil', 'flower arrangement', 'diaper', 'sea turtle', 'cherry blossom', 'fireman', 'shack', 'lens', 'waiter', 'animal', 'basement', 'snow', 'autumn park', 'glass box', 'kick', 
'head', 'anniversary', 'vine', 'back', 'paper lantern', 'fish tank', 'cellphone', 'silk', 'coral', 'notebook', 'photo', 'gazebo', 'ketchup', 'driver', 'farmer', 'bonfire', 'chestnut', 'photoshoot', 'football field', 'olive tree', 'pheasant', 'sandal', 'toilet', 'fireplace', 'music', 'deity', 'fish market', 'fig', 'bell', 'neck', 'grave', 'villa', 'cyclist', 'crate', 'grey', 'asphalt road', 'soccer', 'hostel', 'municipality', 'courthouse', 'roof', 'end table', 'pot', 'sedan', 'structure', 'folk artist', 'sport', 'sport team', 'protest', 'syringe', 'fashion designer', 'jersey', 'heart shape', 'kayak', 'stare', 'sit with', 'direct', 'read', 'photograph', 'spin', 'teach', 'laugh', 'carve', 'grow on', 'warm', 'watch', 'stretch', 'smell', 'decorate', 'shine', 'light', 'dance', 'send', 'park', 'chase', 'collect', 'lead', 'kiss', 'lead to', 'lick', 'smile', 'cheer', 'sit', 'point', 'block', 'rock', 'drop', 'cut', 'ski', 'wrap', 'lose', 'serve', 'provide', 'sleep', 'dress', 'embrace', 'burn', 'pack', 'stir', 'create', 'touch', 'wash', 'stick', 'reveal', 'shop', 'train', 'paint', 'groom', 'hunt', 'bloom', 'play', 'pay', 'brush', 'shoot', 'hold', 'picture', 'carry', 'sip', 'contain', 'turn', 'pour', 'pitch', 'give', 'add', 'blow', 'look in', 'show', 'walk', 'illuminate', 'kneel', 'cover', 'drag', 'post', 'present', 'fit', 'operate', 'fish', 'race', 'write', 'deliver', 'peel', 'push', 'run', 'sit around', 'buy', 'jump', 'walk on', 'attend', 'clean', 'sell', 'ride on', 'mount', 'host', 'dry', 'plant', 'sing', 'row', 'shake', 'perch', 'ride', 'fight', 'skateboard', 'live', 'call', 'surround', 'practice', 'play on', 'work on', 'step', 'relax', 'hit', 'fall in', 'flow', 'greet', 'launch', 'wear', 'hang on', 'drive', 'sit in', 'break', 'learn', 'fly', 'connect', 'display', 'locate', 'compete', 'go for', 'sail', 'lift', 'toast', 'help', 'run on', 'reflect', 'pose', 'scratch', 'frame', 'dribble', 'herd', 'enter', 'exit', 'place', 'inspect', 'build', 'pick', 'fill', 'grind', 'skate', 
'offer', 'float', 'sit by', 'stand', 'release', 'rest', 'singe', 'climb', 'tie', 'mark', 'lay', 'stand around', 'capture', 'set', 'land', 'swinge', 'run in', 'kick', 'lean', 'head', 'sign', 'approach', 'swim', 'close', 'crash', 'control', 'fall', 'remove', 'repair', 'open', 'appear', 'travel', 'load', 'miss', 'check', 'surf', 'moor', 'smoke', 'drink', 'board', 'seat', 'feed', 'rise', 'sit on', 'swing', 'grow', 'strike', 'date', 'slide', 'share', 'graze', 'jump in', 'lie', 'extrude', 'roll', 'move', 'gather', 'eat', 'pull', 'run through', 'squeeze', 'lay on', 'draw', 'play with', 'wave', 'assemble', 'perform', 'march', 'score', 'attach', 'adjust', 'hang', 'hug', 'sleep on', 'throw', 'live in', 'talk', 'pet', 'work', 'run with', 'see', 'flip', 'catch', 'cook', 'receive', 'celebrate', 'look', 'classic', 'bridal', 'indoor', 'industrial', 'teenage', 'mini', 'grassy', 'aged', 'long', 'warm', 'light', 'handsome', 'happy', 'three', 'pregnant', 'circular', 'urban', 'silver', 'ceramic', '3d', 'green', 'blonde', 'golden', 'dark', 'tropical', 'ripe', 'deep', 'fat', 'musical', 'giant', 'medical', 'medieval', 'bare', 'stunning', 'bold', 'geographical', 'huge', 'plastic', 'foggy', 'stormy', 'gothic', 'biological', 'empty', 'clear', 'antique', 'pink', 'steep', 'brown', 'striped', 'aerial', 'rainy', 'cool', 'flying', 'commercial', 'purple', 'trendy', 'blank', 'haired', 'dead', 'wooden', 'flat', 'high', 'beige', 'panoramic', 'angry', 'dozen', 'rural', 'solar', 'big', 'small', 'stained', 'thick', 'many', 'fresh', 'clean', 'strong', 'abstract', 'crowded', 'retro', 'dry', 'gorgeous', 'martial', 'modern', 'blue', 'cloudy', 'low', 'four', 'outdoor', 'single', 'much', 'beautiful', 'snowy', 'pretty', 'new', 'short', 'sunny', 'closed', 'rocky', 'red', 'two', 'double', 'male', 'gray', 'five', 'colorful', 'automotive', 'various', 'one', 'old', 'rusty', 'tall', 'wild', 'narrow', 'natural', 'several', 'frozen', 'textured', 'lush', 'young', 'hot', 'mixed', 'white', 'float', 'quiet', 'round', 
'bright', 'religious', 'female', 'historical', 'shiny', 'traditional', 'tourist', 'yellow', 'bald', 'coastal', 'lovely', 'little', 'broken', 'romantic', 'wide', 'royal', 'rich', 'open', 'cute', 'ancient', 'cold', 'political', 'elderly', 'gold', 'full', 'rustic', 'metallic', 'floral', 'sad', 'wet', 'fancy', 'senior', 'tiny', 'stylish', 'large', 'frosty', 'orange', 'transparent', 'electronic', 'shallow', 'scared', 'armed', 'dirty', 'historic', 'black', 'few', 'windy', 'some', 'square', 'ornamental', 'sandy', 'thin'] tra_array = np.array(tra_array) ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/tag2Text/vit.py ================================================ ''' * Copyright (c) 2022, salesforce.com, inc. * All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause * By Junnan Li * Based on timm code base * https://github.com/rwightman/pytorch-image-models/tree/master/timm ''' import torch import torch.nn as nn import torch.nn.functional as F from functools import partial from timm.models.vision_transformer import _cfg, PatchEmbed from timm.models.registry import register_model from timm.models.layers import trunc_normal_, DropPath from timm.models.helpers import named_apply, adapt_input_conv from fairscale.nn.checkpoint.checkpoint_activations import checkpoint_wrapper class Mlp(nn.Module): """ MLP as used in Vision Transformer, MLP-Mixer and related networks """ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) self.act = act_layer() self.fc2 = nn.Linear(hidden_features, out_features) self.drop = nn.Dropout(drop) def forward(self, x): x = self.fc1(x) x = self.act(x) x = self.drop(x) x = self.fc2(x) x = 
class Attention(nn.Module):
    """Multi-head self-attention that can optionally record its attention map and gradients."""

    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        """
        Args:
            dim (int): channel width of the input tokens
            num_heads (int): number of attention heads; channels are split evenly between them
            qkv_bias (bool): add a bias to the fused query/key/value projection if True
            qk_scale (float): overrides the default scale of head_dim ** -0.5 when truthy
            attn_drop (float): dropout probability applied to the attention weights
            proj_drop (float): dropout probability applied after the output projection
        """
        super().__init__()
        self.num_heads = num_heads
        per_head_width = dim // num_heads
        # NOTE: a falsy qk_scale (None or 0) selects the default scale; a manual value can be
        # supplied to stay compatible with weights trained under a different scale.
        self.scale = qk_scale if qk_scale else per_head_width ** -0.5

        # One fused linear layer yields queries, keys and values together.
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        # Populated only by forward(..., register_hook=True).
        self.attn_gradients = None
        self.attention_map = None

    def save_attn_gradients(self, attn_gradients):
        """Store the gradient of the attention weights (used as a tensor hook)."""
        self.attn_gradients = attn_gradients

    def get_attn_gradients(self):
        """Return the most recently stored attention gradient, or None."""
        return self.attn_gradients

    def save_attention_map(self, attention_map):
        """Store the attention weights of the current forward pass."""
        self.attention_map = attention_map

    def get_attention_map(self):
        """Return the most recently stored attention weights, or None."""
        return self.attention_map

    def forward(self, x, register_hook=False):
        """Apply self-attention to ``x`` of shape (B, N, C) and return a tensor of the same shape.

        When ``register_hook`` is True the attention weights are saved on the module and a
        backward hook is attached so that their gradient is saved as well.
        """
        batch, tokens, channels = x.shape
        fused = self.qkv(x).reshape(batch, tokens, 3, self.num_heads, channels // self.num_heads)
        # -> (3, B, heads, N, head_dim); split along the leading axis
        query, key, value = fused.permute(2, 0, 3, 1, 4).unbind(0)

        weights = (query @ key.transpose(-2, -1)) * self.scale
        weights = self.attn_drop(weights.softmax(dim=-1))

        if register_hook:
            self.save_attention_map(weights)
            weights.register_hook(self.save_attn_gradients)

        mixed = (weights @ value).transpose(1, 2).reshape(batch, tokens, channels)
        return self.proj_drop(self.proj(mixed))
class VisionTransformer(nn.Module):
    """ Vision Transformer
    A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`  -
        https://arxiv.org/abs/2010.11929

    This variant returns the normalized token sequence (class token first).
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
                 num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None, representation_size=None,
                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=None,
                 use_grad_checkpointing=False, ckpt_layer=0):
        """
        Args:
            img_size (int, tuple): input image size
            patch_size (int, tuple): patch size
            in_chans (int): number of input channels
            num_classes (int): number of classes for classification head (not used by this constructor)
            embed_dim (int): embedding dimension
            depth (int): depth of transformer
            num_heads (int): number of attention heads
            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
            qkv_bias (bool): enable bias for qkv if True
            qk_scale (float): override default qk scale of head_dim ** -0.5 if set
            representation_size (Optional[int]): accepted for signature compatibility (not used by this constructor)
            drop_rate (float): dropout rate
            attn_drop_rate (float): attention dropout rate
            drop_path_rate (float): stochastic depth rate
            norm_layer: (nn.Module): normalization layer
            use_grad_checkpointing (bool): enable gradient checkpointing for the last `ckpt_layer` blocks
            ckpt_layer (int): number of trailing blocks that get gradient checkpointing
        """
        super().__init__()
        # num_features is kept equal to embed_dim for consistency with other models
        self.embed_dim = embed_dim
        self.num_features = embed_dim
        if not norm_layer:
            norm_layer = partial(nn.LayerNorm, eps=1e-6)

        self.patch_embed = PatchEmbed(
            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
        patch_count = self.patch_embed.num_patches

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        # one extra position for the class token
        self.pos_embed = nn.Parameter(torch.zeros(1, patch_count + 1, embed_dim))
        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth decay rule: the drop-path rate rises linearly from 0 to drop_path_rate
        layer_drop_paths = [rate.item() for rate in torch.linspace(0, drop_path_rate, depth)]
        layers = []
        for layer_idx, layer_drop_path in enumerate(layer_drop_paths):
            layers.append(Block(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=layer_drop_path,
                norm_layer=norm_layer,
                # only the final `ckpt_layer` blocks are checkpointed
                use_grad_checkpointing=(use_grad_checkpointing and layer_idx >= depth - ckpt_layer),
            ))
        self.blocks = nn.ModuleList(layers)
        self.norm = norm_layer(embed_dim)

        trunc_normal_(self.pos_embed, std=.02)
        trunc_normal_(self.cls_token, std=.02)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise one sub-module: truncated-normal Linear weights, zero biases, unit LayerNorm scale."""
        if isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
            return
        if not isinstance(m, nn.Linear):
            return
        trunc_normal_(m.weight, std=.02)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)

    @torch.jit.ignore
    def no_weight_decay(self):
        """Return the names of the parameters that should be excluded from weight decay."""
        return {'pos_embed', 'cls_token'}

    def forward(self, x, register_blk=-1):
        """Encode images into a normalized token sequence.

        Args:
            x: image batch fed to the patch embedding.
            register_blk (int): index of the block that is called with its hook flag set to True;
                the default -1 matches no block.
        """
        batch = x.shape[0]
        patches = self.patch_embed(x)

        # broadcast the single learned class token over the batch (impl borrowed from Phil Wang)
        cls_tokens = self.cls_token.expand(batch, -1, -1)
        tokens = torch.cat((cls_tokens, patches), dim=1)

        # only as many positions as there are tokens are used
        tokens = self.pos_drop(tokens + self.pos_embed[:, :tokens.size(1), :])

        for blk_idx, blk in enumerate(self.blocks):
            tokens = blk(tokens, register_blk == blk_idx)
        return self.norm(tokens)

    @torch.jit.ignore()
    def load_pretrained(self, checkpoint_path, prefix=''):
        """Load weights from `checkpoint_path` into this model via `_load_weights`."""
        _load_weights(self, checkpoint_path, prefix)
= w.transpose([2, 0, 1]) elif w.ndim == 2: w = w.transpose([1, 0]) return torch.from_numpy(w) w = np.load(checkpoint_path) if not prefix and 'opt/target/embedding/kernel' in w: prefix = 'opt/target/' if hasattr(model.patch_embed, 'backbone'): # hybrid backbone = model.patch_embed.backbone stem_only = not hasattr(backbone, 'stem') stem = backbone if stem_only else backbone.stem stem.conv.weight.copy_(adapt_input_conv(stem.conv.weight.shape[1], _n2p(w[f'{prefix}conv_root/kernel']))) stem.norm.weight.copy_(_n2p(w[f'{prefix}gn_root/scale'])) stem.norm.bias.copy_(_n2p(w[f'{prefix}gn_root/bias'])) if not stem_only: for i, stage in enumerate(backbone.stages): for j, block in enumerate(stage.blocks): bp = f'{prefix}block{i + 1}/unit{j + 1}/' for r in range(3): getattr(block, f'conv{r + 1}').weight.copy_(_n2p(w[f'{bp}conv{r + 1}/kernel'])) getattr(block, f'norm{r + 1}').weight.copy_(_n2p(w[f'{bp}gn{r + 1}/scale'])) getattr(block, f'norm{r + 1}').bias.copy_(_n2p(w[f'{bp}gn{r + 1}/bias'])) if block.downsample is not None: block.downsample.conv.weight.copy_(_n2p(w[f'{bp}conv_proj/kernel'])) block.downsample.norm.weight.copy_(_n2p(w[f'{bp}gn_proj/scale'])) block.downsample.norm.bias.copy_(_n2p(w[f'{bp}gn_proj/bias'])) embed_conv_w = _n2p(w[f'{prefix}embedding/kernel']) else: embed_conv_w = adapt_input_conv( model.patch_embed.proj.weight.shape[1], _n2p(w[f'{prefix}embedding/kernel'])) model.patch_embed.proj.weight.copy_(embed_conv_w) model.patch_embed.proj.bias.copy_(_n2p(w[f'{prefix}embedding/bias'])) model.cls_token.copy_(_n2p(w[f'{prefix}cls'], t=False)) pos_embed_w = _n2p(w[f'{prefix}Transformer/posembed_input/pos_embedding'], t=False) if pos_embed_w.shape != model.pos_embed.shape: pos_embed_w = resize_pos_embed( # resize pos embedding when different size from pretrained weights pos_embed_w, model.pos_embed, getattr(model, 'num_tokens', 1), model.patch_embed.grid_size) model.pos_embed.copy_(pos_embed_w) 
def interpolate_pos_embed(pos_embed_checkpoint, visual_encoder):
    """Resize a checkpoint position embedding to the patch grid of `visual_encoder`.

    The grid is treated as square on both sides. The leading extra tokens (class/dist tokens)
    are copied unchanged; only the per-patch positions are bicubically interpolated. When the
    grid sides already agree, the input tensor itself is returned.
    """
    channels = pos_embed_checkpoint.shape[-1]
    target_patches = visual_encoder.patch_embed.num_patches
    # everything in the encoder's embedding that is not a patch position is an extra token
    extra_count = visual_encoder.pos_embed.shape[-2] - target_patches

    # height (== width) of the checkpoint grid and of the encoder grid
    source_side = int((pos_embed_checkpoint.shape[-2] - extra_count) ** 0.5)
    target_side = int(target_patches ** 0.5)

    if source_side == target_side:
        return pos_embed_checkpoint

    # class_token and dist_token are kept unchanged
    kept_tokens = pos_embed_checkpoint[:, :extra_count]
    # only the position tokens are interpolated: (1, S*S, C) -> (1, C, S, S) for interpolate()
    grid = pos_embed_checkpoint[:, extra_count:]
    grid = grid.reshape(-1, source_side, source_side, channels).permute(0, 3, 1, 2)
    grid = torch.nn.functional.interpolate(
        grid, size=(target_side, target_side), mode='bicubic', align_corners=False)
    # back to a flat token sequence: (1, C, T, T) -> (1, T*T, C)
    grid = grid.permute(0, 2, 3, 1).flatten(1, 2)

    resized = torch.cat((kept_tokens, grid), dim=1)
    print('reshape position embedding from %d to %d'%(source_side ** 2,target_side ** 2))
    return resized
def _resolve_split(is_train, test_mode, data_path):
    """Return the ``(mode, annotation_csv_path)`` pair for the requested split."""
    # Identity checks are deliberate: only a literal True selects train/test,
    # anything else falls through to validation.
    if is_train is True:
        mode, csv_name = 'train', 'train.csv'
    elif test_mode is True:
        mode, csv_name = 'test', 'test.csv'
    else:
        mode, csv_name = 'validation', 'val.csv'
    return mode, os.path.join(data_path, csv_name)


def build_dataset(is_train, test_mode, args):
    """Build the video classification dataset selected by ``args.data_set``.

    Args:
        is_train: ``True`` selects the training split (``train.csv``).
        test_mode: ``True`` (when not training) selects the test split (``test.csv``);
            otherwise the validation split (``val.csv``) is used. It also switches
            ``num_crop`` from 1 to 3.
        args: namespace providing ``data_set``, ``data_path``, ``prefix``, ``split``,
            ``num_frames``, ``sampling_rate``, ``test_num_segment``, ``test_num_crop``,
            ``input_size``, ``short_side_size``, ``nb_classes`` and, for SSV2, ``use_decord``.

    Returns:
        ``(dataset, nb_classes)``.

    Raises:
        NotImplementedError: ``args.data_set`` is not one of the supported names.
        AssertionError: the dataset's class count differs from ``args.nb_classes``.
    """
    print(f'Use Dataset: {args.data_set}')

    data_set = args.data_set
    if data_set in ['Kinetics', 'Kinetics_sparse', 'mitv1_sparse']:
        dataset_cls = VideoClsDataset_sparse if 'sparse' in data_set else VideoClsDataset
        nb_classes = args.nb_classes
        dense_sampling = True
    elif data_set == 'SSV2':
        dataset_cls = SSVideoClsDataset if args.use_decord else SSRawFrameClsDataset
        nb_classes = 174
        dense_sampling = False
    elif data_set == 'UCF101':
        dataset_cls = VideoClsDataset
        nb_classes = 101
        dense_sampling = True
    elif data_set == 'HMDB51':
        dataset_cls = VideoClsDataset
        nb_classes = 51
        dense_sampling = True
    else:
        print(f'Wrong: {args.data_set}')
        raise NotImplementedError(f'Unsupported data_set: {args.data_set!r}')

    if dense_sampling:
        # one segment of num_frames frames taken every sampling_rate frames
        sampling_kwargs = dict(
            clip_len=args.num_frames,
            frame_sample_rate=args.sampling_rate,
            num_segment=1,
        )
    else:
        # SSV2: num_frames segments of a single frame each
        sampling_kwargs = dict(clip_len=1, num_segment=args.num_frames)

    mode, anno_path = _resolve_split(is_train, test_mode, args.data_path)
    dataset = dataset_cls(
        anno_path=anno_path,
        prefix=args.prefix,
        split=args.split,
        mode=mode,
        test_num_segment=args.test_num_segment,
        test_num_crop=args.test_num_crop,
        num_crop=1 if not test_mode else 3,
        keep_aspect_ratio=True,
        crop_size=args.input_size,
        short_side_size=args.short_side_size,
        new_height=256,
        new_width=320,
        args=args,
        **sampling_kwargs,
    )

    assert nb_classes == args.nb_classes, (
        f'{data_set} has {nb_classes} classes but args.nb_classes is {args.nb_classes}')
    print("Number of the class = %d" % args.nb_classes)

    return dataset, nb_classes
class VideoClsDataset(Dataset):
    """Load your own video classification dataset.

    Samples are listed in an annotation file with one ``<video path><split><label>`` row per
    video. Depending on ``mode`` an item is an augmented training clip, a center-cropped
    validation clip, or one (temporal chunk, spatial crop) view of a test video.
    """

    def __init__(self, anno_path, prefix='', split=' ', mode='train', clip_len=8,
                 frame_sample_rate=2, crop_size=224, short_side_size=256,
                 new_height=256, new_width=340, keep_aspect_ratio=True,
                 num_segment=1, num_crop=1, test_num_segment=10, test_num_crop=3,
                 args=None):
        """
        Args:
            anno_path: annotation file; column 0 is the video path, column 1 the label.
            prefix: joined in front of every video path.
            split: column delimiter of the annotation file.
            mode: 'train', 'validation' or 'test'.
            clip_len: number of frames per clip.
            frame_sample_rate: stride between sampled frames.
            crop_size: side of the square crop returned by train/validation.
            short_side_size: resize target used by validation/test.
            new_height, new_width: decode size when ``keep_aspect_ratio`` is False.
            keep_aspect_ratio: decode at native resolution if True.
            num_segment: must be 1.
            num_crop: stored but not read by this class.
            test_num_segment: temporal chunks per test video.
            test_num_crop: spatial crops per test chunk.
            args: namespace with the augmentation options; required in 'train' mode
                (``reprob`` is read here, further fields in ``_aug_frame``).
        """
        self.anno_path = anno_path
        self.prefix = prefix
        self.split = split
        self.mode = mode
        self.clip_len = clip_len
        self.frame_sample_rate = frame_sample_rate
        self.crop_size = crop_size
        self.short_side_size = short_side_size
        self.new_height = new_height
        self.new_width = new_width
        self.keep_aspect_ratio = keep_aspect_ratio
        self.num_segment = num_segment
        self.test_num_segment = test_num_segment
        self.num_crop = num_crop
        self.test_num_crop = test_num_crop
        self.args = args
        self.aug = False
        self.rand_erase = False
        assert num_segment == 1
        if self.mode in ['train']:
            self.aug = True
            if self.args.reprob > 0:
                self.rand_erase = True
        if VideoReader is None:
            raise ImportError("Unable to import `decord` which is required to read videos.")

        import pandas as pd
        cleaned = pd.read_csv(self.anno_path, header=None, delimiter=self.split)
        self.dataset_samples = list(cleaned.values[:, 0])
        self.label_array = list(cleaned.values[:, 1])

        # Optional remote-storage client; only available when petrel_client is installed.
        self.client = None
        if has_client:
            self.client = Client('~/petreloss.conf')

        if mode == 'train':
            # training transforms are built per item in _aug_frame
            pass
        elif mode == 'validation':
            self.data_transform = Compose([
                Resize(self.short_side_size, interpolation='bilinear'),
                CenterCrop(size=(self.crop_size, self.crop_size)),
                ClipToTensor(),
                Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225])
            ])
        elif mode == 'test':
            self.data_resize = Compose([
                Resize(size=(short_side_size), interpolation='bilinear')
            ])
            self.data_transform = Compose([
                ClipToTensor(),
                Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225])
            ])
            # Expand every video into test_num_segment x test_num_crop views.
            self.test_seg = []
            self.test_dataset = []
            self.test_label_array = []
            for ck in range(self.test_num_segment):
                for cp in range(self.test_num_crop):
                    for sample, sample_label in zip(self.dataset_samples, self.label_array):
                        self.test_label_array.append(sample_label)
                        self.test_dataset.append(sample)
                        self.test_seg.append((ck, cp))

    def __getitem__(self, index):
        """Return one item; the tuple layout depends on ``self.mode``.

        * train: ``(clip, label, index, {})``, or three lists of ``args.num_sample`` entries
          plus ``{}`` when ``args.num_sample > 1``.
        * validation: ``(clip, label, video_name)``.
        * test: ``(clip, label, video_name, chunk_nb, split_nb)``.

        A video that fails to load is replaced by a randomly drawn one (with a warning).

        Raises:
            NameError: ``self.mode`` is not one of the three supported modes.
        """
        if self.mode == 'train':
            args = self.args
            scale_t = 1

            sample = self.dataset_samples[index]
            buffer = self.loadvideo_decord(sample, sample_rate_scale=scale_t)  # T H W C
            # NOTE(review): this resampling loop never ends if no video can be decoded.
            while len(buffer) == 0:
                warnings.warn("video {} not correctly loaded during training".format(sample))
                index = np.random.randint(self.__len__())
                sample = self.dataset_samples[index]
                buffer = self.loadvideo_decord(sample, sample_rate_scale=scale_t)

            if args.num_sample > 1:
                # several independently augmented views of the same decoded clip
                frame_list = []
                label_list = []
                index_list = []
                for _ in range(args.num_sample):
                    new_frames = self._aug_frame(buffer, args)
                    label = self.label_array[index]
                    frame_list.append(new_frames)
                    label_list.append(label)
                    index_list.append(index)
                return frame_list, label_list, index_list, {}
            else:
                buffer = self._aug_frame(buffer, args)

            return buffer, self.label_array[index], index, {}

        elif self.mode == 'validation':
            sample = self.dataset_samples[index]
            buffer = self.loadvideo_decord(sample)
            while len(buffer) == 0:
                warnings.warn("video {} not correctly loaded during validation".format(sample))
                index = np.random.randint(self.__len__())
                sample = self.dataset_samples[index]
                buffer = self.loadvideo_decord(sample)
            buffer = self.data_transform(buffer)
            return buffer, self.label_array[index], sample.split("/")[-1].split(".")[0]

        elif self.mode == 'test':
            sample = self.test_dataset[index]
            chunk_nb, split_nb = self.test_seg[index]
            buffer = self.loadvideo_decord(sample, chunk_nb=chunk_nb)

            while len(buffer) == 0:
                warnings.warn("video {}, temporal {}, spatial {} not found during testing".format(\
                    str(self.test_dataset[index]), chunk_nb, split_nb))
                index = np.random.randint(self.__len__())
                sample = self.test_dataset[index]
                chunk_nb, split_nb = self.test_seg[index]
                buffer = self.loadvideo_decord(sample, chunk_nb=chunk_nb)

            buffer = self.data_resize(buffer)
            if isinstance(buffer, list):
                buffer = np.stack(buffer, 0)

            # Slide a short_side_size window along the longer spatial side; with a single
            # crop the window is centered.
            if self.test_num_crop == 1:
                spatial_step = 1.0 * (max(buffer.shape[1], buffer.shape[2]) - self.short_side_size) / 2
                spatial_start = int(spatial_step)
            else:
                spatial_step = 1.0 * (max(buffer.shape[1], buffer.shape[2]) - self.short_side_size) \
                                    / (self.test_num_crop - 1)
                spatial_start = int(split_nb * spatial_step)
            if buffer.shape[1] >= buffer.shape[2]:
                buffer = buffer[:, spatial_start:spatial_start + self.short_side_size, :, :]
            else:
                buffer = buffer[:, :, spatial_start:spatial_start + self.short_side_size, :]

            buffer = self.data_transform(buffer)
            return buffer, self.test_label_array[index], sample.split("/")[-1].split(".")[0], \
                   chunk_nb, split_nb
        else:
            raise NameError('mode {} unkown'.format(self.mode))

    def _aug_frame(
        self,
        buffer,
        args,
    ):
        """Apply the training augmentation pipeline to a decoded clip (T H W C) and return C T H W."""
        aug_transform = create_random_augment(
            input_size=(self.crop_size, self.crop_size),
            auto_augment=args.aa,
            interpolation=args.train_interpolation,
        )

        buffer = [
            transforms.ToPILImage()(frame) for frame in buffer
        ]

        buffer = aug_transform(buffer)

        buffer = [transforms.ToTensor()(img) for img in buffer]
        buffer = torch.stack(buffer)  # T C H W
        buffer = buffer.permute(0, 2, 3, 1)  # T H W C

        # T H W C
        buffer = tensor_normalize(
            buffer, [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
        )
        # T H W C -> C T H W.
        buffer = buffer.permute(3, 0, 1, 2)
        # Perform data augmentation.
        scl, asp = (
            [0.08, 1.0],
            [0.75, 1.3333],
        )

        buffer = spatial_sampling(
            buffer,
            spatial_idx=-1,
            min_scale=256,
            max_scale=320,
            crop_size=self.crop_size,
            # SSV2 is not flipped
            random_horizontal_flip=False if args.data_set == 'SSV2' else True,
            inverse_uniform_sampling=False,
            aspect_ratio=asp,
            scale=scl,
            motion_shift=False
        )

        if self.rand_erase:
            erase_transform = RandomErasing(
                args.reprob,
                mode=args.remode,
                max_count=args.recount,
                num_splits=args.recount,
                device="cpu",
            )
            # the eraser is applied with the first two axes swapped, then swapped back
            buffer = buffer.permute(1, 0, 2, 3)
            buffer = erase_transform(buffer)
            buffer = buffer.permute(1, 0, 2, 3)

        return buffer

    def loadvideo_decord(self, sample, sample_rate_scale=1, chunk_nb=0):
        """Load video content using Decord.

        Args:
            sample: video path relative to ``self.prefix``.
            sample_rate_scale: keep every n-th sampled index (train/validation only).
            chunk_nb: index of the temporal chunk to read in 'test' mode.

        Returns:
            The decoded frames as a numpy array, or an empty list when the video cannot be
            read or sampled (the failure is printed, not raised).
        """
        fname = os.path.join(self.prefix, sample)

        try:
            # NOTE(review): the two branches test different remote prefixes ('s3' vs 's3:');
            # kept as-is because the intended URI scheme is not visible here - confirm.
            if self.keep_aspect_ratio:
                if fname.startswith('s3'):
                    video_bytes = self.client.get(fname)
                    vr = VideoReader(io.BytesIO(video_bytes),
                                     num_threads=1,
                                     ctx=cpu(0))
                else:
                    vr = VideoReader(fname, num_threads=1, ctx=cpu(0))
            else:
                if fname.startswith('s3:'):
                    video_bytes = self.client.get(fname)
                    vr = VideoReader(io.BytesIO(video_bytes),
                                     width=self.new_width,
                                     height=self.new_height,
                                     num_threads=1,
                                     ctx=cpu(0))
                else:
                    vr = VideoReader(fname, width=self.new_width, height=self.new_height,
                                     num_threads=1, ctx=cpu(0))

            # handle temporal segments
            converted_len = int(self.clip_len * self.frame_sample_rate)
            seg_len = len(vr) // self.num_segment

            if self.mode == 'test':
                # Spread the chunk starts evenly over the video. With a single chunk there
                # is nothing to spread (and dividing by test_num_segment - 1 would fail).
                if self.test_num_segment > 1:
                    temporal_step = max(1.0 * (len(vr) - converted_len) / (self.test_num_segment - 1), 0)
                else:
                    temporal_step = 0.0
                temporal_start = int(chunk_nb * temporal_step)

                bound = min(temporal_start + converted_len, len(vr))
                all_index = [x for x in range(temporal_start, bound, self.frame_sample_rate)]
                # pad short clips by repeating the last frame
                while len(all_index) < self.clip_len:
                    all_index.append(all_index[-1])
                vr.seek(0)
                buffer = vr.get_batch(all_index).asnumpy()
                return buffer

            all_index = []
            for i in range(self.num_segment):
                if seg_len <= converted_len:
                    # video shorter than the clip window: take what exists, pad with the last frame
                    index = np.linspace(0, seg_len, num=seg_len // self.frame_sample_rate)
                    index = np.concatenate((index, np.ones(self.clip_len - seg_len // self.frame_sample_rate) * seg_len))
                    index = np.clip(index, 0, seg_len - 1).astype(np.int64)
                else:
                    if self.mode == 'validation':
                        # NOTE(review): this end index can be smaller than converted_len, which
                        # makes str_idx negative - confirm the intended validation window.
                        end_idx = (seg_len - converted_len) // 2
                    else:
                        end_idx = np.random.randint(converted_len, seg_len)
                    str_idx = end_idx - converted_len
                    index = np.linspace(str_idx, end_idx, num=self.clip_len)
                    index = np.clip(index, str_idx, end_idx - 1).astype(np.int64)
                index = index + i*seg_len
                all_index.extend(list(index))

            all_index = all_index[::int(sample_rate_scale)]
            vr.seek(0)
            buffer = vr.get_batch(all_index).asnumpy()
            return buffer
        except Exception:
            # Best-effort loader: callers resample on an empty result. Only ordinary errors
            # are swallowed so that KeyboardInterrupt / SystemExit still propagate.
            print("video cannot be loaded by decord: ", fname)
            return []

    def __len__(self):
        """Number of items: videos in train/validation, expanded views in test."""
        if self.mode != 'test':
            return len(self.dataset_samples)
        else:
            return len(self.test_dataset)
motion_shift (bool): Whether to apply motion shift for resizing. Returns: frames (tensor): spatially sampled frames. """ assert spatial_idx in [-1, 0, 1, 2] if spatial_idx == -1: if aspect_ratio is None and scale is None: frames, _ = random_short_side_scale_jitter( images=frames, min_size=min_scale, max_size=max_scale, inverse_uniform_sampling=inverse_uniform_sampling, ) frames, _ = random_crop(frames, crop_size) else: transform_func = ( random_resized_crop_with_shift if motion_shift else random_resized_crop ) frames = transform_func( images=frames, target_height=crop_size, target_width=crop_size, scale=scale, ratio=aspect_ratio, ) if random_horizontal_flip: frames, _ = horizontal_flip(0.5, frames) else: # The testing is deterministic and no jitter should be performed. # min_scale, max_scale, and crop_size are expect to be the same. assert len({min_scale, max_scale, crop_size}) == 1 frames, _ = random_short_side_scale_jitter( frames, min_scale, max_scale ) frames, _ = uniform_crop(frames, crop_size, spatial_idx) return frames def tensor_normalize(tensor, mean, std): """ Normalize a given tensor by subtracting the mean and dividing the std. Args: tensor (tensor): tensor to normalize. mean (tensor or list): mean value to subtract. std (tensor or list): std to divide. 
""" if tensor.dtype == torch.uint8: tensor = tensor.float() tensor = tensor / 255.0 if type(mean) == list: mean = torch.tensor(mean) if type(std) == list: std = torch.tensor(std) tensor = tensor - mean tensor = tensor / std return tensor ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/umt/datasets/kinetics_sparse.py ================================================ import os import os import io import random import numpy as np from numpy.lib.function_base import disp import torch from torchvision import transforms import warnings from decord import VideoReader, cpu from torch.utils.data import Dataset from .random_erasing import RandomErasing from .video_transforms import ( Compose, Resize, CenterCrop, Normalize, create_random_augment, random_short_side_scale_jitter, random_crop, random_resized_crop_with_shift, random_resized_crop, horizontal_flip, random_short_side_scale_jitter, uniform_crop, ) from .volume_transforms import ClipToTensor try: from petrel_client.client import Client has_client = True except ImportError: has_client = False class VideoClsDataset_sparse(Dataset): """Load your own video classification dataset.""" def __init__(self, anno_path, prefix='', split=' ', mode='train', clip_len=8, frame_sample_rate=2, crop_size=224, short_side_size=256, new_height=256, new_width=340, keep_aspect_ratio=True, num_segment=1, num_crop=1, test_num_segment=10, test_num_crop=3, args=None): self.anno_path = anno_path self.prefix = prefix self.split = split self.mode = mode self.clip_len = clip_len self.frame_sample_rate = frame_sample_rate self.crop_size = crop_size self.short_side_size = short_side_size self.new_height = new_height self.new_width = new_width self.keep_aspect_ratio = keep_aspect_ratio self.num_segment = num_segment self.test_num_segment = test_num_segment self.num_crop = num_crop self.test_num_crop = test_num_crop self.args = args self.aug = False self.rand_erase = False assert num_segment == 1 if self.mode 
in ['train']: self.aug = True if self.args.reprob > 0: self.rand_erase = True if VideoReader is None: raise ImportError("Unable to import `decord` which is required to read videos.") import pandas as pd cleaned = pd.read_csv(self.anno_path, header=None, delimiter=self.split) self.dataset_samples = list(cleaned.values[:, 0]) self.label_array = list(cleaned.values[:, 1]) self.client = None if has_client: self.client = Client('~/petreloss.conf') if (mode == 'train'): pass elif (mode == 'validation'): self.data_transform = Compose([ Resize(self.short_side_size, interpolation='bilinear'), CenterCrop(size=(self.crop_size, self.crop_size)), ClipToTensor(), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ]) elif mode == 'test': self.data_resize = Compose([ Resize(size=(short_side_size), interpolation='bilinear') ]) self.data_transform = Compose([ ClipToTensor(), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ]) self.test_seg = [] self.test_dataset = [] self.test_label_array = [] for ck in range(self.test_num_segment): for cp in range(self.test_num_crop): for idx in range(len(self.label_array)): sample_label = self.label_array[idx] self.test_label_array.append(sample_label) self.test_dataset.append(self.dataset_samples[idx]) self.test_seg.append((ck, cp)) def __getitem__(self, index): if self.mode == 'train': args = self.args sample = self.dataset_samples[index] buffer = self.loadvideo_decord(sample, chunk_nb=-1) # T H W C if len(buffer) == 0: while len(buffer) == 0: warnings.warn("video {} not correctly loaded during training".format(sample)) index = np.random.randint(self.__len__()) sample = self.dataset_samples[index] buffer = self.loadvideo_decord(sample, chunk_nb=-1) if args.num_sample > 1: frame_list = [] label_list = [] index_list = [] for _ in range(args.num_sample): new_frames = self._aug_frame(buffer, args) label = self.label_array[index] frame_list.append(new_frames) label_list.append(label) index_list.append(index) return 
frame_list, label_list, index_list, {} else: buffer = self._aug_frame(buffer, args) return buffer, self.label_array[index], index, {} elif self.mode == 'validation': sample = self.dataset_samples[index] buffer = self.loadvideo_decord(sample, chunk_nb=0) if len(buffer) == 0: while len(buffer) == 0: warnings.warn("video {} not correctly loaded during validation".format(sample)) index = np.random.randint(self.__len__()) sample = self.dataset_samples[index] buffer = self.loadvideo_decord(sample, chunk_nb=0) buffer = self.data_transform(buffer) return buffer, self.label_array[index], sample.split("/")[-1].split(".")[0] elif self.mode == 'test': sample = self.test_dataset[index] chunk_nb, split_nb = self.test_seg[index] buffer = self.loadvideo_decord(sample, chunk_nb=chunk_nb) while len(buffer) == 0: warnings.warn("video {}, temporal {}, spatial {} not found during testing".format(\ str(self.test_dataset[index]), chunk_nb, split_nb)) index = np.random.randint(self.__len__()) sample = self.test_dataset[index] chunk_nb, split_nb = self.test_seg[index] buffer = self.loadvideo_decord(sample, chunk_nb=chunk_nb) buffer = self.data_resize(buffer) if isinstance(buffer, list): buffer = np.stack(buffer, 0) if self.test_num_crop == 1: spatial_step = 1.0 * (max(buffer.shape[1], buffer.shape[2]) - self.short_side_size) / 2 spatial_start = int(spatial_step) else: spatial_step = 1.0 * (max(buffer.shape[1], buffer.shape[2]) - self.short_side_size) \ / (self.test_num_crop - 1) spatial_start = int(split_nb * spatial_step) if buffer.shape[1] >= buffer.shape[2]: buffer = buffer[:, spatial_start:spatial_start + self.short_side_size, :, :] else: buffer = buffer[:, :, spatial_start:spatial_start + self.short_side_size, :] buffer = self.data_transform(buffer) return buffer, self.test_label_array[index], sample.split("/")[-1].split(".")[0], \ chunk_nb, split_nb else: raise NameError('mode {} unkown'.format(self.mode)) def _aug_frame( self, buffer, args, ): aug_transform = create_random_augment( 
input_size=(self.crop_size, self.crop_size), auto_augment=args.aa, interpolation=args.train_interpolation, ) buffer = [ transforms.ToPILImage()(frame) for frame in buffer ] buffer = aug_transform(buffer) buffer = [transforms.ToTensor()(img) for img in buffer] buffer = torch.stack(buffer) # T C H W buffer = buffer.permute(0, 2, 3, 1) # T H W C # T H W C buffer = tensor_normalize( buffer, [0.485, 0.456, 0.406], [0.229, 0.224, 0.225] ) # T H W C -> C T H W. buffer = buffer.permute(3, 0, 1, 2) # Perform data augmentation. scl, asp = ( [0.08, 1.0], [0.75, 1.3333], ) buffer = spatial_sampling( buffer, spatial_idx=-1, min_scale=256, max_scale=320, crop_size=self.crop_size, random_horizontal_flip=False if args.data_set == 'SSV2' else True , inverse_uniform_sampling=False, aspect_ratio=asp, scale=scl, motion_shift=False ) if self.rand_erase: erase_transform = RandomErasing( args.reprob, mode=args.remode, max_count=args.recount, num_splits=args.recount, device="cpu", ) buffer = buffer.permute(1, 0, 2, 3) buffer = erase_transform(buffer) buffer = buffer.permute(1, 0, 2, 3) return buffer def _get_seq_frames(self, video_size, num_frames, clip_idx=-1): seg_size = max(0., float(video_size - 1) / num_frames) max_frame = int(video_size) - 1 seq = [] # index from 1, must add 1 if clip_idx == -1: for i in range(num_frames): start = int(np.round(seg_size * i)) end = int(np.round(seg_size * (i + 1))) idx = min(random.randint(start, end), max_frame) seq.append(idx) else: num_segment = 1 if self.mode == 'test': num_segment = self.test_num_segment duration = seg_size / (num_segment + 1) for i in range(num_frames): start = int(np.round(seg_size * i)) frame_index = start + int(duration * (clip_idx + 1)) idx = min(frame_index, max_frame) seq.append(idx) return seq def loadvideo_decord(self, sample, chunk_nb=0): """Load video content using Decord""" fname = sample fname = os.path.join(self.prefix, fname) try: if self.keep_aspect_ratio: if fname.startswith('s3'): video_bytes = 
def spatial_sampling(
    frames,
    spatial_idx=-1,
    min_scale=256,
    max_scale=320,
    crop_size=224,
    random_horizontal_flip=True,
    inverse_uniform_sampling=False,
    aspect_ratio=None,
    scale=None,
    motion_shift=False,
):
    """
    Spatially sample video frames, randomly (training) or deterministically.

    With ``spatial_idx == -1`` the frames are randomly scaled and cropped
    and, optionally, horizontally flipped. With ``spatial_idx`` in
    ``{0, 1, 2}`` a uniform crop selected by that index is taken instead.
    Args:
        frames (tensor): frames sampled from the video.
            NOTE(review): the layout expected here is whatever the
            video_transforms helpers expect - confirm against them.
        spatial_idx (int): -1 for random sampling; 0, 1 or 2 for the
            uniform crop position (left/center/right or
            top/center/bottom, depending on which side is longer).
        min_scale (int): the minimal size of scaling.
        max_scale (int): the maximal size of scaling.
        crop_size (int): height and width of the crop.
        random_horizontal_flip (bool): flip with probability 0.5 in the
            random path.
        inverse_uniform_sampling (bool): forwarded to
            ``random_short_side_scale_jitter``.
        aspect_ratio (list): aspect ratio range for the resized crop.
        scale (list): scale range for the resized crop.
        motion_shift (bool): use the "with shift" resized-crop variant.
    Returns:
        frames (tensor): spatially sampled frames.
    """
    assert spatial_idx in [-1, 0, 1, 2]

    if spatial_idx != -1:
        # Deterministic (test-time) path: no jitter is allowed, so all
        # three sizes are required to be identical.
        assert len({min_scale, max_scale, crop_size}) == 1
        frames, _ = random_short_side_scale_jitter(
            frames, min_scale, max_scale
        )
        frames, _ = uniform_crop(frames, crop_size, spatial_idx)
        return frames

    if aspect_ratio is None and scale is None:
        frames, _ = random_short_side_scale_jitter(
            images=frames,
            min_size=min_scale,
            max_size=max_scale,
            inverse_uniform_sampling=inverse_uniform_sampling,
        )
        frames, _ = random_crop(frames, crop_size)
    else:
        if motion_shift:
            resized_crop = random_resized_crop_with_shift
        else:
            resized_crop = random_resized_crop
        frames = resized_crop(
            images=frames,
            target_height=crop_size,
            target_width=crop_size,
            scale=scale,
            ratio=aspect_ratio,
        )

    if random_horizontal_flip:
        frames, _ = horizontal_flip(0.5, frames)
    return frames
""" if tensor.dtype == torch.uint8: tensor = tensor.float() tensor = tensor / 255.0 if type(mean) == list: mean = torch.tensor(mean) if type(std) == list: std = torch.tensor(std) tensor = tensor - mean tensor = tensor / std return tensor ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/umt/datasets/mae.py ================================================ import os import cv2 import io import numpy as np import torch import decord from PIL import Image from decord import VideoReader, cpu import random try: from petrel_client.client import Client has_client = True except ImportError: has_client = False class VideoMAE(torch.utils.data.Dataset): """Load your own video classification dataset. Parameters ---------- root : str, required. Path to the root folder storing the dataset. setting : str, required. A text file describing the dataset, each line per video sample. There are three items in each line: (1) video path; (2) video length and (3) video label. prefix : str, required. The prefix for loading data. split : str, required. The split character for metadata. train : bool, default True. Whether to load the training or validation set. test_mode : bool, default False. Whether to perform evaluation on the test set. Usually there is three-crop or ten-crop evaluation strategy involved. name_pattern : str, default None. The naming pattern of the decoded video frames. For example, img_00012.jpg. video_ext : str, default 'mp4'. If video_loader is set to True, please specify the video format accordinly. is_color : bool, default True. Whether the loaded image is color or grayscale. modality : str, default 'rgb'. Input modalities, we support only rgb video frames for now. Will add support for rgb difference image and optical flow image later. num_segments : int, default 1. Number of segments to evenly divide the video into clips. A useful technique to obtain global video-level information. 
def __getitem__(self, index):
    """Load one clip, apply ``self.transform`` and return data plus mask.

    Loading is retried with a randomly drawn index until it succeeds;
    each failure is printed.

    Returns:
        ``(process_data, mask)`` when ``self.num_sample <= 1``, otherwise
        ``(process_data_list, mask_list)`` with one entry per sample. Each
        ``process_data`` is reshaped from the transform output to
        ``(new_length, 3, H, W)`` and then transposed to ``(3, new_length, H, W)``.
    """
    while True:
        # Bound before the try so the failure message below can always be
        # formatted; the original raised from inside the handler when the
        # error happened before ``video_name`` was assigned.
        video_name = None
        try:
            images = None
            if self.use_decord:
                directory, target = self.clips[index]
                if self.video_loader:
                    if '.' in directory.split('/')[-1]:
                        # data in the "setting" file already have extension, e.g., demo.mp4
                        video_name = directory
                    else:
                        # data in the "setting" file do not have extension, e.g., demo
                        # So we need to provide extension (i.e., .mp4) to complete the file name.
                        video_name = '{}.{}'.format(directory, self.video_ext)

                    video_name = os.path.join(self.prefix, video_name)
                    if video_name.startswith('s3'):
                        video_bytes = self.client.get(video_name)
                        decord_vr = VideoReader(io.BytesIO(video_bytes),
                                                num_threads=1,
                                                ctx=cpu(0))
                    else:
                        decord_vr = decord.VideoReader(video_name, num_threads=1, ctx=cpu(0))
                    duration = len(decord_vr)

                # NOTE(review): with use_decord=True and video_loader=False
                # ``duration``/``decord_vr`` are never set, so this fails on
                # every attempt and the loop never terminates - confirm that
                # configuration is unsupported.
                segment_indices, skip_offsets = self._sample_train_indices(duration)
                images = self._video_TSN_decord_batch_loader(
                    directory, decord_vr, duration, segment_indices, skip_offsets)
            else:
                video_name, total_frame, target = self.clips[index]
                video_name = os.path.join(self.prefix, video_name)

                segment_indices, skip_offsets = self._sample_train_indices(total_frame)
                frame_id_list = self._get_frame_id_list(total_frame, segment_indices, skip_offsets)

                pattern = self.name_pattern
                images = []
                for idx in frame_id_list:
                    # The default pattern ('img_%05d.jpg') is printf-style, on
                    # which str.format() substitutes nothing; brace-style
                    # patterns are still honoured.
                    if '{' in pattern:
                        frame_name = pattern.format(idx)
                    elif '%' in pattern:
                        frame_name = pattern % idx
                    else:
                        frame_name = pattern
                    frame_fname = os.path.join(video_name, frame_name)
                    img_bytes = self.client.get(frame_fname)
                    img_np = np.frombuffer(img_bytes, np.uint8)
                    img = cv2.imdecode(img_np, cv2.IMREAD_COLOR)
                    # In-place BGR -> RGB conversion (dst is ``img`` itself).
                    cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)
                    images.append(Image.fromarray(img))
            if images is not None:
                break
        except Exception as e:
            print("Failed to load video from {} with error {}".format(
                video_name, e))
        index = random.randint(0, len(self.clips) - 1)

    if self.num_sample > 1:
        process_data_list = []
        mask_list = []
        for _ in range(self.num_sample):
            process_data, mask = self.transform((images, None))
            process_data = process_data.view(
                (self.new_length, 3) + process_data.size()[-2:]).transpose(0, 1)
            process_data_list.append(process_data)
            mask_list.append(mask)
        return process_data_list, mask_list
    else:
        process_data, mask = self.transform((images, None))  # T*C,H,W
        # T*C,H,W -> T,C,H,W -> C,T,H,W
        process_data = process_data.view(
            (self.new_length, 3) + process_data.size()[-2:]).transpose(0, 1)
        return (process_data, mask)
raise(RuntimeError("Setting file %s doesn't exist. Check opt.train-list and opt.val-list. " % (setting))) clips = [] print(f'Load dataset using decord: {self.use_decord}') with open(setting) as split_f: data = split_f.readlines() for line in data: line_info = line.split(self.split) if len(line_info) < 2: raise(RuntimeError('Video input format is not correct, missing one or more element. %s' % line)) if self.use_decord: # line format: video_path, video_label clip_path = os.path.join(line_info[0]) target = int(line_info[1]) item = (clip_path, target) else: # line format: video_path, video_duration, video_label clip_path = os.path.join(line_info[0]) total_frame = int(line_info[1]) target = int(line_info[2]) item = (clip_path, total_frame, target) clips.append(item) return clips def _sample_train_indices(self, num_frames): average_duration = (num_frames - self.skip_length + 1) // self.num_segments if average_duration > 0: offsets = np.multiply(list(range(self.num_segments)), average_duration) offsets = offsets + np.random.randint(average_duration, size=self.num_segments) elif num_frames > max(self.num_segments, self.skip_length): offsets = np.sort(np.random.randint( num_frames - self.skip_length + 1, size=self.num_segments)) else: offsets = np.zeros((self.num_segments,)) if self.temporal_jitter: skip_offsets = np.random.randint( self.new_step, size=self.skip_length // self.new_step) else: skip_offsets = np.zeros( self.skip_length // self.new_step, dtype=int) return offsets + 1, skip_offsets def _get_frame_id_list(self, duration, indices, skip_offsets): frame_id_list = [] for seg_ind in indices: offset = int(seg_ind) for i, _ in enumerate(range(0, self.skip_length, self.new_step)): if offset + skip_offsets[i] <= duration: frame_id = offset + skip_offsets[i] - 1 else: frame_id = offset - 1 frame_id_list.append(frame_id) if offset + self.new_step < duration: offset += self.new_step return frame_id_list def _video_TSN_decord_batch_loader(self, directory, video_reader, 
class TubeMaskingGenerator:
    """Generate a patch mask that is identical in every frame ("tube").

    ``input_size`` is ``(frames, height, width)`` counted in patches and
    ``mask_ratio`` is the fraction of patches masked in each frame.
    """

    def __init__(self, input_size, mask_ratio):
        frames, height, width = input_size
        self.frames = frames
        self.height = height
        self.width = width
        per_frame = height * width
        self.num_patches_per_frame = per_frame
        self.total_patches = frames * per_frame
        self.num_masks_per_frame = int(mask_ratio * per_frame)
        self.total_masks = frames * self.num_masks_per_frame

    def __repr__(self):
        template = "Maks: total patches {}, mask patches {}"
        return template.format(self.total_patches, self.total_masks)

    def __call__(self):
        """Return a flat array of 0/1 flags, one per patch (1 = masked)."""
        kept = self.num_patches_per_frame - self.num_masks_per_frame
        frame_mask = np.concatenate(
            [np.zeros(kept), np.ones(self.num_masks_per_frame)]
        )
        np.random.shuffle(frame_mask)
        # The same shuffled per-frame mask is repeated for every frame.
        return np.tile(frame_mask, (self.frames, 1)).reshape(-1)
def one_hot(x, num_classes, on_value=1., off_value=0., device='cuda'):
    """Encode class indices as a ``(len(x), num_classes)`` tensor.

    Every entry is ``off_value`` except the column named by each index,
    which is set to ``on_value``. The result is created on ``device``.
    """
    indices = x.long().view(-1, 1)
    encoded = torch.full((indices.size()[0], num_classes), off_value, device=device)
    encoded.scatter_(1, indices, on_value)
    return encoded


def mixup_target(target, num_classes, lam=1., smoothing=0.0, device='cuda'):
    """Build the soft target for a batch mixed with its reversed self.

    The label-smoothed one-hot encoding of ``target`` is blended with that
    of ``target.flip(0)`` using weights ``lam`` and ``1 - lam``.
    """
    off_value = smoothing / num_classes
    on_value = 1. - smoothing + off_value
    encode_kwargs = dict(on_value=on_value, off_value=off_value, device=device)
    direct = one_hot(target, num_classes, **encode_kwargs)
    flipped = one_hot(target.flip(0), num_classes, **encode_kwargs)
    return direct * lam + flipped * (1. - lam)
def rand_bbox_minmax(img_shape, minmax, count=None):
    """ Min-Max CutMix bounding-box

    Draws a random rectangle whose height and width are each sampled
    between ``minmax[0]`` and ``minmax[1]`` times the corresponding image
    dimension (upper bound exclusive), then places it at a random position.

    Args:
        img_shape (tuple): Image shape as tuple; the last two entries are
            taken as height and width
        minmax (tuple or list): Min and max bbox ratios (as percent of image size)
        count (int): Number of bbox to generate

    Returns:
        ``(yl, yu, xl, xu)``: lower/upper row bounds and lower/upper
        column bounds.
    """
    assert len(minmax) == 2
    lo_ratio, hi_ratio = minmax[0], minmax[1]
    height, width = img_shape[-2:]
    # Sampling order (heights, widths, rows, columns) matches the original
    # so that a seeded RNG yields the same boxes.
    box_h = np.random.randint(int(height * lo_ratio), int(height * hi_ratio), size=count)
    box_w = np.random.randint(int(width * lo_ratio), int(width * hi_ratio), size=count)
    top = np.random.randint(0, height - box_h, size=count)
    left = np.random.randint(0, width - box_w, size=count)
    return top, top + box_h, left, left + box_w
class Mixup:
    """ Mixup/Cutmix that applies different params to each element or whole batch

    Args:
        mixup_alpha (float): mixup alpha value, mixup is active if > 0.
        cutmix_alpha (float): cutmix alpha value, cutmix is active if > 0.
        cutmix_minmax (List[float]): cutmix min/max image ratio, cutmix is active and uses this vs alpha if not None.
        prob (float): probability of applying mixup or cutmix per batch or element
        switch_prob (float): probability of switching to cutmix instead of mixup when both are active
        mode (str): how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element)
        correct_lam (bool): apply lambda correction when cutmix bbox clipped by image borders
        label_smoothing (float): apply label smoothing to the mixed target tensor
        num_classes (int): number of classes for target
    """
    def __init__(self, mixup_alpha=1., cutmix_alpha=0., cutmix_minmax=None, prob=1.0, switch_prob=0.5,
                 mode='batch', correct_lam=True, label_smoothing=0.1, num_classes=1000):
        self.mixup_alpha = mixup_alpha
        self.cutmix_alpha = cutmix_alpha
        self.cutmix_minmax = cutmix_minmax
        if self.cutmix_minmax is not None:
            assert len(self.cutmix_minmax) == 2
            # force cutmix alpha == 1.0 when minmax active to keep logic simple & safe
            self.cutmix_alpha = 1.0
        self.mix_prob = prob
        self.switch_prob = switch_prob
        self.label_smoothing = label_smoothing
        self.num_classes = num_classes
        self.mode = mode
        self.correct_lam = correct_lam  # correct lambda based on clipped area for cutmix
        self.mixup_enabled = True  # set to false to disable mixing (intended to be set by train loop)

    def _params_per_elem(self, batch_size):
        """Draw one lambda and one cutmix flag per element.

        Returns:
            ``(lam, use_cutmix)``: a float32 array and a boolean array, both
            of length ``batch_size``. ``lam`` stays 1 for elements that are
            not mixed.
        """
        lam = np.ones(batch_size, dtype=np.float32)
        # The builtin ``bool`` is used as dtype: the ``np.bool`` alias was
        # removed in NumPy 1.24 and raises AttributeError there.
        use_cutmix = np.zeros(batch_size, dtype=bool)
        if self.mixup_enabled:
            if self.mixup_alpha > 0. and self.cutmix_alpha > 0.:
                use_cutmix = np.random.rand(batch_size) < self.switch_prob
                lam_mix = np.where(
                    use_cutmix,
                    np.random.beta(self.cutmix_alpha, self.cutmix_alpha, size=batch_size),
                    np.random.beta(self.mixup_alpha, self.mixup_alpha, size=batch_size))
            elif self.mixup_alpha > 0.:
                lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha, size=batch_size)
            elif self.cutmix_alpha > 0.:
                use_cutmix = np.ones(batch_size, dtype=bool)
                lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha, size=batch_size)
            else:
                assert False, "One of mixup_alpha > 0., cutmix_alpha > 0., cutmix_minmax not None should be true."
            lam = np.where(np.random.rand(batch_size) < self.mix_prob, lam_mix.astype(np.float32), lam)
        return lam, use_cutmix

    def _params_per_batch(self):
        """Draw a single lambda and cutmix flag shared by the whole batch.

        Returns:
            ``(lam, use_cutmix)``; ``(1., False)`` when mixing is disabled
            or skipped for this batch.
        """
        lam = 1.
        use_cutmix = False
        if self.mixup_enabled and np.random.rand() < self.mix_prob:
            if self.mixup_alpha > 0. and self.cutmix_alpha > 0.:
                use_cutmix = np.random.rand() < self.switch_prob
                lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) if use_cutmix else \
                    np.random.beta(self.mixup_alpha, self.mixup_alpha)
            elif self.mixup_alpha > 0.:
                lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha)
            elif self.cutmix_alpha > 0.:
                use_cutmix = True
                lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha)
            else:
                assert False, "One of mixup_alpha > 0., cutmix_alpha > 0., cutmix_minmax not None should be true."
            lam = float(lam_mix)
        return lam, use_cutmix

    def _mix_elem(self, x):
        """Mix every element ``i`` in place with element ``batch_size - i - 1``.

        Returns:
            Tensor of shape ``(batch_size, 1)`` holding the lambda used for
            each element.
        """
        batch_size = len(x)
        lam_batch, use_cutmix = self._params_per_elem(batch_size)
        x_orig = x.clone()  # need to keep an unmodified original for mixing source
        for i in range(batch_size):
            j = batch_size - i - 1
            lam = lam_batch[i]
            if lam != 1.:
                if use_cutmix[i]:
                    (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(
                        x[i].shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
                    x[i][..., yl:yh, xl:xh] = x_orig[j][..., yl:yh, xl:xh]
                    lam_batch[i] = lam
                else:
                    x[i] = x[i] * lam + x_orig[j] * (1 - lam)
        return torch.tensor(lam_batch, device=x.device, dtype=x.dtype).unsqueeze(1)

    def _mix_pair(self, x):
        """Mix the batch in place in mirrored pairs that share one lambda.

        Parameters are drawn for the first half of the batch and applied to
        both members of each pair ``(i, batch_size - i - 1)``.

        Returns:
            Tensor of shape ``(batch_size, 1)`` holding the per-element lambda.
        """
        batch_size = len(x)
        lam_batch, use_cutmix = self._params_per_elem(batch_size // 2)
        x_orig = x.clone()  # need to keep an unmodified original for mixing source
        for i in range(batch_size // 2):
            j = batch_size - i - 1
            lam = lam_batch[i]
            if lam != 1.:
                if use_cutmix[i]:
                    (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(
                        x[i].shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
                    x[i][:, yl:yh, xl:xh] = x_orig[j][:, yl:yh, xl:xh]
                    x[j][:, yl:yh, xl:xh] = x_orig[i][:, yl:yh, xl:xh]
                    lam_batch[i] = lam
                else:
                    x[i] = x[i] * lam + x_orig[j] * (1 - lam)
                    x[j] = x[j] * lam + x_orig[i] * (1 - lam)
        # Second half mirrors the first so that both pair members match.
        lam_batch = np.concatenate((lam_batch, lam_batch[::-1]))
        return torch.tensor(lam_batch, device=x.device, dtype=x.dtype).unsqueeze(1)

    def _mix_batch(self, x):
        """Mix the whole batch in place with its reversed copy.

        Returns:
            The lambda that was applied (``1.`` when nothing was mixed).
        """
        lam, use_cutmix = self._params_per_batch()
        if lam == 1.:
            return 1.
        if use_cutmix:
            (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(
                x.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
            x[..., yl:yh, xl:xh] = x.flip(0)[..., yl:yh, xl:xh]
        else:
            x_flipped = x.flip(0).mul_(1. - lam)
            x.mul_(lam).add_(x_flipped)
        return lam

    def __call__(self, x, target):
        """Mix ``x`` in place and return it with the matching soft targets.

        Any ``mode`` other than 'elem' or 'pair' is treated as 'batch'.
        """
        assert len(x) % 2 == 0, 'Batch size should be even when using this'
        if self.mode == 'elem':
            lam = self._mix_elem(x)
        elif self.mode == 'pair':
            lam = self._mix_pair(x)
        else:
            lam = self._mix_batch(x)
        target = mixup_target(target, self.num_classes, lam, self.label_smoothing, x.device)
        return x, target
class FastCollateMixup(Mixup):
    """Fast Collate w/ Mixup/Cutmix that applies different params to each element or whole batch.

    A Mixup impl that's performed while collating the batches. Each batch
    entry is indexed as ``entry[0]`` (a NumPy array, since ``.copy()`` and
    ``.astype()`` are called on it) and ``entry[1]`` (the integer target).
    Mixed samples are accumulated into a preallocated ``uint8`` tensor.
    """

    def _mix_elem_collate(self, output, batch, half=False):
        """Mix each element with its mirror and write the result into ``output``.

        With ``half=True`` only the first half of ``batch`` is emitted (mixed
        with the second half) and the returned ``lam`` column is padded with
        ones so it still has ``len(batch)`` rows.
        """
        batch_size = len(batch)
        num_elem = batch_size // 2 if half else batch_size
        assert len(output) == num_elem
        lam_batch, use_cutmix = self._params_per_elem(num_elem)
        for i in range(num_elem):
            j = batch_size - i - 1
            lam = lam_batch[i]
            mixed = batch[i][0]
            if lam != 1.:
                if use_cutmix[i]:
                    if not half:
                        # In full mode batch[i] is later read as a mixing
                        # source, so it must not be modified in place.
                        mixed = mixed.copy()
                    (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(
                        output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
                    # `...` keeps the box on the last two axes, consistent
                    # with _mix_batch_collate; same result for 3-D arrays.
                    mixed[..., yl:yh, xl:xh] = batch[j][0][..., yl:yh, xl:xh]
                    lam_batch[i] = lam
                else:
                    mixed = mixed.astype(np.float32) * lam + batch[j][0].astype(np.float32) * (1 - lam)
                    np.rint(mixed, out=mixed)
            output[i] += torch.from_numpy(mixed.astype(np.uint8))
        if half:
            lam_batch = np.concatenate((lam_batch, np.ones(num_elem)))
        return torch.tensor(lam_batch).unsqueeze(1)

    def _mix_pair_collate(self, output, batch):
        """Mix mirrored pairs with a shared ``lam`` and write both into ``output``."""
        batch_size = len(batch)
        lam_batch, use_cutmix = self._params_per_elem(batch_size // 2)
        for i in range(batch_size // 2):
            j = batch_size - i - 1
            lam = lam_batch[i]
            mixed_i = batch[i][0]
            mixed_j = batch[j][0]
            assert 0 <= lam <= 1.0
            if lam < 1.:
                if use_cutmix[i]:
                    (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(
                        output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
                    # Swap the box between the two samples (in place); `...`
                    # addresses the last two axes like _mix_batch_collate.
                    patch_i = mixed_i[..., yl:yh, xl:xh].copy()
                    mixed_i[..., yl:yh, xl:xh] = mixed_j[..., yl:yh, xl:xh]
                    mixed_j[..., yl:yh, xl:xh] = patch_i
                    lam_batch[i] = lam
                else:
                    mixed_temp = mixed_i.astype(np.float32) * lam + mixed_j.astype(np.float32) * (1 - lam)
                    mixed_j = mixed_j.astype(np.float32) * lam + mixed_i.astype(np.float32) * (1 - lam)
                    mixed_i = mixed_temp
                    np.rint(mixed_j, out=mixed_j)
                    np.rint(mixed_i, out=mixed_i)
            output[i] += torch.from_numpy(mixed_i.astype(np.uint8))
            output[j] += torch.from_numpy(mixed_j.astype(np.uint8))
        lam_batch = np.concatenate((lam_batch, lam_batch[::-1]))
        return torch.tensor(lam_batch).unsqueeze(1)

    def _mix_batch_collate(self, output, batch):
        """Mix every element with its mirror using one ``lam`` for the whole batch.

        Returns that ``lam`` (a scalar, not a tensor).
        """
        batch_size = len(batch)
        lam, use_cutmix = self._params_per_batch()
        if use_cutmix:
            (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(
                output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
        for i in range(batch_size):
            j = batch_size - i - 1
            mixed = batch[i][0]
            if lam != 1.:
                if use_cutmix:
                    mixed = mixed.copy()  # don't want to modify the original while iterating
                    mixed[..., yl:yh, xl:xh] = batch[j][0][..., yl:yh, xl:xh]
                else:
                    mixed = mixed.astype(np.float32) * lam + batch[j][0].astype(np.float32) * (1 - lam)
                    np.rint(mixed, out=mixed)
            output[i] += torch.from_numpy(mixed.astype(np.uint8))
        return lam

    def __call__(self, batch, _=None):
        """Collate ``batch`` into ``(output, target)`` while applying the configured mixing.

        ``output`` is a ``uint8`` tensor; in a mode containing ``'half'`` only
        the first half of the batch is returned.
        """
        batch_size = len(batch)
        assert batch_size % 2 == 0, 'Batch size should be even when using this'
        half = 'half' in self.mode
        if half:
            batch_size //= 2
        output = torch.zeros((batch_size, *batch[0][0].shape), dtype=torch.uint8)
        if self.mode == 'elem' or self.mode == 'half':
            lam = self._mix_elem_collate(output, batch, half=half)
        elif self.mode == 'pair':
            lam = self._mix_pair_collate(output, batch)
        else:
            lam = self._mix_batch_collate(output, batch)
        target = torch.tensor([b[1] for b in batch], dtype=torch.int64)
        target = mixup_target(target, self.num_classes, lam, self.label_smoothing, device='cpu')
        target = target[:batch_size]  # drops the second half's targets in 'half' mode
        return output, target
This implementation is based on https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/auto_augment.py published under an Apache License 2.0. COMMENT FROM ORIGINAL: AutoAugment, RandAugment, and AugMix for PyTorch This code implements the searched ImageNet policies with various tweaks and improvements and does not include any of the search code. AA and RA Implementation adapted from: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/autoaugment.py AugMix adapted from: https://github.com/google-research/augmix Papers: AutoAugment: Learning Augmentation Policies from Data https://arxiv.org/abs/1805.09501 Learning Data Augmentation Strategies for Object Detection https://arxiv.org/abs/1906.11172 RandAugment: Practical automated data augmentation... https://arxiv.org/abs/1909.13719 AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty https://arxiv.org/abs/1912.02781 Hacked together by / Copyright 2020 Ross Wightman """ import math import numpy as np import random import re import PIL from PIL import Image, ImageEnhance, ImageOps _PIL_VER = tuple([int(x) for x in PIL.__version__.split(".")[:2]]) _FILL = (128, 128, 128) # This signifies the max integer that the controller RNN could predict for the # augmentation scheme.
_MAX_LEVEL = 10.0

_HPARAMS_DEFAULT = {
    "translate_const": 250,
    "img_mean": _FILL,
}

_RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC)


def _interpolation(kwargs):
    """Pop ``resample`` from ``kwargs``; a list/tuple value is resolved by a random pick."""
    resample = kwargs.pop("resample", Image.BILINEAR)
    if not isinstance(resample, (list, tuple)):
        return resample
    return random.choice(resample)


def _check_args_tf(kwargs):
    """Normalise transform kwargs in place for the installed PIL version."""
    if _PIL_VER < (5, 0) and "fillcolor" in kwargs:
        # Dropped here for PIL < 5.0.
        kwargs.pop("fillcolor")
    kwargs["resample"] = _interpolation(kwargs)


def _affine(img, coeffs, kwargs):
    """Apply the 6-coefficient affine transform ``coeffs`` to ``img`` at its own size."""
    _check_args_tf(kwargs)
    return img.transform(img.size, Image.AFFINE, coeffs, **kwargs)


def shear_x(img, factor, **kwargs):
    """Shear ``img`` along x by ``factor``."""
    return _affine(img, (1, factor, 0, 0, 1, 0), kwargs)


def shear_y(img, factor, **kwargs):
    """Shear ``img`` along y by ``factor``."""
    return _affine(img, (1, 0, 0, factor, 1, 0), kwargs)


def translate_x_rel(img, pct, **kwargs):
    """Translate along x by ``pct`` times the image width."""
    shift = pct * img.size[0]
    return _affine(img, (1, 0, shift, 0, 1, 0), kwargs)


def translate_y_rel(img, pct, **kwargs):
    """Translate along y by ``pct`` times the image height."""
    shift = pct * img.size[1]
    return _affine(img, (1, 0, 0, 0, 1, shift), kwargs)


def translate_x_abs(img, pixels, **kwargs):
    """Translate along x by ``pixels``."""
    return _affine(img, (1, 0, pixels, 0, 1, 0), kwargs)


def translate_y_abs(img, pixels, **kwargs):
    """Translate along y by ``pixels``."""
    return _affine(img, (1, 0, 0, 0, 1, pixels), kwargs)


def rotate(img, degrees, **kwargs):
    """Rotate ``img`` by ``degrees``, choosing the call form by PIL version."""
    _check_args_tf(kwargs)
    if _PIL_VER >= (5, 2):
        return img.rotate(degrees, **kwargs)
    if _PIL_VER < (5, 0):
        return img.rotate(degrees, resample=kwargs["resample"])

    # 5.0 <= PIL < 5.2: build the rotation-about-centre affine matrix by hand.
    width, height = img.size
    shift_x, shift_y = 0, 0
    centre_x, centre_y = width / 2.0, height / 2.0
    theta = -math.radians(degrees)
    matrix = [
        round(math.cos(theta), 15),
        round(math.sin(theta), 15),
        0.0,
        round(-math.sin(theta), 15),
        round(math.cos(theta), 15),
        0.0,
    ]
    a, b, c, d, e, f = matrix
    px = -centre_x - shift_x
    py = -centre_y - shift_y
    matrix[2] = a * px + b * py + c
    matrix[5] = d * px + e * py + f
    matrix[2] += centre_x
    matrix[5] += centre_y
    return img.transform(img.size, Image.AFFINE, matrix, **kwargs)
def auto_contrast(img, **__):
    """Return ``ImageOps.autocontrast(img)``; extra kwargs are ignored."""
    return ImageOps.autocontrast(img)


def invert(img, **__):
    """Return ``ImageOps.invert(img)``; extra kwargs are ignored."""
    return ImageOps.invert(img)


def equalize(img, **__):
    """Return ``ImageOps.equalize(img)``; extra kwargs are ignored."""
    return ImageOps.equalize(img)


def solarize(img, thresh, **__):
    """Return ``ImageOps.solarize(img, thresh)``; extra kwargs are ignored."""
    return ImageOps.solarize(img, thresh)


def solarize_add(img, add, thresh=128, **__):
    """Add ``add`` (capped at 255) to every pixel value below ``thresh``.

    Only modes ``"L"`` and ``"RGB"`` are processed; any other image is
    returned unchanged.
    """
    table = [min(255, value + add) if value < thresh else value for value in range(256)]
    if img.mode not in ("L", "RGB"):
        return img
    if img.mode == "RGB" and len(table) == 256:
        table = table * 3  # one 256-entry table per channel
    return img.point(table)


def posterize(img, bits_to_keep, **__):
    """Posterize to ``bits_to_keep`` bits; 8 or more bits leaves ``img`` untouched."""
    if bits_to_keep < 8:
        return ImageOps.posterize(img, bits_to_keep)
    return img


def contrast(img, factor, **__):
    """Enhance contrast by ``factor``."""
    enhancer = ImageEnhance.Contrast(img)
    return enhancer.enhance(factor)


def color(img, factor, **__):
    """Enhance colour by ``factor``."""
    enhancer = ImageEnhance.Color(img)
    return enhancer.enhance(factor)


def brightness(img, factor, **__):
    """Enhance brightness by ``factor``."""
    enhancer = ImageEnhance.Brightness(img)
    return enhancer.enhance(factor)


def sharpness(img, factor, **__):
    """Enhance sharpness by ``factor``."""
    enhancer = ImageEnhance.Sharpness(img)
    return enhancer.enhance(factor)


def _randomly_negate(v):
    """With 50% prob, negate the value"""
    if random.random() > 0.5:
        return -v
    return v


def _rotate_level_to_arg(level, _hparams):
    # range [-30, 30]
    degrees = (level / _MAX_LEVEL) * 30.0
    return (_randomly_negate(degrees),)


def _enhance_level_to_arg(level, _hparams):
    # range [0.1, 1.9]
    factor = (level / _MAX_LEVEL) * 1.8 + 0.1
    return (factor,)
def _scale_level(level, span):
    """Map ``level`` from ``[0, _MAX_LEVEL]`` onto ``[0, span]``."""
    return (level / _MAX_LEVEL) * span


def _enhance_increasing_level_to_arg(level, _hparams):
    # the 'no change' level is 1.0, moving away from that towards 0. or 2.0 increases the enhancement blend
    # range [0.1, 1.9]
    offset = _scale_level(level, 0.9)
    return (1.0 + _randomly_negate(offset),)


def _shear_level_to_arg(level, _hparams):
    # range [-0.3, 0.3]
    return (_randomly_negate(_scale_level(level, 0.3)),)


def _translate_abs_level_to_arg(level, hparams):
    # magnitude is bounded by hparams["translate_const"]
    limit = hparams["translate_const"]
    return (_randomly_negate(_scale_level(level, float(limit))),)


def _translate_rel_level_to_arg(level, hparams):
    # default range [-0.45, 0.45]
    limit = hparams.get("translate_pct", 0.45)
    return (_randomly_negate(_scale_level(level, limit)),)


def _posterize_level_to_arg(level, _hparams):
    # As per Tensorflow TPU EfficientNet impl
    # range [0, 4], 'keep 0 up to 4 MSB of original image'
    # intensity/severity of augmentation decreases with level
    return (int(_scale_level(level, 4)),)


def _posterize_increasing_level_to_arg(level, hparams):
    # As per Tensorflow models research and UDA impl
    # range [4, 0], 'keep 4 down to 0 MSB of original image',
    # intensity/severity of augmentation increases with level
    (bits,) = _posterize_level_to_arg(level, hparams)
    return (4 - bits,)


def _posterize_original_level_to_arg(level, _hparams):
    # As per original AutoAugment paper description
    # range [4, 8], 'keep 4 up to 8 MSB of image'
    # intensity/severity of augmentation decreases with level
    return (int(_scale_level(level, 4)) + 4,)


def _solarize_level_to_arg(level, _hparams):
    # range [0, 256]
    # intensity/severity of augmentation decreases with level
    return (int(_scale_level(level, 256)),)


def _solarize_increasing_level_to_arg(level, _hparams):
    # range [0, 256]
    # intensity/severity of augmentation increases with level
    (thresh,) = _solarize_level_to_arg(level, _hparams)
    return (256 - thresh,)


def _solarize_add_level_to_arg(level, _hparams):
    # range [0, 110]
    return (int(_scale_level(level, 110)),)


# Op name -> function turning a magnitude level into the op's positional
# args (None for ops that take no magnitude).
LEVEL_TO_ARG = {
    "AutoContrast": None,
    "Equalize": None,
    "Invert": None,
    "Rotate": _rotate_level_to_arg,
    # There are several variations of the posterize level scaling in various Tensorflow/Google repositories/papers
    "Posterize": _posterize_level_to_arg,
    "PosterizeIncreasing": _posterize_increasing_level_to_arg,
    "PosterizeOriginal": _posterize_original_level_to_arg,
    "Solarize": _solarize_level_to_arg,
    "SolarizeIncreasing": _solarize_increasing_level_to_arg,
    "SolarizeAdd": _solarize_add_level_to_arg,
    "Color": _enhance_level_to_arg,
    "ColorIncreasing": _enhance_increasing_level_to_arg,
    "Contrast": _enhance_level_to_arg,
    "ContrastIncreasing": _enhance_increasing_level_to_arg,
    "Brightness": _enhance_level_to_arg,
    "BrightnessIncreasing": _enhance_increasing_level_to_arg,
    "Sharpness": _enhance_level_to_arg,
    "SharpnessIncreasing": _enhance_increasing_level_to_arg,
    "ShearX": _shear_level_to_arg,
    "ShearY": _shear_level_to_arg,
    "TranslateX": _translate_abs_level_to_arg,
    "TranslateY": _translate_abs_level_to_arg,
    "TranslateXRel": _translate_rel_level_to_arg,
    "TranslateYRel": _translate_rel_level_to_arg,
}

# Op name -> image operation; the "Increasing"/"Original" variants reuse the
# same operation and differ only in their LEVEL_TO_ARG mapping.
NAME_TO_OP = {
    "AutoContrast": auto_contrast,
    "Equalize": equalize,
    "Invert": invert,
    "Rotate": rotate,
    "Posterize": posterize,
    "PosterizeIncreasing": posterize,
    "PosterizeOriginal": posterize,
    "Solarize": solarize,
    "SolarizeIncreasing": solarize,
    "SolarizeAdd": solarize_add,
    "Color": color,
    "ColorIncreasing": color,
    "Contrast": contrast,
    "ContrastIncreasing": contrast,
    "Brightness": brightness,
    "BrightnessIncreasing": brightness,
    "Sharpness": sharpness,
    "SharpnessIncreasing": sharpness,
    "ShearX": shear_x,
    "ShearY": shear_y,
    "TranslateX": translate_x_abs,
    "TranslateY": translate_y_abs,
    "TranslateXRel": translate_x_rel,
    "TranslateYRel": translate_y_rel,
}


class AugmentOp:
    """
    One named augmentation, applicable to a single image or a list of frames.

    When given a list, the skip decision and the sampled magnitude are drawn
    once and the same level arguments are applied to every frame.
    """

    def __init__(self, name, prob=0.5, magnitude=10, hparams=None):
        if not hparams:
            hparams = _HPARAMS_DEFAULT
        self.aug_fn = NAME_TO_OP[name]
        self.level_fn = LEVEL_TO_ARG[name]
        self.prob = prob
        self.magnitude = magnitude
        self.hparams = hparams.copy()

        fill = hparams["img_mean"] if "img_mean" in hparams else _FILL
        resample = (
            hparams["interpolation"]
            if "interpolation" in hparams
            else _RANDOM_INTERPOLATION
        )
        self.kwargs = {"fillcolor": fill, "resample": resample}

        # If magnitude_std is > 0, we introduce some randomness
        # in the usually fixed policy and sample magnitude from a normal distribution
        # with mean `magnitude` and std-dev of `magnitude_std`.
        # NOTE This is my own hack, being tested, not in papers or reference impls.
        self.magnitude_std = self.hparams.get("magnitude_std", 0)

    def __call__(self, img_list):
        skip = self.prob < 1.0 and random.random() > self.prob
        if skip:
            return img_list

        level = self.magnitude
        if self.magnitude_std and self.magnitude_std > 0:
            level = random.gauss(level, self.magnitude_std)
        level = min(_MAX_LEVEL, max(0, level))  # clip to valid range

        if self.level_fn is None:
            op_args = ()
        else:
            op_args = self.level_fn(level, self.hparams)

        if not isinstance(img_list, list):
            return self.aug_fn(img_list, *op_args, **self.kwargs)
        return [self.aug_fn(frame, *op_args, **self.kwargs) for frame in img_list]


_RAND_TRANSFORMS = [
    "AutoContrast",
    "Equalize",
    "Invert",
    "Rotate",
    "Posterize",
    "Solarize",
    "SolarizeAdd",
    "Color",
    "Contrast",
    "Brightness",
    "Sharpness",
    "ShearX",
    "ShearY",
    "TranslateXRel",
    "TranslateYRel",
]

_RAND_INCREASING_TRANSFORMS = [
    "AutoContrast",
    "Equalize",
    "Invert",
    "Rotate",
    "PosterizeIncreasing",
    "SolarizeIncreasing",
    "SolarizeAdd",
    "ColorIncreasing",
    "ContrastIncreasing",
    "BrightnessIncreasing",
    "SharpnessIncreasing",
    "ShearX",
    "ShearY",
    "TranslateXRel",
    "TranslateYRel",
]
# These experimental weights are based loosely on the relative improvements mentioned in paper.
# They may not result in increased performance, but could likely be tuned to so.
_RAND_CHOICE_WEIGHTS_0 = {
    "Rotate": 0.3,
    "ShearX": 0.2,
    "ShearY": 0.2,
    "TranslateXRel": 0.1,
    "TranslateYRel": 0.1,
    "Color": 0.025,
    "Sharpness": 0.025,
    "AutoContrast": 0.025,
    "Solarize": 0.005,
    "SolarizeAdd": 0.005,
    "Contrast": 0.005,
    "Brightness": 0.005,
    "Equalize": 0.005,
    "Posterize": 0,
    "Invert": 0,
}


def _select_rand_weights(weight_idx=0, transforms=None):
    """Return the weights of ``transforms`` normalised to sum to one (a NumPy array)."""
    names = transforms or _RAND_TRANSFORMS
    assert weight_idx == 0  # only one set of weights currently
    table = _RAND_CHOICE_WEIGHTS_0
    raw = np.asarray([table[name] for name in names])
    return raw / np.sum(raw)


def rand_augment_ops(magnitude=10, hparams=None, transforms=None):
    """Build one ``AugmentOp`` (prob 0.5) per transform name."""
    hparams = hparams or _HPARAMS_DEFAULT
    names = transforms or _RAND_TRANSFORMS
    ops = []
    for name in names:
        ops.append(AugmentOp(name, prob=0.5, magnitude=magnitude, hparams=hparams))
    return ops


class RandAugment:
    """Apply ``num_layers`` randomly chosen ops from ``ops`` in sequence."""

    def __init__(self, ops, num_layers=2, choice_weights=None):
        self.ops = ops
        self.num_layers = num_layers
        self.choice_weights = choice_weights

    def __call__(self, img):
        # no replacement when using weighted choice
        with_replacement = self.choice_weights is None
        chosen = np.random.choice(
            self.ops,
            self.num_layers,
            replace=with_replacement,
            p=self.choice_weights,
        )
        result = img
        for op in chosen:
            result = op(result)
        return result
def rand_augment_transform(config_str, hparams):
    """
    RandAugment: Practical automated data augmentation... - https://arxiv.org/abs/1909.13719

    Create a RandAugment transform

    :param config_str: String defining configuration of random augmentation. Consists of multiple sections separated by
    dashes ('-'). The first section defines the specific variant of rand augment (currently only 'rand'). The remaining
    sections, not order specific, determine
        'm' - integer magnitude of rand augment
        'n' - integer num layers (number of transform ops selected per image)
        'w' - integer probability weight index (index of a set of weights to influence choice of op)
        'mstd' -  float std deviation of magnitude noise applied
        'inc' - integer (bool), use augmentations that increase in severity with magnitude (default: 0)
    Ex 'rand-m9-n3-mstd0.5' results in RandAugment with magnitude 9, num_layers 3, magnitude_std 0.5
    'rand-mstd1-w0' results in magnitude_std 1.0, weights 0, default magnitude of 10 and num_layers 2

    :param hparams: Other hparams (kwargs) for the RandAugmentation scheme. The dict is updated in place
    (``magnitude_std`` is added when an 'mstd' section is present and the key is not already set).

    :return: A PyTorch compatible Transform
    :raises ValueError: if a section has a numeric value but an unknown key
    """
    magnitude = _MAX_LEVEL  # default to _MAX_LEVEL for magnitude (currently 10)
    num_layers = 2  # default to 2 ops per image
    weight_idx = None  # default to no probability weights for op choice
    transforms = _RAND_TRANSFORMS
    config = config_str.split("-")
    assert config[0] == "rand"
    config = config[1:]
    for c in config:
        # Split "<key><number...>" at the first digit; sections without a
        # digit yield a single piece and are ignored.
        cs = re.split(r"(\d.*)", c)
        if len(cs) < 2:
            continue
        key, val = cs[:2]
        if key == "mstd":
            # noise param injected via hparams for now
            hparams.setdefault("magnitude_std", float(val))
        elif key == "inc":
            # `val` is a string: bool("0") is True, so convert to int first
            # to make 'inc0' really disable the increasing transforms.
            if bool(int(val)):
                transforms = _RAND_INCREASING_TRANSFORMS
        elif key == "m":
            magnitude = int(val)
        elif key == "n":
            num_layers = int(val)
        elif key == "w":
            weight_idx = int(val)
        else:
            # The previous `assert NotImplementedError` could never fail, so
            # misspelled sections were silently dropped.
            raise ValueError("Unknown RandAugment config section: {}".format(c))
    ra_ops = rand_augment_ops(
        magnitude=magnitude, hparams=hparams, transforms=transforms
    )
    choice_weights = (
        None if weight_idx is None else _select_rand_weights(weight_idx)
    )
    return RandAugment(ra_ops, num_layers, choice_weights=choice_weights)
def _get_pixels(
    per_pixel, rand_color, patch_size, dtype=torch.float32, device="cuda"
):
    """Return the fill values for an erased patch of shape ``patch_size`` (C, H, W).

    ``per_pixel`` gives a full-size normal-noise patch, ``rand_color`` a
    ``(C, 1, 1)`` normal-noise colour, and otherwise ``(C, 1, 1)`` zeros.
    """
    # NOTE I've seen CUDA illegal memory access errors being caused by the normal_()
    # paths, flip the order so normal is run on CPU if this becomes a problem
    # Issue has been fixed in master https://github.com/pytorch/pytorch/issues/19508
    channels = patch_size[0]
    if per_pixel:
        shape = patch_size
    else:
        shape = (channels, 1, 1)
    if per_pixel or rand_color:
        return torch.empty(shape, dtype=dtype, device=device).normal_()
    return torch.zeros(shape, dtype=dtype, device=device)


class RandomErasing:
    """Randomly selects a rectangle region in an image and erases its pixels.
        'Random Erasing Data Augmentation' by Zhong et al.
        See https://arxiv.org/pdf/1708.04896.pdf

        This variant of RandomErasing is intended to be applied to either a batch
        or single image tensor after it has been normalized by dataset mean and std.
    Args:
         probability: Probability that the Random Erasing operation will be performed.
         min_area: Minimum percentage of erased area wrt input image area.
         max_area: Maximum percentage of erased area wrt input image area.
         min_aspect: Minimum aspect ratio of erased area.
         mode: pixel color mode, one of 'const', 'rand', or 'pixel'
            'const' - erase block is constant color of 0 for all channels
            'rand'  - erase block is same per-channel random (normal) color
            'pixel' - erase block is per-pixel random (normal) color
        max_count: maximum number of erasing blocks per image, area per box is scaled by count.
            per-image count is randomly chosen between 1 and this value.
        cube: for 4-D input, erase the same box in every slice of the batch
            (True) or sample a box per slice (False).
    """

    def __init__(
        self,
        probability=0.5,
        min_area=0.02,
        max_area=1 / 3,
        min_aspect=0.3,
        max_aspect=None,
        mode="const",
        min_count=1,
        max_count=None,
        num_splits=0,
        device="cuda",
        cube=True,
    ):
        self.probability = probability
        self.min_area = min_area
        self.max_area = max_area
        upper_aspect = max_aspect or 1 / min_aspect
        self.log_aspect_ratio = (math.log(min_aspect), math.log(upper_aspect))
        self.min_count = min_count
        self.max_count = max_count or min_count
        self.num_splits = num_splits
        mode = mode.lower()
        self.rand_color = mode == "rand"  # per block random normal
        self.per_pixel = mode == "pixel"  # per pixel random normal
        self.cube = cube
        if not (self.rand_color or self.per_pixel):
            assert not mode or mode == "const"
        self.device = device

    def _num_boxes(self):
        """Number of boxes to erase: fixed, or drawn from [min_count, max_count]."""
        if self.min_count == self.max_count:
            return self.min_count
        return random.randint(self.min_count, self.max_count)

    def _sample_box(self, img_h, img_w, area, count, attempts):
        """Try up to ``attempts`` times to draw a box that fits strictly inside the image.

        Returns ``(top, left, h, w)`` or ``None`` when every attempt failed.
        """
        for _ in range(attempts):
            target_area = (
                random.uniform(self.min_area, self.max_area) * area / count
            )
            aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio))
            h = int(round(math.sqrt(target_area * aspect_ratio)))
            w = int(round(math.sqrt(target_area / aspect_ratio)))
            if w < img_w and h < img_h:
                top = random.randint(0, img_h - h)
                left = random.randint(0, img_w - w)
                return top, left, h, w
        return None

    def _fill(self, chan, h, w, dtype):
        """Fresh fill values for one (chan, h, w) patch."""
        return _get_pixels(
            self.per_pixel,
            self.rand_color,
            (chan, h, w),
            dtype=dtype,
            device=self.device,
        )

    def _erase(self, img, chan, img_h, img_w, dtype):
        """Erase boxes in a single (C, H, W) image, in place."""
        if random.random() > self.probability:
            return
        area = img_h * img_w
        count = self._num_boxes()
        for _ in range(count):
            box = self._sample_box(img_h, img_w, area, count, attempts=10)
            if box is None:
                continue
            top, left, h, w = box
            img[:, top : top + h, left : left + w] = self._fill(chan, h, w, dtype)

    def _erase_cube(
        self,
        img,
        batch_start,
        batch_size,
        chan,
        img_h,
        img_w,
        dtype,
    ):
        """Erase the same boxes in slices ``batch_start..batch_size-1`` of ``img``, in place."""
        if random.random() > self.probability:
            return
        area = img_h * img_w
        count = self._num_boxes()
        for _ in range(count):
            box = self._sample_box(img_h, img_w, area, count, attempts=100)
            if box is None:
                continue
            top, left, h, w = box
            for idx in range(batch_start, batch_size):
                # Same box for every slice; fill values are drawn per slice.
                target = img[idx]
                target[:, top : top + h, left : left + w] = self._fill(
                    chan, h, w, dtype
                )

    def __call__(self, input):
        dims = input.size()
        if len(dims) == 3:
            self._erase(input, *dims, input.dtype)
            return input

        batch_size, chan, img_h, img_w = dims
        # skip first slice of batch if num_splits is set (for clean portion of samples)
        if self.num_splits > 1:
            batch_start = batch_size // self.num_splits
        else:
            batch_start = 0
        if self.cube:
            self._erase_cube(
                input,
                batch_start,
                batch_size,
                chan,
                img_h,
                img_w,
                input.dtype,
            )
        else:
            for idx in range(batch_start, batch_size):
                self._erase(input[idx], chan, img_h, img_w, input.dtype)
        return input
class SSRawFrameClsDataset(Dataset):
    """Load your own raw frame classification dataset.

    The annotation file is read with pandas (no header, ``split`` as the
    delimiter): column 0 is the sample directory, column 1 its frame count
    and the last column its label. Frames are fetched by name
    (``filename_tmpl``, 1-based) through the petrel client when it is
    available, otherwise from the local filesystem.
    """

    def __init__(self, anno_path, prefix='', split=' ', mode='train', clip_len=8,
                 crop_size=224, short_side_size=256, new_height=256, new_width=340,
                 keep_aspect_ratio=True, num_segment=1, num_crop=1, test_num_segment=10,
                 test_num_crop=3, filename_tmpl='img_{:05}.jpg', args=None):
        self.anno_path = anno_path
        self.prefix = prefix
        self.split = split
        self.mode = mode
        self.clip_len = clip_len
        self.crop_size = crop_size
        self.short_side_size = short_side_size
        self.new_height = new_height
        self.new_width = new_width
        self.keep_aspect_ratio = keep_aspect_ratio
        self.num_segment = num_segment
        self.test_num_segment = test_num_segment
        self.num_crop = num_crop
        self.test_num_crop = test_num_crop
        self.filename_tmpl = filename_tmpl
        self.args = args
        self.aug = False
        self.rand_erase = False

        self.client = None
        if has_client:
            self.client = Client('~/petreloss.conf')

        if self.mode in ['train']:
            self.aug = True
            if self.args.reprob > 0:
                self.rand_erase = True
        if VideoReader is None:
            raise ImportError(
                "Unable to import `decord` which is required to read videos.")

        import pandas as pd
        cleaned = pd.read_csv(self.anno_path, header=None, delimiter=self.split)
        self.dataset_samples = list(cleaned.values[:, 0])
        self.total_frames = list(cleaned.values[:, 1])
        self.label_array = list(cleaned.values[:, -1])

        if (mode == 'train'):
            pass

        elif (mode == 'validation'):
            self.data_transform = Compose([
                Resize(self.short_side_size, interpolation='bilinear'),
                CenterCrop(size=(self.crop_size, self.crop_size)),
                ClipToTensor(),
                Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225])
            ])
        elif mode == 'test':
            self.data_resize = Compose([
                Resize(size=(short_side_size), interpolation='bilinear')
            ])
            self.data_transform = Compose([
                ClipToTensor(),
                Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225])
            ])
            # One test item per (temporal chunk, spatial crop, sample).
            self.test_seg = []
            self.test_dataset = []
            self.test_total_frames = []
            self.test_label_array = []
            for ck in range(self.test_num_segment):
                for cp in range(self.test_num_crop):
                    for idx in range(len(self.label_array)):
                        self.test_seg.append((ck, cp))
                        self.test_dataset.append(self.dataset_samples[idx])
                        self.test_total_frames.append(self.total_frames[idx])
                        self.test_label_array.append(self.label_array[idx])

    def __getitem__(self, index):
        """Return one item; the tuple layout depends on ``self.mode``.

        train: ``(frames, label, index, {})`` (lists of each when
        ``args.num_sample > 1``); validation: ``(frames, label, name)``;
        test: ``(frames, label, name, chunk_nb, split_nb)``. A sample whose
        frames fail to load is replaced by a randomly drawn one.

        Raises:
            NameError: for an unknown mode.
        """
        if self.mode == 'train':
            args = self.args
            scale_t = 1

            sample = self.dataset_samples[index]
            total_frame = self.total_frames[index]
            buffer = self.load_frame(sample,
                                     total_frame,
                                     sample_rate_scale=scale_t)  # T H W C
            while len(buffer) == 0:
                warnings.warn(
                    "video {} not correctly loaded during training".format(
                        sample))
                index = np.random.randint(self.__len__())
                sample = self.dataset_samples[index]
                total_frame = self.total_frames[index]
                buffer = self.load_frame(sample,
                                         total_frame,
                                         sample_rate_scale=scale_t)

            if args.num_sample > 1:
                frame_list = []
                label_list = []
                index_list = []
                for _ in range(args.num_sample):
                    new_frames = self._aug_frame(buffer, args)
                    label = self.label_array[index]
                    frame_list.append(new_frames)
                    label_list.append(label)
                    index_list.append(index)
                return frame_list, label_list, index_list, {}
            else:
                buffer = self._aug_frame(buffer, args)

            return buffer, self.label_array[index], index, {}

        elif self.mode == 'validation':
            sample = self.dataset_samples[index]
            total_frame = self.total_frames[index]
            buffer = self.load_frame(sample, total_frame)
            while len(buffer) == 0:
                warnings.warn(
                    "video {} not correctly loaded during validation".
                    format(sample))
                index = np.random.randint(self.__len__())
                sample = self.dataset_samples[index]
                # Refresh the frame count too; reusing the failed sample's
                # count would index the replacement with the wrong length.
                total_frame = self.total_frames[index]
                buffer = self.load_frame(sample, total_frame)
            buffer = self.data_transform(buffer)
            return buffer, self.label_array[index], sample.split(
                "/")[-1].split(".")[0]

        elif self.mode == 'test':
            sample = self.test_dataset[index]
            total_frame = self.test_total_frames[index]
            chunk_nb, split_nb = self.test_seg[index]
            buffer = self.load_frame(sample, total_frame)

            while len(buffer) == 0:
                warnings.warn("video {}, temporal {}, spatial {} not found during testing".format(\
                    str(self.test_dataset[index]), chunk_nb, split_nb))
                index = np.random.randint(self.__len__())
                sample = self.test_dataset[index]
                total_frame = self.test_total_frames[index]
                chunk_nb, split_nb = self.test_seg[index]
                buffer = self.load_frame(sample, total_frame)

            buffer = self.data_resize(buffer)
            if isinstance(buffer, list):
                buffer = np.stack(buffer, 0)

            # Step between successive crops along the longer side. With a
            # single crop there is nothing to step over, so guard the divisor
            # (split_nb is then always 0 and the crop starts at offset 0).
            spatial_step = 1.0 * (max(buffer.shape[1], buffer.shape[2]) - self.short_side_size) \
                                 / max(self.test_num_crop - 1, 1)
            temporal_start = chunk_nb
            spatial_start = int(split_nb * spatial_step)
            if buffer.shape[1] >= buffer.shape[2]:
                buffer = buffer[temporal_start::self.test_num_segment, \
                       spatial_start:spatial_start + self.short_side_size, :, :]
            else:
                buffer = buffer[temporal_start::self.test_num_segment, \
                       :, spatial_start:spatial_start + self.short_side_size, :]

            buffer = self.data_transform(buffer)
            return buffer, self.test_label_array[index], sample.split("/")[-1].split(".")[0], \
                   chunk_nb, split_nb
        else:
            raise NameError('mode {} unkown'.format(self.mode))

    def _aug_frame(
        self,
        buffer,
        args,
    ):
        """Apply the training augmentation pipeline to a T H W C frame buffer.

        Returns the augmented clip as a C T H W tensor.
        """
        aug_transform = create_random_augment(
            input_size=(self.crop_size, self.crop_size),
            auto_augment=args.aa,
            interpolation=args.train_interpolation,
        )

        buffer = [transforms.ToPILImage()(frame) for frame in buffer]

        buffer = aug_transform(buffer)

        buffer = [transforms.ToTensor()(img) for img in buffer]
        buffer = torch.stack(buffer)  # T C H W
        buffer = buffer.permute(0, 2, 3, 1)  # T H W C

        # T H W C
        buffer = tensor_normalize(buffer, [0.485, 0.456, 0.406],
                                  [0.229, 0.224, 0.225])
        # T H W C -> C T H W.
        buffer = buffer.permute(3, 0, 1, 2)
        # Perform data augmentation.
        scl, asp = (
            [0.08, 1.0],
            [0.75, 1.3333],
        )

        buffer = spatial_sampling(
            buffer,
            spatial_idx=-1,
            min_scale=256,
            max_scale=320,
            crop_size=self.crop_size,
            random_horizontal_flip=False if args.data_set == 'SSV2' else True,
            inverse_uniform_sampling=False,
            aspect_ratio=asp,
            scale=scl,
            motion_shift=False)

        if self.rand_erase:
            # NOTE(review): num_splits is fed args.recount as well - confirm
            # this is intended rather than a separate "splits" setting.
            erase_transform = RandomErasing(
                args.reprob,
                mode=args.remode,
                max_count=args.recount,
                num_splits=args.recount,
                device="cpu",
            )
            # RandomErasing treats dim 0 as the batch: C T H W -> T C H W and back.
            buffer = buffer.permute(1, 0, 2, 3)
            buffer = erase_transform(buffer)
            buffer = buffer.permute(1, 0, 2, 3)

        return buffer

    def _read_frames(self, frame_dir, indices):
        """Decode the frames at 0-based ``indices`` under ``frame_dir`` into one RGB array."""
        imgs = []
        for idx in indices:
            frame_fname = os.path.join(frame_dir,
                                       self.filename_tmpl.format(idx + 1))
            if self.client is not None:
                img_bytes = self.client.get(frame_fname)
            else:
                # No petrel client available: read the frame from local disk.
                with open(frame_fname, 'rb') as frame_file:
                    img_bytes = frame_file.read()
            img_np = np.frombuffer(img_bytes, np.uint8)
            img = cv2.imdecode(img_np, cv2.IMREAD_COLOR)
            cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)  # in-place BGR -> RGB
            imgs.append(img)
        return np.array(imgs)

    def load_frame(self, sample, num_frames, sample_rate_scale=1):
        """Load the raw frames selected for ``sample`` as an array (T H W C).

        In test mode a dense, deterministic set of indices covering every
        temporal chunk is read; otherwise one frame per segment is taken
        (segment centre for validation, random position for training).
        ``sample_rate_scale`` is accepted for interface compatibility and is
        not used.
        """
        fname = sample
        fname = os.path.join(self.prefix, fname)

        if self.mode == 'test':
            tick = num_frames / float(self.num_segment)
            all_index = []
            for t_seg in range(self.test_num_segment):
                tmp_index = [
                    int(t_seg * tick / self.test_num_segment + tick * x)
                    for x in range(self.num_segment)
                ]
                all_index.extend(tmp_index)
            all_index = list(np.sort(np.array(all_index)))
            return self._read_frames(fname, all_index)

        # handle temporal segments
        average_duration = num_frames // self.num_segment
        all_index = []
        if average_duration > 0:
            if self.mode == 'validation':
                all_index = list(
                    np.multiply(list(range(self.num_segment)),
                                average_duration) +
                    np.ones(self.num_segment, dtype=int) *
                    (average_duration // 2))
            else:
                all_index = list(
                    np.multiply(list(range(self.num_segment)),
                                average_duration) +
                    np.random.randint(average_duration, size=self.num_segment))
        elif num_frames > self.num_segment:
            if self.mode == 'validation':
                all_index = list(range(self.num_segment))
            else:
                all_index = list(
                    np.sort(
                        np.random.randint(num_frames, size=self.num_segment)))
        else:
            # Fewer frames than segments: pad the front by repeating frame 0.
            all_index = [0] * (self.num_segment - num_frames) + list(
                range(num_frames))
        all_index = list(np.array(all_index))
        return self._read_frames(fname, all_index)

    def __len__(self):
        """Number of items: expanded test items in test mode, else samples."""
        if self.mode != 'test':
            return len(self.dataset_samples)
        else:
            return len(self.test_dataset)
pass elif (mode == 'validation'): self.data_transform = Compose([ Resize(self.short_side_size, interpolation='bilinear'), CenterCrop(size=(self.crop_size, self.crop_size)), ClipToTensor(), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ]) elif mode == 'test': self.data_resize = Compose([ Resize(size=(short_side_size), interpolation='bilinear') ]) self.data_transform = Compose([ ClipToTensor(), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ]) self.test_seg = [] self.test_dataset = [] self.test_label_array = [] for ck in range(self.test_num_segment): for cp in range(self.test_num_crop): for idx in range(len(self.label_array)): sample_label = self.label_array[idx] self.test_label_array.append(sample_label) self.test_dataset.append(self.dataset_samples[idx]) self.test_seg.append((ck, cp)) def __getitem__(self, index): if self.mode == 'train': args = self.args scale_t = 1 sample = self.dataset_samples[index] buffer = self.loadvideo_decord(sample, sample_rate_scale=scale_t) # T H W C if len(buffer) == 0: while len(buffer) == 0: warnings.warn("video {} not correctly loaded during training".format(sample)) index = np.random.randint(self.__len__()) sample = self.dataset_samples[index] buffer = self.loadvideo_decord(sample, sample_rate_scale=scale_t) if args.num_sample > 1: frame_list = [] label_list = [] index_list = [] for _ in range(args.num_sample): new_frames = self._aug_frame(buffer, args) label = self.label_array[index] frame_list.append(new_frames) label_list.append(label) index_list.append(index) return frame_list, label_list, index_list, {} else: buffer = self._aug_frame(buffer, args) return buffer, self.label_array[index], index, {} elif self.mode == 'validation': sample = self.dataset_samples[index] buffer = self.loadvideo_decord(sample) if len(buffer) == 0: while len(buffer) == 0: warnings.warn("video {} not correctly loaded during validation".format(sample)) index = np.random.randint(self.__len__()) sample = 
self.dataset_samples[index] buffer = self.loadvideo_decord(sample) buffer = self.data_transform(buffer) return buffer, self.label_array[index], sample.split("/")[-1].split(".")[0] elif self.mode == 'test': sample = self.test_dataset[index] chunk_nb, split_nb = self.test_seg[index] buffer = self.loadvideo_decord(sample) while len(buffer) == 0: warnings.warn("video {}, temporal {}, spatial {} not found during testing".format(\ str(self.test_dataset[index]), chunk_nb, split_nb)) index = np.random.randint(self.__len__()) sample = self.test_dataset[index] chunk_nb, split_nb = self.test_seg[index] buffer = self.loadvideo_decord(sample) buffer = self.data_resize(buffer) if isinstance(buffer, list): buffer = np.stack(buffer, 0) spatial_step = 1.0 * (max(buffer.shape[1], buffer.shape[2]) - self.short_side_size) \ / (self.test_num_crop - 1) temporal_start = chunk_nb # 0/1 spatial_start = int(split_nb * spatial_step) if buffer.shape[1] >= buffer.shape[2]: buffer = buffer[temporal_start::2, \ spatial_start:spatial_start + self.short_side_size, :, :] else: buffer = buffer[temporal_start::2, \ :, spatial_start:spatial_start + self.short_side_size, :] buffer = self.data_transform(buffer) return buffer, self.test_label_array[index], sample.split("/")[-1].split(".")[0], \ chunk_nb, split_nb else: raise NameError('mode {} unkown'.format(self.mode)) def _aug_frame( self, buffer, args, ): aug_transform = create_random_augment( input_size=(self.crop_size, self.crop_size), auto_augment=args.aa, interpolation=args.train_interpolation, ) buffer = [ transforms.ToPILImage()(frame) for frame in buffer ] buffer = aug_transform(buffer) buffer = [transforms.ToTensor()(img) for img in buffer] buffer = torch.stack(buffer) # T C H W buffer = buffer.permute(0, 2, 3, 1) # T H W C # T H W C buffer = tensor_normalize( buffer, [0.485, 0.456, 0.406], [0.229, 0.224, 0.225] ) # T H W C -> C T H W. buffer = buffer.permute(3, 0, 1, 2) # Perform data augmentation. 
scl, asp = ( [0.08, 1.0], [0.75, 1.3333], ) buffer = spatial_sampling( buffer, spatial_idx=-1, min_scale=256, max_scale=320, crop_size=self.crop_size, random_horizontal_flip=False if args.data_set == 'SSV2' else True, inverse_uniform_sampling=False, aspect_ratio=asp, scale=scl, motion_shift=False ) if self.rand_erase: erase_transform = RandomErasing( args.reprob, mode=args.remode, max_count=args.recount, num_splits=args.recount, device="cpu", ) buffer = buffer.permute(1, 0, 2, 3) buffer = erase_transform(buffer) buffer = buffer.permute(1, 0, 2, 3) return buffer def loadvideo_decord(self, sample, sample_rate_scale=1): """Load video content using Decord""" fname = sample fname = os.path.join(self.prefix, fname) try: if self.keep_aspect_ratio: if fname.startswith('s3'): video_bytes = self.client.get(fname) vr = VideoReader(io.BytesIO(video_bytes), num_threads=1, ctx=cpu(0)) else: vr = VideoReader(fname, num_threads=1, ctx=cpu(0)) else: if fname.startswith('s3:'): video_bytes = self.client.get(fname) vr = VideoReader(io.BytesIO(video_bytes), width=self.new_width, height=self.new_height, num_threads=1, ctx=cpu(0)) else: vr = VideoReader(fname, width=self.new_width, height=self.new_height, num_threads=1, ctx=cpu(0)) except: print("video cannot be loaded by decord: ", fname) return [] if self.mode == 'test': tick = len(vr) / float(self.num_segment) all_index = list(np.array([int(tick / 2.0 + tick * x) for x in range(self.num_segment)] + [int(tick * x) for x in range(self.num_segment)])) while len(all_index) < (self.num_segment * self.test_num_segment): all_index.append(all_index[-1]) all_index = np.sort(np.array(all_index)) vr.seek(0) buffer = vr.get_batch(all_index).asnumpy() return buffer elif self.mode == 'validation': tick = len(vr) / float(self.num_segment) all_index = np.array([int(tick / 2.0 + tick * x) for x in range(self.num_segment)]) vr.seek(0) buffer = vr.get_batch(all_index).asnumpy() return buffer # handle temporal segments average_duration = len(vr) // 
self.num_segment if average_duration > 0: all_index = list(np.multiply(list(range(self.num_segment)), average_duration) + np.random.randint(average_duration, size=self.num_segment)) elif len(vr) > self.num_segment: all_index = list(np.sort(np.random.randint(len(vr), size=self.num_segment))) else: all_index = list(np.zeros((self.num_segment,))) vr.seek(0) buffer = vr.get_batch(all_index).asnumpy() return buffer def __len__(self): if self.mode != 'test': return len(self.dataset_samples) else: return len(self.test_dataset) def spatial_sampling( frames, spatial_idx=-1, min_scale=256, max_scale=320, crop_size=224, random_horizontal_flip=True, inverse_uniform_sampling=False, aspect_ratio=None, scale=None, motion_shift=False, ): """ Perform spatial sampling on the given video frames. If spatial_idx is -1, perform random scale, random crop, and random flip on the given frames. If spatial_idx is 0, 1, or 2, perform spatial uniform sampling with the given spatial_idx. Args: frames (tensor): frames of images sampled from the video. The dimension is `num frames` x `height` x `width` x `channel`. spatial_idx (int): if -1, perform random spatial sampling. If 0, 1, or 2, perform left, center, right crop if width is larger than height, and perform top, center, buttom crop if height is larger than width. min_scale (int): the minimal size of scaling. max_scale (int): the maximal size of scaling. crop_size (int): the size of height and width used to crop the frames. inverse_uniform_sampling (bool): if True, sample uniformly in [1 / max_scale, 1 / min_scale] and take a reciprocal to get the scale. If False, take a uniform sample from [min_scale, max_scale]. aspect_ratio (list): Aspect ratio range for resizing. scale (list): Scale range for resizing. motion_shift (bool): Whether to apply motion shift for resizing. Returns: frames (tensor): spatially sampled frames. 
""" assert spatial_idx in [-1, 0, 1, 2] if spatial_idx == -1: if aspect_ratio is None and scale is None: frames, _ = random_short_side_scale_jitter( images=frames, min_size=min_scale, max_size=max_scale, inverse_uniform_sampling=inverse_uniform_sampling, ) frames, _ = random_crop(frames, crop_size) else: transform_func = ( random_resized_crop_with_shift if motion_shift else random_resized_crop ) frames = transform_func( images=frames, target_height=crop_size, target_width=crop_size, scale=scale, ratio=aspect_ratio, ) if random_horizontal_flip: frames, _ = horizontal_flip(0.5, frames) else: # The testing is deterministic and no jitter should be performed. # min_scale, max_scale, and crop_size are expect to be the same. assert len({min_scale, max_scale, crop_size}) == 1 frames, _ = random_short_side_scale_jitter( frames, min_scale, max_scale ) frames, _ = uniform_crop(frames, crop_size, spatial_idx) return frames def tensor_normalize(tensor, mean, std): """ Normalize a given tensor by subtracting the mean and dividing the std. Args: tensor (tensor): tensor to normalize. mean (tensor or list): mean value to subtract. std (tensor or list): std to divide. 
""" if tensor.dtype == torch.uint8: tensor = tensor.float() tensor = tensor / 255.0 if type(mean) == list: mean = torch.tensor(mean) if type(std) == list: std = torch.tensor(std) tensor = tensor - mean tensor = tensor / std return tensor ================================================ FILE: Open-Sora/build/lib/vbench/third_pary/umt/datasets/transforms.py ================================================ import torch import torchvision.transforms.functional as F import warnings import random import numpy as np import torchvision from PIL import Image, ImageOps import numbers class GroupRandomCrop(object): def __init__(self, size): if isinstance(size, numbers.Number): self.size = (int(size), int(size)) else: self.size = size def __call__(self, img_tuple): img_group, label = img_tuple w, h = img_group[0].size th, tw = self.size out_images = list() x1 = random.randint(0, w - tw) y1 = random.randint(0, h - th) for img in img_group: assert(img.size[0] == w and img.size[1] == h) if w == tw and h == th: out_images.append(img) else: out_images.append(img.crop((x1, y1, x1 + tw, y1 + th))) return (out_images, label) class GroupCenterCrop(object): def __init__(self, size): self.worker = torchvision.transforms.CenterCrop(size) def __call__(self, img_tuple): img_group, label = img_tuple return ([self.worker(img) for img in img_group], label) class GroupRandomHorizontalFlip(object): def __init__(self, flip=False): self.flip = flip def __call__(self, img_tuple): v = random.random() if self.flip and v < 0.5: img_group, label = img_tuple ret = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group] return (ret, label) else: return img_tuple class GroupNormalize(object): def __init__(self, mean, std): self.mean = mean self.std = std def __call__(self, tensor_tuple): tensor, label = tensor_tuple rep_mean = self.mean * (tensor.size()[0]//len(self.mean)) rep_std = self.std * (tensor.size()[0]//len(self.std)) # TODO: make efficient for t, m, s in zip(tensor, rep_mean, rep_std): 
class GroupMultiScaleCrop(object):
    """Crop a group of PIL images at a random scale and resize to ``input_size``.

    A crop width/height pair is drawn from ``scales`` (as fractions of the
    shorter image side); the same crop is applied to every image in the
    group, and each crop is resized to ``input_size`` with bilinear
    interpolation.

    Args:
        input_size: int, or a ``[width, height]`` sequence, of the output.
        scales: candidate crop fractions; defaults to ``DEFAULT_SCALES``.
        max_distort: maximum index distance in ``scales`` between the width
            and the height choice (limits aspect-ratio distortion).
        fix_crop: if True choose the crop offset from a fixed grid of
            positions, otherwise uniformly at random.
        more_fix_crop: if True the fixed grid has 13 positions instead of 5.
    """

    # The second entry used to be written ``875`` (missing decimal point),
    # which produced crops 875x larger than the image.
    DEFAULT_SCALES = (1, .875, .75, .66)

    def __init__(self, input_size, scales=None, max_distort=1, fix_crop=True, more_fix_crop=True):
        self.scales = scales if scales is not None else list(self.DEFAULT_SCALES)
        self.max_distort = max_distort
        self.fix_crop = fix_crop
        self.more_fix_crop = more_fix_crop
        self.input_size = input_size if not isinstance(input_size, int) else [input_size, input_size]
        self.interpolation = Image.BILINEAR

    def __call__(self, img_tuple):
        """Crop and resize every image of ``(img_group, label)``; label is passed through."""
        img_group, label = img_tuple

        im_size = img_group[0].size

        crop_w, crop_h, offset_w, offset_h = self._sample_crop_size(im_size)
        crop_box = (offset_w, offset_h, offset_w + crop_w, offset_h + crop_h)
        out_size = (self.input_size[0], self.input_size[1])
        ret_img_group = [
            img.crop(crop_box).resize(out_size, self.interpolation)
            for img in img_group
        ]
        return (ret_img_group, label)

    def _sample_crop_size(self, im_size):
        """Return ``(crop_w, crop_h, w_offset, h_offset)`` for an image of ``im_size``."""
        image_w, image_h = im_size[0], im_size[1]

        # Candidate sizes are fractions of the shorter side; candidates within
        # 3 px of the target size are snapped onto it.
        base_size = min(image_w, image_h)
        crop_sizes = [int(base_size * x) for x in self.scales]
        crop_h = [self.input_size[1] if abs(x - self.input_size[1]) < 3 else x for x in crop_sizes]
        crop_w = [self.input_size[0] if abs(x - self.input_size[0]) < 3 else x for x in crop_sizes]

        pairs = [
            (w, h)
            for i, h in enumerate(crop_h)
            for j, w in enumerate(crop_w)
            if abs(i - j) <= self.max_distort
        ]

        crop_pair = random.choice(pairs)
        if not self.fix_crop:
            w_offset = random.randint(0, image_w - crop_pair[0])
            h_offset = random.randint(0, image_h - crop_pair[1])
        else:
            w_offset, h_offset = self._sample_fix_offset(image_w, image_h, crop_pair[0], crop_pair[1])

        return crop_pair[0], crop_pair[1], w_offset, h_offset

    def _sample_fix_offset(self, image_w, image_h, crop_w, crop_h):
        """Pick one offset at random from the fixed grid of crop positions."""
        offsets = self.fill_fix_offset(self.more_fix_crop, image_w, image_h, crop_w, crop_h)
        return random.choice(offsets)

    @staticmethod
    def fill_fix_offset(more_fix_crop, image_w, image_h, crop_w, crop_h):
        """List the fixed ``(w_offset, h_offset)`` positions for a crop.

        Returns 5 positions (corners and center), or 13 when
        ``more_fix_crop`` is true.
        """
        w_step = (image_w - crop_w) // 4
        h_step = (image_h - crop_h) // 4

        ret = list()
        ret.append((0, 0))  # upper left
        ret.append((4 * w_step, 0))  # upper right
        ret.append((0, 4 * h_step))  # lower left
        ret.append((4 * w_step, 4 * h_step))  # lower right
        ret.append((2 * w_step, 2 * h_step))  # center

        if more_fix_crop:
            ret.append((0, 2 * h_step))  # center left
            ret.append((4 * w_step, 2 * h_step))  # center right
            ret.append((2 * w_step, 4 * h_step))  # lower center
            ret.append((2 * w_step, 0 * h_step))  # upper center

            ret.append((1 * w_step, 1 * h_step))  # upper left quarter
            ret.append((3 * w_step, 1 * h_step))  # upper right quarter
            ret.append((1 * w_step, 3 * h_step))  # lower left quarter
            ret.append((3 * w_step, 3 * h_step))  # lower right quarter

        return ret
class ToTorchFormatTensor(object):
    """Turn the image of an ``(image, label)`` pair into a float CHW tensor.

    The image may be a numpy.ndarray laid out H x W x C or a PIL.Image.
    The result is a torch.FloatTensor of shape C x H x W; with ``div=True``
    (default) values are divided by 255. The label is passed through as is.
    """

    def __init__(self, div=True):
        self.div = div

    def __call__(self, pic_tuple):
        pic, label = pic_tuple

        if isinstance(pic, np.ndarray):
            # numpy input: already H x W x C
            hwc = torch.from_numpy(pic)
        else:
            # PIL input: rebuild an H x W x C byte tensor from the raw bytes
            raw = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
            hwc = raw.view(pic.size[1], pic.size[0], len(pic.mode))

        # H x W x C -> C x H x W
        chw = hwc.permute(2, 0, 1).contiguous()

        as_float = chw.float()
        if self.div:
            as_float = as_float.div(255.)
        return (as_float, label)
def random_short_side_scale_jitter(
    images, min_size, max_size, boxes=None, inverse_uniform_sampling=False
):
    """
    Rescale images so their shorter side has a randomly drawn length, and
    rescale the corresponding boxes to match.
    Args:
        images (tensor): images to rescale. Dimension is
            `num frames` x `channel` x `height` x `width`.
        min_size (int): lower bound of the sampled short-side length.
        max_size (int): upper bound of the sampled short-side length.
        boxes (ndarray): optional. Boxes belonging to the images.
            Dimension is `num boxes` x 4.
        inverse_uniform_sampling (bool): if True, draw uniformly from
            [1 / max_size, 1 / min_size] and use the reciprocal as the
            length; if False, draw uniformly from [min_size, max_size].
    Returns:
        (tensor): the rescaled images, `num frames` x `channel` x
            `new height` x `new width` (the input itself when the short
            side already has the sampled length).
        (ndarray or None): the rescaled boxes, `num boxes` x 4.
    """
    if inverse_uniform_sampling:
        drawn = 1.0 / np.random.uniform(1.0 / max_size, 1.0 / min_size)
    else:
        drawn = np.random.uniform(min_size, max_size)
    size = int(round(drawn))

    height, width = images.shape[2], images.shape[3]

    # Nothing to do when the shorter side already matches.
    if min(height, width) == size:
        return images, boxes

    if width < height:
        new_width = size
        new_height = int(math.floor((float(height) / width) * size))
        box_factor = float(new_height) / height
    else:
        new_height = size
        new_width = int(math.floor((float(width) / height) * size))
        box_factor = float(new_width) / width

    if boxes is not None:
        boxes = boxes * box_factor

    resized = torch.nn.functional.interpolate(
        images,
        size=(new_height, new_width),
        mode="bilinear",
        align_corners=False,
    )
    return resized, boxes
def random_crop(images, size, boxes=None):
    """
    Perform random spatial crop on the given images and corresponding boxes.
    Args:
        images (tensor): images to perform random crop. The dimension is
            `num frames` x `channel` x `height` x `width`.
        size (int): the size of height and width to crop on the image.
        boxes (ndarray or None): optional. Corresponding boxes to images.
            Dimension is `num boxes` x 4.
    Returns:
        cropped (tensor): cropped images with dimension of
            `num frames` x `channel` x `size` x `size`.
        cropped_boxes (ndarray or None): the cropped boxes with dimension of
            `num boxes` x 4.
    """
    height = images.shape[2]
    width = images.shape[3]
    if height == size and width == size:
        # Already the requested size. Still return a (images, boxes) pair so
        # callers that unpack two values work on this path as well.
        return images, boxes

    # NOTE: np.random.randint excludes its upper bound, so the bottom-most /
    # right-most offset is never drawn (kept as in the original sampling).
    y_offset = 0
    if height > size:
        y_offset = int(np.random.randint(0, height - size))
    x_offset = 0
    if width > size:
        x_offset = int(np.random.randint(0, width - size))
    cropped = images[
        :, :, y_offset : y_offset + size, x_offset : x_offset + size
    ]

    cropped_boxes = (
        crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None
    )

    return cropped, cropped_boxes
""" if boxes is None: flipped_boxes = None else: flipped_boxes = boxes.copy() if np.random.uniform() < prob: images = images.flip((-1)) if len(images.shape) == 3: width = images.shape[2] elif len(images.shape) == 4: width = images.shape[3] else: raise NotImplementedError("Dimension does not supported") if boxes is not None: flipped_boxes[:, [0, 2]] = width - boxes[:, [2, 0]] - 1 return images, flipped_boxes def uniform_crop(images, size, spatial_idx, boxes=None, scale_size=None): """ Perform uniform spatial sampling on the images and corresponding boxes. Args: images (tensor): images to perform uniform crop. The dimension is `num frames` x `channel` x `height` x `width`. size (int): size of height and weight to crop the images. spatial_idx (int): 0, 1, or 2 for left, center, and right crop if width is larger than height. Or 0, 1, or 2 for top, center, and bottom crop if height is larger than width. boxes (ndarray or None): optional. Corresponding boxes to images. Dimension is `num boxes` x 4. scale_size (int): optinal. If not None, resize the images to scale_size before performing any crop. Returns: cropped (tensor): images with dimension of `num frames` x `channel` x `size` x `size`. cropped_boxes (ndarray or None): the cropped boxes with dimension of `num boxes` x 4. 
""" assert spatial_idx in [0, 1, 2] ndim = len(images.shape) if ndim == 3: images = images.unsqueeze(0) height = images.shape[2] width = images.shape[3] if scale_size is not None: if width <= height: width, height = scale_size, int(height / width * scale_size) else: width, height = int(width / height * scale_size), scale_size images = torch.nn.functional.interpolate( images, size=(height, width), mode="bilinear", align_corners=False, ) y_offset = int(math.ceil((height - size) / 2)) x_offset = int(math.ceil((width - size) / 2)) if height > width: if spatial_idx == 0: y_offset = 0 elif spatial_idx == 2: y_offset = height - size else: if spatial_idx == 0: x_offset = 0 elif spatial_idx == 2: x_offset = width - size cropped = images[ :, :, y_offset : y_offset + size, x_offset : x_offset + size ] cropped_boxes = ( crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None ) if ndim == 3: cropped = cropped.squeeze(0) return cropped, cropped_boxes def clip_boxes_to_image(boxes, height, width): """ Clip an array of boxes to an image with the given height and width. Args: boxes (ndarray): bounding boxes to perform clipping. Dimension is `num boxes` x 4. height (int): given image height. width (int): given image width. Returns: clipped_boxes (ndarray): the clipped boxes with dimension of `num boxes` x 4. """ clipped_boxes = boxes.copy() clipped_boxes[:, [0, 2]] = np.minimum( width - 1.0, np.maximum(0.0, boxes[:, [0, 2]]) ) clipped_boxes[:, [1, 3]] = np.minimum( height - 1.0, np.maximum(0.0, boxes[:, [1, 3]]) ) return clipped_boxes def blend(images1, images2, alpha): """ Blend two images with a given weight alpha. Args: images1 (tensor): the first images to be blended, the dimension is `num frames` x `channel` x `height` x `width`. images2 (tensor): the second images to be blended, the dimension is `num frames` x `channel` x `height` x `width`. alpha (float): the blending weight. 
def grayscale(images):
    """
    Get the grayscale for the input images. The channels of images should be
    in order BGR.
    Args:
        images (tensor): the input images for getting grayscale. Dimension is
            `num frames` x `channel` x `height` x `width`.
    Returns:
        img_gray (tensor): a copy of the input whose first three channels
            all hold the luma value, the dimension is
            `num frames` x `channel` x `height` x `width`. The input is not
            modified.
    """
    # Copy with clone().detach(): building a tensor from a tensor via
    # torch.tensor(...) emits a UserWarning on every call.
    img_gray = images.clone().detach()
    # R -> 0.299, G -> 0.587, B -> 0.114 (channel order is BGR).
    gray_channel = (
        0.299 * images[:, 2] + 0.587 * images[:, 1] + 0.114 * images[:, 0]
    )
    img_gray[:, 0] = gray_channel
    img_gray[:, 1] = gray_channel
    img_gray[:, 2] = gray_channel
    return img_gray
def lighting_jitter(images, alphastd, eigval, eigvec):
    """
    Apply AlexNet-style PCA lighting noise to the given images.
    Args:
        images (tensor): images to jitter, either `channel` x `height` x
            `width` or `num frames` x `channel` x `height` x `width`.
        alphastd (float): standard deviation of the random PCA weights;
            0 disables the jitter and returns the input unchanged.
        eigval (list): eigenvalues for PCA jitter.
        eigvec (list[list]): eigenvectors for PCA jitter.
    Returns:
        out_images (tensor): the jittered images, same dimension as the
            input.
    """
    if alphastd == 0:
        return images

    # Draw alpha1, alpha2, alpha3 and project them through the PCA basis.
    alpha = np.random.normal(0, alphastd, size=(1, 3))
    basis = np.array(eigvec)
    values = np.reshape(eigval, (1, 3))
    alpha_rows = np.repeat(alpha, 3, axis=0)
    value_rows = np.repeat(values, 3, axis=0)
    rgb = np.sum(basis * alpha_rows * value_rows, axis=1)

    out_images = torch.zeros_like(images)
    rank = len(images.shape)
    if rank not in (3, 4):
        raise NotImplementedError(f"Unsupported dimension {len(images.shape)}")
    # C H W -> channels on dim 0; T C H W -> channels on dim 1.
    channel_dim = rank - 3

    for idx in range(images.shape[channel_dim]):
        # The shift is indexed in reverse channel order (rgb[2 - idx]).
        shift = rgb[2 - idx]
        if rank == 3:
            out_images[idx] = images[idx] + shift
        else:
            out_images[:, idx] = images[:, idx] + shift

    return out_images
""" if len(images.shape) == 3: assert ( len(mean) == images.shape[0] ), "channel mean not computed properly" assert ( len(stddev) == images.shape[0] ), "channel stddev not computed properly" elif len(images.shape) == 4: assert ( len(mean) == images.shape[1] ), "channel mean not computed properly" assert ( len(stddev) == images.shape[1] ), "channel stddev not computed properly" else: raise NotImplementedError(f"Unsupported dimension {len(images.shape)}") out_images = torch.zeros_like(images) for idx in range(len(mean)): # C H W if len(images.shape) == 3: out_images[idx] = (images[idx] - mean[idx]) / stddev[idx] elif len(images.shape) == 4: out_images[:, idx] = (images[:, idx] - mean[idx]) / stddev[idx] else: raise NotImplementedError( f"Unsupported dimension {len(images.shape)}" ) return out_images def _get_param_spatial_crop( scale, ratio, height, width, num_repeat=10, log_scale=True, switch_hw=False ): """ Given scale, ratio, height and width, return sampled coordinates of the videos. """ for _ in range(num_repeat): area = height * width target_area = random.uniform(*scale) * area if log_scale: log_ratio = (math.log(ratio[0]), math.log(ratio[1])) aspect_ratio = math.exp(random.uniform(*log_ratio)) else: aspect_ratio = random.uniform(*ratio) w = int(round(math.sqrt(target_area * aspect_ratio))) h = int(round(math.sqrt(target_area / aspect_ratio))) if np.random.uniform() < 0.5 and switch_hw: w, h = h, w if 0 < w <= width and 0 < h <= height: i = random.randint(0, height - h) j = random.randint(0, width - w) return i, j, h, w # Fallback to central crop in_ratio = float(width) / float(height) if in_ratio < min(ratio): w = width h = int(round(w / min(ratio))) elif in_ratio > max(ratio): h = height w = int(round(h * max(ratio))) else: # whole image w = width h = height i = (height - h) // 2 j = (width - w) // 2 return i, j, h, w def random_resized_crop( images, target_height, target_width, scale=(0.8, 1.0), ratio=(3.0 / 4.0, 4.0 / 3.0), ): """ Crop the given images to 
def random_resized_crop_with_shift(
    images,
    target_height,
    target_width,
    scale=(0.8, 1.0),
    ratio=(3.0 / 4.0, 4.0 / 3.0),
):
    """
    This is similar to random_resized_crop. However, it samples two different
    boxes (for cropping) for the first and last frame. It then linearly
    interpolates the two boxes for other frames.

    Args:
        images: Images to perform resizing and cropping, indexed here as
            `channel` x `num frames` x `height` x `width`.
        target_height: Desired height after cropping.
        target_width: Desired width after cropping.
        scale: Scale range of Inception-style area based random resizing.
        ratio: Aspect ratio range of Inception-style area based random resizing.
    Returns:
        out (tensor): the cropped and resized frames, `channel` x
            `num frames` x `target_height` x `target_width`, with the
            channel count, dtype and device of ``images``.
    """
    channels = images.shape[0]
    t = images.shape[1]
    height = images.shape[2]
    width = images.shape[3]

    # One box for the first frame and one for the last frame.
    i, j, h, w = _get_param_spatial_crop(scale, ratio, height, width)
    i_, j_, h_, w_ = _get_param_spatial_crop(scale, ratio, height, width)

    def _interpolate_ints(start, end):
        # Integer values moving linearly from `start` to `end` over t frames.
        return [int(v) for v in torch.linspace(start, end, steps=t).tolist()]

    i_s = _interpolate_ints(i, i_)
    j_s = _interpolate_ints(j, j_)
    h_s = _interpolate_ints(h, h_)
    w_s = _interpolate_ints(w, w_)

    # Allocate from the input so channel count, dtype and device follow it
    # (the buffer used to be fixed at 3 float32 channels on the CPU).
    out = images.new_zeros((channels, t, target_height, target_width))
    for ind in range(t):
        top, left = i_s[ind], j_s[ind]
        out[:, ind : ind + 1, :, :] = torch.nn.functional.interpolate(
            images[
                :,
                ind : ind + 1,
                top : top + h_s[ind],
                left : left + w_s[ind],
            ],
            size=(target_height, target_width),
            mode="bilinear",
            align_corners=False,
        )
    return out
class RandomResizedCropAndInterpolation:
    """Crop the given PIL Image to random size and aspect ratio with random
    interpolation.

    A crop of random size (default: of 0.08 to 1.0) of the original size and
    a random aspect ratio (default: of 3/4 to 4/3) of the original aspect
    ratio is made. This crop is finally resized to given size.
    This is popularly used to train the Inception networks.

    Args:
        size: expected output size of each edge
        scale: range of size of the origin size cropped
        ratio: range of aspect ratio of the origin aspect ratio cropped
        interpolation: interpolation name, or "random" to pick one of the
            module's random interpolation modes on each call.
            Default: "bilinear"
    """

    def __init__(
        self,
        size,
        scale=(0.08, 1.0),
        ratio=(3.0 / 4.0, 4.0 / 3.0),
        interpolation="bilinear",
    ):
        if isinstance(size, tuple):
            self.size = size
        else:
            self.size = (size, size)
        if (scale[0] > scale[1]) or (ratio[0] > ratio[1]):
            # Only reported, not rejected: construction still succeeds.
            print("range should be of kind (min, max)")

        if interpolation == "random":
            self.interpolation = _RANDOM_INTERPOLATION
        else:
            self.interpolation = _pil_interp(interpolation)
        self.scale = scale
        self.ratio = ratio

    @staticmethod
    def get_params(img, scale, ratio):
        """Get parameters for ``crop`` for a random sized crop.

        Args:
            img (PIL Image): Image to be cropped; only ``img.size``
                (width, height) is read.
            scale (tuple): range of size of the origin size cropped
            ratio (tuple): range of aspect ratio of the origin aspect ratio
                cropped

        Returns:
            tuple: params (i, j, h, w) to be passed to ``crop`` for a random
                sized crop.
        """
        img_w = img.size[0]
        img_h = img.size[1]
        area = img_w * img_h
        log_ratio = (math.log(ratio[0]), math.log(ratio[1]))

        for _ in range(10):
            target_area = random.uniform(*scale) * area
            aspect_ratio = math.exp(random.uniform(*log_ratio))

            w = int(round(math.sqrt(target_area * aspect_ratio)))
            h = int(round(math.sqrt(target_area / aspect_ratio)))

            # Reject degenerate boxes as well as oversized ones: on very
            # small images the rounded width or height can be 0, which would
            # produce an empty crop.
            if 0 < w <= img_w and 0 < h <= img_h:
                i = random.randint(0, img_h - h)
                j = random.randint(0, img_w - w)
                return i, j, h, w

        # Fallback to central crop
        in_ratio = img_w / img_h
        if in_ratio < min(ratio):
            w = img_w
            h = int(round(w / min(ratio)))
        elif in_ratio > max(ratio):
            h = img_h
            w = int(round(h * max(ratio)))
        else:  # whole image
            w = img_w
            h = img_h
        i = (img_h - h) // 2
        j = (img_w - w) // 2
        return i, j, h, w

    def __call__(self, img):
        """
        Args:
            img (PIL Image): Image to be cropped and resized.

        Returns:
            PIL Image: Randomly cropped and resized image.
        """
        i, j, h, w = self.get_params(img, self.scale, self.ratio)
        if isinstance(self.interpolation, (tuple, list)):
            # Several candidate modes: pick one at random for this call.
            interpolation = random.choice(self.interpolation)
        else:
            interpolation = self.interpolation
        return F.resized_crop(img, i, j, h, w, self.size, interpolation)

    def __repr__(self):
        if isinstance(self.interpolation, (tuple, list)):
            interpolate_str = " ".join(
                [_pil_interpolation_to_str[x] for x in self.interpolation]
            )
        else:
            interpolate_str = _pil_interpolation_to_str[self.interpolation]
        format_string = self.__class__.__name__ + "(size={0}".format(self.size)
        format_string += ", scale={0}".format(
            tuple(round(s, 4) for s in self.scale)
        )
        format_string += ", ratio={0}".format(
            tuple(round(r, 4) for r in self.ratio)
        )
        format_string += ", interpolation={0})".format(interpolate_str)
        return format_string
isinstance(auto_augment, str) if isinstance(img_size, tuple): img_size_min = min(img_size) else: img_size_min = img_size aa_params = dict( translate_const=int(img_size_min * 0.45), img_mean=tuple([min(255, round(255 * x)) for x in mean]), ) if interpolation and interpolation != "random": aa_params["interpolation"] = _pil_interp(interpolation) if auto_augment.startswith("rand"): secondary_tfl += [rand_augment_transform(auto_augment, aa_params)] elif auto_augment.startswith("augmix"): raise NotImplementedError("Augmix not implemented") else: raise NotImplementedError("Auto aug not implemented") elif color_jitter is not None: # color jitter is enabled when not using AA if isinstance(color_jitter, (list, tuple)): # color jitter should be a 3-tuple/list if spec brightness/contrast/saturation # or 4 if also augmenting hue assert len(color_jitter) in (3, 4) else: # if it's a scalar, duplicate for brightness, contrast, and saturation, no hue color_jitter = (float(color_jitter),) * 3 secondary_tfl += [transforms.ColorJitter(*color_jitter)] final_tfl = [] final_tfl += [ transforms.ToTensor(), transforms.Normalize(mean=torch.tensor(mean), std=torch.tensor(std)), ] if re_prob > 0.0: final_tfl.append( RandomErasing( re_prob, mode=re_mode, max_count=re_count, num_splits=re_num_splits, device="cpu", cube=False, ) ) if separate: return ( transforms.Compose(primary_tfl), transforms.Compose(secondary_tfl), transforms.Compose(final_tfl), ) else: return transforms.Compose(primary_tfl + secondary_tfl + final_tfl) ############################################################################################################ ############################################################################################################ class Compose(object): """Composes several transforms Args: transforms (list of ``Transform`` objects): list of transforms to compose """ def __init__(self, transforms): self.transforms = transforms def __call__(self, clip): for t in self.transforms: clip = 
class RandomResize(object):
    """Rescale every frame of a clip by one randomly drawn factor.

    The factor is drawn uniformly from ``ratio`` once per call and applied
    to both width and height of the first frame to obtain the new size,
    which is then used for the whole clip.
    The larger the original image is, the more times it takes to
    interpolate

    Args:
        ratio (tuple): (min, max) range of the scaling factor
        interpolation (str): Can be one of 'nearest', 'bilinear'
            defaults to nearest
    """

    def __init__(self, ratio=(3. / 4., 4. / 3.), interpolation='nearest'):
        self.ratio = ratio
        self.interpolation = interpolation

    def __call__(self, clip):
        """
        Args:
            clip (list of PIL.Image or numpy.ndarray): frames to resize;
                numpy frames are read as (h, w, c)

        Returns:
            The resized clip as returned by ``FF.resize_clip``

        Raises:
            TypeError: if the frames are neither numpy arrays nor PIL images
        """
        scaling_factor = random.uniform(self.ratio[0], self.ratio[1])

        if isinstance(clip[0], np.ndarray):
            im_h, im_w, im_c = clip[0].shape
        elif isinstance(clip[0], PIL.Image.Image):
            im_w, im_h = clip[0].size
        else:
            # Previously fell through to an unbound im_w/im_h; fail with the
            # same exception type the sibling transforms use.
            raise TypeError('Expected numpy.ndarray or PIL.Image' +
                            ' but got list of {0}'.format(type(clip[0])))

        new_w = int(im_w * scaling_factor)
        new_h = int(im_h * scaling_factor)
        # NOTE(review): the size is passed as (width, height); confirm that
        # FF.resize_clip reads tuple sizes in that order for both numpy and
        # PIL clips.
        new_size = (new_w, new_h)
        resized = FF.resize_clip(
            clip, new_size, interpolation=self.interpolation)
        return resized
RandomCrop(object): """Extract random crop at the same location for a list of images Args: size (sequence or int): Desired output size for the crop in format (h, w) """ def __init__(self, size): if isinstance(size, numbers.Number): size = (size, size) self.size = size def __call__(self, clip): """ Args: img (PIL.Image or numpy.ndarray): List of images to be cropped in format (h, w, c) in numpy.ndarray Returns: PIL.Image or numpy.ndarray: Cropped list of images """ h, w = self.size if isinstance(clip[0], np.ndarray): im_h, im_w, im_c = clip[0].shape elif isinstance(clip[0], PIL.Image.Image): im_w, im_h = clip[0].size else: raise TypeError('Expected numpy.ndarray or PIL.Image' + 'but got list of {0}'.format(type(clip[0]))) if w > im_w or h > im_h: error_msg = ( 'Initial image size should be larger then ' 'cropped size but got cropped sizes : ({w}, {h}) while ' 'initial image is ({im_w}, {im_h})'.format( im_w=im_w, im_h=im_h, w=w, h=h)) raise ValueError(error_msg) x1 = random.randint(0, im_w - w) y1 = random.randint(0, im_h - h) cropped = FF.crop_clip(clip, y1, x1, h, w) return cropped class ThreeCrop(object): """Extract random crop at the same location for a list of images Args: size (sequence or int): Desired output size for the crop in format (h, w) """ def __init__(self, size): if isinstance(size, numbers.Number): size = (size, size) self.size = size def __call__(self, clip): """ Args: img (PIL.Image or numpy.ndarray): List of images to be cropped in format (h, w, c) in numpy.ndarray Returns: PIL.Image or numpy.ndarray: Cropped list of images """ h, w = self.size if isinstance(clip[0], np.ndarray): im_h, im_w, im_c = clip[0].shape elif isinstance(clip[0], PIL.Image.Image): im_w, im_h = clip[0].size else: raise TypeError('Expected numpy.ndarray or PIL.Image' + 'but got list of {0}'.format(type(clip[0]))) if w != im_w and h != im_h: clip = FF.resize_clip(clip, self.size, interpolation="bilinear") im_h, im_w, im_c = clip[0].shape step = np.max((np.max((im_w, im_h)) - 
class RandomRotation(object):
    """Rotate entire clip randomly by a random angle within given bounds

    A single angle is drawn per call and applied to every frame.

    Args:
        degrees (sequence or int): Range of degrees to select from
            If degrees is a number instead of sequence like (min, max),
            the range of degrees, will be (-degrees, +degrees).

    Raises:
        ValueError: if ``degrees`` is a negative number or a sequence whose
            length is not 2
    """

    def __init__(self, degrees):
        if isinstance(degrees, numbers.Number):
            if degrees < 0:
                raise ValueError('If degrees is a single number, '
                                 'must be positive')
            degrees = (-degrees, degrees)
        else:
            if len(degrees) != 2:
                raise ValueError('If degrees is a sequence, '
                                 'it must be of len 2.')

        self.degrees = degrees

    def __call__(self, clip):
        """
        Args:
            clip (list of PIL.Image or numpy.ndarray): frames to rotate

        Returns:
            list: the rotated frames, in the same order

        Raises:
            TypeError: if the frames are neither numpy arrays nor PIL images
        """
        angle = random.uniform(self.degrees[0], self.degrees[1])
        if isinstance(clip[0], np.ndarray):
            # Imported lazily and only on this branch so PIL clips do not
            # require scikit-image. The submodule is imported explicitly
            # because ``import skimage`` alone is not guaranteed to expose
            # ``skimage.transform``.
            import skimage.transform
            # NOTE(review): skimage's rotate may return float images rather
            # than the input dtype; confirm downstream code expects that.
            rotated = [skimage.transform.rotate(img, angle) for img in clip]
        elif isinstance(clip[0], PIL.Image.Image):
            rotated = [img.rotate(angle) for img in clip]
        else:
            raise TypeError('Expected numpy.ndarray or PIL.Image' +
                            ' but got list of {0}'.format(type(clip[0])))

        return rotated
class ColorJitter(object):
    """Randomly change the brightness, contrast and saturation and hue of
    the clip

    One set of factors and one random ordering of the adjustments is drawn
    per call and applied to every frame of the clip.

    Args:
        brightness (float): How much to jitter brightness. brightness_factor
            is chosen uniformly from [max(0, 1 - brightness), 1 + brightness].
        contrast (float): How much to jitter contrast. contrast_factor
            is chosen uniformly from [max(0, 1 - contrast), 1 + contrast].
        saturation (float): How much to jitter saturation. saturation_factor
            is chosen uniformly from [max(0, 1 - saturation), 1 + saturation].
        hue(float): How much to jitter hue. hue_factor is chosen uniformly
            from [-hue, hue]. Should be >=0 and <= 0.5.
    """

    def __init__(self, brightness=0, contrast=0, saturation=0, hue=0):
        self.brightness = brightness
        self.contrast = contrast
        self.saturation = saturation
        self.hue = hue

    def get_params(self, brightness, contrast, saturation, hue):
        """Draw one factor per enabled adjustment.

        Returns:
            tuple: (brightness_factor, contrast_factor, saturation_factor,
                hue_factor); an entry is None when its strength is not
                greater than 0.
        """
        if brightness > 0:
            brightness_factor = random.uniform(
                max(0, 1 - brightness), 1 + brightness)
        else:
            brightness_factor = None

        if contrast > 0:
            contrast_factor = random.uniform(
                max(0, 1 - contrast), 1 + contrast)
        else:
            contrast_factor = None

        if saturation > 0:
            saturation_factor = random.uniform(
                max(0, 1 - saturation), 1 + saturation)
        else:
            saturation_factor = None

        if hue > 0:
            hue_factor = random.uniform(-hue, hue)
        else:
            hue_factor = None
        return brightness_factor, contrast_factor, saturation_factor, hue_factor

    def __call__(self, clip):
        """
        Args:
            clip (list): list of PIL.Image

        Returns:
            list PIL.Image : list of transformed PIL.Image

        Raises:
            TypeError: if the frames are numpy arrays (not implemented) or
                any other non-PIL type
        """
        if isinstance(clip[0], np.ndarray):
            raise TypeError(
                'Color jitter not yet implemented for numpy arrays')
        elif isinstance(clip[0], PIL.Image.Image):
            brightness, contrast, saturation, hue = self.get_params(
                self.brightness, self.contrast, self.saturation, self.hue)

            # Create img transform function sequence
            img_transforms = []
            if brightness is not None:
                img_transforms.append(lambda img: torchvision.transforms.functional.adjust_brightness(img, brightness))
            if saturation is not None:
                img_transforms.append(lambda img: torchvision.transforms.functional.adjust_saturation(img, saturation))
            if hue is not None:
                img_transforms.append(lambda img: torchvision.transforms.functional.adjust_hue(img, hue))
            if contrast is not None:
                img_transforms.append(lambda img: torchvision.transforms.functional.adjust_contrast(img, contrast))
            random.shuffle(img_transforms)

            # Apply to all images
            jittered_clip = []
            for img in clip:
                # Chain the adjustments: each one receives the output of the
                # previous one. Feeding the untouched frame to every
                # adjustment would keep only the last one, and would leave
                # the result undefined when no adjustment is enabled.
                jittered_img = img
                for func in img_transforms:
                    jittered_img = func(jittered_img)
                jittered_clip.append(jittered_img)

        else:
            raise TypeError('Expected numpy.ndarray or PIL.Image' +
                            ' but got list of {0}'.format(type(clip[0])))
        return jittered_clip
# Note this norms data to -1/1
class ClipToTensor_K(object):
    """Convert a list of m (H x W x C) numpy.ndarrays in the range [0, 255]
    to a torch.FloatTensor of shape (C x m x H x W) in the range [-1.0, 1.0]

    Args:
        channel_nb (int): number of channels the output is allocated with;
            numpy frames are asserted to have this many channels
        div_255 (bool): when True, rescale values with (x - 127.5) / 127.5;
            when False, values are returned unscaled
        numpy (bool): when True, return a numpy.ndarray instead of a tensor
    """

    def __init__(self, channel_nb=3, div_255=True, numpy=False):
        self.channel_nb = channel_nb
        self.div_255 = div_255
        self.numpy = numpy

    def __call__(self, clip):
        """
        Args:
            clip (list of numpy.ndarray or PIL.Image): clip (list of images)
                to be converted to tensor.

        Returns:
            numpy.ndarray or torch.FloatTensor of shape (C x m x H x W)

        Raises:
            TypeError: if a frame is neither a numpy array nor a PIL image
        """
        # Retrieve shape
        if isinstance(clip[0], np.ndarray):
            h, w, ch = clip[0].shape
            assert ch == self.channel_nb, 'Got {0} instead of {1} channels'.format(
                ch, self.channel_nb)
        elif isinstance(clip[0], Image.Image):
            w, h = clip[0].size
        else:
            raise TypeError('Expected numpy.ndarray or PIL.Image '
                            'but got list of {0}'.format(type(clip[0])))

        np_clip = np.zeros([self.channel_nb, len(clip), int(h), int(w)])

        # Convert
        for img_idx, img in enumerate(clip):
            if isinstance(img, np.ndarray):
                pass
            elif isinstance(img, Image.Image):
                # np.asarray is the documented replacement for
                # np.array(obj, copy=False): it copies only when required,
                # whereas copy=False means "never copy, else raise" under
                # NumPy 2.
                img = np.asarray(img)
            else:
                # Report the offending frame, not the first one.
                raise TypeError('Expected numpy.ndarray or PIL.Image '
                                'but got list of {0}'.format(type(img)))
            img = convert_img(img)
            np_clip[:, img_idx, :, :] = img

        if self.numpy:
            if self.div_255:
                np_clip = (np_clip - 127.5) / 127.5
            return np_clip

        else:
            tensor_clip = torch.from_numpy(np_clip)

            if not isinstance(tensor_clip, torch.FloatTensor):
                tensor_clip = tensor_clip.float()
            if self.div_255:
                tensor_clip = torch.div(torch.sub(tensor_clip, 127.5), 127.5)
            return tensor_clip
def get_resize_sizes(im_h, im_w, size):
    """Scale (im_h, im_w) so that the shorter side equals ``size``.

    The longer side is scaled by the same factor and truncated to an int.

    Args:
        im_h: original height
        im_w: original width
        size: target length of the shorter side

    Returns:
        tuple: (new height, new width)
    """
    if im_w < im_h:
        # Width is the shorter side: pin it and scale the height.
        return int(size * im_h / im_w), size
    # Height is the shorter side (or the image is square).
    return size, int(size * im_w / im_h)
class LayerNorm(nn.LayerNorm):
    """LayerNorm that computes in float32 and returns the input's dtype.

    Lets the layer be used with fp16 inputs: the input is upcast before
    normalization and the result is cast back afterwards.
    """

    def forward(self, x):
        input_dtype = x.dtype
        normalized = super().forward(x.to(torch.float32))
        return normalized.to(input_dtype)
class Transformer(nn.Module):
    """Stack of ``ResidualAttentionBlock`` layers that returns selected layer outputs.

    ``return_index`` holds the block indices whose outputs are collected:
    ``layers - int(i * clip_return_interval) - 1`` for each ``i`` in
    ``range(clip_return_layer)``.
    """

    def __init__(
            self, width, layers, heads, return_attn=False,
            clip_return_layer=1, clip_return_interval=1,
    ):
        super().__init__()
        self.layers = layers
        self.return_attn = return_attn
        self.resblocks = nn.ModuleList(
            [ResidualAttentionBlock(width, heads) for _ in range(layers)]
        )
        self.return_index = [
            layers - int(step * clip_return_interval) - 1
            for step in range(clip_return_layer)
        ]
        print(f'Teacher return index: {self.return_index}')

    def forward(self, x):
        """Run all blocks.

        Returns:
            ``(stacked, attn)`` where ``stacked`` is ``torch.stack`` of the
            outputs of the blocks listed in ``return_index`` (in block order)
            and ``attn`` is the attention returned by the last block when
            ``return_attn`` is set, otherwise ``None``.
        """
        attn = None
        collected = []
        last_idx = self.layers - 1
        for idx, blk in enumerate(self.resblocks):
            want_attn = self.return_attn and idx == last_idx
            if want_attn:
                x, attn = blk(x, return_attn=True)
            else:
                x = blk(x)
            if idx in self.return_index:
                collected.append(x)
        return torch.stack(collected), attn
def inflate_weight(weight_2d, time_dim, center=True):
    """Inflate a 2D convolution kernel into a 3D one by adding a temporal axis.

    A new axis of length ``time_dim`` is inserted at dimension 2.

    Args:
        weight_2d: Kernel tensor; the code inserts the temporal axis at
            position 2 and repeats over five dimensions, so a 4-D tensor is
            expected.
        time_dim: Length of the new temporal axis.
        center: If true, the 2D kernel is copied into the middle temporal
            slice (``time_dim // 2``) and every other slice is zero.
            Otherwise the 2D kernel is repeated along time and divided by
            ``time_dim`` so the slices sum back to the original kernel.

    Returns:
        The inflated tensor, with the same dtype and device as ``weight_2d``.
    """
    print(f'Init center: {center}')
    if center:
        # new_zeros keeps the dtype/device of the source weight, so both
        # branches return a tensor matching the input (a plain torch.zeros
        # would always be float32 on the default device).
        weight_3d = weight_2d.new_zeros(weight_2d.shape)
        weight_3d = weight_3d.unsqueeze(2).repeat(1, 1, time_dim, 1, 1)
        middle_idx = time_dim // 2
        weight_3d[:, :, middle_idx, :, :] = weight_2d
    else:
        weight_3d = weight_2d.unsqueeze(2).repeat(1, 1, time_dim, 1, 1)
        weight_3d = weight_3d / time_dim
    return weight_3d
def clip_b16(
    pretrained=True,
    clip_norm_type='l2',
    input_resolution=224,
    kernel_size=1,
    return_attn=False,
    center=True,
    clip_return_layer=1,
    clip_return_interval=1
):
    """Build the ViT-B/16 CLIP visual encoder and return it in eval mode.

    The architecture is fixed to patch size 16, width 768, 12 layers,
    12 heads and a 512-dim output; the remaining arguments are forwarded to
    ``VisionTransformer``.  With ``pretrained`` set, weights are read from
    ``_MODELS["ViT-B/16"]`` onto the CPU and loaded through the module-level
    ``load_state_dict`` helper, which receives ``center``.
    """
    patch = 16
    arch = dict(patch_size=patch, width=768, layers=12, heads=12, output_dim=512)
    model = VisionTransformer(
        input_resolution=input_resolution,
        clip_norm_type=clip_norm_type,
        kernel_size=kernel_size,
        return_attn=return_attn,
        clip_return_layer=clip_return_layer,
        clip_return_interval=clip_return_interval,
        **arch,
    )
    if not pretrained:
        return model.eval()

    print('load pretrained weights')
    weights = torch.load(_MODELS["ViT-B/16"], map_location='cpu')
    load_state_dict(
        model, weights,
        input_resolution=input_resolution, patch_size=patch, center=center,
    )
    return model.eval()
if __name__ == '__main__':
    # Smoke test: build the ViT-B/16 encoder and print the output shape for
    # one random 8-frame clip.
    import numpy as np

    # Seed every RNG used here so the run is repeatable.
    seed = 4217
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    num_frames = 8

    # This module defines clip_b16 / clip_l14 / clip_l14_336 only; the
    # previously referenced ``clip_ml_b16`` does not exist and raised NameError.
    model = clip_b16(pretrained=True, kernel_size=1, return_attn=False, clip_return_layer=1)
    # print(model)

    # Optional FLOP profiling (needs the third-party ``fvcore`` package, so
    # its imports live here rather than being required for the smoke test):
    # import time
    # from fvcore.nn import FlopCountAnalysis, flop_count_table
    # flops = FlopCountAnalysis(model, torch.rand(1, 3, num_frames, 224, 224))
    # s = time.time()
    # print(flop_count_table(flops, max_depth=1))
    # print(time.time()-s)
    print(model(torch.rand(1, 3, num_frames, 224, 224)).shape)
class Mlp(nn.Module):
    """Two-layer feed-forward network: ``fc1 -> activation -> fc2 -> dropout``.

    ``hidden_features`` and ``out_features`` fall back to ``in_features``
    when they are ``None`` (or otherwise falsy).
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        hidden_dim = hidden_features if hidden_features else in_features
        out_dim = out_features if out_features else in_features

        self.fc1 = nn.Linear(in_features, hidden_dim)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_dim, out_dim)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        hidden = self.act(self.fc1(x))
        # Dropout is applied only once, after the second projection; the
        # original author noted the post-activation dropout was removed to
        # follow the original BERT implementation.
        return self.drop(self.fc2(hidden))
class Block(nn.Module):
    """Pre-norm transformer block with optional learnable residual scaling.

    Computes ``x + drop_path(attn(norm1(x)))`` followed by
    ``x + drop_path(mlp(norm2(x)))``.  When ``init_values`` is a positive
    number, each branch is multiplied by a learnable per-channel vector
    (``gamma_1`` / ``gamma_2``) initialised to ``init_values``.

    Args:
        dim: Channel dimension of the tokens.
        num_heads: Number of attention heads.
        mlp_ratio: MLP hidden size as a multiple of ``dim``.
        qkv_bias, qk_scale, attn_drop, attn_head_dim: Forwarded to ``Attention``.
        drop: Dropout rate for the attention projection and the MLP.
        drop_path: Stochastic-depth rate; ``nn.Identity`` is used when it is
            not greater than 0.
        init_values: Initial value for the residual scales; ``None`` or a
            non-positive value disables them.
        act_layer: Activation class for the MLP.
        norm_layer: Normalisation layer factory.
    """

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm,
                 attn_head_dim=None):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
            attn_drop=attn_drop, proj_drop=drop, attn_head_dim=attn_head_dim)
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

        # ``init_values`` defaults to None, and callers in this package pass
        # None through; comparing None with ``> 0`` raises TypeError, so the
        # None case must be handled explicitly.
        if init_values is not None and init_values > 0:
            self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True)
            self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True)
        else:
            self.gamma_1, self.gamma_2 = None, None

    def forward(self, x):
        if self.gamma_1 is None:
            x = x + self.drop_path(self.attn(self.norm1(x)))
            x = x + self.drop_path(self.mlp(self.norm2(x)))
        else:
            # Scale each residual branch by its learnable per-channel gamma.
            x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x)))
            x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
        return x
# sin-cos position encoding
# https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/transformer/Models.py#L31
def get_sinusoid_encoding_table(n_position, d_hid, cur_frame=-1, pre_n_position=1568):
    """Build a sinusoidal position table, resized from the checkpoint layout if needed.

    A table with ``pre_n_position`` rows is generated first and interpreted
    as ``8 x P x P`` positions (8 temporal positions, square spatial grid).
    If ``cur_frame`` is given and the requested layout differs, the table is
    interpolated spatially (bicubic) and/or temporally (linear).

    Args:
        n_position: Number of positions required by the current model.
        d_hid: Embedding dimension.
        cur_frame: Number of temporal positions in the current model, or
            ``-1`` to skip all interpolation.
        pre_n_position: Number of positions in the checkpoint table.

    Returns:
        A float tensor whose leading dimension is 1.  It is a plain tensor
        when ``n_position == pre_n_position`` and an ``nn.Parameter`` with
        ``requires_grad=True`` otherwise.
    """
    ckpt_frames = 8  # temporal positions assumed for the checkpoint table

    # TODO: make it with torch instead of numpy
    def get_position_angle_vec(position):
        return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]

    # generate checkpoint position embedding
    sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(pre_n_position)])
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
    sinusoid_table = torch.tensor(sinusoid_table, dtype=torch.float, requires_grad=False).unsqueeze(0)

    print(f"n_position: {n_position}")
    print(f"pre_n_position: {pre_n_position}")

    if cur_frame != -1 and n_position // cur_frame * ckpt_frames != pre_n_position:
        T = ckpt_frames  # checkpoint frame
        # Derive the checkpoint grid side from the table size instead of
        # hard-coding 14: a 2048-row table is 8 x 16 x 16, and reshaping it
        # as 8 x 14 x 14 fails.
        P = int((pre_n_position // T) ** 0.5)  # checkpoint size
        C = d_hid
        new_P = int((n_position // cur_frame) ** 0.5)  # testing size
        print(f'Pretraining uses {P}x{P}, but current version is {new_P}x{new_P}')
        print(f'Interpolate the position embedding')
        sinusoid_table = sinusoid_table.reshape(-1, T, P, P, C)
        sinusoid_table = sinusoid_table.reshape(-1, P, P, C).permute(0, 3, 1, 2)
        sinusoid_table = torch.nn.functional.interpolate(
            sinusoid_table, size=(new_P, new_P), mode='bicubic', align_corners=False)
        # BT, C, H, W -> BT, H, W, C -> B, T, H, W, C
        sinusoid_table = sinusoid_table.permute(0, 2, 3, 1).reshape(-1, T, new_P, new_P, C)
        sinusoid_table = sinusoid_table.flatten(1, 3)  # B, THW, C

    if cur_frame != -1 and cur_frame != 8:
        print(f'Pretraining uses 8 frames, but current frame is {cur_frame}')
        print(f'Interpolate the position embedding')
        T = ckpt_frames  # checkpoint frame
        new_T = cur_frame  # testing frame
        # interpolate
        P = int((n_position // cur_frame) ** 0.5)  # testing size
        C = d_hid
        sinusoid_table = sinusoid_table.reshape(-1, T, P, P, C)
        sinusoid_table = sinusoid_table.permute(0, 2, 3, 4, 1).reshape(-1, C, T)  # BHW, C, T
        sinusoid_table = torch.nn.functional.interpolate(sinusoid_table, size=new_T, mode='linear')
        sinusoid_table = sinusoid_table.reshape(1, P, P, C, new_T).permute(0, 4, 1, 2, 3)  # B, T, H, W, C
        sinusoid_table = sinusoid_table.flatten(1, 3)  # B, THW, C

    if n_position == pre_n_position:
        return sinusoid_table
    else:
        print("Use learnable position embedding")
        return nn.Parameter(sinusoid_table, requires_grad=True)
= use_checkpoint self.checkpoint_num = checkpoint_num print(f'Use checkpoint: {use_checkpoint}') print(f'Checkpoint number: {checkpoint_num}') if use_learnable_pos_emb: self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) else: # sine-cosine positional embeddings is on the way if patch_size == 14: pre_n_position = 2048 else: pre_n_position = 1568 self.pos_embed = get_sinusoid_encoding_table( num_patches, embed_dim, all_frames // tubelet_size, pre_n_position=pre_n_position ) self.pos_drop = nn.Dropout(p=drop_rate) dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule self.blocks = nn.ModuleList([ Block( dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, init_values=init_values) for i in range(depth)]) self.norm = nn.Identity() if use_mean_pooling else norm_layer(embed_dim) self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None self.fc_dropout = nn.Dropout(p=fc_drop_rate) if fc_drop_rate > 0 else nn.Identity() self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() if use_learnable_pos_emb: trunc_normal_(self.pos_embed, std=.02) trunc_normal_(self.head.weight, std=.02) self.apply(self._init_weights) self.head.weight.data.mul_(init_scale) self.head.bias.data.mul_(init_scale) def _init_weights(self, m): if isinstance(m, nn.Linear): trunc_normal_(m.weight, std=.02) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) def get_num_layers(self): return len(self.blocks) @torch.jit.ignore def no_weight_decay(self): return {'pos_embed', 'cls_token'} def get_classifier(self): return self.head def reset_classifier(self, num_classes, global_pool=''): self.num_classes = num_classes self.head = nn.Linear(self.embed_dim, num_classes) if 
@register_model
def vit_large_patch16_224(pretrained=False, **kwargs):
    """Build the large ViT: patch 16, width 1024, depth 24, 16 heads.

    ``pretrained`` is accepted but not used by this factory.  The
    ``pretrained_cfg`` / ``pretrained_cfg_overlay`` entries are removed from
    ``kwargs`` before construction; the remaining ``kwargs`` are forwarded
    to ``VisionTransformer``.
    """
    # added by Ziqi to accommodate timm=0.9.12
    for timm_only_key in ('pretrained_cfg', 'pretrained_cfg_overlay'):
        kwargs.pop(timm_only_key, None)

    layer_norm = partial(nn.LayerNorm, eps=1e-6)
    model = VisionTransformer(
        patch_size=16,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        mlp_ratio=4,
        qkv_bias=True,
        norm_layer=layer_norm,
        **kwargs,
    )
    model.default_cfg = _cfg()
    return model
def trunc_normal_(tensor, mean=0., std=1.):
    """Initialise ``tensor`` in place via timm's ``trunc_normal_``.

    The truncation bounds passed on are ``a=-std`` and ``b=std``; they do
    not depend on ``mean``.  Nothing is returned.
    """
    limits = {"a": -std, "b": std}
    __call_trunc_normal_(tensor, mean=mean, std=std, **limits)
class PretrainVisionTransformerDecoder(nn.Module):
    """Transformer decoder that maps tokens to per-patch pixel predictions.

    A stack of ``Block`` layers followed by a norm layer and a linear head
    of size ``num_classes``, which must equal
    ``3 * tubelet_size * patch_size ** 2``.  ``num_patches`` is accepted but
    not used inside this class.
    """

    def __init__(self, patch_size=16, num_classes=768, embed_dim=768, depth=12,
                 num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
                 drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None, num_patches=196, tubelet_size=2,
                 use_checkpoint=False
                 ):
        super().__init__()
        self.num_classes = num_classes
        assert num_classes == 3 * tubelet_size * patch_size ** 2
        self.embed_dim = embed_dim
        self.num_features = embed_dim  # num_features for consistency with other models
        self.patch_size = patch_size
        self.use_checkpoint = use_checkpoint

        # stochastic depth decay rule: rate grows linearly with block index
        drop_path_rates = torch.linspace(0, drop_path_rate, depth).tolist()
        layers = []
        for layer_idx in range(depth):
            layers.append(Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=drop_path_rates[layer_idx],
                norm_layer=norm_layer, init_values=init_values))
        self.blocks = nn.ModuleList(layers)

        self.norm = norm_layer(embed_dim)
        if num_classes > 0:
            self.head = nn.Linear(embed_dim, num_classes)
        else:
            self.head = nn.Identity()

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Xavier-init linear weights, zero their biases, reset LayerNorm to (1, 0)."""
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
            return
        if isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def get_num_layers(self):
        return len(self.blocks)

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'pos_embed', 'cls_token'}

    def get_classifier(self):
        return self.head

    def reset_classifier(self, num_classes, global_pool=''):
        """Replace the head with a fresh linear layer (or identity if ``num_classes <= 0``)."""
        self.num_classes = num_classes
        if num_classes > 0:
            self.head = nn.Linear(self.embed_dim, num_classes)
        else:
            self.head = nn.Identity()

    def forward(self, x, return_token_num):
        """Decode ``x``; keep only the last ``return_token_num`` tokens when it is positive."""
        for blk in self.blocks:
            x = checkpoint.checkpoint(blk, x) if self.use_checkpoint else blk(x)

        # only return the mask tokens predict pixels
        selected = x[:, -return_token_num:] if return_token_num > 0 else x
        return self.head(self.norm(selected))
def _build_pretrain_videomae(pretrained, kwargs, **arch):
    """Shared builder for the ``pretrain_videomae_*`` factories.

    ``arch`` holds the size-specific constructor arguments.  ``init_ckpt``
    is taken out of ``kwargs`` before the model is built, because
    ``PretrainVisionTransformer.__init__`` has no such parameter and would
    raise TypeError if it were forwarded.

    Raises:
        KeyError: if ``pretrained`` is true but no ``init_ckpt`` was given.
    """
    init_ckpt = kwargs.pop("init_ckpt", None)
    # The ViT factory in this package drops these timm-injected keys for
    # timm>=0.9; the pretraining model does not accept them either.
    kwargs.pop("pretrained_cfg", None)
    kwargs.pop("pretrained_cfg_overlay", None)
    if pretrained and init_ckpt is None:
        raise KeyError("pretrained=True requires an 'init_ckpt' keyword argument (checkpoint path)")

    model = PretrainVisionTransformer(
        img_size=224,
        patch_size=16,
        encoder_num_classes=0,
        decoder_num_classes=1536,
        mlp_ratio=4,
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        **arch,
        **kwargs)
    model.default_cfg = _cfg()
    if pretrained:
        # NOTE(security): torch.load unpickles the file; only load
        # checkpoints from a trusted source.
        state = torch.load(init_ckpt, map_location="cpu")
        model.load_state_dict(state["model"])
    return model


@register_model
def pretrain_videomae_base_patch16_224(pretrained=False, **kwargs):
    """VideoMAE base: encoder 768 wide / 12 deep / 12 heads, decoder 384 wide / 6 heads.

    With ``pretrained=True`` the weights are loaded from the ``"model"``
    entry of the checkpoint at ``kwargs["init_ckpt"]``.
    """
    return _build_pretrain_videomae(
        pretrained, kwargs,
        encoder_embed_dim=768,
        encoder_depth=12,
        encoder_num_heads=12,
        decoder_embed_dim=384,
        decoder_num_heads=6,
    )


@register_model
def pretrain_videomae_large_patch16_224(pretrained=False, **kwargs):
    """VideoMAE large: encoder 1024 wide / 24 deep / 16 heads, decoder 512 wide / 8 heads.

    With ``pretrained=True`` the weights are loaded from the ``"model"``
    entry of the checkpoint at ``kwargs["init_ckpt"]``.
    """
    return _build_pretrain_videomae(
        pretrained, kwargs,
        encoder_embed_dim=1024,
        encoder_depth=24,
        encoder_num_heads=16,
        decoder_embed_dim=512,
        decoder_num_heads=8,
    )


@register_model
def pretrain_videomae_huge_patch16_224(pretrained=False, **kwargs):
    """VideoMAE huge: encoder 1280 wide / 32 deep / 16 heads, decoder 640 wide / 8 heads.

    With ``pretrained=True`` the weights are loaded from the ``"model"``
    entry of the checkpoint at ``kwargs["init_ckpt"]``.
    """
    return _build_pretrain_videomae(
        pretrained, kwargs,
        encoder_embed_dim=1280,
        encoder_depth=32,
        encoder_num_heads=16,
        decoder_embed_dim=640,
        decoder_num_heads=8,
    )
sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)]) sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 return torch.tensor(sinusoid_table, dtype=torch.float, requires_grad=False).unsqueeze(0) class PretrainVisionTransformerEncoder(nn.Module): """ Vision Transformer with support for patch or hybrid CNN input stage """ def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=0, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None, num_frames=16, tubelet_size=2, use_checkpoint=False, checkpoint_num=0, use_learnable_pos_emb=False, clip_return_layer=1, clip_student_return_interval=1): super().__init__() self.num_classes = num_classes self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models self.patch_embed = PatchEmbed( img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, num_frames=num_frames, tubelet_size=tubelet_size ) num_patches = self.patch_embed.num_patches self.use_checkpoint = use_checkpoint self.checkpoint_num = checkpoint_num print(f'Use checkpoint: {use_checkpoint}') print(f'Checkpoint number: {checkpoint_num}') self.return_index = [] for i in range(clip_return_layer): self.return_index.append(depth - int(i * clip_student_return_interval) - 1) print(f'Student return index: {self.return_index}') self.use_learnable_pos_emb = use_learnable_pos_emb if use_learnable_pos_emb: print('Use learnable position embedding') self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) else: # sine-cosine positional embeddings self.pos_embed = get_sinusoid_encoding_table(num_patches, embed_dim) dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule self.blocks = nn.ModuleList([ Block( 
dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, init_values=init_values) for i in range(depth)]) self.norm = norm_layer(embed_dim) self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() if use_learnable_pos_emb: trunc_normal_(self.pos_embed, std=.02) self.apply(self._init_weights) def _init_weights(self, m): if isinstance(m, nn.Linear): nn.init.xavier_uniform_(m.weight) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) def get_num_layers(self): return len(self.blocks) @torch.jit.ignore def no_weight_decay(self): return {'pos_embed', 'cls_token'} def get_classifier(self): return self.head def reset_classifier(self, num_classes, global_pool=''): self.num_classes = num_classes self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() def forward_features(self, x, mask): x = self.patch_embed(x) if self.use_learnable_pos_emb: x = x + self.pos_embed.type_as(x).to(x.device) else: x = x + self.pos_embed.type_as(x).to(x.device).clone().detach() B, _, C = x.shape x_vis = x[~mask].reshape(B, -1, C) # ~mask means visible x_clip_vis = [] for idx, blk in enumerate(self.blocks): if self.use_checkpoint and idx < self.checkpoint_num: x_vis = checkpoint.checkpoint(blk, x_vis) else: x_vis = blk(x_vis) if idx in self.return_index: x_clip_vis.append(x_vis) x_vis = self.norm(x_vis) x_clip_vis = self.norm(torch.stack(x_clip_vis)) return x_vis, x_clip_vis def forward(self, x, mask): x, x_clip_vis = self.forward_features(x, mask) x = self.head(x) x_clip_vis = self.head(x_clip_vis) return x_clip_vis class Linear_Decoder(nn.Module): def __init__(self, num_classes=768, embed_dim=768, norm_layer=nn.LayerNorm, clip_norm_type='l2'): super().__init__() self.clip_norm_type = 
class PretrainVisionTransformer(nn.Module):
    """Masked video encoder followed by one linear CLIP-alignment head per returned layer.

    The model wraps a ``PretrainVisionTransformerEncoder`` and builds
    ``clip_return_layer`` ``Linear_Decoder`` heads. ``forward`` adds a fixed
    sinusoidal position embedding to the encoder outputs of the visible tokens
    and projects each returned layer with its own head.
    """
    def __init__(self,
                 img_size=224,
                 patch_size=16,
                 encoder_in_chans=3,
                 encoder_num_classes=0,
                 encoder_embed_dim=768,
                 encoder_depth=12,
                 encoder_num_heads=12,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 norm_layer=nn.LayerNorm,
                 init_values=0.,
                 use_learnable_pos_emb=False,
                 use_checkpoint=False,
                 checkpoint_num=0,
                 num_frames=16,
                 tubelet_size=2,
                 # clip,
                 clip_decoder_embed_dim=768,
                 clip_output_dim=512,
                 clip_norm_type='l2',
                 clip_return_layer=1,
                 clip_student_return_interval=1,
                 ):
        super().__init__()

        self.encoder = PretrainVisionTransformerEncoder(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=encoder_in_chans,
            num_classes=encoder_num_classes,
            embed_dim=encoder_embed_dim,
            depth=encoder_depth,
            num_heads=encoder_num_heads,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop_rate=drop_rate,
            attn_drop_rate=attn_drop_rate,
            drop_path_rate=drop_path_rate,
            norm_layer=norm_layer,
            init_values=init_values,
            num_frames=num_frames,
            tubelet_size=tubelet_size,
            use_checkpoint=use_checkpoint,
            checkpoint_num=checkpoint_num,
            use_learnable_pos_emb=use_learnable_pos_emb,
            clip_return_layer=clip_return_layer,
            clip_student_return_interval=clip_student_return_interval
        )

        # CLIP decoder: one independent linear head per returned encoder layer.
        self.clip_decoder = nn.ModuleList([
            Linear_Decoder(
                num_classes=clip_output_dim,
                embed_dim=clip_decoder_embed_dim,
                norm_layer=norm_layer,
                clip_norm_type=clip_norm_type
            ) for _ in range(clip_return_layer)
        ])

        # Kept as a plain tensor attribute (not a parameter or buffer) so the
        # set of state_dict keys stays exactly what existing checkpoints hold.
        self.clip_pos_embed = get_sinusoid_encoding_table(self.encoder.patch_embed.num_patches, clip_decoder_embed_dim)

    def _init_weights(self, m):
        """Xavier-initialise linear layers and reset LayerNorm to identity."""
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def get_num_layers(self):
        """Return the number of transformer blocks in the wrapped encoder."""
        # The blocks live on the encoder; this wrapper has no `blocks`
        # attribute of its own, so `len(self.blocks)` always raised.
        return self.encoder.get_num_layers()

    @torch.jit.ignore
    def no_weight_decay(self):
        """Names of tensors that should be excluded from weight decay."""
        return {'pos_embed', 'cls_token', 'mask_token', 'clip_mask_token', 'clip_pos_embed'}

    def forward(self, x, mask):
        """Encode the visible tokens and project every returned layer.

        Args:
            x: input clip, passed unchanged to the encoder.
            mask: boolean mask over patch tokens; ``~mask`` selects the visible
                ones (same convention as the encoder).

        Returns:
            Tensor stacking the output of each CLIP head along dim 0.
        """
        x_clip_vis = self.encoder(x, mask)  # [K, B, N_vis, C_e]: K stacked encoder layers

        # align CLIP
        K, B, _, C_CLIP = x_clip_vis.shape
        expand_clip_pos_embed = self.clip_pos_embed.repeat(B, 1, 1).type_as(x).to(x.device).clone().detach()
        # NOTE(review): the view below presumes clip_decoder_embed_dim equals the
        # encoder output width; a mismatch would reshape silently - confirm at call sites.
        clip_pos_emd_vis = expand_clip_pos_embed[~mask].view(B, -1, C_CLIP).unsqueeze(0).repeat(K, 1, 1, 1)
        x_clip_full = x_clip_vis + clip_pos_emd_vis  # [K, B, N, C_d_clip]

        # align and normalize: the i-th head consumes the i-th returned layer
        x_clip = torch.stack([
            clip_decoder(x_clip_full[idx]) for idx, clip_decoder in enumerate(self.clip_decoder)
        ])

        return x_clip
encoder_depth=24, encoder_num_heads=16, encoder_num_classes=0, mlp_ratio=4, qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) model.default_cfg = _cfg() if pretrained: checkpoint = torch.load( kwargs["init_ckpt"], map_location="cpu" ) model.load_state_dict(checkpoint["model"]) return model if __name__ == '__main__': import time from fvcore.nn import FlopCountAnalysis from fvcore.nn import flop_count_table import numpy as np seed = 4217 np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed(seed) torch.cuda.manual_seed_all(seed) model = pretrain_umt_base_patch16_224() # flops = FlopCountAnalysis(model, torch.rand(1, 3, 16, 224, 224)) # s = time.time() # print(flop_count_table(flops, max_depth=1)) # print(time.time()-s) mask = torch.cat([ torch.ones(1, 8 * int(14 * 14 * 0.75)), torch.zeros(1, 8 * int(14 * 14 * 0.25)), ], dim=-1).to(torch.bool) print(model(torch.rand(1, 3, 16, 224, 224), mask)[1].shape) ================================================ FILE: Open-Sora/build/lib/vbench/utils.py ================================================ import os import json import numpy as np import logging import subprocess import torch import re from pathlib import Path from PIL import Image, ImageSequence from decord import VideoReader, cpu from torchvision import transforms from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, ToPILImage try: from torchvision.transforms import InterpolationMode BICUBIC = InterpolationMode.BICUBIC BILINEAR = InterpolationMode.BILINEAR except ImportError: BICUBIC = Image.BICUBIC BILINEAR = Image.BILINEAR CACHE_DIR = os.environ.get('VBENCH_CACHE_DIR') if CACHE_DIR is None: CACHE_DIR = os.path.join(os.path.expanduser('~'), '.cache', 'vbench') logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) def clip_transform(n_px): return Compose([ Resize(n_px, interpolation=BICUBIC), CenterCrop(n_px), 
def get_frame_indices(num_frames, vlen, sample='rand', fix_start=None, input_fps=1, max_num_frames=-1):
    """Choose which frame indices to read from a video of ``vlen`` frames.

    Args:
        num_frames: number of indices wanted for the ``'rand'``/``'middle'`` modes.
        vlen: number of frames available in the video.
        sample: ``'rand'`` (one random frame per interval), ``'middle'`` (the
            centre of each interval) or ``'fps<rate>'`` (e.g. ``'fps0.5'``,
            sequential sampling at that output frame rate).
        fix_start: for ``'middle'``, when not None, offset added to the start
            of every interval instead of taking its centre.
        input_fps: frame rate of the source video (``'fps'`` mode only).
        max_num_frames: cap on the number of indices in ``'fps'`` mode; values
            ``<= 0`` disable the cap.

    Returns:
        A list of frame indices. In the interval modes it is padded with the
        last index up to ``num_frames`` when the video is shorter than that.

    Raises:
        ValueError: if ``sample`` is not one of the supported modes.
    """
    # `random` is not imported at the top of this module. The original bare
    # `except:` swallowed the resulting NameError, so 'rand' always used the
    # fallback permutation instead of per-interval sampling.
    import random

    if sample in ["rand", "middle"]:  # uniform sampling
        acc_samples = min(num_frames, vlen)
        # split the video into `acc_samples` intervals, and sample from each interval.
        intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
        ranges = [(start, stop - 1) for start, stop in zip(intervals[:-1], intervals[1:])]
        if sample == 'rand':
            try:
                frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
            except IndexError:
                # An interval was too short to draw from (empty range):
                # fall back to a sorted random subset of all frames.
                frame_indices = np.random.permutation(vlen)[:acc_samples]
                frame_indices.sort()
                frame_indices = list(frame_indices)
        elif fix_start is not None:
            frame_indices = [x[0] + fix_start for x in ranges]
        elif sample == 'middle':
            frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
        else:
            raise NotImplementedError

        if len(frame_indices) < num_frames:  # padded with last frame
            padded_frame_indices = [frame_indices[-1]] * num_frames
            padded_frame_indices[:len(frame_indices)] = frame_indices
            frame_indices = padded_frame_indices
    elif "fps" in sample:  # fps0.5, sequentially sample frames at 0.5 fps
        output_fps = float(sample[3:])
        duration = float(vlen) / input_fps
        delta = 1 / output_fps  # gap between frames, this is also the clip length each frame represents
        frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
        frame_indices = np.around(frame_seconds * input_fps).astype(int)
        frame_indices = [e for e in frame_indices if e < vlen]
        if max_num_frames > 0 and len(frame_indices) > max_num_frames:
            frame_indices = frame_indices[:max_num_frames]
    else:
        raise ValueError
    return frame_indices
def read_frames_decord_by_fps(
        video_path, sample_fps=2, sample='rand', fix_start=None,
        max_num_frames=-1, trimmed30=False, num_frames=8
    ):
    """Decode a sampled subset of frames from a video with decord.

    The indices are chosen by ``get_frame_indices`` using the reader's average
    fps as ``input_fps``. When ``trimmed30`` is set and the video is longer
    than 30 seconds, only its first 30 seconds are considered for sampling.
    ``sample_fps`` is accepted but not used by this function.

    Returns:
        The decoded batch permuted from (T, H, W, C) to (T, C, H, W).
    """
    import decord
    decord.bridge.set_bridge("torch")

    reader = VideoReader(video_path, num_threads=1)
    total_frames = len(reader)
    avg_fps = reader.get_avg_fps()

    # Restrict the sampling window to the first 30 seconds if requested.
    if trimmed30 and total_frames / float(avg_fps) > 30:
        total_frames = int(30 * float(avg_fps))

    picked = get_frame_indices(
        num_frames, total_frames, sample=sample, fix_start=fix_start,
        input_fps=avg_fps, max_num_frames=max_num_frames
    )
    batch = reader.get_batch(picked)  # (T, H, W, C), torch.uint8
    return batch.permute(0, 3, 1, 2)  # (T, C, H, W), torch.uint8
def init_submodules(dimension_list, local=False, read_frame=False):
    """Resolve (and download when missing) the model assets for each dimension.

    Parameters:
    - dimension_list (list): Names of the evaluation dimensions to prepare.
    - local (bool): When True, dimensions that can load from a named hub/CLIP
      model use checkpoints under CACHE_DIR instead, fetching them if absent.
    - read_frame (bool): Stored unchanged in the entries for
      'background_consistency' and 'subject_consistency'.

    Returns:
    - submodules_dict (dict): Maps each recognised dimension to a list or dict
      describing its model files. A dimension that matches no branch below gets
      no entry.

    Missing files are fetched by running external `wget`, `git` and `unzip`
    commands through subprocess.
    """
    submodules_dict = {}
    if local:
        logger.info("\x1b[32m[Local Mode]\x1b[0m Working in local mode, please make sure that the pre-trained model has been fully downloaded.")
    for dimension in dimension_list:
        os.makedirs(CACHE_DIR, exist_ok=True)
        if dimension == 'background_consistency':
            # read_frame = False
            if local:
                vit_b_path = f'{CACHE_DIR}/clip_model/ViT-B-32.pt'
                if not os.path.isfile(vit_b_path):
                    wget_command = ['wget', 'https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt', '-P', os.path.dirname(vit_b_path)]
                    subprocess.run(wget_command, check=True)
            else:
                # Not local: pass the model name instead of a checkpoint path.
                vit_b_path = 'ViT-B/32'
            submodules_dict[dimension] = [vit_b_path, read_frame]
        elif dimension == 'human_action':
            umt_path = f'{CACHE_DIR}/umt_model/l16_ptk710_ftk710_ftk400_f16_res224.pth'
            if not os.path.isfile(umt_path):
                wget_command = ['wget', 'https://pjlab-gvm-data.oss-cn-shanghai.aliyuncs.com/umt/single_modality/l16_ptk710_ftk710_ftk400_f16_res224.pth', '-P', os.path.dirname(umt_path)]
                subprocess.run(wget_command, check=True)
            submodules_dict[dimension] = [umt_path,]
        elif dimension == 'temporal_flickering':
            # No model assets are registered for this dimension.
            submodules_dict[dimension] = []
        elif dimension == 'motion_smoothness':
            CUR_DIR = os.path.dirname(os.path.abspath(__file__))
            submodules_dict[dimension] = {
                    'config': f'{CUR_DIR}/third_party/amt/cfgs/AMT-S.yaml',
                    'ckpt': f'{CACHE_DIR}/amt_model/amt-s.pth'
                }
            details = submodules_dict[dimension]
            # Check if the file exists, if not, download it with wget
            if not os.path.isfile(details['ckpt']):
                print(f"File {details['ckpt']} does not exist. Downloading...")
                wget_command = ['wget', '-P', os.path.dirname(details['ckpt']),
                                'https://huggingface.co/lalala125/AMT/resolve/main/amt-s.pth']
                subprocess.run(wget_command, check=True)

        elif dimension == 'dynamic_degree':
            submodules_dict[dimension] = {
                'model': f'{CACHE_DIR}/raft_model/models/raft-things.pth'
            }
            details = submodules_dict[dimension]
            if not os.path.isfile(details['model']):
                # raise NotImplementedError
                print(f"File {details['model']} does not exist. Downloading...")
                wget_command = ['wget', '-P', f'{CACHE_DIR}/raft_model/', 'https://dl.dropboxusercontent.com/s/4j4z58wuv8o0mfz/models.zip']
                unzip_command = ['unzip', '-d', f'{CACHE_DIR}/raft_model/', f'{CACHE_DIR}/raft_model/models.zip']
                remove_command = ['rm', '-r', f'{CACHE_DIR}/raft_model/models.zip']
                # Best effort: a failed download/unzip is only reported, so the
                # entry may point at a file that still does not exist.
                try:
                    subprocess.run(wget_command, check=True)
                    subprocess.run(unzip_command, check=True)
                    subprocess.run(remove_command, check=True)
                except subprocess.CalledProcessError as err:
                    print(f"Error during downloading RAFT model: {err}")
        # Assign the DINO model path for subject consistency dimension
        elif dimension == 'subject_consistency':
            if local:
                submodules_dict[dimension] = {
                    'repo_or_dir': f'{CACHE_DIR}/dino_model/facebookresearch_dino_main/',
                    'path': f'{CACHE_DIR}/dino_model/dino_vitbase16_pretrain.pth',
                    'model': 'dino_vitb16',
                    'source': 'local',
                    'read_frame': read_frame
                    }
                details = submodules_dict[dimension]
                # Check if the file exists, if not, download it with wget
                if not os.path.isdir(details['repo_or_dir']):
                    print(f"Directory {details['repo_or_dir']} does not exist. Cloning repository...")
                    subprocess.run(['git', 'clone', 'https://github.com/facebookresearch/dino', details['repo_or_dir']], check=True)

                if not os.path.isfile(details['path']):
                    print(f"File {details['path']} does not exist. Downloading...")
                    wget_command = ['wget', '-P', os.path.dirname(details['path']),
                                    'https://dl.fbaipublicfiles.com/dino/dino_vitbase16_pretrain/dino_vitbase16_pretrain.pth']
                    subprocess.run(wget_command, check=True)
            else:
                submodules_dict[dimension] = {
                    'repo_or_dir':'facebookresearch/dino:main',
                    'source':'github',
                    'model': 'dino_vitb16',
                    'read_frame': read_frame
                    }
        elif dimension == 'aesthetic_quality':
            # aes_path is only recorded here; nothing is downloaded for it in this function.
            aes_path = f'{CACHE_DIR}/aesthetic_model/emb_reader'
            if local:
                vit_l_path = f'{CACHE_DIR}/clip_model/ViT-L-14.pt'
                if not os.path.isfile(vit_l_path):
                    wget_command = ['wget' ,'https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt', '-P', os.path.dirname(vit_l_path)]
                    subprocess.run(wget_command, check=True)
            else:
                vit_l_path = 'ViT-L/14'
            submodules_dict[dimension] = [vit_l_path, aes_path]
        elif dimension == 'imaging_quality':
            musiq_spaq_path = f'{CACHE_DIR}/pyiqa_model/musiq_spaq_ckpt-358bb6af.pth'
            if not os.path.isfile(musiq_spaq_path):
                wget_command = ['wget', 'https://github.com/chaofengc/IQA-PyTorch/releases/download/v0.1-weights/musiq_spaq_ckpt-358bb6af.pth', '-P', os.path.dirname(musiq_spaq_path)]
                subprocess.run(wget_command, check=True)
            submodules_dict[dimension] = {'model_path': musiq_spaq_path}
        elif dimension in ["object_class", "multiple_objects", "color", "spatial_relationship" ]:
            # These four dimensions share the same checkpoint path.
            submodules_dict[dimension] = {
                "model_weight": f'{CACHE_DIR}/grit_model/grit_b_densecap_objectdet.pth'
            }
            if not os.path.exists(submodules_dict[dimension]['model_weight']):
                wget_command = ['wget', 'https://datarelease.blob.core.windows.net/grit/models/grit_b_densecap_objectdet.pth', '-P', os.path.dirname(submodules_dict[dimension]["model_weight"])]
                subprocess.run(wget_command, check=True)
        elif dimension == 'scene':
            submodules_dict[dimension] = {
                "pretrained": f'{CACHE_DIR}/caption_model/tag2text_swin_14m.pth',
                "image_size":384,
                "vit":"swin_b"
            }
            if not os.path.exists(submodules_dict[dimension]['pretrained']):
                wget_command = ['wget', 'https://huggingface.co/spaces/xinyu1205/recognize-anything/resolve/main/tag2text_swin_14m.pth', '-P', os.path.dirname(submodules_dict[dimension]["pretrained"])]
                subprocess.run(wget_command, check=True)
        elif dimension == 'appearance_style':
            if local:
                submodules_dict[dimension] = {"name": f'{CACHE_DIR}/clip_model/ViT-B-32.pt'}
                if not os.path.isfile(submodules_dict[dimension]["name"]):
                    wget_command = ['wget', 'https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt', '-P', os.path.dirname(submodules_dict[dimension]["name"])]
                    subprocess.run(wget_command, check=True)
            else:
                submodules_dict[dimension] = {"name": 'ViT-B/32'}
        elif dimension in ["temporal_style", "overall_consistency"]:
            submodules_dict[dimension] = {
                "pretrain": f'{CACHE_DIR}/ViCLIP/ViClip-InternVid-10M-FLT.pth',
            }
            if not os.path.exists(submodules_dict[dimension]['pretrain']):
                wget_command = ['wget', 'https://pjlab-gvm-data.oss-cn-shanghai.aliyuncs.com/internvideo/viclip/ViClip-InternVid-10M-FLT.pth', '-P', os.path.dirname(submodules_dict[dimension]["pretrain"])]
                subprocess.run(wget_command, check=True)
    return submodules_dict
Returns: - data (dict or list): The data loaded from the JSON file, which could be a dictionary or a list. """ with open(path, 'r', encoding='utf-8') as f: return json.load(f) ================================================ FILE: Open-Sora/build/lib/vbench2_beta_i2v/__init__.py ================================================ import os from vbench2_beta_i2v.utils import init_submodules, save_json, load_json from vbench import VBench import importlib class VBenchI2V(VBench): def build_full_dimension_list(self, ): return ["subject_consistency", "background_consistency", "aesthetic_quality", "imaging_quality", "object_class", "multiple_objects", "color", "spatial_relationship", "scene", "temporal_style", 'overall_consistency', "human_action", "temporal_flickering", "motion_smoothness", "dynamic_degree", "appearance_style", "i2v_subject", "i2v_background", "camera_motion"] def evaluate(self, videos_path, name, dimension_list=None, local=False, read_frame=False, custom_prompt=False, resolution="1-1"): results_dict = {} if dimension_list is None: dimension_list = self.build_full_dimension_list() submodules_dict = init_submodules(dimension_list, local=local, read_frame=read_frame, resolution=resolution) # print('BEFORE BUILDING') cur_full_info_path = self.build_full_info_json(videos_path, name, dimension_list, custom_prompt=custom_prompt) # print('AFTER BUILDING') for dimension in dimension_list: try: dimension_module = importlib.import_module(f'vbench2_beta_i2v.{dimension}') evaluate_func = getattr(dimension_module, f'compute_{dimension}') except Exception as e: raise NotImplementedError(f'UnImplemented dimension {dimension}!, {e}') submodules_list = submodules_dict[dimension] print(f'cur_full_info_path: {cur_full_info_path}') # TODO: to delete results = evaluate_func(cur_full_info_path, self.device, submodules_list) results_dict[dimension] = results output_name = os.path.join(self.output_path, name+'_eval_results.json') save_json(results_dict, output_name) 
class CameraPredict:
    """Classify the camera motion of a video from tracked grid points.

    A tracker loaded through ``torch.hub`` follows a ``grid_size x grid_size``
    grid of points. The displacement of a few points on the middle of each
    image edge between the first and the last frame is turned into a direction
    label per edge, and pairs of opposite edges are mapped to motion names
    such as ``pan_left`` or ``zoom_in``.
    """

    # Pairings that mean the same for the top/down and the left/right comparison.
    _SHARED_MOTIONS = {
        "left_left": "pan_right",
        "right_right": "pan_left",
        "down_down": "tilt_up",
        "up_up": "tilt_down",
        "static_static": "static",
    }
    # Zoom shows up as opposite motion on the two compared edges.
    _TOP_DOWN_MOTIONS = {**_SHARED_MOTIONS, "up_down": "zoom_in", "down_up": "zoom_out"}
    _LEFT_RIGHT_MOTIONS = {**_SHARED_MOTIONS, "left_right": "zoom_in", "right_left": "zoom_out"}

    def __init__(self, device, submodules_list):
        """Load the tracking model named by ``submodules_list['repo'/'model']``."""
        self.device = device
        self.grid_size = 10
        try:
            self.model = torch.hub.load(submodules_list["repo"], submodules_list["model"]).to(self.device)
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C / SystemExit are not retried.
            # workaround for CERTIFICATE_VERIFY_FAILED (see: https://github.com/pytorch/pytorch/issues/33288#issuecomment-954160699)
            # NOTE(security): this disables HTTPS certificate verification for the whole process.
            import ssl
            ssl._create_default_https_context = ssl._create_unverified_context
            self.model = torch.hub.load(submodules_list["repo"], submodules_list["model"]).to(self.device)

    def infer(self, video_path, save_video=False, save_dir="./saved_videos"):
        """Track the grid points through the video.

        Also stores ``self.scale`` (the shorter frame side), which
        ``get_edge_direction`` uses as the motion threshold base.

        Returns:
            The predicted tracks of the first batch item as an integer numpy array.
        """
        # load video
        video = load_video(video_path, return_tensor=False)
        # set scale
        height, width = video.shape[1], video.shape[2]
        self.scale = min(height, width)
        video = torch.from_numpy(video).permute(0, 3, 1, 2)[None].float().to(self.device)  # B T C H W
        # Pure inference: no autograd graph is needed for the tracker output.
        with torch.no_grad():
            pred_tracks, pred_visibility = self.model(video, grid_size=self.grid_size)  # B T N 2,  B T N 1

        if save_video:
            video_name = os.path.basename(video_path)[:-4]
            vis = Visualizer(save_dir=save_dir, pad_value=120, linewidth=3)
            vis.visualize(video, pred_tracks, pred_visibility, filename=video_name)

        return pred_tracks[0].long().detach().cpu().numpy()

    def get_edge_point(self, track):
        """Return four points around the middle of each grid edge: (top, down, left, right)."""
        middle = self.grid_size // 2
        last = self.grid_size - 1
        picks = range(middle - 2, middle + 2)
        top = [list(track[0, i, :]) for i in picks]
        down = [list(track[last, i, :]) for i in picks]
        left = [list(track[i, 0, :]) for i in picks]
        right = [list(track[i, last, :]) for i in picks]
        return top, down, left, right

    def get_edge_direction(self, track1, track2):
        """Classify the mean displacement of every edge between two track grids."""
        edge_points1 = self.get_edge_point(track1)
        edge_points2 = self.get_edge_point(track2)

        vector_results = []
        for points1, points2 in zip(edge_points1, edge_points2):
            vectors = [[end[0] - start[0], end[1] - start[1]] for start, end in zip(points1, points2)]
            vector_results.append(vectors)
        vector_results = list(map(transform, vector_results))
        class_results = [transform_class(vector, min_reso=self.scale) for vector in vector_results]

        return class_results

    @staticmethod
    def _classify_pairs(first, second, mapping):
        """Map every ``first_second`` direction pair through ``mapping``; ``["None"]`` if none match."""
        classes = [f"{item_a}_{item_b}" for item_a in first for item_b in second]
        results = [mapping[cls] for cls in classes if cls in mapping]
        return results if results else ["None"]

    def classify_top_down(self, top, down):
        """Motions implied by the top and bottom edge directions."""
        return self._classify_pairs(top, down, self._TOP_DOWN_MOTIONS)

    def classify_left_right(self, left, right):
        """Motions implied by the left and right edge directions."""
        return self._classify_pairs(left, right, self._LEFT_RIGHT_MOTIONS)

    def camera_classify(self, track1, track2):
        """Combine both edge comparisons into a de-duplicated list of motions."""
        top, down, left, right = self.get_edge_direction(track1, track2)

        top_results = self.classify_top_down(top, down)
        left_results = self.classify_left_right(left, right)

        results = list(set(top_results + left_results))
        # "static" and "None" only survive when nothing else was detected.
        if "static" in results and len(results) > 1:
            results.remove("static")
        if "None" in results and len(results) > 1:
            results.remove("None")

        return results

    def predict(self, video_path):
        """Return the motion labels obtained from the first and last frame of the video."""
        pred_track = self.infer(video_path)
        track1 = pred_track[0].reshape((self.grid_size, self.grid_size, 2))
        track2 = pred_track[-1].reshape((self.grid_size, self.grid_size, 2))
        results = self.camera_classify(track1, track2)

        return results
def get_other_ratio_crop(second_crop_info, ratio="8-5"):
    """Pick a crop box of the requested aspect ratio that contains the existing crop.

    Args:
        second_crop_info: dict with the image ``'width'`` and ``'height'`` and
            ``'second_bbox'`` as ``[x, y, crop_w, crop_h]``.
        ratio: target aspect ratio written as ``"W-H"``; ``W/H`` must lie in
            ``[1.0, 1.7778)``.

    Returns:
        ``[target_x, target_y, target_w, target_h]``. For a square image the
        box spans the width and its vertical offset is drawn at random;
        otherwise it spans the height and the horizontal offset is drawn.
    """
    # Private, fixed-seed generator: produces the same draw as reseeding the
    # module-level RNG with 123, without resetting global random state for
    # every other user of `random` in the process.
    rng = random.Random(123)

    ratio_w, ratio_h = map(int, ratio.split('-'))
    assert 1.0 <= ratio_w/ratio_h < 1.7778, "The ratio does not meet the requirements, it needs to be between 1:1 and 16:9."

    width, height = second_crop_info['width'], second_crop_info['height']
    x, y, crop_w, crop_h = second_crop_info['second_bbox']

    if width == height:
        # Largest box of the target ratio whose width is a multiple of ratio_w.
        target_w = int(width/ratio_w) * ratio_w
        target_h = int(width/ratio_w) * ratio_h
        assert target_h >= crop_h
        target_x = 0
        # Vertical offsets that keep the existing crop inside the new box.
        y_min = max(y - (target_h - crop_h), 0)
        y_max = min(y + target_h, height) - target_h
        assert y_max >= y_min
        target_y = rng.randint(y_min, y_max)
    else:
        target_w = int(height/ratio_h) * ratio_w
        target_h = int(height/ratio_h) * ratio_h
        assert target_w >= crop_w
        target_y = 0
        # Horizontal offsets that keep the existing crop inside the new box.
        x_min = max(x - (target_w - crop_w), 0)
        x_max = min(x + target_w, width) - target_w
        assert x_max >= x_min
        target_x = rng.randint(x_min, x_max)

    return [target_x, target_y, target_w, target_h]
os.makedirs(args.result_path, exist_ok=True) ####### get target crop info ######## for item in tqdm(data): second_crop_info = item["second_crop"] first_crop_info = item["first_crop"] target_crop = transfer_bbox_to_origin_img(first_crop_info, get_other_ratio_crop(second_crop_info, args.target_ratio)) item["target_crop"] = { "target_ratio":args.target_ratio, "target_bbox":target_crop } target_results.append(item) target_file = os.path.join(args.result_path, f"target_crop_info_{args.target_ratio}.json") save_json(target_results, target_file) logger.info(f"Target crop info are saved in the '{target_file}' file") ####### crop images ######### ori_path = args.ori_image_path target_path = f"{args.result_path}/{args.target_ratio}" for sample in tqdm(target_results): img_path = osp.join(ori_path, sample["file_name"]) target_bbox = sample["target_crop"]["target_bbox"] crop(img_path, target_bbox, target_path) logger.info(f"Cropped images are saved in the '{target_path}' path") if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--crop_info_path', type=str, default="vbench2_beta_i2v/data/i2v-bench-info.json", help="image suite meta info") parser.add_argument('--target_ratio', default="5-4", required=True, help="the required crop ratio") parser.add_argument('--ori_image_path', type=str, default="vbench2_beta_i2v/data/origin", help='the file path of the original image data') parser.add_argument('--result_path', type=str, default="vbench2_beta_i2v/data/target_crop", help='result save path') args = parser.parse_args() get_target_crop(args) ================================================ FILE: Open-Sora/build/lib/vbench2_beta_i2v/i2v_background.py ================================================ import io import os import cv2 import json import numpy as np from PIL import Image from tqdm import tqdm import torch import torch.nn as nn import torch.nn.functional as F import torchvision.transforms as transforms from vbench2_beta_i2v.utils import 
def i2v_background(model, video_pair_list, device):
    """Score each generated video against its conditioning image.

    For every ``(image_path, video_path)`` pair the reference image and every
    video frame are embedded with ``model`` and L2-normalized. The video score
    is::

        0.5 * max(sim(frame, reference)) + 0.5 * mean(sim(frame, prev_frame))
            + 0.0 * min(sim(frame, prev_frame))

    where each cosine similarity is clamped below at 0.

    Args:
        model: callable mapping a batched image tensor to feature vectors.
        video_pair_list: iterable of ``(image_path, video_path)`` pairs.
        device: device the inputs are moved to before calling ``model``.

    Returns:
        ``(mean_score, video_results)``: the mean over all video scores and a
        list of dicts with keys ``'image_path'``, ``'video_path'`` and
        ``'video_results'``.

    Raises:
        ValueError: if a video has fewer than two frames, since no
            consecutive-frame similarity can be computed.
    """
    video_results = []
    sim_list = []

    max_weight = 0.5
    mean_weight = 0.5
    min_weight = 0.0

    image_transform = dino_transform_Image(224)
    frames_transform = dino_transform(224)

    for image_path, video_path in tqdm(video_pair_list):
        # input image preprocess & extract feature
        # Pure inference: keep the reference forward pass inside no_grad so no
        # autograd graph is retained for it (the frame loop already did this).
        with torch.no_grad():
            with Image.open(image_path) as pil_image:
                input_image = image_transform(pil_image)
            input_image = input_image.unsqueeze(0)
            input_image = input_image.to(device)
            input_image_features = model(input_image)
            input_image_features = F.normalize(input_image_features, dim=-1, p=2)

        # get frames from video
        images = load_video(video_path)
        images = frames_transform(images)

        # calculate sim between input image and frames in generated video
        conformity_scores = []
        consec_scores = []
        former_image_features = None
        with torch.no_grad():
            for i, frame in enumerate(images):
                image = frame.unsqueeze(0)
                image = image.to(device)
                image_features = model(image)
                image_features = F.normalize(image_features, dim=-1, p=2)
                if i != 0:
                    sim_consec = max(0.0, F.cosine_similarity(former_image_features, image_features).item())
                    consec_scores.append(sim_consec)
                sim_to_input = max(0.0, F.cosine_similarity(input_image_features, image_features).item())
                conformity_scores.append(sim_to_input)
                former_image_features = image_features

        if not consec_scores:
            # Previously this surfaced as an opaque NumPy zero-size reduction
            # error; fail with the same exception type and a useful message.
            raise ValueError(f"video needs at least two frames to be scored: {video_path}")

        video_score = max_weight * np.max(conformity_scores) + \
            mean_weight * np.mean(consec_scores) + \
            min_weight * np.min(consec_scores)
        sim_list.append(video_score)
        video_results.append({'image_path': image_path, 'video_path': video_path, 'video_results': video_score})
    return np.mean(sim_list), video_results
def i2v_subject(model, video_pair_list, device):
    """Score each generated video against its conditioning image.

    For every ``(image_path, video_path)`` pair the reference image and every
    video frame are embedded with ``model`` and L2-normalized. The video score
    is::

        0.5 * max(sim(frame, reference)) + 0.5 * mean(sim(frame, prev_frame))
            + 0.0 * min(sim(frame, prev_frame))

    where each cosine similarity is clamped below at 0.

    Args:
        model: callable mapping a batched image tensor to feature vectors.
        video_pair_list: iterable of ``(image_path, video_path)`` pairs.
        device: device the inputs are moved to before calling ``model``.

    Returns:
        ``(mean_score, video_results)``: the mean over all video scores and a
        list of dicts with keys ``'image_path'``, ``'video_path'`` and
        ``'video_results'``.

    Raises:
        ValueError: if a video has fewer than two frames, since no
            consecutive-frame similarity can be computed.
    """
    video_results = []
    sim_list = []

    max_weight = 0.5
    mean_weight = 0.5
    min_weight = 0.0

    image_transform = dino_transform_Image(224)
    frames_transform = dino_transform(224)

    for image_path, video_path in tqdm(video_pair_list):
        # input image preprocess & extract feature
        # Pure inference: keep the reference forward pass inside no_grad so no
        # autograd graph is retained for it (the frame loop already did this).
        with torch.no_grad():
            with Image.open(image_path) as pil_image:
                input_image = image_transform(pil_image)
            input_image = input_image.unsqueeze(0)
            input_image = input_image.to(device)
            input_image_features = model(input_image)
            input_image_features = F.normalize(input_image_features, dim=-1, p=2)

        # get frames from video
        images = load_video(video_path)
        images = frames_transform(images)

        # calculate sim between input image and frames in generated video
        conformity_scores = []
        consec_scores = []
        former_image_features = None
        with torch.no_grad():
            for i, frame in enumerate(images):
                image = frame.unsqueeze(0)
                image = image.to(device)
                image_features = model(image)
                image_features = F.normalize(image_features, dim=-1, p=2)
                if i != 0:
                    sim_consec = max(0.0, F.cosine_similarity(former_image_features, image_features).item())
                    consec_scores.append(sim_consec)
                sim_to_input = max(0.0, F.cosine_similarity(input_image_features, image_features).item())
                conformity_scores.append(sim_to_input)
                former_image_features = image_features

        if not consec_scores:
            # Previously this surfaced as an opaque NumPy zero-size reduction
            # error; fail with the same exception type and a useful message.
            raise ValueError(f"video needs at least two frames to be scored: {video_path}")

        video_score = max_weight * np.max(conformity_scores) + \
            mean_weight * np.mean(consec_scores) + \
            min_weight * np.min(consec_scores)
        sim_list.append(video_score)
        video_results.append({'image_path': image_path, 'video_path': video_path, 'video_results': video_score})
    return np.mean(sim_list), video_results
def get_frame_indices(num_frames, vlen, sample='rand', fix_start=None, input_fps=1, max_num_frames=-1):
    """Choose which frame indices to read from a video of ``vlen`` frames.

    Args:
        num_frames: number of indices wanted for ``'rand'`` / ``'middle'``.
        vlen: total number of frames available.
        sample: ``'rand'`` (one random frame per interval), ``'middle'`` (the
            centre of each interval) or ``'fps<rate>'`` (e.g. ``'fps0.5'``,
            sequential sampling at that output frame rate).
        fix_start: when given with ``sample='middle'``, take the frame at this
            offset from each interval start instead of the interval centre.
        input_fps: frame rate of the source video (``'fps…'`` mode only).
        max_num_frames: cap on the result length in ``'fps…'`` mode;
            non-positive disables the cap.

    Returns:
        A list of frame indices. In ``'rand'`` / ``'middle'`` mode a result
        shorter than ``num_frames`` is padded by repeating the last index.

    Raises:
        ValueError: if ``sample`` is not a recognized mode.
    """
    # Imported locally because this module never imported ``random``; the old
    # bare ``except:`` swallowed the resulting NameError, so 'rand' sampling
    # silently always took the permutation fallback.
    import random

    if sample in ["rand", "middle"]:  # uniform sampling
        acc_samples = min(num_frames, vlen)
        # split the video into `acc_samples` intervals, and sample from each interval.
        intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
        ranges = [(start, end - 1) for start, end in zip(intervals[:-1], intervals[1:])]
        if sample == 'rand':
            try:
                frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
            except IndexError:
                # Some interval is too narrow to draw from (empty range):
                # fall back to a sorted random subset of all frames.
                frame_indices = np.random.permutation(vlen)[:acc_samples]
                frame_indices.sort()
                frame_indices = list(frame_indices)
        elif fix_start is not None:
            frame_indices = [x[0] + fix_start for x in ranges]
        elif sample == 'middle':
            frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
        else:
            raise NotImplementedError

        if len(frame_indices) < num_frames:  # padded with last frame
            padded_frame_indices = [frame_indices[-1]] * num_frames
            padded_frame_indices[:len(frame_indices)] = frame_indices
            frame_indices = padded_frame_indices
    elif "fps" in sample:  # fps0.5, sequentially sample frames at 0.5 fps
        output_fps = float(sample[3:])
        duration = float(vlen) / input_fps
        delta = 1 / output_fps  # gap between frames, this is also the clip length each frame represents
        frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
        frame_indices = np.around(frame_seconds * input_fps).astype(int)
        frame_indices = [e for e in frame_indices if e < vlen]
        if max_num_frames > 0 and len(frame_indices) > max_num_frames:
            frame_indices = frame_indices[:max_num_frames]
    else:
        raise ValueError
    return frame_indices


def load_video(video_path, data_transform=None, num_frames=None, return_tensor=True, width=None, height=None):
    """Load the frames of a GIF, PNG or MP4 file.

    The format is chosen from the (case-sensitive) file extension. GIF frames
    and the single PNG frame are converted to RGB; MP4 frames are decoded with
    decord's ``VideoReader``.

    Parameters:
    - video_path (str): path of the ``.gif``, ``.png`` or ``.mp4`` file.
    - data_transform (callable, optional): applied to the ``(T, H, W, C)``
      uint8 array; its result is returned unchanged.
    - num_frames (int, optional): when truthy, keep only this many frames,
      chosen by ``get_frame_indices(..., sample="middle")``.
    - return_tensor (bool): only consulted when ``data_transform`` is not
      given; selects tensor versus NumPy output (see Returns).
    - width, height (int, optional): decode size passed to ``VideoReader``;
      only used for MP4 and only when ``width`` is truthy.

    Returns:
    - ``data_transform(frames)`` when a transform is given; otherwise a
      ``torch.Tensor`` of shape (T, C, H, W) when ``return_tensor`` is true,
      else the uint8 NumPy array of shape (T, H, W, C).

    Raises:
    - NotImplementedError: if the extension is not one of the three above.
    """
    if video_path.endswith('.gif'):
        frame_ls = []
        # Context managers close the underlying file once the pixels are copied.
        with Image.open(video_path) as img:
            for frame in ImageSequence.Iterator(img):
                frame = frame.convert('RGB')
                frame = np.array(frame).astype(np.uint8)
                frame_ls.append(frame)
        buffer = np.array(frame_ls).astype(np.uint8)
    elif video_path.endswith('.png'):
        with Image.open(video_path) as frame:
            frame = np.array(frame.convert('RGB')).astype(np.uint8)
        frame_ls = [frame]
        buffer = np.array(frame_ls)
    elif video_path.endswith('.mp4'):
        import decord
        decord.bridge.set_bridge('native')
        if width:
            video_reader = VideoReader(video_path, width=width, height=height, num_threads=1)
        else:
            video_reader = VideoReader(video_path, num_threads=1)
        frames = video_reader.get_batch(range(len(video_reader)))  # (T, H, W, C)
        buffer = frames.asnumpy().astype(np.uint8)
    else:
        raise NotImplementedError

    frames = buffer
    if num_frames:
        frame_indices = get_frame_indices(
            num_frames, len(frames), sample="middle"
        )
        frames = frames[frame_indices]

    if data_transform:
        frames = data_transform(frames)
    elif return_tensor:
        # torch.Tensor(...) yields the default floating dtype, not uint8.
        frames = torch.Tensor(frames)
        frames = frames.permute(0, 3, 1, 2)  # (T, C, H, W)

    return frames
def load_dimension_info(json_dir, dimension, lang):
    """Collect the videos and prompts that belong to one evaluation dimension.

    Parameters:
    - json_dir (str): path handed to ``load_json``.
    - dimension (str): an entry is kept when this value is contained in its
      ``'dimension'`` field and it has a ``'video_list'`` key.
    - lang (str): suffix of the prompt key, i.e. ``prompt_<lang>``.

    Returns:
    - video_list (list): all videos of the kept entries, flattened.
    - prompt_dict_ls (list): one dict per kept entry with ``'prompt'`` and
      ``'video_list'``, plus ``'auxiliary_info'`` when the entry provides
      auxiliary info for ``dimension``.

    Notes:
    - ``'video_list'`` may be a list or a single value; a single value is
      wrapped into a one-element list.
    """
    video_list = []
    prompt_dict_ls = []
    for prompt_dict in load_json(json_dir):
        if not (dimension in prompt_dict['dimension'] and 'video_list' in prompt_dict):
            continue
        prompt = prompt_dict[f'prompt_{lang}']
        raw_videos = prompt_dict['video_list']
        cur_video_list = raw_videos if isinstance(raw_videos, list) else [raw_videos]
        video_list.extend(cur_video_list)

        entry = {'prompt': prompt, 'video_list': cur_video_list}
        if 'auxiliary_info' in prompt_dict and dimension in prompt_dict['auxiliary_info']:
            entry['auxiliary_info'] = prompt_dict['auxiliary_info'][dimension]
        prompt_dict_ls.append(entry)
    return video_list, prompt_dict_ls


def load_i2v_dimension_info(json_dir, dimension, lang, resolution):
    """Collect (image, video) pairs and prompts for one evaluation dimension.

    Parameters:
    - json_dir (str): path handed to ``load_json``.
    - dimension (str): an entry is kept when this value is contained in its
      ``'dimension'`` field and it has a ``'video_list'`` key.
    - lang (str): suffix of the prompt key, i.e. ``prompt_<lang>``.
    - resolution (str): used to build the crop directory name; see the note in
      the body, the value currently has no effect on the result.

    Returns:
    - video_pair_list (list): ``(image_path, video)`` tuples, one per video of
      every kept entry; ``image_path`` joins the image root with the entry's
      ``'image_name'``.
    - prompt_dict_ls (list): one dict per kept entry with ``'prompt'`` and
      ``'video_list'``, plus ``'auxiliary_info'`` when the entry provides
      auxiliary info for ``dimension``.

    Notes:
    - ``'video_list'`` may be a list or a single value; a single value is
      wrapped into a one-element list.
    """
    video_pair_list = []
    prompt_dict_ls = []
    full_prompt_list = load_json(json_dir)

    image_root = f'vbench2_beta_i2v/data/crop/{resolution}'
    # NOTE(review): the resolution-based root above is overwritten right away
    # by a machine-specific absolute path, so ``resolution`` is effectively
    # ignored. Confirm which root is intended before running elsewhere.
    image_root = '/root/autodl-tmp/video_samples/samples_sora-original_model.safetensors_vbench'

    for prompt_dict in full_prompt_list:
        if not (dimension in prompt_dict['dimension'] and 'video_list' in prompt_dict):
            continue
        prompt = prompt_dict[f'prompt_{lang}']
        raw_videos = prompt_dict['video_list']
        cur_video_list = raw_videos if isinstance(raw_videos, list) else [raw_videos]

        # create image-video pair
        image_path = os.path.join(image_root, prompt_dict["image_name"])
        video_pair_list.extend((image_path, video) for video in cur_video_list)

        entry = {'prompt': prompt, 'video_list': cur_video_list}
        if 'auxiliary_info' in prompt_dict and dimension in prompt_dict['auxiliary_info']:
            entry['auxiliary_info'] = prompt_dict['auxiliary_info'][dimension]
        prompt_dict_ls.append(entry)
    return video_pair_list, prompt_dict_ls
def save_json(data, path, indent=4):
    """Write ``data`` to ``path`` as UTF-8 encoded JSON.

    Parameters:
    - data: JSON-serializable object to store.
    - path (str): destination file; overwritten if it exists.
    - indent: indentation forwarded to ``json.dump`` (default 4).
    """
    with open(path, mode='w', encoding='utf-8') as handle:
        json.dump(data, fp=handle, indent=indent)


def load_json(path):
    """Read and parse the UTF-8 encoded JSON file at ``path``.

    Parameters:
    - path (str): the path to the JSON file.

    Returns:
    - data (dict or list): the parsed content of the file.
    """
    with open(path, mode='r', encoding='utf-8') as handle:
        content = json.load(handle)
    return content
# Training configuration: DiT-XL/2 on 16-frame 256x256 clips.

# Define dataset
dataset = dict(
    type="VideoTextDataset",
    data_path=None,  # NOTE(review): presumably supplied at launch time — confirm
    num_frames=16,
    frame_interval=3,
    image_size=(256, 256),
)

# Define acceleration
num_workers = 4
dtype = "bf16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1

# Define model
model = dict(
    type="DiT-XL/2",
    from_pretrained="DiT-XL-2-256x256.pt",
    enable_flash_attn=True,
    enable_layernorm_kernel=True,
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="clip",
    from_pretrained="openai/clip-vit-base-patch32",
    model_max_length=77,
)
scheduler = dict(
    type="iddpm",
    timestep_respacing="",
)

# Others
seed = 42
outputs = "outputs"
wandb = False

epochs = 1000
log_every = 10
ckpt_every = 1000
load = None

batch_size = 8
lr = 2e-5
grad_clip = 1.0
# Inference configuration: class-conditional Latte-XL/2, 16 frames at 256x256.
num_frames = 16
fps = 8
image_size = (256, 256)

# Define model
model = dict(
    type="Latte-XL/2",
    condition="label_101",
    from_pretrained="Latte-XL-2-256x256-ucf101.pt",
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
# Class-label conditioning; num_classes matches the "label_101" condition above.
text_encoder = dict(
    type="classes",
    num_classes=101,
)
scheduler = dict(
    type="dpm-solver",
    num_sampling_steps=20,
    cfg_scale=4.0,
)
dtype = "bf16"

# Others
batch_size = 2
seed = 42
prompt_path = "./assets/texts/ucf101_id.txt"
save_dir = "./samples/samples/"
# Inference configuration: STDiT-XL/2, 16 frames at 256x256, IDDPM scheduler.
num_frames = 16
fps = 24 // 3  # evaluates to 8
image_size = (256, 256)

# Define model
model = dict(
    type="STDiT-XL/2",
    space_scale=0.5,
    time_scale=1.0,
    enable_flash_attn=True,
    enable_layernorm_kernel=True,
    from_pretrained="PRETRAINED_MODEL",  # placeholder — replace with a real checkpoint
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
    micro_batch_size=4,
)
text_encoder = dict(
    type="t5",
    # NOTE(review): machine-specific absolute path; adjust for your environment.
    from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    model_max_length=120,
)
scheduler = dict(
    type="iddpm",
    num_sampling_steps=100,
    cfg_scale=7.0,
    cfg_channel=3,  # or None
)
dtype = "bf16"

# Condition
prompt_path = "./assets/texts/t2v_samples.txt"
prompt = None  # prompt has higher priority than prompt_path

# Others
batch_size = 1
seed = 42
save_dir = "./samples/samples/"
# Inference configuration: STDiT-XL/2, 16 frames at 512x512, IDDPM scheduler.
num_frames = 16
fps = 24 // 3  # evaluates to 8
image_size = (512, 512)

# Define model
model = dict(
    type="STDiT-XL/2",
    space_scale=1.0,
    time_scale=1.0,
    enable_flash_attn=True,
    enable_layernorm_kernel=True,
    from_pretrained="PRETRAINED_MODEL",  # placeholder — replace with a real checkpoint
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
    micro_batch_size=2,
)
text_encoder = dict(
    type="t5",
    # NOTE(review): machine-specific absolute path; adjust for your environment.
    from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    model_max_length=120,
)
scheduler = dict(
    type="iddpm",
    num_sampling_steps=100,
    cfg_scale=7.0,
)
dtype = "bf16"

# Others
batch_size = 2
seed = 42
prompt_path = "./assets/texts/t2v_samples.txt"
save_dir = "./samples/samples/"
# Training configuration: STDiT-XL/2 on 16-frame 256x256 clips with the
# "rflow" scheduler.

# Define dataset
dataset = dict(
    type="VideoTextDataset",
    data_path=None,  # NOTE(review): presumably supplied at launch time — confirm
    num_frames=16,
    frame_interval=3,
    image_size=(256, 256),
)

# Define acceleration
num_workers = 4
dtype = "bf16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1

# Define model
model = dict(
    type="STDiT-XL/2",
    space_scale=0.5,
    time_scale=1.0,
    # Alternative checkpoints that were tried, kept for reference:
    # from_pretrained="PixArt-XL-2-512x512.pth",
    # from_pretrained = "/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/OpenSora-v1-HQ-16x512x512.pth",
    # from_pretrained = "OpenSora-v1-HQ-16x512x512.pth",
    from_pretrained="PRETRAINED_MODEL",  # placeholder — replace with a real checkpoint
    enable_flash_attn=True,
    enable_layernorm_kernel=True,
)
# Masking is disabled in this config; earlier settings kept for reference:
# mask_ratios = [0.5, 0.29, 0.07, 0.07, 0.07]
# mask_ratios = {
#     "identity": 0.9,
#     "random": 0.06,
#     "mask_head": 0.01,
#     "mask_tail": 0.01,
#     "mask_head_tail": 0.02,
# }
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="t5",
    # NOTE(review): machine-specific absolute path; adjust for your environment.
    from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    model_max_length=120,
    shardformer=True,
)
scheduler = dict(
    type="rflow",
    # timestep_respacing="",
)

# Others
seed = 42
outputs = "outputs"
wandb = True

epochs = 1
log_every = 10
ckpt_every = 1000
load = None

batch_size = 16
lr = 2e-5
grad_clip = 1.0
enable_flash_attn=True, enable_layernorm_kernel=True, ) vae = dict( type="VideoAutoencoderKL", from_pretrained="stabilityai/sd-vae-ft-ema", ) text_encoder = dict( type="t5", from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl", model_max_length=120, shardformer=True, ) scheduler = dict( type="iddpm", timestep_respacing="", ) # Others seed = 42 outputs = "outputs" wandb = False epochs = 1000 log_every = 10 ckpt_every = 1000 load = None batch_size = 8 lr = 2e-5 grad_clip = 1.0 ================================================ FILE: Open-Sora/configs/opensora/train/16x512x512.py ================================================ # Define dataset dataset = dict( type="VideoTextDataset", data_path=None, num_frames=16, frame_interval=3, image_size=(512, 512), ) # Define acceleration num_workers = 4 dtype = "bf16" grad_checkpoint = True plugin = "zero2" sp_size = 1 # Define model model = dict( type="STDiT-XL/2", space_scale=1.0, time_scale=1.0, from_pretrained=None, enable_flash_attn=True, enable_layernorm_kernel=True, ) vae = dict( type="VideoAutoencoderKL", from_pretrained="stabilityai/sd-vae-ft-ema", micro_batch_size=128, ) text_encoder = dict( type="t5", from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl", model_max_length=120, shardformer=True, ) scheduler = dict( type="iddpm", timestep_respacing="", ) # Others seed = 42 outputs = "outputs" wandb = False epochs = 1000 log_every = 10 ckpt_every = 500 load = None batch_size = 8 lr = 2e-5 grad_clip = 1.0 ================================================ FILE: Open-Sora/configs/opensora/train/360x512x512.py ================================================ # Define dataset dataset = dict( type="VideoTextDataset", data_path=None, num_frames=360, frame_interval=3, image_size=(512, 512), ) # Define acceleration num_workers = 4 dtype = "bf16" grad_checkpoint = True plugin = "zero2" sp_size = 1 # Define acceleration dtype = "bf16" grad_checkpoint = True plugin = "zero2-seq" sp_size = 2 
# Define model model = dict( type="STDiT-XL/2", space_scale=1.0, time_scale=2 / 3, from_pretrained=None, enable_flash_attn=True, enable_layernorm_kernel=True, enable_sequence_parallelism=True, # enable sq here ) vae = dict( type="VideoAutoencoderKL", from_pretrained="stabilityai/sd-vae-ft-ema", micro_batch_size=128, ) text_encoder = dict( type="t5", from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl", model_max_length=120, shardformer=True, ) scheduler = dict( type="iddpm", timestep_respacing="", ) # Others seed = 42 outputs = "outputs" wandb = False epochs = 1000 log_every = 10 ckpt_every = 250 load = None batch_size = 1 lr = 2e-5 grad_clip = 1.0 ================================================ FILE: Open-Sora/configs/opensora/train/64x512x512-sp.py ================================================ # Define dataset dataset = dict( type="VideoTextDataset", data_path=None, num_frames=16, frame_interval=3, image_size=(512, 512), ) # Define acceleration num_workers = 4 dtype = "bf16" grad_checkpoint = True plugin = "zero2" sp_size = 2 # Define model model = dict( type="STDiT-XL/2", space_scale=1.0, time_scale=2 / 3, from_pretrained=None, enable_flash_attn=True, enable_layernorm_kernel=True, enable_sequence_parallelism=True, # enable sq here ) vae = dict( type="VideoAutoencoderKL", from_pretrained="stabilityai/sd-vae-ft-ema", ) text_encoder = dict( type="t5", from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl", model_max_length=120, shardformer=True, ) scheduler = dict( type="iddpm", timestep_respacing="", ) # Others seed = 42 outputs = "outputs" wandb = False epochs = 1000 log_every = 10 ckpt_every = 1000 load = None batch_size = 1 lr = 2e-5 grad_clip = 1.0 ================================================ FILE: Open-Sora/configs/opensora/train/64x512x512.py ================================================ # Define dataset dataset = dict( type="VideoTextDataset", data_path=None, num_frames=64, frame_interval=3, 
image_size=(512, 512), ) # Define acceleration num_workers = 4 dtype = "bf16" grad_checkpoint = True plugin = "zero2" sp_size = 1 # Define model model = dict( type="STDiT-XL/2", space_scale=1.0, time_scale=2 / 3, from_pretrained=None, enable_flash_attn=True, enable_layernorm_kernel=True, ) vae = dict( type="VideoAutoencoderKL", from_pretrained="stabilityai/sd-vae-ft-ema", micro_batch_size=64, ) text_encoder = dict( type="t5", from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl", model_max_length=120, shardformer=True, ) scheduler = dict( type="iddpm", timestep_respacing="", ) # Others seed = 42 outputs = "outputs" wandb = False epochs = 1000 log_every = 10 ckpt_every = 250 load = None batch_size = 4 lr = 2e-5 grad_clip = 1.0 ================================================ FILE: Open-Sora/configs/opensora-v1-1/inference/sample-ref.py ================================================ num_frames = 16 frame_interval = 3 fps = 24 image_size = (240, 426) multi_resolution = "STDiT2" # Condition prompt_path = None prompt = [ 'Drone view of waves crashing against the rugged cliffs along Big Sur\'s garay point beach. 
{"reference_path": "assets/images/condition/cliff.png", "mask_strategy": "0"}', 'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/sunset1.png","mask_strategy": "0"}', 'A car driving on the ocean.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4","mask_strategy": "0,0,-8,0,8"}', 'A snowy forest.{"reference_path": "https://cdn.pixabay.com/video/2021/04/25/72171-542991404_large.mp4","mask_strategy": "0,0,0,0,15,0.8"}', 'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/sunset1.png;assets/images/condition/sunset2.png","mask_strategy": "0;0,1,0,-1,1"}', '|0|a white jeep equipped with a roof rack driving on a dirt road in a coniferous forest.|2|a white jeep equipped with a roof rack driving on a dirt road in the desert.|4|a white jeep equipped with a roof rack driving on a dirt road in a mountain.|6|A white jeep equipped with a roof rack driving on a dirt road in a city.|8|a white jeep equipped with a roof rack driving on a dirt road on the surface of a river.|10|a white jeep equipped with a roof rack driving on a dirt road under the lake.|12|a white jeep equipped with a roof rack flying into the sky.|14|a white jeep equipped with a roof rack driving in the universe. 
Earth is the background.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4", "mask_strategy": "0,0,0,0,15"}', ] loop = 2 condition_frame_length = 4 # ( # loop id, [the loop index of the condition image or video] # reference id, [the index of the condition image or video in the reference_path] # reference start, [the start frame of the condition image or video] # target start, [the location to insert] # length, [the number of frames to insert] # edit_ratio [the edit rate of the condition image or video] # ) # See https://github.com/hpcaitech/Open-Sora/blob/main/docs/config.md#advanced-inference-config for more details # See https://github.com/hpcaitech/Open-Sora/blob/main/docs/commands.md#inference-with-open-sora-11 for more examples # Define model model = dict( type="STDiT2-XL/2", from_pretrained="hpcai-tech/OpenSora-STDiT-v2-stage3", input_sq_size=512, qk_norm=True, qk_norm_legacy=True, enable_flash_attn=True, enable_layernorm_kernel=True, ) vae = dict( type="VideoAutoencoderKL", from_pretrained="stabilityai/sd-vae-ft-ema", cache_dir=None, # "/mnt/hdd/cached_models", micro_batch_size=4, ) text_encoder = dict( type="t5", from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl", cache_dir=None, # "/mnt/hdd/cached_models", model_max_length=200, ) scheduler = dict( type="iddpm", num_sampling_steps=100, cfg_scale=7.0, cfg_channel=3, # or None ) dtype = "bf16" # Others batch_size = 1 seed = 42 save_dir = "./samples/samples/" ================================================ FILE: Open-Sora/configs/opensora-v1-1/inference/sample.py ================================================ num_frames = 16 frame_interval = 3 fps = 24 image_size = (240, 426) multi_resolution = "STDiT2" # Define model model = dict( type="STDiT2-XL/2", from_pretrained="hpcai-tech/OpenSora-STDiT-v2-stage3", input_sq_size=512, qk_norm=True, qk_norm_legacy=True, enable_flash_attn=True, enable_layernorm_kernel=True, ) vae = dict( type="VideoAutoencoderKL", 
from_pretrained="stabilityai/sd-vae-ft-ema", cache_dir=None, # "/mnt/hdd/cached_models", micro_batch_size=4, ) text_encoder = dict( type="t5", from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl", cache_dir=None, # "/mnt/hdd/cached_models", model_max_length=200, ) scheduler = dict( type="iddpm", num_sampling_steps=100, cfg_scale=7.0, cfg_channel=3, # or None ) dtype = "bf16" # Condition prompt_path = "./assets/texts/t2v_samples.txt" prompt = None # prompt has higher priority than prompt_path # Others batch_size = 1 seed = 42 save_dir = "./samples/samples/" ================================================ FILE: Open-Sora/configs/opensora-v1-1/train/benchmark.py ================================================ # this file is only for batch size search and is not used for training # Define dataset dataset = dict( type="VariableVideoTextDataset", data_path=None, num_frames=None, frame_interval=3, image_size=(None, None), transform_name="resize_crop", ) # bucket config format: # 1. { resolution: {num_frames: (prob, batch_size)} }, in this case batch_size is ignored when searching # 2. { resolution: {num_frames: (prob, (max_batch_size, ))} }, batch_size is searched in the range [batch_size_start, max_batch_size), batch_size_start is configured via CLI # 3. { resolution: {num_frames: (prob, (min_batch_size, max_batch_size))} }, batch_size is searched in the range [min_batch_size, max_batch_size) # 4. { resolution: {num_frames: (prob, (min_batch_size, max_batch_size, step_size))} }, batch_size is searched in the range [min_batch_size, max_batch_size) with step_size (grid search) # 5. 
{ resolution: {num_frames: (0.0, None)} }, this bucket will not be used bucket_config = { # == manual search == # "240p": {128: (1.0, 2)}, # 4.28s/it # "240p": {64: (1.0, 4)}, # "240p": {32: (1.0, 8)}, # 4.6s/it # "240p": {16: (1.0, 16)}, # 4.6s/it # "480p": {16: (1.0, 4)}, # 4.6s/it # "720p": {16: (1.0, 2)}, # 5.89s/it # "256": {1: (1.0, 256)}, # 4.5s/it # "512": {1: (1.0, 96)}, # 4.7s/it # "512": {1: (1.0, 128)}, # 6.3s/it # "480p": {1: (1.0, 50)}, # 4.0s/it # "1024": {1: (1.0, 32)}, # 6.8s/it # "1024": {1: (1.0, 20)}, # 4.3s/it # "1080p": {1: (1.0, 16)}, # 8.6s/it # "1080p": {1: (1.0, 8)}, # 4.4s/it # == stage 2 == # "240p": { # 16: (1.0, (2, 32)), # 32: (1.0, (2, 16)), # 64: (1.0, (2, 8)), # 128: (1.0, (2, 6)), # }, # "256": {1: (1.0, (128, 300))}, # "512": {1: (0.5, (64, 128))}, # "480p": {1: (0.4, (32, 128)), 16: (0.4, (2, 32)), 32: (0.0, None)}, # "720p": {16: (0.1, (2, 16)), 32: (0.0, None)}, # No examples now # "1024": {1: (0.3, (8, 64))}, # "1080p": {1: (0.3, (2, 32))}, # == stage 3 == "720p": {1: (20, 40), 32: (0.5, (2, 4)), 64: (0.5, (1, 1))}, } # Define acceleration num_workers = 4 num_bucket_build_workers = 16 dtype = "bf16" grad_checkpoint = True plugin = "zero2" sp_size = 1 # Define model model = dict( type="STDiT2-XL/2", from_pretrained=None, input_sq_size=512, # pretrained model is trained on 512x512 qk_norm=True, qk_norm_legacy=True, enable_flash_attn=True, enable_layernorm_kernel=True, ) vae = dict( type="VideoAutoencoderKL", from_pretrained="stabilityai/sd-vae-ft-ema", micro_batch_size=4, local_files_only=True, ) text_encoder = dict( type="t5", from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl", model_max_length=200, shardformer=True, local_files_only=True, ) scheduler = dict( type="iddpm", timestep_respacing="", ) # Others seed = 42 outputs = "outputs" wandb = False epochs = 1000 log_every = 10 ckpt_every = 1000 load = None batch_size = None lr = 2e-5 grad_clip = 1.0 ================================================ 
FILE: Open-Sora/configs/opensora-v1-1/train/image.py ================================================ # Define dataset dataset = dict( type="VariableVideoTextDataset", data_path=None, num_frames=None, frame_interval=3, image_size=(None, None), transform_name="resize_crop", ) bucket_config = { # 6s/it "256": {1: (1.0, 256)}, "512": {1: (1.0, 80)}, "480p": {1: (1.0, 52)}, "1024": {1: (1.0, 20)}, "1080p": {1: (1.0, 8)}, } # Define acceleration num_workers = 4 num_bucket_build_workers = 16 dtype = "bf16" grad_checkpoint = True plugin = "zero2" sp_size = 1 # Define model model = dict( type="STDiT2-XL/2", from_pretrained=None, input_sq_size=512, # pretrained model is trained on 512x512 qk_norm=True, qk_norm_legacy=True, enable_flash_attn=True, enable_layernorm_kernel=True, ) vae = dict( type="VideoAutoencoderKL", from_pretrained="stabilityai/sd-vae-ft-ema", micro_batch_size=4, local_files_only=True, ) text_encoder = dict( type="t5", from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl", model_max_length=200, shardformer=True, local_files_only=True, ) scheduler = dict( type="iddpm", timestep_respacing="", ) # Others seed = 42 outputs = "outputs" wandb = False epochs = 1000 log_every = 10 ckpt_every = 500 load = None batch_size = 10 # only for logging lr = 2e-5 grad_clip = 1.0 ================================================ FILE: Open-Sora/configs/opensora-v1-1/train/image_rflow.py ================================================ # Define dataset # dataset = dict( # type="VariableVideoTextDataset", # data_path=None, # num_frames=None, # frame_interval=3, # image_size=(None, None), # transform_name="resize_crop", # ) dataset = dict( type="VideoTextDataset", data_path=None, num_frames=1, frame_interval=1, image_size=(256, 256), transform_name="center", ) bucket_config = { # 6s/it "256": {1: (1.0, 256)}, "512": {1: (1.0, 80)}, "480p": {1: (1.0, 52)}, "1024": {1: (1.0, 20)}, "1080p": {1: (1.0, 8)}, } # Define acceleration num_workers = 16 dtype = "bf16" 
grad_checkpoint = True plugin = "zero2" sp_size = 1 # Define model # model = dict( # type="DiT-XL/2", # from_pretrained="/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/PixArt-XL-2-512x512.pth", # # input_sq_size=512, # pretrained model is trained on 512x512 # enable_flash_attn=True, # enable_layernorm_kernel=True, # ) model = dict( type="PixArt-XL/2", space_scale=1.0, time_scale=1.0, no_temporal_pos_emb=True, from_pretrained="PixArt-XL-2-512x512.pth", enable_flash_attn=True, enable_layernorm_kernel=True, ) # model = dict( # type="DiT-XL/2", # # space_scale=1.0, # # time_scale=1.0, # no_temporal_pos_emb=True, # # from_pretrained="PixArt-XL-2-512x512.pth", # from_pretrained="/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/PixArt-XL-2-512x512.pth", # enable_flash_attn=True, # enable_layernorm_kernel=True, # ) vae = dict( type="VideoAutoencoderKL", from_pretrained="stabilityai/sd-vae-ft-ema", micro_batch_size=4, ) text_encoder = dict( type="t5", from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl", model_max_length=200, shardformer=True, ) scheduler = dict( type="rflow", # timestep_respacing="", ) # Others seed = 42 outputs = "outputs" wandb = False epochs = 10 log_every = 10 ckpt_every = 500 load = None batch_size = 100 # only for logging lr = 2e-5 grad_clip = 1.0 ================================================ FILE: Open-Sora/configs/opensora-v1-1/train/stage1.py ================================================ # Define dataset dataset = dict( type="VariableVideoTextDataset", data_path=None, num_frames=None, frame_interval=3, image_size=(None, None), transform_name="resize_crop", ) # IMG: 1024 (20%) 512 (30%) 256 (50%) drop (50%) bucket_config = { # 1s/it "144p": {1: (0.5, 48), 16: (1.0, 6), 32: (1.0, 3), 96: (1.0, 1)}, "256": {1: (0.5, 24), 16: (0.5, 3), 48: (0.5, 1), 64: (0.0, None)}, "240p": {16: (0.3, 2), 32: (0.3, 1), 64: (0.0, None)}, "512": {1: (0.4, 12)}, "1024": {1: (0.3, 3)}, } mask_ratios = { "identity": 0.75, 
"quarter_random": 0.025, "quarter_head": 0.025, "quarter_tail": 0.025, "quarter_head_tail": 0.05, "image_random": 0.025, "image_head": 0.025, "image_tail": 0.025, "image_head_tail": 0.05, } # Define acceleration num_workers = 8 num_bucket_build_workers = 16 dtype = "bf16" grad_checkpoint = False plugin = "zero2" sp_size = 1 # Define model model = dict( type="STDiT2-XL/2", from_pretrained=None, input_sq_size=512, # pretrained model is trained on 512x512 qk_norm=True, qk_norm_legacy=True, enable_flash_attn=True, enable_layernorm_kernel=True, ) vae = dict( type="VideoAutoencoderKL", from_pretrained="stabilityai/sd-vae-ft-ema", micro_batch_size=4, local_files_only=True, ) text_encoder = dict( type="t5", from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl", model_max_length=200, shardformer=True, local_files_only=True, ) scheduler = dict( type="iddpm", timestep_respacing="", ) # Others seed = 42 outputs = "outputs" wandb = False epochs = 1000 log_every = 10 ckpt_every = 500 load = None batch_size = None lr = 2e-5 grad_clip = 1.0 ================================================ FILE: Open-Sora/configs/opensora-v1-1/train/stage2.py ================================================ # Define dataset dataset = dict( type="VariableVideoTextDataset", data_path=None, num_frames=None, frame_interval=3, image_size=(None, None), transform_name="resize_crop", ) bucket_config = { # 7s/it "144p": {1: (1.0, 48), 16: (1.0, 17), 32: (1.0, 9), 64: (1.0, 4), 128: (1.0, 1)}, "256": {1: (0.8, 254), 16: (0.5, 17), 32: (0.5, 9), 64: (0.5, 4), 128: (0.5, 1)}, "240p": {1: (0.1, 20), 16: (0.9, 17), 32: (0.8, 9), 64: (0.8, 4), 128: (0.8, 2)}, "512": {1: (0.5, 86), 16: (0.2, 4), 32: (0.2, 2), 64: (0.2, 1), 128: (0.0, None)}, "480p": {1: (0.4, 54), 16: (0.4, 4), 32: (0.0, None)}, "720p": {1: (0.1, 20), 16: (0.1, 2), 32: (0.0, None)}, "1024": {1: (0.3, 20)}, "1080p": {1: (0.4, 8)}, } mask_ratios = { "identity": 0.75, "quarter_random": 0.025, "quarter_head": 0.025, 
"quarter_tail": 0.025, "quarter_head_tail": 0.05, "image_random": 0.025, "image_head": 0.025, "image_tail": 0.025, "image_head_tail": 0.05, } # Define acceleration num_workers = 8 num_bucket_build_workers = 16 dtype = "bf16" grad_checkpoint = True plugin = "zero2" sp_size = 1 # Define model model = dict( type="STDiT2-XL/2", from_pretrained=None, input_sq_size=512, # pretrained model is trained on 512x512 qk_norm=True, qk_norm_legacy=True, enable_flash_attn=True, enable_layernorm_kernel=True, ) vae = dict( type="VideoAutoencoderKL", from_pretrained="stabilityai/sd-vae-ft-ema", micro_batch_size=4, local_files_only=True, ) text_encoder = dict( type="t5", from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl", model_max_length=200, shardformer=True, local_files_only=True, ) scheduler = dict( type="iddpm", timestep_respacing="", ) # Others seed = 42 outputs = "outputs" wandb = False epochs = 1000 log_every = 10 ckpt_every = 500 load = None batch_size = None lr = 2e-5 grad_clip = 1.0 ================================================ FILE: Open-Sora/configs/opensora-v1-1/train/stage3.py ================================================ # Define dataset dataset = dict( type="VariableVideoTextDataset", data_path=None, num_frames=None, frame_interval=3, image_size=(None, None), transform_name="resize_crop", ) bucket_config = { # 13s/it "144p": {1: (1.0, 200), 16: (1.0, 36), 32: (1.0, 18), 64: (1.0, 9), 128: (1.0, 4)}, "256": {1: (0.8, 200), 16: (0.5, 22), 32: (0.5, 11), 64: (0.5, 6), 128: (0.8, 4)}, "240p": {1: (0.8, 200), 16: (0.5, 22), 32: (0.5, 10), 64: (0.5, 6), 128: (0.5, 3)}, "360p": {1: (0.5, 120), 16: (0.5, 9), 32: (0.5, 4), 64: (0.5, 2), 128: (0.5, 1)}, "512": {1: (0.5, 120), 16: (0.5, 9), 32: (0.5, 4), 64: (0.5, 2), 128: (0.8, 1)}, "480p": {1: (0.4, 80), 16: (0.6, 6), 32: (0.6, 3), 64: (0.6, 1), 128: (0.0, None)}, "720p": {1: (0.4, 40), 16: (0.6, 3), 32: (0.6, 1), 96: (0.0, None)}, "1024": {1: (0.3, 40)}, } mask_ratios = { "identity": 0.75, 
"quarter_random": 0.025, "quarter_head": 0.025, "quarter_tail": 0.025, "quarter_head_tail": 0.05, "image_random": 0.025, "image_head": 0.025, "image_tail": 0.025, "image_head_tail": 0.05, } # Define acceleration num_workers = 8 num_bucket_build_workers = 16 dtype = "bf16" grad_checkpoint = True plugin = "zero2" sp_size = 1 # Define model model = dict( type="STDiT2-XL/2", from_pretrained=None, input_sq_size=512, # pretrained model is trained on 512x512 qk_norm=True, qk_norm_legacy=True, enable_flash_attn=True, enable_layernorm_kernel=True, ) vae = dict( type="VideoAutoencoderKL", from_pretrained="stabilityai/sd-vae-ft-ema", micro_batch_size=4, local_files_only=True, ) text_encoder = dict( type="t5", from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl", model_max_length=200, shardformer=True, local_files_only=True, ) scheduler = dict( type="iddpm", timestep_respacing="", ) # Others seed = 42 outputs = "outputs" wandb = False epochs = 1000 log_every = 10 ckpt_every = 500 load = None batch_size = None lr = 2e-5 grad_clip = 1.0 ================================================ FILE: Open-Sora/configs/opensora-v1-1/train/video.py ================================================ # Define dataset dataset = dict( type="VariableVideoTextDataset", data_path=None, num_frames=None, frame_interval=3, image_size=(None, None), transform_name="resize_crop", ) bucket_config = { # 6s/it "240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)}, "256": {1: (1.0, 256)}, "512": {1: (0.5, 80)}, "480p": {1: (0.4, 52), 16: (0.4, 4), 32: (0.0, None)}, "720p": {16: (0.1, 2), 32: (0.0, None)}, # No examples now "1024": {1: (0.3, 20)}, "1080p": {1: (0.3, 8)}, } # Define acceleration num_workers = 4 num_bucket_build_workers = 16 dtype = "bf16" grad_checkpoint = True plugin = "zero2" sp_size = 1 # Define model model = dict( type="STDiT2-XL/2", from_pretrained=None, input_sq_size=512, # pretrained model is trained on 512x512 qk_norm=True, qk_norm_legacy=True, 
enable_flash_attn=True, enable_layernorm_kernel=True, ) vae = dict( type="VideoAutoencoderKL", from_pretrained="stabilityai/sd-vae-ft-ema", micro_batch_size=4, local_files_only=True, ) text_encoder = dict( type="t5", from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl", model_max_length=200, shardformer=True, local_files_only=True, ) scheduler = dict( type="iddpm", timestep_respacing="", ) # Others seed = 42 outputs = "outputs" wandb = False epochs = 1000 log_every = 10 ckpt_every = 500 load = None batch_size = 10 # only for logging lr = 2e-5 grad_clip = 1.0 ================================================ FILE: Open-Sora/configs/opensora-v1-2/inference/sample.py ================================================ resolution = "240p" aspect_ratio = "9:16" num_frames = 51 fps = 24 frame_interval = 1 save_fps = 24 #save_dir = "./samples/samples/" save_dir = "/root/autodl-tmp/video_samples/" seed = 42 batch_size = 1 multi_resolution = "STDiT2" dtype = "bf16" condition_frame_length = 5 align = 5 model = dict( type="STDiT3-XL/2", from_pretrained="/root/autodl-tmp/pretrained_models/hpcai-tech/OpenSora-STDiT-v3", qk_norm=True, enable_flash_attn=True,#True enable_layernorm_kernel=True,#True ) vae = dict( type="OpenSoraVAE_V1_2", from_pretrained="/root/autodl-tmp/pretrained_models/hpcai-tech/OpenSora-VAE-v1.2", micro_frame_size=17, micro_batch_size=4, ) text_encoder = dict( type="t5", from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl", model_max_length=300, ) scheduler = dict( type="rflow", use_timestep_transform=True, num_sampling_steps=30, cfg_scale=7.0, ) aes = 6.5 flow = None #num_sample = 1 ================================================ FILE: Open-Sora/docs/acceleration.md ================================================ # Acceleration >This document corresponds to our v1.1 release Open-Sora aims to provide a high-speed training framework for diffusion models. 
We can achieve **55%** training speed acceleration when training on **64 frames 512x512 videos**. Our framework supports training **1min 1080p videos**. ## Accelerated Transformer Open-Sora boosts the training speed by: - Kernel optimization including [flash attention](https://github.com/Dao-AILab/flash-attention), fused layernorm kernel, and the ones compiled by colossalAI. - Hybrid parallelism including ZeRO. - Gradient checkpointing for larger batch size. Our training speed on images is comparable to [OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT), a project to accelerate DiT training. The training speed is measured on 8 H800 GPUs with batch size 128, image size 256x256. | Model | Throughput (img/s/GPU) | Throughput (tokens/s/GPU) | | -------- | ---------------------- | ------------------------- | | DiT | 100 | 26k | | OpenDiT | 175 | 45k | | OpenSora | 175 | 45k | ## Efficient STDiT Our STDiT adopts spatial-temporal attention to model the video data. Compared with directly applying full attention on DiT, our STDiT is more efficient as the number of frames increases. Our current framework only supports sequence parallelism for very long sequences. The training speed is measured on 8 H800 GPUs with acceleration techniques applied; GC means gradient checkpointing. Both models use T5 conditioning, like PixArt. | Model | Setting | Throughput (sample/s/GPU) | Throughput (tokens/s/GPU) | | ---------------- | -------------- | ------------------------- | ------------------------- | | DiT | 16x256 (4k) | 7.20 | 29k | | STDiT | 16x256 (4k) | 7.00 | 28k | | DiT | 16x512 (16k) | 0.85 | 14k | | STDiT | 16x512 (16k) | 1.45 | 23k | | DiT (GC) | 64x512 (65k) | 0.08 | 5k | | STDiT (GC) | 64x512 (65k) | 0.40 | 25k | | STDiT (GC, sp=2) | 360x512 (370k) | 0.10 | 18k | With a 4x downsampling in the temporal dimension with Video-VAE, a 24fps video has 450 frames.
The gap between the speed of STDiT (28k tokens/s) and DiT on images (up to 45k tokens/s) mainly comes from the T5 and VAE encoding, and temporal attention. ## Accelerated Encoder (T5, VAE) During training, texts are encoded by T5, and videos are encoded by VAE. Typically there are two ways to accelerate the training: 1. Preprocess text and video data in advance and save them to disk. 2. Encode text and video data during training, and accelerate the encoding process. For option 1, 120 tokens for one sample require 1 MB of disk space, and a 64x64x64 latent requires 4 MB. Considering a training dataset with 10M video clips, the total disk space required is 50TB. Our storage system is not ready at this time for this scale of data. For option 2, we improve T5's speed and reduce its memory requirement. Following [OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT), we find that VAE consumes a large amount of GPU memory. Thus we split the batch into smaller ones for VAE encoding. With both techniques, we can greatly accelerate the training speed. The training speed is measured on 8 H800 GPUs with STDiT. | Acceleration | Setting | Throughput (img/s/GPU) | Throughput (tokens/s/GPU) | | ------------ | ------------- | ---------------------- | ------------------------- | | Baseline | 16x256 (4k) | 6.16 | 25k | | w. faster T5 | 16x256 (4k) | 7.00 | 29k | | Baseline | 64x512 (65k) | 0.94 | 15k | | w.
both | 64x512 (65k) | 1.45 | 23k | ================================================ FILE: Open-Sora/docs/commands.md ================================================ # Commands - [Config](#Config) - [Inference](#inference) - [Inference with Open-Sora 1.2](#inference-with-open-sora-12) - [Inference with Open-Sora 1.1](#inference-with-open-sora-11) - [Inference with DiT pretrained on ImageNet](#inference-with-dit-pretrained-on-imagenet) - [Inference with Latte pretrained on UCF101](#inference-with-latte-pretrained-on-ucf101) - [Inference with PixArt-α pretrained weights](#inference-with-pixart-α-pretrained-weights) - [Inference with checkpoints saved during training](#inference-with-checkpoints-saved-during-training) - [Inference Hyperparameters](#inference-hyperparameters) - [Training](#training) - [Training Hyperparameters](#training-hyperparameters) - [Search batch size for buckets](#search-batch-size-for-buckets) ## Config Note that currently our model loading for vae and diffusion model supports two types: * load from local file path * load from huggingface Our config supports loading from the Hugging Face Hub online by default. If you wish to load from a local path that was downloaded from Hugging Face, you need to set `force_huggingface=True`, for instance: ```python # for vae vae = dict( type="OpenSoraVAE_V1_2", from_pretrained="/root/commonData/OpenSora-VAE-v1.2", micro_frame_size=17, micro_batch_size=4, force_huggingface=True, # NOTE: set here ) # for diffusion model model = dict( type="STDiT3-XL/2", from_pretrained="/root/commonData/OpenSora-STDiT-v3", qk_norm=True, enable_flash_attn=True, enable_layernorm_kernel=True, force_huggingface=True, # NOTE: set here ) ``` However, if you want to load a self-trained model, do not set `force_huggingface=True` since your checkpoint won't be in Hugging Face format. ## Inference You can modify corresponding config files to change the inference settings. See more details [here](/docs/structure.md#inference-config-demos).
### Inference with Open-Sora 1.2 The inference API is compatible with Open-Sora 1.1. To ease users' experience, we add support for the `--resolution` and `--aspect-ratio` options, which are a more user-friendly way to specify the image size. ```bash python scripts/inference.py configs/opensora-v1-2/inference/sample.py \ --resolution 480p --aspect-ratio 9:16 # equivalent to python scripts/inference.py configs/opensora-v1-2/inference/sample.py \ --image-size 480 853 ``` In this version, we have merged all functions in the previous `inference-long.py` into `inference.py`. The command line arguments are the same as before (note, however, that the frame index and length are calculated after 4x compression). ### Inference with Open-Sora 1.1 Since Open-Sora 1.1 supports inference with dynamic input size, you can pass the input size as an argument. ```bash # image sampling with prompt path python scripts/inference.py configs/opensora-v1-1/inference/sample.py \ --ckpt-path CKPT_PATH --prompt-path assets/texts/t2i_samples.txt --num-frames 1 --image-size 1024 1024 # image sampling with prompt python scripts/inference.py configs/opensora-v1-1/inference/sample.py \ --ckpt-path CKPT_PATH --prompt "A beautiful sunset over the city" --num-frames 1 --image-size 1024 1024 # video sampling python scripts/inference.py configs/opensora-v1-1/inference/sample.py \ --ckpt-path CKPT_PATH --prompt "A beautiful sunset over the city" --num-frames 16 --image-size 480 854 ``` You can adjust the `--num-frames` and `--image-size` to generate different results. We recommend using the same image size as the training resolution, which is defined in [aspect.py](/opensora/datasets/aspect.py). Some examples are shown below. - 240p - 16:9 240x426 - 3:4 276x368 - 1:1 320x320 - 480p - 16:9 480x854 - 3:4 554x738 - 1:1 640x640 - 720p - 16:9 720x1280 - 3:4 832x1110 - 1:1 960x960 `inference-long.py` is compatible with `inference.py` and supports advanced features.
```bash # image condition python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py --ckpt-path CKPT_PATH \ --num-frames 32 --image-size 240 426 --sample-name image-cond \ --prompt 'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/wave.png","mask_strategy": "0"}' # video extending python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py --ckpt-path CKPT_PATH \ --num-frames 32 --image-size 240 426 --sample-name image-cond \ --prompt 'A car driving on the ocean.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4","mask_strategy": "0,0,0,-8,8"}' # long video generation python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py --ckpt-path CKPT_PATH \ --num-frames 32 --image-size 240 426 --loop 16 --condition-frame-length 8 --sample-name long \ --prompt '|0|a white jeep equipped with a roof rack driving on a dirt road in a coniferous forest.|2|a white jeep equipped with a roof rack driving on a dirt road in the desert.|4|a white jeep equipped with a roof rack driving on a dirt road in a mountain.|6|A white jeep equipped with a roof rack driving on a dirt road in a city.|8|a white jeep equipped with a roof rack driving on a dirt road on the surface of a river.|10|a white jeep equipped with a roof rack driving on a dirt road under the lake.|12|a white jeep equipped with a roof rack flying into the sky.|14|a white jeep equipped with a roof rack driving in the universe. 
Earth is the background.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4", "mask_strategy": "0,0,0,0,16"}' # video connecting python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py --ckpt-path CKPT_PATH \ --num-frames 32 --image-size 240 426 --sample-name connect \ --prompt 'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/sunset1.png;assets/images/condition/sunset2.png","mask_strategy": "0;0,1,0,-1,1"}' # video editing python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py --ckpt-path CKPT_PATH \ --num-frames 32 --image-size 480 853 --sample-name edit \ --prompt 'A cyberpunk-style city at night.{"reference_path": "https://cdn.pixabay.com/video/2021/10/12/91744-636709154_large.mp4","mask_strategy": "0,0,0,0,32,0.4"}' ``` ### Inference with DiT pretrained on ImageNet The following command automatically downloads the pretrained weights on ImageNet and runs inference. ```bash python scripts/inference.py configs/dit/inference/1x256x256-class.py --ckpt-path DiT-XL-2-256x256.pt ``` ### Inference with Latte pretrained on UCF101 The following command automatically downloads the pretrained weights on UCF101 and runs inference. ```bash python scripts/inference.py configs/latte/inference/16x256x256-class.py --ckpt-path Latte-XL-2-256x256-ucf101.pt ``` ### Inference with PixArt-α pretrained weights Download T5 into `./pretrained_models` and run the following command. 
```bash # 256x256 torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/pixart/inference/1x256x256.py --ckpt-path PixArt-XL-2-256x256.pth # 512x512 torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/pixart/inference/1x512x512.py --ckpt-path PixArt-XL-2-512x512.pth # 1024 multi-scale torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/pixart/inference/1x1024MS.py --ckpt-path PixArt-XL-2-1024MS.pth ``` ### Inference with checkpoints saved during training During training, an experiment logging folder is created in the `outputs` directory. Under each checkpoint folder, e.g. `epoch12-global_step2000`, there is an `ema.pt` file and the shared `model` folder. Run the following command to perform inference. ```bash # inference with ema model torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path outputs/001-STDiT-XL-2/epoch12-global_step2000/ema.pt # inference with model torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path outputs/001-STDiT-XL-2/epoch12-global_step2000 # inference with sequence parallelism # sequence parallelism is enabled automatically when nproc_per_node is larger than 1 torchrun --standalone --nproc_per_node 2 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path outputs/001-STDiT-XL-2/epoch12-global_step2000 ``` The second command will automatically generate a `model_ckpt.pt` file in the checkpoint folder. ### Inference Hyperparameters 1. DPM-solver is good at fast inference for images. However, the video result is not satisfactory. You can use it for fast demo purposes. ```python type="dpm-solver" num_sampling_steps=20 ``` 2. You can use [SVD](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt)'s finetuned VAE decoder on videos for inference (consumes more memory). However, we do not see significant improvement in the video result.
To use it, download [the pretrained weights](https://huggingface.co/maxin-cn/Latte/tree/main/t2v_required_models/vae_temporal_decoder) into `./pretrained_models/vae_temporal_decoder` and modify the config file as follows. ```python vae = dict( type="VideoAutoencoderKLTemporalDecoder", from_pretrained="pretrained_models/vae_temporal_decoder", ) ``` ## Training To resume training, run the following command. ``--load`` is different from ``--ckpt-path`` as it loads the optimizer and dataloader states. ```bash torchrun --nnodes=1 --nproc_per_node=8 scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --load YOUR_PRETRAINED_CKPT ``` To enable wandb logging, add `--wandb` to the command. ```bash WANDB_API_KEY=YOUR_WANDB_API_KEY torchrun --nnodes=1 --nproc_per_node=8 scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --wandb True ``` You can modify the corresponding config files to change the training settings. See more details [here](/docs/structure.md#training-config-demos). ### Training Hyperparameters 1. `dtype` is the data type for training. Only `fp16` and `bf16` are supported. ColossalAI automatically enables the mixed precision training for `fp16` and `bf16`. During training, we find `bf16` more stable. ## Search batch size for buckets To search the batch size for buckets, run the following command. ```bash torchrun --standalone --nproc_per_node 1 scripts/misc/search_bs.py configs/opensora-v1-2/misc/bs.py --data-path /mnt/nfs-207/sora_data/meta/searchbs.csv ``` Here, your data should be a small one for searching purposes. To control the batch size search range, you should specify `bucket_config` in the config file, where the value tuple is `(guess_value, range)` and the search will be performed in `guess_value±range`.
Here is an example of the bucket config: ```python bucket_config = { "240p": { 1: (100, 100), 51: (24, 10), 102: (12, 10), 204: (4, 8), 408: (2, 8), }, "480p": { 1: (50, 50), 51: (6, 6), 102: (3, 3), 204: (1, 2), }, } ``` You can also specify a resolution to search for parallelism. ```bash torchrun --standalone --nproc_per_node 1 scripts/misc/search_bs.py configs/opensora-v1-2/misc/bs.py --data-path /mnt/nfs-207/sora_data/meta/searchbs.csv --resolution 240p ``` The searching goal should be specified in the config file as well. There are two ways: 1. Specify a `base_step_time` in the config file. The searching goal is to find the batch size that can achieve the `base_step_time` for each bucket. 2. If `base_step_time` is not specified, it will be determined by `base` which is a tuple of `(batch_size, step_time)`. The step time is the maximum batch size allowed for the bucket. The script will print the best batch size (and corresponding step time) for each bucket and save the output config file. Note that we assume a larger batch size is better, so the script uses binary search to find the best batch size. ================================================ FILE: Open-Sora/docs/config.md ================================================ # Config Guide - [Inference Config](#inference-config) - [Advanced Inference config](#advanced-inference-config) - [Inference Args](#inference-args) - [Training Config](#training-config) - [Training Args](#training-args) - [Training Bucket Configs](#training-bucket-configs) Our config files follow [MMEngine](https://github.com/open-mmlab/mmengine). MMEngine reads the config file (a `.py` file) and parses it into a dictionary-like object. We expose some fields in the config file to the command line arguments (defined in [opensora/utils/config_utils.py](/opensora/utils/config_utils.py)). To change the inference settings, you can directly modify the corresponding config file. Or you can pass arguments to overwrite the config file.
## Inference Config The explanation of each field is provided below. ```python # Define sampling size num_frames = 64 # number of frames, 1 means image fps = 24 # frames per second (condition for generation) frame_interval = 3 # output video will have fps/frame_interval frames per second image_size = (240, 426) # image size (height, width) # Define model model = dict( type="STDiT2-XL/2", # Select model type (STDiT-XL/2, DiT-XL/2, etc.) from_pretrained="PRETRAINED_MODEL", # (Optional) Load from pretrained model input_sq_size=512, # Base spatial position embedding size qk_norm=True, # Normalize query and key in attention enable_flash_attn=True, # (Optional) Speed up training and inference with flash attention # Turn enable_flash_attn to False if you skip flashattn installation enable_layernorm_kernel=True, # (Optional) Speed up training and inference with fused kernel # Turn enable_layernorm_kernel to False if you skip apex installation ) vae = dict( type="VideoAutoencoderKL", # Select VAE type from_pretrained="stabilityai/sd-vae-ft-ema", # Load from pretrained VAE micro_batch_size=4, # VAE with micro batch size to save memory ) text_encoder = dict( type="t5", # Select text encoder type (t5, clip) from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl", # Load from pretrained text encoder model_max_length=200, # Maximum length of input text ) scheduler = dict( type="iddpm", # Select scheduler type (iddpm, dpm-solver) num_sampling_steps=100, # Number of sampling steps cfg_scale=7.0, # hyper-parameter for classifier-free diffusion cfg_channel=3, # how many channels to use for classifier-free diffusion, if None, use all channels ) dtype = "bf16" # Computation type (fp16, fp32, bf16) # Condition prompt_path = "./assets/texts/t2v_samples.txt" # path to prompt file prompt = None # prompt has higher priority than prompt_path # Other settings batch_size = 1 # batch size seed = 42 # random seed save_dir = "./samples" # path to save samples ``` ## Advanced 
Inference config The [`inference-long.py`](/scripts/inference-long.py) script is used to generate long videos, and it also provides all functions of the [`inference.py`](/scripts/inference.py) script. The following arguments are specific to the `inference-long.py` script. ```python loop = 10 condition_frame_length = 4 reference_path = [ "https://cdn.openai.com/tmp/s/interp/d0.mp4", None, "assets/images/condition/wave.png", ] mask_strategy = [ "0,0,0,0,8,0.3", None, "0,0,0,0,1;0,0,0,-1,1", ] ``` The following figure provides an illustration of the `mask_strategy`: ![mask_strategy](/assets/readme/report_mask_config.png) To generate a long video of infinite time, our strategy is to generate a video with a fixed length first, and then use the last `condition_frame_length` number of frames for the next video generation. This will loop for `loop` times. Thus, the total length of the video is `loop * (num_frames - condition_frame_length) + condition_frame_length`. To condition the generation on images or videos, we introduce the `mask_strategy`. It is 6 number tuples separated by `;`. Each tuple indicate an insertion of the condition image or video to the target generation. The meaning of each number is: - **First number**: the loop index of the condition image or video. (0 means the first loop, 1 means the second loop, etc.) - **Second number**: the index of the condition image or video in the `reference_path`. - **Third number**: the start frame of the condition image or video. (0 means the first frame, and images only have one frame) - **Fourth number**: the location to insert. (0 means insert at the beginning, 1 means insert at the end, and -1 means insert at the end of the video) - **Fifth number**: the number of frames to insert. (1 means insert one frame, and images only have one frame) - **Sixth number**: the edit rate of the condition image or video. (0 means no edit, 1 means full edit). 
To facilitate usage, we also accept passing the reference path and mask strategy as a json appended to the prompt. For example, ```plaintext 'Drone view of waves crashing against the rugged cliffs along Big Sur\'s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff\'s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff\'s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.{"reference_path": "assets/images/condition/cliff.png", "mask_strategy": "0"}' ``` ## Inference Args You can use `python scripts/inference.py --help` to see the following arguments: - `--seed`: random seed - `--ckpt-path`: path to the checkpoint (`model["from_pretrained"]`) - `--batch-size`: batch size - `--save-dir`: path to save samples - `--sample-name`: if None, the sample will be named `sample_{index}.mp4/png`, otherwise, the sample will be named `{sample_name}_{index}.mp4/png` - `--start-index`: start index of the sample - `--end-index`: end index of the sample - `--num-sample`: number of samples to generate for each prompt. The sample will be suffixed by `-0`, `-1`, `-2`, etc.
- `--prompt-as-path`: if True, use the prompt as the name for saving samples - `--prompt-path`: path to the prompt file - `--prompt`: prompt string list - `--num-frames`: number of frames - `--fps`: frames per second - `--image-size`: image size - `--num-sampling-steps`: number of sampling steps (`scheduler["num_sampling_steps"]`) - `--cfg-scale`: hyper-parameter for classifier-free diffusion (`scheduler["cfg_scale"]`) - `--loop`: loop for long video generation - `--condition-frame-length`: condition frame length for long video generation - `--reference-path`: reference path for long video generation - `--mask-strategy`: mask strategy for long video generation Example commands for inference can be found in [commands.md](/docs/commands.md). ## Training Config ```python # Define dataset dataset = dict( type="VariableVideoTextDataset", # Select dataset type # VideoTextDataset for OpenSora 1.0, VariableVideoTextDataset for OpenSora 1.1 and 1.2 data_path=None, # Path to the dataset num_frames=None, # Number of frames, set None since we support dynamic training frame_interval=3, # Frame interval image_size=(None, None), # Image size, set None since we support dynamic training transform_name="resize_crop", # Transform name ) # bucket config usage see next section bucket_config = { "144p": {1: (1.0, 48), 16: (1.0, 17), 32: (1.0, 9), 64: (1.0, 4), 128: (1.0, 1)}, "256": {1: (0.8, 254), 16: (0.5, 17), 32: (0.5, 9), 64: (0.5, 4), 128: (0.5, 1)}, "240p": {1: (0.1, 20), 16: (0.9, 17), 32: (0.8, 9), 64: (0.8, 4), 128: (0.8, 2)}, "512": {1: (0.5, 86), 16: (0.2, 4), 32: (0.2, 2), 64: (0.2, 1), 128: (0.0, None)}, "480p": {1: (0.4, 54), 16: (0.4, 4), 32: (0.0, None)}, "720p": {1: (0.1, 20), 16: (0.1, 2), 32: (0.0, None)}, "1024": {1: (0.3, 20)}, "1080p": {1: (0.4, 8)}, } # mask ratio in training mask_ratios = { "identity": 0.75, # 75% no mask "quarter_random": 0.025, # 2.5% random mask with 1 frame to 1/4 #frames "quarter_head": 0.025, # 2.5% mask at the beginning with 1 frame to 
1/4 #frames "quarter_tail": 0.025, # 2.5% mask at the end with 1 frame to 1/4 #frames "quarter_head_tail": 0.05, # 5% mask at the beginning and end with 1 frame to 1/4 #frames "image_random": 0.025, # 2.5% random mask with 1 image to 1/4 #images "image_head": 0.025, # 2.5% mask at the beginning with 1 image to 1/4 #images "image_tail": 0.025, # 2.5% mask at the end with 1 image to 1/4 #images "image_head_tail": 0.05, # 5% mask at the beginning and end with 1 image to 1/4 #images } # Define acceleration num_workers = 8 # Number of workers for dataloader num_bucket_build_workers = 16 # Number of workers for bucket building dtype = "bf16" # Computation type (fp16, fp32, bf16) grad_checkpoint = True # Use gradient checkpointing plugin = "zero2" # Plugin for training sp_size = 1 # Sequence parallel size # Define model model = dict( type="STDiT2-XL/2", # Select model type (STDiT-XL/2, DiT-XL/2, etc.) from_pretrained=None, # Load from pretrained model input_sq_size=512, # Base spatial position embedding size qk_norm=True, # Normalize query and key in attention enable_flash_attn=True, # (Optional) Speed up training and inference with flash attention enable_layernorm_kernel=True, # (Optional) Speed up training and inference with fused kernel ) vae = dict( type="VideoAutoencoderKL", # Select VAE type from_pretrained="stabilityai/sd-vae-ft-ema", micro_batch_size=4, # VAE with micro batch size to save memory local_files_only=True, # Load from local files only (first time should be false) ) text_encoder = dict( type="t5", # Select text encoder type (t5, clip) from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl", model_max_length=200, # Maximum length of input text shardformer=True, # Use shardformer local_files_only=True, # Load from local files only (first time should be false) ) scheduler = dict( type="iddpm", # Select scheduler type (iddpm, iddpm-speed) timestep_respacing="", ) # Others seed = 42 # random seed outputs = "outputs" # path to save outputs 
wandb = False # Use wandb or not epochs = 1000 # Number of epochs (set a large number and kill the process when you want to stop) log_every = 10 ckpt_every = 500 load = None batch_size = None lr = 2e-5 grad_clip = 1.0 ``` ## Training Args - `--seed`: random seed - `--ckpt-path`: path to the checkpoint (`model["from_pretrained"]`) - `--batch-size`: batch size - `--wandb`: use wandb or not - `--load`: path to the checkpoint to load - `--data-path`: path to the dataset (`dataset["data_path"]`) See [commands.md](/docs/commands.md) for example commands. ## Training Bucket Configs We support multi-resolution/aspect-ratio/num_frames training with bucket. To enable dynamic training (for STDiT2), use `VariableVideoText` dataset, and set the `bucket_config` in the config. An example is: ```python bucket_config = { "240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)}, "256": {1: (1.0, 256)}, "512": {1: (1.0, 80)}, "480p": {1: (1.0, 52), 16: (0.5, 4), 32: (0.0, None)}, "720p": {16: (1.0, 2), 32: (0.0, None)}, "1024": {1: (1.0, 20)}, "1080p": {1: (1.0, 8)}, } ``` This looks a bit difficult to understand at the first glance. Let's understand this config step by step. ### Three-level bucket ![bucket](/assets/readme/report_bucket.png) We design a three-level bucket: `(resolution, num_frames, aspect_ratios)`. The resolution and aspect ratios is predefined in [aspect.py](/opensora/datasets/aspect.py). Commonly used resolutions (e.g., 240p, 1080p) are supported, and the name represents the number of pixels (e.g., 240p is 240x426, however, we define 240p to represent any size with HxW approximately 240x426=102240 pixels). The aspect ratios are defined for each resolution. You do not need to define the aspect ratios in the `bucket_config`. The `num_frames` is the number of frames in each sample, with `num_frames=1` especially for images. If `frame_intervals` is not 1, a bucket with `num_frames=k` will contain videos with `k*frame_intervals` frames except for images. 
Only a video with more than `num_frames` and more than `resolution` pixels will be likely to be put into the bucket. The two numbers defined in the bucket config are `(keep_prob, batch_size)`. Since the memory and speed of samples from different buckets may be different, we use `batch_size` to balance the processing speed. Since our computation is limited, we cannot process videos with their original resolution as stated in OpenAI's sora's report. Thus, we give a `keep_prob` to control the number of samples in each bucket. The `keep_prob` is the probability to keep a sample in the bucket. Let's take the following config as an example: ```python bucket_config = { "480p": {16: (1.0, 8),}, "720p": {16: (0.5, 4),}, "1080p": {16: (0.2, 2)}, "4K": {16: (0.1, 1)}, } ``` Given a 2K video with more than 16 frames, the program will first try to put it into bucket "1080p" since it has a larger resolution than 1080p but less than 4K. Since the `keep_prob` for 1080p is 20%, a random number is generated, and if it is less than 0.2, the video will be put into the bucket. If the video is not put into the bucket, the program will try to put it into the "720p" bucket. Since the `keep_prob` for 720p is 50%, the video has a 50% chance to be put into the bucket. If the video is not put into the bucket, the program will try to put it into the "480p" bucket directly as it is the smallest resolution. ### Examples Let's see some simple examples to understand the bucket config. First, the aspect ratio bucket is compulsory, if you want to modify this you need to add your own resolution definition in [aspect.py](/opensora/datasets/aspect.py).
Then, to keep only 256x256 resolution and 16 frames as OpenSora 1.0, you can use the following config: ```python bucket_config = { "256": {16: (1.0, 8)}, } ``` If you want to train a model supporting different resolutions of images, you can use the following config (example [image.py](/configs/opensora-v1-1/train/image.py)): ```python bucket_config = { "256": {1: (1.0, 256)}, "512": {1: (1.0, 80)}, "480p": {1: (1.0, 52)}, "1024": {1: (1.0, 20)}, "1080p": {1: (1.0, 8)}, } ``` Or if you find the number of high-resolution images is too large, you can modify the `keep_prob` to reduce the number of samples in the bucket: ```python bucket_config = { "256": {1: (1.0, 256)}, "512": {1: (0.8, 80)}, "480p": {1: (0.5, 52)}, "1024": {1: (0.5, 20)}, "1080p": {1: (0.2, 8)}, } ``` And similarly for videos (example [video.py](/configs/opensora-v1-1/train/video.py)): ```python bucket_config = { "240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)}, "480p": {16: (1.0, 4)}, "720p": {16: (0.5, 2)}, } ``` Note that in the above case, a video with 480p resolution and more than 16 frames will all go into bucket `("480p", 16)`, since they all satisfy this bucket's requirement. But training long videos with 480p resolution may be slow, so you can modify the config as follows to enforce the video with more than 32 frames to go into the 240p bucket. ```python bucket_config = { "240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)}, "480p": {16: (1.0, 4), 32: (0.0, None)}, "720p": {16: (0.5, 2)}, } ``` Combining the above examples, we think you can understand the bucket config provided at the beginning of this section and in the config files. ================================================ FILE: Open-Sora/docs/data_processing.md ================================================ # Data Processing >Open-Sora v1.2 uses Data Processing Pipeline v1.1. We establish a complete pipeline for video/image data processing. The pipeline is shown below.
![pipeline](/assets/readme/report_data_pipeline.png) First, raw videos, either from the Internet or public datasets, are split into shorter clips based on scene detection. Then, we evaluate these videos by predicting multiple scores using existing models. We first predict the aesthetic score and the optical flow score for a video. We also conduct OCR to detect texts in the video. Only videos with satisfactory evaluation results are sent to the next step for captioning. After captioning, the matching score is also calculated as an assessment of video-text alignment. Finally, we filter samples based on the matching score and conduct camera motion detection for the remaining samples. In summary, our pipeline produces video-text pairs which have high aesthetic quality, large video motion and strong semantic consistency. Below is an example workflow to process videos. ```bash ROOT_VIDEO="/path/to/video/folder" ROOT_CLIPS="/path/to/video/clips/folder" ROOT_META="/path/to/meta/folder" # 1.1 Create a meta file from a video folder. This should output ${ROOT_META}/meta.csv python -m tools.datasets.convert video ${ROOT_VIDEO} --output ${ROOT_META}/meta.csv # 1.2 Get video information and remove broken videos. This should output ${ROOT_META}/meta_info_fmin1.csv python -m tools.datasets.datautil ${ROOT_META}/meta.csv --info --fmin 1 # 2.1 Detect scenes. This should output ${ROOT_META}/meta_info_fmin1_timestamp.csv python -m tools.scene_cut.scene_detect ${ROOT_META}/meta_info_fmin1.csv # 2.2 Cut video into clips based on scenes. This should produce video clips under ${ROOT_CLIPS} python -m tools.scene_cut.cut ${ROOT_META}/meta_info_fmin1_timestamp.csv --save_dir ${ROOT_CLIPS} # 2.3 Create a meta file for video clips. This should output ${ROOT_META}/meta_clips.csv python -m tools.datasets.convert video ${ROOT_CLIPS} --output ${ROOT_META}/meta_clips.csv # 2.4 Get clips information and remove broken ones. 
This should output ${ROOT_META}/meta_clips_info_fmin1.csv python -m tools.datasets.datautil ${ROOT_META}/meta_clips.csv --info --fmin 1 # 3.1 Predict aesthetic scores. This should output ${ROOT_META}/meta_clips_info_fmin1_aes.csv torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference \ ${ROOT_META}/meta_clips_info_fmin1.csv \ --bs 1024 \ --num_workers 16 # 3.2 Filter by aesthetic scores. This should output ${ROOT_META}/meta_clips_info_fmin1_aes_aesmin5.csv python -m tools.datasets.datautil ${ROOT_META}/meta_clips_info_fmin1_aes.csv --aesmin 5 # 4.1 Generate caption. This should output ${ROOT_META}/meta_clips_info_fmin1_aes_aesmin5_caption_part*.csv torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava \ ${ROOT_META}/meta_clips_info_fmin1_aes_aesmin5.csv \ --dp-size 8 \ --tp-size 1 \ --model-path /path/to/llava-v1.6-mistral-7b \ --prompt video # 4.2 Merge caption results. This should output ${ROOT_META}/meta_clips_caption.csv python -m tools.datasets.datautil ${ROOT_META}/meta_clips_info_fmin1_aes_aesmin5_caption_part*.csv --output ${ROOT_META}/meta_clips_caption.csv # 4.3 Clean caption. This should output ${ROOT_META}/meta_clips_caption_cleaned.csv python -m tools.datasets.datautil \ ${ROOT_META}/meta_clips_caption.csv \ --clean-caption \ --refine-llm-caption \ --remove-empty-caption \ --output ${ROOT_META}/meta_clips_caption_cleaned.csv # 4.4 Optionally generate tags (e.g., objects) based on the captions. 
This should output your_output_prefix_{key}.csv torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llama3 ${ROOT_META}/meta_clips_caption_cleaned.csv --key objects --output_prefix your_output_prefix ``` For more information, please refer to: - [Dataset Management](../tools/datasets/README.md) - [Scene Detection and Video Splitting](../tools/scene_cut/README.md) - [Scoring and Filtering](../tools/scoring/README.md) - [Captioning](../tools/caption/README.md) ================================================ FILE: Open-Sora/docs/datasets.md ================================================ # Datasets For Open-Sora 1.2, we conduct mixed training with both images and videos. The main datasets we use are listed below. Please refer to [README](/README.md#data-processing) for data processing. ## Video ### Webvid-10M [Webvid-10M](https://github.com/m-bain/webvid) contains 10 million video-text pairs scraped from the stock footage sites. We first train the model on this dataset (40k hours) for 30k steps (2 epochs). ### Panda-70M [Panda-70M](https://github.com/snap-research/Panda-70M) is a large-scale dataset with 70M video-caption pairs. We use the [training-10M subset](https://github.com/snap-research/Panda-70M/tree/main/dataset_dataloading) for training, which contains ~10M videos of better quality. ### Mixkit [Mixkit](https://mixkit.co/) is a video website where we obtained 9k videos. ### Pixabay [Pixabay](https://pixabay.com/videos/) is video website where we obtained 60.5k videos. ### Pexels [Pexels](https://www.pexels.com/) is a popular online platform that provides high-quality stock photos, videos, and music for free. Most videos from this website are of high quality. Thus, we use them for both pre-training and HQ fine-tuning. We really appreciate the great platform and the contributors! ### Inter4K [Inter4K](https://github.com/alexandrosstergiou/Inter4K) is a dataset containing 1K video clips with 4K resolution. 
The dataset is proposed for super-resolution tasks. We use the dataset for HQ fine-tuning. ### HD-VG-130M [HD-VG-130M](https://github.com/daooshee/HD-VG-130M?tab=readme-ov-file) comprises 130M text-video pairs. The caption is generated by BLIP-2. We find the scene and the text quality are relatively poor. For OpenSora 1.0, we only use ~350K samples from this dataset. ### MiraData [MiraData](https://github.com/mira-space/MiraData): a high-quality dataset with 77k long videos, mainly from games and city/scenic exploration. ### Vript [Vript](https://github.com/mutonix/Vript/tree/main): a densely annotated dataset of 400k videos. ## Image ### Midjourney-v5-1.7M [Midjourney-v5-1.7M](https://huggingface.co/datasets/wanng/midjourney-v5-202304-clean) includes 1.7M image-text pairs. In detail, this dataset introduces two subsets: original and upscale. This dataset is proposed for exploring the relationship of prompts and high-quality images. ### Midjourney-kaggle-clean [Midjourney-kaggle-clean](https://huggingface.co/datasets/wanng/midjourney-kaggle-clean) is a reconstructed version of [Midjourney User Prompts & Generated Images (250k)](https://www.kaggle.com/datasets/succinctlyai/midjourney-texttoimage?select=general-01_2022_06_20.json%5D), which is cleaned by rules. Moreover, this dataset is divided into two subsets: original and upscale. This dataset is proposed for enabling research on text-to-image model prompting. ### Unsplash-lite The [Unsplash-lite](https://github.com/unsplash/datasets) Dataset comprises 25k nature-themed Unsplash photos, 25k keywords, and 1M searches. This dataset covers a vast range of uses and contexts. Its extensive scope in intent and semantics opens new avenues for research and learning. ### LAION-AESTHETICS 6.5+ LAION aesthetic 6.5+ dataset is a subset of the LAION dataset, which contains 625K high-quality images with aesthetic scores > 6.5. 
However, as LAION is currently not publicly available, we use this 168k [subset](https://huggingface.co/datasets/bhargavsdesai/laion_improved_aesthetics_6.5plus_with_images). ================================================ FILE: Open-Sora/docs/installation.md ================================================ # Installation Requirements are listed in `requirements` folder. Note that besides these packages, some packages need to be manually installed; they are detailed in the following sections. ## Training & Inference You need to install `opensora` for training and inference. You can follow the steps below for installation. We also provide guidelines for different CUDA versions for compatibility. Please note that the default installation is for training and inference only. Other optional dependencies are detailed in the sections [Data Processing](#data-processing), [Evaluation](#evaluation), and [VAE](#vae) respectively. ### Step 1: Install PyTorch and xformers First of all, make sure you have the latest build toolkit for Python. ```bash # update build libs pip install -U pip setuptools wheel ``` If you are using **CUDA 12.1**, you can execute the command below to directly install PyTorch, torchvision and xformers. ```bash # install pytorch, torchvision, and xformers pip install -r requirements/requirements-cu121.txt ``` If you are using different CUDA versions, you need to manually install `torch`, `torchvision` and `xformers`. You can find the compatible distributions according to the links below. - PyTorch: choose install commands from [PyTorch installation page](https://pytorch.org/get-started/locally/) based on your own CUDA version. - xformers: choose install commands from [xformers repo](https://github.com/facebookresearch/xformers?tab=readme-ov-file#installing-xformers) based on your own CUDA version.
### Step 2: Install Open-Sora Then, you can install the project for training and inference with the following commands: ```bash # install this project git clone https://github.com/hpcaitech/Open-Sora cd Open-Sora # the default installation is for inference only pip install -v . # NOTE: for development mode, run `pip install -v -e .` ``` ### Step 3: Install Acceleration Tools (Optional) This is optional but recommended for faster speed, especially for training. To enable `layernorm_kernel` and `flash_attn`, you need to install `apex` and `flash-attn` with the following commands. ```bash # install flash attention # set enable_flash_attn=False in config to disable flash attention pip install packaging ninja pip install flash-attn --no-build-isolation # install apex, the compilation will take a long time # set enable_layernorm_kernel=False in config to disable apex pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git ``` ## Data Processing ### Step 1: Install Requirements First, run the following command to install requirements: ```bash pip install -v .[data] # For development: `pip install -v -e .[eval]` ``` Next, you need to manually install the packages listed in the following sections specific to your data processing needs. ### Step 2: Install OpenCV To get image and video information, we use [opencv-python](https://github.com/opencv/opencv-python). 
You can install it with pip: ```bash pip install opencv-python ``` However, if your videos are in av1 codec instead of h264, you need to install ffmpeg (already in our [requirement script](../requirements/requirements-data.txt)), then run the following to make conda support av1 codec: ```bash pip uninstall opencv-python conda install -c conda-forge opencv ``` ### Step 3: Install Task-specific Dependencies We have a variety of data processing pipelines, each requires its own dependencies. You can refer to the sections below to install dependencies according to your own needs. #### LLaVA Captioning You need to manually install LLaVA with the following command: ```bash pip install --no-deps llava@git+https://github.com/haotian-liu/LLaVA.git@v1.2.2.post1 ``` #### PLLaVA Captioning You need to manually install PLLaVa with the following commands: ```bash cd tools/caption/pllava_dir # Assume you are in Open-Sora-dev root directory git clone https://github.com/magic-research/PLLaVA.git cd PLLaVA git checkout fd9194a # since there is no version tag, we use this commit python python_scripts/hf.py # download the PLLaVA weights # IMPORTANT: create new environment for reliable pllava performances: conda create -n pllava python=3.10 # You need to manually install `torch`, `torchvision` and `xformers` for different CUDA versions, the following works for CUDA 12.1: conda activate pllava pip install -r ../../../requirements/requirements-cu121.txt pip install packaging ninja pip install flash-attn --no-build-isolation # You may manually remove any lines in requirements.txt that contains `cu11`, then run `pip install -r requirements.txt` # Alternatively, use our prepared pllava environment: pip install -r ../../../../requirements/requirements-pllava.txt ``` #### Scene Detection We use [`PySceneDetect`](https://github.com/Breakthrough/PySceneDetect) for this job. 
You need to manually run the following: ```bash pip install scenedetect[opencv] --upgrade ``` #### OCR You need to go into `path_to_your_env/lib/python3.10/site-packages/mmdet/__init__.py` and change the assert of `mmcv_version < digit_version(mmcv_maximum_version)` to `mmcv_version <= digit_version(mmcv_maximum_version)`. If you are unsure of your path to the mmdet init file, simply run our [OCR command](../tools/scoring/README.md) and wait for the mmdet assertion error on mmcv versions. The error will contain the exact path to the mmdet init file. ## Evaluation ### Step 1: Install Requirements To conduct evaluation, run the following command to install requirements: ```bash pip install -v .[eval] # For development:`pip install -v -e .[eval]` ``` ### Step 2: Install VBench You need to install VBench manually by: ```bash # first clone their repo cd .. # assume you are in the Open-Sora root folder, you may install at other location but make sure the soft link paths later are correct git clone https://github.com/Vchitect/VBench.git cd VBench git checkout v0.1.2 # next, fix their hard-coded path issue vim vbench2_beta_i2v/utils.py # find `image_root` in the `load_i2v_dimension_info` function, change it to point to your appropriate image folder # last, create softlinks cd ../Open-Sora # or `cd ../Open-Sora-dev` for development ln -s ../VBench/vbench vbench # you may need to change ../VBench/vbench to your corresponding path ln -s ../VBench/vbench2_beta_i2v vbench2_beta_i2v # you may need to change ../VBench/vbench2_beta_i2v to your corresponding path # later you need to make sure to run evaluation from your Open-Sora folder, else vbench, vbench2_beta_i2v cannot be found ``` ### Step 3: Install `cupy` for Potential VAE Errors You need to manually install [cupy](https://docs.cupy.dev/en/stable/install.html).
- For CUDA v11.2~11.8 (x86_64 / aarch64), `pip install cupy-cuda11x` - For CUDA v12.x (x86_64 / aarch64), `pip install cupy-cuda12x` Note that for VAE evaluation, you may run into the error `ModuleNotFoundError: No module named 'torchvision.transforms.functional_tensor'`. In this case, you need to go to the corresponding file (`.../pytorchvideo/transforms/augmentations.py`) reporting this error, then change it as follows: ```python # find the original line: import torchvision.transforms.functional_tensor as F_t # change to: import torchvision.transforms._functional_tensor as F_t ``` ## VAE ### Step 1: Install Requirements To train and evaluate your own VAE, run the following command to install requirements: ```bash pip install -v .[vae] # For development:`pip install -v -e .[vae]` ``` ### Step 2: VAE Evaluation (`cupy` and Potential VAE Errors) Refer to the [Evaluation's VAE section](#step-3-install-cupy-for-potential-vae-errors) above. ================================================ FILE: Open-Sora/docs/report_01.md ================================================ # Open-Sora 1.0 Report OpenAI's Sora is amazing at generating one-minute high-quality videos. However, it reveals almost no information about its details. To make AI more "open", we are dedicated to building an open-source version of Sora. This report describes our first attempt to train a transformer-based video diffusion model. ## Efficiency in choosing the architecture To lower the computational cost, we want to utilize existing VAE models. Sora uses spatial-temporal VAE to reduce the temporal dimensions. However, we found that there is no open-source high-quality spatial-temporal VAE model. [MAGVIT](https://github.com/google-research/magvit)'s 4x4x4 VAE is not open-sourced, while [VideoGPT](https://wilson1yan.github.io/videogpt/index.html)'s 2x4x4 VAE has a low quality in our experiments.
Thus, we decided to use a 2D VAE (from [Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original)) in our first version. The video training involves a large amount of tokens. Considering 24fps 1min videos, we have 1440 frames. With VAE downsampling 4x and patch size downsampling 2x, we have 1440x1024≈1.5M tokens. Full attention on 1.5M tokens leads to a huge computational cost. Thus, we use spatial-temporal attention to reduce the cost following [Latte](https://github.com/Vchitect/Latte). As shown in the figure, we insert a temporal attention right after each spatial attention in STDiT (ST stands for spatial-temporal). This is similar to variant 3 in Latte's paper. However, we do not control a similar number of parameters for these variants. While Latte's paper claims their variant is better than variant 3, our experiments on 16x256x256 videos show that with same number of iterations, the performance ranks as: DiT (full) > STDiT (Sequential) > STDiT (Parallel) ≈ Latte. Thus, we choose STDiT (Sequential) out of efficiency. Speed benchmark is provided [here](/docs/acceleration.md#efficient-stdit). ![Architecture Comparison](/assets/readme/report_arch_comp.png) To focus on video generation, we hope to train the model based on a powerful image generation model. [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha) is an efficiently trained high-quality image generation model with T5-conditioned DiT structure. We initialize our model with PixArt-α and initialize the projection layer of inserted temporal attention with zero. This initialization preserves model's ability of image generation at beginning, while Latte's architecture cannot. The inserted attention increases the number of parameter from 580M to 724M. 
![Architecture](/assets/readme/report_arch.jpg) Drawing from the success of PixArt-α and Stable Video Diffusion, we also adopt a progressive training strategy: 16x256x256 on 366K pretraining datasets, and then 16x256x256, 16x512x512, and 64x512x512 on 20K datasets. With scaled position embedding, this strategy greatly reduces the computational cost. We also try to use a 3D patch embedder in DiT. However, with 2x downsampling on temporal dimension, the generated videos have a low quality. Thus, we leave the downsampling to temporal VAE in our next version. For now, we sample at every 3 frames with 16 frames training and every 2 frames with 64 frames training. ## Data is the key to high quality We find that the number and quality of data have a great impact on the quality of generated videos, even larger than the model architecture and training strategy. At this time, we only prepared the first split (366K video clips) from [HD-VG-130M](https://github.com/daooshee/HD-VG-130M). The quality of these videos varies greatly, and the captions are not that accurate. Thus, we further collect 20k relatively high quality videos from [Pexels](https://www.pexels.com/), which provides free license videos. We label the video with LLaVA, an image captioning model, with three frames and a designed prompt. With the designed prompt, LLaVA can generate good-quality captions. ![Caption](/assets/readme/report_caption.png) As we lay more emphasis on the quality of data, we prepare to collect more data and build a video preprocessing pipeline in our next version. ## Training Details With a limited training budget, we made only a few explorations. We find that a learning rate of 1e-4 is too large and scale it down to 2e-5. When training with a large batch size, we find `fp16` less stable than `bf16` and may lead to generation failure. Thus, we switch to `bf16` for training on 64x512x512. For other hyper-parameters, we follow previous works.
## Loss curves 16x256x256 Pretraining Loss Curve ![16x256x256 Pretraining Loss Curve](/assets/readme/report_loss_curve_1.png) 16x256x256 HQ Training Loss Curve ![16x256x256 HQ Training Loss Curve](/assets/readme/report_loss_curve_2.png) 16x512x512 HQ Training Loss Curve ![16x512x512 HQ Training Loss Curve](/assets/readme/report_loss_curve_3.png) > Core Contributor: Zangwei Zheng*, Xiangyu Peng*, Shenggui Li, Hongxing Liu, Yang You ================================================ FILE: Open-Sora/docs/report_02.md ================================================ # Open-Sora 1.1 Report - [Model Architecture Modification](#model-architecture-modification) - [Support for Multi-time/resolution/aspect ratio/fps Training](#support-for-multi-timeresolutionaspect-ratiofps-training) - [Masked DiT as Image/Video-to-Video Model](#masked-dit-as-imagevideo-to-video-model) - [Data Collection \& Pipeline](#data-collection--pipeline) - [Training Details](#training-details) - [Limitation and Future Work](#limitation-and-future-work) In the Open-Sora 1.1 release, we train a 700M model on 10M data (Open-Sora 1.0 trained on 400K data) with a better STDiT architecture. We implement the following features mentioned in [sora's report](https://openai.com/research/video-generation-models-as-world-simulators): - Variable durations, resolutions, aspect ratios (Sampling flexibility, Improved framing and composition) - Prompting with images and videos (Animating images, Extending generated videos, Video-to-video editing, Connecting videos) - Image generation capabilities To achieve this goal, we use multi-task learning in the pretraining stage. For diffusion models, training with different sampled timesteps is already a form of multi-task learning. We further extend this idea to multi-resolution, aspect ratio, frame length, fps, and different mask strategies for image and video conditioned generation. We train the model on **0s~15s, 144p to 720p, various aspect ratios** videos.
Although the quality of time consistency is not that high due to limited training FLOPs, we can still see the potential of the model. ## Model Architecture Modification We made the following modifications to the original ST-DiT for better training stability and performance (ST-DiT-2): - **[Rope embedding](https://arxiv.org/abs/2104.09864) for temporal attention**: Following LLM's best practice, we change the sinusoidal positional encoding to rope embedding for temporal attention since it is also a sequence prediction task. - **AdaIN and Layernorm for temporal attention**: we wrap the temporal attention with AdaIN and layernorm as the spatial attention to stabilize the training. - **[QK-normalization](https://arxiv.org/abs/2302.05442) with [RMSNorm](https://arxiv.org/abs/1910.07467)**: Following [SD3](https://arxiv.org/pdf/2403.03206.pdf), we apply QK-normalization to all attention for better training stability in half-precision. - **Dynamic input size support and video information condition**: To support multi-resolution, aspect ratio, and fps training, we make ST-DiT-2 accept any input size, and automatically scale positional embeddings. Extending [PixArt-alpha](https://github.com/PixArt-alpha/PixArt-alpha)'s idea, we conditioned on video's height, width, aspect ratio, frame length, and fps. - **Extending T5 tokens from 120 to 200**: our caption is usually less than 200 tokens, and we find the model can handle longer text well. ## Support for Multi-time/resolution/aspect ratio/fps Training As mentioned in the [sora's report](https://openai.com/research/video-generation-models-as-world-simulators), training with original video's resolution, aspect ratio, and length increases sampling flexibility and improves framing and composition. We found three ways to achieve this goal: - [NaViT](https://arxiv.org/abs/2307.06304): support dynamic size within the same batch by masking, with little efficiency loss.
However, the system is a bit complex to implement, and may not benefit from optimized kernels such as flash attention. - Padding ([FiT](https://arxiv.org/abs/2402.12376), [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan)): support dynamic size within the same batch by padding. However, padding different resolutions to the same size is not efficient. - Bucket ([SDXL](https://arxiv.org/abs/2307.01952), [PixArt](https://arxiv.org/abs/2310.00426)): support dynamic size in different batches by bucketing, but the size must be the same within the same batch, and only a fixed number of size can be applied. With the same size in a batch, we do not need to implement complex masking or padding. For the simplicity of implementation, we choose the bucket method. We pre-define some fixed resolution, and allocate different samples to different bucket. The concern for bucketing is listed below. But we can see that the concern is not a big issue in our case.
View the concerns - The bucket size is limited to a fixed number: First, in real-world applications, only a few aspect ratios (9:16, 3:4) and resolutions (240p, 1080p) are commonly used. Second, we find trained models can generalize well to unseen resolutions. - The size in each batch is the same, which breaks the i.i.d. assumption: Since we are using multiple GPUs, the local batches on different GPUs have different sizes. We did not see a significant performance drop due to this issue. - There may not be enough samples to fill each bucket and the distribution may be biased: First, our dataset is large enough to fill each bucket when local batch size is not too large. Second, we should analyze the data's distribution on sizes and define the bucket size accordingly. Third, an unbalanced distribution did not affect the training process significantly. - Different resolutions and frame lengths may have different processing speed: Different from PixArt, which only deals with aspect ratios of similar resolutions (similar token numbers), we need to consider the processing speed of different resolutions and frame lengths. We can use the `bucket_config` to define the batch size for each bucket to ensure the processing speed is similar.
![bucket](/assets/readme/report_bucket.png) As shown in the figure, a bucket is a triplet of `(resolution, num_frame, aspect_ratio)`. We provide pre-defined aspect ratios for different resolution that covers most of the common video aspect ratios. Before each epoch, we shuffle the dataset and allocate the samples to different buckets as shown in the figure. We put a sample into the bucket with the largest resolution and frame length that is smaller than the video's. Considering our computational resource is limited, we further introduce two attributes `keep_prob` and `batch_size` for each `(resolution, num_frame)` to reduce the computational cost and enable multi-stage training. Specifically, a high-resolution video will be downsampled to a lower resolution with probability `1-keep_prob` and the batch size for each bucket is `batch_size`. In this way, we can control the number of samples in different buckets and balance the GPU load by searching for a good batch size for each bucket. A detailed explanation of the bucket usage in training is available in [docs/config.md](/docs/config.md#training-bucket-configs). ## Masked DiT as Image/Video-to-Video Model Transformers can be easily extended to support image-to-image and video-to-video tasks. We propose a mask strategy to support image and video conditioning. The mask strategy is shown in the figure below. ![mask strategy](/assets/readme/report_mask.png) Typically, we unmask the frames to be conditioned on for image/video-to-video condition. During the ST-DiT forward, unmasked frames will have timestep 0, while others remain the same (t). We find that directly applying the strategy to a trained model yields poor results as the diffusion model did not learn to handle different timesteps in one sample during training. Inspired by [UL2](https://arxiv.org/abs/2205.05131), we introduce random mask strategy during training.
Specifically, we randomly unmask the frames during training, including unmasking the first frame, the first k frames, the last frame, the last k frames, the first and last k frames, random frames, etc. Based on Open-Sora 1.0, with 50% probability of applying masking, we see the model can learn to handle image conditioning (while 30% yields worse ability) for 10k steps, with a little text-to-video performance drop. Thus, for Open-Sora 1.1, we pretrain the model from scratch with masking strategy. An illustration of masking strategy config to use in inference is given as follows. A five-number tuple provides great flexibility in defining the mask strategy. By conditioning on generated frames, we can autoregressively generate infinite frames (although error propagates). ![mask strategy config](/assets/readme/report_mask_config.png) A detailed explanation of the mask strategy usage is available in [docs/config.md](/docs/config.md#advanced-inference-config). ## Data Collection & Pipeline As we found in Open-Sora 1.0, the data number and quality are crucial for training a good model, so we work hard on scaling the dataset. First, we create an automatic pipeline following [SVD](https://arxiv.org/abs/2311.15127), including scene cutting, captioning, various scoring and filtering, and dataset management scripts and conventions. More information can be found in [docs/data_processing.md](/docs/data_processing.md). ![pipeline](/assets/readme/report_data_pipeline.png) We plan to use [panda-70M](https://snap-research.github.io/Panda-70M/) and other data to train the model, which is approximately 30M+ data. However, we find disk IO a bottleneck for training and data processing at the same time. Thus, we can only prepare a 10M dataset and did not go through all processing pipeline that we built. Finally, we use a dataset with 9.7M videos + 2.6M images for pre-training, and 560k videos + 1.6M images for fine-tuning. The pretraining dataset statistics are shown below.
More information about the dataset can be found in [docs/datasets.md](/docs/datasets.md). Image text tokens (by T5 tokenizer): ![image text tokens](/assets/readme/report_image_textlen.png) Video text tokens (by T5 tokenizer). We directly use panda's short caption for training, and caption other datasets by ourselves. The generated caption is usually less than 200 tokens. ![video text tokens](/assets/readme/report_video_textlen.png) Video duration: ![video duration](/assets/readme/report_video_duration.png) ## Training Details With limited computational resources, we have to carefully monitor the training process, and change the training strategy if we speculate the model is not learning well since there is no computation for ablation study. Thus, Open-Sora 1.1's training includes multiple changes, and as a result, ema is not applied. 1. First, we fine-tune **6k** steps with images of different resolution from `Pixart-alpha-1024` checkpoints. We find the model easily adapts to generate images with different resolutions. We use [SpeeDiT](https://github.com/1zeryu/SpeeDiT) (iddpm-speed) to accelerate the diffusion training. 2. **[Stage 1]** Then, we pretrain the model with gradient-checkpointing for **24k** steps, which takes **4 days** on 64 H800 GPUs. Although the number of samples seen by the model is the same, we find the model learns slowly compared to a smaller batch size. We speculate that at an early stage, the number of steps is more important for training. Most videos are in **240p** resolution, and the config is similar to [stage2.py](/configs/opensora-v1-1/train/stage2.py). The videos look good, but the model does not know much about the temporal knowledge. We use mask ratio of 10%. 3. **[Stage 1]** To increase the number of steps, we switch to a smaller batch size without gradient-checkpointing. We also add fps conditioning at this point. We trained **40k** steps for **2 days**.
Most videos are in **144p** resolution, and the config file is [stage1.py](/configs/opensora-v1-1/train/stage1.py). We use a lower resolution as we find in Open-Sora 1.0 that the model can learn temporal knowledge with relatively low resolution. 4. **[Stage 1]** We find the model cannot learn well for long videos, and find noisy generation results, which we speculate to be the half-precision problem found in Open-Sora 1.0 training. Thus, we adopt the QK-normalization to stabilize the training. Similar to SD3, we find the model quickly adapts to the QK-normalization. We also switch iddpm-speed to iddpm, and increase the mask ratio to 25% as we find image-condition not learning well. We trained for **17k** steps for **14 hours**. Most videos are in **144p** resolution, and the config file is [stage1.py](/configs/opensora-v1-1/train/stage1.py). The stage 1 training lasts for approximately one week, with total step **81k**. 5. **[Stage 2]** We switch to a higher resolution, where most videos are in **240p and 480p** resolution ([stage2.py](/configs/opensora-v1-1/train/stage2.py)). We trained **22k** steps for **one day** on all pre-training data. 6. **[Stage 3]** We switch to a higher resolution, where most videos are in **480p and 720p** resolution ([stage3.py](/configs/opensora-v1-1/train/stage3.py)). We trained **4k** steps for **one day** on high-quality data. We find loading previous stage's optimizer state can help the model learn faster. To summarize, the training of Open-Sora 1.1 requires approximately **9 days** on 64 H800 GPUs. ## Limitation and Future Work As we get one step closer to the replication of Sora, we find many limitations for the current model, and these limitations point to the future work. - **Generation Failure**: we find that in many cases (especially when the total token number is large or the content is complex), our model fails to generate the scene. There may be a collapse in the temporal attention and we have identified a potential bug in our code.
We are working hard to fix it. Besides, we will increase our model size and training data to improve the generation quality in the next version. - **Noisy generation and lack of fluency**: we find the generated video is sometimes noisy and not fluent, especially for long videos. We think the problem is due to not using a temporal VAE. As [Pixart-Sigma](https://arxiv.org/abs/2403.04692) finds that adapting to a new VAE is simple, we plan to develop a temporal VAE for the model in the next version. - **Lack of time consistency**: we find the model cannot generate videos with high time consistency. We think the problem is due to the lack of training FLOPs. We plan to collect more data and continue training the model to improve the time consistency. - **Bad human generation**: We find the model cannot generate high-quality human videos. We think the problem is due to the lack of human data. We plan to collect more human data and continue training the model to improve the human generation. - **Low aesthetic score**: we find the model's aesthetic score is not high. The problem is due to the lack of aesthetic score filtering, which is not conducted due to IO bottleneck. We plan to filter the data by aesthetic score and finetune the model to improve the aesthetic score. - **Worse quality for longer video generation**: we find with the same prompt, the longer video has worse quality. This means the image quality is not equally adapted to different lengths of sequences.
> - **Algorithm & Acceleration**: Zangwei Zheng, Xiangyu Peng, Shenggui Li, Hongxing Liu, Yukun Zhou, Tianyi Li > - **Data Collection & Pipeline**: Xiangyu Peng, Zangwei Zheng, Chenhui Shen, Tom Young, Junjie Wang, Chenfeng Yu ================================================ FILE: Open-Sora/docs/report_03.md ================================================ # Open-Sora 1.2 Report - [Video compression network](#video-compression-network) - [Rectified flow and model adaptation](#rectified-flow-and-model-adaptation) - [More data and better multi-stage training](#more-data-and-better-multi-stage-training) - [Easy and effective model conditioning](#easy-and-effective-model-conditioning) - [Evaluation](#evaluation) - [Sequence parallelism](#sequence-parallelism) In the Open-Sora 1.2 release, we train a 1.1B model on >30M data (\~80k hours), with training cost 35k H100 GPU hours, supporting 0s\~16s, 144p to 720p, various aspect ratios video generation. Our configurations are listed below. Following our 1.1 version, Open-Sora 1.2 can also do image-to-video generation and video extension. | | image | 2s | 4s | 8s | 16s | | ---- | ----- | --- | --- | --- | --- | | 240p | ✅ | ✅ | ✅ | ✅ | ✅ | | 360p | ✅ | ✅ | ✅ | ✅ | ✅ | | 480p | ✅ | ✅ | ✅ | ✅ | 🆗 | | 720p | ✅ | ✅ | ✅ | 🆗 | 🆗 | Here ✅ means that the data is seen during training, and 🆗 means although not trained, the model can run inference at that config. Inference for 🆗 requires more than one 80G memory GPU and sequence parallelism. Besides features introduced in Open-Sora 1.1, Open-Sora 1.2 highlights: - Video compression network - Rectified-flow training - More data and better multi-stage training - Easy and effective model conditioning - Better evaluation metrics All implementations (both training and inference) of the above improvements are available in the Open-Sora 1.2 release. The following sections will introduce the details of the improvements.
We also refine our codebase and documentation to make it easier to use and develop, and add an LLM to [refine input prompts](/README.md#gpt-4o-prompt-refinement) and support more languages. ## Video compression network For Open-Sora 1.0 & 1.1, we used stability-ai's 83M 2D VAE, which compresses the video only in the spatial dimension by 8x8 times. To reduce the temporal dimension, we extracted one frame in every three frames. However, this method led to the low fluency of generated video as the generated fps is sacrificed. Thus, in this release, we introduce the video compression network as OpenAI's Sora does. With a 4 times compression in the temporal dimension, we do not need to extract frames and can generate videos with the original fps. Considering the high computational cost of training a 3D VAE, we hope to re-use the knowledge learnt in the 2D VAE. We notice that after 2D VAE's compression, the features adjacent in the temporal dimension are still highly correlated. Thus, we propose a simple video compression network, which first compresses the video in the spatial dimension by 8x8 times, then compresses the video in the temporal dimension by 4x times. The network is shown below: ![video_compression_network](/assets/readme/report_3d_vae.png) We initialize the 2D VAE with [SDXL's VAE](https://huggingface.co/stabilityai/sdxl-vae), which is better than our previously used one. For the 3D VAE, we adopt the structure of VAE in [Magvit-v2](https://magvit.cs.cmu.edu/v2/), which contains 300M parameters. Along with 83M 2D VAE, the total parameters of the video compression network is 384M. We train the 3D VAE for 1.2M steps with local batch size 1. The training data is videos from Pexels and Pixabay, and the training video size is mainly 17 frames, 256x256 resolution. Causal convolutions are used in the 3D VAE to make the image reconstruction more accurate. Our training involves three stages: 1. For the first 380k steps, we train on 8 GPUs and freeze the 2D VAE.
The training objective includes the reconstruction of the compressed features from 2D VAE (pink one in the figure) and also adds a loss to make features from the 3D VAE similar to the features from the 2D VAE (pink one and green one, called identity loss). We find the latter loss can quickly make the whole VAE achieve a good performance for image and much faster to converge in the next stage. 2. For the next 260k steps, we remove the identity loss and just learn the 3D VAE. 3. For the last 540k steps, since we find that only reconstructing the 2D VAE's features cannot lead to further improvement, we remove the loss and train the whole VAE to reconstruct the original videos. This stage is trained on 24 GPUs. For both stage 1 and stage 2 training, we adopt 20% images and 80% videos. Following [Magvit-v2](https://magvit.cs.cmu.edu/v2/), we train video using 17 frames, while zero-padding the first 16 frames for image. However, we find that this setting leads to blurring of videos with length different from 17 frames. Thus, in stage 3, we use a random number within 34 frames for mixed video length training (a.k.a., zero-pad the first `43-n` frames if we want to train a `n` frame video), to make our VAE more robust to different video lengths. Our [training](/scripts/train_vae.py) and [inference](/scripts/inference_vae.py) code is available in the Open-Sora 1.2 release. When using the VAE for diffusion model, our stacked VAE requires little memory as our VAE's input is already compressed. We also split the input videos into several 17-frame clips to make the inference more efficient. The performance of our VAE is on par with another open-sourced 3D VAE in [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/docs/Report-v1.1.0.md).
| Model | SSIM↑ | PSNR↑ | | ------------------ | ----- | ------ | | Open-Sora-Plan 1.1 | 0.882 | 29.890 | | Open-Sora 1.2 | 0.880 | 30.590 | ## Rectified flow and model adaptation The latest diffusion models like Stable Diffusion 3 adopt the [rectified flow](https://github.com/gnobitab/RectifiedFlow) instead of DDPM for better performance. Unfortunately, SD3's rectified flow training code is not open-sourced. However, Open-Sora 1.2 provides the training code following SD3's paper, including: - Basic rectified flow training ([original rectified flow paper](https://arxiv.org/abs/2209.03003)) - Logit-norm sampling for training acceleration ([SD3 paper](https://arxiv.org/pdf/2403.03206) Section 3.1, intuitively it is more likely to sample timesteps at middle noise level) - Resolution and video length aware timestep sampling ([SD3 paper](https://arxiv.org/pdf/2403.03206) Section 5.3.2, intuitively it is more likely to sample timesteps with more noise for larger resolution, and we extend it to longer video) For the resolution-aware timestep sampling, we should use more noise for images with larger resolution. We extend this idea to video generation and use more noise for videos with longer length. Open-Sora 1.2 starts from the [PixArt-Σ 2K](https://github.com/PixArt-alpha/PixArt-sigma) checkpoint. Note that this model is trained with DDPM and SDXL VAE, also a much higher resolution. We find finetuning on a small dataset can easily adapt the model for our video generation setting. The adaptation process is as follows, all training is done on 8 GPUs (the adaptation for the diffusion model is quite fast and straightforward): 1. Multi-resolution image generation ability: we train the model to generate different resolution ranging from 144p to 2K for 20k steps. 2. QK-norm: we add the QK-norm to the model and train for 18k steps. 3. Rectified flow: we transform from discrete-time DDPM to continuous-time rectified flow and train for 10k steps. 4.
Rectified flow with logit-norm sampling and resolution-aware timestep sampling: we train for 33k steps. 5. Smaller AdamW epsilon: following SD3, with QK-norm, we can use a smaller epsilon (1e-15) for AdamW; we train for 8k steps. 6. New VAE and fps conditioning: we replace the original VAE with ours and add fps conditioning to the timestep conditioning; we train for 25k steps. Note that normalizing each channel is important for rectified flow training. 7. Temporal attention blocks: we add temporal attention blocks with zero-initialized projection layers. We train on images for 3k steps. 8. Temporal blocks only for video with mask strategy: we train the temporal attention blocks only on videos for 38k steps. After the above adaptation, we are ready to train the model on videos. The adaptation above maintains the original model's ability to generate high-quality images, and brings multiple benefits for video generation: - With rectified flow, we can accelerate the training and reduce the number of sampling steps for video from 100 to 30, which greatly reduces the waiting time for inference. - With QK-norm, the training is more stable and an aggressive optimizer can be used. - With the new VAE, the temporal dimension is compressed by 4 times, which makes the training more efficient. - With multi-resolution image generation ability, the model can generate videos with different resolutions. ## More data and better multi-stage training Due to a limited computational budget, we carefully arrange the training data from low to high quality and split our training into three stages. Our training involves 12x8 GPUs, and the total training time is about 2 weeks for about 70k steps. ### First stage We first train the model on the Webvid-10M dataset (40k hours) for 30k steps (2 epochs). Since the videos are all lower than 360p resolution and contain watermarks, we train on this dataset first. The training mainly happens on 240p and 360p, with video length 2s~16s. 
We use the original captions in the dataset for training. The training config is located in [stage1.py](/configs/opensora-v1-2/train/stage1.py). ### Second stage Then we train the model on the Panda-70M dataset. This dataset is large but the quality varies. We use the official 30M subset, whose clips are more diverse, and filter out videos with an aesthetic score lower than 4.5. This leads to a 20M subset with 41k hours. The captions in the dataset are directly used for our training. The training config is located in [stage2.py](/configs/opensora-v1-2/train/stage2.py). The training mainly happens on 360p and 480p. We train the model for 23k steps, which is 0.5 epoch. The training is not fully done since we hope our new model can meet you earlier. ### Third stage In this stage, we collect ~2M video clips with a total length of 5K hours from all kinds of sources, including: - Free-license videos, sourced from Pexels, Pixabay, Mixkit, etc. - [MiraData](https://github.com/mira-space/MiraData): a high-quality dataset with long videos, mainly from games and city/scenic exploration. - [Vript](https://github.com/mutonix/Vript/tree/main): a densely annotated dataset. - And some other datasets. While MiraData and Vript have captions from GPT, we use [PLLaVA](https://github.com/magic-research/PLLaVA) to caption the rest. Compared with LLaVA, which is only capable of single frame/image captioning, PLLaVA is specially designed and trained for video captioning. The [accelerated PLLaVA](/tools/caption/README.md#pllava-captioning) is released in our `tools/`. In practice, we use the pretrained PLLaVA 13B model and select 4 frames from each video for captioning with a spatial pooling shape of 2*2. Some statistics of the video data used in this stage are shown below. We present basic statistics of duration and resolution, as well as aesthetic score and optical flow score distribution. We also extract tags for objects and actions from video captions and count their frequencies. 
![stats](/assets/readme/report-03_video_stats.png) ![object_count](/assets/readme/report-03_objects_count.png) ![action_count](/assets/readme/report-03_actions_count.png) We mainly train on 720p and 1080p videos in this stage, aiming to extend the model's ability to larger resolutions. We use a mask ratio of 25% during training. The training config is located in [stage3.py](/configs/opensora-v1-2/train/stage3.py). We train the model for 15k steps, which is approximately 2 epochs. ## Easy and effective model conditioning For stage 3, we calculate the aesthetic score and motion score for each video clip. However, since the number of video clips is small, we are not willing to filter out clips with low scores, which would lead to a smaller dataset. Instead, we append the scores to the captions and use them as conditioning. We find that this method makes the model aware of the scores, so that it follows them to generate videos with better quality. For example, for a video with aesthetic score 5.5, motion score 10, and a detected camera motion of pan left, the caption will be: ```plaintext [Original Caption] aesthetic score: 5.5, motion score: 10, camera motion: pan left. ``` During inference, we can also use the scores to condition the model. For camera motion, we only label 13k clips with high confidence, and the camera motion detection module is released in our tools. ## Evaluation Previously, we monitored the training process only by human evaluation, as the DDPM training loss is not well correlated with the quality of generated videos. However, for rectified flow, we find that the training loss is well correlated with the quality of generated videos, as stated in SD3. Thus, we keep track of the rectified flow evaluation loss on 100 images and 1k videos. We sampled 1k videos from Pixabay as the validation dataset. We calculate the evaluation loss for images and for videos of different lengths (2s, 4s, 8s, 16s) at different resolutions (144p, 240p, 360p, 480p, 720p). 
For each setting, we equidistantly sample 10 timesteps. Then all the losses are averaged. We also provide a [video](https://streamable.com/oqkkf1) showing the sampled videos with a fixed prompt for different steps. ![Evaluation Loss](/assets/readme/report_val_loss.png) ![Video Evaluation Loss](/assets/readme/report_vid_val_loss.png) In addition, we also keep track of [VBench](https://vchitect.github.io/VBench-project/) scores during training. VBench is an automatic video evaluation benchmark for short video generation. We calculate the VBench score with 240p 2s videos. The two metrics verify that our model continues to improve during training. ![VBench](/assets/readme/report_vbench_score.png) All the evaluation code is released in the `eval` folder. Check the [README](/eval/README.md) for more details. | Model | Total Score | Quality Score | Semantic Score | | -------------- | ----------- | ------------- | -------------- | | Open-Sora V1.0 | 75.91% | 78.81% | 64.28% | | Open-Sora V1.2 | 79.23% | 80.71% | 73.30% | ## Sequence parallelism We use sequence parallelism to support long-sequence training and inference. Our implementation is based on Ulysses and the workflow is shown below. When sequence parallelism is enabled, we only need to apply the `all-to-all` communication to the spatial block in STDiT, as only spatial computation is dependent on the sequence dimension. ![SP](../assets/readme/sequence_parallelism.jpeg) Currently, we have not used sequence parallelism for training as the data resolution is small, and we plan to do so in the next release. As for inference, we can use sequence parallelism in case your GPU goes out of memory. 
A simple benchmark shows that sequence parallelism can achieve speedup | Resolution | Seconds | Number of GPUs | Enable SP | Time taken/s | Speedup per GPU | | ---------- | ------- | -------------- | --------- | ------------ | --------------- | | 720p | 16s | 1 | No | 547.97 | - | | 720p | 16s | 2 | Yes | 244.38 | 12% | ================================================ FILE: Open-Sora/docs/structure.md ================================================ # Repo Structure ```plaintext Open-Sora ├── README.md ├── assets │ ├── images -> images used for image-conditioned generation │ ├── demo -> images used for demo │ ├── texts -> prompts used for text-conditioned generation │ └── readme -> images used in README ├── configs -> Configs for training & inference ├── docker -> dockerfile for Open-Sora ├── docs │ ├── acceleration.md -> Report on acceleration & speed benchmark │ ├── commands.md -> Commands for training & inference │ ├── datasets.md -> Datasets used in this project | ├── data_processing.md -> Data pipeline documents | ├── installation.md -> Data pipeline documents │ ├── structure.md -> This file │ ├── config.md -> Configs for training and inference │ ├── report_01.md -> Report for Open-Sora 1.0 │ ├── report_02.md -> Report for Open-Sora 1.1 │ ├── report_03.md -> Report for Open-Sora 1.2 │ ├── vae.md -> our VAE report │ └── zh_CN -> Chinese version of the above ├── eval -> Evaluation scripts │ ├── README.md -> Evaluation documentation | ├── human_eval -> for human eval | ├── launch.sh -> script for launching 8 cards sampling | ├── loss -> eval loss | ├── sample.sh -> script for quickly launching inference on predefined prompts | ├── vae -> for vae eval | ├── vbench -> for VBench evaluation │ └── vbench_i2v -> for VBench i2v evaluation ├── gradio -> Gradio demo related code ├── notebooks -> Jupyter notebooks for generating commands to run ├── scripts │ ├── train.py -> diffusion training script │ ├── train_vae.py -> vae training script │ ├── inference.py -> diffusion 
inference script │ ├── inference_vae.py -> vae inference script │ └── misc -> misc scripts, including batch size search ├── opensora │ ├── __init__.py │ ├── registry.py -> Registry helper │   ├── acceleration -> Acceleration related code │   ├── datasets -> Dataset related code │   ├── models │   │   ├── dit -> DiT │   │   ├── layers -> Common layers │   │   ├── vae -> VAE as image encoder │   │   ├── text_encoder -> Text encoder │   │   │   ├── classes.py -> Class id encoder (inference only) │   │   │   ├── clip.py -> CLIP encoder │   │   │   └── t5.py -> T5 encoder │   │   ├── dit │   │   ├── latte │   │   ├── pixart │   │   └── stdit -> Our STDiT related code │   ├── schedulers -> Diffusion schedulers │   │   ├── iddpm -> IDDPM for training and inference │   │ └── dpms -> DPM-Solver for fast inference │ └── utils ├── tests -> Tests for the project └── tools -> Tools for data processing and more ``` ## Configs Our config files follows [MMEgine](https://github.com/open-mmlab/mmengine). MMEngine will reads the config file (a `.py` file) and parse it into a dictionary-like object. 
```plaintext Open-Sora └── configs -> Configs for training & inference ├── opensora-v1-1 -> STDiT2 related configs │ ├── inference │ │ ├── sample.py -> Sample videos and images │ │ └── sample-ref.py -> Sample videos with image/video condition │ └── train │ ├── stage1.py -> Stage 1 training config │ ├── stage2.py -> Stage 2 training config │ ├── stage3.py -> Stage 3 training config │ ├── image.py -> Illustration of image training config │ ├── video.py -> Illustration of video training config │ └── benchmark.py -> For batch size searching ├── opensora -> STDiT related configs │ ├── inference │ │ ├── 16x256x256.py -> Sample videos 16 frames 256x256 │ │ ├── 16x512x512.py -> Sample videos 16 frames 512x512 │ │ └── 64x512x512.py -> Sample videos 64 frames 512x512 │ └── train │ ├── 16x256x256.py -> Train on videos 16 frames 256x256 │ ├── 16x256x256.py -> Train on videos 16 frames 256x256 │ └── 64x512x512.py -> Train on videos 64 frames 512x512 ├── dit -> DiT related configs    │   ├── inference    │   │   ├── 1x256x256-class.py -> Sample images with ckpts from DiT    │   │   ├── 1x256x256.py -> Sample images with clip condition    │   │   └── 16x256x256.py -> Sample videos    │   └── train    │     ├── 1x256x256.py -> Train on images with clip condition    │      └── 16x256x256.py -> Train on videos ├── latte -> Latte related configs └── pixart -> PixArt related configs ``` ## Tools ```plaintext Open-Sora └── tools ├── datasets -> dataset management related code ├── scene_cut -> scene cut related code ├── caption -> caption related code ├── scoring -> scoring related code │ ├── aesthetic -> aesthetic scoring related code │ ├── matching -> matching scoring related code │ ├── ocr -> ocr scoring related code │ └── optical_flow -> optical flow scoring related code └── frame_interpolation -> frame interpolation related code ================================================ FILE: Open-Sora/docs/vae.md ================================================ # VAE Report As 
[Pixart-Sigma](https://arxiv.org/abs/2403.04692) finds that adapting to a new VAE is simple, we develop an additional temporal VAE. Specifically, our VAE consists of a pipeline of a [spatial VAE](https://huggingface.co/PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers) followed by a temporal VAE. For the temporal VAE, we follow the implementation of [MAGVIT-v2](https://arxiv.org/abs/2310.05737), with the following modifications: * We remove the architecture specific to the codebook. * We do not use the discriminator, and use the VAE reconstruction loss, KL loss, and perceptual loss for training. * In the last linear layer of the encoder, we scale down to a diagonal Gaussian distribution of 4 channels, following our previously trained STDiT that takes a 4-channel input. * Our decoder is symmetric to the encoder architecture. ## Training We train the model in different stages. We first train only the temporal VAE, keeping the spatial VAE frozen, for 380k steps on a single machine (8 GPUs). We use an additional identity loss to make features from the 3D VAE similar to the features from the 2D VAE. We train the VAE using 20% images and 80% videos with 17 frames. ```bash torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage1.py --data-path YOUR_CSV_PATH ``` Next, we remove the identity loss and train the 3D VAE pipeline to reconstruct the 2D-compressed videos for 260k steps. ```bash torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage2.py --data-path YOUR_CSV_PATH ``` Finally, we remove the reconstruction loss for the 2D-compressed videos and train the VAE pipeline to reconstruct the 3D videos for 540k steps. We train our VAE with a random number of frames within 34 to make it more robust to different video lengths. This stage is trained on 24 GPUs. 
```bash torchrun --nnodes=3 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage3.py --data-path YOUR_CSV_PATH ``` Note that you need to adjust the `epochs` in the config file according to the size of your own CSV data. ## Inference To visually check the performance of the VAE, you may run the following inference. It saves the original video to your specified video directory with the `_ori` postfix (i.e. `"YOUR_VIDEO_DIR"_ori`), the reconstructed video from the full pipeline with the `_rec` postfix (i.e. `"YOUR_VIDEO_DIR"_rec`), and the reconstructed video from the 2D compression and decompression with the `_spatial` postfix (i.e. `"YOUR_VIDEO_DIR"_spatial`). ```bash torchrun --standalone --nnodes=1 --nproc_per_node=1 scripts/inference_vae.py configs/vae/inference/video.py --ckpt-path YOUR_VAE_CKPT_PATH --data-path YOUR_CSV_PATH --save-dir YOUR_VIDEO_DIR ``` ## Evaluation We can then calculate the scores of the VAE's performance on the metrics SSIM, PSNR, LPIPS, and FloLPIPS. * SSIM: structural similarity index measure, the higher the better * PSNR: peak signal-to-noise ratio, the higher the better * LPIPS: learned perceptual image patch similarity, the lower the better * [FloLPIPS](https://arxiv.org/pdf/2207.08119): LPIPS with video interpolation, the lower the better. 
```bash python eval/vae/eval_common_metric.py --batch_size 2 --real_video_dir YOUR_VIDEO_DIR_ori --generated_video_dir YOUR_VIDEO_DIR_rec --device cuda --sample_fps 24 --crop_size 256 --resolution 256 --num_frames 17 --sample_rate 1 --metric ssim psnr lpips flolpips ``` ## Acknowledgement We are grateful for the following work: * [MAGVIT-v2](https://arxiv.org/abs/2310.05737): Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation * [Taming Transformers](https://github.com/CompVis/taming-transformers): Taming Transformers for High-Resolution Image Synthesis * [3D blur pooling](https://github.com/adobe/antialiased-cnns/pull/39/commits/3d6f02b6943c58b68c19c07bc26fad57492ff3bc) * [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan) ================================================ FILE: Open-Sora/docs/zh_CN/README.md ================================================

## Open-Sora: 让所有人都能轻松制作高效视频 我们设计并实施了**Open-Sora**,这是一项致力于高效制作高质量视频的计划。我们希望让所有人都能使用模型、工具和所有细节。通过采用开源原则,Open-Sora 不仅使高级视频生成技术的使用变得民主化,而且还提供了一个简化且用户友好的平台,简化了视频生成的复杂性。借助 Open-Sora,我们的目标是在内容创作领域促进创新、创造力和包容性。 [[中文文档](/docs/zh_CN/README.md)] [[潞晨云](https://cloud.luchentech.com/)|[OpenSora镜像](https://cloud.luchentech.com/doc/docs/image/open-sora/)|[视频教程](https://www.bilibili.com/video/BV1ow4m1e7PX/?vd_source=c6b752764cd36ff0e535a768e35d98d2)] ## 📰 资讯 * **[2024.06.22]** 🔥我们在[潞晨云](https://cloud.luchentech.com/)上发布了Open-Sora1.2镜像,并在B站上传了详细的[使用教程](https://www.bilibili.com/video/BV1ow4m1e7PX/) * **[2024.06.17]** 🔥我们发布了**Open-Sora 1.2**,其中包括**3D-VAE**,**整流流**和**得分条件**。视频质量大大提高。[[模型权重]](#模型权重) [[技术报告]](report_v3.md) [[公众号文章]](https://mp.weixin.qq.com/s/QHq2eItZS9e00BVZnivdjg) * **[2024.04.25]** 🤗 我们在 Hugging Face Spaces 上发布了 [Open-Sora的Gradio演示](https://huggingface.co/spaces/hpcai-tech/open-sora)。 * **[2024.04.25]** 我们发布了**Open-Sora 1.1**,支持**2s~15s、144p 到 720p、任意比例的文本转图片、文本转视频、图片转视频、视频转视频、无限时间生成**。此外,还发布了完整的视频处理管道。 [[模型权重]](#模型权重) [[技术报告]](report_v2.md)[[公众号文章]](https://mp.weixin.qq.com/s/nkPSTep2se__tzp5OfiRQQ) * **[2024.03.18]** 我们发布了 **Open-Sora 1.0**, 一个完全开源的视频生成项目。Open-Sora 1.0 支持完整的视频数据预处理流程、加速训练 、推理等。我们的模型只需 3 天的训练就可以生成 2 秒的 512x512 视频。 [[模型权重]](#模型权重) [[公众号文章]](https://mp.weixin.qq.com/s/H52GW8i4z1Dco3Sg--tCGw) [[技术报告]](report_v1.md) * **[2024.03.04]** Open-Sora 提供培训,成本降低 46%。 [[公众号文章]](https://mp.weixin.qq.com/s/OjRUdrM55SufDHjwCCAvXg) ## 🎥 Latest Demo 🔥 您可以在HuggingFace上的 [🤗 Gradio应用程序](https://huggingface.co/spaces/hpcai-tech/open-sora)上体验Open-Sora. 我们的[画廊](https://hpcaitech.github.io/Open-Sora/)中提供了更多示例. 
| **4s 720×1280** | **4s 720×1280** | **4s 720×1280** | | ---------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- | | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/7895aab6-ed23-488c-8486-091480c26327) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/20f07c7b-182b-4562-bbee-f1df74c86c9a) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/3d897e0d-dc21-453a-b911-b3bda838acc2) | | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/644bf938-96ce-44aa-b797-b3c0b513d64c) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/272d88ac-4b4a-484d-a665-8d07431671d0) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/ebbac621-c34e-4bb4-9543-1c34f8989764) | | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/a1e3a1a3-4abd-45f5-8df2-6cced69da4ca) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/d6ce9c13-28e1-4dff-9644-cc01f5f11926) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/561978f8-f1b0-4f4d-ae7b-45bec9001b4a) |
OpenSora 1.1 演示 | **2秒 240×426** | **2秒 240×426** | | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/f7ce4aaa-528f-40a8-be7a-72e61eaacbbd) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/5d58d71e-1fda-4d90-9ad3-5f2f7b75c6a9) | | **2秒 426×240** | **4秒 480×854** | | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/34ecb4a0-4eef-4286-ad4c-8e3a87e5a9fd) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c1619333-25d7-42ba-a91c-18dbc1870b18) | | **16秒 320×320** | **16秒 224×448** | **2秒 426×240** | | ------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------- | | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/3cab536e-9b43-4b33-8da8-a0f9cf842ff2) | 
[](https://github.com/hpcaitech/Open-Sora/assets/99191637/9fb0b9e0-c6f4-4935-b29e-4cac10b373c4) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/3e892ad2-9543-4049-b005-643a4c1bf3bf) |
OpenSora 1.0 Demo | **2秒 512×512** | **2秒 512×512** | **2秒 512×512** | | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- | | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/de1963d3-b43b-4e68-a670-bb821ebb6f80) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/13f8338f-3d42-4b71-8142-d234fbd746cc) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/fa6a65a6-e32a-4d64-9a9e-eabb0ebb8c16) | |森林地区宁静的夜景。 [...] 该视频是一段延时摄影,捕捉了白天到夜晚的转变,湖泊和森林始终作为背景。 | 无人机拍摄的镜头捕捉到了海岸悬崖的壮丽美景,[...] 海水轻轻地拍打着岩石底部和紧贴悬崖顶部的绿色植物。| 瀑布从悬崖上倾泻而下,流入宁静的湖泊,气势磅礴。[...] 摄像机角度提供了瀑布的鸟瞰图。 | | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/64232f84-1b36-4750-a6c0-3e610fa9aa94) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/983a1965-a374-41a7-a76b-c07941a6c1e9) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/ec10c879-9767-4c31-865f-2e8d6cf11e65) | | 夜晚繁华的城市街道,充满了汽车前灯的光芒和路灯的氛围光。 [...] | 向日葵田的生机勃勃,美不胜收。向日葵整齐排列,给人一种秩序感和对称感。 [...] |宁静的水下场景,一只海龟在珊瑚礁中游动。这只海龟的壳呈绿褐色 [...] | 视频经过降采样以.gif用于显示。单击查看原始视频。提示经过修剪以用于显示,请参阅[此处](/assets/texts/t2v_samples.txt)查看完整提示。
## 🔆 新功能/更新 * 📍 **Open-Sora 1.2** 发布。模型权重可在[此处](#model-weights)查看。有关更多详细信息,请参阅我们的**[技术报告 v1.2](docs/report_03.md)** 。 * ✅ 支持整流流调度。 * ✅ 训练我们的 3D-VAE 进行时间维度压缩。 * 📍 **Open-Sora 1.1**发布。模型权重可在[此处](#model-weights)获得。它针对**0s~15s、144p 到 720p、各种宽高比**的视频进行训练。有关更多讨论,请参阅我们的**[技术报告 v1.1](/docs/report_02.md)** 。 * 🔧 **数据处理流程** v1.1发布,提供从原始视频到(文本,视频片段)对的自动处理流程,包括场景剪切$\rightarrow$过滤(美学、光流、OCR 等)$\rightarrow$字幕$\rightarrow$管理。使用此工具,您可以轻松构建视频数据集。 * ✅ 改进的 ST-DiT 架构包括 rope 位置编码、qk 范数、更长的文本长度等。 * ✅ 支持任意分辨率、纵横比和时长(包括图像)的训练。 * ✅ 支持图像和视频调节以及视频编辑,从而支持动画图像,连接视频等。 * 📍 **Open-Sora 1.0**发布。模型权重可在[此处](#model-weights)获得。仅使用 400K 视频片段和 200 个 H800 天(相比稳定视频扩散中的 152M 样本),我们就能生成 2s 512×512 视频。有关更多讨论,请参阅我们的**[技术报告 v1.0](docs/report_01.md)**。 * ✅从图像扩散模型到视频扩散模型的三阶段训练。我们为每个阶段提供权重。 * ✅ 支持训练加速,包括加速 Transformer、更快的 T5 和 VAE 以及序列并行。Open-Sora 在 64x512x512 视频上训练时可将训练速度提高**55%**。详细信息位于[训练加速.md](docs/acceleration.md)。 * 🔧 **数据预处理流程 v1.0**,包括 [下载](tools/datasets/README.md), [视频剪辑](tools/scene_cut/README.md), 和 [字幕](tools/caption/README.md) 工具. 我们的数据收集计划可在 [数据集.md](docs/datasets.md)中找到.
查看更多 ✅ 我们发现[VideoGPT](https://wilson1yan.github.io/videogpt/index.html)的 VQ-VAE质量较低,因此采用了[Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original)中的更好的 VAE 。我们还发现时间维度的修补会降低质量。有关更多讨论,请参阅我们的**[技术报告v1.0](docs/report_01.md)**。 ✅ 我们研究了不同的架构,包括 DiT、Latte 和我们提出的 **STDiT**。我们的STDiT在质量和速度之间实现了更好的平衡。请参阅我们的 **[技术报告v1.0](docs/report_01.md)**以了解更多讨论。 ✅ 支持剪辑和T5文本调节。 ✅ 通过将图像视为单帧视频,我们的项目支持在图像和视频上训练 DiT(例如 ImageNet 和 UCF101)。有关更多说明,请参阅[commands.md](docs/commands.md) 。 ✅ 支持使用[DiT](https://github.com/facebookresearch/DiT), [Latte](https://github.com/Vchitect/Latte), 和 [PixArt](https://pixart-alpha.github.io/).的官方权重进行推理。 ✅ 重构代码库。查看[structure.md](docs/structure.md)以了解项目结构以及如何使用配置文件。
### 按优先级排序的 TODO 列表
查看更多 * [x] 训练视频 VAE 并使我们的模型适应新的 VAE * [x] 缩放模型参数和数据集大小 * [x] 纳入更好的调度程序(整流流程) * [x] 评估流程 * [x] 完成数据处理流程(包括密集光流、美学评分、文本-图像相似度等)。有关更多信息,请参阅[数据集](/docs/datasets.md) * [x] 支持图像和视频调节 * [x] 支持可变的纵横比、分辨率和持续时间
## 内容 * [安装](#安装) * [模型权重](#模型权重) * [Gradio演示](#gradio演示) * [推理](#推理) * [数据处理](#数据处理) * [训练](#训练) * [评估](#评估) * [贡献](#贡献) * [引用](#引用) * [致谢](#致谢) 下面列出了其他有用的文档和链接。 * 报告: [技术报告 v1.2](docs/report_v3.md), [技术报告 v1.1](/docs/report_v2.md), [技术报告 v1.0](/docs/report_v1.md), [训练加速.md](docs/acceleration.md) * Repo 结构: [结构.md](docs/structure.md) * 配置文件说明: [config.md](docs/config.md) * 常用命令: [commands.md](docs/commands.md) * 数据处理管道和数据集: [datasets.md](docs/datasets.md) * 每个数据处理工具的 README: [dataset conventions and management](/tools/datasets/README.md), [scene cutting](/tools/scene_cut/README.md), [scoring](/tools/scoring/README.md), [caption](/tools/caption/README.md) * 评估: [eval](/eval/README.md) * 画廊: [gallery](https://hpcaitech.github.io/Open-Sora/) ## 安装 ### 从源码安装 对于 CUDA 12.1,您可以使用以下命令安装依赖项。否则,请参阅[安装文档](/docs/installation.md)以获取有关不同 cuda 版本的更多说明以及数据预处理的其他依赖项。 ```bash # create a virtual env and activate (conda as an example) conda create -n opensora python=3.9 conda activate opensora # install torch, torchvision and xformers pip install -r requirements/requirements-cu121.txt # download the repo git clone https://github.com/hpcaitech/Open-Sora cd Open-Sora # the default installation is for inference only pip install -v . # for development mode, `pip install -v -e .` ``` (可选,推荐使用以提升速度,尤其是训练时)要启用 `layernorm_kernel` 和 `flash_attn`,您需要使用以下命令安装 `apex` 和 `flash-attn`。 
```bash # install flash attention # set enable_flash_attn=False in config to disable flash attention pip install packaging ninja pip install flash-attn --no-build-isolation # install apex # set enable_layernorm_kernel=False in config to disable apex pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git ``` ### 使用Docker 运行以下命令从提供的Dockerfile 构建docker 镜像。 ```bash docker build -t opensora . ``` 运行以下命令以交互模式启动docker容器。 ```bash docker run -ti --gpus all -v .:/workspace/Open-Sora opensora ``` ## 模型权重 ### Open-Sora 1.2 模型权重 | 分辨率 | 模型大小 | 数据 | 迭代次数 | 批次大小 | 网址 | | ---------- | ---------- | ---- | ----------- | ---------- | --- | | Diffusion | 1.1B | 30M | 70k | 动态大小 | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v3) | | VAE | 384M | 3M | 1M | 8 | [:link:](https://huggingface.co/hpcai-tech/OpenSora-VAE-v1.2) | 请参阅我们的**[report 1.2](docs/report_v3.md)**以了解更多信息。 ### Open-Sora 1.1 模型权重
查看更多 | 分辨率 | M | Data | #iterations | Batch Size | URL | | ------------------ | ---------- | -------------------------- | ----------- | ------------------------------------------------- | -------------------------------------------------------------------- | | mainly 144p & 240p | 700M | 10M videos + 2M images | 100k | [dynamic](/configs/opensora-v1-1/train/stage2.py) | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage2) | | 144p to 720p | 700M | 500K HQ videos + 1M images | 4k | [dynamic](/configs/opensora-v1-1/train/stage3.py) | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage3) | 请参阅我们的 **[报告 1.1](docs/report_02.md)** 以了解更多信息。 :warning: **局限性**: 此版本包含已知问题,我们将在下一版本中修复这些问题(因为我们为下一版本节省了计算资源)。此外,由于此问题,视频生成可能会长时间失败,高分辨率将产生嘈杂的结果。
### Open-Sora 1.0 模型权重
查看更多 | 分辨率 | 模型大小 | 数据 | 迭代次数 | 批量大小 | GPU 天数 (H800) | 网址 | | ---------- | ---------- | ------ | ----------- | ---------- | --------------- | ---------- | | 16×512×512 | 700M | 20K HQ | 20k | 2×64 | 35 | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x512x512.pth) | | 16×256×256 | 700M | 20K HQ | 24k | 8×64 | 45 | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x256x256.pth) | | 16×256×256 | 700M | 366K | 80k | 8×64 | 117 | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-16x256x256.pth) | 训练流程: 16x256x256 $\rightarrow$ 16x256x256 HQ $\rightarrow$ 16x512x512 HQ。 我们的模型权重部分由 [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha)初始化,参数数量为724M。更多信息请参阅 **[技术报告v1.0](docs/report_v1.md)**。数据集相关信息请参阅[数据集文件](docs/datasets.md)。HQ 表示高质量。 :warning: **局限性**: 我们的模型是在有限的预算下训练的。质量和文本对齐相对较差。该模型表现不佳,特别是在生成人类时,无法遵循详细的说明。我们正在努力提高质量和文本对齐。
## Gradio演示 🔥 您可以在Hugging Face 上的[🤗 Gradio 应用程序](https://huggingface.co/spaces/hpcai-tech/open-sora)上在线体验Open-Sora。【由于GPU资源不足,已失效】 ### 本地部署 如果您想在本地部署 gradio,我们还在这个存储库中提供了一个[Gradio 应用程序](./gradio) ,您可以使用以下命令启动一个交互式 Web 应用程序来体验使用 Open-Sora 生成视频。 ```bash pip install gradio spaces python gradio/app.py ``` 这将在您的本地主机上启动 Gradio 应用程序。如果您想了解有关 Gradio 应用程序的更多信息,可以参考[Gradio README](./gradio/README.md)。 要启用提示增强和其他语言输入(例如中文输入),您需要OPENAI_API_KEY在环境中进行设置。查看[OpenAI的文档](https://platform.openai.com/docs/quickstart)以获取您的 API 密钥。 ```bash export OPENAI_API_KEY=YOUR_API_KEY ``` ### 入门 在 Gradio 应用程序中,基本选项如下: ![Gradio Demo](/assets/readme/gradio_basic.png) 生成视频最简单的方式是输入文本提示,然后点击“**生成视频**”按钮(如果找不到,请向下滚动)。生成的视频将显示在右侧面板中。勾选“**使用 GPT4o 增强提示**”将使用 GPT-4o 来细化提示,而“**随机提示**”按钮将由 GPT-4o 为您生成随机提示。由于 OpenAI 的 API 限制,提示细化结果具有一定的随机性。 然后,你可以选择生成视频的**分辨率**、**时长**、**长宽比**。不同的分辨率和视频长度会影响视频生成速度。在 80G H100 GPU 上,生成速度和峰值内存使用量为: | 分辨率 | 图像 | 2秒 | 4秒 | 8秒 | 16秒 | | ---- | ------- | -------- | --------- | --------- | --------- | | 360p | 3s, 24G | 18s, 27G | 31s, 27G | 62s, 28G | 121s, 33G | | 480p | 2s, 24G | 29s, 31G | 55s, 30G | 108s, 32G | 219s, 36G | | 720p | 6s, 27G | 68s, 41G | 130s, 39G | 260s, 45G | 547s, 67G | 注意,除了文本转视频,你还可以使用图片转视频。你可以上传图片,然后点击“**生成视频**”按钮,生成以图片为第一帧的视频。或者,你可以填写文本提示,然后点击“**生成图片**”按钮,根据文本提示生成图片,然后点击“**生成视频**”按钮,根据同一模型生成的图片生成视频。 ![Gradio Demo](/assets/readme/gradio_option.png) 然后您可以指定更多选项,包括“**运动强度**”、“**美学**”和“**相机运动**”。如果未选中“启用”或选择“无”,则不会将信息传递给模型。否则,模型将生成具有指定运动强度、美学分数和相机运动的视频。 对于**美学分数**,我们建议使用高于 6 的值。对于**运动强度**,较小的值将导致更平滑但动态性较差的视频,而较大的值将导致更动态但可能更模糊的视频。因此,您可以尝试不使用它,然后根据生成的视频进行调整。对于**相机运动**,有时模型无法很好地遵循指令,我们正在努力改进它。 您还可以调整“**采样步数**”,这是去噪的次数,与生成速度直接相关。小于 30 的数字通常会导致较差的生成结果,而大于 100 的数字通常不会有明显的改善。“种子”用于可重复性,您可以将其设置为固定数字以生成相同的视频。“**CFG 比例**”控制模型遵循文本提示的程度,较小的值会导致视频更随机,而较大的值会导致视频更遵循文本(建议为 7)。 对于更高级的用法,您可以参考[Gradio README](./gradio/README.md#advanced-usage). 
## 推理 ### Open-Sora 1.2 命令行推理 基础的命令行推理: ```bash # text to video python scripts/inference.py configs/opensora-v1-2/inference/sample.py \ --num-frames 4s --resolution 720p --aspect-ratio 9:16 \ --prompt "a beautiful waterfall" ``` 您可以向命令行添加更多选项来定制生成。 ```bash python scripts/inference.py configs/opensora-v1-2/inference/sample.py \ --num-frames 4s --resolution 720p --aspect-ratio 9:16 \ --num-sampling-steps 30 --flow 5 --aes 6.5 \ --prompt "a beautiful waterfall" ``` 对于图像到视频生成和其他功能,API 与 Open-Sora 1.1 兼容。请参阅[此处](commands.md)了解更多说明。 如果您的安装不包含 `apex` 和 `flash-attn`,则需要在配置文件中或通过以下命令禁用它们。 ```bash python scripts/inference.py configs/opensora-v1-2/inference/sample.py \ --num-frames 4s --resolution 720p \ --layernorm-kernel False --flash-attn False \ --prompt "a beautiful waterfall" ``` ### 序列并行推理 要启用序列并行,您需要使用 `torchrun` 来运行推理脚本。以下命令将使用 2 个 GPU 运行推理。 ```bash # text to video CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node 2 scripts/inference.py configs/opensora-v1-2/inference/sample.py \ --num-frames 4s --resolution 720p --aspect-ratio 9:16 \ --prompt "a beautiful waterfall" ``` :warning: **注意**: gradio 部署不支持序列并行。目前,只有当维度可以被 GPU 数量整除时才支持序列并行。因此,在某些情况下可能会失败。我们测试了 4 个 GPU 用于 720p 和 2 个 GPU 用于 480p。 ### GPT-4o 提示细化 我们发现 GPT-4o 可以细化提示并提高生成视频的质量。利用此功能,您还可以使用其他语言(例如中文)作为提示。要启用此功能,您需要在环境中准备您的 OpenAI API 密钥: ```bash export OPENAI_API_KEY=YOUR_API_KEY ``` 然后您可以用 `--llm-refine True` 启用 GPT-4o 进行提示细化以完成推理。 ### Open-Sora 1.1 命令行推理
查看更多 由于 Open-Sora 1.1 支持动态输入大小的推理,因此您可以将输入大小作为参数传递。 ```bash # text to video python scripts/inference.py configs/opensora-v1-1/inference/sample.py --prompt "A beautiful sunset over the city" --num-frames 32 --image-size 480 854 ``` 如果您的安装不包含`apex` 和 `flash-attn`,则需要在配置文件中或通过以下命令禁用它们。 ```bash python scripts/inference.py configs/opensora-v1-1/inference/sample.py --prompt "A beautiful sunset over the city" --num-frames 32 --image-size 480 854 --layernorm-kernel False --flash-attn False ``` 请参阅[此处](docs/commands.md#inference-with-open-sora-11)了解更多说明,包括文本转图像、图像转视频、视频转视频和无限时间生成。
### Open-Sora 1.0 命令行推理
查看更多 我们还提供了离线推理脚本。运行以下命令生成样本,所需的模型权重将自动下载。要更改采样提示,请修改传递给 `--prompt-path` 的 txt 文件。请参阅[此处](docs/structure.md#inference-config-demos)以自定义配置。 ```bash # Sample 16x512x512 (20s/sample, 100 time steps, 24 GB memory) torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x512x512.py --ckpt-path OpenSora-v1-HQ-16x512x512.pth --prompt-path ./assets/texts/t2v_samples.txt # Sample 16x256x256 (5s/sample, 100 time steps, 22 GB memory) torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path OpenSora-v1-HQ-16x256x256.pth --prompt-path ./assets/texts/t2v_samples.txt # Sample 64x512x512 (40s/sample, 100 time steps) torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/64x512x512.py --ckpt-path ./path/to/your/ckpt.pth --prompt-path ./assets/texts/t2v_samples.txt # Sample 64x512x512 with sequence parallelism (30s/sample, 100 time steps) # sequence parallelism is enabled automatically when nproc_per_node is larger than 1 torchrun --standalone --nproc_per_node 2 scripts/inference.py configs/opensora/inference/64x512x512.py --ckpt-path ./path/to/your/ckpt.pth --prompt-path ./assets/texts/t2v_samples.txt ``` 速度是在 H800 GPU 上测试的。有关使用其他模型进行推理,请参阅[此处](docs/commands.md) 了解更多说明。要降低内存使用量,请在配置中为 `vae.micro_batch_size` 设置较小的值(采样速度会略微降低)。
## 数据处理 高质量的数据对于训练良好的生成模型至关重要。为此,我们建立了完整的数据处理流程,可以将原始视频无缝转换为高质量的视频-文本对。流程如下所示。有关详细信息,请参阅[数据处理](docs/data_processing.md)。另请查看我们使用的[数据集](docs/datasets.md)。 ![Data Processing Pipeline](/assets/readme/report_data_pipeline.png) ## 训练 ### Open-Sora 1.2 训练 训练过程与Open-Sora 1.1相同。 ```bash # one node torchrun --standalone --nproc_per_node 8 scripts/train.py \ configs/opensora-v1-2/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT # multiple nodes colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py \ configs/opensora-v1-2/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT ``` ### Open-Sora 1.1 训练
查看更多 在 `csv` 文件中准备好数据后,运行以下命令在单个节点上启动训练。 ```bash # one node torchrun --standalone --nproc_per_node 8 scripts/train.py \ configs/opensora-v1-1/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT # multiple nodes colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py \ configs/opensora-v1-1/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT ```
### Open-Sora 1.0 训练
查看更多 在 `csv` 文件中准备好数据后,运行以下命令在单个节点上启动训练。 ```bash # 1 GPU, 16x256x256 torchrun --nnodes=1 --nproc_per_node=1 scripts/train.py configs/opensora/train/16x256x256.py --data-path YOUR_CSV_PATH # 8 GPUs, 64x512x512 torchrun --nnodes=1 --nproc_per_node=8 scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT ``` 要在多个节点上启动训练,请根据[ColossalAI](https://colossalai.org/docs/basics/launch_colossalai/#launch-with-colossal-ai-cli)准备一个主机文件,并运行以下命令。 ```bash colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT ``` 有关训练其他模型和高级用法,请参阅[此处](docs/commands.md)获取更多说明。
## 评估 我们支持基于以下方面的评估: - 验证损失 - [VBench](https://github.com/Vchitect/VBench/tree/master)h分数 - VBench-i2v 分数 - 批量生成以供人工评估 所有评估代码均发布在 `eval`文件夹中。查看[README](/eval/README.md)了解更多详细信息。我们的 [技术报告](report_v3.md#评估)还提供了有关训练期间评估的更多信息。下表显示 Open-Sora 1.2 大大改进了 Open-Sora 1.0。 | 模型 | 总得分 | 质量得分 | 语义得分 | | -------------- | ----------- | ------------- | -------------- | | Open-Sora V1.0 | 75.91% | 78.81% | 64.28% | | Open-Sora V1.2 | 79.23% | 80.71% | 73.30% | ## VAE 训练与评估 我们训练一个由空间 VAE 和时间 VAE 组成的 VAE 管道。有关更多详细信息,请参阅[VAE 文档](vae.md)。在运行以下命令之前,请按照我们的[安装文档](installation.md)安装 VAE 和评估所需的依赖项。 如果您想训练自己的 VAE,我们需要按照[数据处理](#data-processing)流程在 csv 中准备数据,然后运行以下命令。请注意,您需要根据自己的 csv 数据大小相应地调整配置文件中的训练`epochs`数量。 ```bash # stage 1 training, 380k steps, 8 GPUs torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage1.py --data-path YOUR_CSV_PATH # stage 2 training, 260k steps, 8 GPUs torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage2.py --data-path YOUR_CSV_PATH # stage 3 training, 540k steps, 24 GPUs torchrun --nnodes=3 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage3.py --data-path YOUR_CSV_PATH ``` 为了评估 VAE 的性能,您需要首先运行 VAE 推理来生成视频,然后计算生成的视频的分数: ```bash # video generation torchrun --standalone --nnodes=1 --nproc_per_node=1 scripts/inference_vae.py configs/vae/inference/video.py --ckpt-path YOUR_VAE_CKPT_PATH --data-path YOUR_CSV_PATH --save-dir YOUR_VIDEO_DIR # the original videos will be saved to `YOUR_VIDEO_DIR_ori` # the reconstructed videos through the pipeline will be saved to `YOUR_VIDEO_DIR_rec` # the reconstructed videos through the spatial VAE only will be saved to `YOUR_VIDEO_DIR_spatial` # score calculation python eval/vae/eval_common_metric.py --batch_size 2 --real_video_dir YOUR_VIDEO_DIR_ori --generated_video_dir YOUR_VIDEO_DIR_rec --device cuda --sample_fps 24 --crop_size 256 --resolution 256 --num_frames 17 --sample_rate 1 --metric ssim psnr lpips flolpips ``` ## 贡献 感谢以下出色的贡献者: 
如果您希望为该项目做出贡献,请参阅[Contribution Guideline](./CONTRIBUTING.md)。 ## 致谢 这里我们仅列出了部分项目,其他研究成果及数据集请参考我们的报告。 * [ColossalAI](https://github.com/hpcaitech/ColossalAI): 强大的大型模型并行加速与优化系统。 * [DiT](https://github.com/facebookresearch/DiT): 带有 Transformer 的可扩展扩散模型。 * [OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT): DiT 训练的加速器。我们从 OpenDiT 中采用了有价值的训练进度加速策略。 * [PixArt](https://github.com/PixArt-alpha/PixArt-alpha): 一个基于 DiT 的开源文本转图像模型。 * [Latte](https://github.com/Vchitect/Latte): 尝试高效地训练视频的 DiT。 * [StabilityAI VAE](https://huggingface.co/stabilityai/sd-vae-ft-mse-original): 一个强大的图像 VAE 模型。 * [CLIP](https://github.com/openai/CLIP): 一个强大的文本图像嵌入模型。 * [T5](https://github.com/google-research/text-to-text-transfer-transformer): 强大的文本编码器。 * [LLaVA](https://github.com/haotian-liu/LLaVA): 基于[Mistral-7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) 和 [Yi-34B](https://huggingface.co/01-ai/Yi-34B). 的强大图像字幕模型。 * [PLLaVA](https://github.com/magic-research/PLLaVA): 一个强大的视频字幕模型。 * [MiraData](https://github.com/mira-space/MiraData):具有长持续时间和结构化字幕的大规模视频数据集。 我们感谢他们的出色工作和对开源的慷慨贡献。 ## 引用 ```bibtex @software{opensora, author = {Zangwei Zheng and Xiangyu Peng and Tianji Yang and Chenhui Shen and Shenggui Li and Hongxin Liu and Yukun Zhou and Tianyi Li and Yang You}, title = {Open-Sora: Democratizing Efficient Video Production for All}, month = {March}, year = {2024}, url = {https://github.com/hpcaitech/Open-Sora} } ``` ## Star增长 [![Star History Chart](https://api.star-history.com/svg?repos=hpcaitech/Open-Sora&type=Date)](https://star-history.com/#hpcaitech/Open-Sora&Date) ================================================ FILE: Open-Sora/docs/zh_CN/READMEv1.1.md ================================================

## Open-Sora: 完全开源的高效复现类Sora视频生成方案 **Open-Sora**项目是一项致力于**高效**制作高质量视频,并使所有人都能使用其模型、工具和内容的计划。 通过采用**开源**原则,Open-Sora 不仅实现了先进视频生成技术的低成本普及,还提供了一个精简且用户友好的方案,简化了视频制作的复杂性。 通过 Open-Sora,我们希望更多开发者一起探索内容创作领域的创新、创造和包容。 [[English Document]](/README.md)

Open-Sora 项目目前处在早期阶段,并将持续更新。

## 📰 资讯 > 由于文档需要进行翻译,最新资讯请看[英文文档](/README.md#-news) * **[2024.04.25]** 🤗 我们在Hugging Face Spaces上发布了Open-Sora的[Gradio demo](https://huggingface.co/spaces/hpcai-tech/open-sora)。 * **[2024.04.25]** 🔥 我们发布了支持**2秒至15秒、144p至720p、任意宽高比**的文本到图像、文本到视频、图像到视频、视频到视频、无限时间生成的**Open-Sora 1.1**版本。此外,还发布了一个完整的视频处理流程。 [[checkpoints]]() [[report]](/docs/report_02.md) * **[2024.03.18]** 🔥 我们发布了**Open-Sora 1.0**,这是一个完全开源的视频生成项目。 * Open-Sora 1.0 支持视频数据预处理、加速训练、推理等全套流程。 * 我们提供的[模型权重](#模型权重)只需 3 天的训练就能生成 2 秒的 512x512 视频。 * **[2024.03.04]** Open-Sora:开源Sora复现方案,成本降低46%,序列扩充至近百万。[[英文博客]](https://hpc-ai.com/blog/open-sora) ## 🎥 最新视频 | **2s 512×512** | **2s 512×512** | **2s 512×512** | | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- | | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/de1963d3-b43b-4e68-a670-bb821ebb6f80) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/13f8338f-3d42-4b71-8142-d234fbd746cc) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/fa6a65a6-e32a-4d64-9a9e-eabb0ebb8c16) | | A serene night scene in a forested area. [...] The video is a time-lapse, capturing the transition from day to night, with the lake and forest serving as a constant backdrop. | A soaring drone footage captures the majestic beauty of a coastal cliff, [...] The water gently laps at the rock base and the greenery that clings to the top of the cliff. | The majestic beauty of a waterfall cascading down a cliff into a serene lake. [...] The camera angle provides a bird's eye view of the waterfall. 
| | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/64232f84-1b36-4750-a6c0-3e610fa9aa94) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/983a1965-a374-41a7-a76b-c07941a6c1e9) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/ec10c879-9767-4c31-865f-2e8d6cf11e65) | | A bustling city street at night, filled with the glow of car headlights and the ambient light of streetlights. [...] | The vibrant beauty of a sunflower field. The sunflowers are arranged in neat rows, creating a sense of order and symmetry. [...] | A serene underwater scene featuring a sea turtle swimming through a coral reef. The turtle, with its greenish-brown shell [...] | 视频经过降采样处理为`.gif`格式,以便显示。点击查看原始视频。为便于显示,文字经过修剪,全文请参见 [此处](/assets/texts/t2v_samples.txt)。在我们的[图片库](https://hpcaitech.github.io/Open-Sora/)中查看更多样本。 ## 🔆 新功能 > 由于文档需要进行翻译,最新资讯请看[英文文档](/README.md#-new-featuresupdates) * 📍Open-Sora-v1 已发布。[这里](#模型权重)提供了模型权重。只需 400K 视频片段和在单卡 H800 上训200天(类比Stable Video Diffusion 的 152M 样本),我们就能生成 2 秒的 512×512 视频。 * ✅ 从图像扩散模型到视频扩散模型的三阶段训练。我们提供每个阶段的权重。 * ✅ 支持训练加速,包括Transformer加速、更快的 T5 和 VAE 以及序列并行。在对 64x512x512 视频进行训练时,Open-Sora 可将训练速度提高**55%**。详细信息请参见[训练加速](acceleration.md)。 * 🔧 我们提供用于数据预处理的视频切割和字幕工具。有关说明请点击[此处](tools/data/README.md),我们的数据收集计划请点击 [数据集](datasets.md)。 * ✅ 我们发现来自[VideoGPT](https://wilson1yan.github.io/videogpt/index.html)的 VQ-VAE 质量较低,因此采用了来自[Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original) 的高质量 VAE。我们还发现使用添加了时间维度的采样会导致生成质量降低。更多讨论,请参阅我们的 **[报告](docs/report_v1.md)**。 * ✅ 我们研究了不同的架构,包括 DiT、Latte 和我们提出的 **STDiT**。我们的STDiT在质量和速度之间实现了更好的权衡。更多讨论,请参阅我们的 **[报告](report_v1.md)**。 * ✅ 支持剪辑和 T5 文本调节。 * ✅ 通过将图像视为单帧视频,我们的项目支持在图像和视频(如 ImageNet 和 UCF101)上训练 DiT。更多说明请参见 [指令解析](command.md)。 * ✅ 利用[DiT](https://github.com/facebookresearch/DiT)、[Latte](https://github.com/Vchitect/Latte) 和 [PixArt](https://pixart-alpha.github.io/) 的官方权重支持推理。
查看更多 * ✅ 重构代码库。请参阅[结构](structure.md),了解项目结构以及如何使用配置文件。
### 下一步计划【按优先级排序】 * [ ] 训练视频-VAE并让模型适应新的VAE **[项目进行中]** * [ ] 扩大模型参数量和数据集规模 **[项目进行中]** * [ ] 纳入更好的调度器,例如 SD3 中的整流流(rectified flow)。 **[项目进行中]**
查看更多 * [x] 评估流程。 * [x] 完成数据处理流程(包括密集光流、美学评分、文本图像相似性、重复数据删除等)。更多信息请参见[数据集](datasets.md) * [x] 支持图像和视频调节。 * [x] 支持可变长宽比、分辨率和持续时间。
## 目录 * [安装](#安装) * [模型权重](#模型权重) * [推理](#推理) * [数据处理](#数据处理) * [训练](#训练) * [评估](#评估) * [贡献](#贡献) * [声明](#声明) * [引用](#引用) ## 安装 ### 从源码安装 ```bash # create a virtual env conda create -n opensora python=3.10 # install torch # the command below is for CUDA 12.1, choose install commands from # https://pytorch.org/get-started/locally/ based on your own CUDA version pip3 install torch torchvision # install flash attention (optional) pip install packaging ninja pip install flash-attn --no-build-isolation # install apex (optional) pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git # install xformers pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu121 # install this project git clone https://github.com/hpcaitech/Open-Sora cd Open-Sora pip install -v . ``` ### 使用Docker镜像 运行如下指令使用提供的Dockerfile构建镜像: ```bash docker build -t opensora ./docker ``` 运行以下命令以启动交互模式下的 Docker 容器: ```bash docker run -ti --gpus all -v {MOUNT_DIR}:/data opensora ``` 安装完成后,建议阅读[结构](structure.md),了解项目结构以及如何使用配置文件。 ## 模型权重 | 分辨率 | 数据 | 迭代次数 | 批量大小 | GPU 天数 (H800) | 网址 | | ---------- | ------ | ----------- | ---------- | --------------- | ---------- | | 16×256×256 | 366K | 80k | 8×64 | 117 | [:link:]() | | 16×256×256 | 20K HQ | 24k | 8×64 | 45 | [:link:]() | | 16×512×512 | 20K HQ | 20k | 2×64 | 35 | [:link:]() | 我们模型的权重部分由[PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha) 初始化。参数数量为 724M。有关训练的更多信息,请参阅我们的 **[报告](report_v1.md)**。有关数据集的更多信息,请参阅[数据](datasets.md)。HQ 表示高质量。 :warning: **局限性**:我们的模型是在有限的预算内训练出来的。质量和文本对齐度相对较差。特别是在生成人类时,模型表现很差,无法遵循详细的指令。我们正在努力改进质量和文本对齐。 ## 推理 要使用我们提供的权重进行推理,首先要将[T5](https://huggingface.co/DeepFloyd/t5-v1_1-xxl/tree/main)权重下载到pretrained_models/t5_ckpts/t5-v1_1-xxl 中。然后下载模型权重。运行以下命令生成样本。请参阅[此处](structure.md#推理配置演示)自定义配置。 ```bash # Sample 16x512x512 (20s/sample, 100 time steps, 24 GB memory) torchrun 
--standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x512x512.py --ckpt-path OpenSora-v1-HQ-16x512x512.pth --prompt-path ./assets/texts/t2v_samples.txt # Sample 16x256x256 (5s/sample, 100 time steps, 22 GB memory) torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path OpenSora-v1-HQ-16x256x256.pth --prompt-path ./assets/texts/t2v_samples.txt # Sample 64x512x512 (40s/sample, 100 time steps) torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/64x512x512.py --ckpt-path ./path/to/your/ckpt.pth --prompt-path ./assets/texts/t2v_samples.txt # Sample 64x512x512 with sequence parallelism (30s/sample, 100 time steps) # sequence parallelism is enabled automatically when nproc_per_node is larger than 1 torchrun --standalone --nproc_per_node 2 scripts/inference.py configs/opensora/inference/64x512x512.py --ckpt-path ./path/to/your/ckpt.pth --prompt-path ./assets/texts/t2v_samples.txt ``` 我们在 H800 GPU 上进行了速度测试。如需使用其他模型进行推理,请参阅[此处](commands.md)获取更多说明。减小`vae.micro_batch_size`来降低显存使用(但取样速度会略微减慢)。 ## 数据处理 高质量数据是高质量模型的关键。[这里](datasets.md)有我们使用过的数据集和数据收集计划。我们提供处理视频数据的工具。目前,我们的数据处理流程包括以下步骤: 1. 下载数据集。[[文件](/tools/datasets/README.md)] 2. 将视频分割成片段。 [[文件](/tools/scene_cut/README.md)] 3. 生成视频字幕。 [[文件](/tools/caption/README.md)] ## 训练 ### Open-Sora 1.0 训练
查看更多 要启动训练,首先要将[T5](https://huggingface.co/DeepFloyd/t5-v1_1-xxl/tree/main)权重下载到pretrained_models/t5_ckpts/t5-v1_1-xxl 中。然后运行以下命令在单个节点上启动训练。 ```bash # 1 GPU, 16x256x256 torchrun --nnodes=1 --nproc_per_node=1 scripts/train.py configs/opensora/train/16x256x256.py --data-path YOUR_CSV_PATH # 8 GPUs, 64x512x512 torchrun --nnodes=1 --nproc_per_node=8 scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT ``` 要在多个节点上启动训练,请根据[ColossalAI](https://colossalai.org/docs/basics/launch_colossalai/#launch-with-colossal-ai-cli) 准备一个主机文件,并运行以下命令。 ```bash colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT ``` 有关其他模型的训练和高级使用方法,请参阅[此处](commands.md)获取更多说明。
## 评估 点击[这里](https://github.com/hpcaitech/Open-Sora/blob/main/eval/README.md)查看评估 ## 贡献 本中文翻译还有许多不足,如果您希望为该项目做出贡献,可以参考 [贡献指南](/CONTRIBUTING.md). 目前需要翻译或更新的文件: * [ ] 更新[资讯](#-资讯) * [ ] 更新[最新视频](#-最新视频) * [ ] 更新[新功能](#-新功能)。 * [ ] 翻译[评估](https://github.com/hpcaitech/Open-Sora/blob/main/eval/README.md)文件 * [ ] 更新Open-Sora 1.1[训练](#训练) ## 声明 * [ColossalAI](https://github.com/hpcaitech/ColossalAI): A powerful large model parallel acceleration and optimization * [DiT](https://github.com/facebookresearch/DiT): Scalable Diffusion Models with Transformers. * [OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT): An acceleration for DiT training. We adopt valuable acceleration strategies for training progress from OpenDiT. * [PixArt](https://github.com/PixArt-alpha/PixArt-alpha): An open-source DiT-based text-to-image model. * [Latte](https://github.com/Vchitect/Latte): An attempt to efficiently train DiT for video. * [StabilityAI VAE](https://huggingface.co/stabilityai/sd-vae-ft-mse-original): A powerful image VAE model. * [CLIP](https://github.com/openai/CLIP): A powerful text-image embedding model. * [T5](https://github.com/google-research/text-to-text-transfer-transformer): A powerful text encoder. * [LLaVA](https://github.com/haotian-liu/LLaVA): A powerful image captioning model based on [Yi-34B](https://huggingface.co/01-ai/Yi-34B). 我们对他们的出色工作和对开源的慷慨贡献表示感谢。 ## 引用 ```bibtex @software{opensora, author = {Zangwei Zheng and Xiangyu Peng and Yang You}, title = {Open-Sora: Democratizing Efficient Video Production for All}, month = {March}, year = {2024}, url = {https://github.com/hpcaitech/Open-Sora} } ``` [Zangwei Zheng](https://github.com/zhengzangw) and [Xiangyu Peng](https://github.com/xyupeng) equally contributed to this work during their internship at [HPC-AI Tech](https://hpc-ai.com/). 
## Star 走势 [![Star History Chart](https://api.star-history.com/svg?repos=hpcaitech/Open-Sora&type=Date)](https://star-history.com/#hpcaitech/Open-Sora&Date) ================================================ FILE: Open-Sora/docs/zh_CN/acceleration.md ================================================ # 加速 >本文档对应于Open-Sora v1.1版本。 Open-Sora 旨在为扩散模型提供一个高速训练框架。在 64 帧 512x512 视频上训练时,我们可以实现 **55%** 的训练速度加速。我们的框架支持训练 **1分钟1080p视频**。 ## 加速的 Transformer Open-Sora 通过以下方式提高训练速度: - 内核优化,包括 [flash attention](https://github.com/Dao-AILab/flash-attention), 融合 layernorm 内核以及由 colossalAI 编译的内核。 - 混合并行性,包括 ZeRO。 - 用于更大批量的梯度检查点。 我们在图像上的训练速度可与 [OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT) 相媲美,这是一个加速 DiT 训练的项目。训练速度是在批处理大小为 128、图像大小为 256x256 的 8 个 H800 GPU 上测量的。 | 模型 | 吞吐量 (img/s/GPU) | 吞吐量 (tokens/s/GPU) | |----------|-----------------|--------------------| | DiT | 100 | 26k | | OpenDiT | 175 | 45k | | OpenSora | 175 | 45k | ## 高效的 STDiT 我们的 STDiT 采用时空注意力对视频数据进行建模。与直接全神贯注在 Dit 相比,我们的 STDiT 随着帧数的增加而更有效率。我们当前的框架仅支持序列超长序列的并行性。 训练速度是在 8 个 H800 GPU 上测量的,应用了加速技术,GC 表示梯度检查点。 两者都具有像 PixArt 一样的 T5 调节。 | 模型 | 设置 | 吞吐量 (sample/s/GPU) | 吞吐量 (tokens/s/GPU) | |------------------|----------------|--------------------|--------------------| | DiT | 16x256 (4k) | 7.20 | 29k | | STDiT | 16x256 (4k) | 7.00 | 28k | | DiT | 16x512 (16k) | 0.85 | 14k | | STDiT | 16x512 (16k) | 1.45 | 23k | | DiT (GC) | 64x512 (65k) | 0.08 | 5k | | STDiT (GC) | 64x512 (65k) | 0.40 | 25k | | STDiT (GC, sp=2) | 360x512 (370k) | 0.10 | 18k | 使用 Video-VAE 在时间维度上进行 4 倍下采样时,24fps 视频有 450 帧。STDiT(28k tokens/s) 和 DiT 对图像 (高达 45k tokens/s) 两者之间的速度差距主要来自 T5 和 VAE 编码,以及时间注意力。 ## 加速的编码器 (T5, VAE) 在训练过程中,文本由 T5 编码,视频由 VAE 编码。通常有两种方法可以加速训练: 1. 提前预处理文本和视频数据并保存到磁盘。 2. 
在训练过程中对文本和视频数据进行编码,并加快编码过程。 对于选项 1,一个样本的 120 个令牌需要 1M 磁盘空间,而 64x64x64 的潜在可能需要 4M。考虑训练 包含 10M 视频剪辑的数据集,所需的总磁盘空间为 50TB。我们的存储系统目前还没有准备好 这种数据规模。 对于选项 2,我们提高了 T5 速度和内存要求。根据在[OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT),我们发现 VAE 消耗了大量的 GPU 内存。因此,我们 将批大小拆分为较小的批大小,以便进行 VAE 编码。使用这两种技术,我们可以大大加快训练速度。 训练速度是在 8 个带有 STDiT 的 H800 GPU 上测量的。 | 加速模式 | 设置 | 吞吐量 (img/s/GPU) | 吞吐量 (tokens/s/GPU) | |--------------|---------------|-----------------|--------------------| | Baseline | 16x256 (4k) | 6.16 | 25k | | w. faster T5 | 16x256 (4k) | 7.00 | 29k | | Baseline | 64x512 (65k) | 0.94 | 15k | | w. both | 64x512 (65k) | 1.45 | 23k | ================================================ FILE: Open-Sora/docs/zh_CN/commands.md ================================================ # 命令 ## 推理 您可以修改相应的配置文件来更改推理设置。在 [此处](/docs/structure.md#inference-config-demos) 查看更多详细信息。 ### 在 ImageNet 上使用 DiT 预训练进行推理 以下命令会自动在 ImageNet 上下载预训练权重并运行推理。 ```bash python scripts/inference.py configs/dit/inference/1x256x256-class.py --ckpt-path DiT-XL-2-256x256.pt ``` ### 在 UCF101 上使用 Latte 预训练进行推理 以下命令会自动下载 UCF101 上的预训练权重并运行推理。 ```bash python scripts/inference.py configs/latte/inference/16x256x256-class.py --ckpt-path Latte-XL-2-256x256-ucf101.pt ``` ### 使用 PixArt-α 预训练权重进行推理 将 T5 下载到 `./pretrained_models` 并运行以下命令。 ```bash # 256x256 torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/pixart/inference/1x256x256.py --ckpt-path PixArt-XL-2-256x256.pth # 512x512 torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/pixart/inference/1x512x512.py --ckpt-path PixArt-XL-2-512x512.pth # 1024 multi-scale torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/pixart/inference/1x1024MS.py --ckpt-path PixArt-XL-2-1024MS.pth ``` ### 使用训练期间保存的 checkpoints 进行推理 在训练期间,会在 `outputs` 目录中创建一个实验日志记录文件夹。在每个 checkpoint 文件夹下(例如 `epoch12-global_step2000`),有一个 `ema.pt` 文件和共享的 `model` 文件夹。执行以下命令进行推理。 ```bash # 使用 ema 模型进行推理 torchrun --standalone --nproc_per_node 1 scripts/inference.py 
configs/opensora/inference/16x256x256.py --ckpt-path outputs/001-STDiT-XL-2/epoch12-global_step2000/ema.pt # 使用模型进行推理 torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path outputs/001-STDiT-XL-2/epoch12-global_step2000 # 使用序列并行进行推理 # 当 nproc_per_node 大于 1 时,将自动启用序列并行 torchrun --standalone --nproc_per_node 2 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path outputs/001-STDiT-XL-2/epoch12-global_step2000 ``` 第二个命令将在 checkpoint 文件夹中自动生成一个 `model_ckpt.pt` 文件。 ### 推理超参数 1. DPM 求解器擅长对图像进行快速推理。但是,它的视频推理的效果并不令人满意。若出于快速演示目的您可以使用这个求解器。 ```python type="dmp-solver" num_sampling_steps=20 ``` 2. 您可以在视频推理上使用 [SVD](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt) 微调的 VAE 解码器(消耗更多内存)。但是,我们没有看到视频推理效果有明显改善。要使用它,请将 [预训练权重](https://huggingface.co/maxin-cn/Latte/tree/main/t2v_required_models/vae_temporal_decoder) 下载到 `./pretrained_models/vae_temporal_decoder` 中,并修改配置文件,如下所示。 ```python vae = dict( type="VideoAutoencoderKLTemporalDecoder", from_pretrained="pretrained_models/vae_temporal_decoder", ) ``` ## 训练 如果您要继续训练,请运行以下命令。参数 ``--load`` 和 ``--ckpt-path`` 不同之处在于,它会加载优化器和数据加载器的状态。 ```bash torchrun --nnodes=1 --nproc_per_node=8 scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --load YOUR_PRETRAINED_CKPT ``` 如果要启用 wandb 日志,请添加到 `--wandb` 参数到命令中。 ```bash WANDB_API_KEY=YOUR_WANDB_API_KEY torchrun --nnodes=1 --nproc_per_node=8 scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --wandb True ``` 您可以修改相应的配置文件来更改训练设置。在 [此处](/docs/structure.md#training-config-demos) 查看更多详细信息。 ### 训练超参数 1. 
`dtype` 是用于训练的数据类型。仅支持 `fp16` 和 `bf16`。ColossalAI 自动启用 `fp16` 和 `bf16` 的混合精度训练。在训练过程中,我们发现 `bf16` 更稳定。 ================================================ FILE: Open-Sora/docs/zh_CN/datasets.md ================================================ # 数据集 ## 正在使用的数据集 ### HD-VG-130M [HD-VG-130M](https://github.com/daooshee/HD-VG-130M?tab=readme-ov-file) 包括 130M 个文本视频对。标题是 由 BLIP-2 生成。我们发现剪切和文本质量相对较差。它包含 20 个拆分。对于 OpenSora 1.0,我们使用第一个拆分。我们计划使用整个数据集并对其进行重新处理。 ### Inter4k [Inter4k](https://github.com/alexandrosstergiou/Inter4K) 是一个包含分辨率为 4K 的 1k 视频剪辑的数据集。这个 数据集被提议用于超分辨率任务。我们使用数据集进行 HQ 训练。处理过的视频可以从这里找到 [这里](README.md#数据处理) 。 ### Pexels.com [Pexels.com](https://www.pexels.com/) 是一个提供免费库存照片和视频的网站。我们收集的 19K 视频 来自本网站的剪辑,用于高质量训练。处理过的视频可以从这里找到 [这里](README.md#数据处理) 。 ## 数据集监视列表 我们也在关注以下数据集,并考虑在未来使用它们,这取决于我们的存储空间以及数据集的质量。 | 名称 | 大小 | 描述 | |-------------------|--------------|-------------------------------| | Panda-70M | 70M videos | High quality video-text pairs | | WebVid-10M | 10M videos | Low quality | | InternVid-10M-FLT | 10M videos | | | EGO4D | 3670 hours | | | OpenDV-YouTube | 1700 hours | | | VidProM | 6.69M videos | | ================================================ FILE: Open-Sora/docs/zh_CN/report_v1.md ================================================ # Open-Sora v1 技术报告 OpenAI的Sora在生成一分钟高质量视频方面非常出色。然而,它几乎没有透露任何关于其细节的信息。为了使人工智能更加“开放”,我们致力于构建一个开源版本的Sora。这份报告描述了我们第一次尝试训练一个基于Transformer的视频扩散模型。 ## 选择高效的架构 为了降低计算成本,我们希望利用现有的VAE模型。Sora使用时空VAE来减少时间维度。然而,我们发现没有开源的高质量时空VAE模型。[MAGVIT](https://github.com/google-research/magvit)的4x4x4 VAE并未开源,而[VideoGPT](https://wilson1yan.github.io/videogpt/index.html)的2x4x4 VAE在我们的实验中质量较低。因此,我们决定在我们第一个版本中使用2D VAE(来自[Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original))。 视频训练涉及大量的token。考虑到24fps的1分钟视频,我们有1440帧。通过VAE下采样4倍和patch大小下采样2倍,我们得到了1440x1024≈150万个token。在150万个token上进行全注意力计算将带来巨大的计算成本。因此,我们使用时空注意力来降低成本,这是遵循[Latte](https://github.com/Vchitect/Latte)的方法。 
如图中所示,在STDiT(ST代表时空)中,我们在每个空间注意力之后立即插入一个时间注意力。这类似于Latte论文中的变种3。然而,我们并没有控制这些变体的相似数量的参数。虽然Latte的论文声称他们的变体比变种3更好,但我们在16x256x256视频上的实验表明,相同数量的迭代次数下,性能排名为:DiT(完整)> STDiT(顺序)> STDiT(并行)≈ Latte。因此,我们出于效率考虑选择了STDiT(顺序)。[这里](/docs/acceleration.md#efficient-stdit)提供了速度基准测试。 ![Architecture Comparison](/assets/readme/report_arch_comp.png) 为了专注于视频生成,我们希望基于一个强大的图像生成模型来训练我们的模型。PixArt-α是一个经过高效训练的高质量图像生成模型,具有T5条件化的DiT结构。我们使用[PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha)初始化我们的模型,并将插入的时间注意力的投影层初始化为零。这种初始化在开始时保留了模型的图像生成能力,而Latte的架构则不能。插入的注意力将参数数量从5.8亿增加到7.24亿。 ![Architecture](/assets/readme/report_arch.jpg) 借鉴PixArt-α和Stable Video Diffusion的成功,我们还采用了渐进式训练策略:在366K预训练数据集上进行16x256x256的训练,然后在20K数据集上进行16x256x256、16x512x512和64x512x512的训练。通过扩展位置嵌入,这一策略极大地降低了计算成本。 我们还尝试在DiT中使用3D patch嵌入器。然而,在时间维度上2倍下采样后,生成的视频质量较低。因此,我们将在下一版本中将下采样留给时间VAE。目前,我们在每3帧采样一次进行16帧训练,以及在每2帧采样一次进行64帧训练。 ## 数据是训练高质量模型的核心 我们发现数据的数量和质量对生成视频的质量有很大的影响,甚至比模型架构和训练策略的影响还要大。目前,我们只从[HD-VG-130M](https://github.com/daooshee/HD-VG-130M)准备了第一批分割(366K个视频片段)。这些视频的质量参差不齐,而且字幕也不够准确。因此,我们进一步从提供免费许可视频的[Pexels](https://www.pexels.com/)收集了20k相对高质量的视频。我们使用LLaVA,一个图像字幕模型,通过三个帧和一个设计好的提示来标记视频。有了设计好的提示,LLaVA能够生成高质量的字幕。 ![Caption](/assets/readme/report_caption.png) 由于我们更加注重数据质量,我们准备收集更多数据,并在下一版本中构建一个视频预处理流程。 ## 训练细节 在有限的训练预算下,我们只进行了一些探索。我们发现学习率1e-4过大,因此将其降低到2e-5。在进行大批量训练时,我们发现`fp16`比`bf16`不太稳定,可能会导致生成失败。因此,我们在64x512x512的训练中切换到`bf16`。对于其他超参数,我们遵循了之前的研究工作。 ## 损失曲线 16x256x256 预训练损失曲线 ![16x256x256 Pretraining Loss Curve](/assets/readme/report_loss_curve_1.png) 16x256x256 高质量训练损失曲线 ![16x256x256 HQ Training Loss Curve](/assets/readme/report_loss_curve_2.png) 16x512x512 高质量训练损失曲线 ![16x512x512 HQ Training Loss Curve](/assets/readme/report_loss_curve_3.png) ================================================ FILE: Open-Sora/docs/zh_CN/report_v2.md ================================================ # Open-Sora 1.1 技术报告 - [模型架构修改](#模型架构修改) - [支持不同视频长度/分辨率/宽高比/帧率(fps)训练](#支持不同视频长度分辨率宽高比帧率fps训练) - [使用Masked DiT作为图生视频/视频生视频模型](#使用masked-dit作为图生视频视频生视频模型) - 
[数据收集和流程](#数据收集和流程) - [训练详情](#训练详情) - [结果和评价](#结果和评价) - [不足和下一步计划](#不足和下一步计划) 在Open-Sora1.1版本中,我们使用了10M数据来训练经过结构调优后的STDiT的700M模型(Open-Sora1.0版本仅用400K数据)。我们实现了[Sora报告](https://openai.com/research/video-generation-models-as-world-simulators)中提到的以下功能: - 可变的视频时长、分辨率、宽高比(包括采样灵活性、改进的取景范围和构图) - 提示词增加图片和视频选项(使图像动起来、生成式增长视频、视频到视频编辑、连接不同视频) - 图像生成功能 为了实现这一目标,我们在预训练阶段使用了多任务学习。对于扩散模型来说,用不同的采样时间步长进行训练已经是一种多任务学习。我们将这一思想在图像和视频的条件生成模型上,进一步扩展到多分辨率、宽高比、帧长、fps以及不同的掩码策略。我们在**0~15s、144p到720p、各种宽高比的视频**上训练模型。虽然由于训练FLOPs不足的限制,生成的视频在时间一致性上的表现没有那么高,但我们仍然可以看到这个模型的巨大潜力。 ## 模型架构修改 我们对原始ST-DiT模型进行了以下修改,以获得更好的训练稳定性和模型性能(ST-DiT-2): - **在时间注意力模块中添加[旋转位置编码](https://arxiv.org/abs/2104.09864)**:遵循目前LLM的最佳实践,我们将时间注意力模块中的正弦位置编码更改为旋转位置编码,因为它也算一项序列预测任务。 - **在时间注意力模块中添加AdaIN和Layernormal**:我们将时间注意力与AdaIN和Layer范数作为空间注意力包裹起来,以稳定训练。 - **[QK归一化](https://arxiv.org/abs/2302.05442)与[RMSNorm](https://arxiv.org/abs/1910.07467)**:和[SD3](https://arxiv.org/pdf/2403.03206.pdf)类似地,我们应用QK归一化来提高半精度训练的稳定性。 - **支持动态输入大小和视频条件限定**:为了支持多分辨率、宽高比和fps训练,我们ST-DiT-2来接受任何输入大小。延申[PixArt-alpha](https://github.com/PixArt-alpha/PixArt-alpha)的想法,我们支持限定视频的高度、宽度、宽高比、帧长和fps。 - **将T5token数量从120扩展到200**:我们使用的视频描述通常少于200个token,我们发现模型也可以很好地处理更长的文本。 ## 支持不同视频长度/分辨率/宽高比/帧率(fps)训练 正如[Sora报告](https://openai.com/research/video-generation-models-as-world-simulators)中提到的,使用原始无损视频的分辨率、宽高比和视频长度进行训练可以增加采样灵活性,改善取景和构图。我们找到了三种实现这一目标的方法: - [NaViT](https://arxiv.org/abs/2307.06304):通过不同掩码策略支持在同一训练批次内使用不同大小的数据,并且训练效率下降很少。然而,该系统实现起来有点复杂,并且可能无法兼容kernal优化技术(如flashattention)。 - 填充([FiT](https://arxiv.org/abs/2402.12376),[Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan)):通过填充支持同一批次内的不同大小的数据。然而,将不同的分辨率填充到相同的大小会导致效率降低。 - 分桶训练([SDXL](https://arxiv.org/abs/2307.01952)、[PixArt](https://arxiv.org/abs/2310.00426)):支持通过分桶的方式在不同批次中动态调整大小,但在同一批次内数据大小必须相同,只能应用固定数量的数据大小。在一个批次中,我们不需要实现复杂的掩码或填充。 为了更便捷的实现,我们选择分桶训练的方式。我们预先定义了一些固定的分辨率,并将不同的样本分配到不同的桶中。下面列出了分桶方案中值得注意的点。但我们可以看到,这些在我们的实验中并不是一个大问题。
查看注意事项 - 桶大小被限制为固定数量:首先,在实际应用中,通常只使用少数宽高比(9:16、3:4)和分辨率(240p、1080p)。其次,我们发现经过训练的模型可以很好地推广到未见过的分辨率。 - 每批的大小相同,打破了独立同分布(i.i.d.)假设:由于我们使用多个 GPU,因此不同 GPU 上的本地批次具有不同的大小。我们没有发现此问题导致性能显著下降。 - 可能没有足够的样本来填充每个桶,并且分布可能有偏差:首先,当本地批量大小不太大时,我们的数据集足够大以填充每个桶。其次,我们应该分析数据大小的分布并相应地定义桶大小。第三,分布不平衡并没有显著影响训练过程。 - 不同的分辨率和帧长可能有不同的处理速度:与PixArt只处理相似分辨率(相似token数)的宽高比不同,我们需要考虑不同分辨率和帧长的处理速度。我们可以使用“bucket_config”来定义每个桶的批量大小,以确保处理速度相似。
![bucket](/assets/readme/report_bucket.png) 如图所示,桶是(分辨率,帧数量,宽高比)的三元组。我们为不同的分辨率提供预定义的宽高比,涵盖了大多数常见的视频宽高比。在每个epoch之前,我们打乱数据集并将样本分配到不同的桶中,如图所示。我们将样本放入最大分辨率和帧长度小于视频的桶中。 考虑到我们的计算资源有限,我们进一步为每个(分辨率,num_frame)二元组引入keep_prob和batch_size两个属性,以降低计算成本并实现多阶段训练。具体来说,高清视频将以概率1-keep_prob下采样到较低分辨率的桶中,并且每个桶的样本数量是由batch_size属性决定的。这样,我们可以控制不同桶中的样本数量,并通过为每个桶搜索合适的数据量来平衡GPU负载。 有关训练中桶使用的详细说明,请参阅[配置文件](/docs/config.md#training-bucket-configs). ## 使用Masked DiT作为图生视频/视频生视频模型 Transformer可以很容易地扩展到支持图生图和视频生视频的任务。我们提出了一种蒙版策略来支持图像和视频的调节。蒙版策略如下图所示。 ![mask strategy](/assets/readme/report_mask.png) 在将图像或视频转换成另一个视频的过程中,我们通常会选择出需要作为条件的帧并取消其掩码(unmask)。在使用ST-DiT模型进行前向传播时,被选择取消掩码(unmask)的帧将被赋予时间步长0,而其他帧则保持它们原有的时间步长t。我们发现,如果直接将这种策略应用到训练好的模型上,会得到较差的结果,因为扩散模型在训练过程中并未学会如何处理一个样本中具有不同时间步长的帧。 受[UL2](https://arxiv.org/abs/2205.05131)的启发,我们在训练期间引入了随机掩码策略。具体来说,我们在训练期间随机取消掩码帧,包括取消掩码第一帧,前k帧,最后k帧,最后k帧,第一和最后k帧,随机帧等。基于Open-Sora 1.0模型,以50%的概率应用掩码策略,我们发现模型能够在10,000步的训练中学会处理图像条件(而30%的概率会导致处理能力变差),同时文本到视频的性能略有下降。因此,在Open-Sora 1.1版本中,我们从头开始预训练模型,并采用了掩码策略。 下图给出了用于推理的掩码策略配置的说明。五数字元组在定义掩码策略方面提供了极大的灵活性。 ![mask strategy config](/assets/readme/report_mask_config.png) 掩码策略用法的详细说明可在[配置文件](/docs/config.md#advanced-inference-config)中查看. 
## 数据收集和流程 正如我们在Sora1.0版本中看见的那样,数据数量和质量对于训练一个好的模型至关重要,因此,我们努力扩展数据集。首先,我们创建了一个遵循[SVD](https://arxiv.org/abs/2311.15127)的自动流水线,包括场景切割、字幕、各种评分和过滤以及数据集管理脚本和通用惯例。 ![pipeline](/assets/readme/report_data_pipeline.png) 我们计划使用[panda-70M](https://snap-research.github.io/Panda-70M/)和其他数据来训练模型,大约包含3000万条数据。然而,我们发现磁盘输入输出(disk IO)在同时进行训练和数据处理时成为了一个瓶颈。因此,我们只能准备一个包含1000万条数据的数据集,并且没有完成我们构建的所有处理流程。最终,我们使用了包含970万视频和260万图像的数据集进行预训练,以及560,000视频和160万图像的数据集进行微调。预训练数据集的统计信息如下所示。 图像文本标记 (使用T5分词器): ![image text tokens](/assets/readme/report_image_textlen.png) 视频文本标记 (使用T5分词器)。我们直接使用Panda的短视频描述进行训练,并自己给其他数据集加视频描述。生成的字幕通常少于200个token。 ![video text tokens](/assets/readme/report_video_textlen.png) 视频时长: ![video duration](/assets/readme/report_video_duration.png) ## 训练详情 由于计算资源有限,我们必须仔细监控训练过程,并在推测模型学习不佳时更改训练策略,因为没有消融研究的计算。因此,Open-Sora1.1版本的训练包括多个更改,所以,指数移动平均(EMA)未被应用。 1. 首先,我们从`Pixart-alpha-1024`的模型checkpoint开始,使用不同分辨率的图像进行了6000步的微调。我们发现模型能够很容易地适应并生成不同分辨率的图像。为了加快扩散过程的训练,我们使用了[SpeeDiT](https://github.com/1zeryu/SpeeDiT)(iddpm-speed)技术。 2. **[阶段一]** 然后,我们使用梯度检查点(gradient-checkpointing)技术对模型进行了**24,000**步的预训练,这个过程在64个H800 GPU上运行了**4天**。尽管模型看到的数据样本数量相同,我们发现与使用较小批量大小相比,模型的学习速度较慢。我们推测,在训练的早期阶段,步数的数量对于训练更为重要。大多数视频的分辨率是**240p**,预训练时使用的配置与[stage2.py](/configs/opensora-v1-1/train/stage2.py)相似。 3. **[阶段一]** 为了增加训练步数,我们改用了更小的批量大小,并且没有使用梯度检查点技术。在这个阶段,我们还引入了帧率(fps)条件。模型训练了**40,000**步,持续了**2天**。训练中使用的视频大多数是**144p**分辨率,使用的配置文件是[stage1.py](/configs/opensora-v1-1/train/stage1.py)。我们使用较低的分辨率,因为我们在Open-Sora 1.0版本中发现模型可以以相对较低的分辨率学习时间知识。 4. **[阶段一]** 我们发现模型不能很好地学习长视频,并在Open-Sora1.0训练中发现了一个噪声生成结果,推测是半精度问题。因此,我们采用QK-归一化来稳定训练。我们还将iddpm-speed切换成iddpm。我们训练了**17k**步**14小时**。大多数视频的分辨率是144p,预训练时使用的配置是[stage1.py](/configs/opensora-v1-1/train/stage1.py)。阶段1训练持续约一周,总步长**81k**。 5. **[阶段二]** 我们切换到更高的分辨率,其中大多数视频是**240p和480p**分辨率([stage2.py](/configs/opensora-v1-1/train/stage2.py))。我们在所有预训练数据上训练了**22000**步,持续**一天**。 6. 
**[阶段三]** 我们切换到更高的分辨率,大多数视频的分辨率是**480p和720p**([stage3.py](/configs/opensora-v1-1/train/stage3.py))。我们在高质量数据上训练了**4000**步,用时**一天**。 ## 结果和评价 ## 不足和下一步计划 随着我们离Sora的复现又近了一步,我们发现当前模型存在许多不足,这些不足将在我们下阶段工作中得到改善。 - **噪音的生成和影响**:我们发现生成的视频,特别是长视频,有时噪点很多,不够流畅。我们认为问题在于没有使用时间VAE。由于[Pixart-Sigma](https://arxiv.org/abs/2403.04692)发现适应新VAE很容易,我们计划在下一个版本中为模型开发时间VAE。 - **缺乏时间一致性**:我们发现模型无法生成具有高时间一致性的视频,我们认为问题是由于缺乏训练FLOPs,我们计划收集更多数据并继续训练模型以提高时间一致性。 - **人像生成质量低**:我们发现模型无法生成高质量的人类视频,我们认为问题是由于缺乏人类数据,我们计划收集更多的人类数据,并继续训练模型以提高人类生成。 - **美学得分低**:我们发现模型的美学得分不高。问题在于缺少美学得分过滤,由于IO瓶颈,我们没有进行这一步骤。我们计划按美学得分过滤数据并微调模型,以提高美学得分。 - **长视频生成质量低**:我们发现,使用同样的提示词,视频越长,质量越差。这意味着图像质量不能同等地被不同长度的序列所适应。 > - **算法与加速实现**:Zangwei Zheng, Xiangyu Peng, Shenggui Li, Hongxing Liu, Yukun Zhou > - **数据收集与处理**:Xiangyu Peng, Zangwei Zheng, Chenhui Shen, Tom Young, Junjie Wang, Chenfeng Yu ================================================ FILE: Open-Sora/docs/zh_CN/report_v3.md ================================================ # Open-Sora 1.2 报告 - [视频压缩网络](#视频压缩网络) - [整流流和模型适应](#整流流和模型适应) - [更多数据和更好的多阶段训练](#更多数据和更好的多阶段训练) - [简单有效的模型调节](#简单有效的模型调节) - [评估](#评估) 在 Open-Sora 1.2 版本中,我们在 >30M 数据上训练了一个 1.1B 的模型,支持 0s~16s、144p 到 720p、各种宽高比的视频生成。我们的配置如下所列。继 1.1 版本之后,Open-Sora 1.2 还可以进行图像到视频的生成和视频扩展。 | | 图像 | 2秒 | 4秒 | 8秒 | 16秒 | | ---- | ----- | --- | --- | --- | --- | | 240p | ✅ | ✅ | ✅ | ✅ | ✅ | | 360p | ✅ | ✅ | ✅ | ✅ | ✅ | | 480p | ✅ | ✅ | ✅ | ✅ | 🆗 | | 720p | ✅ | ✅ | ✅ | 🆗 | 🆗 | 这里✅表示在训练期间可以看到数据,🆗表示虽然没有经过训练,但模型可以在该配置下进行推理。🆗的推理需要多个80G内存的GPU和序列并行。 除了 Open-Sora 1.1 中引入的功能外,Open-Sora 1.2 还有以下重磅更新: - 视频压缩网络 - 整流流训练 - 更多数据和更好的多阶段训练 - 简单有效的模型调节 - 更好的评估指标 上述改进的所有实现(包括训练和推理)均可在 Open-Sora 1.2 版本中使用。以下部分将介绍改进的细节。我们还改进了代码库和文档,使其更易于使用。 ## 视频压缩网络 对于 Open-Sora 1.0 & 1.1,我们使用了 Stability AI 的 83M 2D VAE,它仅在空间维度上压缩,将视频压缩 8x8 倍。为了减少时间维度,我们每三帧提取一帧。然而,这种方法导致生成的视频流畅度较低,因为牺牲了生成的帧率(fps)。因此,在这个版本中,我们引入了像 OpenAI 的 Sora 一样的视频压缩网络。该网络在时域上将视频大小压缩至四分之一,因此,我们不必再额外抽帧,而可以使用原有帧率生成视频。 考虑到训练 3D VAE 的计算成本很高,我们希望重新利用在 2D VAE 中学到的知识。我们注意到,经过 2D VAE 
压缩后,时间维度上相邻的特征仍然高度相关。因此,我们提出了一个简单的视频压缩网络,首先将视频在空间维度上压缩 8x8 倍,然后将视频在时间维度上压缩 4 倍。网络如下所示: ![video_compression_network](/assets/readme/report_3d_vae.png) 我们用[SDXL 的 VAE](https://huggingface.co/stabilityai/sdxl-vae)初始化 2D VAE ,它比我们以前使用的更好。对于 3D VAE,我们采用[Magvit-v2](https://magvit.cs.cmu.edu/v2/)中的 VAE 结构,它包含 300M 个参数。加上 83M 的 2D VAE,视频压缩网络的总参数为 384M。我们设定batch size 为 1, 对 3D VAE 进行了 1.2M 步的训练。训练数据是来自 pexels 和 pixabay 的视频,训练视频大小主要是 17 帧,256x256 分辨率。3D VAE 中使用因果卷积(causal convolutions)使图像重建更加准确。 我们的训练包括三个阶段: 1. 对于前 380k 步,我们冻结 2D VAE并在 8 个 GPU 上进行训练。训练目标包括重建 2D VAE 的压缩特征(图中粉红色),并添加损失以使 3D VAE 的特征与 2D VAE 的特征相似(粉红色和绿色,称为identity loss)。我们发现后者的损失可以快速使整个 VAE 在图像上取得良好的性能,并在下一阶段更快地收敛。 2. 对于接下来的 260k 步,我们消除identity loss并仅学习 3D VAE。 3. 对于最后 540k 步,由于我们发现仅重建 2D VAE 的特征无法带来进一步的改进,因此我们移除了该损失并训练整个 VAE 来重建原始视频。此阶段在 24 个 GPU 上进行训练。 对于训练的前半部分,我们采用 20% 的图像和 80% 的视频。按照[Magvit-v2](https://magvit.cs.cmu.edu/v2/),我们使用 17 帧训练视频,同时对图像的前 16 帧进行零填充。然而,我们发现这种设置会导致长度不同于 17 帧的视频变得模糊。因此,在第 3 阶段,我们使用不超过34帧长度的任意帧长度视频进行混合视频长度训练,以使我们的 VAE 对不同视频长度更具鲁棒性(也就是说,如果我们希望训练含有n帧的视频,我们就把原视频中`34-n`帧用0进行填充)。我们的 [训练](/scripts/train_vae.py)和[推理](/scripts/inference_vae.py)代码可在 Open-Sora 1.2 版本中找到。 在扩散模型中使用 VAE 时,我们的堆叠 VAE 所需的内存较少,因为我们的 VAE 的输入已经经过压缩。我们还将输入视频拆分为几个 17 帧剪辑,以提高推理效率。我们的 VAE 与[Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/docs/Report-v1.1.0.md)中的另一个开源 3D VAE 性能相当。 | 模型 | 结构相似性↑ | 峰值信噪比↑ | | ------------------ | ----- | ------ | | Open-Sora-Plan 1.1 | 0.882 | 29.890 | | Open-Sora 1.2 | 0.880 | 30.590 | ## 整流流和模型适应 最新的扩散模型 Stable Diffusion 3 为了获得更好的性能,采用了[rectified flow](https://github.com/gnobitab/RectifiedFlow)替代了 DDPM。可惜 SD3 的 rectified flow 训练代码没有开源。不过 Open-Sora 1.2 提供了遵循 SD3 论文的训练代码,包括: - 基本整流流训练 - 用于训练加速的 Logit-norm 采样 - 分辨率和视频长度感知时间步长采样 对于分辨率感知的时间步长采样,我们应该对分辨率较大的图像使用更多的噪声。我们将这个想法扩展到视频生成,对长度较长的视频使用更多的噪声。 Open-Sora 1.2 从[PixArt-Σ 2K](https://github.com/PixArt-alpha/PixArt-sigma) 模型checkpoint开始。请注意,此模型使用 DDPM 和 SDXL VAE 进行训练,分辨率也高得多。我们发现在小数据集上进行微调可以轻松地使模型适应我们的视频生成设置。适应过程如下,所有训练都在 8 个 
GPU 上完成: 1. 多分辨率图像生成能力:我们训练模型 20k 步,使其能生成从 144p 到 2K 的不同分辨率图像。 2. QK-norm:我们将 QK-norm 添加到模型中并训练 18k 步。 3. 整流流:我们从离散时间 DDPM 转变为连续时间整流流并训练 10k 步。 4. 使用 logit-norm 采样和分辨率感知时间步采样的整流流:我们训练 33k 步。 5. 较小的 AdamW epsilon:按照 SD3,使用 QK-norm,我们可以对 AdamW 使用较小的 epsilon(1e-15),我们训练 8k 步。 6. 新的 VAE 和 fps 调节:我们用自己的 VAE 替换原来的 VAE,并将 fps 调节添加到时间步调节中,我们训练 25k 步。请注意,对每个通道进行规范化对于整流流训练非常重要。 7. 时间注意力模块:我们添加时间注意力模块,其中没有初始化投影层。我们在图像上进行 3k 步训练。 8. 仅针对具有掩码策略的视频的时间块:我们仅在视频上训练时间注意力块,共训练 38k 步。 经过上述调整后,我们就可以开始在视频上训练模型了。上述调整保留了原始模型生成高质量图像的能力,并为后续的视频生成提供了许多助力: - 通过整流,我们可以加速训练,将视频的采样步数从100步减少到30步,大大减少了推理的等待时间。 - 使用 qk-norm,训练更加稳定,并且可以使用积极的优化器。 - 采用新的VAE,时间维度压缩了4倍,使得训练更加高效。 - 该模型具有多分辨率图像生成能力,可以生成不同分辨率的视频。 ## 更多数据和更好的多阶段训练 由于计算预算有限,我们精心安排了训练数据的质量从低到高,并将训练分为三个阶段。我们的训练涉及 12x8 GPU,总训练时间约为 2 周, 约70k步。 ### 第一阶段 我们首先在 Webvid-10M 数据集(40k 小时)上训练模型,共 30k 步(2 个 epoch)。由于视频分辨率均低于 360p 且包含水印,因此我们首先在此数据集上进行训练。训练主要在 240p 和 360p 上进行,视频长度为 2s~16s。我们使用数据集中的原始字幕进行训练。训练配置位于[stage1.py](/configs/opensora-v1-2/train/stage1.py)中。 ### 第二阶段 然后我们在 Panda-70M 数据集上训练模型。这个数据集很大,但质量参差不齐。我们使用官方的 30M 子集,其中的片段更加多样化,并过滤掉美学评分低于 4.5 的视频。这产生了一个 20M 子集,包含 41k 小时。数据集中的字幕直接用于我们的训练。训练配置位于[stage2.py](/configs/opensora-v1-2/train/stage2.py)中。 训练主要在 360p 和 480p 上进行。我们训练模型 23k 步,即 0.5 个 epoch。训练尚未完成,因为我们希望我们的新模型能早日与大家见面。 ### 第三阶段 在此阶段,我们从各种来源收集了 200 万个视频片段,总时长 5000 小时,其中包括: - 来自 Pexels、Pixabay、Mixkit 等的免费授权视频。 - [MiraData](https://github.com/mira-space/MiraData):一个包含长视频的高质量数据集,主要来自游戏和城市/风景探索。 - [Vript](https://github.com/mutonix/Vript/tree/main):一个密集注释的数据集。 - 还有一些其他数据集。 MiraData 和 Vript 有来自 GPT 的字幕,而我们使用[PLLaVA](https://github.com/magic-research/PLLaVA)为其余视频添加字幕。与只能进行单帧/图像字幕的 LLaVA 相比,PLLaVA 是专门为视频字幕设计和训练的。[加速版PLLaVA](/tools/caption/README.md#pllava-captioning)已在我们的`tools/`中发布。在实践中,我们使用预训练的 PLLaVA 13B 模型,并从每个视频中选择 4 帧生成字幕,空间池化形状为 2*2。 下面显示了此阶段使用的视频数据的一些统计数据。我们提供了持续时间和分辨率的基本统计数据,以及美学分数和光流分数分布。我们还从视频字幕中提取了对象和动作的标签并计算了它们的频率。 ![stats](/assets/readme/report-03_video_stats.png) ![object_count](/assets/readme/report-03_objects_count.png) 
![object_count](/assets/readme/report-03_actions_count.png) 此阶段我们主要在 720p 和 1080p 上进行训练,以提高模型在高清视频上的表现力。在训练中,我们使用的掩码率为25%。训练配置位于[stage3.py](/configs/opensora-v1-2/train/stage3.py)中。我们对模型进行 15k 步训练,大约为 2 个 epoch。 ## 简单有效的模型调节 对于第 3 阶段,我们计算每个视频片段的美学分数和运动分数。但是,由于视频片段数量较少,我们不愿意过滤掉得分较低的片段,这会导致数据集较小。相反,我们将分数附加到字幕中并将其用作条件。我们发现这种方法可以让模型了解分数并遵循分数来生成质量更好的视频。 例如,一段美学评分为 5.5、运动评分为 10 且检测到摄像头运动向左平移的视频,其字幕将为: ```plaintext [Original Caption] aesthetic score: 5.5, motion score: 10, camera motion: pan left. ``` 在推理过程中,我们还可以使用分数来调节模型。对于摄像机运动,我们仅标记了 13k 个具有高置信度的剪辑,并且摄像机运动检测模块已在我们的工具中发布。 ## 评估 之前,我们仅通过人工评估来监控训练过程,因为 DDPM 训练损失与生成的视频质量没有很好的相关性。但是,对于整流流,如 SD3 中所述,我们发现训练损失与生成的视频质量有很好的相关性。因此,我们跟踪了 100 张图像和 1k 个视频的整流流评估损失。 我们从 pixabay 中抽样了 1k 个视频作为验证数据集。我们计算了不同分辨率(144p、240p、360p、480p、720p)下图像和不同长度的视频(2s、4s、8s、16s)的评估损失。对于每个设置,我们等距采样 10 个时间步长。然后对所有损失取平均值。 ![Evaluation Loss](/assets/readme/report_val_loss.png) ![Video Evaluation Loss](/assets/readme/report_vid_val_loss.png) 此外,我们还会在训练过程中跟踪[VBench](https://vchitect.github.io/VBench-project/)得分。VBench 是用于短视频生成的自动视频评估基准。我们用 240p 2s 视频计算 vbench 得分。这两个指标验证了我们的模型在训练过程中持续改进。 ![VBench](/assets/readme/report_vbench_score.png) 所有评估代码均发布在`eval`文件夹中。查看[评估指南](/eval/README.md)了解更多详细信息。 |模型 | 总得分 | 质量得分 | 语义得分 | | -------------- | ----------- | ------------- | -------------- | | Open-Sora V1.0 | 75.91% | 78.81% | 64.28% | | Open-Sora V1.2 | 79.23% | 80.71% | 73.30% | ## 序列并行 我们使用序列并行来支持长序列训练和推理。我们的实现基于Ulysses,工作流程如下所示。启用序列并行后,我们只需要将 `all-to-all` 通信应用于STDiT中的空间模块(spatial block),因为在序列维度上,只有对空间信息的计算是相互依赖的。 ![SP](/assets/readme/sequence_parallelism.jpeg) 目前,由于训练数据分辨率较小,我们尚未使用序列并行进行训练,我们计划在下一个版本中使用。至于推理,我们可以使用序列并行,以防您的 GPU 内存不足。下表显示,序列并行可以实现加速: | 分辨率 | 时长 | GPU数量 | 是否启用序列并行 |用时(秒) | 加速效果/GPU | | ---------- | ------- | -------------- | --------- | ------------ | --------------- | | 720p | 16秒 | 1 | 否 | 547.97 | - | | 720p | 16秒 | 2 | 是 | 244.38 | 12% | ================================================ FILE: Open-Sora/docs/zh_CN/structure.md 
================================================ # 代码仓库和配置文件结构 ## 代码仓库结构 ```plaintext Open-Sora ├── README.md ├── docs │ ├── acceleration.md -> Acceleration & Speed benchmark │ ├── command.md -> Commands for training & inference │ ├── datasets.md -> Datasets used in this project │ ├── structure.md -> This file │ └── report_v1.md -> Report for Open-Sora v1 ├── scripts │ ├── train.py -> diffusion training script │ └── inference.py -> Report for Open-Sora v1 ├── configs -> Configs for training & inference ├── opensora │ ├── __init__.py │ ├── registry.py -> Registry helper │   ├── acceleration -> Acceleration related code │   ├── dataset -> Dataset related code │   ├── models │   │   ├── layers -> Common layers │   │   ├── vae -> VAE as image encoder │   │   ├── text_encoder -> Text encoder │   │   │   ├── classes.py -> Class id encoder (inference only) │   │   │   ├── clip.py -> CLIP encoder │   │   │   └── t5.py -> T5 encoder │   │   ├── dit │   │   ├── latte │   │   ├── pixart │   │   └── stdit -> Our STDiT related code │   ├── schedulers -> Diffusion schedulers │   │   ├── iddpm -> IDDPM for training and inference │   │ └── dpms -> DPM-Solver for fast inference │ └── utils └── tools -> Tools for data processing and more ``` ## 配置文件结构 我们的配置文件遵循[MMEgine](https://github.com/open-mmlab/mmengine)。 MMEngine 将读取配置文件(“.py”文件)并将其解析为类似字典的对象。 ```plaintext Open-Sora └── configs -> Configs for training & inference ├── opensora -> STDiT related configs │ ├── inference │ │ ├── 16x256x256.py -> Sample videos 16 frames 256x256 │ │ ├── 16x512x512.py -> Sample videos 16 frames 512x512 │ │ └── 64x512x512.py -> Sample videos 64 frames 512x512 │ └── train │ ├── 16x256x256.py -> Train on videos 16 frames 256x256 │ ├── 16x256x256.py -> Train on videos 16 frames 256x256 │ └── 64x512x512.py -> Train on videos 64 frames 512x512 ├── dit -> DiT related configs    │   ├── inference    │   │   ├── 1x256x256-class.py -> Sample images with ckpts from DiT    │   │   ├── 1x256x256.py -> Sample 
images with clip condition    │   │   └── 16x256x256.py -> Sample videos    │   └── train    │     ├── 1x256x256.py -> Train on images with clip condition    │      └── 16x256x256.py -> Train on videos ├── latte -> Latte related configs └── pixart -> PixArt related configs ``` ## 推理配置演示 要更改推理设置,可以直接修改相应的配置文件。或者您可以传递参数来覆盖配置文件([config_utils.py](/opensora/utils/config_utils.py))。要更改采样提示,您应该修改传递给“--prompt_path”参数的“.txt”文件。 ```plaintext --prompt_path ./assets/texts/t2v_samples.txt -> prompt_path --ckpt-path ./path/to/your/ckpt.pth -> model["from_pretrained"] ``` 下面提供了每个字段的解释。 ```python # Define sampling size num_frames = 64 # number of frames fps = 24 // 2 # frames per second (divided by 2 for frame_interval=2) image_size = (512, 512) # image size (height, width) # Define model model = dict( type="STDiT-XL/2", # Select model type (STDiT-XL/2, DiT-XL/2, etc.) space_scale=1.0, # (Optional) Space positional encoding scale (new height / old height) time_scale=2 / 3, # (Optional) Time positional encoding scale (new frame_interval / old frame_interval) enable_flash_attn=True, # (Optional) Speed up training and inference with flash attention enable_layernorm_kernel=True, # (Optional) Speed up training and inference with fused kernel from_pretrained="PRETRAINED_MODEL", # (Optional) Load from pretrained model no_temporal_pos_emb=True, # (Optional) Disable temporal positional encoding (for image) ) vae = dict( type="VideoAutoencoderKL", # Select VAE type from_pretrained="stabilityai/sd-vae-ft-ema", # Load from pretrained VAE micro_batch_size=128, # VAE with micro batch size to save memory ) text_encoder = dict( type="t5", # Select text encoder type (t5, clip) from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl", # Load from pretrained text encoder model_max_length=120, # Maximum length of input text ) scheduler = dict( type="iddpm", # Select scheduler type (iddpm, dpm-solver) num_sampling_steps=100, # Number of sampling steps cfg_scale=7.0, # hyper-parameter 
for classifier-free diffusion ) dtype = "fp16" # Computation type (fp16, fp32, bf16) # Other settings batch_size = 1 # batch size seed = 42 # random seed prompt_path = "./assets/texts/t2v_samples.txt" # path to prompt file save_dir = "./samples" # path to save samples ``` ## 训练配置演示 ```python # Define sampling size num_frames = 64 frame_interval = 2 # sample every 2 frames image_size = (512, 512) # Define dataset root = None # root path to the dataset data_path = "CSV_PATH" # path to the csv file use_image_transform = False # True if training on images num_workers = 4 # number of workers for dataloader # Define acceleration dtype = "bf16" # Computation type (fp16, bf16) grad_checkpoint = True # Use gradient checkpointing plugin = "zero2" # Plugin for distributed training (zero2, zero2-seq) sp_size = 1 # Sequence parallelism size (1 for no sequence parallelism) # Define model model = dict( type="STDiT-XL/2", space_scale=1.0, time_scale=2 / 3, from_pretrained="YOUR_PRETRAINED_MODEL", enable_flash_attn=True, # Enable flash attention enable_layernorm_kernel=True, # Enable layernorm kernel ) vae = dict( type="VideoAutoencoderKL", from_pretrained="stabilityai/sd-vae-ft-ema", micro_batch_size=128, ) text_encoder = dict( type="t5", from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl", model_max_length=120, shardformer=True, # Enable shardformer for T5 acceleration ) scheduler = dict( type="iddpm", timestep_respacing="", # Default 1000 timesteps ) # Others seed = 42 outputs = "outputs" # path to save checkpoints wandb = False # Use wandb for logging epochs = 1000 # number of epochs (just large enough, kill when satisfied) log_every = 10 ckpt_every = 250 load = None # path to resume training batch_size = 4 lr = 2e-5 grad_clip = 1.0 # gradient clipping ``` ================================================ FILE: Open-Sora/docs/zh_CN/vae.md ================================================ # VAE 技术报告 由于 [Pixart-Sigma](https://arxiv.org/abs/2403.04692) 
论文中指出适应新的VAE很简单,因此我们开发了一个额外的时间VAE。 具体而言, 我们的VAE由一个[空间 VAE](https://huggingface.co/PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers)和一个时间VAE级联组成。 对于时间VAE,我们遵循 [MAGVIT-v2](https://arxiv.org/abs/2310.05737)的实现, 并做了以下修改: * 我们删除了码本特有的架构。 * 我们不使用鉴别器(discriminator),而是使用VAE重建损失、kl损失和感知损失进行训练。 * 在编码器的最后一个线性层中,我们缩小到 4 通道的对角高斯分布,遵循我们之前训练的接受 4 通道输入的 STDiT。 * 我们的解码器与编码器架构对称。 ## 训练 我们分不同阶段训练模型。 我们首先冻结空间 VAE,在单台机器(8 个 GPU)上训练时间 VAE 380k 步。我们使用额外的身份损失(identity loss)使 3D VAE 的特征与 2D VAE 的特征相似。我们使用 20% 的图像和 80% 的视频(17 帧)来训练 VAE。 ```bash torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage1.py --data-path YOUR_CSV_PATH ``` 接下来,我们移除身份损失,并训练 3D VAE 管道 260k 步以重建 2D 压缩视频。 ```bash torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage2.py --data-path YOUR_CSV_PATH ``` 最后,我们移除了 2D 压缩视频的重建损失,并训练 VAE 管道 540k 步以重建 3D 视频。我们使用不超过 34 帧的随机帧数训练 VAE,使其对不同长度的视频更具鲁棒性。此阶段在 24 个 GPU 上进行训练。 ```bash torchrun --nnodes=3 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage3.py --data-path YOUR_CSV_PATH ``` 请注意,您需要根据自己的 csv 数据大小相应地调整配置文件中的 `epochs` 。 ## 推理 为了直观地检查 VAE 的性能,您可以运行以下推理。它使用 `_ori` 后缀(即 `"YOUR_VIDEO_DIR"_ori`)将原始视频保存到您指定的视频目录中,使用`_rec`后缀(即`"YOUR_VIDEO_DIR"_rec`)将来自完整管道的重建视频保存到指定的视频目录中,并使用 `_spatial`后缀(即`"YOUR_VIDEO_DIR"_spatial`)将来自 2D 压缩和解压缩的重建视频保存到指定的视频目录中。 ```bash torchrun --standalone --nnodes=1 --nproc_per_node=1 scripts/inference_vae.py configs/vae/inference/video.py --ckpt-path YOUR_VAE_CKPT_PATH --data-path YOUR_CSV_PATH --save-dir YOUR_VIDEO_DIR ``` ## 评估 然后,我们可以计算 VAE 在 SSIM、PSNR、LPIPS 和 FLOLPIPS 指标上的表现得分。 * SSIM: 结构相似性指数度量,越高越好 * PSNR: 峰值信噪比,越高越好 * LPIPS: 学习感知图像块相似度(Learned Perceptual Image Patch Similarity),越低越好 * [FloLPIPS](https://arxiv.org/pdf/2207.08119): 带有视频插值的LPIPS,越低越好。 ```bash python eval/vae/eval_common_metric.py --batch_size 2 --real_video_dir YOUR_VIDEO_DIR_ori --generated_video_dir YOUR_VIDEO_DIR_rec --device cuda --sample_fps 24 --crop_size 256 --resolution 256 --num_frames 17 --sample_rate 1 --metric ssim psnr lpips flolpips ``` ## 致谢 我们非常感谢以下工作: * 
[MAGVIT-v2](https://arxiv.org/abs/2310.05737): Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation * [Taming Transformers](https://github.com/CompVis/taming-transformers): Taming Transformers for High-Resolution Image Synthesis * [3D blur pooling](https://github.com/adobe/antialiased-cnns/pull/39/commits/3d6f02b6943c58b68c19c07bc26fad57492ff3bc) * [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan) ================================================ FILE: Open-Sora/environment-opensora.yml ================================================ name: opensora channels: - defaults dependencies: - _libgcc_mutex=0.1=main - _openmp_mutex=5.1=1_gnu - ca-certificates=2024.7.2=h06a4308_0 - ld_impl_linux-64=2.38=h1181459_1 - libffi=3.4.4=h6a678d5_1 - libgcc-ng=11.2.0=h1234567_1 - libgomp=11.2.0=h1234567_1 - libstdcxx-ng=11.2.0=h1234567_1 - ncurses=6.4=h6a678d5_0 - openssl=3.0.15=h5eee18b_0 - pip=24.2=py39h06a4308_0 - python=3.9.19=h955ad1f_1 - readline=8.2=h5eee18b_0 - setuptools=72.1.0=py39h06a4308_0 - sqlite=3.45.3=h5eee18b_0 - tk=8.6.14=h39e8969_0 - wheel=0.43.0=py39h06a4308_0 - xz=5.4.6=h5eee18b_1 - zlib=1.2.13=h5eee18b_1 - pip: - absl-py==2.1.0 - accelerate==0.29.2 - addict==2.4.0 - aiofiles==23.2.1 - aiosignal==1.3.1 - altair==5.4.1 - annotated-types==0.7.0 - antlr4-python3-runtime==4.9.3 - anyio==4.4.0 - apex==0.1 - asttokens==2.4.1 - attrs==24.2.0 - av==13.0.0 - bcrypt==4.2.0 - beartype==0.18.5 - beautifulsoup4==4.12.3 - bitsandbytes==0.43.3 - black==24.8.0 - boto3==1.35.20 - botocore==1.35.20 - calflops==0.3.2 - certifi==2024.8.30 - cffi==1.17.1 - cfgv==3.4.0 - charset-normalizer==3.3.2 - click==8.1.7 - cloudpickle==3.0.0 - colossalai==0.4.0 - comm==0.2.2 - contexttimer==0.3.3 - contourpy==1.3.0 - cryptography==43.0.1 - cycler==0.12.1 - cython==3.0.11 - debugpy==1.8.5 - decorator==5.1.1 - decord==0.6.0 - deprecated==1.2.14 - detectron2==0.6 - diffusers==0.27.2 - dill==0.3.8 - distlib==0.3.8 - distro==1.9.0 - docker-pycreds==0.4.0 - 
easydict==1.13 - einops==0.8.0 - exceptiongroup==1.2.2 - executing==2.1.0 - fabric==3.2.2 - facexlib==0.3.0 - fairscale==0.4.13 - fastapi==0.114.0 - ffmpy==0.4.0 - filelock==3.16.0 - filterpy==1.4.5 - flash-attn==2.6.3 - fonttools==4.53.1 - frozenlist==1.4.1 - fsspec==2024.9.0 - ftfy==6.2.3 - future==1.0.0 - fvcore==0.1.5.post20221221 - galore-torch==1.0 - gitdb==4.0.11 - gitpython==3.1.43 - google==3.0.0 - gradio==4.26.0 - gradio-client==0.15.1 - grpcio==1.66.1 - h11==0.14.0 - httpcore==1.0.5 - httpx==0.27.2 - huggingface-hub==0.24.6 - hydra-core==1.3.2 - identify==2.6.0 - idna==3.8 - imageio==2.35.1 - imgaug==0.4.0 - importlib-metadata==8.4.0 - importlib-resources==6.4.5 - invoke==2.2.0 - iopath==0.1.9 - ipykernel==6.29.5 - ipython==8.18.1 - ipywidgets==8.1.5 - jedi==0.19.1 - jinja2==3.1.4 - jiter==0.5.0 - jmespath==1.0.1 - joblib==1.4.2 - jsonschema==4.23.0 - jsonschema-specifications==2023.12.1 - jupyter-client==8.6.2 - jupyter-core==5.7.2 - jupyterlab-widgets==3.0.13 - kiwisolver==1.4.7 - lazy-loader==0.4 - llvmlite==0.43.0 - lmdb==1.5.1 - lpips==0.1.4 - lvis==0.5.3 - markdown==3.7 - markdown-it-py==3.0.0 - markupsafe==2.1.5 - matplotlib==3.9.2 - matplotlib-inline==0.1.7 - mdurl==0.1.2 - mmengine==0.10.4 - mpmath==1.3.0 - msgpack==1.1.0 - mypy-extensions==1.0.0 - narwhals==1.8.1 - nest-asyncio==1.6.0 - networkx==3.2.1 - ninja==1.11.1.1 - nodeenv==1.9.1 - numba==0.60.0 - numpy==1.26.4 - nvidia-cublas-cu12==12.1.3.1 - nvidia-cuda-cupti-cu12==12.1.105 - nvidia-cuda-nvrtc-cu12==12.1.105 - nvidia-cuda-runtime-cu12==12.1.105 - nvidia-cudnn-cu12==8.9.2.26 - nvidia-cufft-cu12==11.0.2.54 - nvidia-curand-cu12==10.3.2.106 - nvidia-cusolver-cu12==11.4.5.107 - nvidia-cusparse-cu12==12.1.0.106 - nvidia-nccl-cu12==2.19.3 - nvidia-nvjitlink-cu12==12.6.68 - nvidia-nvtx-cu12==12.1.105 - omegaconf==2.3.0 - openai==1.44.1 - openai-clip==1.0.1 - opencv-python==4.10.0.84 - opensora==1.2.0 - orjson==3.10.7 - packaging==24.1 - pandarallel==1.6.5 - pandas==2.2.2 - parameterized==0.9.0 
- paramiko==3.4.1 - parso==0.8.4 - pathspec==0.12.1 - peft==0.12.0 - pexpect==4.9.0 - pillow==10.4.0 - platformdirs==4.3.2 - plumbum==1.8.3 - portalocker==2.10.1 - pre-commit==3.8.0 - prompt-toolkit==3.0.47 - protobuf==5.28.0 - psutil==5.9.8 - ptyprocess==0.7.0 - pure-eval==0.2.3 - pyarrow==17.0.0 - pycocotools==2.0.8 - pycparser==2.22 - pydantic==2.9.1 - pydantic-core==2.23.3 - pydub==0.25.1 - pygments==2.18.0 - pyiqa==0.1.10 - pynacl==1.5.0 - pyparsing==3.1.4 - python-dateutil==2.9.0.post0 - python-multipart==0.0.9 - pytorchvideo==0.1.5 - pytz==2024.1 - pyyaml==6.0.2 - pyzmq==26.2.0 - ray==2.35.0 - referencing==0.35.1 - regex==2024.7.24 - requests==2.32.3 - rich==13.8.1 - rotary-embedding-torch==0.5.3 - rpds-py==0.20.0 - rpyc==6.0.0 - ruff==0.6.4 - s3transfer==0.10.2 - safetensors==0.4.5 - scikit-image==0.24.0 - scikit-learn==1.5.2 - scipy==1.13.1 - semantic-version==2.10.0 - sentencepiece==0.2.0 - sentry-sdk==2.14.0 - setproctitle==1.3.3 - shapely==2.0.6 - shellingham==1.5.4 - six==1.16.0 - smmap==5.0.1 - sniffio==1.3.1 - soupsieve==2.6 - spaces==0.30.2 - stack-data==0.6.3 - starlette==0.38.5 - sympy==1.13.2 - tabulate==0.9.0 - tensorboard==2.17.1 - tensorboard-data-server==0.7.2 - termcolor==2.4.0 - threadpoolctl==3.5.0 - tifffile==2024.8.30 - timm==0.9.16 - tokenizers==0.15.2 - tomli==2.0.1 - tomlkit==0.12.0 - torch==2.2.2 - torchvision==0.17.2 - tornado==6.4.1 - tqdm==4.66.5 - traitlets==5.14.3 - transformers==4.39.3 - triton==2.2.0 - typer==0.12.5 - typing-extensions==4.12.2 - tzdata==2024.1 - urllib3==1.26.20 - uvicorn==0.29.0 - virtualenv==20.26.4 - wandb==0.17.9 - wcwidth==0.2.13 - websockets==11.0.3 - werkzeug==3.0.4 - widgetsnbextension==4.0.13 - wrapt==1.16.0 - xformers==0.0.25.post1 - yacs==0.1.8 - yapf==0.40.2 - zipp==3.20.1 prefix: /root/miniconda3/envs/opensora ================================================ FILE: Open-Sora/eval/README.md ================================================ # Evalution ## Human evaluation To conduct human evaluation, 
we need to generate various samples. We provide many prompts in `assets/texts`, and defined some test setting covering different resolution, duration and aspect ratio in `eval/sample.sh`. To facilitate the usage of multiple GPUs, we split sampling tasks into several parts. ```bash # image (1) bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log -1 # video (2a 2b 2c ...) bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log -2a # launch 8 jobs at once (you must read the script to understand the details) bash eval/human_eval/launch.sh /path/to/ckpt num_frames model_name_for_log ``` ## Rectified Flow Loss Evaluate the rectified flow loss with the following commands. ```bash # image torchrun --standalone --nproc_per_node 1 eval/loss/eval_loss.py configs/opensora-v1-2/misc/eval_loss.py --data-path /path/to/img.csv --ckpt-path /path/to/ckpt # video torchrun --standalone --nproc_per_node 1 eval/loss/eval_loss.py configs/opensora-v1-2/misc/eval_loss.py --data-path /path/to/vid.csv --ckpt-path /path/to/ckpt # select resolution torchrun --standalone --nproc_per_node 1 eval/loss/eval_loss.py configs/opensora-v1-2/misc/eval_loss.py --data-path /path/to/vid.csv --ckpt-path /path/to/ckpt --resolution 720p ``` To launch multiple jobs at once, use the following script. ```bash bash eval/loss/launch.sh /path/to/ckpt model_name ``` To obtain an organized list of scores: ```bash python eval/loss/tabulate_rl_loss.py --log_dir path/to/log/dir ``` ## VBench [VBench](https://github.com/Vchitect/VBench) is a benchmark for short text to video generation. We provide a script for easily generating samples required by VBench. 
First, generate the relevant videos with the following commands: ```bash # vbench task, if evaluation all set start_index to 0, end_index to 2000 bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log -4 start_index end_index # Alternatively, launch 8 jobs at once (you must read the script to understand the details) bash eval/vbench/launch.sh /path/to/ckpt num_frames model_name # in addition, you can specify resolution, aspect ratio, sampling steps, flow, and llm-refine bash eval/vbench/launch.sh /path/to/ckpt num_frames model_name res_value aspect_ratio_value steps_value flow_value llm_refine_value # for example # bash eval/vbench/launch.sh /mnt/jfs-hdd/sora/checkpoints/outputs/042-STDiT3-XL-2/epoch1-global_step16200_llm_refine/ema.pt 51 042-STDiT3-XL-2 240p 9:16 30 2 True ``` After generation, install the VBench package following our [installation](../docs/installation.md)'s sections of "Evaluation Dependencies". Then, run the following commands to evaluate the generated samples. ```bash python eval/vbench/calc_vbench.py /path/to/video_folder /path/to/model/ckpt ``` Finally, we obtain the scaled scores for the model by: ```bash python eval/vbench/tabulate_vbench_scores.py --score_dir path/to/score/dir ``` ## VBench-i2v [VBench-i2v](https://github.com/Vchitect/VBench/tree/master/vbench2_beta_i2v) is a benchmark for short image to video generation (beta version). Similarly, install the VBench package following our [installation](../docs/installation.md)'s sections of "Evaluation Dependencies". 
```bash # Step 1: generate the relevant videos # vbench i2v tasks; to evaluate all prompts, set start_index to 0 and end_index to 2000 bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log -5 start_index end_index # Alternatively, launch 8 jobs at once bash eval/vbench_i2v/launch.sh /path/to/ckpt num_frames model_name # Step 2: run vbench to evaluate the generated samples python eval/vbench_i2v/vbench_i2v.py /path/to/video_folder /path/to/model/ckpt # Note that you need to go to `VBench/vbench2_beta_i2v/utils.py` and change the hard-coded var `image_root` in the `load_i2v_dimension_info` function to your corresponding image folder. # Step 3: obtain the scaled scores python eval/vbench_i2v/tabulate_vbench_i2v_scores.py path/to/videos/folder path/to/your/model/ckpt # this will store the results under `eval/vbench_i2v` in the path/to/your/model/ckpt ``` As with VBench, you can specify resolution, aspect ratio, sampling steps, flow, and llm-refine: ```bash bash eval/vbench_i2v/launch.sh /path/to/ckpt num_frames model_name_for_log res_value aspect_ratio_value steps_value flow_value llm_refine_value # for example # bash eval/vbench_i2v/launch.sh /mnt/jfs-hdd/sora/checkpoints/outputs/042-STDiT3-XL-2/epoch1-global_step16200_llm_refine/ema.pt 51 042-STDiT3-XL-2 240p 9:16 30 2 True # if no flow control, use "None" instead ``` ## VAE Install the dependency packages following the "Evaluation Dependencies" section of our [installation](../docs/installation.md) guide. 
Then, run the following evaluation command:

```bash
# metric can be any one, or a list, of: ssim, psnr, lpips, flolpips
python eval/vae/eval_common_metric.py --batch_size 2 --real_video_dir path/to/original/videos --generated_video_dir path/to/generated/videos --device cuda --sample_fps 24 --crop_size 256 --resolution 256 --num_frames 17 --sample_rate 1 --metric ssim psnr lpips flolpips
```

================================================
FILE: Open-Sora/eval/human_eval/generate.sh
================================================
#!/bin/bash
# Launch one background image-generation job per GPU (0-7). Every job runs
# scripts/inference.py with the PixArt 1x2048MS config on its own contiguous
# slice of NUM_PER_GPU prompts and its own image size, logging to LOG_BASE.

set -x
set -e

TEXT_PATH=/home/data/sora_data/pixart-sigma-generated/text.txt
OUTPUT_PATH=/home/data/sora_data/pixart-sigma-generated/raw
CMD="python scripts/inference.py configs/pixart/inference/1x2048MS.py"

# CKPT is never assigned in this script. When the caller exports it, logs are
# written next to the checkpoint exactly as before. Otherwise fall back to a
# local log directory: `dirname` with no operand fails, and under `set -e`
# that would abort the script before any job is launched.
if [[ -n "${CKPT:-}" ]]; then
    LOG_BASE=$(dirname "$CKPT")/eval/generate
else
    LOG_BASE=logs/sample/generate
fi
mkdir -p "${LOG_BASE}"

# One "height width" pair per GPU; the array index is the GPU id.
IMAGE_SIZES=(
    "2048 2048"
    "1408 2816"
    "2816 1408"
    "1664 2304"
    "2304 1664"
    "1536 2560"
    "2560 1536"
    "2048 2048"
)
NUM_GPUS=${#IMAGE_SIZES[@]}

NUM_PER_GPU=10000
# Index of this launch round: round N covers prompts
# [N * NUM_PER_GPU * NUM_GPUS, (N + 1) * NUM_PER_GPU * NUM_GPUS).
N_LAUNCH=2
NUM_START=$((N_LAUNCH * NUM_PER_GPU * NUM_GPUS))

for i in "${!IMAGE_SIZES[@]}"; do
    # $CMD and ${IMAGE_SIZES[i]} are left unquoted on purpose so that each
    # one splits into separate arguments.
    CUDA_VISIBLE_DEVICES=$i $CMD \
        --prompt-path "$TEXT_PATH" \
        --save-dir "$OUTPUT_PATH" \
        --start-index $((NUM_START + NUM_PER_GPU * i)) \
        --end-index $((NUM_START + NUM_PER_GPU * (i + 1))) \
        --image-size ${IMAGE_SIZES[i]} \
        --verbose 1 \
        --batch-size 2 \
        >"${LOG_BASE}/${N_LAUNCH}_$((i + 1)).log" 2>&1 &
done

================================================
FILE: Open-Sora/eval/human_eval/launch.sh
================================================
#!/bin/bash
# Usage: bash eval/human_eval/launch.sh /path/to/ckpt num_frames model_name_for_log
# Starts eight eval/sample.sh jobs in the background, one per GPU, each with a
# different video task id; per-task logs go to <dirname of ckpt>/eval.

CKPT=$1
NUM_FRAMES=$2
MODEL_NAME=$3

# NOTE: CKPT_BASE is computed here but not referenced again in this script.
if [[ $CKPT == *"ema"* ]]; then
    parentdir=$(dirname $CKPT)
    CKPT_BASE=$(basename $parentdir)_ema
else
    CKPT_BASE=$(basename $CKPT)
fi
LOG_BASE=$(dirname $CKPT)/eval
mkdir -p ${LOG_BASE}
echo "Logging to $LOG_BASE"

GPUS=(0 1 2 3 4 5 6 7)
# TASK_ID_LIST=(1 2a 2b 2c 2d 2e 2f 2g) # move image to video task
TASK_ID_LIST=(2a 2b 2c 2d 2e 2f 2g 2h)
# FRAME_LIST=(1 $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES)

for i in "${!GPUS[@]}"; do
    CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -${TASK_ID_LIST[i]} >${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
done

# kill all by: pkill -f "inference"

================================================
FILE: Open-Sora/eval/loss/eval_loss.py
================================================
def main():
    """Evaluate the diffusion loss of a model over every configured bucket.

    Builds the text encoder, VAE, diffusion model and scheduler from the parsed
    config, then for each (resolution, num_frames) bucket with a batch size
    computes the per-sample loss at evenly spaced timesteps, averages it, and
    logs the result. The final "Evaluation losses: {...}" log line maps
    (resolution, num_frames) to the mean loss over the evaluated timesteps.
    """
    torch.set_grad_enabled(False)
    # ======================================================
    # configs & runtime variables
    # ======================================================
    # == parse configs ==
    cfg = parse_configs(training=False)

    # == device and dtype ==
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Validate the very value that is used below. Previously the assert checked
    # a default of "fp32" while the conversion silently defaulted to "bf16".
    cfg_dtype = cfg.get("dtype", "bf16")
    assert cfg_dtype in ["fp16", "bf16", "fp32"], f"Unknown mixed precision {cfg_dtype}"
    dtype = to_torch_dtype(cfg_dtype)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # == init distributed env ==
    colossalai.launch_from_torch({})
    DistCoordinator()
    set_random_seed(seed=cfg.get("seed", 1024))
    set_data_parallel_group(dist.group.WORLD)

    # == init logger ==
    logger = create_logger()
    logger.info("Eval loss configuration:\n %s", pformat(cfg.to_dict()))

    # ======================================================
    # build model & load weights
    # ======================================================
    logger.info("Building models...")
    # == build text-encoder and vae ==
    text_encoder = build_module(cfg.text_encoder, MODELS, device=device)
    vae = build_module(cfg.vae, MODELS).to(device, dtype).eval()

    # == build diffusion model ==
    input_size = (None, None, None)
    latent_size = vae.get_latent_size(input_size)
    model = (
        build_module(
            cfg.model,
            MODELS,
            input_size=latent_size,
            in_channels=vae.out_channels,
            caption_channels=text_encoder.output_dim,
            model_max_length=text_encoder.model_max_length,
        )
        .to(device, dtype)
        .eval()
    )
    text_encoder.y_embedder = model.y_embedder  # HACK: for classifier-free guidance

    # == build scheduler ==
    scheduler = build_module(cfg.scheduler, SCHEDULERS)

    use_mask = cfg.get("mask_ratios", None) is not None
    if use_mask:
        mask_generator = MaskGenerator(cfg.mask_ratios)

    # ======================================================
    # inference
    # ======================================================
    # start evaluation, prepare a dataset everytime in the loop
    bucket_config = cfg.get("bucket_config", None)
    # Check before indexing: the original asserted only after
    # bucket_config[cfg.resolution] had already been evaluated.
    assert bucket_config is not None, "bucket_config is required for evaluation"
    if cfg.get("resolution", None) is not None:
        bucket_config = {cfg.resolution: bucket_config[cfg.resolution]}
    logger.info("Evaluating bucket_config: %s", bucket_config)

    def build_dataset(resolution, num_frames, batch_size):
        """Return (dataloader, steps per rank, total batches) for one bucket."""
        single_bucket = {resolution: {num_frames: (1.0, batch_size)}}
        dataset = build_module(cfg.dataset, DATASETS)
        dataloader_args = dict(
            dataset=dataset,
            batch_size=None,
            num_workers=cfg.num_workers,
            shuffle=False,
            drop_last=False,
            pin_memory=True,
            process_group=get_data_parallel_group(),
        )
        dataloader, sampler = prepare_dataloader(bucket_config=single_bucket, **dataloader_args)
        num_batch = sampler.get_num_batch()
        num_steps_per_epoch = num_batch // dist.get_world_size()
        return dataloader, num_steps_per_epoch, num_batch

    evaluation_losses = {}
    start = cfg.start_index if "start_index" in cfg else 0
    end = cfg.end_index if "end_index" in cfg else len(bucket_config)
    for i, res in enumerate(bucket_config):
        if i < start or i >= end:  # skip task
            continue

        t_bucket = bucket_config[res]
        for num_frames, (_, batch_size) in t_bucket.items():
            if batch_size is None:
                continue
            logger.info("Evaluating resolution: %s, num_frames: %s", res, num_frames)
            dataloader, num_steps_per_epoch, num_batch = build_dataset(res, num_frames, batch_size)
            if num_batch == 0:
                logger.warning("No data for resolution: %s, num_frames: %s", res, num_frames)
                continue
            if num_steps_per_epoch == 0:
                # Fewer batches than ranks: no step would run and the average
                # below would divide by zero.
                logger.warning(
                    "Not enough batches (%s) for world size, skipping resolution: %s, num_frames: %s",
                    num_batch,
                    res,
                    num_frames,
                )
                continue

            evaluation_t_losses = []
            # Evenly spaced timesteps strictly inside (0, num_timesteps).
            for t in torch.linspace(0, scheduler.num_timesteps, cfg.get("num_eval_timesteps", 10) + 2)[1:-1]:
                loss_t = 0.0
                num_samples = 0
                dataloader_iter = iter(dataloader)
                for _ in tqdm(range(num_steps_per_epoch), desc=f"res: {res}, num_frames: {num_frames}, t: {t:.2f}"):
                    batch = next(dataloader_iter)
                    x = batch.pop("video").to(device, dtype)
                    y = batch.pop("text")
                    x = vae.encode(x)
                    model_args = text_encoder.encode(y)

                    # == mask ==
                    mask = None
                    if use_mask:
                        mask = mask_generator.get_masks(x)
                        model_args["x_mask"] = mask

                    # == video meta info ==
                    for k, v in batch.items():
                        model_args[k] = v.to(device, dtype)

                    # == diffusion loss computation ==
                    timestep = torch.tensor([t] * x.shape[0], device=device, dtype=dtype)
                    loss_dict = scheduler.training_losses(model, x, model_args, mask=mask, t=timestep)
                    losses = loss_dict["loss"]  # (batch_size)
                    num_samples += x.shape[0]
                    loss_t += losses.sum().item()
                loss_t /= num_samples
                evaluation_t_losses.append(loss_t)
                logger.info("resolution: %s, num_frames: %s, timestep: %.2f, loss: %.4f", res, num_frames, t, loss_t)

            evaluation_losses[(res, num_frames)] = sum(evaluation_t_losses) / len(evaluation_t_losses)
            logger.info(
                "Evaluation losses for resolution: %s, num_frames: %s, loss: %s\n %s",
                res,
                num_frames,
                evaluation_losses[(res, num_frames)],
                evaluation_t_losses,
            )
    logger.info("Evaluation losses: %s", evaluation_losses)
"""
usage:
    python tabulate_rl_loss.py --log_dir /path/to/logs/loss

Reads the per-task evaluation logs found in --log_dir and saves the tabulated
losses to <log_dir>/loss.json
"""

import argparse
import json
import os
from ast import literal_eval

# Log files expected in --log_dir (three image tasks, five video resolutions).
LOG_FILES = [
    "img_0.log",
    "img_1.log",
    "img_2.log",
    "144p_vid.log",
    "240p_vid.log",
    "360p_vid.log",
    "480p_vid.log",
    "720p_vid.log",
]


def parse_args():
    """Parse the command line; --log_dir is the directory holding the logs."""
    parser = argparse.ArgumentParser()
    # Without a value the script cannot build any path, so fail early in argparse.
    parser.add_argument("--log_dir", type=str, required=True)
    args = parser.parse_args()
    return args


def parse_eval_losses(lines):
    """Return the loss dict from the last log line containing "losses:".

    Scanning backwards (instead of blindly taking the final line) tolerates
    trailing blank lines or extra output after the summary.

    Raises:
        ValueError: if no line contains "losses:".
    """
    for line in reversed(lines):
        if "losses:" in line:
            # literal_eval only accepts Python literals, so log text is never executed.
            return literal_eval(line.split("losses:")[-1].strip())
    raise ValueError("no 'losses:' summary line found in log")


def tabulate_losses(log_dir, files=None):
    """Collect losses from *files* in *log_dir* as {resolution: {frame: "x.xxxx"}}.

    *files* defaults to LOG_FILES. Each parsed key is a (resolution, frame)
    pair; each loss is formatted to four decimals.
    """
    files = LOG_FILES if files is None else files
    loss_info = {}
    for fname in files:
        path = os.path.join(log_dir, fname)
        with open(path, "r", encoding="utf-8") as f:
            loss_dict = parse_eval_losses(f.readlines())
        for (resolution, frame), loss in loss_dict.items():
            loss_info.setdefault(resolution, {})[frame] = format(loss, ".4f")
    return loss_info


if __name__ == "__main__":
    args = parse_args()
    loss_info = tabulate_losses(args.log_dir)

    # Convert and write JSON object to file
    output_file_path = os.path.join(args.log_dir, "loss.json")
    with open(output_file_path, "w") as outfile:
        json.dump(loss_info, outfile, indent=4, sort_keys=True)
    print(f"results saved to: {output_file_path}")
# Image sampling suite (--num-frames 1). Also called inside run_video_b.
function run_image() { # 14min
    local t2i_prompts=assets/texts/t2i_samples.txt
    local entry prompt_file tag ratio

    # 1.1 1024x1024
    eval $CMD --ckpt-path $CKPT --prompt-path $t2i_prompts --save-dir $OUTPUT \
        --num-frames 1 --resolution 1024 --aspect-ratio 1:1 \
        --sample-name image_1024_1_1 --batch-size $DEFAULT_BS

    # 1.2 240x426
    eval $CMD --ckpt-path $CKPT --prompt-path $t2i_prompts --save-dir $OUTPUT \
        --num-frames 1 --resolution 240p --aspect-ratio 9:16 \
        --sample-name image_240p_9_16 --end-index 3 --batch-size $DEFAULT_BS

    # 1.3 512x512, first three prompts of each prompt file.
    # Each entry is "<prompt file stem>:<tag used in the sample name>".
    for entry in t2i_samples:t2i t2v_samples:t2v t2v_short:short t2v_sora:sora; do
        prompt_file=assets/texts/${entry%%:*}.txt
        tag=${entry##*:}
        eval $CMD --ckpt-path $CKPT --prompt-path $prompt_file --save-dir $OUTPUT \
            --num-frames 1 --resolution 512 --aspect-ratio 1:1 \
            --sample-name image_${tag}_512_1_1 --end-index 3 --batch-size $DEFAULT_BS
    done

    # 1.4 720p multi-resolution: one fixed prompt across seven aspect ratios.
    # PROMPT is deliberately left global, as in the original.
    PROMPT="Bright scene, aerial view,ancient city, fantasy, gorgeous light, mirror reflection, high detail, wide angle lens."
    for ratio in 1:1 9:16 16:9 4:3 3:4 1:2 2:1; do
        # ${ratio/:/_} turns e.g. 9:16 into 9_16 for the sample name.
        eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT \
            --num-frames 1 --resolution 720p --aspect-ratio $ratio \
            --sample-name image_720p_${ratio/:/_}
    done
}
### previous cmds # 42min, sample & multi-resolution # # sample, 144p, 9:16, 2s # eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 2s --resolution 144p --aspect-ratio 9:16 --sample-name sample_2s_144p_9_16 --batch-size $DEFAULT_BS # # sample, 240p, 9:16, 2s # eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 2s --resolution 240p --aspect-ratio 9:16 --sample-name sample_2s_240p_9_16 --batch-size $DEFAULT_BS # # sample, 240p, 9:16, 4s # eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 4s --resolution 240p --aspect-ratio 9:16 --sample-name sample_4s_240p_9_16 --batch-size $DEFAULT_BS # # sample, 240p, 9:16, 8s # eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 8s --resolution 240p --aspect-ratio 9:16 --sample-name sample_8s_240p_9_16 --batch-size $DEFAULT_BS # # sample, 480p, 9:16, 2s # eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 2s --resolution 480p --aspect-ratio 9:16 --sample-name sample_2s_480p_9_16 --batch-size $DEFAULT_BS # # sample, 480p, 9:16, 4s # eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 4s --resolution 480p --aspect-ratio 9:16 --sample-name sample_4s_480p_9_16 --batch-size $DEFAULT_BS # # sample, 720p, 9:16, 2s # eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 9:16 --sample-name sample_2s_720p_9_16 --batch-size $DEFAULT_BS # sample, 720p, 9:16, 2s eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 4s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 9:16 --sample-name sample_4s_${ASPECT_RATIO_INCR_2} --batch-size $DEFAULT_BS # sample, 480p, 9:16, 8s eval $CMD --ckpt-path $CKPT 
--prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 8s --resolution ${ASPECT_RATIO_INCR_1} --aspect-ratio 9:16 --sample-name sample_8s_${ASPECT_RATIO_INCR_1} --batch-size $DEFAULT_BS # sample, 360p, 9:16, 16s eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 16s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 --sample-name sample_16s_${BASE_ASPECT_RATIO} --batch-size $DEFAULT_BS } function run_video_b() { # 18min + 14min = 32min, short 16x240p & 64x240p # run image, 14min echo "Inside run_video_b, running image samples..." run_image echo "Inside run_video_b, running video samples..." ### previous cmds, 18min # # short, 240p, 9:16, 4s # eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 4s --resolution 240p --aspect-ratio 9:16 --sample-name short_4s_240p_9_16 --batch-size $DEFAULT_BS # # short, 240p, 9:16, 8s # eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 8s --resolution 240p --aspect-ratio 9:16 --sample-name short_8s_240p_9_16 --batch-size $DEFAULT_BS # short, 480p, 9:16, 8s: ~24min eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 8s --resolution ${ASPECT_RATIO_INCR_1} --aspect-ratio 9:16 --sample-name short_8s_${ASPECT_RATIO_INCR_1} --batch-size $DEFAULT_BS # short, 360p, 9:16, 16s: ~24min eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 16s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 --sample-name short_16s_${BASE_ASPECT_RATIO} --batch-size $DEFAULT_BS } function run_video_c() { ### previous cmds, 60min # # sora, 240p, 16:9, 2s # eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 2s --resolution 240p --aspect-ratio 16:9 --sample-name sora_2s_240p_16_9 --batch-size $DEFAULT_BS # # sora, 240p, 9:16, 2s # eval $CMD 
function run_video_d() {
    # sora prompts, 8s, 9:16, one resolution level above BASE_ASPECT_RATIO.
    # Covers prompts [0, 16); this first slice was moved here from
    # run_video_e (~30min).
    local prompts=assets/texts/t2v_sora.txt
    local res=${ASPECT_RATIO_INCR_1}

    eval $CMD --ckpt-path $CKPT --prompt-path $prompts --save-dir $OUTPUT \
        --num-frames 8s --resolution $res --aspect-ratio 9:16 \
        --sample-name sora_8s_$res --batch-size $DEFAULT_BS \
        --start-index 0 --end-index 16
}
function run_video_f() { # 60min
    # sora prompts, 4s, 9:16, two resolution levels above BASE_ASPECT_RATIO.
    local prompts=assets/texts/t2v_sora.txt
    local res=${ASPECT_RATIO_INCR_2}

    eval $CMD --ckpt-path $CKPT --prompt-path $prompts --save-dir $OUTPUT \
        --num-frames 4s --resolution $res --aspect-ratio 9:16 \
        --sample-name sora_4s_$res --batch-size $DEFAULT_BS
}
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 1:1 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_1_1 # 16:9 eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 16:9 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_16_9 # 9:16 eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 9:16 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_9_16 # 4:3 eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 4:3 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_4_3 # 3:4 eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 3:4 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_3_4 # 1:2 eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 1:2 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_1_2 # 2:1 eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 2:1 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_2_1 # add motion score eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --sample-name motion_2s_${ASPECT_RATIO_INCR_2} --prompt \ \"A stylish woman walking in the street of Tokyo.\" \"A stylish woman walking in the street of Tokyo. motion score: 0.0\" \ \"A stylish woman walking in the street of Tokyo. motion score: 2.0\" \ \"A stylish woman walking in the street of Tokyo. motion score: 4.0\" \ \"A stylish woman walking in the street of Tokyo. 
motion score: 6.0\" \ \"A stylish woman walking in the street of Tokyo. motion score: 10.0\" \ \"A stylish woman walking in the street of Tokyo. motion score: 25.0\" \ \"A stylish woman walking in the street of Tokyo. motion score: 50.0\" \ \"A stylish woman walking in the street of Tokyo. motion score: 100.0\" # add aes score eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --sample-name aes_2s_${ASPECT_RATIO_INCR_2} --prompt \ \"A stylish woman walking in the street of Tokyo.\" \"A stylish woman walking in the street of Tokyo. aesthetic score: 4.0\" \ \"A stylish woman walking in the street of Tokyo. aesthetic score: 4.5\" \ \"A stylish woman walking in the street of Tokyo. aesthetic score: 5.0\" \ \"A stylish woman walking in the street of Tokyo. aesthetic score: 5.5\" \ \"A stylish woman walking in the street of Tokyo. aesthetic score: 6.0\" \ \"A stylish woman walking in the street of Tokyo. aesthetic score: 6.5\" \ \"A stylish woman walking in the street of Tokyo. 
# Generate VBench samples for prompts [$1, $2) of all_dimension.txt.
#   $1 - start index, $2 - end index
# Optional globals select the flags passed to inference:
#   VBENCH_RES + VBENCH_ASP_RATIO  -> --resolution/--aspect-ratio
#                                     (otherwise --image-size VBENCH_H VBENCH_W)
#   NUM_SAMPLING_STEPS             -> --num-sampling-steps
#   FLOW                           -> --flow (skipped when FLOW is "None" and
#                                     LLM_REFINE is set)
#   LLM_REFINE                     -> --llm-refine (only when FLOW is also set)
# The extra flags are only considered when resolution and aspect ratio are given.
function run_vbench() {
    local size_opts extra_opts=""

    if [ -z "${VBENCH_RES}" ] || [ -z "${VBENCH_ASP_RATIO}" ]; then
        size_opts="--image-size $VBENCH_H $VBENCH_W"
    else
        size_opts="--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO"
        if [ -n "${NUM_SAMPLING_STEPS}" ]; then
            extra_opts="--num-sampling-steps ${NUM_SAMPLING_STEPS}"
            if [ -n "${FLOW}" ]; then
                if [ -z "${LLM_REFINE}" ]; then
                    extra_opts+=" --flow ${FLOW}"
                elif [ "${FLOW}" = "None" ]; then
                    extra_opts+=" --llm-refine ${LLM_REFINE}"
                else
                    extra_opts+=" --flow ${FLOW} --llm-refine ${LLM_REFINE}"
                fi
            fi
        fi
    fi

    # One command for every case. The original "steps set, FLOW unset" branch
    # used the undefined $VBENCH_NUM_SAMPLE5, leaving --num-sample without a value.
    eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample $VBENCH_NUM_SAMPLE \
        --prompt-path assets/texts/VBench/all_dimension.txt \
        $size_opts $extra_opts \
        --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
}
VBENCH_I2V_H=256
VBENCH_I2V_W=256

# Generate VBench image-to-video samples for prompts [$1, $2) of all_i2v.txt.
# Flag selection depends on the optional globals VBENCH_RES, VBENCH_ASP_RATIO,
# NUM_SAMPLING_STEPS, FLOW and LLM_REFINE.
function run_vbench_i2v() {
    local size_opts extra_opts=""

    if [ -z ${VBENCH_RES} ] || [ -z ${VBENCH_ASP_RATIO} ]; then
        # No resolution/aspect ratio given: fall back to the fixed image size.
        size_opts="--image-size $VBENCH_I2V_H $VBENCH_I2V_W"
    else
        size_opts="--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO"
        if [ -n "${NUM_SAMPLING_STEPS}" ]; then
            extra_opts="--num-sampling-steps ${NUM_SAMPLING_STEPS}"
            if [ -n "${FLOW}" ]; then
                if [ -z ${LLM_REFINE} ]; then
                    extra_opts+=" --flow ${FLOW}"
                elif [ "${FLOW}" = "None" ]; then
                    # FLOW of "None" with LLM refinement: pass only --llm-refine.
                    extra_opts+=" --llm-refine ${LLM_REFINE}"
                else
                    extra_opts+=" --flow ${FLOW} --llm-refine ${LLM_REFINE}"
                fi
            fi
        fi
    fi

    eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
        --prompt-path assets/texts/VBench/all_i2v.txt \
        $size_opts $extra_opts \
        --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
}
if [ -z ${VBENCH_START_INDEX} ] || [ -z ${VBENCH_END_INDEX} ]; then echo "need to set start_index and end_index" else run_vbench_i2v $VBENCH_START_INDEX $VBENCH_END_INDEX fi fi done ### End end=$(date +%s) runtime=$((end - start)) echo "Runtime: $runtime seconds" ================================================ FILE: Open-Sora/eval/vae/cal_flolpips.py ================================================ import sys import numpy as np import torch from tqdm import tqdm sys.path.append(".") from flolpips.flolpips import FloLPIPS from flolpips.pwcnet import Network as PWCNet loss_fn = FloLPIPS(net="alex", version="0.1").eval().requires_grad_(False) flownet = PWCNet().eval().requires_grad_(False) def trans(x): return x def calculate_flolpips(videos1, videos2, device): global loss_fn, flownet print("calculate_flowlpips...") loss_fn = loss_fn.to(device) flownet = flownet.to(device) if videos1.shape != videos2.shape: print("Warning: the shape of videos are not equal.") min_frames = min(videos1.shape[1], videos2.shape[1]) videos1 = videos1[:, :min_frames] videos2 = videos2[:, :min_frames] videos1 = trans(videos1) videos2 = trans(videos2) flolpips_results = [] for video_num in tqdm(range(videos1.shape[0])): video1 = videos1[video_num].to(device) video2 = videos2[video_num].to(device) frames_rec = video1[:-1] frames_rec_next = video1[1:] frames_gt = video2[:-1] frames_gt_next = video2[1:] t, c, h, w = frames_gt.shape flow_gt = flownet(frames_gt, frames_gt_next) flow_dis = flownet(frames_rec, frames_rec_next) flow_diff = flow_gt - flow_dis flolpips = loss_fn.forward(frames_gt, frames_rec, flow_diff, normalize=True) flolpips_results.append(flolpips.cpu().numpy().tolist()) flolpips_results = np.array(flolpips_results) # [batch_size, num_frames] flolpips = {} flolpips_std = {} for clip_timestamp in range(flolpips_results.shape[1]): flolpips[clip_timestamp] = np.mean(flolpips_results[:, clip_timestamp], axis=-1) flolpips_std[clip_timestamp] = np.std(flolpips_results[:, 
def trans(x):
    """Prepare a tensor for LPIPS: force three channels and rescale to [-1, 1].

    Args:
        x: Tensor whose third-from-last axis is the channel axis, for example
            ``[batch, time, channel, h, w]`` or ``[time, channel, h, w]``,
            with values in ``[0, 1]``.

    Returns:
        A tensor of the same rank as ``x``. Single-channel input is tiled to
        three channels; values are mapped from ``[0, 1]`` to ``[-1, 1]``.
    """
    # Greyscale input: tile the channel axis only. Building the repeat spec
    # from the actual rank keeps 4-D input 4-D; a fixed five-element spec
    # would silently prepend a batch axis.
    if x.shape[-3] == 1:
        repeats = [1] * x.dim()
        repeats[-3] = 3
        x = x.repeat(*repeats)

    # value range [0, 1] -> [-1, 1]
    return x * 2 - 1
def img_psnr(img1, img2, data_range=1.0):
    """Return the peak signal-to-noise ratio, in dB, between two images.

    Args:
        img1: First image as a NumPy array.
        img2: Second image, same shape as ``img1``.
        data_range: Largest possible pixel value. The default of 1.0 matches
            images scaled to ``[0, 1]``; pass 255 for 8-bit images.

    Returns:
        The PSNR as a float. Images that are (numerically) identical return
        100.0 instead of infinity.
    """
    # Dividing by 1.0 promotes integer arrays to float before subtracting, so
    # unsigned pixel values cannot wrap around.
    mse = np.mean((img1 / 1.0 - img2 / 1.0) ** 2)

    # Cap the score for a vanishing error; the threshold scales with the
    # squared peak so it means the same thing for every data_range.
    if mse < 1e-10 * data_range**2:
        return 100.0
    return 20 * math.log10(data_range / math.sqrt(mse))
def main():
    """Smoke-test ``calculate_psnr`` on two all-zero video batches and print the result as JSON."""
    import json

    # (videos, frames, channels, height, width)
    batch_shape = (8, 50, 3, 64, 64)
    reference = torch.zeros(*batch_shape, requires_grad=False)
    candidate = torch.zeros(*batch_shape, requires_grad=False)

    report = calculate_psnr(reference, candidate)
    print(json.dumps(report, indent=4))
def calculate_ssim_function(img1, img2):
    """Return the SSIM of two images, averaging over channels for RGB input.

    Supported layouts are a 2-D ``(H, W)`` array and a channel-first 3-D array
    with one or three channels. Pixel values are expected in ``[0, 1]``.

    Args:
        img1: First image as a NumPy array.
        img2: Second image, same shape as ``img1``.

    Returns:
        The SSIM score; for three-channel input, the mean of the per-channel
        scores.

    Raises:
        ValueError: If the shapes differ, or the layout is not one of the
            supported ones.
    """
    # ssim is the only metric extremely sensitive to gray being compared to b/w
    if img1.shape != img2.shape:
        raise ValueError("Input images must have the same dimensions.")

    if img1.ndim == 2:
        return ssim(img1, img2)

    if img1.ndim == 3:
        channels = img1.shape[0]
        if channels == 3:
            return np.array([ssim(img1[i], img2[i]) for i in range(3)]).mean()
        if channels == 1:
            return ssim(np.squeeze(img1), np.squeeze(img2))

    # Every unsupported layout ends here, including 3-D input with an
    # unexpected channel count, so no call can fall through and return None.
    raise ValueError("Wrong input image dimensions.")
torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False) torch.device("cuda") import json result = calculate_ssim(videos1, videos2) print(json.dumps(result, indent=4)) if __name__ == "__main__": main() ================================================ FILE: Open-Sora/eval/vae/eval_common_metric.py ================================================ """Calculates the CLIP Scores The CLIP model is a contrastively learned language-image model. There is an image encoder and a text encoder. It is believed that the CLIP model could measure the similarity of cross modalities. Please find more information from https://github.com/openai/CLIP. The CLIP Score measures the Cosine Similarity between two embedded features. This repository utilizes the pretrained CLIP Model to calculate the mean average of cosine similarities. See --help to see further details. Code adapted from https://github.com/mseitzer/pytorch-fid and https://github.com/openai/CLIP. Copyright 2023 The Hong Kong Polytechnic University Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
class VideoDataset(Dataset):
    """Dataset of (real, generated) video pairs matched by sorted file order.

    Both directories are listed once at construction time; item ``i`` pairs
    the ``i``-th real file with the ``i``-th generated file after sorting.
    The dataset length is the number of real files.

    Args:
        real_video_dir: Directory holding the reference videos.
        generated_video_dir: Directory holding the videos to evaluate.
        num_frames: Number of frames to sample from each video.
        sample_rate: Stride, in frames, used to size the sampling window.
        crop_size: Forwarded to ``_preprocess`` as the centre-crop size.
        resolution: Forwarded to ``_preprocess`` as the short-side size.
    """

    def __init__(
        self,
        real_video_dir,
        generated_video_dir,
        num_frames,
        sample_rate=1,
        crop_size=None,
        resolution=128,
    ) -> None:
        super().__init__()
        self.real_video_files = self._combine_without_prefix(real_video_dir)
        self.generated_video_files = self._combine_without_prefix(generated_video_dir)
        self.num_frames = num_frames
        self.sample_rate = sample_rate
        self.crop_size = crop_size
        self.short_size = resolution

    def __len__(self):
        return len(self.real_video_files)

    def __getitem__(self, index):
        """Return ``{"real": tensor, "generated": tensor}`` for pair ``index``."""
        if index >= len(self):
            raise IndexError
        real_video_file = self.real_video_files[index]
        generated_video_file = self.generated_video_files[index]
        print(real_video_file, generated_video_file)
        real_video_tensor = self._load_video(real_video_file)
        generated_video_tensor = self._load_video(generated_video_file)
        return {"real": real_video_tensor, "generated": generated_video_tensor}

    def _load_video(self, video_path):
        """Decode evenly spaced frames from ``video_path`` and preprocess them."""
        num_frames = self.num_frames
        sample_rate = self.sample_rate
        decord_vr = VideoReader(video_path, ctx=cpu(0))
        total_frames = len(decord_vr)
        sample_frames_len = sample_rate * num_frames

        s = 0
        if total_frames >= sample_frames_len:
            e = s + sample_frames_len
        else:
            # The clip is shorter than the requested window: use all of it and
            # shrink the number of sampled frames proportionally.
            e = total_frames
            num_frames = int(total_frames / sample_frames_len * num_frames)
            print(
                f"sample_frames_len {sample_frames_len}, only can sample {num_frames * sample_rate}",
                video_path,
                total_frames,
            )

        frame_id_list = np.linspace(s, e - 1, num_frames, dtype=int)
        video_data = decord_vr.get_batch(frame_id_list).asnumpy()
        video_data = torch.from_numpy(video_data)
        # Moves the last axis to position 1: (T, H, W, C) -> (T, C, H, W),
        # assuming decord returns frames channel-last - TODO confirm.
        video_data = video_data.permute(0, 3, 1, 2)
        return _preprocess(video_data, short_size=self.short_size, crop_size=self.crop_size)

    def _combine_without_prefix(self, folder_path, prefix="."):
        """Return the sorted paths of the regular files in ``folder_path``.

        Entries whose name starts with ``prefix`` (hidden files by default)
        and non-file entries are skipped. The directory is created when it
        does not exist, in which case the result is an empty list.
        """
        os.makedirs(folder_path, exist_ok=True)
        # startswith() also handles prefixes longer than one character, which
        # comparing against name[0] never could.
        return sorted(
            osp.join(folder_path, name)
            for name in os.listdir(folder_path)
            if not name.startswith(prefix) and osp.isfile(osp.join(folder_path, name))
        )
def main():
    """Parse the command line, build the paired video loader and print the metrics."""
    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument("--batch_size", type=int, default=2, help="Batch size to use")
    # Both directories are mandatory: without them the dataset has nothing to
    # list, so fail in argument parsing rather than deep inside the dataset.
    parser.add_argument("--real_video_dir", type=str, required=True, help=("the path of real videos`"))
    parser.add_argument("--generated_video_dir", type=str, required=True, help=("the path of generated videos`"))
    parser.add_argument("--device", type=str, default=None, help="Device to use. Like cuda, cuda:0 or cpu")
    # Default of None lets the CPU-count fallback below actually run, as the
    # help text promises.
    parser.add_argument(
        "--num_workers",
        type=int,
        default=None,
        help=("Number of processes to use for data loading. " "Defaults to `min(8, num_cpus)`"),
    )
    parser.add_argument("--sample_fps", type=int, default=30)
    parser.add_argument("--resolution", type=int, default=336)
    parser.add_argument("--crop_size", type=int, default=None)
    parser.add_argument("--num_frames", type=int, default=100)
    parser.add_argument("--sample_rate", type=int, default=1)
    parser.add_argument("--subset_size", type=int, default=None)
    parser.add_argument("--metric", nargs="+", default=[])
    parser.add_argument("--fvd_method", type=str, default="styleganv", choices=["styleganv", "videogpt"])
    args = parser.parse_args()

    if args.device is None:
        device = torch.device("cuda" if (torch.cuda.is_available()) else "cpu")
    else:
        device = torch.device(args.device)
    # calculate_common_metric forwards args.device to the LPIPS/FloLPIPS
    # helpers, so store the resolved device there instead of leaving None.
    args.device = str(device)

    if args.num_workers is None:
        try:
            num_cpus = len(os.sched_getaffinity(0))
        except AttributeError:
            # os.sched_getaffinity is not available under Windows, use
            # os.cpu_count instead (which may not return the *available* number
            # of CPUs).
            num_cpus = os.cpu_count()
        num_workers = min(num_cpus, 8) if num_cpus is not None else 0
    else:
        num_workers = args.num_workers

    dataset = VideoDataset(
        args.real_video_dir,
        args.generated_video_dir,
        num_frames=args.num_frames,
        sample_rate=args.sample_rate,
        crop_size=args.crop_size,
        resolution=args.resolution,
    )

    if args.subset_size:
        # Clamp so a subset larger than the dataset evaluates everything
        # instead of raising IndexError part-way through.
        indices = range(min(args.subset_size, len(dataset)))
        dataset = Subset(dataset, indices=indices)

    # Pinned host memory only helps when batches are copied to a CUDA device.
    dataloader = DataLoader(
        dataset,
        args.batch_size,
        num_workers=num_workers,
        pin_memory=device.type == "cuda",
    )

    metric_score = calculate_common_metric(args, dataloader, device)
    print("metric: ", args.metric, " ", metric_score)
blockIdx.z; int ch_off = threadIdx.x; // Load 3D patch into shared shared memory for (int j = 0; j < 1; j++) { // HEIGHT for (int i = 0; i < 1; i++) { // WIDTH int ji_off = (j + i) * SIZE_3(rbot0); for (int ch = ch_off; ch < SIZE_3(rbot0); ch += 32) { // CHANNELS int idx1 = ((item * SIZE_1(rbot0) + y1+j) * SIZE_2(rbot0) + x1+i) * SIZE_3(rbot0) + ch; int idxPatchData = ji_off + ch; patch_data[idxPatchData] = rbot0[idx1]; } } } __syncthreads(); __shared__ float sum[32]; // Compute correlation for (int top_channel = 0; top_channel < SIZE_1(top); top_channel++) { sum[ch_off] = 0; int s2o = top_channel % 9 - 4; int s2p = top_channel / 9 - 4; for (int j = 0; j < 1; j++) { // HEIGHT for (int i = 0; i < 1; i++) { // WIDTH int ji_off = (j + i) * SIZE_3(rbot0); for (int ch = ch_off; ch < SIZE_3(rbot0); ch += 32) { // CHANNELS int x2 = x1 + s2o; int y2 = y1 + s2p; int idxPatchData = ji_off + ch; int idx2 = ((item * SIZE_1(rbot0) + y2+j) * SIZE_2(rbot0) + x2+i) * SIZE_3(rbot0) + ch; sum[ch_off] += patch_data[idxPatchData] * rbot1[idx2]; } } } __syncthreads(); if (ch_off == 0) { float total_sum = 0; for (int idx = 0; idx < 32; idx++) { total_sum += sum[idx]; } const int sumelems = SIZE_3(rbot0); const int index = ((top_channel*SIZE_2(top) + blockIdx.y)*SIZE_3(top))+blockIdx.x; top[index + item*SIZE_1(top)*SIZE_2(top)*SIZE_3(top)] = total_sum / (float)sumelems; } } } """ kernel_Correlation_updateGradFirst = """ #define ROUND_OFF 50000 extern "C" __global__ void kernel_Correlation_updateGradFirst( const int n, const int intSample, const float* rbot0, const float* rbot1, const float* gradOutput, float* gradFirst, float* gradSecond ) { for (int intIndex = (blockIdx.x * blockDim.x) + threadIdx.x; intIndex < n; intIndex += blockDim.x * gridDim.x) { int n = intIndex % SIZE_1(gradFirst); // channels int l = (intIndex / SIZE_1(gradFirst)) % SIZE_3(gradFirst) + 4; // w-pos int m = (intIndex / SIZE_1(gradFirst) / SIZE_3(gradFirst)) % SIZE_2(gradFirst) + 4; // h-pos // round_off is a trick 
to enable integer division with ceil, even for negative numbers // We use a large offset, for the inner part not to become negative. const int round_off = ROUND_OFF; const int round_off_s1 = round_off; // We add round_off before_s1 the int division and subtract round_off after it, to ensure the formula matches ceil behavior: int xmin = (l - 4 + round_off_s1 - 1) + 1 - round_off; // ceil (l - 4) int ymin = (m - 4 + round_off_s1 - 1) + 1 - round_off; // ceil (l - 4) // Same here: int xmax = (l - 4 + round_off_s1) - round_off; // floor (l - 4) int ymax = (m - 4 + round_off_s1) - round_off; // floor (m - 4) float sum = 0; if (xmax>=0 && ymax>=0 && (xmin<=SIZE_3(gradOutput)-1) && (ymin<=SIZE_2(gradOutput)-1)) { xmin = max(0,xmin); xmax = min(SIZE_3(gradOutput)-1,xmax); ymin = max(0,ymin); ymax = min(SIZE_2(gradOutput)-1,ymax); for (int p = -4; p <= 4; p++) { for (int o = -4; o <= 4; o++) { // Get rbot1 data: int s2o = o; int s2p = p; int idxbot1 = ((intSample * SIZE_1(rbot0) + (m+s2p)) * SIZE_2(rbot0) + (l+s2o)) * SIZE_3(rbot0) + n; float bot1tmp = rbot1[idxbot1]; // rbot1[l+s2o,m+s2p,n] // Index offset for gradOutput in following loops: int op = (p+4) * 9 + (o+4); // index[o,p] int idxopoffset = (intSample * SIZE_1(gradOutput) + op); for (int y = ymin; y <= ymax; y++) { for (int x = xmin; x <= xmax; x++) { int idxgradOutput = (idxopoffset * SIZE_2(gradOutput) + y) * SIZE_3(gradOutput) + x; // gradOutput[x,y,o,p] sum += gradOutput[idxgradOutput] * bot1tmp; } } } } } const int sumelems = SIZE_1(gradFirst); const int bot0index = ((n * SIZE_2(gradFirst)) + (m-4)) * SIZE_3(gradFirst) + (l-4); gradFirst[bot0index + intSample*SIZE_1(gradFirst)*SIZE_2(gradFirst)*SIZE_3(gradFirst)] = sum / (float)sumelems; } } """ kernel_Correlation_updateGradSecond = """ #define ROUND_OFF 50000 extern "C" __global__ void kernel_Correlation_updateGradSecond( const int n, const int intSample, const float* rbot0, const float* rbot1, const float* gradOutput, float* gradFirst, float* gradSecond ) { 
for (int intIndex = (blockIdx.x * blockDim.x) + threadIdx.x; intIndex < n; intIndex += blockDim.x * gridDim.x) { int n = intIndex % SIZE_1(gradSecond); // channels int l = (intIndex / SIZE_1(gradSecond)) % SIZE_3(gradSecond) + 4; // w-pos int m = (intIndex / SIZE_1(gradSecond) / SIZE_3(gradSecond)) % SIZE_2(gradSecond) + 4; // h-pos // round_off is a trick to enable integer division with ceil, even for negative numbers // We use a large offset, for the inner part not to become negative. const int round_off = ROUND_OFF; const int round_off_s1 = round_off; float sum = 0; for (int p = -4; p <= 4; p++) { for (int o = -4; o <= 4; o++) { int s2o = o; int s2p = p; //Get X,Y ranges and clamp // We add round_off before_s1 the int division and subtract round_off after it, to ensure the formula matches ceil behavior: int xmin = (l - 4 - s2o + round_off_s1 - 1) + 1 - round_off; // ceil (l - 4 - s2o) int ymin = (m - 4 - s2p + round_off_s1 - 1) + 1 - round_off; // ceil (l - 4 - s2o) // Same here: int xmax = (l - 4 - s2o + round_off_s1) - round_off; // floor (l - 4 - s2o) int ymax = (m - 4 - s2p + round_off_s1) - round_off; // floor (m - 4 - s2p) if (xmax>=0 && ymax>=0 && (xmin<=SIZE_3(gradOutput)-1) && (ymin<=SIZE_2(gradOutput)-1)) { xmin = max(0,xmin); xmax = min(SIZE_3(gradOutput)-1,xmax); ymin = max(0,ymin); ymax = min(SIZE_2(gradOutput)-1,ymax); // Get rbot0 data: int idxbot0 = ((intSample * SIZE_1(rbot0) + (m-s2p)) * SIZE_2(rbot0) + (l-s2o)) * SIZE_3(rbot0) + n; float bot0tmp = rbot0[idxbot0]; // rbot1[l+s2o,m+s2p,n] // Index offset for gradOutput in following loops: int op = (p+4) * 9 + (o+4); // index[o,p] int idxopoffset = (intSample * SIZE_1(gradOutput) + op); for (int y = ymin; y <= ymax; y++) { for (int x = xmin; x <= xmax; x++) { int idxgradOutput = (idxopoffset * SIZE_2(gradOutput) + y) * SIZE_3(gradOutput) + x; // gradOutput[x,y,o,p] sum += gradOutput[idxgradOutput] * bot0tmp; } } } } } const int sumelems = SIZE_1(gradSecond); const int bot1index = ((n * 
def cupy_kernel(strFunction, objVariables):
    """Return the kernel source named ``strFunction`` with its macros expanded.

    The source template is looked up in this module's globals. Two macros are
    substituted using the tensors in ``objVariables``:

    * ``SIZE_<d>(name)`` becomes the size of tensor ``name`` along dim ``d``.
    * ``VALUE_<n>(name, i0, ..., i<n-1>)`` becomes a flat index expression
      ``name[((i0)*stride0)+...]`` built from the tensor's strides; ``{`` and
      ``}`` inside an index stand for parentheses.

    Args:
        strFunction: Name of the module-level string holding the template.
        objVariables: Mapping from tensor name to an object providing
            ``size()`` and ``stride()``.

    Returns:
        The expanded kernel source as a string.
    """
    # Raw strings: "\(" and "\)" are invalid escapes in a normal literal and
    # trigger a SyntaxWarning on Python 3.12.
    strSizePattern = r"(SIZE_)([0-4])(\()([^\)]*)(\))"
    strValuePattern = r"(VALUE_)([0-4])(\()([^\)]+)(\))"

    strKernel = globals()[strFunction]

    objMatch = re.search(strSizePattern, strKernel)
    while objMatch is not None:
        intArg = int(objMatch.group(2))
        strTensor = objMatch.group(4)
        intSizes = objVariables[strTensor].size()

        # replace() rewrites every occurrence of this exact macro at once.
        strKernel = strKernel.replace(objMatch.group(), str(intSizes[intArg]))
        objMatch = re.search(strSizePattern, strKernel)
    # end

    objMatch = re.search(strValuePattern, strKernel)
    while objMatch is not None:
        intArgs = int(objMatch.group(2))
        strArgs = objMatch.group(4).split(",")

        strTensor = strArgs[0]
        intStrides = objVariables[strTensor].stride()
        strIndex = [
            "((" + strArgs[intArg + 1].replace("{", "(").replace("}", ")").strip() + ")*" + str(intStrides[intArg]) + ")"
            for intArg in range(intArgs)
        ]

        strKernel = strKernel.replace(objMatch.group(0), strTensor + "[" + "+".join(strIndex) + "]")
        objMatch = re.search(strValuePattern, strKernel)
    # end

    return strKernel


# end
first.shape[1], first.shape[0]]), block=tuple([16, 1, 1]), args=[n, first.data_ptr(), rbot0.data_ptr()], ) n = second.shape[2] * second.shape[3] cupy_launch( "kernel_Correlation_rearrange", cupy_kernel("kernel_Correlation_rearrange", {"input": second, "output": rbot1}), )( grid=tuple([int((n + 16 - 1) / 16), second.shape[1], second.shape[0]]), block=tuple([16, 1, 1]), args=[n, second.data_ptr(), rbot1.data_ptr()], ) n = output.shape[1] * output.shape[2] * output.shape[3] cupy_launch( "kernel_Correlation_updateOutput", cupy_kernel("kernel_Correlation_updateOutput", {"rbot0": rbot0, "rbot1": rbot1, "top": output}), )( grid=tuple([output.shape[3], output.shape[2], output.shape[0]]), block=tuple([32, 1, 1]), shared_mem=first.shape[1] * 4, args=[n, rbot0.data_ptr(), rbot1.data_ptr(), output.data_ptr()], ) elif first.is_cuda == False: raise NotImplementedError() # end return output # end @staticmethod def backward(self, gradOutput): first, second, rbot0, rbot1 = self.saved_tensors gradOutput = gradOutput.contiguous() assert gradOutput.is_cuda == True gradFirst = ( first.new_zeros([first.shape[0], first.shape[1], first.shape[2], first.shape[3]]) if self.needs_input_grad[0] == True else None ) gradSecond = ( first.new_zeros([first.shape[0], first.shape[1], first.shape[2], first.shape[3]]) if self.needs_input_grad[1] == True else None ) if first.is_cuda == True: if gradFirst is not None: for intSample in range(first.shape[0]): n = first.shape[1] * first.shape[2] * first.shape[3] cupy_launch( "kernel_Correlation_updateGradFirst", cupy_kernel( "kernel_Correlation_updateGradFirst", { "rbot0": rbot0, "rbot1": rbot1, "gradOutput": gradOutput, "gradFirst": gradFirst, "gradSecond": None, }, ), )( grid=tuple([int((n + 512 - 1) / 512), 1, 1]), block=tuple([512, 1, 1]), args=[ n, intSample, rbot0.data_ptr(), rbot1.data_ptr(), gradOutput.data_ptr(), gradFirst.data_ptr(), None, ], ) # end # end if gradSecond is not None: for intSample in range(first.shape[0]): n = first.shape[1] * 
def mw_spatial_average(in_tens, flow, keepdim=True):
    """Average ``in_tens`` over H and W, weighting each pixel by flow magnitude.

    The flow is resized to the spatial size of ``in_tens``; its per-pixel
    magnitude, normalised to sum to one per sample, is the weight map.

    Args:
        in_tens: Tensor of shape ``[B, C, H, W]``.
        flow: Flow field whose first two channels are the two displacement
            components (``[B, 2, h, w]`` is the expected layout).
        keepdim: Whether to keep the reduced spatial axes as size 1.

    Returns:
        The weighted spatial average, shaped ``[B, C, 1, 1]`` when ``keepdim``
        is true and ``[B, C]`` otherwise.
    """
    _, _, h, w = in_tens.shape
    flow = F.interpolate(flow, (h, w), align_corners=False, mode="bilinear")
    flow_mag = torch.sqrt(flow[:, 0:1] ** 2 + flow[:, 1:2] ** 2)

    total = torch.sum(flow_mag, dim=[1, 2, 3], keepdim=True)
    # A sample with no motion at all has total == 0; dividing by it would turn
    # the whole result into NaN. Fall back to uniform weights for such
    # samples, which is what any constant flow field already produces.
    is_static = total == 0
    safe_total = torch.where(is_static, torch.ones_like(total), total)
    uniform = torch.full_like(flow_mag, 1.0 / (h * w))
    weights = torch.where(is_static, uniform, flow_mag / safe_total)

    return torch.sum(in_tens * weights, dim=[2, 3], keepdim=keepdim)
def download(url, local_path, chunk_size=1024):
    """Stream ``url`` to ``local_path``, showing a byte-count progress bar.

    The body is written to ``local_path + ".part"`` and moved into place only
    after the transfer finishes, so a failed download never leaves a partial
    file at ``local_path``.

    Args:
        url: Address to fetch.
        local_path: Destination file; missing parent directories are created.
        chunk_size: Number of bytes requested per read from the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # A bare filename has no directory part, and os.makedirs("") would raise.
    target_dir = os.path.dirname(local_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    partial_path = local_path + ".part"
    try:
        with requests.get(url, stream=True) as r:
            # Without this an HTML error page would be saved as the checkpoint.
            r.raise_for_status()
            total_size = int(r.headers.get("content-length", 0))
            with tqdm(total=total_size, unit="B", unit_scale=True) as pbar, open(partial_path, "wb") as f:
                for data in r.iter_content(chunk_size=chunk_size):
                    if data:
                        f.write(data)
                        # The final chunk is usually shorter than chunk_size.
                        pbar.update(len(data))
        os.replace(partial_path, local_path)
    finally:
        # Only still present when the transfer did not complete.
        if os.path.exists(partial_path):
            os.remove(partial_path)
class LPIPS(nn.Module):
    """Learned perceptual metric built on a feature-extraction trunk.

    Two images are passed through the trunk, every returned feature map is
    unit-normalised along the channel axis, squared differences are taken and
    (when ``lpips`` is true) weighted by one 1x1 ``NetLinLayer`` per layer.
    """

    def __init__(
        self,
        pretrained=True,
        net="alex",
        version="0.1",
        lpips=True,
        spatial=False,
        pnet_rand=False,
        pnet_tune=False,
        use_dropout=True,
        model_path=None,
        eval_mode=True,
        verbose=False,
    ):
        """Build the trunk and, optionally, the linear calibration layers.

        Args:
            pretrained: load the linear-layer weights via ``load_from_pretrained``.
            net: trunk name: ``"vgg"``/``"vgg16"``, ``"alex"`` or ``"squeeze"``.
            version: ``"0.1"`` applies the input scaling layer in ``forward``.
            lpips: add linear calibration on top of the trunk; false means a
                plain sum of the per-layer differences.
            spatial: return per-pixel maps (upsampled) instead of averages.
            pnet_rand: build the trunk without its pretrained weights.
            pnet_tune: passed to the trunk as ``requires_grad``.
            use_dropout: forwarded to every ``NetLinLayer``.
            model_path: only used in the verbose message; weights are located
                by ``load_from_pretrained``.
            eval_mode: put the module in eval mode after construction.
            verbose: print set-up information.

        Raises:
            ValueError: if ``net`` is not one of the supported trunk names.
        """
        super(LPIPS, self).__init__()

        # Resolve the trunk first so an unsupported name fails immediately
        # with a clear message instead of an unbound local further down.
        if net in ["vgg", "vgg16"]:
            net_type = vgg16
            chns = [64, 128, 256, 512, 512]
        elif net == "alex":
            net_type = alexnet
            chns = [64, 192, 384, 256, 256]
        elif net == "squeeze":
            net_type = squeezenet
            chns = [64, 128, 256, 384, 384, 512, 512]
        else:
            raise ValueError(
                "Unsupported trunk %r; expected 'vgg', 'vgg16', 'alex' or 'squeeze'" % (net,)
            )

        if verbose:
            print(
                "Setting up [%s] perceptual loss: trunk [%s], v[%s], spatial [%s]"
                % ("LPIPS" if lpips else "baseline", net, version, "on" if spatial else "off")
            )

        self.pnet_type = net
        self.pnet_tune = pnet_tune
        self.pnet_rand = pnet_rand
        self.spatial = spatial
        self.lpips = lpips  # false means baseline of just averaging all layers
        self.version = version
        self.scaling_layer = ScalingLayer()

        self.chns = chns
        self.L = len(self.chns)

        self.net = net_type(pretrained=not self.pnet_rand, requires_grad=self.pnet_tune)

        if lpips:
            # One calibration layer per trunk output, registered both as
            # ``lin<i>`` and inside ``lins`` (5 layers, 7 for squeezenet).
            for idx, chn in enumerate(self.chns):
                setattr(self, "lin%d" % idx, NetLinLayer(chn, use_dropout=use_dropout))
            self.lins = nn.ModuleList([getattr(self, "lin%d" % idx) for idx in range(self.L)])

            if pretrained:
                ckpt = self.load_from_pretrained(version, net)
                if verbose:
                    # Report the file that was really loaded when it is known.
                    print("Loaded model from: %s" % (ckpt if ckpt is not None else model_path))

        if eval_mode:
            self.eval()

    def load_from_pretrained(self, version, net):
        """Load the weights for ``net``/``version`` and return the file path.

        The state dict is loaded with ``strict=False`` and mapped to CPU.
        """
        ckpt = get_ckpt_path(net, "pretrained_models/flolpips/weights/v%s" % (version))
        self.load_state_dict(torch.load(ckpt, map_location="cpu"), strict=False)
        return ckpt

    def forward(self, in0, in1, retPerLayer=False, normalize=False):
        """Return one distance per batch element between ``in0`` and ``in1``.

        Args:
            in0, in1: image batches of identical shape.
            retPerLayer: accepted for interface compatibility; not used.
            normalize: set when inputs are in [0, 1] so they are rescaled to
                [-1, +1] first.
        """
        if normalize:  # turn on this flag if input is [0,1] so it can be adjusted to [-1, +1]
            in0 = 2 * in0 - 1
            in1 = 2 * in1 - 1

        # v0.0 - original release had a bug, where input was not scaled
        in0_input, in1_input = (
            (self.scaling_layer(in0), self.scaling_layer(in1)) if self.version == "0.1" else (in0, in1)
        )
        outs0, outs1 = self.net.forward(in0_input), self.net.forward(in1_input)
        feats0, feats1, diffs = {}, {}, {}

        for kk in range(self.L):
            feats0[kk], feats1[kk] = normalize_tensor(outs0[kk]), normalize_tensor(outs1[kk])
            diffs[kk] = (feats0[kk] - feats1[kk]) ** 2

        if self.lpips:
            if self.spatial:
                res = [upsample(self.lins[kk](diffs[kk]), out_HW=in0.shape[2:]) for kk in range(self.L)]
            else:
                res = [spatial_average(self.lins[kk](diffs[kk]), keepdim=True) for kk in range(self.L)]
        else:
            if self.spatial:
                res = [upsample(diffs[kk].sum(dim=1, keepdim=True), out_HW=in0.shape[2:]) for kk in range(self.L)]
            else:
                res = [spatial_average(diffs[kk].sum(dim=1, keepdim=True), keepdim=True) for kk in range(self.L)]

        # Stack the per-layer results on the channel axis and reduce
        # everything except the batch dimension.
        return torch.sum(torch.cat(res, 1), dim=(1, 2, 3))
class NetLinLayer(nn.Module):
    """A single linear layer which does a 1x1 conv"""

    def __init__(self, chn_in, chn_out=1, use_dropout=False):
        super().__init__()
        # Optional dropout in front of a bias-free 1x1 convolution.
        stages = [nn.Dropout()] if use_dropout else []
        stages.append(nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False))
        self.model = nn.Sequential(*stages)

    def forward(self, x):
        return self.model(x)


class Dist2LogitLayer(nn.Module):
    """takes 2 distances, puts through fc layers, spits out value between [0,1] (if use_sigmoid is True)"""

    def __init__(self, chn_mid=32, use_sigmoid=True):
        super().__init__()
        # Three 1x1 convolutions (5 -> chn_mid -> chn_mid -> 1) with in-place
        # LeakyReLU between them and an optional final sigmoid.
        stages = [
            nn.Conv2d(5, chn_mid, 1, stride=1, padding=0, bias=True),
            nn.LeakyReLU(0.2, True),
            nn.Conv2d(chn_mid, chn_mid, 1, stride=1, padding=0, bias=True),
            nn.LeakyReLU(0.2, True),
            nn.Conv2d(chn_mid, 1, 1, stride=1, padding=0, bias=True),
        ]
        if use_sigmoid:
            stages.append(nn.Sigmoid())
        self.model = nn.Sequential(*stages)

    def forward(self, d0, d1, eps=0.1):
        # Five input channels: both distances, their difference and the two
        # eps-stabilised ratios.
        features = torch.cat((d0, d1, d0 - d1, d0 / (d1 + eps), d1 / (d0 + eps)), dim=1)
        return self.model.forward(features)


class BCERankingLoss(nn.Module):
    """Binary cross-entropy between the Dist2LogitLayer output and a judgement."""

    def __init__(self, chn_mid=32):
        super().__init__()
        self.net = Dist2LogitLayer(chn_mid=chn_mid)
        self.loss = torch.nn.BCELoss()

    def forward(self, d0, d1, judge):
        # Map the judgement through (judge + 1) / 2 before using it as target.
        target = (judge + 1.0) / 2.0
        # The prediction is kept on the instance, as callers may read it.
        self.logit = self.net.forward(d0, d1)
        return self.loss(self.logit, target)


# L2, DSSIM metrics
class FakeNet(nn.Module):
    """Parameter-free base class that only stores device and colorspace options."""

    def __init__(self, use_gpu=True, colorspace="Lab"):
        super().__init__()
        self.use_gpu = use_gpu
        self.colorspace = colorspace
class DSSIM(FakeNet):
    """Structural dissimilarity between two single-image batches."""

    def forward(self, in0, in1, retPerLayer=None):
        """Return the DSSIM of ``in0`` and ``in1`` as a one-element tensor.

        Args:
            in0, in1: image tensors with batch size 1.
            retPerLayer: accepted for interface compatibility; not used.

        Raises:
            ValueError: if ``self.colorspace`` is neither ``"RGB"`` nor ``"Lab"``.
        """
        assert in0.size()[0] == 1  # currently only supports batchSize 1

        if self.colorspace == "RGB":
            value = dssim(1.0 * tensor2im(in0.data), 1.0 * tensor2im(in1.data), range=255.0).astype("float")
        elif self.colorspace == "Lab":
            value = dssim(
                tensor2np(tensor2tensorlab(in0.data, to_norm=False)),
                tensor2np(tensor2tensorlab(in1.data, to_norm=False)),
                range=100.0,
            ).astype("float")
        else:
            # Previously this fell through to an unbound ``value``.
            raise ValueError("Unsupported colorspace %r; expected 'RGB' or 'Lab'" % (self.colorspace,))

        ret_var = Variable(torch.Tensor((value,)))
        if self.use_gpu:
            ret_var = ret_var.cuda()
        return ret_var


def print_network(net):
    """Print ``net`` followed by its total number of parameters."""
    num_params = sum(param.numel() for param in net.parameters())
    print("Network", net)
    print("Total number of parameters: %d" % num_params)
class Flolpips(nn.Module):
    """Flow-weighted LPIPS for a predicted intermediate frame of a triplet."""

    def __init__(self):
        super().__init__()
        self.loss_fn = FloLPIPS(net="alex", version="0.1")
        self.flownet = PWCNet()

    @torch.no_grad()
    def forward(self, I0, I1, frame_dis, frame_ref):
        """
        args:
            I0: first frame of the triplet, shape: [B, C, H, W]
            I1: third frame of the triplet, shape: [B, C, H, W]
            frame_dis: prediction of the intermediate frame, shape: [B, C, H, W]
            frame_ref: ground-truth of the intermediate frame, shape: [B, C, H, W]
        returns:
            mean of the two flow-weighted distances, one measured against I0
            and one against I1
        """
        assert (
            I0.size() == I1.size() == frame_dis.size() == frame_ref.size()
        ), "the 4 input tensors should have same size"

        scores = []
        for anchor in (I0, I1):
            # Flow of the reference minus flow of the prediction, both
            # estimated with respect to the same anchor frame.
            flow_diff = self.flownet(frame_ref, anchor) - self.flownet(frame_dis, anchor)
            scores.append(self.loss_fn.forward(frame_ref, frame_dis, flow_diff, normalize=True))

        wrt_first, wrt_third = scores
        return (wrt_first + wrt_third) / 2
pretrained_features[x]) for x in range(2, 5): self.slice2.add_module(str(x), pretrained_features[x]) for x in range(5, 8): self.slice3.add_module(str(x), pretrained_features[x]) for x in range(8, 10): self.slice4.add_module(str(x), pretrained_features[x]) for x in range(10, 11): self.slice5.add_module(str(x), pretrained_features[x]) for x in range(11, 12): self.slice6.add_module(str(x), pretrained_features[x]) for x in range(12, 13): self.slice7.add_module(str(x), pretrained_features[x]) if not requires_grad: for param in self.parameters(): param.requires_grad = False def forward(self, X): h = self.slice1(X) h_relu1 = h h = self.slice2(h) h_relu2 = h h = self.slice3(h) h_relu3 = h h = self.slice4(h) h_relu4 = h h = self.slice5(h) h_relu5 = h h = self.slice6(h) h_relu6 = h h = self.slice7(h) h_relu7 = h vgg_outputs = namedtuple("SqueezeOutputs", ["relu1", "relu2", "relu3", "relu4", "relu5", "relu6", "relu7"]) out = vgg_outputs(h_relu1, h_relu2, h_relu3, h_relu4, h_relu5, h_relu6, h_relu7) return out class alexnet(torch.nn.Module): def __init__(self, requires_grad=False, pretrained=True): super(alexnet, self).__init__() alexnet_pretrained_features = tv.alexnet(pretrained=pretrained).features self.slice1 = torch.nn.Sequential() self.slice2 = torch.nn.Sequential() self.slice3 = torch.nn.Sequential() self.slice4 = torch.nn.Sequential() self.slice5 = torch.nn.Sequential() self.N_slices = 5 for x in range(2): self.slice1.add_module(str(x), alexnet_pretrained_features[x]) for x in range(2, 5): self.slice2.add_module(str(x), alexnet_pretrained_features[x]) for x in range(5, 8): self.slice3.add_module(str(x), alexnet_pretrained_features[x]) for x in range(8, 10): self.slice4.add_module(str(x), alexnet_pretrained_features[x]) for x in range(10, 12): self.slice5.add_module(str(x), alexnet_pretrained_features[x]) if not requires_grad: for param in self.parameters(): param.requires_grad = False def forward(self, X): h = self.slice1(X) h_relu1 = h h = self.slice2(h) h_relu2 = h h 
class vgg16(torch.nn.Module):
    """torchvision VGG-16 feature stack cut into five consecutive slices."""

    def __init__(self, requires_grad=False, pretrained=True):
        super().__init__()
        features = tv.vgg16(pretrained=pretrained).features
        self.N_slices = 5

        # Feature indices [start, stop) of each slice; every layer keeps its
        # original index as module name.
        bounds = (0, 4, 9, 16, 23, 30)
        for slice_no in range(1, self.N_slices + 1):
            stage = torch.nn.Sequential()
            for layer_idx in range(bounds[slice_no - 1], bounds[slice_no]):
                stage.add_module(str(layer_idx), features[layer_idx])
            setattr(self, "slice%d" % slice_no, stage)

        if not requires_grad:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, X):
        """Run the slices in order and return all five intermediate outputs."""
        activations = []
        h = X
        for stage in (self.slice1, self.slice2, self.slice3, self.slice4, self.slice5):
            h = stage(h)
            activations.append(h)

        vgg_outputs = namedtuple("VggOutputs", ["relu1_2", "relu2_2", "relu3_3", "relu4_3", "relu5_3"])
        return vgg_outputs(*activations)
self.net.bn1 self.relu = self.net.relu self.maxpool = self.net.maxpool self.layer1 = self.net.layer1 self.layer2 = self.net.layer2 self.layer3 = self.net.layer3 self.layer4 = self.net.layer4 def forward(self, X): h = self.conv1(X) h = self.bn1(h) h = self.relu(h) h_relu1 = h h = self.maxpool(h) h = self.layer1(h) h_conv2 = h h = self.layer2(h) h_conv3 = h h = self.layer3(h) h_conv4 = h h = self.layer4(h) h_conv5 = h outputs = namedtuple("Outputs", ["relu1", "conv2", "conv3", "conv4", "conv5"]) out = outputs(h_relu1, h_conv2, h_conv3, h_conv4, h_conv5) return out ================================================ FILE: Open-Sora/eval/vae/flolpips/pwcnet.py ================================================ #!/usr/bin/env python import math import torch # try: from .correlation import correlation # the custom cost volume layer # except: # sys.path.insert(0, './correlation'); import correlation # you should consider upgrading python # end ########################################################## # assert(int(str('').join(torch.__version__.split('.')[0:2])) >= 13) # requires at least pytorch version 1.3.0 # torch.set_grad_enabled(False) # make sure to not compute gradients for computational performance # torch.backends.cudnn.enabled = True # make sure to use cudnn for computational performance # ########################################################## # arguments_strModel = 'default' # 'default', or 'chairs-things' # arguments_strFirst = './images/first.png' # arguments_strSecond = './images/second.png' # arguments_strOut = './out.flo' # for strOption, strArgument in getopt.getopt(sys.argv[1:], '', [ strParameter[2:] + '=' for strParameter in sys.argv[1::2] ])[0]: # if strOption == '--model' and strArgument != '': arguments_strModel = strArgument # which model to use # if strOption == '--first' and strArgument != '': arguments_strFirst = strArgument # path to the first frame # if strOption == '--second' and strArgument != '': arguments_strSecond = strArgument # path to 
def backwarp(tenInput, tenFlow):
    """Warp ``tenInput`` backwards along ``tenFlow`` with bilinear sampling.

    Args:
        tenInput: tensor indexed as (B, C, H, W) to sample from.
        tenFlow: tensor indexed as (B, 2, H, W); channel 0 is divided by
            ``(W_in - 1) / 2`` and channel 1 by ``(H_in - 1) / 2`` before being
            added to the sampling grid.

    Returns:
        The sampled tensor with the shape of ``tenInput``. Positions whose
        sampling footprint leaves the image (validity <= 0.999) are zeroed.
    """
    height, width = tenFlow.shape[2], tenFlow.shape[3]

    # Pixel-centre coordinates in the [-1, 1] convention of grid_sample with
    # align_corners=False.
    tenHor = (
        torch.linspace(-1.0 + (1.0 / width), 1.0 - (1.0 / width), width)
        .view(1, 1, 1, -1)
        .expand(-1, -1, height, -1)
    )
    tenVer = (
        torch.linspace(-1.0 + (1.0 / height), 1.0 - (1.0 / height), height)
        .view(1, 1, -1, 1)
        .expand(-1, -1, -1, width)
    )
    # Follow the flow's device and dtype instead of forcing CUDA/float32, so
    # the function also runs on CPU and with non-float32 inputs.
    tenGrid = torch.cat([tenHor, tenVer], 1).to(device=tenFlow.device, dtype=tenFlow.dtype)

    # Extra all-ones channel: after sampling it tells how much of each output
    # pixel came from inside the image.
    tenPartial = tenFlow.new_ones([tenFlow.shape[0], 1, height, width])

    tenFlow = torch.cat(
        [
            tenFlow[:, 0:1, :, :] / ((tenInput.shape[3] - 1.0) / 2.0),
            tenFlow[:, 1:2, :, :] / ((tenInput.shape[2] - 1.0) / 2.0),
        ],
        1,
    )

    tenOutput = torch.nn.functional.grid_sample(
        input=torch.cat([tenInput, tenPartial], 1),
        grid=(tenGrid + tenFlow).permute(0, 2, 3, 1),
        mode="bilinear",
        padding_mode="zeros",
        align_corners=False,
    )

    # Binary validity mask, built out of place so the sampled tensor is not
    # modified through a view.
    tenMask = (tenOutput[:, -1:, :, :] > 0.999).to(tenOutput.dtype)

    return tenOutput[:, :-1, :, :] * tenMask
torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), torch.nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, stride=1, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), ) self.netTwo = torch.nn.Sequential( torch.nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=2, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), torch.nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), torch.nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), ) self.netThr = torch.nn.Sequential( torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=2, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), torch.nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), torch.nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), ) self.netFou = torch.nn.Sequential( torch.nn.Conv2d(in_channels=64, out_channels=96, kernel_size=3, stride=2, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), torch.nn.Conv2d(in_channels=96, out_channels=96, kernel_size=3, stride=1, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), torch.nn.Conv2d(in_channels=96, out_channels=96, kernel_size=3, stride=1, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), ) self.netFiv = torch.nn.Sequential( torch.nn.Conv2d(in_channels=96, out_channels=128, kernel_size=3, stride=2, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1), 
torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), ) self.netSix = torch.nn.Sequential( torch.nn.Conv2d(in_channels=128, out_channels=196, kernel_size=3, stride=2, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), torch.nn.Conv2d(in_channels=196, out_channels=196, kernel_size=3, stride=1, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), torch.nn.Conv2d(in_channels=196, out_channels=196, kernel_size=3, stride=1, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), ) # end def forward(self, tenInput): tenOne = self.netOne(tenInput) tenTwo = self.netTwo(tenOne) tenThr = self.netThr(tenTwo) tenFou = self.netFou(tenThr) tenFiv = self.netFiv(tenFou) tenSix = self.netSix(tenFiv) return [tenOne, tenTwo, tenThr, tenFou, tenFiv, tenSix] # end # end class Decoder(torch.nn.Module): def __init__(self, intLevel): super(Decoder, self).__init__() intPrevious = [ None, None, 81 + 32 + 2 + 2, 81 + 64 + 2 + 2, 81 + 96 + 2 + 2, 81 + 128 + 2 + 2, 81, None, ][intLevel + 1] intCurrent = [ None, None, 81 + 32 + 2 + 2, 81 + 64 + 2 + 2, 81 + 96 + 2 + 2, 81 + 128 + 2 + 2, 81, None, ][intLevel + 0] if intLevel < 6: self.netUpflow = torch.nn.ConvTranspose2d( in_channels=2, out_channels=2, kernel_size=4, stride=2, padding=1 ) if intLevel < 6: self.netUpfeat = torch.nn.ConvTranspose2d( in_channels=intPrevious + 128 + 128 + 96 + 64 + 32, out_channels=2, kernel_size=4, stride=2, padding=1, ) if intLevel < 6: self.fltBackwarp = [None, None, None, 5.0, 2.5, 1.25, 0.625, None][intLevel + 1] self.netOne = torch.nn.Sequential( torch.nn.Conv2d(in_channels=intCurrent, out_channels=128, kernel_size=3, stride=1, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), ) self.netTwo = torch.nn.Sequential( torch.nn.Conv2d(in_channels=intCurrent + 128, out_channels=128, kernel_size=3, stride=1, padding=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), ) self.netThr = torch.nn.Sequential( torch.nn.Conv2d( in_channels=intCurrent + 128 + 
128, out_channels=96, kernel_size=3, stride=1, padding=1 ), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), ) self.netFou = torch.nn.Sequential( torch.nn.Conv2d( in_channels=intCurrent + 128 + 128 + 96, out_channels=64, kernel_size=3, stride=1, padding=1 ), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), ) self.netFiv = torch.nn.Sequential( torch.nn.Conv2d( in_channels=intCurrent + 128 + 128 + 96 + 64, out_channels=32, kernel_size=3, stride=1, padding=1, ), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), ) self.netSix = torch.nn.Sequential( torch.nn.Conv2d( in_channels=intCurrent + 128 + 128 + 96 + 64 + 32, out_channels=2, kernel_size=3, stride=1, padding=1, ) ) # end def forward(self, tenFirst, tenSecond, objPrevious): tenFlow = None tenFeat = None if objPrevious is None: tenFlow = None tenFeat = None tenVolume = torch.nn.functional.leaky_relu( input=correlation.FunctionCorrelation(tenFirst=tenFirst, tenSecond=tenSecond), negative_slope=0.1, inplace=False, ) tenFeat = torch.cat([tenVolume], 1) elif objPrevious is not None: tenFlow = self.netUpflow(objPrevious["tenFlow"]) tenFeat = self.netUpfeat(objPrevious["tenFeat"]) tenVolume = torch.nn.functional.leaky_relu( input=correlation.FunctionCorrelation( tenFirst=tenFirst, tenSecond=backwarp(tenInput=tenSecond, tenFlow=tenFlow * self.fltBackwarp), ), negative_slope=0.1, inplace=False, ) tenFeat = torch.cat([tenVolume, tenFirst, tenFlow, tenFeat], 1) # end tenFeat = torch.cat([self.netOne(tenFeat), tenFeat], 1) tenFeat = torch.cat([self.netTwo(tenFeat), tenFeat], 1) tenFeat = torch.cat([self.netThr(tenFeat), tenFeat], 1) tenFeat = torch.cat([self.netFou(tenFeat), tenFeat], 1) tenFeat = torch.cat([self.netFiv(tenFeat), tenFeat], 1) tenFlow = self.netSix(tenFeat) return {"tenFlow": tenFlow, "tenFeat": tenFeat} # end # end class Refiner(torch.nn.Module): def __init__(self): super(Refiner, self).__init__() self.netMain = torch.nn.Sequential( torch.nn.Conv2d( in_channels=81 + 32 + 2 + 2 + 128 + 128 + 96 
+ 64 + 32, out_channels=128, kernel_size=3, stride=1, padding=1, dilation=1, ), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=2, dilation=2), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=4, dilation=4), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), torch.nn.Conv2d(in_channels=128, out_channels=96, kernel_size=3, stride=1, padding=8, dilation=8), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), torch.nn.Conv2d(in_channels=96, out_channels=64, kernel_size=3, stride=1, padding=16, dilation=16), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), torch.nn.Conv2d(in_channels=64, out_channels=32, kernel_size=3, stride=1, padding=1, dilation=1), torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), torch.nn.Conv2d(in_channels=32, out_channels=2, kernel_size=3, stride=1, padding=1, dilation=1), ) # end def forward(self, tenInput): return self.netMain(tenInput) # end # end self.netExtractor = Extractor() self.netTwo = Decoder(2) self.netThr = Decoder(3) self.netFou = Decoder(4) self.netFiv = Decoder(5) self.netSix = Decoder(6) self.netRefiner = Refiner() self.load_state_dict( { strKey.replace("module", "net"): tenWeight for strKey, tenWeight in torch.hub.load_state_dict_from_url( url="http://content.sniklaus.com/github/pytorch-pwc/network-" + "default" + ".pytorch" ).items() } ) # end def forward(self, tenFirst, tenSecond): intWidth = tenFirst.shape[3] intHeight = tenFirst.shape[2] intPreprocessedWidth = int(math.floor(math.ceil(intWidth / 64.0) * 64.0)) intPreprocessedHeight = int(math.floor(math.ceil(intHeight / 64.0) * 64.0)) tenPreprocessedFirst = torch.nn.functional.interpolate( input=tenFirst, size=(intPreprocessedHeight, intPreprocessedWidth), mode="bilinear", align_corners=False ) tenPreprocessedSecond = torch.nn.functional.interpolate( input=tenSecond, 
size=(intPreprocessedHeight, intPreprocessedWidth), mode="bilinear", align_corners=False ) tenFirst = self.netExtractor(tenPreprocessedFirst) tenSecond = self.netExtractor(tenPreprocessedSecond) objEstimate = self.netSix(tenFirst[-1], tenSecond[-1], None) objEstimate = self.netFiv(tenFirst[-2], tenSecond[-2], objEstimate) objEstimate = self.netFou(tenFirst[-3], tenSecond[-3], objEstimate) objEstimate = self.netThr(tenFirst[-4], tenSecond[-4], objEstimate) objEstimate = self.netTwo(tenFirst[-5], tenSecond[-5], objEstimate) tenFlow = objEstimate["tenFlow"] + self.netRefiner(objEstimate["tenFeat"]) tenFlow = 20.0 * torch.nn.functional.interpolate( input=tenFlow, size=(intHeight, intWidth), mode="bilinear", align_corners=False ) tenFlow[:, 0, :, :] *= float(intWidth) / float(intPreprocessedWidth) tenFlow[:, 1, :, :] *= float(intHeight) / float(intPreprocessedHeight) return tenFlow # end # end netNetwork = None ########################################################## def estimate(tenFirst, tenSecond): global netNetwork if netNetwork is None: netNetwork = Network().cuda().eval() # end assert tenFirst.shape[1] == tenSecond.shape[1] assert tenFirst.shape[2] == tenSecond.shape[2] intWidth = tenFirst.shape[2] intHeight = tenFirst.shape[1] assert ( intWidth == 1024 ) # remember that there is no guarantee for correctness, comment this line out if you acknowledge this and want to continue assert ( intHeight == 436 ) # remember that there is no guarantee for correctness, comment this line out if you acknowledge this and want to continue tenPreprocessedFirst = tenFirst.cuda().view(1, 3, intHeight, intWidth) tenPreprocessedSecond = tenSecond.cuda().view(1, 3, intHeight, intWidth) intPreprocessedWidth = int(math.floor(math.ceil(intWidth / 64.0) * 64.0)) intPreprocessedHeight = int(math.floor(math.ceil(intHeight / 64.0) * 64.0)) tenPreprocessedFirst = torch.nn.functional.interpolate( input=tenPreprocessedFirst, size=(intPreprocessedHeight, intPreprocessedWidth), mode="bilinear", 
def normalize_tensor(in_feat, eps=1e-10):
    """Scale ``in_feat`` to unit L2 norm along dim 1.

    ``eps`` is added to the norm so all-zero feature vectors stay zero
    instead of producing NaN.
    """
    norm_factor = torch.sqrt(torch.sum(in_feat**2, dim=1, keepdim=True))
    return in_feat / (norm_factor + eps)


def l2(p0, p1, range=255.0):
    """Return half the mean squared difference of ``p0`` and ``p1`` after dividing both by ``range``."""
    return 0.5 * np.mean((p0 / range - p1 / range) ** 2)


def dssim(p0, p1, range=255.0):
    """Return the structural dissimilarity ``(1 - SSIM) / 2`` of two images.

    Args:
        p0, p1: arrays whose last axis is treated as the channel axis.
        range: value span of the data, passed to scikit-image as ``data_range``.
    """
    import inspect

    # Newer scikit-image exposes SSIM as skimage.metrics.structural_similarity;
    # older releases only provide skimage.measure.compare_ssim.
    try:
        from skimage.metrics import structural_similarity
    except ImportError:
        from skimage.measure import compare_ssim as structural_similarity

    # Pick the channel keyword from the signature rather than by trial:
    # older versions swallow unknown keywords through **kwargs, which would
    # silently compute a single-channel SSIM.
    if "channel_axis" in inspect.signature(structural_similarity).parameters:
        channel_kwargs = {"channel_axis": -1}
    else:
        channel_kwargs = {"multichannel": True}

    ssim = structural_similarity(p0, p1, data_range=range, **channel_kwargs)
    return (1 - ssim) / 2.0
def tensor2np(tensor_obj):
    """Return the first batch element of ``tensor_obj`` as an (H, W, C) float numpy array."""
    return tensor_obj[0].cpu().float().numpy().transpose((1, 2, 0))


def np2tensor(np_obj):
    """Turn an (H, W, C) numpy array into a (1, C, H, W) tensor."""
    return torch.Tensor(np_obj[:, :, :, np.newaxis].transpose((3, 2, 0, 1)))


def tensor2tensorlab(image_tensor, to_norm=True, mc_only=False):
    """Convert an image tensor to a Lab tensor.

    Args:
        image_tensor: tensor handed to ``tensor2im`` to obtain the RGB image.
        to_norm: when true (and ``mc_only`` is false) subtract 50 from the L
            channel and divide all channels by 100.
        mc_only: when true only subtract 50 from the L channel.
    """
    from skimage import color

    img = tensor2im(image_tensor)
    img_lab = color.rgb2lab(img)
    if mc_only:
        img_lab[:, :, 0] = img_lab[:, :, 0] - 50
    if to_norm and not mc_only:
        img_lab[:, :, 0] = img_lab[:, :, 0] - 50
        img_lab = img_lab / 100.0

    return np2tensor(img_lab)


def read_frame_yuv2rgb(stream, width, height, iFrame, bit_depth, pix_fmt="420"):
    """Read frame ``iFrame`` from a raw planar YUV file and return it as RGB.

    Args:
        stream: binary file object positioned anywhere; it is repositioned
            with ``seek`` and read with ``np.fromfile``.
        width, height: luma dimensions in pixels.
        iFrame: zero-based frame index.
        bit_depth: 8 reads ``uint8`` samples; any other value reads ``uint16``
            samples and rescales them to 8 bit before colour conversion.
        pix_fmt: ``"420"`` or ``"444"``.

    Returns:
        The RGB image produced by ``cv2.cvtColor``, or ``None`` (after
        printing a message) when ``pix_fmt`` is not supported.
    """
    if pix_fmt == "420":
        multiplier = 1
        uv_factor = 2
    elif pix_fmt == "444":
        multiplier = 2
        uv_factor = 1
    else:
        print("Pixel format {} is not supported".format(pix_fmt))
        return

    if bit_depth == 8:
        datatype = np.uint8
        bytes_per_sample = 1
    else:
        datatype = np.uint16
        bytes_per_sample = 2

    # 1.5 samples per pixel for 4:2:0 and 3 for 4:4:4. Integer arithmetic is
    # required here: file.seek() rejects floats on Python 3.
    frame_bytes = width * height * 3 * multiplier * bytes_per_sample // 2
    stream.seek(iFrame * frame_bytes)

    chroma_shape = (height // uv_factor, width // uv_factor)
    chroma_count = chroma_shape[0] * chroma_shape[1]
    Y = np.fromfile(stream, dtype=datatype, count=width * height).reshape((height, width))
    # chroma planes follow the luma plane, U first and then V
    U = np.fromfile(stream, dtype=datatype, count=chroma_count).reshape(chroma_shape)
    V = np.fromfile(stream, dtype=datatype, count=chroma_count).reshape(chroma_shape)

    if pix_fmt == "420":
        # Re-pack the planes into the single (H * 3 / 2, W) I420 buffer that
        # cv2 expects.
        yuv = np.empty((height * 3 // 2, width), dtype=datatype)
        yuv[0:height, :] = Y
        yuv[height : height + height // 4, :] = U.reshape(-1, width)
        yuv[height + height // 4 :, :] = V.reshape(-1, width)

        if bit_depth != 8:
            yuv = (yuv / (2**bit_depth - 1) * 255).astype(np.uint8)

        # convert to rgb
        rgb = cv2.cvtColor(yuv, cv2.COLOR_YUV2RGB_I420)
    else:
        yvu = np.stack([Y, V, U], axis=2)
        if bit_depth != 8:
            yvu = (yvu / (2**bit_depth - 1) * 255).astype(np.uint8)
        rgb = cv2.cvtColor(yvu, cv2.COLOR_YCrCb2RGB)

    return rgb
still frame, courtyard", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "In a still frame, gas station", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of house", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "indoor gymnasium, frozen in time", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of indoor library", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of kitchen", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of palace", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "In a still frame, parking lot", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "In a still frame, phone booth", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of restaurant", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of tower", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of a bowl", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of an apple", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of a bench", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of a bed", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of a chair", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of a cup", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of a dining table", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "In a still frame, a pear", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of a bunch of grapes", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of a bowl on the kitchen counter", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of a beautiful, handcrafted ceramic bowl", "dimension": [ 
"temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of an antique bowl", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of an exquisite mahogany dining table", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of a wooden bench in the park", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of a beautiful wrought-iron bench surrounded by blooming flowers", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "In a still frame, a park bench with a view of the lake", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of a vintage rocking chair was placed on the porch", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of the jail cell was small and dimly lit, with cold, steel bars", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of the phone booth was tucked away in a quiet alley", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "a dilapidated phone booth stood as a relic of a bygone era on the sidewalk, frozen in time", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of the old red barn stood weathered and iconic against the backdrop of the countryside", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of a picturesque barn was painted a warm shade of red and nestled in a picturesque meadow", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "In a still frame, within the desolate desert, an oasis unfolded, characterized by the stoic presence of palm trees and a motionless, glassy pool of water", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "In a still frame, the Parthenon's majestic Doric columns stand in serene solitude atop the Acropolis, framed by the tranquil Athenian landscape", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "In a still frame, the Temple of Hephaestus, with its timeless 
Doric grace, stands stoically against the backdrop of a quiet Athens", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "In a still frame, the ornate Victorian streetlamp stands solemnly, adorned with intricate ironwork and stained glass panels", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of the Stonehenge presented itself as an enigmatic puzzle, each colossal stone meticulously placed against the backdrop of tranquility", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "In a still frame, in the vast desert, an oasis nestled among dunes, featuring tall palm trees and an air of serenity", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "static view on a desert scene with an oasis, palm trees, and a clear, calm pool of water", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of an ornate Victorian streetlamp standing on a cobblestone street corner, illuminating the empty night", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of a tranquil lakeside cabin nestled among tall pines, its reflection mirrored perfectly in the calm water", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "In a still frame, a vintage gas lantern, adorned with intricate details, gracing a historic cobblestone square", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "In a still frame, a tranquil Japanese tea ceremony room, with tatami mats, a delicate tea set, and a bonsai tree in the corner", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of the Parthenon stands resolute in its classical elegance, a timeless symbol of Athens' cultural legacy", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of in the heart of Plaka, the neoclassical architecture of the old city harmonizes with the ancient ruins", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of in the desolate beauty of 
the American Southwest, Chaco Canyon's ancient ruins whispered tales of an enigmatic civilization that once thrived amidst the arid landscapes", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of at the edge of the Arabian Desert, the ancient city of Petra beckoned with its enigmatic rock-carved fa\u00e7ades", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "In a still frame, amidst the cobblestone streets, an Art Nouveau lamppost stood tall", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of in the quaint village square, a traditional wrought-iron streetlamp featured delicate filigree patterns and amber-hued glass panels", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of the lampposts were adorned with Art Deco motifs, their geometric shapes and frosted glass creating a sense of vintage glamour", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "In a still frame, in the picturesque square, a Gothic-style lamppost adorned with intricate stone carvings added a touch of medieval charm to the setting", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "In a still frame, in the heart of the old city, a row of ornate lantern-style streetlamps bathed the narrow alleyway in a warm, welcoming light", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of in the heart of the Utah desert, a massive sandstone arch spanned the horizon", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of in the Arizona desert, a massive stone bridge arched across a rugged canyon", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of in the corner of the minimalist tea room, a bonsai tree added a touch of nature's beauty to the otherwise simple and elegant space", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "In a still frame, amidst the hushed ambiance of the traditional tea room, a 
meticulously arranged tea set awaited, with porcelain cups, a bamboo whisk", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "In a still frame, nestled in the Zen garden, a rustic teahouse featured tatami seating and a traditional charcoal brazier", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of a country estate's library featured elegant wooden shelves", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of beneath the shade of a solitary oak tree, an old wooden park bench sat patiently", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of beside a tranquil pond, a weeping willow tree draped its branches gracefully over the water's surface, creating a serene tableau of reflection and calm", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of in the Zen garden, a perfectly raked gravel path led to a serene rock garden", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "In a still frame, a tranquil pond was fringed by weeping cherry trees, their blossoms drifting lazily onto the glassy surface", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "In a still frame, within the historic library's reading room, rows of antique leather chairs and mahogany tables offered a serene haven for literary contemplation", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of a peaceful orchid garden showcased a variety of delicate blooms", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "A tranquil tableau of in the serene courtyard, a centuries-old stone well stood as a symbol of a bygone era, its mossy stones bearing witness to the passage of time", "dimension": [ "temporal_flickering" ] }, { "prompt_en": "a bird and a cat", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "bird and cat" } } }, { "prompt_en": "a cat and a dog", "dimension": [ "multiple_objects" ], 
"auxiliary_info": { "multiple_objects": { "object": "cat and dog" } } }, { "prompt_en": "a dog and a horse", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "dog and horse" } } }, { "prompt_en": "a horse and a sheep", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "horse and sheep" } } }, { "prompt_en": "a sheep and a cow", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "sheep and cow" } } }, { "prompt_en": "a cow and an elephant", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "cow and elephant" } } }, { "prompt_en": "an elephant and a bear", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "elephant and bear" } } }, { "prompt_en": "a bear and a zebra", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "bear and zebra" } } }, { "prompt_en": "a zebra and a giraffe", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "zebra and giraffe" } } }, { "prompt_en": "a giraffe and a bird", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "giraffe and bird" } } }, { "prompt_en": "a chair and a couch", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "chair and couch" } } }, { "prompt_en": "a couch and a potted plant", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "couch and potted plant" } } }, { "prompt_en": "a potted plant and a tv", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "potted plant and tv" } } }, { "prompt_en": "a tv and a laptop", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "tv and laptop" } } }, { "prompt_en": "a laptop and a remote", "dimension": [ "multiple_objects" ], 
"auxiliary_info": { "multiple_objects": { "object": "laptop and remote" } } }, { "prompt_en": "a remote and a keyboard", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "remote and keyboard" } } }, { "prompt_en": "a keyboard and a cell phone", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "keyboard and cell phone" } } }, { "prompt_en": "a cell phone and a book", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "cell phone and book" } } }, { "prompt_en": "a book and a clock", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "book and clock" } } }, { "prompt_en": "a clock and a backpack", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "clock and backpack" } } }, { "prompt_en": "a backpack and an umbrella", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "backpack and umbrella" } } }, { "prompt_en": "an umbrella and a handbag", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "umbrella and handbag" } } }, { "prompt_en": "a handbag and a tie", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "handbag and tie" } } }, { "prompt_en": "a tie and a suitcase", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "tie and suitcase" } } }, { "prompt_en": "a suitcase and a vase", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "suitcase and vase" } } }, { "prompt_en": "a vase and scissors", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "vase and scissors" } } }, { "prompt_en": "scissors and a teddy bear", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "scissors and teddy bear" } } }, { "prompt_en": 
"a teddy bear and a frisbee", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "teddy bear and frisbee" } } }, { "prompt_en": "a frisbee and skis", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "frisbee and skis" } } }, { "prompt_en": "skis and a snowboard", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "skis and snowboard" } } }, { "prompt_en": "a snowboard and a sports ball", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "snowboard and sports ball" } } }, { "prompt_en": "a sports ball and a kite", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "sports ball and kite" } } }, { "prompt_en": "a kite and a baseball bat", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "kite and baseball bat" } } }, { "prompt_en": "a baseball bat and a baseball glove", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "baseball bat and baseball glove" } } }, { "prompt_en": "a baseball glove and a skateboard", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "baseball glove and skateboard" } } }, { "prompt_en": "a skateboard and a surfboard", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "skateboard and surfboard" } } }, { "prompt_en": "a surfboard and a tennis racket", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "surfboard and tennis racket" } } }, { "prompt_en": "a tennis racket and a bottle", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "tennis racket and bottle" } } }, { "prompt_en": "a bottle and a chair", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "bottle and chair" } } }, { 
"prompt_en": "an airplane and a train", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "airplane and train" } } }, { "prompt_en": "a train and a boat", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "train and boat" } } }, { "prompt_en": "a boat and an airplane", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "boat and airplane" } } }, { "prompt_en": "a bicycle and a car", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "bicycle and car" } } }, { "prompt_en": "a car and a motorcycle", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "car and motorcycle" } } }, { "prompt_en": "a motorcycle and a bus", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "motorcycle and bus" } } }, { "prompt_en": "a bus and a traffic light", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "bus and traffic light" } } }, { "prompt_en": "a traffic light and a fire hydrant", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "traffic light and fire hydrant" } } }, { "prompt_en": "a fire hydrant and a stop sign", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "fire hydrant and stop sign" } } }, { "prompt_en": "a stop sign and a parking meter", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "stop sign and parking meter" } } }, { "prompt_en": "a parking meter and a truck", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "parking meter and truck" } } }, { "prompt_en": "a truck and a bicycle", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "truck and bicycle" } } }, { "prompt_en": "a toilet and a hair drier", 
"dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "toilet and hair drier" } } }, { "prompt_en": "a hair drier and a toothbrush", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "hair drier and toothbrush" } } }, { "prompt_en": "a toothbrush and a sink", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "toothbrush and sink" } } }, { "prompt_en": "a sink and a toilet", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "sink and toilet" } } }, { "prompt_en": "a wine glass and a chair", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "wine glass and chair" } } }, { "prompt_en": "a cup and a couch", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "cup and couch" } } }, { "prompt_en": "a fork and a potted plant", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "fork and potted plant" } } }, { "prompt_en": "a knife and a tv", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "knife and tv" } } }, { "prompt_en": "a spoon and a laptop", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "spoon and laptop" } } }, { "prompt_en": "a bowl and a remote", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "bowl and remote" } } }, { "prompt_en": "a banana and a keyboard", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "banana and keyboard" } } }, { "prompt_en": "an apple and a cell phone", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "apple and cell phone" } } }, { "prompt_en": "a sandwich and a book", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "sandwich and 
book" } } }, { "prompt_en": "an orange and a clock", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "orange and clock" } } }, { "prompt_en": "broccoli and a backpack", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "broccoli and backpack" } } }, { "prompt_en": "a carrot and an umbrella", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "carrot and umbrella" } } }, { "prompt_en": "a hot dog and a handbag", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "hot dog and handbag" } } }, { "prompt_en": "a pizza and a tie", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "pizza and tie" } } }, { "prompt_en": "a donut and a suitcase", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "donut and suitcase" } } }, { "prompt_en": "a cake and a vase", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "cake and vase" } } }, { "prompt_en": "an oven and scissors", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "oven and scissors" } } }, { "prompt_en": "a toaster and a teddy bear", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "toaster and teddy bear" } } }, { "prompt_en": "a microwave and a frisbee", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "microwave and frisbee" } } }, { "prompt_en": "a refrigerator and skis", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "refrigerator and skis" } } }, { "prompt_en": "a bicycle and an airplane", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "bicycle and airplane" } } }, { "prompt_en": "a car and a train", "dimension": [ "multiple_objects" ], 
"auxiliary_info": { "multiple_objects": { "object": "car and train" } } }, { "prompt_en": "a motorcycle and a boat", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "motorcycle and boat" } } }, { "prompt_en": "a person and a toilet", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "person and toilet" } } }, { "prompt_en": "a person and a hair drier", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "person and hair drier" } } }, { "prompt_en": "a person and a toothbrush", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "person and toothbrush" } } }, { "prompt_en": "a person and a sink", "dimension": [ "multiple_objects" ], "auxiliary_info": { "multiple_objects": { "object": "person and sink" } } }, { "prompt_en": "A person is riding a bike", "dimension": [ "human_action" ] }, { "prompt_en": "A person is marching", "dimension": [ "human_action" ] }, { "prompt_en": "A person is roller skating", "dimension": [ "human_action" ] }, { "prompt_en": "A person is tasting beer", "dimension": [ "human_action" ] }, { "prompt_en": "A person is clapping", "dimension": [ "human_action" ] }, { "prompt_en": "A person is drawing", "dimension": [ "human_action" ] }, { "prompt_en": "A person is petting animal (not cat)", "dimension": [ "human_action" ] }, { "prompt_en": "A person is eating watermelon", "dimension": [ "human_action" ] }, { "prompt_en": "A person is playing harp", "dimension": [ "human_action" ] }, { "prompt_en": "A person is wrestling", "dimension": [ "human_action" ] }, { "prompt_en": "A person is riding scooter", "dimension": [ "human_action" ] }, { "prompt_en": "A person is sweeping floor", "dimension": [ "human_action" ] }, { "prompt_en": "A person is skateboarding", "dimension": [ "human_action" ] }, { "prompt_en": "A person is dunking basketball", "dimension": [ "human_action" ] }, { "prompt_en": "A 
person is playing flute", "dimension": [ "human_action" ] }, { "prompt_en": "A person is stretching leg", "dimension": [ "human_action" ] }, { "prompt_en": "A person is tying tie", "dimension": [ "human_action" ] }, { "prompt_en": "A person is skydiving", "dimension": [ "human_action" ] }, { "prompt_en": "A person is shooting goal (soccer)", "dimension": [ "human_action" ] }, { "prompt_en": "A person is playing piano", "dimension": [ "human_action" ] }, { "prompt_en": "A person is finger snapping", "dimension": [ "human_action" ] }, { "prompt_en": "A person is canoeing or kayaking", "dimension": [ "human_action" ] }, { "prompt_en": "A person is laughing", "dimension": [ "human_action" ] }, { "prompt_en": "A person is digging", "dimension": [ "human_action" ] }, { "prompt_en": "A person is clay pottery making", "dimension": [ "human_action" ] }, { "prompt_en": "A person is shooting basketball", "dimension": [ "human_action" ] }, { "prompt_en": "A person is bending back", "dimension": [ "human_action" ] }, { "prompt_en": "A person is shaking hands", "dimension": [ "human_action" ] }, { "prompt_en": "A person is bandaging", "dimension": [ "human_action" ] }, { "prompt_en": "A person is push up", "dimension": [ "human_action" ] }, { "prompt_en": "A person is catching or throwing frisbee", "dimension": [ "human_action" ] }, { "prompt_en": "A person is playing trumpet", "dimension": [ "human_action" ] }, { "prompt_en": "A person is flying kite", "dimension": [ "human_action" ] }, { "prompt_en": "A person is filling eyebrows", "dimension": [ "human_action" ] }, { "prompt_en": "A person is shuffling cards", "dimension": [ "human_action" ] }, { "prompt_en": "A person is folding clothes", "dimension": [ "human_action" ] }, { "prompt_en": "A person is smoking", "dimension": [ "human_action" ] }, { "prompt_en": "A person is tai chi", "dimension": [ "human_action" ] }, { "prompt_en": "A person is squat", "dimension": [ "human_action" ] }, { "prompt_en": "A person is playing 
controller", "dimension": [ "human_action" ] }, { "prompt_en": "A person is throwing axe", "dimension": [ "human_action" ] }, { "prompt_en": "A person is giving or receiving award", "dimension": [ "human_action" ] }, { "prompt_en": "A person is air drumming", "dimension": [ "human_action" ] }, { "prompt_en": "A person is taking a shower", "dimension": [ "human_action" ] }, { "prompt_en": "A person is planting trees", "dimension": [ "human_action" ] }, { "prompt_en": "A person is sharpening knives", "dimension": [ "human_action" ] }, { "prompt_en": "A person is robot dancing", "dimension": [ "human_action" ] }, { "prompt_en": "A person is rock climbing", "dimension": [ "human_action" ] }, { "prompt_en": "A person is hula hooping", "dimension": [ "human_action" ] }, { "prompt_en": "A person is writing", "dimension": [ "human_action" ] }, { "prompt_en": "A person is bungee jumping", "dimension": [ "human_action" ] }, { "prompt_en": "A person is pushing cart", "dimension": [ "human_action" ] }, { "prompt_en": "A person is cleaning windows", "dimension": [ "human_action" ] }, { "prompt_en": "A person is cutting watermelon", "dimension": [ "human_action" ] }, { "prompt_en": "A person is cheerleading", "dimension": [ "human_action" ] }, { "prompt_en": "A person is washing hands", "dimension": [ "human_action" ] }, { "prompt_en": "A person is ironing", "dimension": [ "human_action" ] }, { "prompt_en": "A person is cutting nails", "dimension": [ "human_action" ] }, { "prompt_en": "A person is hugging", "dimension": [ "human_action" ] }, { "prompt_en": "A person is trimming or shaving beard", "dimension": [ "human_action" ] }, { "prompt_en": "A person is jogging", "dimension": [ "human_action" ] }, { "prompt_en": "A person is making bed", "dimension": [ "human_action" ] }, { "prompt_en": "A person is washing dishes", "dimension": [ "human_action" ] }, { "prompt_en": "A person is grooming dog", "dimension": [ "human_action" ] }, { "prompt_en": "A person is doing laundry", 
"dimension": [ "human_action" ] }, { "prompt_en": "A person is knitting", "dimension": [ "human_action" ] }, { "prompt_en": "A person is reading book", "dimension": [ "human_action" ] }, { "prompt_en": "A person is baby waking up", "dimension": [ "human_action" ] }, { "prompt_en": "A person is massaging legs", "dimension": [ "human_action" ] }, { "prompt_en": "A person is brushing teeth", "dimension": [ "human_action" ] }, { "prompt_en": "A person is crawling baby", "dimension": [ "human_action" ] }, { "prompt_en": "A person is motorcycling", "dimension": [ "human_action" ] }, { "prompt_en": "A person is driving car", "dimension": [ "human_action" ] }, { "prompt_en": "A person is sticking tongue out", "dimension": [ "human_action" ] }, { "prompt_en": "A person is shaking head", "dimension": [ "human_action" ] }, { "prompt_en": "A person is sword fighting", "dimension": [ "human_action" ] }, { "prompt_en": "A person is doing aerobics", "dimension": [ "human_action" ] }, { "prompt_en": "A person is strumming guitar", "dimension": [ "human_action" ] }, { "prompt_en": "A person is riding or walking with horse", "dimension": [ "human_action" ] }, { "prompt_en": "A person is archery", "dimension": [ "human_action" ] }, { "prompt_en": "A person is catching or throwing baseball", "dimension": [ "human_action" ] }, { "prompt_en": "A person is playing chess", "dimension": [ "human_action" ] }, { "prompt_en": "A person is rock scissors paper", "dimension": [ "human_action" ] }, { "prompt_en": "A person is using computer", "dimension": [ "human_action" ] }, { "prompt_en": "A person is arranging flowers", "dimension": [ "human_action" ] }, { "prompt_en": "A person is bending metal", "dimension": [ "human_action" ] }, { "prompt_en": "A person is ice skating", "dimension": [ "human_action" ] }, { "prompt_en": "A person is climbing a rope", "dimension": [ "human_action" ] }, { "prompt_en": "A person is crying", "dimension": [ "human_action" ] }, { "prompt_en": "A person is dancing 
ballet", "dimension": [ "human_action" ] }, { "prompt_en": "A person is getting a haircut", "dimension": [ "human_action" ] }, { "prompt_en": "A person is running on treadmill", "dimension": [ "human_action" ] }, { "prompt_en": "A person is kissing", "dimension": [ "human_action" ] }, { "prompt_en": "A person is counting money", "dimension": [ "human_action" ] }, { "prompt_en": "A person is barbequing", "dimension": [ "human_action" ] }, { "prompt_en": "A person is peeling apples", "dimension": [ "human_action" ] }, { "prompt_en": "A person is milking cow", "dimension": [ "human_action" ] }, { "prompt_en": "A person is shining shoes", "dimension": [ "human_action" ] }, { "prompt_en": "A person is making snowman", "dimension": [ "human_action" ] }, { "prompt_en": "A person is sailing", "dimension": [ "human_action" ] }, { "prompt_en": "a person swimming in ocean", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a person giving a presentation to a room full of colleagues", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a person washing the dishes", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a person eating a burger", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a person walking in the snowstorm", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a person drinking coffee in a cafe", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a person playing guitar", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a bicycle leaning against a tree", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a bicycle gliding through a snowy field", "dimension": [ "subject_consistency", 
"dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a bicycle slowing down to stop", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a bicycle accelerating to gain speed", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a car stuck in traffic during rush hour", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a car turning a corner", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a car slowing down to stop", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a car accelerating to gain speed", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a motorcycle cruising along a coastal highway", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a motorcycle turning a corner", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a motorcycle slowing down to stop", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a motorcycle gliding through a snowy field", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a motorcycle accelerating to gain speed", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "an airplane soaring through a clear blue sky", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "an airplane taking off", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "an airplane landing smoothly on a runway", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "an airplane accelerating to gain speed", "dimension": [ 
"subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a bus turning a corner", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a bus stuck in traffic during rush hour", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a bus accelerating to gain speed", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a train speeding down the tracks", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a train crossing over a tall bridge", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a train accelerating to gain speed", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a truck turning a corner", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a truck anchored in a tranquil bay", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a truck stuck in traffic during rush hour", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a truck slowing down to stop", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a truck accelerating to gain speed", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a boat sailing smoothly on a calm lake", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a boat slowing down to stop", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a boat accelerating to gain speed", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a bird soaring gracefully in the sky", "dimension": [ 
"subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a bird building a nest from twigs and leaves", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a bird flying over a snowy forest", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a cat grooming itself meticulously with its tongue", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a cat playing in park", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a cat drinking water", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a cat running happily", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a dog enjoying a peaceful walk", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a dog playing in park", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a dog drinking water", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a dog running happily", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a horse bending down to drink water from a river", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a horse galloping across an open field", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a horse taking a peaceful walk", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a horse running to join a herd of its kind", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a sheep bending down to drink water from a river", "dimension": [ 
"subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a sheep taking a peaceful walk", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a sheep running to join a herd of its kind", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a cow bending down to drink water from a river", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a cow chewing cud while resting in a tranquil barn", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a cow running to join a herd of its kind", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "an elephant spraying itself with water using its trunk to cool down", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "an elephant taking a peaceful walk", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "an elephant running to join a herd of its kind", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a bear catching a salmon in its powerful jaws", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a bear sniffing the air for scents of food", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a bear climbing a tree", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a bear hunting for prey", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a zebra bending down to drink water from a river", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a zebra running to join a herd of its kind", "dimension": [ "subject_consistency", 
"dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a zebra taking a peaceful walk", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a giraffe bending down to drink water from a river", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a giraffe taking a peaceful walk", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a giraffe running to join a herd of its kind", "dimension": [ "subject_consistency", "dynamic_degree", "motion_smoothness" ] }, { "prompt_en": "a person", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "person" } } }, { "prompt_en": "a bicycle", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "bicycle" } } }, { "prompt_en": "a car", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "car" } } }, { "prompt_en": "a motorcycle", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "motorcycle" } } }, { "prompt_en": "an airplane", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "airplane" } } }, { "prompt_en": "a bus", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "bus" } } }, { "prompt_en": "a train", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "train" } } }, { "prompt_en": "a truck", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "truck" } } }, { "prompt_en": "a boat", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "boat" } } }, { "prompt_en": "a traffic light", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "traffic light" } } }, { "prompt_en": "a fire hydrant", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "fire hydrant" } } }, { "prompt_en": 
"a stop sign", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "stop sign" } } }, { "prompt_en": "a parking meter", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "parking meter" } } }, { "prompt_en": "a bench", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "bench" } } }, { "prompt_en": "a bird", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "bird" } } }, { "prompt_en": "a cat", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "cat" } } }, { "prompt_en": "a dog", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "dog" } } }, { "prompt_en": "a horse", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "horse" } } }, { "prompt_en": "a sheep", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "sheep" } } }, { "prompt_en": "a cow", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "cow" } } }, { "prompt_en": "an elephant", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "elephant" } } }, { "prompt_en": "a bear", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "bear" } } }, { "prompt_en": "a zebra", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "zebra" } } }, { "prompt_en": "a giraffe", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "giraffe" } } }, { "prompt_en": "a backpack", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "backpack" } } }, { "prompt_en": "an umbrella", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "umbrella" } } }, { "prompt_en": "a handbag", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "handbag" } } }, { "prompt_en": "a tie", 
"dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "tie" } } }, { "prompt_en": "a suitcase", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "suitcase" } } }, { "prompt_en": "a frisbee", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "frisbee" } } }, { "prompt_en": "skis", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "skis" } } }, { "prompt_en": "a snowboard", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "snowboard" } } }, { "prompt_en": "a sports ball", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "sports ball" } } }, { "prompt_en": "a kite", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "kite" } } }, { "prompt_en": "a baseball bat", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "baseball bat" } } }, { "prompt_en": "a baseball glove", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "baseball glove" } } }, { "prompt_en": "a skateboard", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "skateboard" } } }, { "prompt_en": "a surfboard", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "surfboard" } } }, { "prompt_en": "a tennis racket", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "tennis racket" } } }, { "prompt_en": "a bottle", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "bottle" } } }, { "prompt_en": "a wine glass", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "wine glass" } } }, { "prompt_en": "a cup", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "cup" } } }, { "prompt_en": "a fork", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { 
"object": "fork" } } }, { "prompt_en": "a knife", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "knife" } } }, { "prompt_en": "a spoon", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "spoon" } } }, { "prompt_en": "a bowl", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "bowl" } } }, { "prompt_en": "a banana", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "banana" } } }, { "prompt_en": "an apple", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "apple" } } }, { "prompt_en": "a sandwich", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "sandwich" } } }, { "prompt_en": "an orange", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "orange" } } }, { "prompt_en": "broccoli", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "broccoli" } } }, { "prompt_en": "a carrot", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "carrot" } } }, { "prompt_en": "a hot dog", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "hot dog" } } }, { "prompt_en": "a pizza", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "pizza" } } }, { "prompt_en": "a donut", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "donut" } } }, { "prompt_en": "a cake", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "cake" } } }, { "prompt_en": "a chair", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "chair" } } }, { "prompt_en": "a couch", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "couch" } } }, { "prompt_en": "a potted plant", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "potted 
plant" } } }, { "prompt_en": "a bed", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "bed" } } }, { "prompt_en": "a dining table", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "dining table" } } }, { "prompt_en": "a toilet", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "toilet" } } }, { "prompt_en": "a tv", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "tv" } } }, { "prompt_en": "a laptop", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "laptop" } } }, { "prompt_en": "a remote", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "remote" } } }, { "prompt_en": "a keyboard", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "keyboard" } } }, { "prompt_en": "a cell phone", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "cell phone" } } }, { "prompt_en": "a microwave", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "microwave" } } }, { "prompt_en": "an oven", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "oven" } } }, { "prompt_en": "a toaster", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "toaster" } } }, { "prompt_en": "a sink", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "sink" } } }, { "prompt_en": "a refrigerator", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "refrigerator" } } }, { "prompt_en": "a book", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "book" } } }, { "prompt_en": "a clock", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "clock" } } }, { "prompt_en": "a vase", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": 
"vase" } } }, { "prompt_en": "scissors", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "scissors" } } }, { "prompt_en": "a teddy bear", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "teddy bear" } } }, { "prompt_en": "a hair drier", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "hair drier" } } }, { "prompt_en": "a toothbrush", "dimension": [ "object_class" ], "auxiliary_info": { "object_class": { "object": "toothbrush" } } }, { "prompt_en": "a red bicycle", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "red" } } }, { "prompt_en": "a green bicycle", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "green" } } }, { "prompt_en": "a blue bicycle", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "blue" } } }, { "prompt_en": "a yellow bicycle", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "yellow" } } }, { "prompt_en": "an orange bicycle", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "orange" } } }, { "prompt_en": "a purple bicycle", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "purple" } } }, { "prompt_en": "a pink bicycle", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "pink" } } }, { "prompt_en": "a black bicycle", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "black" } } }, { "prompt_en": "a white bicycle", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "white" } } }, { "prompt_en": "a red car", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "red" } } }, { "prompt_en": "a green car", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "green" } } }, { "prompt_en": "a blue car", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "blue" } } }, { "prompt_en": "a yellow car", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": 
"yellow" } } }, { "prompt_en": "an orange car", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "orange" } } }, { "prompt_en": "a purple car", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "purple" } } }, { "prompt_en": "a pink car", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "pink" } } }, { "prompt_en": "a black car", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "black" } } }, { "prompt_en": "a white car", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "white" } } }, { "prompt_en": "a red bird", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "red" } } }, { "prompt_en": "a green bird", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "green" } } }, { "prompt_en": "a blue bird", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "blue" } } }, { "prompt_en": "a yellow bird", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "yellow" } } }, { "prompt_en": "an orange bird", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "orange" } } }, { "prompt_en": "a purple bird", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "purple" } } }, { "prompt_en": "a pink bird", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "pink" } } }, { "prompt_en": "a black bird", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "black" } } }, { "prompt_en": "a white bird", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "white" } } }, { "prompt_en": "a black cat", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "black" } } }, { "prompt_en": "a white cat", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "white" } } }, { "prompt_en": "an orange cat", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "orange" } } }, { "prompt_en": "a yellow cat", "dimension": [ "color" ], "auxiliary_info": { 
"color": { "color": "yellow" } } }, { "prompt_en": "a red umbrella", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "red" } } }, { "prompt_en": "a green umbrella", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "green" } } }, { "prompt_en": "a blue umbrella", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "blue" } } }, { "prompt_en": "a yellow umbrella", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "yellow" } } }, { "prompt_en": "an orange umbrella", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "orange" } } }, { "prompt_en": "a purple umbrella", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "purple" } } }, { "prompt_en": "a pink umbrella", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "pink" } } }, { "prompt_en": "a black umbrella", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "black" } } }, { "prompt_en": "a white umbrella", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "white" } } }, { "prompt_en": "a red suitcase", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "red" } } }, { "prompt_en": "a green suitcase", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "green" } } }, { "prompt_en": "a blue suitcase", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "blue" } } }, { "prompt_en": "a yellow suitcase", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "yellow" } } }, { "prompt_en": "an orange suitcase", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "orange" } } }, { "prompt_en": "a purple suitcase", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "purple" } } }, { "prompt_en": "a pink suitcase", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "pink" } } }, { "prompt_en": "a black suitcase", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "black" 
} } }, { "prompt_en": "a white suitcase", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "white" } } }, { "prompt_en": "a red bowl", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "red" } } }, { "prompt_en": "a green bowl", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "green" } } }, { "prompt_en": "a blue bowl", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "blue" } } }, { "prompt_en": "a yellow bowl", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "yellow" } } }, { "prompt_en": "an orange bowl", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "orange" } } }, { "prompt_en": "a purple bowl", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "purple" } } }, { "prompt_en": "a pink bowl", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "pink" } } }, { "prompt_en": "a black bowl", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "black" } } }, { "prompt_en": "a white bowl", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "white" } } }, { "prompt_en": "a red chair", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "red" } } }, { "prompt_en": "a green chair", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "green" } } }, { "prompt_en": "a blue chair", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "blue" } } }, { "prompt_en": "a yellow chair", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "yellow" } } }, { "prompt_en": "an orange chair", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "orange" } } }, { "prompt_en": "a purple chair", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "purple" } } }, { "prompt_en": "a pink chair", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "pink" } } }, { "prompt_en": "a black chair", "dimension": [ "color" ], "auxiliary_info": { 
"color": { "color": "black" } } }, { "prompt_en": "a white chair", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "white" } } }, { "prompt_en": "a red clock", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "red" } } }, { "prompt_en": "a green clock", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "green" } } }, { "prompt_en": "a blue clock", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "blue" } } }, { "prompt_en": "a yellow clock", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "yellow" } } }, { "prompt_en": "an orange clock", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "orange" } } }, { "prompt_en": "a purple clock", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "purple" } } }, { "prompt_en": "a pink clock", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "pink" } } }, { "prompt_en": "a black clock", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "black" } } }, { "prompt_en": "a white clock", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "white" } } }, { "prompt_en": "a red vase", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "red" } } }, { "prompt_en": "a green vase", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "green" } } }, { "prompt_en": "a blue vase", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "blue" } } }, { "prompt_en": "a yellow vase", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "yellow" } } }, { "prompt_en": "an orange vase", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "orange" } } }, { "prompt_en": "a purple vase", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "purple" } } }, { "prompt_en": "a pink vase", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "pink" } } }, { "prompt_en": "a black vase", "dimension": [ 
"color" ], "auxiliary_info": { "color": { "color": "black" } } }, { "prompt_en": "a white vase", "dimension": [ "color" ], "auxiliary_info": { "color": { "color": "white" } } }, { "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, Van Gogh style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "Van Gogh style" } } }, { "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, oil painting", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "oil painting" } } }, { "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand by Hokusai, in the style of Ukiyo", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "by Hokusai, in the style of Ukiyo" } } }, { "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, black and white", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "black and white" } } }, { "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, pixel art", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "pixel art" } } }, { "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, in cyberpunk style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "in cyberpunk style" } } }, { "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, animated style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "animated style" } } }, { "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, watercolor painting", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "watercolor painting" } } }, { "prompt_en": "A beautiful coastal beach in 
spring, waves lapping on sand, surrealism style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "surrealism style" } } }, { "prompt_en": "The bund Shanghai, Van Gogh style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "Van Gogh style" } } }, { "prompt_en": "The bund Shanghai, oil painting", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "oil painting" } } }, { "prompt_en": "The bund Shanghai by Hokusai, in the style of Ukiyo", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "by Hokusai, in the style of Ukiyo" } } }, { "prompt_en": "The bund Shanghai, black and white", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "black and white" } } }, { "prompt_en": "The bund Shanghai, pixel art", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "pixel art" } } }, { "prompt_en": "The bund Shanghai, in cyberpunk style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "in cyberpunk style" } } }, { "prompt_en": "The bund Shanghai, animated style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "animated style" } } }, { "prompt_en": "The bund Shanghai, watercolor painting", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "watercolor painting" } } }, { "prompt_en": "The bund Shanghai, surrealism style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "surrealism style" } } }, { "prompt_en": "a shark is swimming in the ocean, Van Gogh style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "Van Gogh style" } } }, { 
"prompt_en": "a shark is swimming in the ocean, oil painting", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "oil painting" } } }, { "prompt_en": "a shark is swimming in the ocean by Hokusai, in the style of Ukiyo", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "by Hokusai, in the style of Ukiyo" } } }, { "prompt_en": "a shark is swimming in the ocean, black and white", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "black and white" } } }, { "prompt_en": "a shark is swimming in the ocean, pixel art", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "pixel art" } } }, { "prompt_en": "a shark is swimming in the ocean, in cyberpunk style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "in cyberpunk style" } } }, { "prompt_en": "a shark is swimming in the ocean, animated style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "animated style" } } }, { "prompt_en": "a shark is swimming in the ocean, watercolor painting", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "watercolor painting" } } }, { "prompt_en": "a shark is swimming in the ocean, surrealism style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "surrealism style" } } }, { "prompt_en": "A panda drinking coffee in a cafe in Paris, Van Gogh style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "Van Gogh style" } } }, { "prompt_en": "A panda drinking coffee in a cafe in Paris, oil painting", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "oil painting" } } }, { "prompt_en": "A panda drinking 
coffee in a cafe in Paris by Hokusai, in the style of Ukiyo", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "by Hokusai, in the style of Ukiyo" } } }, { "prompt_en": "A panda drinking coffee in a cafe in Paris, black and white", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "black and white" } } }, { "prompt_en": "A panda drinking coffee in a cafe in Paris, pixel art", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "pixel art" } } }, { "prompt_en": "A panda drinking coffee in a cafe in Paris, in cyberpunk style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "in cyberpunk style" } } }, { "prompt_en": "A panda drinking coffee in a cafe in Paris, animated style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "animated style" } } }, { "prompt_en": "A panda drinking coffee in a cafe in Paris, watercolor painting", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "watercolor painting" } } }, { "prompt_en": "A panda drinking coffee in a cafe in Paris, surrealism style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "surrealism style" } } }, { "prompt_en": "A cute happy Corgi playing in park, sunset, Van Gogh style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "Van Gogh style" } } }, { "prompt_en": "A cute happy Corgi playing in park, sunset, oil painting", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "oil painting" } } }, { "prompt_en": "A cute happy Corgi playing in park, sunset by Hokusai, in the style of Ukiyo", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { 
"appearance_style": "by Hokusai, in the style of Ukiyo" } } }, { "prompt_en": "A cute happy Corgi playing in park, sunset, black and white", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "black and white" } } }, { "prompt_en": "A cute happy Corgi playing in park, sunset, pixel art", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "pixel art" } } }, { "prompt_en": "A cute happy Corgi playing in park, sunset, in cyberpunk style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "in cyberpunk style" } } }, { "prompt_en": "A cute happy Corgi playing in park, sunset, animated style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "animated style" } } }, { "prompt_en": "A cute happy Corgi playing in park, sunset, watercolor painting", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "watercolor painting" } } }, { "prompt_en": "A cute happy Corgi playing in park, sunset, surrealism style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "surrealism style" } } }, { "prompt_en": "Gwen Stacy reading a book, Van Gogh style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "Van Gogh style" } } }, { "prompt_en": "Gwen Stacy reading a book, oil painting", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "oil painting" } } }, { "prompt_en": "Gwen Stacy reading a book by Hokusai, in the style of Ukiyo", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "by Hokusai, in the style of Ukiyo" } } }, { "prompt_en": "Gwen Stacy reading a book, black and white", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { 
"appearance_style": "black and white" } } }, { "prompt_en": "Gwen Stacy reading a book, pixel art", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "pixel art" } } }, { "prompt_en": "Gwen Stacy reading a book, in cyberpunk style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "in cyberpunk style" } } }, { "prompt_en": "Gwen Stacy reading a book, animated style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "animated style" } } }, { "prompt_en": "Gwen Stacy reading a book, watercolor painting", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "watercolor painting" } } }, { "prompt_en": "Gwen Stacy reading a book, surrealism style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "surrealism style" } } }, { "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, Van Gogh style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "Van Gogh style" } } }, { "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, oil painting", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "oil painting" } } }, { "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background by Hokusai, in the style of Ukiyo", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "by Hokusai, in the style of Ukiyo" } } }, { "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, black and white", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "black and white" } } }, { "prompt_en": "A boat sailing 
leisurely along the Seine River with the Eiffel Tower in background, pixel art", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "pixel art" } } }, { "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, in cyberpunk style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "in cyberpunk style" } } }, { "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, animated style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "animated style" } } }, { "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, watercolor painting", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "watercolor painting" } } }, { "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, surrealism style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "surrealism style" } } }, { "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, Van Gogh style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "Van Gogh style" } } }, { "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, oil painting", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "oil painting" } } }, { "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas by Hokusai, in the style of Ukiyo", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "by Hokusai, in the style of Ukiyo" } } }, { "prompt_en": "A couple in formal evening wear 
going home get caught in a heavy downpour with umbrellas, black and white", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "black and white" } } }, { "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pixel art", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "pixel art" } } }, { "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, in cyberpunk style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "in cyberpunk style" } } }, { "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, animated style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "animated style" } } }, { "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, watercolor painting", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "watercolor painting" } } }, { "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, surrealism style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "surrealism style" } } }, { "prompt_en": "An astronaut flying in space, Van Gogh style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "Van Gogh style" } } }, { "prompt_en": "An astronaut flying in space, oil painting", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "oil painting" } } }, { "prompt_en": "An astronaut flying in space by Hokusai, in the style of Ukiyo", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "by 
Hokusai, in the style of Ukiyo" } } }, { "prompt_en": "An astronaut flying in space, black and white", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "black and white" } } }, { "prompt_en": "An astronaut flying in space, pixel art", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "pixel art" } } }, { "prompt_en": "An astronaut flying in space, in cyberpunk style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "in cyberpunk style" } } }, { "prompt_en": "An astronaut flying in space, animated style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "animated style" } } }, { "prompt_en": "An astronaut flying in space, watercolor painting", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "watercolor painting" } } }, { "prompt_en": "An astronaut flying in space, surrealism style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "surrealism style" } } }, { "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, Van Gogh style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "Van Gogh style" } } }, { "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, oil painting", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "oil painting" } } }, { "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. 
the canyons twist and bend through the high elevated mountain peaks by Hokusai, in the style of Ukiyo", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "by Hokusai, in the style of Ukiyo" } } }, { "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, black and white", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "black and white" } } }, { "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pixel art", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "pixel art" } } }, { "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, in cyberpunk style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "in cyberpunk style" } } }, { "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, animated style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "animated style" } } }, { "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, watercolor painting", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "watercolor painting" } } }, { "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. 
the canyons twist and bend through the high elevated mountain peaks, surrealism style", "dimension": [ "appearance_style" ], "auxiliary_info": { "appearance_style": { "appearance_style": "surrealism style" } } }, { "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, in super slow motion", "dimension": [ "temporal_style" ] }, { "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, zoom in", "dimension": [ "temporal_style" ] }, { "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, zoom out", "dimension": [ "temporal_style" ] }, { "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, pan left", "dimension": [ "temporal_style" ] }, { "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, pan right", "dimension": [ "temporal_style" ] }, { "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, tilt up", "dimension": [ "temporal_style" ] }, { "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, tilt down", "dimension": [ "temporal_style" ] }, { "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, with an intense shaking effect", "dimension": [ "temporal_style" ] }, { "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, featuring a steady and smooth perspective", "dimension": [ "temporal_style" ] }, { "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, racking focus", "dimension": [ "temporal_style" ] }, { "prompt_en": "The bund Shanghai, in super slow motion", "dimension": [ "temporal_style" ] }, { "prompt_en": "The bund Shanghai, zoom in", "dimension": [ "temporal_style" ] }, { "prompt_en": "The bund Shanghai, zoom out", "dimension": [ "temporal_style" ] }, { "prompt_en": "The bund Shanghai, pan left", "dimension": [ "temporal_style" ] }, { "prompt_en": "The bund Shanghai, pan right", "dimension": [ "temporal_style" ] }, { "prompt_en": "The bund Shanghai, tilt up", 
"dimension": [ "temporal_style" ] }, { "prompt_en": "The bund Shanghai, tilt down", "dimension": [ "temporal_style" ] }, { "prompt_en": "The bund Shanghai, with an intense shaking effect", "dimension": [ "temporal_style" ] }, { "prompt_en": "The bund Shanghai, featuring a steady and smooth perspective", "dimension": [ "temporal_style" ] }, { "prompt_en": "The bund Shanghai, racking focus", "dimension": [ "temporal_style" ] }, { "prompt_en": "a shark is swimming in the ocean, in super slow motion", "dimension": [ "temporal_style" ] }, { "prompt_en": "a shark is swimming in the ocean, zoom in", "dimension": [ "temporal_style" ] }, { "prompt_en": "a shark is swimming in the ocean, zoom out", "dimension": [ "temporal_style" ] }, { "prompt_en": "a shark is swimming in the ocean, pan left", "dimension": [ "temporal_style" ] }, { "prompt_en": "a shark is swimming in the ocean, pan right", "dimension": [ "temporal_style" ] }, { "prompt_en": "a shark is swimming in the ocean, tilt up", "dimension": [ "temporal_style" ] }, { "prompt_en": "a shark is swimming in the ocean, tilt down", "dimension": [ "temporal_style" ] }, { "prompt_en": "a shark is swimming in the ocean, with an intense shaking effect", "dimension": [ "temporal_style" ] }, { "prompt_en": "a shark is swimming in the ocean, featuring a steady and smooth perspective", "dimension": [ "temporal_style" ] }, { "prompt_en": "a shark is swimming in the ocean, racking focus", "dimension": [ "temporal_style" ] }, { "prompt_en": "A panda drinking coffee in a cafe in Paris, in super slow motion", "dimension": [ "temporal_style" ] }, { "prompt_en": "A panda drinking coffee in a cafe in Paris, zoom in", "dimension": [ "temporal_style" ] }, { "prompt_en": "A panda drinking coffee in a cafe in Paris, zoom out", "dimension": [ "temporal_style" ] }, { "prompt_en": "A panda drinking coffee in a cafe in Paris, pan left", "dimension": [ "temporal_style" ] }, { "prompt_en": "A panda drinking coffee in a cafe in Paris, pan right", 
"dimension": [ "temporal_style" ] }, { "prompt_en": "A panda drinking coffee in a cafe in Paris, tilt up", "dimension": [ "temporal_style" ] }, { "prompt_en": "A panda drinking coffee in a cafe in Paris, tilt down", "dimension": [ "temporal_style" ] }, { "prompt_en": "A panda drinking coffee in a cafe in Paris, with an intense shaking effect", "dimension": [ "temporal_style" ] }, { "prompt_en": "A panda drinking coffee in a cafe in Paris, featuring a steady and smooth perspective", "dimension": [ "temporal_style" ] }, { "prompt_en": "A panda drinking coffee in a cafe in Paris, racking focus", "dimension": [ "temporal_style" ] }, { "prompt_en": "A cute happy Corgi playing in park, sunset, in super slow motion", "dimension": [ "temporal_style" ] }, { "prompt_en": "A cute happy Corgi playing in park, sunset, zoom in", "dimension": [ "temporal_style" ] }, { "prompt_en": "A cute happy Corgi playing in park, sunset, zoom out", "dimension": [ "temporal_style" ] }, { "prompt_en": "A cute happy Corgi playing in park, sunset, pan left", "dimension": [ "temporal_style" ] }, { "prompt_en": "A cute happy Corgi playing in park, sunset, pan right", "dimension": [ "temporal_style" ] }, { "prompt_en": "A cute happy Corgi playing in park, sunset, tilt up", "dimension": [ "temporal_style" ] }, { "prompt_en": "A cute happy Corgi playing in park, sunset, tilt down", "dimension": [ "temporal_style" ] }, { "prompt_en": "A cute happy Corgi playing in park, sunset, with an intense shaking effect", "dimension": [ "temporal_style" ] }, { "prompt_en": "A cute happy Corgi playing in park, sunset, featuring a steady and smooth perspective", "dimension": [ "temporal_style" ] }, { "prompt_en": "A cute happy Corgi playing in park, sunset, racking focus", "dimension": [ "temporal_style" ] }, { "prompt_en": "Gwen Stacy reading a book, in super slow motion", "dimension": [ "temporal_style" ] }, { "prompt_en": "Gwen Stacy reading a book, zoom in", "dimension": [ "temporal_style" ] }, { "prompt_en": 
"Gwen Stacy reading a book, zoom out", "dimension": [ "temporal_style" ] }, { "prompt_en": "Gwen Stacy reading a book, pan left", "dimension": [ "temporal_style" ] }, { "prompt_en": "Gwen Stacy reading a book, pan right", "dimension": [ "temporal_style" ] }, { "prompt_en": "Gwen Stacy reading a book, tilt up", "dimension": [ "temporal_style" ] }, { "prompt_en": "Gwen Stacy reading a book, tilt down", "dimension": [ "temporal_style" ] }, { "prompt_en": "Gwen Stacy reading a book, with an intense shaking effect", "dimension": [ "temporal_style" ] }, { "prompt_en": "Gwen Stacy reading a book, featuring a steady and smooth perspective", "dimension": [ "temporal_style" ] }, { "prompt_en": "Gwen Stacy reading a book, racking focus", "dimension": [ "temporal_style" ] }, { "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, in super slow motion", "dimension": [ "temporal_style" ] }, { "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, zoom in", "dimension": [ "temporal_style" ] }, { "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, zoom out", "dimension": [ "temporal_style" ] }, { "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pan left", "dimension": [ "temporal_style" ] }, { "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pan right", "dimension": [ "temporal_style" ] }, { "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, tilt up", "dimension": [ "temporal_style" ] }, { "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, tilt down", "dimension": [ "temporal_style" ] }, { "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, with an intense shaking effect", "dimension": [ "temporal_style" ] }, { 
"prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, featuring a steady and smooth perspective", "dimension": [ "temporal_style" ] }, { "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, racking focus", "dimension": [ "temporal_style" ] }, { "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, in super slow motion", "dimension": [ "temporal_style" ] }, { "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, zoom in", "dimension": [ "temporal_style" ] }, { "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, zoom out", "dimension": [ "temporal_style" ] }, { "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pan left", "dimension": [ "temporal_style" ] }, { "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pan right", "dimension": [ "temporal_style" ] }, { "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, tilt up", "dimension": [ "temporal_style" ] }, { "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, tilt down", "dimension": [ "temporal_style" ] }, { "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, with an intense shaking effect", "dimension": [ "temporal_style" ] }, { "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, featuring a steady and smooth perspective", "dimension": [ "temporal_style" ] }, { "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, racking focus", "dimension": [ "temporal_style" ] }, { "prompt_en": "An astronaut flying in space, in super 
slow motion", "dimension": [ "temporal_style" ] }, { "prompt_en": "An astronaut flying in space, zoom in", "dimension": [ "temporal_style" ] }, { "prompt_en": "An astronaut flying in space, zoom out", "dimension": [ "temporal_style" ] }, { "prompt_en": "An astronaut flying in space, pan left", "dimension": [ "temporal_style" ] }, { "prompt_en": "An astronaut flying in space, pan right", "dimension": [ "temporal_style" ] }, { "prompt_en": "An astronaut flying in space, tilt up", "dimension": [ "temporal_style" ] }, { "prompt_en": "An astronaut flying in space, tilt down", "dimension": [ "temporal_style" ] }, { "prompt_en": "An astronaut flying in space, with an intense shaking effect", "dimension": [ "temporal_style" ] }, { "prompt_en": "An astronaut flying in space, featuring a steady and smooth perspective", "dimension": [ "temporal_style" ] }, { "prompt_en": "An astronaut flying in space, racking focus", "dimension": [ "temporal_style" ] }, { "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, in super slow motion", "dimension": [ "temporal_style" ] }, { "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, zoom in", "dimension": [ "temporal_style" ] }, { "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, zoom out", "dimension": [ "temporal_style" ] }, { "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pan left", "dimension": [ "temporal_style" ] }, { "prompt_en": "Snow rocky mountains peaks canyon. 
snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pan right", "dimension": [ "temporal_style" ] }, { "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, tilt up", "dimension": [ "temporal_style" ] }, { "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, tilt down", "dimension": [ "temporal_style" ] }, { "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, with an intense shaking effect", "dimension": [ "temporal_style" ] }, { "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, featuring a steady and smooth perspective", "dimension": [ "temporal_style" ] }, { "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. 
the canyons twist and bend through the high elevated mountain peaks, racking focus", "dimension": [ "temporal_style" ] }, { "prompt_en": "Close up of grapes on a rotating table.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Turtle swimming in ocean.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A storm trooper vacuuming the beach.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A panda standing on a surfboard in the ocean in sunset.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "An astronaut feeding ducks on a sunny afternoon, reflection from the water.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Two pandas discussing an academic paper.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Sunset time lapse at the beach with moving clouds and colors in the sky.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A fat rabbit wearing a purple robe walking through a fantasy landscape.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A koala bear playing piano in the forest.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "An astronaut flying in space.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Fireworks.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "An animated painting of fluffy white clouds moving in sky.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Flying through fantasy landscapes.", "dimension": [ "overall_consistency", "aesthetic_quality", 
"imaging_quality" ] }, { "prompt_en": "A bigfoot walking in the snowstorm.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A squirrel eating a burger.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A cat wearing sunglasses and working as a lifeguard at a pool.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Splash of turquoise water in extreme slow motion, alpha channel included.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "an ice cream is melting on the table.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "a drone flying over a snowy forest.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "a shark is swimming in the ocean.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Aerial panoramic video from a drone of a fantasy land.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "a teddy bear is swimming in the ocean.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "time lapse of sunrise on mars.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "golden fish swimming in the ocean.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "An artist brush painting on a canvas close up.", "dimension": [ "overall_consistency", "aesthetic_quality", 
"imaging_quality" ] }, { "prompt_en": "A drone view of celebration with Christmas tree and fireworks, starry sky - background.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "happy dog wearing a yellow turtleneck, studio, portrait, facing camera, dark background", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Origami dancers in white paper, 3D render, on white background, studio shot, dancing modern dance.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Campfire at night in a snowy forest with starry sky in the background.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "a fantasy landscape", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A 3D model of a 1800s victorian house.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "this is how I do makeup in the morning.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A raccoon that looks like a turtle, digital art.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Robot dancing in Times Square.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Busy freeway at night.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Balloon full of water exploding in extreme slow motion.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "An astronaut is riding a horse in the space in a photorealistic style.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Macro slo-mo. 
Slow motion cropped closeup of roasted coffee beans falling into an empty bowl.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Sewing machine, old sewing machine working.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Motion colour drop in water, ink swirling in water, colourful ink in water, abstraction fancy dream cloud of ink.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Few big purple plums rotating on the turntable. water drops appear on the skin during rotation. isolated on the white background. close-up. macro.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Vampire makeup face of beautiful girl, red contact lenses.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Ashtray full of butts on table, smoke flowing on black background, close-up", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Pacific coast, carmel by the sea ocean and waves.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A teddy bear is playing drum kit in NYC Times Square.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A corgi is playing drum kit.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "An Iron man is playing the electronic guitar, high electronic guitar.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A raccoon is playing the electronic guitar.", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background by Vincent van Gogh", "dimension": [ 
"overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A corgi's head depicted as an explosion of a nebula", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A fantasy landscape", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A future where humans have achieved teleportation technology", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A jellyfish floating through the ocean, with bioluminescent tentacles", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A Mars rover moving on Mars", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A panda drinking coffee in a cafe in Paris", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A space shuttle launching into orbit, with flames and smoke billowing out from the engines", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A steam train moving on a mountainside", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A super cool giant robot in Cyberpunk Beijing", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A tropical beach at sunrise, with palm trees and crystal-clear water in the foreground", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Cinematic shot of Van Gogh's selfie, Van Gogh style", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Gwen Stacy reading a book", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Iron Man flying in the sky", "dimension": [ "overall_consistency", "aesthetic_quality", 
"imaging_quality" ] }, { "prompt_en": "The bund Shanghai, oil painting", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Yoda playing guitar on the stage", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand by Hokusai, in the style of Ukiyo", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand by Vincent van Gogh", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A car moving slowly on an empty street, rainy evening", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A cat eating food out of a bowl", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A cat wearing sunglasses at a pool", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A confused panda in calculus class", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A cute fluffy panda eating Chinese food in a restaurant", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A cute happy Corgi playing in park, sunset", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A cute raccoon playing guitar in a boat on the ocean", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A happy fuzzy panda playing guitar nearby a campfire, snow mountain in the background", "dimension": [ "overall_consistency", 
"aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A lightning striking atop of eiffel tower, dark clouds in the sky", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A modern art museum, with colorful paintings", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A panda cooking in the kitchen", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A panda playing on a swing set", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A polar bear is playing guitar", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A raccoon dressed in suit playing the trumpet, stage background", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A robot DJ is playing the turntable, in heavy raining futuristic tokyo rooftop cyberpunk night, sci-fi, fantasy", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A shark swimming in clear Caribbean ocean", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A super robot protecting city", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "A teddy bear washing the dishes", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "An epic tornado attacking above a glowing city at night, the tornado is made of smoke", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "An oil painting of a couple in formal evening wear going home get caught in a heavy downpour with umbrellas", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Clown fish swimming through the coral reef", "dimension": [ 
"overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Hyper-realistic spaceship landing on Mars", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "The bund Shanghai, vibrant color", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Vincent van Gogh is painting in the room", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "Yellow flowers swing in the wind", "dimension": [ "overall_consistency", "aesthetic_quality", "imaging_quality" ] }, { "prompt_en": "alley", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "alley" } } } }, { "prompt_en": "amusement park", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "amusement park" } } } }, { "prompt_en": "aquarium", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "aquarium" } } } }, { "prompt_en": "arch", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "arch" } } } }, { "prompt_en": "art gallery", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "art gallery" } } } }, { "prompt_en": "bathroom", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "bathroom" } } } }, { "prompt_en": "bakery shop", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "bakery shop" } } } }, { "prompt_en": "ballroom", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "ballroom" } } } }, { "prompt_en": "bar", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "bar" } } } }, { "prompt_en": "barn", 
"dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "barn" } } } }, { "prompt_en": "basement", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "basement" } } } }, { "prompt_en": "beach", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "beach" } } } }, { "prompt_en": "bedroom", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "bedroom" } } } }, { "prompt_en": "bridge", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "bridge" } } } }, { "prompt_en": "botanical garden", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "botanical garden" } } } }, { "prompt_en": "cafeteria", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "cafeteria" } } } }, { "prompt_en": "campsite", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "campsite" } } } }, { "prompt_en": "campus", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "campus" } } } }, { "prompt_en": "carrousel", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "carrousel" } } } }, { "prompt_en": "castle", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "castle" } } } }, { "prompt_en": "cemetery", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "cemetery" } } } }, { "prompt_en": "classroom", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "classroom" } } } }, { "prompt_en": "cliff", "dimension": [ "scene", "background_consistency" ], 
"auxiliary_info": { "scene": { "scene": { "scene": "cliff" } } } }, { "prompt_en": "crosswalk", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "crosswalk" } } } }, { "prompt_en": "construction site", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "construction site" } } } }, { "prompt_en": "corridor", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "corridor" } } } }, { "prompt_en": "courtyard", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "courtyard" } } } }, { "prompt_en": "desert", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "desert" } } } }, { "prompt_en": "downtown", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "downtown" } } } }, { "prompt_en": "driveway", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "driveway" } } } }, { "prompt_en": "farm", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "farm" } } } }, { "prompt_en": "food court", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "food court" } } } }, { "prompt_en": "football field", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "football field" } } } }, { "prompt_en": "forest road", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "forest road" } } } }, { "prompt_en": "fountain", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "fountain" } } } }, { "prompt_en": "gas station", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": 
{ "scene": { "scene": { "scene": "gas station" } } } }, { "prompt_en": "glacier", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "glacier" } } } }, { "prompt_en": "golf course", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "golf course" } } } }, { "prompt_en": "indoor gymnasium", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "indoor gymnasium" } } } }, { "prompt_en": "harbor", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "harbor" } } } }, { "prompt_en": "highway", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "highway" } } } }, { "prompt_en": "hospital", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "hospital" } } } }, { "prompt_en": "house", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "house" } } } }, { "prompt_en": "iceberg", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "iceberg" } } } }, { "prompt_en": "industrial area", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "industrial area" } } } }, { "prompt_en": "jail cell", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "jail cell" } } } }, { "prompt_en": "junkyard", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "junkyard" } } } }, { "prompt_en": "kitchen", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "kitchen" } } } }, { "prompt_en": "indoor library", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { 
"scene": { "scene": "indoor library" } } } }, { "prompt_en": "lighthouse", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "lighthouse" } } } }, { "prompt_en": "laboratory", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "laboratory" } } } }, { "prompt_en": "mansion", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "mansion" } } } }, { "prompt_en": "marsh", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "marsh" } } } }, { "prompt_en": "mountain", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "mountain" } } } }, { "prompt_en": "indoor movie theater", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "indoor movie theater" } } } }, { "prompt_en": "indoor museum", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "indoor museum" } } } }, { "prompt_en": "music studio", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "music studio" } } } }, { "prompt_en": "nursery", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "nursery" } } } }, { "prompt_en": "ocean", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "ocean" } } } }, { "prompt_en": "office", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "office" } } } }, { "prompt_en": "palace", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "palace" } } } }, { "prompt_en": "parking lot", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { 
"scene": "parking lot" } } } }, { "prompt_en": "pharmacy", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "pharmacy" } } } }, { "prompt_en": "phone booth", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "phone booth" } } } }, { "prompt_en": "raceway", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "raceway" } } } }, { "prompt_en": "restaurant", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "restaurant" } } } }, { "prompt_en": "river", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "river" } } } }, { "prompt_en": "science museum", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "science museum" } } } }, { "prompt_en": "shower", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "shower" } } } }, { "prompt_en": "ski slope", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "ski slope" } } } }, { "prompt_en": "sky", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "sky" } } } }, { "prompt_en": "skyscraper", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "skyscraper" } } } }, { "prompt_en": "baseball stadium", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "baseball stadium" } } } }, { "prompt_en": "staircase", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "staircase" } } } }, { "prompt_en": "street", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "street" } } } 
}, { "prompt_en": "supermarket", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "supermarket" } } } }, { "prompt_en": "indoor swimming pool", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "indoor swimming pool" } } } }, { "prompt_en": "tower", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "tower" } } } }, { "prompt_en": "outdoor track", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "outdoor track" } } } }, { "prompt_en": "train railway", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "train railway" } } } }, { "prompt_en": "train station platform", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "train station platform" } } } }, { "prompt_en": "underwater coral reef", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "underwater coral reef" } } } }, { "prompt_en": "valley", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "valley" } } } }, { "prompt_en": "volcano", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "volcano" } } } }, { "prompt_en": "waterfall", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "waterfall" } } } }, { "prompt_en": "windmill", "dimension": [ "scene", "background_consistency" ], "auxiliary_info": { "scene": { "scene": { "scene": "windmill" } } } }, { "prompt_en": "a bicycle on the left of a car, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "bicycle", "object_b": "car", "relationship": "on the left of" } } 
} }, { "prompt_en": "a car on the right of a motorcycle, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "car", "object_b": "motorcycle", "relationship": "on the right of" } } } }, { "prompt_en": "a motorcycle on the left of a bus, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "motorcycle", "object_b": "bus", "relationship": "on the left of" } } } }, { "prompt_en": "a bus on the right of a traffic light, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "bus", "object_b": "traffic light", "relationship": "on the right of" } } } }, { "prompt_en": "a traffic light on the left of a fire hydrant, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "traffic light", "object_b": "fire hydrant", "relationship": "on the left of" } } } }, { "prompt_en": "a fire hydrant on the right of a stop sign, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "fire hydrant", "object_b": "stop sign", "relationship": "on the right of" } } } }, { "prompt_en": "a stop sign on the left of a parking meter, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "stop sign", "object_b": "parking meter", "relationship": "on the left of" } } } }, { "prompt_en": "a parking meter on the right of a bench, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "parking meter", "object_b": "bench", "relationship": "on the right of" } } } }, { "prompt_en": "a bench on the left of a truck, front view", 
"dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "bench", "object_b": "truck", "relationship": "on the left of" } } } }, { "prompt_en": "a truck on the right of a bicycle, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "truck", "object_b": "bicycle", "relationship": "on the right of" } } } }, { "prompt_en": "a bird on the left of a cat, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "bird", "object_b": "cat", "relationship": "on the left of" } } } }, { "prompt_en": "a cat on the right of a dog, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "cat", "object_b": "dog", "relationship": "on the right of" } } } }, { "prompt_en": "a dog on the left of a horse, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "dog", "object_b": "horse", "relationship": "on the left of" } } } }, { "prompt_en": "a horse on the right of a sheep, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "horse", "object_b": "sheep", "relationship": "on the right of" } } } }, { "prompt_en": "a sheep on the left of a cow, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "sheep", "object_b": "cow", "relationship": "on the left of" } } } }, { "prompt_en": "a cow on the right of an elephant, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "cow", "object_b": "elephant", "relationship": "on the right of" } } } }, { 
"prompt_en": "an elephant on the left of a bear, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "elephant", "object_b": "bear", "relationship": "on the left of" } } } }, { "prompt_en": "a bear on the right of a zebra, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "bear", "object_b": "zebra", "relationship": "on the right of" } } } }, { "prompt_en": "a zebra on the left of a giraffe, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "zebra", "object_b": "giraffe", "relationship": "on the left of" } } } }, { "prompt_en": "a giraffe on the right of a bird, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "giraffe", "object_b": "bird", "relationship": "on the right of" } } } }, { "prompt_en": "a bottle on the left of a wine glass, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "bottle", "object_b": "wine glass", "relationship": "on the left of" } } } }, { "prompt_en": "a wine glass on the right of a cup, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "wine glass", "object_b": "cup", "relationship": "on the right of" } } } }, { "prompt_en": "a cup on the left of a fork, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "cup", "object_b": "fork", "relationship": "on the left of" } } } }, { "prompt_en": "a fork on the right of a knife, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { 
"spatial_relationship": { "object_a": "fork", "object_b": "knife", "relationship": "on the right of" } } } }, { "prompt_en": "a knife on the left of a spoon, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "knife", "object_b": "spoon", "relationship": "on the left of" } } } }, { "prompt_en": "a spoon on the right of a bowl, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "spoon", "object_b": "bowl", "relationship": "on the right of" } } } }, { "prompt_en": "a bowl on the left of a bottle, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "bowl", "object_b": "bottle", "relationship": "on the left of" } } } }, { "prompt_en": "a potted plant on the left of a remote, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "potted plant", "object_b": "remote", "relationship": "on the left of" } } } }, { "prompt_en": "a remote on the right of a clock, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "remote", "object_b": "clock", "relationship": "on the right of" } } } }, { "prompt_en": "a clock on the left of a vase, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "clock", "object_b": "vase", "relationship": "on the left of" } } } }, { "prompt_en": "a vase on the right of scissors, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "vase", "object_b": "scissors", "relationship": "on the right of" } } } }, { "prompt_en": "scissors on the left of a teddy bear, front 
view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "scissors", "object_b": "teddy bear", "relationship": "on the left of" } } } }, { "prompt_en": "a teddy bear on the right of a potted plant, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "teddy bear", "object_b": "potted plant", "relationship": "on the right of" } } } }, { "prompt_en": "a frisbee on the left of a sports ball, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "frisbee", "object_b": "sports ball", "relationship": "on the left of" } } } }, { "prompt_en": "a sports ball on the right of a baseball bat, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "sports ball", "object_b": "baseball bat", "relationship": "on the right of" } } } }, { "prompt_en": "a baseball bat on the left of a baseball glove, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "baseball bat", "object_b": "baseball glove", "relationship": "on the left of" } } } }, { "prompt_en": "a baseball glove on the right of a tennis racket, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "baseball glove", "object_b": "tennis racket", "relationship": "on the right of" } } } }, { "prompt_en": "a tennis racket on the left of a frisbee, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "tennis racket", "object_b": "frisbee", "relationship": "on the left of" } } } }, { "prompt_en": "a toilet on the left of a hair drier, front view", "dimension": [ 
"spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "toilet", "object_b": "hair drier", "relationship": "on the left of" } } } }, { "prompt_en": "a hair drier on the right of a toothbrush, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "hair drier", "object_b": "toothbrush", "relationship": "on the right of" } } } }, { "prompt_en": "a toothbrush on the left of a sink, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "toothbrush", "object_b": "sink", "relationship": "on the left of" } } } }, { "prompt_en": "a sink on the right of a toilet, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "sink", "object_b": "toilet", "relationship": "on the right of" } } } }, { "prompt_en": "a chair on the left of a couch, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "chair", "object_b": "couch", "relationship": "on the left of" } } } }, { "prompt_en": "a couch on the right of a bed, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "couch", "object_b": "bed", "relationship": "on the right of" } } } }, { "prompt_en": "a bed on the left of a tv, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "bed", "object_b": "tv", "relationship": "on the left of" } } } }, { "prompt_en": "a tv on the right of a dining table, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "tv", "object_b": "dining table", "relationship": 
"on the right of" } } } }, { "prompt_en": "a dining table on the left of a chair, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "dining table", "object_b": "chair", "relationship": "on the left of" } } } }, { "prompt_en": "an airplane on the left of a train, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "airplane", "object_b": "train", "relationship": "on the left of" } } } }, { "prompt_en": "a train on the right of a boat, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "train", "object_b": "boat", "relationship": "on the right of" } } } }, { "prompt_en": "a boat on the left of an airplane, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "boat", "object_b": "airplane", "relationship": "on the left of" } } } }, { "prompt_en": "an oven on the top of a toaster, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "oven", "object_b": "toaster", "relationship": "on the top of" } } } }, { "prompt_en": "an oven on the bottom of a toaster, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "oven", "object_b": "toaster", "relationship": "on the bottom of" } } } }, { "prompt_en": "a toaster on the top of a microwave, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "toaster", "object_b": "microwave", "relationship": "on the top of" } } } }, { "prompt_en": "a toaster on the bottom of a microwave, front view", "dimension": [ "spatial_relationship" ], 
"auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "toaster", "object_b": "microwave", "relationship": "on the bottom of" } } } }, { "prompt_en": "a microwave on the top of an oven, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "microwave", "object_b": "oven", "relationship": "on the top of" } } } }, { "prompt_en": "a microwave on the bottom of an oven, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "microwave", "object_b": "oven", "relationship": "on the bottom of" } } } }, { "prompt_en": "a banana on the top of an apple, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "banana", "object_b": "apple", "relationship": "on the top of" } } } }, { "prompt_en": "a banana on the bottom of an apple, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "banana", "object_b": "apple", "relationship": "on the bottom of" } } } }, { "prompt_en": "an apple on the top of a sandwich, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "apple", "object_b": "sandwich", "relationship": "on the top of" } } } }, { "prompt_en": "an apple on the bottom of a sandwich, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "apple", "object_b": "sandwich", "relationship": "on the bottom of" } } } }, { "prompt_en": "a sandwich on the top of an orange, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "sandwich", "object_b": "orange", "relationship": "on the 
top of" } } } }, { "prompt_en": "a sandwich on the bottom of an orange, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "sandwich", "object_b": "orange", "relationship": "on the bottom of" } } } }, { "prompt_en": "an orange on the top of a carrot, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "orange", "object_b": "carrot", "relationship": "on the top of" } } } }, { "prompt_en": "an orange on the bottom of a carrot, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "orange", "object_b": "carrot", "relationship": "on the bottom of" } } } }, { "prompt_en": "a carrot on the top of a hot dog, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "carrot", "object_b": "hot dog", "relationship": "on the top of" } } } }, { "prompt_en": "a carrot on the bottom of a hot dog, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "carrot", "object_b": "hot dog", "relationship": "on the bottom of" } } } }, { "prompt_en": "a hot dog on the top of a pizza, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "hot dog", "object_b": "pizza", "relationship": "on the top of" } } } }, { "prompt_en": "a hot dog on the bottom of a pizza, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "hot dog", "object_b": "pizza", "relationship": "on the bottom of" } } } }, { "prompt_en": "a pizza on the top of a donut, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { 
"spatial_relationship": { "spatial_relationship": { "object_a": "pizza", "object_b": "donut", "relationship": "on the top of" } } } }, { "prompt_en": "a pizza on the bottom of a donut, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "pizza", "object_b": "donut", "relationship": "on the bottom of" } } } }, { "prompt_en": "a donut on the top of broccoli, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "donut", "object_b": "broccoli", "relationship": "on the top of" } } } }, { "prompt_en": "a donut on the bottom of broccoli, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "donut", "object_b": "broccoli", "relationship": "on the bottom of" } } } }, { "prompt_en": "broccoli on the top of a banana, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "broccoli", "object_b": "banana", "relationship": "on the top of" } } } }, { "prompt_en": "broccoli on the bottom of a banana, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "broccoli", "object_b": "banana", "relationship": "on the bottom of" } } } }, { "prompt_en": "skis on the top of a snowboard, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "skis", "object_b": "snowboard", "relationship": "on the top of" } } } }, { "prompt_en": "skis on the bottom of a snowboard, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "skis", "object_b": "snowboard", "relationship": "on the bottom of" } } } }, { "prompt_en": "a 
snowboard on the top of a kite, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "snowboard", "object_b": "kite", "relationship": "on the top of" } } } }, { "prompt_en": "a snowboard on the bottom of a kite, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "snowboard", "object_b": "kite", "relationship": "on the bottom of" } } } }, { "prompt_en": "a kite on the top of a skateboard, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "kite", "object_b": "skateboard", "relationship": "on the top of" } } } }, { "prompt_en": "a kite on the bottom of a skateboard, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "kite", "object_b": "skateboard", "relationship": "on the bottom of" } } } }, { "prompt_en": "a skateboard on the top of a surfboard, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "skateboard", "object_b": "surfboard", "relationship": "on the top of" } } } }, { "prompt_en": "a skateboard on the bottom of a surfboard, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "skateboard", "object_b": "surfboard", "relationship": "on the bottom of" } } } }, { "prompt_en": "a surfboard on the top of skis, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { "spatial_relationship": { "spatial_relationship": { "object_a": "surfboard", "object_b": "skis", "relationship": "on the top of" } } } }, { "prompt_en": "a surfboard on the bottom of skis, front view", "dimension": [ "spatial_relationship" ], "auxiliary_info": { 
def parse_args(argv=None):
    """Parse the command-line options of the VBench scoring script.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is what the script entry
            point relies on.

    Returns:
        argparse.Namespace with ``video_folder``, ``model_ckpt``, ``start``
        and ``end``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "video_folder",
        type=str,
        help="folder with the generated videos, e.g. samples/samples..._vbench/eval",
    )
    parser.add_argument(
        "model_ckpt",
        type=str,
        help="checkpoint directory; results are written to its 'vbench' sub-folder",
    )
    parser.add_argument(
        "--start",
        type=int,
        default=0,
        help="start index (into the dimension list) of the dimensions to evaluate",
    )
    # -1 is a sentinel: the caller replaces it with the number of dimensions.
    parser.add_argument(
        "--end",
        type=int,
        default=-1,
        help="end index (exclusive slice bound) of the dimensions to evaluate; -1 means up to the last one",
    )
    return parser.parse_args(argv)
#!/bin/bash
# Launch VBench sample generation: one background `eval/sample.sh` job per GPU,
# each covering its own START/END prompt-index range.
#
# Usage (run from the directory that contains eval/, since the script calls
# eval/sample.sh by relative path):
#   bash eval/vbench/launch.sh CKPT NUM_FRAMES MODEL_NAME \
#       [RES ASP_RATIO [NUM_SAMPLING_STEPS [FLOW [LLM_REFINE]]]]
#
# Per-job output is redirected to $(dirname CKPT)/eval/<task id>.log.

if [ "$#" -lt 3 ]; then
    echo "Usage: $0 CKPT NUM_FRAMES MODEL_NAME [RES ASP_RATIO [NUM_SAMPLING_STEPS [FLOW [LLM_REFINE]]]]" >&2
    exit 1
fi

CKPT=$1
NUM_FRAMES=$2
MODEL_NAME=$3
RES=$4
ASP_RATIO=$5

NUM_SAMPLING_STEPS=$6
FLOW=$7
LLM_REFINE=$8

LOG_BASE=$(dirname "$CKPT")/eval
echo "Logging to $LOG_BASE"

# Make sure the eval directory exists before any job redirects its log there.
mkdir -p "$LOG_BASE"

# 8-GPU layout:
#GPUS=(0 1 2 3 4 5 6 7)
#TASK_ID_LIST=(4a 4b 4c 4d 4e 4f 4g 4h) # for log records only
#START_INDEX_LIST=(0 120 240 360 480 600 720 840)
#END_INDEX_LIST=(120 240 360 480 600 720 840 2000)

# Use 6 GPUs
GPUS=(0 1 2 3 4 5)
TASK_ID_LIST=(4a 4b 4c 4d 4e 4f) # for log records only
# Split the 950 prompts into 6 ranges
START_INDEX_LIST=(0 158 316 474 632 790)
END_INDEX_LIST=(158 316 474 632 790 2000)

# Use 5 GPUs
#GPUS=(0 1 2 3 4)
#TASK_ID_LIST=(4a 4b 4c 4d 4e)
## Split the 950 prompts into 5 ranges
#START_INDEX_LIST=(0 190 380 570 760)
#END_INDEX_LIST=(190 380 570 760 2000)

## Modify the following to run on multiple machines for faster results
## 720p will take quite long on a single machine
# START_INDEX_LIST=(60 180 300 420 540 660 780 900)
# END_INDEX_LIST=(120 240 360 480 600 720 840 2000)
# LOG_BASE=$(dirname $CKPT)/eval/last_60
# mkdir -p ${LOG_BASE}
# echo "Logging to $LOG_BASE"

# The optional arguments are positional for eval/sample.sh, so forward them
# only up to the first one that is missing. RES and ASP_RATIO are forwarded
# as a pair or not at all.
EXTRA_ARGS=()
if [ -n "${RES}" ] && [ -n "${ASP_RATIO}" ]; then
    EXTRA_ARGS+=("${RES}" "${ASP_RATIO}")
    for opt in "${NUM_SAMPLING_STEPS}" "${FLOW}" "${LLM_REFINE}"; do
        [ -n "${opt}" ] || break
        EXTRA_ARGS+=("${opt}")
    done
fi

for i in "${!GPUS[@]}"; do
    # "-4" is forwarded unchanged as the fourth argument of eval/sample.sh.
    CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh "$CKPT" "${NUM_FRAMES}" "${MODEL_NAME}" -4 \
        "${START_INDEX_LIST[i]}" "${END_INDEX_LIST[i]}" "${EXTRA_ARGS[@]}" \
        > "${LOG_BASE}/${TASK_ID_LIST[i]}.log" 2>&1 &
done
def parse_args(argv=None):
    """Parse the command-line options of the VBench score tabulation script.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with a single ``score_dir`` attribute.
    """
    parser = argparse.ArgumentParser()
    # Required: without it score_dir would be None and os.listdir(None) would
    # silently scan the current working directory instead of failing.
    parser.add_argument(
        "--score_dir",
        type=str,
        required=True,
        help="directory with the *_eval_results.json and *_full_info.json files, e.g. ckpt_dir/eval/vbench",
    )
    return parser.parse_args(argv)
len(info_files), f"got {len(res_files)} res files, but {len(info_files)} info files" full_results = {} for res_file in res_files: # first check if results is normal info_file = res_file.split(res_postfix)[0] + info_postfix with open(os.path.join(args.score_dir, info_file), "r", encoding="utf-8") as f: info = json.load(f) assert len(info[0]["video_list"]) > 0, f"Error: {info_file} has 0 video list" # read results with open(os.path.join(args.score_dir, res_file), "r", encoding="utf-8") as f: data = json.load(f) for key, val in data.items(): full_results[key] = format(val[0], ".4f") scaled_results = {} dims = set() for key, val in full_results.items(): dim = key.replace("_", " ") if "_" in key else key scaled_score = (float(val) - NORMALIZE_DIC[dim]["Min"]) / ( NORMALIZE_DIC[dim]["Max"] - NORMALIZE_DIC[dim]["Min"] ) scaled_score *= DIM_WEIGHT[dim] scaled_results[dim] = scaled_score dims.add(dim) assert len(dims) == len(NORMALIZE_DIC), f"{set(NORMALIZE_DIC.keys())-dims} not calculated yet" quality_score = sum([scaled_results[i] for i in QUALITY_LIST]) / sum([DIM_WEIGHT[i] for i in QUALITY_LIST]) semantic_score = sum([scaled_results[i] for i in SEMANTIC_LIST]) / sum([DIM_WEIGHT[i] for i in SEMANTIC_LIST]) scaled_results["quality score"] = quality_score scaled_results["semantic score"] = semantic_score scaled_results["total score"] = (quality_score * QUALITY_WEIGHT + semantic_score * SEMANTIC_WEIGHT) / ( QUALITY_WEIGHT + SEMANTIC_WEIGHT ) formated_scaled_results = {"items": []} for key in ordered_scaled_res: # formated_scaled_results[key] = format(val * 100, ".2f") + "%" formated_score = format(scaled_results[key] * 100, ".2f") + "%" formated_scaled_results["items"].append({key: formated_score}) output_file_path = os.path.join(args.score_dir, "all_results.json") with open(output_file_path, "w") as outfile: json.dump(full_results, outfile, indent=4, sort_keys=True) print(f"results saved to: {output_file_path}") scaled_file_path = os.path.join(args.score_dir, 
def str2bool(v):
    """Convert a command-line token into a bool (usable as an argparse ``type=``).

    Real ``bool`` values are passed through untouched. Strings are compared
    case-insensitively against a fixed list of spellings.

    Raises:
        argparse.ArgumentTypeError: if the string is not a recognised spelling.
    """
    if isinstance(v, bool):
        return v

    lowered = v.lower()
    spellings_by_value = (
        (True, ("yes", "true", "t", "y", "1")),
        (False, ("no", "false", "f", "n", "0")),
    )
    for value, spellings in spellings_by_value:
        if lowered in spellings:
            return value
    raise argparse.ArgumentTypeError("Boolean value expected.")
# Convert the VBench-I2V prompt metadata (JSON) into a plain-text prompt list:
# one line per entry, made of the English prompt followed by an inline JSON
# suffix that names the reference image and the mask strategy.

RESOLUTIONS = ["1-1", "16-9", "7-4", "8-5"]

cache_root = "/mnt/jfs-hdd/sora/data/vbench-i2v/crop"
resolution = RESOLUTIONS[0]
json_file = "vbench2_i2v_full_info.json"
save_path = "all_i2v.txt"

# Read through a context manager so the handle is closed deterministically,
# and pin the encoding instead of relying on the platform default.
with open(json_file, encoding="utf-8") as f:
    data = json.load(f)

txt = [
    f'{x["prompt_en"]}{{"reference_path": "{os.path.join(cache_root, resolution, x["image_name"])}", "mask_strategy": "0"}}'
    for x in data
]
with open(save_path, "w", encoding="utf-8") as f:
    f.write("\n".join(txt))
#!/bin/bash
# Score generated VBench-I2V videos: one background calc_vbench_i2v.py job per
# GPU. Each job evaluates either the I2V dimensions or the video-quality
# dimensions, restricted to its own --start/--end index range.
#
# Usage (run from the directory that contains eval/):
#   bash eval/vbench_i2v/launch_calc.sh VIDEO_DIR CKPT_DIR
#
# Per-job output is redirected to CKPT_DIR/<task id>.log.

if [ "$#" -lt 2 ]; then
    echo "Usage: $0 VIDEO_DIR CKPT_DIR" >&2
    exit 1
fi

VIDEO_DIR=$1
CKPT_DIR=$2
LOG_BASE=$CKPT_DIR
mkdir -p "$LOG_BASE"
echo "Logging to $LOG_BASE"

GPUS=(0 1 2 3 4 5 6 7)
# Entry i of every list below configures the job started on GPUS[i].
CALC_I2V_LIST=(True True False False False False False False)
CALC_QUALITY_LIST=(False False True True True True True True)
START_INDEX_LIST=(0 2 0 2 3 4 5 6)
# -1 is passed through as --end; calc_vbench_i2v.py treats it as "up to the last dimension".
END_INDEX_LIST=(2 -1 2 3 4 5 6 -1)
TASK_ID_LIST=(calc_vbench_i2v_a calc_vbench_i2v_b calc_vbench_i2v_c calc_vbench_i2v_d calc_vbench_i2v_e calc_vbench_i2v_f calc_vbench_i2v_g calc_vbench_i2v_h) # for log records only

for i in "${!GPUS[@]}"; do
    CUDA_VISIBLE_DEVICES=${GPUS[i]} python eval/vbench_i2v/calc_vbench_i2v.py "$VIDEO_DIR" "$CKPT_DIR" \
        --calc_i2v "${CALC_I2V_LIST[i]}" --calc_quality "${CALC_QUALITY_LIST[i]}" \
        --start "${START_INDEX_LIST[i]}" --end "${END_INDEX_LIST[i]}" \
        > "${LOG_BASE}/${TASK_ID_LIST[i]}.log" 2>&1 &
done
Push the files to your remote Hugging Face Spaces repository. The application will be built and run automatically. ## Advanced Usage ![Gradio Demo](../assets/readme/gradio_advanced.png) For the "**FPS**" option: because the output video's FPS is currently fixed to 24, this option does not affect the output video's length. With a smaller FPS, the video is supposed to be longer, but it is sped up by the 24 FPS playback, so it looks less smooth but faster. With a larger FPS, the video looks smoother but slower. The "**Number of Loops**" option affects the output video's length and generation speed. For example, if you set the number of loops to 2, the output video will be twice as long as the original video. This is achieved by conditioning the next generation on 1/4 of the previously generated frames and then concatenating all the frames together. A trick to give different text prompts for different parts of the video is to use the `|x|` symbol to separate the text prompts, where x is the start frame of the next text prompt. This format requires a `|0|` at the start of the prompt. For example, if you want to generate a video with the text prompt "A cat" for the first 2 generations and "A dog" for the remaining generations, you can use the text prompt "|0|A cat|2|A dog". You can still check the "**Enhance prompt with GPT4o**" option to refine the prompt of each part separately. ================================================ FILE: Open-Sora/gradio/app.py ================================================ #!/usr/bin/env python """ This script runs a Gradio App for the Open-Sora model.
def install_dependencies(enable_optimization=False):
    """
    Install the required dependencies for the demo if they are not already installed.

    Nothing is installed unless ``enable_optimization`` is true. In that case
    flash-attn, apex, ninja and xformers are each pip-installed into the
    running interpreter's environment, but only when the package cannot be
    imported. Installation is best-effort: a failing pip run does not raise.

    Args:
        enable_optimization: whether to install the optional acceleration packages.
    """

    def _is_package_available(name) -> bool:
        # ModuleNotFoundError is a subclass of ImportError, so one clause covers both.
        try:
            importlib.import_module(name)
        except ImportError:
            return False
        return True

    def _pip_install(*pip_args, extra_env=None):
        # An argument list (no shell) keeps working when sys.executable contains
        # spaces. The child environment is the current one plus `extra_env`:
        # passing only the extra variables would drop PATH, HOME, CUDA_HOME, etc.
        env = None if extra_env is None else {**os.environ, **extra_env}
        subprocess.run([sys.executable, "-m", "pip", "install", *pip_args], env=env)

    if not enable_optimization:
        return

    # install flash attention
    if not _is_package_available("flash_attn"):
        _pip_install(
            "flash-attn",
            "--no-build-isolation",
            extra_env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
        )

    # install apex for fused layernorm
    if not _is_package_available("apex"):
        _pip_install(
            "-v",
            "--disable-pip-version-check",
            "--no-cache-dir",
            "--no-build-isolation",
            "--config-settings",
            "--build-option=--cpp_ext",
            "--config-settings",
            "--build-option=--cuda_ext",
            "git+https://github.com/NVIDIA/apex.git",
        )

    # install ninja
    if not _is_package_available("ninja"):
        _pip_install("ninja")

    # install xformers
    if not _is_package_available("xformers"):
        _pip_install(
            "-v",
            "-U",
            "git+https://github.com/facebookresearch/xformers.git@main#egg=xformers",
        )
def build_models(model_type, config, enable_optimization=False):
    """
    Create the VAE, text encoder, STDiT model and scheduler used by the demo.

    Args:
        model_type: key into ``HF_STDIT_MAP`` selecting the pretrained STDiT weights.
        config: loaded inference config; its ``vae``, ``text_encoder``, ``model``
            and ``scheduler`` entries are read here.
        enable_optimization: accepted for interface compatibility; not used in
            this function.

    Returns:
        Tuple ``(vae, text_encoder, stdit, scheduler)``.
    """
    from opensora.registry import MODELS, build_module

    # VAE: built from the registry and placed on the GPU
    vae = build_module(config.vae, MODELS)
    vae = vae.cuda()

    # text encoder: only the wrapped T5 model is moved; T5 must stay in fp32
    text_encoder = build_module(config.text_encoder, MODELS)
    text_encoder.t5.model = text_encoder.t5.model.cuda()

    # STDiT: the weights come straight from HuggingFace, so no download logic
    # has to be handled inside a HuggingFace Space
    from opensora.models.stdit.stdit3 import STDiT3

    skipped_keys = ("type", "from_pretrained", "force_huggingface")
    stdit_kwargs = {}
    for key, value in config.model.items():
        if key in skipped_keys:
            continue
        stdit_kwargs[key] = value
    stdit = STDiT3.from_pretrained(HF_STDIT_MAP[model_type], **stdit_kwargs).cuda()

    # scheduler
    from opensora.registry import SCHEDULERS

    scheduler = build_module(config.scheduler, SCHEDULERS)

    # hack for classifier-free guidance: the text encoder shares STDiT's y_embedder
    text_encoder.y_embedder = stdit.y_embedder

    # final dtype / eval mode: VAE and STDiT in bfloat16, T5 left in fp32
    vae = vae.to(torch.bfloat16)
    vae = vae.eval()
    text_encoder.t5.model = text_encoder.t5.model.eval()
    stdit = stdit.to(torch.bfloat16)
    stdit = stdit.eval()

    # release cached CUDA memory
    torch.cuda.empty_cache()
    return vae, text_encoder, stdit, scheduler
# ============================
# Main Gradio Script
# ============================
# as `run_inference` needs to be wrapped by `spaces.GPU` and the input can only be the prompt text
# so we can't pass the models to `run_inference` as arguments.
# instead, we need to define them globally so that we can access these models inside `run_inference`
#
# NOTE: everything below runs at import time and is order-dependent:
# parse args -> load config -> install optional packages -> import opensora -> build models.

# read config
args = parse_args()
config = read_config(CONFIG_MAP[args.model_type])
# allow TF32 for CUDA matmul and cuDNN operations
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# make outputs dir
os.makedirs(args.output, exist_ok=True)

# disable torch jit as it can cause failure in gradio SDK
# gradio sdk uses torch with cuda 11.3
torch.jit._state.disable()

# set up (installs the optional acceleration packages only when --enable-optimization is given)
install_dependencies(enable_optimization=args.enable_optimization)

# import after installation
from opensora.datasets import IMG_FPS, save_sample
from opensora.datasets.aspect import get_image_size, get_num_frames
from opensora.models.text_encoder.t5 import text_preprocessing
from opensora.utils.inference_utils import (
    add_watermark,
    append_generated,
    append_score_to_prompts,
    apply_mask_strategy,
    collect_references_batch,
    dframe_to_frame,
    extract_json_from_prompts,
    extract_prompts_loop,
    get_random_prompt_by_openai,
    has_openai_key,
    merge_prompt,
    prepare_multi_resolution_info,
    refine_prompts_by_openai,
    split_prompt,
)
from opensora.utils.misc import to_torch_dtype

# some global variables
dtype = to_torch_dtype(config.dtype)  # torch dtype derived from the config's `dtype` entry
device = torch.device("cuda")

# build model (kept as module-level globals so `run_inference` can reach them)
vae, text_encoder, stdit, scheduler = build_models(
    args.model_type, config, enable_optimization=args.enable_optimization
)
gr.Warning("Your prompt is empty, please enter a valid prompt") return None torch.manual_seed(seed) with torch.inference_mode(): # ====================== # 1. Preparation arguments # ====================== # parse the inputs # frame_interval must be 1 so we ignore it here image_size = get_image_size(resolution, aspect_ratio) # compute generation parameters if mode == "Text2Image": num_frames = 1 fps = IMG_FPS else: num_frames = config.num_frames num_frames = get_num_frames(length) condition_frame_length = int(num_frames / 17 * 5 / 3) condition_frame_edit = 0.0 input_size = (num_frames, *image_size) latent_size = vae.get_latent_size(input_size) multi_resolution = "OpenSora" align = 5 # == prepare mask strategy == if mode == "Text2Image": mask_strategy = [None] elif mode == "Text2Video": if reference_image is not None: mask_strategy = ["0"] else: mask_strategy = [None] else: raise ValueError(f"Invalid mode: {mode}") # == prepare reference == if mode == "Text2Image": refs = [""] elif mode == "Text2Video": if reference_image is not None: # save image to disk from PIL import Image im = Image.fromarray(reference_image) temp_file = NamedTemporaryFile(suffix=".png") im.save(temp_file.name) refs = [temp_file.name] else: refs = [""] else: raise ValueError(f"Invalid mode: {mode}") # == get json from prompts == batch_prompts = [prompt_text] batch_prompts, refs, mask_strategy = extract_json_from_prompts(batch_prompts, refs, mask_strategy) # == get reference for condition == refs = collect_references_batch(refs, vae, image_size) # == multi-resolution info == model_args = prepare_multi_resolution_info( multi_resolution, len(batch_prompts), image_size, num_frames, fps, device, dtype ) # == process prompts step by step == # 0. 
split prompt # each element in the list is [prompt_segment_list, loop_idx_list] batched_prompt_segment_list = [] batched_loop_idx_list = [] for prompt in batch_prompts: prompt_segment_list, loop_idx_list = split_prompt(prompt) batched_prompt_segment_list.append(prompt_segment_list) batched_loop_idx_list.append(loop_idx_list) # 1. refine prompt by openai if refine_prompt: # check if openai key is provided if not has_openai_key(): gr.Warning("OpenAI API key is not provided, the prompt will not be enhanced.") else: for idx, prompt_segment_list in enumerate(batched_prompt_segment_list): batched_prompt_segment_list[idx] = refine_prompts_by_openai(prompt_segment_list) # process scores aesthetic_score = aesthetic_score if use_aesthetic_score else None motion_strength = motion_strength if use_motion_strength and mode != "Text2Image" else None camera_motion = None if camera_motion == "none" or mode == "Text2Image" else camera_motion # 2. append score for idx, prompt_segment_list in enumerate(batched_prompt_segment_list): batched_prompt_segment_list[idx] = append_score_to_prompts( prompt_segment_list, aes=aesthetic_score, flow=motion_strength, camera_motion=camera_motion, ) # 3. clean prompt with T5 for idx, prompt_segment_list in enumerate(batched_prompt_segment_list): batched_prompt_segment_list[idx] = [text_preprocessing(prompt) for prompt in prompt_segment_list] # 4. 
merge to obtain the final prompt batch_prompts = [] for prompt_segment_list, loop_idx_list in zip(batched_prompt_segment_list, batched_loop_idx_list): batch_prompts.append(merge_prompt(prompt_segment_list, loop_idx_list)) # ========================= # Generate image/video # ========================= video_clips = [] for loop_i in range(num_loop): # 4.4 sample in hidden space batch_prompts_loop = extract_prompts_loop(batch_prompts, loop_i) # == loop == if loop_i > 0: refs, mask_strategy = append_generated( vae, video_clips[-1], refs, mask_strategy, loop_i, condition_frame_length, condition_frame_edit ) # == sampling == z = torch.randn(len(batch_prompts), vae.out_channels, *latent_size, device=device, dtype=dtype) masks = apply_mask_strategy(z, refs, mask_strategy, loop_i, align=align) # 4.6. diffusion sampling # hack to update num_sampling_steps and cfg_scale scheduler_kwargs = config.scheduler.copy() scheduler_kwargs.pop("type") scheduler_kwargs["num_sampling_steps"] = sampling_steps scheduler_kwargs["cfg_scale"] = cfg_scale scheduler.__init__(**scheduler_kwargs) samples = scheduler.sample( stdit, text_encoder, z=z, prompts=batch_prompts_loop, device=device, additional_args=model_args, progress=True, mask=masks, ) samples = vae.decode(samples.to(dtype), num_frames=num_frames) video_clips.append(samples) # ========================= # Save output # ========================= video_clips = [val[0] for val in video_clips] for i in range(1, num_loop): video_clips[i] = video_clips[i][:, dframe_to_frame(condition_frame_length) :] video = torch.cat(video_clips, dim=1) current_datetime = datetime.datetime.now() timestamp = current_datetime.timestamp() save_path = os.path.join(args.output, f"output_{timestamp}") saved_path = save_sample(video, save_path=save_path, fps=24) torch.cuda.empty_cache() # add watermark # all watermarked videos should have a _watermarked suffix if mode != "Text2Image" and os.path.exists(WATERMARK_PATH): watermarked_path = saved_path.replace(".mp4", 
"_watermarked.mp4") success = add_watermark(saved_path, WATERMARK_PATH, watermarked_path) if success: return watermarked_path else: return saved_path else: return saved_path @spaces.GPU(duration=200) def run_image_inference( prompt_text, resolution, aspect_ratio, length, motion_strength, aesthetic_score, use_motion_strength, use_aesthetic_score, camera_motion, reference_image, refine_prompt, fps, num_loop, seed, sampling_steps, cfg_scale, ): return run_inference( "Text2Image", prompt_text, resolution, aspect_ratio, length, motion_strength, aesthetic_score, use_motion_strength, use_aesthetic_score, camera_motion, reference_image, refine_prompt, fps, num_loop, seed, sampling_steps, cfg_scale, ) @spaces.GPU(duration=200) def run_video_inference( prompt_text, resolution, aspect_ratio, length, motion_strength, aesthetic_score, use_motion_strength, use_aesthetic_score, camera_motion, reference_image, refine_prompt, fps, num_loop, seed, sampling_steps, cfg_scale, ): # if (resolution == "480p" and length == "16s") or \ # (resolution == "720p" and length in ["8s", "16s"]): # gr.Warning("Generation is interrupted as the combination of 480p and 16s will lead to CUDA out of memory") # else: return run_inference( "Text2Video", prompt_text, resolution, aspect_ratio, length, motion_strength, aesthetic_score, use_motion_strength, use_aesthetic_score, camera_motion, reference_image, refine_prompt, fps, num_loop, seed, sampling_steps, cfg_scale, ) def generate_random_prompt(): if "OPENAI_API_KEY" not in os.environ: gr.Warning("Your prompt is empty and the OpenAI API key is not provided, please enter a valid prompt") return None else: prompt_text = get_random_prompt_by_openai() return prompt_text def main(): # create demo with gr.Blocks() as demo: with gr.Row(): with gr.Column(): gr.HTML( """

Open-Sora: Democratizing Efficient Video Production for All

""" ) with gr.Row(): with gr.Column(): prompt_text = gr.Textbox(label="Prompt", placeholder="Describe your video here", lines=4) refine_prompt = gr.Checkbox( value=has_openai_key(), label="Refine prompt with GPT4o", interactive=has_openai_key() ) random_prompt_btn = gr.Button("Random Prompt By GPT4o", interactive=has_openai_key()) gr.Markdown("## Basic Settings") resolution = gr.Radio( choices=["144p", "240p", "360p", "480p", "720p"], value="480p", label="Resolution", ) aspect_ratio = gr.Radio( choices=["9:16", "16:9", "3:4", "4:3", "1:1"], value="9:16", label="Aspect Ratio (H:W)", ) length = gr.Radio( choices=["2s", "4s", "8s", "16s"], value="2s", label="Video Length", info="only effective for video generation, 8s may fail as Hugging Face ZeroGPU has the limitation of max 200 seconds inference time.", ) with gr.Row(): seed = gr.Slider(value=1024, minimum=1, maximum=2048, step=1, label="Seed") sampling_steps = gr.Slider(value=30, minimum=1, maximum=200, step=1, label="Sampling steps") cfg_scale = gr.Slider(value=7.0, minimum=0.0, maximum=10.0, step=0.1, label="CFG Scale") with gr.Row(): with gr.Column(): motion_strength = gr.Slider( value=5, minimum=0, maximum=100, step=1, label="Motion Strength", info="only effective for video generation", ) use_motion_strength = gr.Checkbox(value=False, label="Enable") with gr.Column(): aesthetic_score = gr.Slider( value=6.5, minimum=4, maximum=7, step=0.1, label="Aesthetic", info="effective for text & video generation", ) use_aesthetic_score = gr.Checkbox(value=True, label="Enable") camera_motion = gr.Radio( value="none", label="Camera Motion", choices=["none", "pan right", "pan left", "tilt up", "tilt down", "zoom in", "zoom out", "static"], interactive=True, ) gr.Markdown("## Advanced Settings") with gr.Row(): fps = gr.Slider( value=24, minimum=1, maximum=60, step=1, label="FPS", info="This is the frames per seconds for video generation, keep it to 24 if you are not sure", ) num_loop = gr.Slider( value=1, minimum=1, 
maximum=20, step=1, label="Number of Loops", info="This will change the length of the generated video, keep it to 1 if you are not sure", ) gr.Markdown("## Reference Image") reference_image = gr.Image(label="Image (optional)", show_download_button=True) with gr.Column(): output_video = gr.Video(label="Output Video", height="100%") with gr.Row(): image_gen_button = gr.Button("Generate image") video_gen_button = gr.Button("Generate video") image_gen_button.click( fn=run_image_inference, inputs=[ prompt_text, resolution, aspect_ratio, length, motion_strength, aesthetic_score, use_motion_strength, use_aesthetic_score, camera_motion, reference_image, refine_prompt, fps, num_loop, seed, sampling_steps, cfg_scale, ], outputs=reference_image, ) video_gen_button.click( fn=run_video_inference, inputs=[ prompt_text, resolution, aspect_ratio, length, motion_strength, aesthetic_score, use_motion_strength, use_aesthetic_score, camera_motion, reference_image, refine_prompt, fps, num_loop, seed, sampling_steps, cfg_scale, ], outputs=output_video, ) random_prompt_btn.click(fn=generate_random_prompt, outputs=prompt_text) # launch demo.queue(max_size=5, default_concurrency_limit=1) demo.launch(server_port=args.port, server_name=args.host, share=args.share, max_threads=1) if __name__ == "__main__": main() ================================================ FILE: Open-Sora/gradio/requirements.txt ================================================ xformers transformers git+https://github.com/hpcaitech/Open-Sora.git ================================================ FILE: Open-Sora/notebooks/inference.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Inference for OpenSora" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Define global variables. You should change the following variables according to your setting." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# global variables\n", "ROOT = \"..\"\n", "cfg_path = f\"{ROOT}/configs/opensora-v1-2/inference/sample.py\"\n", "ckpt_path = \"/home/lishenggui/projects/sora/Open-Sora-dev/outputs/207-STDiT3-XL-2/epoch0-global_step9000/\"\n", "vae_path = f\"{ROOT}/pretrained_models/vae-pipeline\"\n", "save_dir = f\"{ROOT}/samples/samples_notebook/\"\n", "device = \"cuda:0\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Import necessary libraries and load the models." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "from pprint import pformat\n", "\n", "import colossalai\n", "import torch\n", "import torch.distributed as dist\n", "from colossalai.cluster import DistCoordinator\n", "from mmengine.runner import set_random_seed\n", "from tqdm.notebook import tqdm\n", "\n", "from opensora.acceleration.parallel_states import set_sequence_parallel_group\n", "from opensora.datasets import save_sample, is_img\n", "from opensora.datasets.aspect import get_image_size, get_num_frames\n", "from opensora.models.text_encoder.t5 import text_preprocessing\n", "from opensora.registry import MODELS, SCHEDULERS, build_module\n", "from opensora.utils.config_utils import read_config\n", "from opensora.utils.inference_utils import (\n", " append_generated,\n", " apply_mask_strategy,\n", " collect_references_batch,\n", " extract_json_from_prompts,\n", " extract_prompts_loop,\n", " get_save_path_name,\n", " load_prompts,\n", " prepare_multi_resolution_info,\n", ")\n", "from opensora.utils.misc import all_exists, create_logger, is_distributed, is_main_process, to_torch_dtype" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "torch.set_grad_enabled(False)\n", "\n", "# == parse configs ==\n", "cfg = read_config(cfg_path)\n", "cfg.model.from_pretrained = ckpt_path\n", 
"cfg.vae.from_pretrained = vae_path\n", "\n", "# == device and dtype ==\n", "cfg_dtype = cfg.get(\"dtype\", \"fp32\")\n", "assert cfg_dtype in [\"fp16\", \"bf16\", \"fp32\"], f\"Unknown mixed precision {cfg_dtype}\"\n", "dtype = to_torch_dtype(cfg.get(\"dtype\", \"bf16\"))\n", "torch.backends.cuda.matmul.allow_tf32 = True\n", "torch.backends.cudnn.allow_tf32 = True\n", "\n", "set_random_seed(seed=cfg.get(\"seed\", 1024))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# == build text-encoder and vae ==\n", "text_encoder = build_module(cfg.text_encoder, MODELS, device=device)\n", "vae = build_module(cfg.vae, MODELS).to(device, dtype).eval()\n", "\n", "# == build diffusion model ==\n", "input_size = (None, None, None)\n", "latent_size = vae.get_latent_size(input_size)\n", "model = (\n", " build_module(\n", " cfg.model,\n", " MODELS,\n", " input_size=latent_size,\n", " in_channels=vae.out_channels,\n", " caption_channels=text_encoder.output_dim,\n", " model_max_length=text_encoder.model_max_length,\n", " )\n", " .to(device, dtype)\n", " .eval()\n", ")\n", "text_encoder.y_embedder = model.y_embedder # HACK: for classifier-free guidance" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Define inference function." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start_idx = 0\n", "multi_resolution = cfg.get(\"multi_resolution\", None)\n", "batch_size = cfg.get(\"batch_size\", 1)\n", "\n", "\n", "def inference(\n", " prompts=cfg.get(\"prompt\", None),\n", " image_size=None,\n", " num_frames=None,\n", " resolution=None,\n", " aspect_ratio=None,\n", " mask_strategy=None,\n", " reference_path=None,\n", " num_sampling_steps=None,\n", " cfg_scale=None,\n", " seed=None,\n", " fps=cfg.fps,\n", " num_sample=cfg.get(\"num_sample\", 1),\n", " loop=cfg.get(\"loop\", 1),\n", " condition_frame_length=cfg.get(\"condition_frame_length\", 5),\n", " align=cfg.get(\"align\", None),\n", " sample_name=cfg.get(\"sample_name\", None),\n", " prompt_as_path=cfg.get(\"prompt_as_path\", False),\n", " disable_progress=False,\n", "):\n", " global start_idx\n", " os.makedirs(save_dir, exist_ok=True)\n", " if seed is not None:\n", " set_random_seed(seed=seed)\n", " if not isinstance(prompts, list):\n", " prompts = [prompts]\n", " if mask_strategy is None:\n", " mask_strategy = [\"\"] * len(prompts)\n", " if reference_path is None:\n", " reference_path = [\"\"] * len(prompts)\n", " save_fps = cfg.fps // cfg.get(\"frame_interval\", 1)\n", " if num_sampling_steps is not None:\n", " cfg.scheduler[\"num_sampling_steps\"] = num_sampling_steps\n", " if cfg_scale is not None:\n", " cfg.scheduler[\"scale\"] = cfg_scale\n", " scheduler = build_module(cfg.scheduler, SCHEDULERS)\n", " ret_path = []\n", "\n", " # == prepare video size ==\n", " if image_size is None:\n", " assert (\n", " resolution is not None and aspect_ratio is not None\n", " ), \"resolution and aspect_ratio must be provided if image_size is not provided\"\n", " image_size = get_image_size(resolution, aspect_ratio)\n", " num_frames = get_num_frames(num_frames)\n", " input_size = (num_frames, *image_size)\n", " latent_size = vae.get_latent_size(input_size)\n", "\n", " # == Iter over all samples ==\n", " 
for i in tqdm(range(0, len(prompts), batch_size), disable=disable_progress):\n", " # == prepare batch prompts ==\n", " batch_prompts = prompts[i : i + batch_size]\n", " ms = mask_strategy[i : i + batch_size]\n", " refs = reference_path[i : i + batch_size]\n", "\n", " batch_prompts, refs, ms = extract_json_from_prompts(batch_prompts, refs, ms)\n", " refs = collect_references_batch(refs, vae, image_size)\n", "\n", " # == multi-resolution info ==\n", " model_args = prepare_multi_resolution_info(\n", " multi_resolution, len(batch_prompts), image_size, num_frames, fps, device, dtype\n", " )\n", "\n", " # == Iter over number of sampling for one prompt ==\n", " for k in range(num_sample):\n", " # == prepare save paths ==\n", " save_paths = [\n", " get_save_path_name(\n", " save_dir,\n", " sample_name=sample_name,\n", " sample_idx=start_idx + idx,\n", " prompt=batch_prompts[idx],\n", " prompt_as_path=prompt_as_path,\n", " num_sample=num_sample,\n", " k=k,\n", " )\n", " for idx in range(len(batch_prompts))\n", " ]\n", "\n", " # NOTE: Skip if the sample already exists\n", " # This is useful for resuming sampling VBench\n", " if prompt_as_path and all_exists(save_paths):\n", " continue\n", "\n", " # == Iter over loop generation ==\n", " video_clips = []\n", " for loop_i in range(loop):\n", " batch_prompts_loop = extract_prompts_loop(batch_prompts, loop_i)\n", " batch_prompts_cleaned = [text_preprocessing(prompt) for prompt in batch_prompts_loop]\n", "\n", " # == loop ==\n", " if loop_i > 0:\n", " refs, ms = append_generated(vae, video_clips[-1], refs, ms, loop_i, condition_frame_length)\n", "\n", " # == sampling ==\n", " z = torch.randn(len(batch_prompts), vae.out_channels, *latent_size, device=device, dtype=dtype)\n", " masks = apply_mask_strategy(z, refs, ms, loop_i, align=align)\n", " samples = scheduler.sample(\n", " model,\n", " text_encoder,\n", " z=z,\n", " prompts=batch_prompts_cleaned,\n", " device=device,\n", " additional_args=model_args,\n", " progress=False,\n", " 
mask=masks,\n", " )\n", " samples = vae.decode(samples.to(dtype), num_frames=num_frames)\n", " video_clips.append(samples)\n", "\n", " # == save samples ==\n", " if is_main_process():\n", " for idx, batch_prompt in enumerate(batch_prompts):\n", " save_path = save_paths[idx]\n", " video = [video_clips[i][idx] for i in range(loop)]\n", " for i in range(1, loop):\n", " video[i] = video[i][:, condition_frame_length:]\n", " video = torch.cat(video, dim=1)\n", " path = save_sample(\n", " video,\n", " fps=save_fps,\n", " save_path=save_path,\n", " verbose=False,\n", " )\n", " ret_path.append(path)\n", " start_idx += len(batch_prompts)\n", " return ret_path" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from IPython.display import Video, Image, display\n", "\n", "\n", "def display_results(paths):\n", " for path in paths:\n", " if is_img(path):\n", " display(Image(path))\n", " else:\n", " display(Video(path, embed=True))\n", "\n", "\n", "def reset_start_idx():\n", " global start_idx\n", " start_idx = 0\n", "\n", "\n", "ALL_ASPECT_RATIO = [\"1:1\", \"16:9\", \"9:16\", \"3:4\", \"4:3\", \"1:2\", \"2:1\"]\n", "\n", "\n", "def inference_all_aspects(prompts, resolution, num_frames, *args, **kwargs):\n", " paths = []\n", " for aspect_ratio in tqdm(ALL_ASPECT_RATIO):\n", " paths.extend(\n", " inference(\n", " prompts,\n", " resolution=resolution,\n", " num_frames=num_frames,\n", " aspect_ratio=aspect_ratio,\n", " disable_progress=True,\n", " *args,\n", " **kwargs\n", " )\n", " )\n", " return paths" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Inference for OpenSora" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Sample code for inference for OpenSora." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "paths = inference(\n", " [\"a man.\", \"a woman\"],\n", " resolution=\"240p\",\n", " aspect_ratio=\"1:1\",\n", " num_frames=\"1x\",\n", " num_sampling_steps=30,\n", " cfg_scale=7.0,\n", ")\n", "display_results(paths)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Sample all aspect ratios." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "PROMPT = \"a boy.\"\n", "paths = inference_all_aspects(\n", " PROMPT,\n", " resolution=\"240p\",\n", " num_frames=\"1x\",\n", " num_sampling_steps=30,\n", " cfg_scale=7.0,\n", ")\n", "display_results(paths)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Sample all resolution and length." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "PROMPT = \"a boy.\"\n", "sample_cfg = {\n", " \"144p\": [1, \"1x\", \"2x\", \"4x\", \"8x\"],\n", " \"240p\": [1, \"1x\", \"2x\", \"4x\", \"8x\"],\n", " \"360p\": [1, \"1x\", \"2x\", \"4x\"],\n", " \"480p\": [1, \"1x\", \"2x\", \"4x\"],\n", " \"720p\": [1, \"1x\", \"2x\"],\n", "}\n", "all_paths = []\n", "for resolution, num_frames in sample_cfg.items():\n", " for num_frame in num_frames:\n", " print(f\"Resolution: {resolution}, Num Frames: {num_frame}\")\n", " paths = inference(\n", " PROMPT,\n", " resolution=resolution,\n", " num_frames=num_frame,\n", " aspect_ratio=\"9:16\",\n", " num_sampling_steps=30,\n", " cfg_scale=7.0,\n", " disable_progress=True,\n", " )\n", " display_results(paths)\n", " all_paths.extend(paths)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Sample all resolution, length, and aspect ratios." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "PROMPT = \"a boy.\"\n", "sample_cfg = {\n", " \"144p\": [1, \"1x\", \"2x\", \"4x\", \"8x\"],\n", " \"240p\": [1, \"1x\", \"2x\", \"4x\", \"8x\"],\n", " \"360p\": [1, \"1x\", \"2x\", \"4x\"],\n", " \"480p\": [1, \"1x\", \"2x\", \"4x\"],\n", " \"720p\": [1, \"1x\", \"2x\"],\n", "}\n", "all_paths = []\n", "for resolution, num_frames in sample_cfg.items():\n", " for num_frame in num_frames:\n", " paths = inference_all_aspects(\n", " PROMPT,\n", " resolution=resolution,\n", " num_frames=num_frame,\n", " num_sampling_steps=30,\n", " cfg_scale=7.0,\n", " )\n", " display_results(paths)\n", " all_paths.extend(paths)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "opensora", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.14" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: Open-Sora/notebooks/launch.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Data Process Pipeline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Data Process Commands" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "# TODO: change to your own project path!!!\n", "OPEN_SORA_HOME = \"/path/to/Open-Sora/\"\n", "\n", "\n", "def convert_dataset_cmd(input_dir, output_file, datatype=\"video\"):\n", " commands = []\n", " commands.append(f'echo \"Converting {input_dir} to {output_file}\"')\n", " output_dir = os.path.dirname(output_file)\n", "\n", " commands.append(f\"mkdir -p
{output_dir}\")\n", " commands.append(f\"cd {OPEN_SORA_HOME}\")\n", " commands.append(f\"python -m tools.datasets.convert {datatype} {input_dir} --output {output_file}\")\n", " return \" && \".join(commands), output_file\n", "\n", "\n", "def get_video_info(input_file):\n", " commands = []\n", " base, ext = os.path.splitext(input_file)\n", " output_file = f\"{base}_info{ext}\"\n", " output_format = ext[1:]\n", "\n", " commands.append(f'echo \"Getting info of {input_file} to {output_file}\"')\n", " commands.append(f\"cd {OPEN_SORA_HOME}\")\n", " commands.append(\n", " f\"python -m tools.datasets.datautil {input_file} --output {output_file} --format {output_format} --info --fmin 1\"\n", " )\n", " return \" && \".join(commands), output_file\n", "\n", "\n", "def get_video_info_torchvision(input_file):\n", " commands = []\n", " base, ext = os.path.splitext(input_file)\n", " output_file = f\"{base}_info{ext}\"\n", " output_format = ext[1:]\n", "\n", " commands.append(f'echo \"Getting info of {input_file} to {output_file}\"')\n", " commands.append(f\"cd {OPEN_SORA_HOME}\")\n", " commands.append(\n", " f\"python -m tools.datasets.datautil {input_file} --output {output_file} --format {output_format} --video-info --fmin 1\"\n", " )\n", " return \" && \".join(commands), output_file\n", "\n", "\n", "def get_caption_llava7b_video(input_file):\n", " commands = []\n", " base, ext = os.path.splitext(input_file)\n", " output_file = f\"{base}_caption{ext}\"\n", " output_format = ext[1:]\n", "\n", " commands.append(f'echo \"Getting info of {input_file} to {output_file}\"')\n", " commands.append(f\"cd {OPEN_SORA_HOME}\")\n", " commands.append(f\"conda activate llava2\")\n", " commands.append(\n", " f\"torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava {input_file} --dp-size 8 --tp-size 1 --model-path liuhaotian/llava-v1.6-mistral-7b --prompt video\"\n", " )\n", " commands.append(f\"conda activate opensora\")\n", " commands.append(\n", " f\"python -m 
tools.datasets.datautil {base}_caption_part*{ext} --output {output_file} --format {output_format} --intersection {input_file} --clean-caption --refine-llm-caption --remove-empty-caption\"\n", " )\n", " return \" && \".join(commands), output_file\n", "\n", "\n", "def get_caption_load(input_file):\n", " commands = []\n", " base, ext = os.path.splitext(input_file)\n", " output_file = f\"{base}_caption{ext}\"\n", " output_format = ext[1:]\n", "\n", " commands.append(f'echo \"Getting caption of {input_file} to {output_file}\"')\n", " commands.append(f\"cd {OPEN_SORA_HOME}\")\n", " commands.append(\n", " f\"python -m tools.datasets.datautil {input_file} --output {output_file} --format {output_format} --load-caption json --remove-empty-caption --clean-caption\"\n", " )\n", " return \" && \".join(commands), output_file\n", "\n", "\n", "def get_aesthetic_score(input_file):\n", " commands = []\n", " base, ext = os.path.splitext(input_file)\n", " output_file = f\"{base}_aes{ext}\"\n", " output_format = ext[1:]\n", "\n", " commands.append(f'echo \"Getting aesthetic score of {input_file} to {output_file}\"')\n", " commands.append(f\"cd {OPEN_SORA_HOME}\")\n", " commands.append(f\"torchrun --standalone --nproc_per_node 8 -m tools.scoring.aesthetic.inference {input_file}\")\n", " commands.append(\n", " f\"python -m tools.datasets.datautil {base}_aes_part*{ext} --output {output_file} --format {output_format} --sort aes\"\n", " )\n", " return \" && \".join(commands), output_file\n", "\n", "\n", "def get_flow_score(input_file):\n", " commands = []\n", " base, ext = os.path.splitext(input_file)\n", " output_file = f\"{base}_flow{ext}\"\n", "\n", " commands.append(f'echo \"Getting flow score of {input_file} to {output_file}\"')\n", " commands.append(f\"cd {OPEN_SORA_HOME}\")\n", " commands.append(f\"torchrun --standalone --nproc_per_node 8 -m tools.scoring.optical_flow.inference {input_file}\")\n", " return \" && \".join(commands), output_file\n", "\n", "\n", "def 
get_ocr(input_file):\n", " commands = []\n", " base, ext = os.path.splitext(input_file)\n", " output_file = f\"{base}_match{ext}\"\n", "\n", " commands.append(f'echo \"Getting match score of {input_file} to {output_file}\"')\n", " commands.append(f\"cd {OPEN_SORA_HOME}\")\n", " commands.append(f\"torchrun --standalone --nproc_per_node 8 -m tools.scoring.ocr.inference {input_file}\")\n", " return \" && \".join(commands), output_file\n", "\n", " \n", "def get_match_score(input_file):\n", " commands = []\n", " base, ext = os.path.splitext(input_file)\n", " output_file = f\"{base}_match{ext}\"\n", "\n", " commands.append(f'echo \"Getting match score of {input_file} to {output_file}\"')\n", " commands.append(f\"cd {OPEN_SORA_HOME}\")\n", " commands.append(f\"torchrun --standalone --nproc_per_node 8 -m tools.scoring.matching.inference {input_file}\")\n", " return \" && \".join(commands), output_file\n", "\n", "\n", "def get_cmotion_score(input_file):\n", " commands = []\n", " base, ext = os.path.splitext(input_file)\n", " output_file = f\"{base}_cmotion{ext}\"\n", "\n", " commands.append(f'echo \"Getting cmotion score of {input_file} to {output_file}\"')\n", " commands.append(f\"cd {OPEN_SORA_HOME}\")\n", " commands.append(f\"python -m tools.caption.camera_motion_detect {input_file}\")\n", " return \" && \".join(commands), output_file\n", "\n", "\n", "def get_commands(job_list):\n", " commands = []\n", " output_file = None\n", " for job in job_list:\n", " cmd = job.pop(\"cmd\")\n", " if output_file is None:\n", " command, output_file = cmd(**job)\n", " commands.append(command)\n", " else:\n", " job[\"input_file\"] = output_file\n", " command, output_file = cmd(**job)\n", " commands.append(command)\n", " commands.append(f'echo \"All Done!\"')\n", " return \" && \".join(commands), output_file" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Remote Launch via Paramiko\n", "\n", "First, add hosts to `~/.ssh/config`" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ "import paramiko\n", "\n", "HOSTS = [\"host-0\", \"host-1\", \"host-2\", \"host-3\", \"host-4\", \"host-5\", \"host-6\", \"host-7\"]\n", "\n", "# load from ~/.ssh/config\n", "ssh_config = paramiko.SSHConfig()\n", "user_config_file = os.path.expanduser(\"~/.ssh/config\")\n", "if os.path.exists(user_config_file):\n", " with open(user_config_file) as f:\n", " ssh_config.parse(f)\n", "\n", "\n", "def get_ssh_config(hostname):\n", " # get the configuration for the host\n", " user_config = ssh_config.lookup(hostname)\n", " cfg = {\n", " \"hostname\": user_config[\"hostname\"],\n", " \"username\": user_config[\"user\"],\n", " \"port\": int(user_config[\"port\"]),\n", " \"key_filename\": user_config[\"identityfile\"],\n", " }\n", " return cfg\n", "\n", "\n", "def connect(hostname):\n", " cfg = get_ssh_config(hostname)\n", " # connect\n", " client = paramiko.SSHClient()\n", " client.set_missing_host_key_policy(paramiko.AutoAddPolicy())\n", " client.connect(**cfg)\n", " return client\n", "\n", "\n", "def run_command(command, hostname, nohup=False, log_file=None, sleep=None):\n", " client = connect(hostname)\n", " print(\"HOST:\", hostname)\n", " if sleep:\n", " command = f\"sleep {sleep}; {command}\"\n", " command = f\"bash -ic '{command}'\"\n", " if log_file:\n", " command = f\"{command} >> {log_file} 2>&1\"\n", " if nohup:\n", " command = f\"nohup {command} &\"\n", " print(\"COMMAND:\", command)\n", " stdin, stdout, stderr = client.exec_command(command, get_pty=False)\n", "\n", " stdout_str = stdout.read().decode()\n", " stderr_str = stderr.read().decode()\n", " if stdout_str:\n", " print(\"==== STDOUT ====\\n\", stdout_str)\n", " if stderr_str:\n", " print(\"==== STDERR ====\\n\", stderr_str)\n", "\n", " client.close()\n", "\n", "\n", "def run_command_all_hosts(command, hosts=HOSTS):\n", " for hostname in hosts:\n", " run_command(command, hostname)" ] }, { "cell_type": "markdown", "metadata": {}, 
"source": [ "Here are tools to examine machine's status." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def nvidia_smi(host):\n", " if host:\n", " run_command(\"nvidia-smi\", host)\n", " else:\n", " run_command_all_hosts(\"nvidia-smi\")\n", "\n", "\n", "def nvitop(host=None):\n", " if host:\n", " run_command(f\"/home/user/.local/bin/nvitop -1\", host)\n", " else:\n", " run_command_all_hosts(\"/home/user/.local/bin/nvitop -1\")\n", "\n", "\n", "def ps(host=None, interest=\"python|sleep|torchrun|colossal\", all=True):\n", " cmd = \"ps aux\" if all else \"ps ux\"\n", " if host:\n", " if interest is None:\n", " run_command(f\"{cmd} | cat\", host)\n", " else:\n", " run_command(f'{cmd} | cat | grep --color=never -E \"{interest}\"', host)\n", " else:\n", " if interest is None:\n", " run_command_all_hosts(f\"{cmd} | cat\")\n", " else:\n", " run_command_all_hosts(f'{cmd} | cat | grep --color=never -E \"{interest}\"')\n", "\n", "\n", "def kill(pid, host):\n", " run_command(f\"kill -KILL {pid}\", host)\n", "\n", "\n", "def pkill(interest, host):\n", " run_command(f'pkill -9 -f \"{interest}\"', host)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Example\n", "\n", "Remote launch via paramiko." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sleep = None\n", "run_command(cmd, host, log_file=log_file, nohup=True, sleep=sleep)\n", "ps(host)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using following commands to monitor the status of the jobs." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ps()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "nvitop(host)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "kill(, host)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Training" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def colossal_run(data_path, load_path=None):\n", " commands = []\n", " commands.append(f\"cd {OPEN_SORA_HOME}\")\n", " command = f\"colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py configs/opensora-v1-1/train/video.py --wandb True --data-path {data_path}\"\n", " if load_path:\n", " command = f\"{command} --load-path {load_path}\"\n", " commands.append(command)\n", " cmd = \" && \".join(commands)\n", " return cmd\n", "\n", "\n", "def kill_all():\n", " commands = []\n", " commands.append(f\"cd {OPEN_SORA_HOME}\")\n", " commands.append('cat hostfile | xargs -I \"{}\" ssh \"{}\" pkill -9 python')\n", " cmd = \" && \".join(commands)\n", " return cmd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Examples" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "host = \"host-0\"\n", "log_file = os.path.join(OPEN_SORA_HOME, \"logs/train.log\")\n", "data_path = \"/path/to/meta.csv\"\n", "cmd = colossal_run(data_path)\n", "print(cmd)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "run_command(cmd, host, log_file=log_file, nohup=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cmd = kill_all()\n", "run_command(cmd, host)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", 
def auto_grad_checkpoint(module, *args, **kwargs):
    """Run ``module`` on the given inputs, using activation checkpointing if enabled.

    Checkpointing is considered enabled when ``module.grad_checkpointing`` is
    truthy (the attribute is optional and defaults to ``False``).

    Args:
        module: callable module to execute. If it is iterable, it is run through
            ``checkpoint_sequential`` and the segment count is read from
            ``module[0].grad_checkpointing_step``.
        *args: positional inputs forwarded to the module.
        **kwargs: keyword inputs forwarded to the module / checkpoint helper.

    Returns:
        Whatever the module returns.
    """
    enabled = getattr(module, "grad_checkpointing", False)
    if not enabled:
        # Plain forward pass, no recomputation in backward.
        return module(*args, **kwargs)

    if isinstance(module, Iterable):
        # Container of layers: checkpoint it segment by segment.
        segments = module[0].grad_checkpointing_step
        return checkpoint_sequential(module, segments, *args, use_reentrant=False, **kwargs)

    return checkpoint(module, *args, use_reentrant=False, **kwargs)
def _gather(
    input_: torch.Tensor,
    world_size: int,
    group: dist.ProcessGroup,
    gather_dim: int,
):
    """Collect ``input_`` from every rank of ``group`` into a list of tensors.

    The previous body could never run: it tested ``gather_list`` before the
    name was assigned (``UnboundLocalError``) and passed a ``gather_dim``
    keyword that ``torch.distributed.gather`` does not accept. It also
    allocated the receive list on every rank, which only makes sense for an
    all-gather, so that collective is used here.

    Args:
        input_: local tensor contributed by this rank.
        world_size: number of ranks in ``group``; one receive buffer is
            allocated per rank.
        group: process group to communicate over.
        gather_dim: kept for signature compatibility. The result is returned
            as a list (not concatenated), so the dimension is not used.

    Returns:
        A list of ``world_size`` tensors shaped like ``input_``, one per rank.

    NOTE(review): a second ``_gather(input_, pg, dim=-1)`` is defined further
    down in the same module and rebinds this name at import time — confirm
    whether this variant is still needed.
    """
    # Collectives need dense buffers; make the local shard contiguous first so
    # the receive buffers created from it are contiguous as well.
    input_ = input_.contiguous()
    gather_list = [torch.empty_like(input_) for _ in range(world_size)]
    dist.all_gather(gather_list, input_, group=group)
    return gather_list
def _gather(input_, pg: dist.ProcessGroup, dim=-1):
    """All-gather ``input_`` across ``pg`` and concatenate the shards along ``dim``.

    Args:
        input_: local tensor shard; it is made contiguous before communication.
        pg: process group to gather over.
        dim: dimension along which the gathered shards are concatenated.

    Returns:
        The contiguous input itself when the group has a single rank, otherwise
        the contiguous concatenation of every rank's shard along ``dim``.

    Raises:
        AssertionError: if more than one rank is involved and ``input_`` is not
            on a CUDA device.
    """
    input_ = input_.contiguous()
    world_size = dist.get_world_size(pg)
    # (a stray ``dist.get_rank(pg)`` whose result was discarded used to sit here)

    # skip if only one rank involved
    if world_size == 1:
        return input_

    # all gather
    tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
    assert input_.device.type == "cuda"
    dist.all_gather(tensor_list, input_, group=pg)

    # concat
    output = torch.cat(tensor_list, dim=dim).contiguous()
    return output
def split_forward_gather_backward(input_, process_group, dim, grad_scale=None):
    """Split ``input_`` across ranks in forward and gather gradients in backward.

    Thin wrapper around ``_SplitForwardGatherBackward.apply``.

    Args:
        input_: tensor to split along ``dim``.
        process_group: process group whose ranks share the tensor.
        dim: dimension to split in forward (and to gather along in backward).
        grad_scale: optional gradient scaling applied in backward. ``"up"``
            multiplies the incoming gradient by the group's world size,
            ``"down"`` divides it by the world size, and any other value
            (including the default ``None``) leaves it unscaled. The default
            used to be ``1.0``, which selected the same "no scaling" branch;
            it is now ``None`` to match ``gather_forward_split_backward``.

    Returns:
        The result of ``_SplitForwardGatherBackward.apply``.
    """
    return _SplitForwardGatherBackward.apply(input_, process_group, dim, grad_scale)
def __del__(self):
    """Destroy the process groups in ProcessGroupMesh."""
    # __del__ is still invoked when __init__ raised before ``pg_mesh`` was
    # assigned (e.g. the ``world_size % sp_size`` assertion failed), so the
    # attribute may be missing; skip cleanup instead of raising a secondary
    # AttributeError from the finalizer.
    pg_mesh = getattr(self, "pg_mesh", None)
    if pg_mesh is not None:
        pg_mesh.destroy_mesh_process_groups()
class T5LayerNorm(nn.Module):
    """T5-style layer norm (RMS norm): learned scale only, no bias, no mean subtraction."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.

        Args:
            hidden_size: size of the learned scale vector ``weight``.
            eps: constant added to the variance before the reciprocal square root.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Scale ``hidden_states`` by the reciprocal RMS over the last dim, then by ``weight``."""
        # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean
        # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated
        # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for
        # half-precision inputs is done in fp32

        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)

        # convert into half-precision if necessary
        if self.weight.dtype in [torch.float16, torch.bfloat16]:
            hidden_states = hidden_states.to(self.weight.dtype)

        return self.weight * hidden_states

    @staticmethod
    def from_native_module(module, *args, **kwargs):
        """Build a ``T5LayerNorm`` carrying the weight, eps and device of a ``FusedRMSNorm``.

        Args:
            module: source layer; its class must be named ``FusedRMSNorm`` and it
                must expose ``normalized_shape``, ``eps`` and ``weight``.
            *args, **kwargs: accepted and ignored.

        Raises:
            AssertionError: if ``module`` is not a ``FusedRMSNorm``.
        """
        # The two sentences of the message were previously concatenated without a
        # separating space ("...RMS Norm.Apex's fused norm...").
        assert module.__class__.__name__ == "FusedRMSNorm", (
            "Recovering T5LayerNorm requires the original layer to be apex's Fused RMS Norm. "
            "Apex's fused norm is automatically used by Hugging Face Transformers https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py#L265C5-L265C48"
        )

        layer_norm = T5LayerNorm(module.normalized_shape, eps=module.eps)
        layer_norm.weight.data.copy_(module.weight.data)
        layer_norm = layer_norm.to(module.weight.device)
        return layer_norm
def get_h_w(a, ts, eps=1e-4):
    """Derive an (h, w) pair from a ratio ``a`` and a target area ``ts``.

    ``h`` starts from ``sqrt(ts * a)`` and ``w`` from ``h / a`` (using the
    already rounded ``h``); each is nudged by ``eps`` and then rounded.

    Args:
        a: ratio used as ``h / w``.
        ts: target product ``h * w`` before rounding.
        eps: small offset added before rounding.

    Returns:
        Tuple ``(h, w)`` of integers.
    """

    def _round_prefer_even_ceiling(value):
        # Take the ceiling when it is even; otherwise fall back to the floor.
        upper = math.ceil(value)
        if upper % 2 == 0:
            return upper
        return math.floor(value)

    height = _round_prefer_even_ceiling((ts * a) ** 0.5 + eps)
    width = _round_prefer_even_ceiling(height / a + eps)
    return height, width
"0.50": (2036, 4072), "0.53": (2096, 3960), "0.54": (2118, 3918), "0.62": (2276, 3642), "0.56": (2160, 3840), # base "0.67": (2352, 3528), "0.75": (2494, 3326), "1.00": (2880, 2880), "1.33": (3326, 2494), "1.50": (3528, 2352), "1.78": (3840, 2160), "1.89": (3958, 2096), "2.00": (4072, 2036), "2.08": (4156, 1994), } # S = 3686400 ASPECT_RATIO_2K = { "0.38": (1176, 3136), "0.43": (1256, 2930), "0.48": (1330, 2770), "0.50": (1358, 2716), "0.53": (1398, 2640), "0.54": (1412, 2612), "0.56": (1440, 2560), # base "0.62": (1518, 2428), "0.67": (1568, 2352), "0.75": (1662, 2216), "1.00": (1920, 1920), "1.33": (2218, 1664), "1.50": (2352, 1568), "1.78": (2560, 1440), "1.89": (2638, 1396), "2.00": (2716, 1358), "2.08": (2772, 1330), } # S = 2073600 ASPECT_RATIO_1080P = { "0.38": (882, 2352), "0.43": (942, 2198), "0.48": (998, 2080), "0.50": (1018, 2036), "0.53": (1048, 1980), "0.54": (1058, 1958), "0.56": (1080, 1920), # base "0.62": (1138, 1820), "0.67": (1176, 1764), "0.75": (1248, 1664), "1.00": (1440, 1440), "1.33": (1662, 1246), "1.50": (1764, 1176), "1.78": (1920, 1080), "1.89": (1980, 1048), "2.00": (2036, 1018), "2.08": (2078, 998), } # S = 921600 ASPECT_RATIO_720P = { "0.38": (588, 1568), "0.43": (628, 1466), "0.48": (666, 1388), "0.50": (678, 1356), "0.53": (698, 1318), "0.54": (706, 1306), "0.56": (720, 1280), # base "0.62": (758, 1212), "0.67": (784, 1176), "0.75": (832, 1110), "1.00": (960, 960), "1.33": (1108, 832), "1.50": (1176, 784), "1.78": (1280, 720), "1.89": (1320, 698), "2.00": (1358, 680), "2.08": (1386, 666), } # S = 409920 ASPECT_RATIO_480P = { "0.38": (392, 1046), "0.43": (420, 980), "0.48": (444, 925), "0.50": (452, 904), "0.53": (466, 880), "0.54": (470, 870), "0.56": (480, 854), # base "0.62": (506, 810), "0.67": (522, 784), "0.75": (554, 738), "1.00": (640, 640), "1.33": (740, 555), "1.50": (784, 522), "1.78": (854, 480), "1.89": (880, 466), "2.00": (906, 454), "2.08": (924, 444), } # S = 230400 ASPECT_RATIO_360P = { "0.38": (294, 784), "0.43": 
(314, 732), "0.48": (332, 692), "0.50": (340, 680), "0.53": (350, 662), "0.54": (352, 652), "0.56": (360, 640), # base "0.62": (380, 608), "0.67": (392, 588), "0.75": (416, 554), "1.00": (480, 480), "1.33": (554, 416), "1.50": (588, 392), "1.78": (640, 360), "1.89": (660, 350), "2.00": (678, 340), "2.08": (692, 332), } # S = 102240 ASPECT_RATIO_240P = { "0.38": (196, 522), "0.43": (210, 490), "0.48": (222, 462), "0.50": (226, 452), "0.53": (232, 438), "0.54": (236, 436), "0.56": (240, 426), # base "0.62": (252, 404), "0.67": (262, 393), "0.75": (276, 368), "1.00": (320, 320), "1.33": (370, 278), "1.50": (392, 262), "1.78": (426, 240), "1.89": (440, 232), "2.00": (452, 226), "2.08": (462, 222), } # S = 36864 ASPECT_RATIO_144P = { "0.38": (117, 312), "0.43": (125, 291), "0.48": (133, 277), "0.50": (135, 270), "0.53": (139, 262), "0.54": (141, 260), "0.56": (144, 256), # base "0.62": (151, 241), "0.67": (156, 234), "0.75": (166, 221), "1.00": (192, 192), "1.33": (221, 165), "1.50": (235, 156), "1.78": (256, 144), "1.89": (263, 139), "2.00": (271, 135), "2.08": (277, 132), } # from PixArt # S = 8294400 ASPECT_RATIO_2880 = { "0.25": (1408, 5760), "0.26": (1408, 5568), "0.27": (1408, 5376), "0.28": (1408, 5184), "0.32": (1600, 4992), "0.33": (1600, 4800), "0.34": (1600, 4672), "0.40": (1792, 4480), "0.42": (1792, 4288), "0.47": (1920, 4096), "0.49": (1920, 3904), "0.51": (1920, 3776), "0.55": (2112, 3840), "0.59": (2112, 3584), "0.68": (2304, 3392), "0.72": (2304, 3200), "0.78": (2496, 3200), "0.83": (2496, 3008), "0.89": (2688, 3008), "0.93": (2688, 2880), "1.00": (2880, 2880), "1.07": (2880, 2688), "1.12": (3008, 2688), "1.21": (3008, 2496), "1.28": (3200, 2496), "1.39": (3200, 2304), "1.47": (3392, 2304), "1.70": (3584, 2112), "1.82": (3840, 2112), "2.03": (3904, 1920), "2.13": (4096, 1920), "2.39": (4288, 1792), "2.50": (4480, 1792), "2.92": (4672, 1600), "3.00": (4800, 1600), "3.12": (4992, 1600), "3.68": (5184, 1408), "3.82": (5376, 1408), "3.95": (5568, 1408), 
"4.00": (5760, 1408), } # S = 4194304 ASPECT_RATIO_2048 = { "0.25": (1024, 4096), "0.26": (1024, 3968), "0.27": (1024, 3840), "0.28": (1024, 3712), "0.32": (1152, 3584), "0.33": (1152, 3456), "0.35": (1152, 3328), "0.40": (1280, 3200), "0.42": (1280, 3072), "0.48": (1408, 2944), "0.50": (1408, 2816), "0.52": (1408, 2688), "0.57": (1536, 2688), "0.60": (1536, 2560), "0.68": (1664, 2432), "0.72": (1664, 2304), "0.78": (1792, 2304), "0.82": (1792, 2176), "0.88": (1920, 2176), "0.94": (1920, 2048), "1.00": (2048, 2048), "1.07": (2048, 1920), "1.13": (2176, 1920), "1.21": (2176, 1792), "1.29": (2304, 1792), "1.38": (2304, 1664), "1.46": (2432, 1664), "1.67": (2560, 1536), "1.75": (2688, 1536), "2.00": (2816, 1408), "2.09": (2944, 1408), "2.40": (3072, 1280), "2.50": (3200, 1280), "2.89": (3328, 1152), "3.00": (3456, 1152), "3.11": (3584, 1152), "3.62": (3712, 1024), "3.75": (3840, 1024), "3.88": (3968, 1024), "4.00": (4096, 1024), } # S = 1048576 ASPECT_RATIO_1024 = { "0.25": (512, 2048), "0.26": (512, 1984), "0.27": (512, 1920), "0.28": (512, 1856), "0.32": (576, 1792), "0.33": (576, 1728), "0.35": (576, 1664), "0.40": (640, 1600), "0.42": (640, 1536), "0.48": (704, 1472), "0.50": (704, 1408), "0.52": (704, 1344), "0.57": (768, 1344), "0.60": (768, 1280), "0.68": (832, 1216), "0.72": (832, 1152), "0.78": (896, 1152), "0.82": (896, 1088), "0.88": (960, 1088), "0.94": (960, 1024), "1.00": (1024, 1024), "1.07": (1024, 960), "1.13": (1088, 960), "1.21": (1088, 896), "1.29": (1152, 896), "1.38": (1152, 832), "1.46": (1216, 832), "1.67": (1280, 768), "1.75": (1344, 768), "2.00": (1408, 704), "2.09": (1472, 704), "2.40": (1536, 640), "2.50": (1600, 640), "2.89": (1664, 576), "3.00": (1728, 576), "3.11": (1792, 576), "3.62": (1856, 512), "3.75": (1920, 512), "3.88": (1984, 512), "4.00": (2048, 512), } # S = 262144 ASPECT_RATIO_512 = { "0.25": (256, 1024), "0.26": (256, 992), "0.27": (256, 960), "0.28": (256, 928), "0.32": (288, 896), "0.33": (288, 864), "0.35": (288, 832), 
def get_closest_ratio(height: float, width: float, ratios: dict):
    """Return the key of ``ratios`` numerically closest to ``height / width``.

    Args:
        height: numerator of the ratio to match.
        width: denominator of the ratio to match.
        ratios: mapping whose keys can be converted with ``float``; only the
            keys are inspected.

    Returns:
        The key (unchanged, not converted) with the smallest absolute distance
        to ``height / width``; the first such key wins on ties.
    """
    target = height / width

    def _distance(key):
        return abs(float(key) - target)

    return min(ratios, key=_distance)
def find_closet_smaller_bucket(t, t_dict, frame_interval):
    """Pick the first entry of ``t_dict`` that a sample of length ``t`` can fill.

    Args:
        t: sample length; ``1`` is handled as the image case.
        t_dict: mapping of bucket key -> length; iterated in its own order.
        frame_interval: multiplier applied to each bucket length before the
            comparison with ``t``.

    Returns:
        ``1`` for an image when ``1`` is a key of ``t_dict``; for other samples
        the first key whose value ``v`` satisfies ``t >= v * frame_interval``
        and ``v != 1``; ``None`` when nothing matches.
    """
    # process image
    if t == 1:
        return 1 if 1 in t_dict else None

    # process video
    matches = (
        key for key, length in t_dict.items() if t >= length * frame_interval and length != 1
    )
    return next(matches, None)
def get_bucket_id(self, T, H, W, frame_interval=1, seed=None):
    """Randomly assign a sample of size ``(T, H, W)`` to a bucket.

    Resolutions are visited in the order of ``self.bucket_probs``. A resolution
    is skipped when ``H * W`` is below 80% of its pixel budget
    (``self.hw_criteria``). Each ``(resolution, length)`` pair draws from its
    own generator seeded with ``seed + self.bucket_id[resolution][length]``.

    Args:
        T: number of frames; ``1`` is handled as an image.
        H: sample height.
        W: sample width.
        frame_interval: multiplier applied to a bucket length before it is
            compared with ``T``.
        seed: base seed for the per-bucket generators. ``None`` (the default)
            previously crashed with ``TypeError`` on ``seed + offset``; it now
            yields unseeded, non-reproducible generators.

    Returns:
        ``(hw_id, t_id, ar_id)`` for the selected bucket, or ``None`` when the
        sample was not accepted by any bucket.
    """

    def _bucket_rng(offset):
        # Keep the per-bucket seeding when a seed is supplied; fall back to
        # fresh OS entropy when it is not.
        return np.random.default_rng(None if seed is None else seed + offset)

    resolution = H * W
    approx = 0.8

    fail = True
    for hw_id, t_criteria in self.bucket_probs.items():
        if resolution < self.hw_criteria[hw_id] * approx:
            continue

        # if sample is an image
        if T == 1:
            if 1 in t_criteria:
                rng = _bucket_rng(self.bucket_id[hw_id][1])
                if rng.random() < t_criteria[1]:
                    fail = False
                    t_id = 1
                    break
            else:
                continue

        # otherwise, find suitable t_id for video
        t_fail = True
        for t_id, prob in t_criteria.items():
            rng = _bucket_rng(self.bucket_id[hw_id][t_id])
            if isinstance(prob, tuple):
                # second tuple entry gates whether this length is considered at all
                prob_t = prob[1]
                if rng.random() > prob_t:
                    continue
            if T > t_id * frame_interval and t_id != 1:
                t_fail = False
                break
        if t_fail:
            continue

        # leave the loop if prob is high enough
        if isinstance(prob, tuple):
            prob = prob[0]
        if prob >= 1 or rng.random() < prob:
            fail = False
            break
    if fail:
        return None

    # get aspect ratio id
    ar_criteria = self.ar_criteria[hw_id][t_id]
    ar_id = get_closest_ratio(H, W, ar_criteria)
    return hw_id, t_id, ar_id
def collate_fn_default(batch):
    """Collate a list of sample dicts, dropping ``None`` entries first.

    When the first remaining sample carries an integer ``"mask"``, the
    ``"mask"`` and ``"text"`` entries are removed from every sample (the sample
    dicts are mutated) and handled separately: masks are returned as a plain
    list and texts are concatenated along dimension 1. Everything else goes
    through ``torch.utils.data.default_collate``.

    Args:
        batch: list of sample dicts; ``None`` items are ignored.

    Returns:
        The collated batch.
    """
    # filter out None
    samples = [item for item in batch if item is not None]

    # HACK: for loading text features
    first = samples[0]
    has_text_features = "mask" in first and isinstance(first["mask"], int)
    if not has_text_features:
        return torch.utils.data.default_collate(samples)

    masks = [item.pop("mask") for item in samples]
    texts = torch.cat([item.pop("text") for item in samples], dim=1)

    collated = torch.utils.data.default_collate(samples)
    collated["mask"] = masks
    collated["text"] = texts
    return collated
squeeze the first dimension, which is due to torch.stack() in default_collate() if isinstance(res, collections.abc.Mapping): for k, v in res.items(): if isinstance(v, torch.Tensor): res[k] = v.squeeze(0) elif isinstance(res, collections.abc.Sequence): res = [x.squeeze(0) if isinstance(x, torch.Tensor) else x for x in res] elif isinstance(res, torch.Tensor): res = res.squeeze(0) else: raise TypeError return res ================================================ FILE: Open-Sora/opensora/datasets/datasets.py ================================================ import os from glob import glob import numpy as np import torch from PIL import ImageFile from torchvision.datasets.folder import IMG_EXTENSIONS, pil_loader from opensora.registry import DATASETS from .read_video import read_video from .utils import VID_EXTENSIONS, get_transforms_image, get_transforms_video, read_file, temporal_random_crop ImageFile.LOAD_TRUNCATED_IMAGES = True IMG_FPS = 120 @DATASETS.register_module() class VideoTextDataset(torch.utils.data.Dataset): """load video according to the csv file. Args: target_video_len (int): the number of video frames will be load. align_transform (callable): Align different videos in a specified size. temporal_sample (callable): Sample the target length of a video. 
def _print_data_number(self):
    """Print how many entries of ``self.data["path"]`` are videos and how many are images.

    Every path is classified with ``self.get_type``; anything that is not
    reported as ``"video"`` is counted as an image.
    """
    kinds = [self.get_type(path) for path in self.data["path"]]
    num_videos = sum(1 for kind in kinds if kind == "video")
    num_images = len(kinds) - num_videos
    print(f"Dataset contains {num_videos} videos and {num_images} images.")
@DATASETS.register_module()
class VariableVideoTextDataset(VideoTextDataset):
    """Video/image dataset whose clip length and resolution are chosen per sample.

    Instead of an integer, ``getitem`` receives a string ``"idx-T-H-W"`` that
    carries the row index together with the number of frames, height and width
    to produce. Transforms are therefore built per sample rather than once in
    the constructor.

    Args:
        data_path (str): path of the table file handed to the parent class.
        num_frames (int | None): stored by the parent; the per-sample value
            encoded in the index is what ``getitem`` uses.
        frame_interval (int): temporal stride used when sampling frames.
        image_size (tuple): stored by the parent; the per-sample size encoded
            in the index is what ``getitem`` uses.
        transform_name (str | None): transform preset built for every sample.
        dummy_text_feature (bool): when True, ``text`` is replaced by an
            all-zero tensor of shape (1, 50, 1152) and ``mask`` is set to 50.
    """

    def __init__(
        self,
        data_path=None,
        num_frames=None,
        frame_interval=1,
        image_size=(None, None),
        transform_name=None,
        dummy_text_feature=False,
    ):
        # The parent must not pre-build transforms: the target size is only
        # known per sample, so ``transform_name=None`` is passed on purpose.
        super().__init__(data_path, num_frames, frame_interval, image_size, transform_name=None)
        self.transform_name = transform_name
        self.data["id"] = np.arange(len(self.data))
        self.dummy_text_feature = dummy_text_feature

    def get_data_info(self, index):
        """Return ``(num_frames, height, width)`` stored in the table for row ``index``."""
        row = self.data.iloc[index]
        return row["num_frames"], row["height"], row["width"]

    def getitem(self, index):
        """Load the sample described by ``index`` (a string ``"idx-T-H-W"``).

        Returns:
            dict with ``video`` (C,T,H,W), ``num_frames``, ``height``, ``width``,
            ``ar`` (height / width), ``fps`` and optionally ``text`` / ``mask``.
        """
        # a hack to pass in the (time, height, width) info from sampler
        index, num_frames, height, width = [int(val) for val in index.split("-")]

        sample = self.data.iloc[index]
        path = sample["path"]
        file_type = self.get_type(path)
        ar = height / width

        video_fps = 24  # default fps
        if file_type == "video":
            # loading
            vframes, vinfo = read_video(path, backend="av")
            video_fps = vinfo.get("video_fps", 24)

            # Sampling video frames; clone so the full decoded video can be freed
            video = temporal_random_crop(vframes, num_frames, self.frame_interval)
            video = video.clone()
            del vframes

            # skipping frames lowers the effective frame rate
            video_fps = video_fps // self.frame_interval

            # transform
            transform = get_transforms_video(self.transform_name, (height, width))
            video = transform(video)  # T C H W
        else:
            # loading
            image = pil_loader(path)
            video_fps = IMG_FPS

            # transform
            transform = get_transforms_image(self.transform_name, (height, width))
            image = transform(image)

            # an image is a single-frame clip
            video = image.unsqueeze(0)

        # TCHW -> CTHW
        video = video.permute(1, 0, 2, 3)
        ret = {
            "video": video,
            "num_frames": num_frames,
            "height": height,
            "width": width,
            "ar": ar,
            "fps": video_fps,
        }
        if self.get_text:
            ret["text"] = sample["text"]
        if self.dummy_text_feature:
            text_len = 50
            ret["text"] = torch.zeros((1, text_len, 1152))
            ret["mask"] = text_len
        return ret

    def __getitem__(self, index):
        """Return the sample for ``index`` or ``None`` when it cannot be loaded.

        Returning ``None`` is kept as the best-effort contract. Only
        ``Exception`` is caught so that ``KeyboardInterrupt`` / ``SystemExit``
        still propagate, and the failure is reported instead of being dropped
        silently (mirrors ``VideoTextDataset.__getitem__``).
        """
        try:
            return self.getitem(index)
        except Exception as e:
            print(f"data {index}: {e}")
            return None
def read_video_av(
    filename: str,
    start_pts: Union[float, Fraction] = 0,
    end_pts: Optional[Union[float, Fraction]] = None,
    pts_unit: str = "pts",
    output_format: str = "THWC",
) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, Any]]:
    """
    Reads the video frames of a file with PyAV.

    This method is modified from torchvision.io.video.read_video, with the following changes:

    1. will not extract audio frames and return empty for aframes
    2. remove checks and only support pyav
    3. add container.close() and gc.collect() to avoid thread leakage
    4. try our best to avoid memory leak

    Args:
        filename (str): path to the video file
        start_pts (int if pts_unit = 'pts', float / Fraction if pts_unit = 'sec', optional):
            The start presentation time of the video
        end_pts (int if pts_unit = 'pts', float / Fraction if pts_unit = 'sec', optional):
            The end presentation time
        pts_unit (str, optional): unit in which start_pts and end_pts values will be interpreted,
            either 'pts' or 'sec'. Defaults to 'pts'.
        output_format (str, optional): The format of the output video tensors.
            Can be either "THWC" (default) or "TCHW".

    Returns:
        vframes (Tensor[T, H, W, C] or Tensor[T, C, H, W]): the `T` video frames
            (``T == 0`` when the file could not be read or seeked)
        aframes (Tensor[1, 0]): always empty, audio is not decoded
        info (Dict): metadata for the video. Can contain the field video_fps (float)

    Raises:
        ValueError: on an unknown ``output_format`` or ``end_pts < start_pts``.
        RuntimeError: if ``filename`` does not exist.
    """
    # format
    output_format = output_format.upper()
    if output_format not in ("THWC", "TCHW"):
        raise ValueError(f"output_format should be either 'THWC' or 'TCHW', got {output_format}.")
    # file existence
    if not os.path.exists(filename):
        raise RuntimeError(f"File not found: {filename}")
    # backend check
    assert get_video_backend() == "pyav", "pyav backend is required for read_video_av"
    _check_av_available()
    # end_pts check
    if end_pts is None:
        end_pts = float("inf")
    if end_pts < start_pts:
        raise ValueError(f"end_pts should be larger than start_pts, got start_pts={start_pts} and end_pts={end_pts}")

    # == get video info ==
    info = {}
    # TODO: creating an container leads to memory leak (1G for 8 workers 1 GPU)
    probe = av.open(filename, metadata_errors="ignore")
    try:
        # fps
        video_fps = probe.streams.video[0].average_rate
        # guard against potentially corrupted files
        if video_fps is not None:
            info["video_fps"] = float(video_fps)
        # decode a single frame only to learn the frame size
        first_frame = next(probe.decode(video=0)).to_rgb().to_ndarray()
        height, width = first_frame.shape[:2]
        total_frames = probe.streams.video[0].frames
    finally:
        # always release the probe container, even when decoding the first
        # frame fails (previously it leaked in that case)
        probe.close()
        del probe
    if total_frames == 0:
        total_frames = MAX_NUM_FRAMES
        warnings.warn(f"total_frames is 0, using {MAX_NUM_FRAMES} as a fallback")

    # HACK: must create before iterating stream
    # use np.zeros will not actually allocate memory
    # use np.ones will lead to a little memory leak
    video_frames = np.zeros((total_frames, height, width, 3), dtype=np.uint8)

    # == read ==
    container = None
    try:
        # TODO: The reading has memory leak (4G for 8 workers 1 GPU)
        container = av.open(filename, metadata_errors="ignore")
        assert container.streams.video is not None
        decoded = _read_from_stream(
            video_frames,
            container,
            start_pts,
            end_pts,
            pts_unit,
            container.streams.video[0],
            {"video": 0},
            filename=filename,
        )
        if isinstance(decoded, np.ndarray):
            video_frames = decoded
        else:
            # _read_from_stream returns an empty list when seeking fails;
            # torch.from_numpy cannot consume that, so expose "no frames".
            video_frames = video_frames[:0]
    except av.AVError as e:
        print(f"[Warning] Error while reading video {filename}: {e}")
        # nothing was decoded: do not hand the all-zero buffer back as frames
        video_frames = video_frames[:0]
    finally:
        # _read_from_stream closes the container on its normal path only;
        # closing again here covers its early-return and error paths.
        if container is not None:
            container.close()
        del container

    vframes = torch.from_numpy(video_frames).clone()
    del video_frames
    if output_format == "TCHW":
        # [T,H,W,C] --> [T,C,H,W]
        vframes = vframes.permute(0, 3, 1, 2)

    aframes = torch.empty((1, 0), dtype=torch.float32)
    return vframes, aframes, info
pos = extradata.find(b"DivX") d = extradata[pos:] o = re.search(rb"DivX(\d+)Build(\d+)(\w)", d) if o is None: o = re.search(rb"DivX(\d+)b(\d+)(\w)", d) if o is not None: should_buffer = o.group(3) == b"p" seek_offset = start_offset # some files don't seek to the right location, so better be safe here seek_offset = max(seek_offset - 1, 0) if should_buffer: # FIXME this is kind of a hack, but we will jump to the previous keyframe # so this will be safe seek_offset = max(seek_offset - max_buffer_size, 0) try: # TODO check if stream needs to always be the video stream here or not container.seek(seek_offset, any_frame=False, backward=True, stream=stream) except av.AVError as e: print(f"[Warning] Error while seeking video {filename}: {e}") return [] # == main == buffer_count = 0 frames_pts = [] cnt = 0 try: for _idx, frame in enumerate(container.decode(**stream_name)): frames_pts.append(frame.pts) video_frames[cnt] = frame.to_rgb().to_ndarray() cnt += 1 if cnt >= len(video_frames): break if frame.pts >= end_offset: if should_buffer and buffer_count < max_buffer_size: buffer_count += 1 continue break except av.AVError as e: print(f"[Warning] Error while reading video {filename}: {e}") # garbage collection for thread leakage container.close() del container # NOTE: manually garbage collect to close pyav threads gc.collect() # ensure that the results are sorted wrt the pts # NOTE: here we assert frames_pts is sorted start_ptr = 0 end_ptr = cnt while start_ptr < end_ptr and frames_pts[start_ptr] < start_offset: start_ptr += 1 while start_ptr < end_ptr and frames_pts[end_ptr - 1] > end_offset: end_ptr -= 1 if start_offset > 0 and start_offset not in frames_pts[start_ptr:end_ptr]: # if there is no frame that exactly matches the pts of start_offset # add the last frame smaller than start_offset, to guarantee that # we will have all the necessary data. 
def read_video_cv2(video_path):
    """Decode every frame of ``video_path`` with OpenCV.

    Args:
        video_path (str): path of the video file.

    Returns:
        tuple: ``(frames, vinfo)`` where ``frames`` is a uint8 tensor laid out
        as [T, C=3, H, W] in RGB order and ``vinfo`` is ``{"video_fps": fps}``.

    Raises:
        ValueError: if the file cannot be opened or contains no readable frame.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise ValueError(f"Unable to open video: {video_path}")

        vinfo = {"video_fps": cap.get(cv2.CAP_PROP_FPS)}

        frames = []
        while True:
            # Read a frame from the video; ``ok`` turns False at end of stream
            ok, frame = cap.read()
            if not ok:
                break
            frames.append(frame[:, :, ::-1])  # BGR to RGB
            # NOTE: the former cv2.waitKey(25) "press q" check was removed on
            # purpose: this is a data loader, not a player, and HighGUI calls
            # are unavailable on headless OpenCV builds.
    finally:
        # Release the capture object on every path, including failures
        cap.release()

    if not frames:
        raise ValueError(f"No frames could be read from video: {video_path}")

    # np.stack copies, which also makes the reversed-channel views contiguous
    frames = torch.from_numpy(np.stack(frames))  # [T, H, W, C=3]
    frames = frames.permute(0, 3, 1, 2)  # [T, C, H, W]
    return frames, vinfo
def __iter__(self) -> Iterator[List[int]]:
    """Yield this rank's micro-batches, one per iteration step.

    Each yielded micro-batch is a list of strings ``"{idx}-{t}-{h}-{w}"``:
    the dataset index followed by the frame count, height and width returned
    by ``self.bucket.get_thw`` for the bucket the sample belongs to.

    The bucket assignment cached by ``get_num_batch`` is reused (and cleared)
    when available; otherwise it is rebuilt with ``group_by_bucket``. All
    shuffling uses one generator seeded with ``self.seed + self.epoch``.
    Iteration starts at ``self.last_micro_batch_access_index`` (rounded down
    to a multiple of ``self.num_replicas``) and ``self.reset()`` is called
    once the epoch is exhausted.
    """
    if self._get_num_batch_cached_bucket_sample_dict is not None:
        bucket_sample_dict = self._get_num_batch_cached_bucket_sample_dict
        self._get_num_batch_cached_bucket_sample_dict = None
    else:
        bucket_sample_dict = self.group_by_bucket()
        if self.verbose:
            self._print_bucket_info(bucket_sample_dict)

    g = torch.Generator()
    g.manual_seed(self.seed + self.epoch)
    # bucket_id -> number of micro-batches the bucket can provide
    bucket_micro_batch_count = OrderedDict()
    # bucket_id -> number of samples already handed out from the bucket
    bucket_last_consumed = OrderedDict()

    # process the samples
    for bucket_id, data_list in bucket_sample_dict.items():
        # handle droplast
        bs_per_gpu = self.bucket.get_batch_size(bucket_id)
        remainder = len(data_list) % bs_per_gpu

        if remainder > 0:
            if not self.drop_last:
                # if there is remainder, we pad to make it divisible
                data_list += data_list[: bs_per_gpu - remainder]
            else:
                # we just drop the remainder to make it divisible
                data_list = data_list[:-remainder]
        bucket_sample_dict[bucket_id] = data_list

        # handle shuffle
        if self.shuffle:
            data_indices = torch.randperm(len(data_list), generator=g).tolist()
            data_list = [data_list[i] for i in data_indices]
            bucket_sample_dict[bucket_id] = data_list

        # compute how many micro-batches each bucket has
        num_micro_batches = len(data_list) // bs_per_gpu
        bucket_micro_batch_count[bucket_id] = num_micro_batches

    # compute the bucket access order
    # each bucket may have more than one batch of data
    # thus bucket_id may appear more than 1 time
    bucket_id_access_order = []
    for bucket_id, num_micro_batch in bucket_micro_batch_count.items():
        bucket_id_access_order.extend([bucket_id] * num_micro_batch)

    # randomize the access order
    if self.shuffle:
        bucket_id_access_order_indices = torch.randperm(len(bucket_id_access_order), generator=g).tolist()
        bucket_id_access_order = [bucket_id_access_order[i] for i in bucket_id_access_order_indices]

    # make the number of bucket accesses divisible by dp size
    remainder = len(bucket_id_access_order) % self.num_replicas
    if remainder > 0:
        if self.drop_last:
            bucket_id_access_order = bucket_id_access_order[: len(bucket_id_access_order) - remainder]
        else:
            bucket_id_access_order += bucket_id_access_order[: self.num_replicas - remainder]

    # prepare each batch from its bucket
    # according to the predefined bucket access order
    num_iters = len(bucket_id_access_order) // self.num_replicas
    start_iter_idx = self.last_micro_batch_access_index // self.num_replicas

    # re-compute the micro-batch consumption
    # this is useful when resuming from a state dict with a different number of GPUs
    self.last_micro_batch_access_index = start_iter_idx * self.num_replicas
    # replay the already-consumed accesses so per-bucket offsets match the resume point
    for i in range(self.last_micro_batch_access_index):
        bucket_id = bucket_id_access_order[i]
        bucket_bs = self.bucket.get_batch_size(bucket_id)
        if bucket_id in bucket_last_consumed:
            bucket_last_consumed[bucket_id] += bucket_bs
        else:
            bucket_last_consumed[bucket_id] = bucket_bs

    for i in range(start_iter_idx, num_iters):
        # the num_replicas consecutive accesses of this step, one per rank
        bucket_access_list = bucket_id_access_order[i * self.num_replicas : (i + 1) * self.num_replicas]
        self.last_micro_batch_access_index += self.num_replicas

        # compute the data samples consumed by each access
        bucket_access_boundaries = []
        for bucket_id in bucket_access_list:
            bucket_bs = self.bucket.get_batch_size(bucket_id)
            last_consumed_index = bucket_last_consumed.get(bucket_id, 0)
            bucket_access_boundaries.append([last_consumed_index, last_consumed_index + bucket_bs])

            # update consumption
            if bucket_id in bucket_last_consumed:
                bucket_last_consumed[bucket_id] += bucket_bs
            else:
                bucket_last_consumed[bucket_id] = bucket_bs

        # compute the range of data accessed by each GPU
        bucket_id = bucket_access_list[self.rank]
        boundary = bucket_access_boundaries[self.rank]
        cur_micro_batch = bucket_sample_dict[bucket_id][boundary[0] : boundary[1]]

        # encode t, h, w into the sample index
        real_t, real_h, real_w = self.bucket.get_thw(bucket_id)
        cur_micro_batch = [f"{idx}-{real_t}-{real_h}-{real_w}" for idx in cur_micro_batch]
        yield cur_micro_batch

    # epoch finished: the next iteration starts from the first micro-batch again
    self.reset()
def get_num_batch(self) -> int:
    """Return the approximate number of micro-batches over all buckets.

    The bucket assignment is rebuilt with ``group_by_bucket`` and cached in
    ``self._get_num_batch_cached_bucket_sample_dict`` so that the following
    ``__iter__`` call can reuse it instead of grouping again.

    The count is computed here with the same formula ``_print_bucket_info``
    uses (``len(samples) // batch_size`` summed over buckets). Previously the
    value was only set as a side effect of ``_print_bucket_info``, which runs
    in verbose mode only, so a non-verbose sampler returned ``None``.

    Returns:
        int: total number of micro-batches, also stored in
        ``self.approximate_num_batch``.
    """
    bucket_sample_dict = self.group_by_bucket()
    self._get_num_batch_cached_bucket_sample_dict = bucket_sample_dict

    # logging stays optional ...
    if self.verbose:
        self._print_bucket_info(bucket_sample_dict)

    # ... but the count must always be available
    self.approximate_num_batch = sum(
        len(samples) // self.bucket.get_batch_size(bucket_id[:-1])
        for bucket_id, samples in bucket_sample_dict.items()
    )
    return self.approximate_num_batch
class BatchDistributedSampler(DistributedSampler):
    """
    Sampler for buffer-based datasets exposing ``num_buffers`` and ``len_buffer``.

    Every rank walks its own contiguous range of sample indices, so a buffer
    is only ever read by one rank.
    Suppose len_buffer == 5, num_buffers == 6, #GPUs == 3, then
           | buffer {i}          | buffer {i+1}
    ------ | ------------------- | -------------------
    rank 0 |  0,  1,  2,  3,  4, |  5,  6,  7,  8,  9
    rank 1 | 10, 11, 12, 13, 14, | 15, 16, 17, 18, 19
    rank 2 | 20, 21, 22, 23, 24, | 25, 26, 27, 28, 29
    """

    def __init__(self, dataset: Dataset, **kwargs):
        super().__init__(dataset, **kwargs)
        # position inside this rank's range from which iteration resumes
        self.start_index = 0

    def __iter__(self):
        """Iterate over this rank's indices, skipping the first ``start_index``."""
        buffers_per_rank = self.dataset.num_buffers // self.num_replicas
        samples_per_rank = self.dataset.len_buffer * buffers_per_rank
        rank_offset = self.rank * samples_per_rank
        local_positions = np.arange(self.start_index, samples_per_rank)
        return iter((local_positions + rank_offset).tolist())

    def reset(self):
        """Restart iteration from the beginning of the rank's range."""
        self.start_index = 0

    def state_dict(self, step) -> dict:
        """Return the resumable state for the given step."""
        return {"start_index": step}

    def load_state_dict(self, state_dict: dict):
        """Resume from the position after the stored ``start_index``."""
        saved_position = state_dict["start_index"]
        self.start_index = saved_position + 1
def download_url(input_path, timeout=60):
    """Download ``input_path`` into the local ``cache`` directory.

    Args:
        input_path (str): URL of the file to fetch.
        timeout (float): seconds to wait for the server (connect / read)
            before giving up, passed to ``requests.get``.

    Returns:
        str: path of the downloaded file inside ``cache``.

    Raises:
        requests.HTTPError: if the server answers with an error status, so an
            error page is never stored as if it were the media file.
        requests.Timeout: if the server does not respond within ``timeout``.
    """
    from urllib.parse import urlparse

    output_dir = "cache"
    os.makedirs(output_dir, exist_ok=True)

    # Name the file after the URL *path* only, so a query string or fragment
    # does not end up in the file name and hide the real extension.
    base_name = os.path.basename(urlparse(input_path).path)
    output_path = os.path.join(output_dir, base_name)

    response = requests.get(input_path, timeout=timeout)
    response.raise_for_status()
    with open(output_path, "wb") as handler:
        handler.write(response.content)

    print(f"URL {input_path} downloaded to {output_path}")
    return output_path
def get_transforms_image(name="center", image_size=(256, 256)):
    """Build the preprocessing pipeline applied to PIL images.

    Args:
        name (str | None): ``"center"`` (square center crop), ``"resize_crop"``
            (resize then crop to fill) or ``None`` for no transform.
        image_size (tuple): target (height, width); must be square for ``"center"``.

    Returns:
        A ``transforms.Compose`` that crops, converts to tensor and normalizes
        with mean/std 0.5 per channel, or ``None`` when ``name`` is ``None``.

    Raises:
        NotImplementedError: for any other ``name``.
    """
    if name is None:
        return None

    # Only the spatial fitting step differs between the presets.
    if name == "center":
        assert image_size[0] == image_size[1], "Image size must be square for center crop"

        def fit(pil_image):
            return center_crop_arr(pil_image, image_size[0])

    elif name == "resize_crop":

        def fit(pil_image):
            return resize_crop_to_fill(pil_image, image_size)

    else:
        raise NotImplementedError(f"Transform {name} not implemented")

    pipeline = [
        transforms.Lambda(fit),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
    ]
    return transforms.Compose(pipeline)
def save_sample(x, save_path=None, fps=8, normalize=True, value_range=(-1, 1), force_video=False, verbose=True):
    """
    Save a sample as a ``.png`` image (single frame) or an ``.mp4`` video.

    Args:
        x (Tensor): shape [C, T, H, W]. The tensor is not modified.
        save_path (str): output path without extension; the extension is appended.
        fps (int): frame rate of the written video.
        normalize (bool): map ``value_range`` to [0, 1] before saving.
        value_range (tuple): (low, high) range of the values in ``x``.
        force_video (bool): write a video even when ``x`` has a single frame.
        verbose (bool): print the final path.

    Returns:
        str: the path that was written, including the extension.

    Raises:
        ValueError: if ``save_path`` is not given.
    """
    assert x.ndim == 4
    if save_path is None:
        raise ValueError("save_path must be provided")

    if not force_video and x.shape[1] == 1:  # T = 1: save as image
        save_path += ".png"
        x = x.squeeze(1)
        save_image([x], save_path, normalize=normalize, value_range=value_range)
    else:
        save_path += ".mp4"
        if normalize:
            low, high = value_range
            # out-of-place clamp gives a private copy, so the in-place ops
            # below no longer overwrite the caller's tensor
            x = x.clamp(min=low, max=high)
            x.sub_(low).div_(max(high - low, 1e-5))

        # [0, 1] float, C T H W  ->  [0, 255] uint8, T H W C on the CPU
        x = x.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 3, 0).to("cpu", torch.uint8)
        write_video(save_path, x, fps=fps, video_codec="h264")
    if verbose:
        print(f"Saved to {save_path}")
    return save_path
def resize_crop_to_fill(pil_image, image_size):
    """Resize a PIL image so it covers ``image_size`` and center-crop the excess.

    The image is scaled (bicubic) by the larger of the two height/width ratios,
    which keeps the aspect ratio and leaves overflow along one axis only; that
    axis is then cropped symmetrically.

    Args:
        pil_image: source PIL image.
        image_size (tuple): target (height, width).

    Returns:
        PIL image of exactly ``image_size``.
    """
    src_w, src_h = pil_image.size  # PIL is (W, H)
    dst_h, dst_w = image_size
    scale_h = dst_h / src_h
    scale_w = dst_w / src_w

    if scale_h > scale_w:
        # height is the limiting side: fit height, crop width
        new_h, new_w = dst_h, round(src_w * scale_h)
        top, left = 0, int(round((new_w - dst_w) / 2.0))
    else:
        # width is the limiting side: fit width, crop height
        new_h, new_w = round(src_h * scale_w), dst_w
        top, left = int(round((new_h - dst_h) / 2.0)), 0

    pixels = np.array(pil_image.resize((new_w, new_h), Image.BICUBIC))
    assert top + dst_h <= pixels.shape[0] and left + dst_w <= pixels.shape[1]
    return Image.fromarray(pixels[top : top + dst_h, left : left + dst_w])
# See the License for the specific language governing permissions and # limitations under the License.# Modified from Latte # - This file is adapted from https://github.com/Vchitect/Latte/blob/main/datasets/video_transforms.py import numbers import random import numpy as np import torch def _is_tensor_video_clip(clip): if not torch.is_tensor(clip): raise TypeError("clip should be Tensor. Got %s" % type(clip)) if not clip.ndimension() == 4: raise ValueError("clip should be 4D. Got %dD" % clip.dim()) return True def crop(clip, i, j, h, w): """ Args: clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W) """ if len(clip.size()) != 4: raise ValueError("clip should be a 4D tensor") return clip[..., i : i + h, j : j + w] def resize(clip, target_size, interpolation_mode): if len(target_size) != 2: raise ValueError(f"target size should be tuple (height, width), instead got {target_size}") return torch.nn.functional.interpolate(clip, size=target_size, mode=interpolation_mode, align_corners=False) def resize_scale(clip, target_size, interpolation_mode): if len(target_size) != 2: raise ValueError(f"target size should be tuple (height, width), instead got {target_size}") H, W = clip.size(-2), clip.size(-1) scale_ = target_size[0] / min(H, W) return torch.nn.functional.interpolate(clip, scale_factor=scale_, mode=interpolation_mode, align_corners=False) def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"): """ Do spatial cropping and resizing to the video clip Args: clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W) i (int): i in (i,j) i.e coordinates of the upper left corner. j (int): j in (i,j) i.e coordinates of the upper left corner. h (int): Height of the cropped region. w (int): Width of the cropped region. size (tuple(int, int)): height and width of resized clip Returns: clip (torch.tensor): Resized and cropped clip. 
def resize_crop_to_fill(clip, target_size):
    """Resize a video clip so it covers ``target_size`` and center-crop the excess.

    The clip is scaled (bilinear) by the larger of the two height/width ratios,
    so the aspect ratio is kept and only one axis overflows the target; that
    axis is then cropped symmetrically.

    Args:
        clip (torch.tensor): Video clip. Size is (T, C, H, W)
        target_size (tuple): target (height, width).

    Returns:
        torch.tensor: clip of size (T, C, target_height, target_width)
    """
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    h, w = clip.size(-2), clip.size(-1)
    th, tw = target_size[0], target_size[1]
    rh, rw = th / h, tw / w
    if rh > rw:
        # height is the limiting side: fit height, crop width
        sh, sw = th, round(w * rh)
        clip = resize(clip, (sh, sw), "bilinear")
        i = 0
        # round the half-difference (not the difference) so the offset matches
        # the image version of this transform and center_crop
        j = int(round((sw - tw) / 2.0))
    else:
        # width is the limiting side: fit width, crop height
        sh, sw = round(h * rw), tw
        clip = resize(clip, (sh, sw), "bilinear")
        i = int(round((sh - th) / 2.0))
        j = 0
    assert i + th <= clip.size(-2) and j + tw <= clip.size(-1)
    return crop(clip, i, j, th, tw)
def normalize(clip, mean, std, inplace=False):
    """
    Normalize a clip per channel: ``(clip - mean) / std``.

    ``mean`` and ``std`` are indexed as ``[:, None, None, None]`` before the
    arithmetic, so they broadcast along dimension 0 of ``clip``. For the
    statistics to be applied per channel the clip therefore has to be
    channel-first, i.e. (C, T, H, W) -- not (T, C, H, W).

    Args:
        clip (torch.tensor): Video clip to be normalized. Size is (C, T, H, W)
        mean (tuple): per-channel mean. Size is (C)
        std (tuple): per-channel standard deviation. Size is (C)
        inplace (bool): when False (default) a clone is normalized and the
            input is left untouched; when True ``clip`` itself is modified.

    Returns:
        normalized clip (torch.tensor): Size is (C, T, H, W)

    Raises:
        TypeError / ValueError: propagated from ``_is_tensor_video_clip`` when
            ``clip`` is not a 4D tensor.
    """
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    if not inplace:
        clip = clip.clone()
    # Build the statistics with the clip's dtype/device so the in-place ops below are valid.
    mean = torch.as_tensor(mean, dtype=clip.dtype, device=clip.device)
    std = torch.as_tensor(std, dtype=clip.dtype, device=clip.device)
    clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None])
    return clip
class CenterCropResizeVideo:
    """
    First use the short side for cropping length,
    center crop video, then resize to the specified size
    """

    def __init__(
        self,
        size,
        interpolation_mode="bilinear",
    ):
        """
        Args:
            size: target ``(height, width)`` tuple, or a single value used for
                both sides.
            interpolation_mode (str): mode forwarded to ``resize``.

        Raises:
            ValueError: if ``size`` is a tuple whose length is not 2.
        """
        if isinstance(size, tuple):
            if len(size) != 2:
                raise ValueError(f"size should be tuple (height, width), instead got {size}")
            self.size = size
        else:
            self.size = (size, size)

        self.interpolation_mode = interpolation_mode

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
        Returns:
            torch.tensor: center cropped then resized video clip.
                size is (T, C, size[0], size[1])
        """
        clip_center_crop = center_crop_using_short_edge(clip)
        clip_center_crop_resize = resize(
            clip_center_crop, target_size=self.size, interpolation_mode=self.interpolation_mode
        )
        return clip_center_crop_resize

    def __repr__(self) -> str:
        # The closing parenthesis was missing from the original format string.
        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode})"
class KineticsRandomCropResizeVideo:
    """
    Slide along the long edge, with the short edge as crop size. And resize to the desired size.
    """

    def __init__(
        self,
        size,
        interpolation_mode="bilinear",
    ):
        """
        Args:
            size: target ``(height, width)`` tuple, or a single value used for
                both sides.
            interpolation_mode (str): mode forwarded to ``resize``.

        Raises:
            ValueError: if ``size`` is a tuple whose length is not 2.
        """
        if isinstance(size, tuple):
            if len(size) != 2:
                raise ValueError(f"size should be tuple (height, width), instead got {size}")
            self.size = size
        else:
            self.size = (size, size)

        self.interpolation_mode = interpolation_mode

    def __call__(self, clip):
        """
        Randomly square-crop ``clip`` with ``random_shift_crop`` and resize the
        crop to ``self.size``.
        """
        clip_random_crop = random_shift_crop(clip)
        clip_resize = resize(clip_random_crop, self.size, self.interpolation_mode)
        return clip_resize

    def __repr__(self) -> str:
        # Added so this transform prints like the other transforms in this module.
        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode})"
class ToTensorVideo:
    """
    Convert a uint8 video clip to float and divide its values by 255.0.
    The dimension order of the clip is left unchanged (no permute is applied).
    """

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)
        Return:
            clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
        """
        return to_tensor(clip)

    def __repr__(self) -> str:
        return self.__class__.__name__
class TemporalRandomCrop(object):
    """Temporally crop the given frame indices at a random location.

    Args:
        size (int): Desired length of frames will be seen in the model.
    """

    def __init__(self, size):
        self.size = size

    def __call__(self, total_frames):
        """Pick a random window ``[begin_index, end_index)`` of frames.

        Args:
            total_frames (int): number of frames available.

        Returns:
            tuple(int, int): ``(begin_index, end_index)``. The window holds
            ``size`` frames, or all ``total_frames`` frames when fewer than
            ``size`` are available.
        """
        # random.randint includes both bounds, so the last valid start is
        # total_frames - size; subtracting one more would mean the final
        # frame could never be part of a sampled window.
        last_start = max(0, total_frames - self.size)
        begin_index = random.randint(0, last_start)
        end_index = min(begin_index + self.size, total_frames)
        return begin_index, end_index
def cached_attention_forward(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_bias: Optional[Union[torch.Tensor, "BlockDiagonalMask"]] = None,
    p: float = 0.0,
    scale: Optional[float] = None,
) -> "tuple[torch.Tensor, torch.Tensor]":
    """
    Explicit (non-fused) scaled dot-product attention that also returns the
    attention map, so the map can be cached for token scoring.

    Dims 1 and 2 of ``query``/``key``/``value`` are swapped before the matmul
    and swapped back on the output; the returned map is averaged over dim 1 of
    the swapped layout.
    NOTE(review): this presumably expects the xformers layout
    (batch, tokens, heads, head_dim) -- confirm against the caller.

    Args:
        query, key, value: attention inputs with a shared layout.
        attn_bias: optional additive bias. A plain tensor is added as is; any
            other object must provide ``materialize(shape=, dtype=, device=)``
            (e.g. an xformers mask) and is materialized to the logits' shape.
        p: dropout probability applied to the attention weights.
        scale: factor applied to ``query``; defaults to ``1 / sqrt(head_dim)``
            when None.

    Returns:
        tuple: ``(output, attn_map)`` where ``output`` has the layout of
        ``query`` and ``attn_map`` is the softmax map averaged over dim 1 of
        the swapped layout.
    """
    # Honour a caller-supplied scale; only fall back to the default when absent.
    if scale is None:
        scale = 1.0 / query.shape[-1] ** 0.5
    query = query * scale

    query = query.transpose(1, 2)
    key = key.transpose(1, 2)
    value = value.transpose(1, 2)

    attn = torch.matmul(query, key.transpose(-2, -1))
    if attn_bias is not None:
        if not torch.is_tensor(attn_bias):
            # Mask objects are turned into a dense bias matching the logits.
            attn_bias = attn_bias.materialize(shape=attn.shape, dtype=attn.dtype, device=attn.device)
        attn = attn + attn_bias

    attn_map = attn.softmax(-1)
    # NOTE: F.dropout defaults to training=True, so p > 0 drops weights even at
    # inference time; the default p=0.0 makes this a no-op.
    attn = F.dropout(attn_map, p)
    out = torch.matmul(attn, value)
    return out.transpose(1, 2).contiguous(), attn_map.mean(dim=1)
indices = score.argsort(dim=-1, descending=True) topk = int(fresh_ratio * score.shape[1]) fresh_indices = indices[:, :topk] stale_indices = indices[:, topk:] # (B, fresh_ratio *N) # Updating the Cache Frequency Score s3 counter mentioned in the paper # stale tokens index + 1 in each ***module***, fresh tokens index = 0 cache_dic['cache_index'][current['flag']][layer][module] += 1 cache_dic['cache_index'][current['flag']][layer][module].scatter_(dim=1, index=fresh_indices, src = torch.zeros_like(fresh_indices, dtype=torch.int, device=fresh_indices.device)) cache_dic['cache_index']['layer_index'][module] += 1 cache_dic['cache_index']['layer_index'][module].scatter_(dim=1, index=fresh_indices, src = torch.zeros_like(fresh_indices, dtype=torch.int, device=fresh_indices.device)) # select the fresh tokens out fresh_indices_expand = fresh_indices.unsqueeze(-1).expand(-1, -1, tokens.shape[-1]) if module in ['mlp', 'attn', 'cross-attn']: fresh_tokens = torch.gather(input = tokens, dim = 1, index = fresh_indices_expand) return fresh_indices, fresh_tokens else: raise ValueError("Unrecognized module?", module) import torch from einops import rearrange def local_selection_with_space_time_bonus(cache_dic, score, bonus_ratio, grid_size=2, time_mean = False): # Get the shape of the tensor from cache_dic B, T, H, W = cache_dic['dynamic_size'] # Reshape the score to [B, T, H, W] score = rearrange(score, "B (T H W) -> B T H W", T=T, H=H, W=W) # Calculate the padding size to make H and W divisible by grid_size pad_h = (grid_size - H % grid_size) % grid_size # Number of zeros to pad in H dimension pad_w = (grid_size - W % grid_size) % grid_size # Number of zeros to pad in W dimension # Pad the H and W dimensions with zeros if pad_h > 0 or pad_w > 0: score = torch.nn.functional.pad(score, (0, pad_w, 0, pad_h)) # (pad width left/right, pad height top/bottom) # Update H and W after padding H_padded, W_padded = score.shape[2], score.shape[3] # Step 1: Normalize along the H*W dimension so 
that information from different time steps has equal weight score = score.view(B, T, -1) # Merge H and W into one dimension [B, T, H*W] score = torch.nn.functional.softmax(score, dim=-1) # Normalize along H*W dimension score = score.view(B, T, H_padded, W_padded) # Restore to [B, T, H_padded, W_padded] shape # Step 2: Perform block-wise operation on each spatial slice (each T time step) block_size = grid_size * grid_size assert (H_padded * W_padded) % block_size == 0, f"H_padded * W_padded must be divisible by block size, shape: {B},{T},{H_padded},{W_padded}; block:{grid_size}*{grid_size};" # Reshape the score into block-wise grouped shape score_reshaped = score.view(B, T, H_padded // grid_size, grid_size, W_padded // grid_size, grid_size) score_reshaped = score_reshaped.permute(0, 1, 2, 4, 3, 5).contiguous() # [B, T, H//grid_size, W//grid_size, grid_size, grid_size] score_reshaped = score_reshaped.view(B, T, -1, block_size) # [B, T, num_blocks, block_size] # Step 3: Find the maximum score in each block max_scores, max_indices = score_reshaped.max(dim=-1, keepdim=True) # [B, T, num_blocks, 1] # Step 4: Create a mask to identify the token with the maximum score mask = torch.zeros_like(score_reshaped) mask.scatter_(-1, max_indices, 1) # Set the mask to 1 at the index of the maximum score # Step 5: Apply the bonus only to the token with the maximum score score_reshaped = score_reshaped + (mask * max_scores * bonus_ratio) # Apply bonus only to the maximum score # Step 6: Restore the score to its original shape score_modified = score_reshaped.view(B, T, H_padded // grid_size, W_padded // grid_size, grid_size, grid_size) score_modified = score_modified.permute(0, 1, 2, 4, 3, 5).contiguous() score_modified = score_modified.view(B, T, H_padded, W_padded) # Step 7: Remove the padded zeros if pad_h > 0 or pad_w > 0: score_modified = score_modified[:, :, :H, :W] # Remove the padded zeros if time_mean: score_modified = score_modified.mean(dim = 1) score_modified = 
def cache_init(model_kwargs, num_steps, num_layers=28):
    '''
    Initialize for cache.

    Builds the nested bookkeeping dictionaries used by the cache functions.
    Every per-layer store is keyed first by a branch flag (-1 or 0) and then
    by layer index ``0 .. num_layers - 1``, each starting as an empty dict.

    model_kwargs: dict, must provide 'cache_type', 'ratio_scheduler',
        'fresh_ratio', 'fresh_threshold', 'force_fresh' and 'soft_fresh_weight'.
    num_steps: int, number of sampling steps, stored in ``current['num_steps']``.
    num_layers: int, number of layers to allocate entries for (default 28,
        the value that used to be hard-coded).

    Returns ``(cache_dic, current)``.
    Raises KeyError if ``model_kwargs`` misses one of the keys above.
    '''
    flags = (-1, 0)

    def per_layer_store():
        # A fresh {flag: {layer: {}}} tree; never shared between stores.
        return {flag: {layer: {} for layer in range(num_layers)} for flag in flags}

    cache_index = per_layer_store()
    cache_index['layer_index'] = {}

    cache_dic = {
        'attn_map': per_layer_store(),
        'cross_attn_map': per_layer_store(),
        'cache_type': model_kwargs['cache_type'],
        'cache_index': cache_index,
        'cache': per_layer_store(),
        # Previously the per-flag dicts were reset on every loop iteration, so
        # no per-layer entries existed; they are now created like the others.
        'indices_cache': per_layer_store(),
        'fresh_ratio_schedule': model_kwargs['ratio_scheduler'],
        'fresh_ratio': model_kwargs['fresh_ratio'],
        'fresh_threshold': model_kwargs['fresh_threshold'],
        'force_fresh': model_kwargs['force_fresh'],
        'soft_fresh_weight': model_kwargs['soft_fresh_weight'],
    }

    current = {'num_steps': num_steps}
    return cache_dic, current
def fresh_ratio_scheduler(cache_dic, current):
    '''
    Return the fresh ratio for the current step.

    cache_dic: dict, read for 'fresh_ratio' (base ratio), 'fresh_ratio_schedule'
        (schedule name) and 'fresh_threshold'.
    current: dict, read for 'step' and 'num_steps'; 'layer', 'module' and
        'flag' are read only by the schedules that use them.

    Raises ValueError for an unknown schedule name.
    '''
    base_ratio = cache_dic['fresh_ratio']
    schedule = cache_dic['fresh_ratio_schedule']
    step = current['step']
    num_steps = current['num_steps']
    threshold = cache_dic['fresh_threshold']
    weight = 0.9

    if schedule == 'constant':
        return base_ratio

    if schedule == 'linear':
        return base_ratio * (1 + weight - 2 * weight * step / num_steps)

    if schedule == 'exp':
        return base_ratio * (weight ** (step / num_steps))

    if schedule == 'linear-mode':
        # Position of the step inside its force-fresh cycle, centred on zero.
        mode = (step % threshold)/threshold - 0.5
        mode_weight = 0.1
        return base_ratio * (1 + weight - 2 * weight * step / num_steps + mode_weight * mode)

    if schedule == 'layerwise':
        return base_ratio * (1 + weight - 2 * weight * current['layer'] / 27)

    if schedule == 'ToCa':
        # Tuning these parameters on video is expensive; these simple settings
        # are reported to already give good quality and speed.
        step_weight = 0.0
        step_factor = 1 + step_weight - 2 * step_weight * step / num_steps

        layer_weight = 0.0
        layer_factor = 1 + layer_weight - 2 * layer_weight * current['layer'] / 27

        module_weight = 1.5
        module_time_weight = 0.33
        if current['module'] == 'cross-attn':
            module_factor = (1 - (1-module_time_weight) * module_weight)
        else:
            module_factor = (1 + module_time_weight * module_weight)

        # Per-branch factor, selected by the flag (temporal vs spatial branch).
        type_weight = 0.0
        if current['flag'] == -1:
            type_factor = 1 + type_weight
        else:
            type_factor = 1 - type_weight

        return base_ratio * layer_factor * step_factor * module_factor * type_factor

    raise ValueError("unrecognized fresh ratio schedule", schedule)
def score_evaluate(cache_dic, tokens, current) -> torch.Tensor:
    '''
    Return the score tensor (B, N) for the given tokens.

    cache_dic: dict, read for 'cache_type' (scoring method) and 'force_fresh';
        with force_fresh == 'global' also 'cache_index', 'fresh_threshold'
        and 'soft_fresh_weight'.
    tokens: torch.Tensor, only its first two dims and its device are used
        here directly; some scoring methods receive the tensor itself.
    current: dict, read for 'flag', 'layer' and 'module'.

    Raises ValueError for an unknown cache type (previously this surfaced as
    an UnboundLocalError on ``score``).
    '''
    # NOTE: the abandoned "local" force-fresh strategy (forcing tokens whose
    # staleness counter exceeds a threshold to the top) is not implemented here.
    cache_type = cache_dic['cache_type']

    if cache_type == 'random':
        # One random score per token for the first half of the batch, repeated
        # for the second half; assumes an even batch size.
        score = torch.rand(int(tokens.shape[0]*0.5), tokens.shape[1], device=tokens.device)
        score = torch.cat([score, score], dim=0).to(tokens.device)

    elif cache_type == 'straight':
        score = torch.ones(tokens.shape[0], tokens.shape[1]).to(tokens.device)

    elif cache_type == 'attention':
        score = attn_score(cache_dic, current)

    elif cache_type == 'similarity':
        score = similarity_score(cache_dic, current, tokens)

    elif cache_type == 'norm':
        score = norm_score(cache_dic, current, tokens)

    elif cache_type == 'compress':
        score1 = torch.rand(int(tokens.shape[0]*0.5), tokens.shape[1])
        score1 = torch.cat([score1, score1], dim=0).to(tokens.device)
        score2 = cache_dic['attn_map'][current['flag']][current['layer']].sum(dim=1)
        # Normalize each row of the attention-based score by its maximum.
        score2 = score2 / score2.max(dim=1, keepdim=True)[0]
        score = 0.5 * score1 + 0.5 * score2

    else:
        raise ValueError("unrecognized cache type", cache_type)

    if cache_dic['force_fresh'] == 'global':
        # Staleness bonus: the per-token counter kept in cache_index, scaled
        # by the fresh threshold and weighted by soft_fresh_weight.
        soft_step_score = cache_dic['cache_index'][current['flag']][current['layer']][current['module']].float() / (cache_dic['fresh_threshold'])
        score = score + cache_dic['soft_fresh_weight'] * soft_step_score

    return score.to(tokens.device)
def similarity_score(cache_dic, current, tokens):
    '''
    Score tokens by how much they differ from their cached counterparts.

    The cached tensor is looked up at
    ``cache_dic['cache'][flag][layer][module]``. The score is one minus the
    cosine similarity along the last dim, L2-normalized along the resulting
    last dim.
    '''
    cached_tokens = cache_dic['cache'][current['flag']][current['layer']][current['module']]
    dissimilarity = 1 - F.cosine_similarity(tokens, cached_tokens, dim=-1)
    return F.normalize(dissimilarity, dim=-1, p=2)
def update_cache(fresh_indices, fresh_tokens, cache_dic, current, fresh_attn_map=None):
    '''
    Update the cache with the fresh tokens.

    Rows selected by ``fresh_indices`` (along dim 1) are overwritten in place:
      - module 'attn':       cache_dic['attn_map'][flag][layer]       <- fresh_attn_map
      - module 'cross-attn': cache_dic['cross_attn_map'][flag][layer] <- fresh_attn_map
      - module 'mlp':        cache_dic['cache'][flag][layer]['mlp']   <- fresh_tokens
    Any other module name leaves the cache untouched.

    fresh_indices: integer tensor of row indices, one row of indices per batch entry.
    fresh_tokens: tensor scattered for 'mlp'; unused for the attention modules.
    cache_dic: dict holding the stores listed above.
    current: dict, read for 'layer', 'module' and (for handled modules) 'flag'.
    fresh_attn_map: tensor scattered for 'attn' / 'cross-attn'.

    Raises ValueError if an attention module is updated without ``fresh_attn_map``.
    '''
    layer = current['layer']
    module = current['module']

    if module in ('attn', 'cross-attn'):
        if fresh_attn_map is None:
            raise ValueError("fresh_attn_map is required to update module", module)
        store_key = 'attn_map' if module == 'attn' else 'cross_attn_map'
        target = cache_dic[store_key][current['flag']][layer]
        source = fresh_attn_map
    elif module == 'mlp':
        target = cache_dic['cache'][current['flag']][layer][module]
        source = fresh_tokens
    else:
        # Unhandled modules are deliberately ignored, as before.
        return

    # Broadcast each row index across the feature dim so whole rows are replaced.
    index = fresh_indices.unsqueeze(-1).expand(-1, -1, source.shape[-1])
    target.scatter_(dim=1, index=index, src=source)
class DiTBlock(nn.Module):
    """
    A DiT block with adaptive layer norm zero (adaLN-Zero) conditioning.

    The block holds an attention sub-layer and an MLP sub-layer, each preceded
    by a non-affine layer norm. A conditioning vector is mapped to six
    modulation tensors (shift / scale / gate for each sub-layer).

    Args:
        hidden_size: feature width used by the norms, attention and MLP.
        num_heads: number of heads passed to ``Attention``.
        mlp_ratio: MLP hidden width as a multiple of ``hidden_size``.
        enable_flash_attn: forwarded to ``Attention``.
        enable_layernorm_kernel: forwarded to ``get_layernorm`` as ``use_kernel``.
    """

    def __init__(
        self,
        hidden_size,
        num_heads,
        mlp_ratio=4.0,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.enable_flash_attn = enable_flash_attn
        mlp_hidden_dim = int(hidden_size * mlp_ratio)

        # Norms are created with affine=False: shift and scale come from the
        # conditioning vector via ``modulate`` in ``forward``.
        self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        self.attn = Attention(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            enable_flash_attn=enable_flash_attn,
        )
        self.norm2 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
        # Produces 6 * hidden_size features, split into six chunks in ``forward``.
        self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True))

    def forward(self, x, c):
        """
        Apply the gated attention and MLP residual updates to ``x``.

        Args:
            x: input tokens.
            c: conditioning vector fed to ``adaLN_modulation``.
            NOTE(review): the ``chunk(6, dim=1)`` and ``unsqueeze(1)`` calls
            suggest ``c`` is (B, hidden_size) and ``x`` is (B, N, hidden_size)
            -- confirm against the caller.

        Returns:
            The updated tokens.
        """
        # Six modulation tensors: shift / scale / gate for attention, then for the MLP.
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=1)
        # Residual updates; each gate gets an extra dim at position 1 before multiplying.
        x = x + gate_msa.unsqueeze(1) * self.attn(modulate(self.norm1, x, shift_msa, scale_msa))
        x = x + gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2, x, shift_mlp, scale_mlp))
        return x
def unpatchify(self, x):
    """
    Fold a sequence of patch tokens back into a 5-D volume.

    ``x`` is reshaped to ``(n, t, h, w, pt, ph, pw, c)`` where ``t, h, w`` are
    the patch-grid sizes (``input_size // patch_size`` per axis), ``pt, ph, pw``
    the patch sizes and ``c`` is ``self.out_channels``. The result has shape
    ``(n, c, t * pt, h * ph, w * pw)``.
    """
    channels = self.out_channels
    grid_t, grid_h, grid_w = (self.input_size[axis] // self.patch_size[axis] for axis in range(3))
    patch_t, patch_h, patch_w = self.patch_size

    batch = x.shape[0]
    x = x.reshape(shape=(batch, grid_t, grid_h, grid_w, patch_t, patch_h, patch_w, channels))
    # "n t h w r p q c -> n c t r h p w q": interleave each grid axis with
    # its matching in-patch axis so the pairs can be merged below.
    x = x.permute(0, 7, 1, 4, 2, 5, 3, 6)
    return x.reshape(shape=(batch, channels, grid_t * patch_t, grid_h * patch_h, grid_w * patch_w))
def initialize_weights(self):
    """
    Initialise the parameters of the model in place.

    Steps, in order:
      1. Xavier-uniform weights / zero biases for every trainable ``nn.Linear``.
      2. Xavier-uniform init of the patch-embedding convolution, treating its
         kernel as a 2-D matrix (like ``nn.Linear``), with zero bias.
      3. Normal(std=0.02) init of the two timestep-embedding MLP weights.
      4. Zero init of every adaLN modulation output layer and of the final
         linear projection.
      5. Normal(std=0.02) init of the caption projection MLP weights when a
         text encoder is used.
    """

    # Initialize transformer layers:
    def _basic_init(module):
        if isinstance(module, nn.Linear):
            # `requires_grad` is the boolean flag. The previous code tested
            # `requires_grad_`, a bound method that is always truthy, so
            # frozen layers were re-initialised as well.
            if module.weight.requires_grad:
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

    self.apply(_basic_init)

    # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
    w = self.x_embedder.proj.weight.data
    nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
    nn.init.constant_(self.x_embedder.proj.bias, 0)

    # Initialize timestep embedding MLP:
    nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
    nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)

    # Zero-out adaLN modulation layers in DiT blocks:
    for block in self.blocks:
        nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
        nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

    # Zero-out output layers:
    nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
    nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
    nn.init.constant_(self.final_layer.linear.weight, 0)
    nn.init.constant_(self.final_layer.linear.bias, 0)

    # Normal-initialise (not zero) the text embedding projection layers:
    if self.use_text_encoder:
        nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02)
        nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02)
@MODELS.register_module()
class Latte(DiT):
    """DiT subclass whose blocks alternate between a spatial and a temporal token layout."""

    def forward(self, x, t, y):
        """
        Forward pass of Latte.
        x: (B, C, T, H, W) tensor of inputs
        t: (B,) tensor of diffusion timesteps
        y: conditioning input handed to ``self.y_embedder``
        """
        num_t, num_s = self.num_temporal, self.num_spatial

        # origin inputs should be float32, cast to specified dtype
        x = self.x_embedder(x.to(self.dtype))  # (B, N, D)

        # add the spatial positional embedding on the (t, s) factored view
        x = rearrange(x, "b (t s) d -> b t s d", t=num_t, s=num_s)
        x = x + self.pos_embed_spatial
        x = rearrange(x, "b t s d -> b (t s) d")

        t = self.t_embedder(t, dtype=x.dtype)  # (N, D)
        y = self.y_embedder(y, self.training)  # (N, D)
        if self.use_text_encoder:
            y = y.squeeze(1).squeeze(1)
        condition = t + y
        # the condition is repeated so its leading dim matches each folded layout
        cond_spatial = repeat(condition, "b d -> (b t) d", t=num_t)
        cond_temporal = repeat(condition, "b d -> (b s) d", s=num_s)

        for idx, block in enumerate(self.blocks):
            spatial_step = idx % 2 == 0
            if spatial_step:
                x = rearrange(x, "b (t s) d -> (b t) s d", t=num_t, s=num_s)
                cond = cond_spatial
            else:
                x = rearrange(x, "b (t s) d -> (b s) t d", t=num_t, s=num_s)
                cond = cond_temporal
                # temporal positions are injected once, before the first temporal block
                if idx == 1:
                    x = x + self.pos_embed_temporal

            x = auto_grad_checkpoint(block, x, cond)  # (B, N, D)

            merge_pattern = "(b t) s d -> b (t s) d" if spatial_step else "(b s) t d -> b (t s) d"
            x = rearrange(x, merge_pattern, t=num_t, s=num_s)

        # final process
        x = self.final_layer(x, condition)  # (B, N, num_patches * out_channels)
        x = self.unpatchify(x)  # (B, out_channels, T, H, W)

        # cast to float32 for better accuracy
        return x.to(torch.float32)
class LlamaRMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learned scale."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        LlamaRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # Statistics are computed in float32, then the normalised values are
        # cast back to the incoming dtype before the learned scale is applied.
        incoming_dtype = hidden_states.dtype
        as_fp32 = hidden_states.to(torch.float32)
        mean_square = as_fp32.pow(2).mean(-1, keepdim=True)
        normalised = as_fp32 * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * normalised.to(incoming_dtype)
class PatchEmbed3D(nn.Module):
    """Video to Patch Embedding.

    Args:
        patch_size (tuple): Patch size along (depth, height, width). Default: (2,4,4).
        in_chans (int): Number of input video channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Module, optional): Normalization layer. Default: None
        flatten (bool): If True, return tokens as (B, N, C) instead of the
            5-D convolution output. Default: True.
    """

    def __init__(
        self,
        patch_size=(2, 4, 4),
        in_chans=3,
        embed_dim=96,
        norm_layer=None,
        flatten=True,
    ):
        super().__init__()
        self.patch_size = patch_size
        self.flatten = flatten

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.norm = norm_layer(embed_dim) if norm_layer is not None else None

    def forward(self, x):
        """Zero-pad each axis up to a multiple of its patch size, then project to patches."""
        _, _, depth, height, width = x.size()
        patch_d, patch_h, patch_w = self.patch_size[0], self.patch_size[1], self.patch_size[2]

        # `-n % p` is the amount needed to reach the next multiple of p (0 if aligned).
        pad_w = -width % patch_w
        pad_h = -height % patch_h
        pad_d = -depth % patch_d
        if pad_w or pad_h or pad_d:
            # F.pad lists the last dimension first: (W_lo, W_hi, H_lo, H_hi, D_lo, D_hi)
            x = F.pad(x, (0, pad_w, 0, pad_h, 0, pad_d))

        x = self.proj(x)  # (B C T H W)
        if self.norm is not None:
            out_d, out_h, out_w = x.size(2), x.size(3), x.size(4)
            tokens = self.norm(x.flatten(2).transpose(1, 2))
            x = tokens.transpose(1, 2).view(-1, self.embed_dim, out_d, out_h, out_w)
        if self.flatten:
            x = x.flatten(2).transpose(1, 2)  # BCTHW -> BNC
        return x
def forward(self, x: torch.Tensor) -> torch.Tensor:
    """
    Multi-head self-attention over ``x`` of shape (B, N, C).

    Uses ``flash_attn_func`` when flash attention is enabled and N > B;
    otherwise computes softmax attention explicitly, with the softmax done
    in float32. Returns a tensor of shape (B, N, C).
    """
    batch, seq_len, channels = x.shape
    # flash attn is not memory efficient for small sequences, this is empirical
    use_flash = self.enable_flash_attn and (seq_len > batch)

    packed = self.qkv(x).view((batch, seq_len, 3, self.num_heads, self.head_dim))
    q, k, v = packed.permute(2, 0, 3, 1, 4).unbind(0)  # each (B, #heads, N, #dim)

    if self.qk_norm_legacy:
        # WARNING: this may be a bug (rotary embedding is applied before q/k norm)
        if self.rope:
            q = self.rotary_emb(q)
            k = self.rotary_emb(k)
        q, k = self.q_norm(q), self.k_norm(k)
    else:
        q, k = self.q_norm(q), self.k_norm(k)
        if self.rope:
            q = self.rotary_emb(q)
            k = self.rotary_emb(k)

    if use_flash:
        from flash_attn import flash_attn_func

        # (B, #heads, N, #dim) -> (B, N, #heads, #dim)
        x = flash_attn_func(
            q.permute(0, 2, 1, 3),
            k.permute(0, 2, 1, 3),
            v.permute(0, 2, 1, 3),
            dropout_p=self.attn_drop.p if self.training else 0.0,
            softmax_scale=self.scale,
            causal=self.is_causal,
        )
    else:
        working_dtype = q.dtype
        # scores are promoted to float32 for the softmax
        scores = torch.matmul(q * self.scale, k.transpose(-2, -1)).to(torch.float32)
        if self.is_causal:
            visible = torch.tril(torch.ones_like(scores), diagonal=0).bool()
            scores += torch.where(visible, 0, float('-inf'))
        weights = scores.softmax(dim=-1).to(working_dtype)  # cast back to original dtype
        weights = self.attn_drop(weights)
        # (B, #heads, N, #dim) -> (B, N, #heads, #dim)
        x = torch.matmul(weights, v).transpose(1, 2)

    x = x.reshape((batch, seq_len, channels))
    return self.proj_drop(self.proj(x))
def downsample_2d(self, tensor, H, W, scale_factor, sampling=None):
    """
    Reduce the number of tokens in ``tensor`` (B, N, C) by ``scale_factor``.

    Args:
        tensor: token tensor of shape (B, N, C).
        H, W: spatial grid the N tokens are reshaped to (used by every mode
            except ``"uniform_every"``).
        scale_factor: integer stride / reduction ratio.
        sampling: one of ``None``, ``"uniform_every"``, ``"ave"``,
            ``"uniform"`` or ``"conv"``.

    Returns:
        ``(tokens, new_N)`` where ``new_N`` is the token count of ``tokens``.
        When ``sampling`` is None or ``scale_factor == 1`` the input is
        returned untouched together with its own token count. (Previously
        that path returned the bare tensor, which broke callers that unpack
        two values.)

    Raises:
        ValueError: if ``sampling`` is not a recognised mode.
    """
    if sampling is None or scale_factor == 1:
        return tensor, tensor.shape[1]

    B, N, C = tensor.shape

    if sampling == "uniform_every":
        kept = tensor[:, ::scale_factor]
        # Report the real number of kept tokens; N // scale_factor is one
        # short whenever N is not a multiple of scale_factor.
        return kept, int(kept.shape[1])

    tensor = tensor.reshape(B, H, W, C).permute(0, 3, 1, 2)
    new_H, new_W = int(H / scale_factor), int(W / scale_factor)
    new_N = new_H * new_W

    if sampling == "ave":
        # NOTE(review): despite the mode name this uses nearest-neighbour
        # interpolation, not averaging — confirm whether that is intended.
        tensor = F.interpolate(tensor, scale_factor=1 / scale_factor, mode="nearest").permute(0, 2, 3, 1)
    elif sampling == "uniform":
        tensor = tensor[:, :, ::scale_factor, ::scale_factor].permute(0, 2, 3, 1)
    elif sampling == "conv":
        tensor = self.sr(tensor).reshape(B, C, -1).permute(0, 2, 1)
        tensor = self.norm(tensor)
    else:
        raise ValueError(f"Unknown sampling mode: {sampling!r}")

    return tensor.reshape(B, new_N, C).contiguous(), new_N
class SeqParallelAttention(Attention):
    """
    Self-attention for sequence-parallel execution.

    Each rank receives a slice of the sequence. An ``all_to_all`` over the
    sequence-parallel group exchanges data before and after the attention
    computation; the shape comments below are the original author's notes on
    that exchange. Rotary embeddings are rejected at construction time.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        norm_layer: nn.Module = LlamaRMSNorm,
        enable_flash_attn: bool = False,
        rope=None,
    ) -> None:
        assert rope is None, "Rope is not supported in SeqParallelAttention"
        super().__init__(
            dim=dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_norm=qk_norm,
            attn_drop=attn_drop,
            proj_drop=proj_drop,
            norm_layer=norm_layer,
            enable_flash_attn=enable_flash_attn,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Attend over the gathered sequence and return a tensor reshaped to ``x.shape``."""
        B, N, C = x.shape  # for sequence parallel here, the N is a local sequence length
        qkv = self.qkv(x)
        qkv_shape = (B, N, 3, self.num_heads, self.head_dim)

        qkv = qkv.view(qkv_shape)

        sp_group = get_sequence_parallel_group()

        # apply all_to_all to gather sequence and split attention heads
        # [B, SUB_N, 3, NUM_HEAD, HEAD_DIM] -> [B, N, 3, NUM_HEAD_PER_DEVICE, HEAD_DIM]
        qkv = all_to_all(qkv, sp_group, scatter_dim=3, gather_dim=1)

        # The flash path keeps the sequence axis before the head axis; the
        # explicit path puts heads first so matmul batches over heads.
        if self.enable_flash_attn:
            qkv_permute_shape = (
                2,
                0,
                1,
                3,
                4,
            )  # [3, B, N, NUM_HEAD_PER_DEVICE, HEAD_DIM]
        else:
            qkv_permute_shape = (
                2,
                0,
                3,
                1,
                4,
            )  # [3, B, NUM_HEAD_PER_DEVICE, N, HEAD_DIM]
        qkv = qkv.permute(qkv_permute_shape)

        # ERROR: Should qk_norm first
        # NOTE(review): the marker above is from the original author; the
        # intended ordering relative to the all_to_all is unclear — confirm.
        q, k, v = qkv.unbind(0)
        q, k = self.q_norm(q), self.k_norm(k)
        if self.enable_flash_attn:
            from flash_attn import flash_attn_func

            x = flash_attn_func(
                q,
                k,
                v,
                dropout_p=self.attn_drop.p if self.training else 0.0,
                softmax_scale=self.scale,
            )
        else:
            dtype = q.dtype
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)  # translate attn to float32
            attn = attn.to(torch.float32)
            attn = attn.softmax(dim=-1)
            attn = attn.to(dtype)  # cast back attn to original dtype
            attn = self.attn_drop(attn)
            x = attn @ v

        # bring the explicit-path output to the same axis order as the flash path
        if not self.enable_flash_attn:
            x = x.transpose(1, 2)

        # apply all to all to gather back attention heads and split sequence
        # [B, N, NUM_HEAD_PER_DEVICE, HEAD_DIM]  -> [B, SUB_N, NUM_HEAD, HEAD_DIM]
        x = all_to_all(x, sp_group, scatter_dim=1, gather_dim=2)

        # reshape outputs back to [B, N, C]
        x_output_shape = (B, N, C)
        x = x.reshape(x_output_shape)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
class SeqParallelMultiHeadCrossAttention(MultiHeadCrossAttention):
    """
    Cross-attention for sequence-parallel execution.

    Queries come from the local sequence slice ``x``; keys and values come
    from ``cond``. Attention itself is computed with xformers'
    ``memory_efficient_attention``.

    NOTE(review): the parent ``MultiHeadCrossAttention.forward`` in this file
    returns ``(x, cross_attn_map)``, whereas this override returns only ``x``
    — confirm that callers of the sequence-parallel variant expect that.
    """

    def __init__(
        self,
        d_model,
        num_heads,
        attn_drop=0.0,
        proj_drop=0.0,
    ):
        # Pure pass-through to the parent constructor.
        super().__init__(
            d_model=d_model,
            num_heads=num_heads,
            attn_drop=attn_drop,
            proj_drop=proj_drop,
        )

    def forward(self, x, cond, mask=None):
        # query/value: img tokens; key: condition; mask: if padding tokens
        sp_group = get_sequence_parallel_group()
        sp_size = dist.get_world_size(sp_group)
        B, SUB_N, C = x.shape  # [B, TS/p, C]
        # full sequence length across the sequence-parallel group
        N = SUB_N * sp_size

        # shape:
        # q, k, v: [B, SUB_N, NUM_HEADS, HEAD_DIM]
        q = self.q_linear(x).view(B, -1, self.num_heads, self.head_dim)
        kv = self.kv_linear(cond).view(1, -1, 2, self.num_heads, self.head_dim)
        # dim=3 is the head axis of kv; presumably each rank keeps its own
        # share of the heads here — verify against split_forward_gather_backward.
        kv = split_forward_gather_backward(kv, get_sequence_parallel_group(), dim=3, grad_scale="down")
        k, v = kv.unbind(2)

        # apply all_to_all to gather sequence and split attention heads
        q = all_to_all(q, sp_group, scatter_dim=2, gather_dim=1)

        # flatten batch and sequence into one axis with a leading dim of 1;
        # the block-diagonal mask below carries the per-sample sequence lengths
        q = q.view(1, -1, self.num_heads // sp_size, self.head_dim)
        k = k.view(1, -1, self.num_heads // sp_size, self.head_dim)
        v = v.view(1, -1, self.num_heads // sp_size, self.head_dim)

        # compute attention
        attn_bias = None
        if mask is not None:
            # `mask` is passed as the key/value sequence lengths — TODO confirm it is a list of ints
            attn_bias = xformers.ops.fmha.BlockDiagonalMask.from_seqlens([N] * B, mask)
        # NOTE(review): the dropout probability is passed regardless of self.training — confirm.
        x = xformers.ops.memory_efficient_attention(q, k, v, p=self.attn_drop.p, attn_bias=attn_bias)

        # apply all to all to gather back attention heads and scatter sequence
        x = x.view(B, -1, self.num_heads // sp_size, self.head_dim)
        x = all_to_all(x, sp_group, scatter_dim=1, gather_dim=2)

        # apply output projection
        x = x.view(B, -1, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
class TimestepEmbedder(nn.Module):
    """
    Maps scalar timesteps to vectors: sinusoidal features followed by a two-layer MLP.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        input_proj = nn.Linear(frequency_embedding_size, hidden_size, bias=True)
        output_proj = nn.Linear(hidden_size, hidden_size, bias=True)
        self.mlp = nn.Sequential(input_proj, nn.SiLU(), output_proj)
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000):
        """
        Create sinusoidal timestep embeddings.
        :param t: a 1-D Tensor of N indices, one per batch element.
                  These may be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings (cosines first,
                 then sines, zero-padded by one column when ``dim`` is odd).
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        exponents = -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
        freqs = torch.exp(exponents).to(device=t.device)
        phases = t[:, None].float() * freqs[None]
        pieces = [torch.cos(phases), torch.sin(phases)]
        if dim % 2:
            # start from the cos|sin concatenation so the padding column matches its dtype/device
            joined = torch.cat(pieces, dim=-1)
            return torch.cat([joined, torch.zeros_like(joined[:, :1])], dim=-1)
        return torch.cat(pieces, dim=-1)

    def forward(self, t, dtype):
        """Embed timesteps ``t`` and run the MLP on the features cast to ``dtype``."""
        features = self.timestep_embedding(t, self.frequency_embedding_size)
        if features.dtype != dtype:
            features = features.to(dtype)
        return self.mlp(features)
""" def __init__(self, num_classes, hidden_size, dropout_prob): super().__init__() use_cfg_embedding = dropout_prob > 0 self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size) self.num_classes = num_classes self.dropout_prob = dropout_prob def token_drop(self, labels, force_drop_ids=None): """ Drops labels to enable classifier-free guidance. """ if force_drop_ids is None: drop_ids = torch.rand(labels.shape[0]).cuda() < self.dropout_prob else: drop_ids = force_drop_ids == 1 labels = torch.where(drop_ids, self.num_classes, labels) return labels def forward(self, labels, train, force_drop_ids=None): use_dropout = self.dropout_prob > 0 if (train and use_dropout) or (force_drop_ids is not None): labels = self.token_drop(labels, force_drop_ids) return self.embedding_table(labels) class SizeEmbedder(TimestepEmbedder): """ Embeds scalar timesteps into vector representations. """ def __init__(self, hidden_size, frequency_embedding_size=256): super().__init__(hidden_size=hidden_size, frequency_embedding_size=frequency_embedding_size) self.mlp = nn.Sequential( nn.Linear(frequency_embedding_size, hidden_size, bias=True), nn.SiLU(), nn.Linear(hidden_size, hidden_size, bias=True), ) self.frequency_embedding_size = frequency_embedding_size self.outdim = hidden_size def forward(self, s, bs): if s.ndim == 1: s = s[:, None] assert s.ndim == 2 if s.shape[0] != bs: s = s.repeat(bs // s.shape[0], 1) assert s.shape[0] == bs b, dims = s.shape[0], s.shape[1] s = rearrange(s, "b d -> (b d)") s_freq = self.timestep_embedding(s, self.frequency_embedding_size).to(self.dtype) s_emb = self.mlp(s_freq) s_emb = rearrange(s_emb, "(b d) d2 -> b (d d2)", b=b, d=dims, d2=self.outdim) return s_emb @property def dtype(self): return next(self.parameters()).dtype class CaptionEmbedder(nn.Module): """ Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance. 
""" def __init__( self, in_channels, hidden_size, uncond_prob, act_layer=nn.GELU(approximate="tanh"), token_num=120, ): super().__init__() self.y_proj = Mlp( in_features=in_channels, hidden_features=hidden_size, out_features=hidden_size, act_layer=act_layer, drop=0, ) self.register_buffer( "y_embedding", torch.randn(token_num, in_channels) / in_channels**0.5, ) self.uncond_prob = uncond_prob def token_drop(self, caption, force_drop_ids=None): """ Drops labels to enable classifier-free guidance. """ if force_drop_ids is None: drop_ids = torch.rand(caption.shape[0]).cuda() < self.uncond_prob else: drop_ids = force_drop_ids == 1 caption = torch.where(drop_ids[:, None, None, None], self.y_embedding, caption) return caption def forward(self, caption, train, force_drop_ids=None): if train: assert caption.shape[2:] == self.y_embedding.shape use_dropout = self.uncond_prob > 0 if (train and use_dropout) or (force_drop_ids is not None): caption = self.token_drop(caption, force_drop_ids) caption = self.y_proj(caption) return caption class PositionEmbedding2D(nn.Module): def __init__(self, dim: int) -> None: super().__init__() self.dim = dim assert dim % 4 == 0, "dim must be divisible by 4" half_dim = dim // 2 inv_freq = 1.0 / (10000 ** (torch.arange(0, half_dim, 2).float() / half_dim)) self.register_buffer("inv_freq", inv_freq, persistent=False) def _get_sin_cos_emb(self, t: torch.Tensor): out = torch.einsum("i,d->id", t, self.inv_freq) emb_cos = torch.cos(out) emb_sin = torch.sin(out) return torch.cat((emb_sin, emb_cos), dim=-1) @functools.lru_cache(maxsize=512) def _get_cached_emb( self, device: torch.device, dtype: torch.dtype, h: int, w: int, scale: float = 1.0, base_size: Optional[int] = None, ): grid_h = torch.arange(h, device=device) / scale grid_w = torch.arange(w, device=device) / scale if base_size is not None: grid_h *= base_size / h grid_w *= base_size / w grid_h, grid_w = torch.meshgrid( grid_w, grid_h, indexing="ij", ) # here w goes first grid_h = 
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, scale=1.0, base_size=None):
    """
    Build a 2D sine/cosine positional embedding table.

    grid_size: an int (square grid) or a tuple (height, width)
    scale: coordinates are divided by this value
    base_size: when given, coordinates along each axis are additionally
        multiplied by base_size / <axis length>
    return:
    pos_embed: [H*W, embed_dim], or [extra_tokens + H*W, embed_dim] with
        zero rows in front when cls_token is set and extra_tokens > 0
    """
    size = grid_size if isinstance(grid_size, tuple) else (grid_size, grid_size)
    n_h, n_w = size[0], size[1]

    coords_h = np.arange(n_h, dtype=np.float32) / scale
    coords_w = np.arange(n_w, dtype=np.float32) / scale
    if base_size is not None:
        # In-place so the coordinate arrays keep their dtype.
        coords_h *= base_size / n_h
        coords_w *= base_size / n_w

    # Width is passed to meshgrid first, matching the reference MAE code.
    mesh = np.stack(np.meshgrid(coords_w, coords_h), axis=0).reshape([2, 1, n_w, n_h])
    embedding = get_2d_sincos_pos_embed_from_grid(embed_dim, mesh)

    if not (cls_token and extra_tokens > 0):
        return embedding
    padding = np.zeros([extra_tokens, embed_dim])
    return np.concatenate([padding, embedding], axis=0)
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    Encode a set of positions with sine/cosine features.

    embed_dim: output width per position (must be even)
    pos: numpy array of positions; flattened to (M,)
    out: (M, embed_dim) -- first half sines, second half cosines
    """
    assert embed_dim % 2 == 0
    half_dim = embed_dim // 2

    # Frequencies 1 / 10000^(k / (embed_dim / 2)) for k = 0 .. half_dim - 1.
    exponents = np.arange(half_dim, dtype=np.float64) / (embed_dim / 2.0)
    inv_freq = 1.0 / 10000**exponents  # (D/2,)

    # Outer product of positions and frequencies via broadcasting.
    angles = pos.reshape(-1)[:, None] * inv_freq[None, :]  # (M, D/2)

    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)
class PixArtBlock(nn.Module):
    """
    One PixArt transformer block: self-attention, cross-attention and an MLP.

    The self-attention and MLP branches are modulated (shift / scale / gate)
    by the sum of a learned ``scale_shift_table`` and the timestep signal
    (adaLN-single conditioning).
    """

    def __init__(
        self,
        hidden_size,
        num_heads,
        mlp_ratio=4.0,
        drop_path=0.0,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.enable_flash_attn = enable_flash_attn
        self._enable_sequence_parallelism = enable_sequence_parallelism

        # Choose the attention classes once; both variants are constructed
        # with the same arguments below.
        self.attn_cls, self.mha_cls = (
            (SeqParallelAttention, SeqParallelMultiHeadCrossAttention)
            if enable_sequence_parallelism
            else (Attention, MultiHeadCrossAttention)
        )

        def make_norm():
            return get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)

        # Submodules are created in the same order as before so parameter
        # registration and random initialisation are unchanged.
        self.norm1 = make_norm()
        self.attn = self.attn_cls(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            enable_flash_attn=enable_flash_attn,
        )
        self.cross_attn = self.mha_cls(hidden_size, num_heads)
        self.norm2 = make_norm()
        mlp_width = int(hidden_size * mlp_ratio)
        self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_width, act_layer=approx_gelu, drop=0)
        if drop_path > 0.0:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5)

    def forward(self, x, y, t, mask=None):
        """
        x: (B, N, C) tokens; y: conditioning passed to cross-attention;
        t: timestep modulation reshaped here to (B, 6, -1); mask: forwarded
        unchanged to cross-attention.
        """
        batch, tokens, channels = x.shape
        modulation = self.scale_shift_table[None] + t.reshape(batch, 6, -1)
        shift_attn, scale_attn, gate_attn, shift_ff, scale_ff, gate_ff = modulation.chunk(6, dim=1)

        # self-attention branch
        attn_in = t2i_modulate(self.norm1(x), shift_attn, scale_attn)
        attn_out = self.attn(attn_in).reshape(batch, tokens, channels)
        x = x + self.drop_path(gate_attn * attn_out)

        # cross-attention branch (no modulation, no drop-path)
        x = x + self.cross_attn(x, y, mask)

        # feed-forward branch
        ff_in = t2i_modulate(self.norm2(x), shift_ff, scale_ff)
        ff_out = self.mlp(ff_in)
        x = x + self.drop_path(gate_ff * ff_out)
        return x
""" def __init__( self, input_size=(1, 32, 32), in_channels=4, patch_size=(1, 2, 2), hidden_size=1152, depth=28, num_heads=16, mlp_ratio=4.0, class_dropout_prob=0.1, pred_sigma=True, drop_path: float = 0.0, no_temporal_pos_emb=False, caption_channels=4096, model_max_length=120, dtype=torch.float32, freeze=None, space_scale=1.0, time_scale=1.0, enable_flash_attn=False, enable_layernorm_kernel=False, enable_sequence_parallelism=False, base_size=None, ): super().__init__() assert enable_sequence_parallelism is False, "Sequence parallelism is not supported in this version." self.pred_sigma = pred_sigma self.in_channels = in_channels self.out_channels = in_channels * 2 if pred_sigma else in_channels self.hidden_size = hidden_size self.patch_size = patch_size self.input_size = input_size num_patches = np.prod([input_size[i] // patch_size[i] for i in range(3)]) self.num_patches = num_patches self.num_temporal = input_size[0] // patch_size[0] self.num_spatial = num_patches // self.num_temporal if base_size is None: self.base_size = int(np.sqrt(self.num_spatial)) else: self.base_size = base_size // patch_size[1] self.num_heads = num_heads self.dtype = dtype self.no_temporal_pos_emb = no_temporal_pos_emb self.depth = depth self.mlp_ratio = mlp_ratio self.enable_flash_attn = enable_flash_attn self.enable_layernorm_kernel = enable_layernorm_kernel self.space_scale = space_scale self.time_scale = time_scale self.x_embedder = PatchEmbed3D(patch_size, in_channels, hidden_size) self.t_embedder = TimestepEmbedder(hidden_size) self.t_block = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True)) self.y_embedder = CaptionEmbedder( in_channels=caption_channels, hidden_size=hidden_size, uncond_prob=class_dropout_prob, act_layer=approx_gelu, token_num=model_max_length, ) self.register_buffer("pos_embed", self.get_spatial_pos_embed()) self.register_buffer("pos_embed_temporal", self.get_temporal_pos_embed()) drop_path = [x.item() for x in torch.linspace(0, drop_path, 
def forward(self, x, timestep, y, mask=None, **kwargs):
    """
    Forward pass of PixArt.

    Args:
        x: latent input fed to the patch embedder. Presumably
            (N, C, T, H, W) given the 3-D patch size -- confirm against
            ``PatchEmbed3D``.
        timestep: (N,) tensor of diffusion timesteps.
        y: caption features (not class labels); after embedding it is
            treated as (N, 1, L, D).
        mask: optional caption-token mask; tokens whose mask entry is
            non-zero are kept and packed into a single sequence.
        **kwargs: accepted for call-site compatibility and ignored.

    Returns:
        float32 tensor produced by ``unpatchify``:
        (N, out_channels, T, H, W).
    """
    # Run in the dtype of the patch-embedding weights.
    dtype = self.x_embedder.proj.weight.dtype
    x = x.to(dtype)
    timestep = timestep.to(dtype)
    y = y.to(dtype)

    # embedding
    x = self.x_embedder(x)  # (B, N, D)
    x = rearrange(x, "b (t s) d -> b t s d", t=self.num_temporal, s=self.num_spatial)
    x = x + self.pos_embed
    if not self.no_temporal_pos_emb:
        # Put time next to the feature dim so the temporal table broadcasts.
        x = rearrange(x, "b t s d -> b s t d")
        x = x + self.pos_embed_temporal
        x = rearrange(x, "b s t d -> b (t s) d")
    else:
        x = rearrange(x, "b t s d -> b (t s) d")

    t = self.t_embedder(timestep, dtype=x.dtype)  # (N, D)
    t0 = self.t_block(t)  # per-block modulation signal
    y = self.y_embedder(y, self.training)  # (N, 1, L, D)

    # Pack all caption tokens into one sequence of shape (1, sum(y_lens), D);
    # y_lens records how many tokens belong to each sample.
    if mask is not None:
        if mask.shape[0] != y.shape[0]:
            mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
        mask = mask.squeeze(1).squeeze(1)
        y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
        y_lens = mask.sum(dim=1).tolist()
    else:
        y_lens = [y.shape[2]] * y.shape[0]
        y = y.squeeze(1).view(1, -1, x.shape[-1])

    # blocks
    for block in self.blocks:
        x = auto_grad_checkpoint(block, x, y, t0, y_lens)

    # final process
    x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
    x = self.unpatchify(x)  # (N, out_channels, T, H, W)

    # cast to float32 for better accuracy
    x = x.to(torch.float32)
    return x
def get_spatial_pos_embed(self, grid_size=None):
    """
    Return the 2D sin/cos positional table as a (1, H*W, hidden_size)
    float tensor that does not require grad.

    grid_size is the (height, width) of the input before patching; it
    defaults to the spatial part of ``self.input_size``.
    """
    spatial = self.input_size[1:] if grid_size is None else grid_size
    # Number of patches along height and width.
    patches_hw = (spatial[0] // self.patch_size[1], spatial[1] // self.patch_size[2])

    table = get_2d_sincos_pos_embed(
        self.hidden_size,
        patches_hw,
        scale=self.space_scale,
        base_size=self.base_size,
    )
    tensor = torch.from_numpy(table)
    return tensor.float().unsqueeze(0).requires_grad_(False)
@MODELS.register_module()
class PixArtMS(PixArt):
    """
    PixArt variant that additionally conditions the timestep embedding on
    the ``"hw"`` and ``"ar"`` entries of ``data_info`` and recomputes the
    spatial positional embedding from the input's last two dimensions.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.hidden_size % 3 == 0, "hidden_size must be divisible by 3"
        third = self.hidden_size // 3
        self.csize_embedder = SizeEmbedder(third)
        self.ar_embedder = SizeEmbedder(third)

    def forward(self, x, timestep, y, mask=None, data_info=None):
        """
        Forward pass of PixArtMS.

        x: latent input for the patch embedder
        timestep: (N,) tensor of diffusion timesteps
        y: caption features, (N, 1, L, D) after embedding
        mask: optional caption-token mask
        data_info: mapping providing "hw" and "ar" conditioning tensors
        """
        x = x.to(self.dtype)
        timestep = timestep.to(self.dtype)
        y = y.to(self.dtype)

        size_cond = data_info["hw"]
        ratio_cond = data_info["ar"]
        spatial_pe = self.get_spatial_pos_embed((x.shape[-2], x.shape[-1])).to(x.dtype)

        # embedding
        tokens = self.x_embedder(x)  # (B, N, D)
        tokens = rearrange(tokens, "b (t s) d -> b t s d", t=self.num_temporal, s=self.num_spatial)
        tokens = tokens + spatial_pe.to(tokens.device)
        if self.no_temporal_pos_emb:
            tokens = rearrange(tokens, "b t s d -> b (t s) d")
        else:
            tokens = rearrange(tokens, "b t s d -> b s t d")
            tokens = tokens + self.pos_embed_temporal
            tokens = rearrange(tokens, "b s t d -> b (t s) d")

        # timestep embedding plus size / aspect-ratio conditioning
        t_emb = self.t_embedder(timestep, dtype=tokens.dtype)  # (N, D)
        batch = tokens.shape[0]
        size_emb = self.csize_embedder(size_cond, batch)
        ratio_emb = self.ar_embedder(ratio_cond, batch)
        t_emb = t_emb + torch.cat([size_emb, ratio_emb], dim=1)
        modulation = self.t_block(t_emb)

        # caption embedding, packed into a single sequence
        y = self.y_embedder(y, self.training)  # (N, 1, L, D)
        width = tokens.shape[-1]
        if mask is None:
            y_lens = [y.shape[2]] * y.shape[0]
            y = y.squeeze(1).view(1, -1, width)
        else:
            if mask.shape[0] != y.shape[0]:
                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
            mask = mask.squeeze(1).squeeze(1)
            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, width)
            y_lens = mask.sum(dim=1).tolist()

        # blocks
        for block in self.blocks:
            tokens = block(tokens, y, modulation, y_lens)

        # final process
        out = self.final_layer(tokens, t_emb)  # (N, T, patch_size ** 2 * out_channels)
        out = self.unpatchify(out)

        # cast to float32 for better accuracy
        return out.to(torch.float32)
class STDiTBlock(nn.Module):
    """
    Spatial-temporal transformer block: spatial self-attention, temporal
    self-attention, cross-attention and an MLP, with adaLN-single style
    modulation from ``scale_shift_table`` plus the timestep signal.
    """

    def __init__(
        self,
        hidden_size,
        num_heads,
        d_s=None,
        d_t=None,
        mlp_ratio=4.0,
        drop_path=0.0,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.enable_flash_attn = enable_flash_attn
        self._enable_sequence_parallelism = enable_sequence_parallelism

        self.attn_cls, self.mha_cls = (
            (SeqParallelAttention, SeqParallelMultiHeadCrossAttention)
            if enable_sequence_parallelism
            else (Attention, MultiHeadCrossAttention)
        )

        def make_norm():
            return get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)

        # Creation order is kept so parameter registration and random
        # initialisation match the previous implementation.
        self.norm1 = make_norm()
        self.attn = self.attn_cls(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            enable_flash_attn=enable_flash_attn,
        )
        self.cross_attn = self.mha_cls(hidden_size, num_heads)
        self.norm2 = make_norm()
        mlp_width = int(hidden_size * mlp_ratio)
        self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_width, act_layer=approx_gelu, drop=0)
        if drop_path > 0.0:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5)

        # temporal attention
        self.d_s = d_s
        self.d_t = d_t
        if self._enable_sequence_parallelism:
            sp_size = dist.get_world_size(get_sequence_parallel_group())
            # the temporal length must split evenly across the group
            assert d_t % sp_size == 0
            self.d_t = d_t // sp_size

        self.attn_temp = self.attn_cls(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            enable_flash_attn=self.enable_flash_attn,
        )

    def t_mask_select(self, x, masked_x, x_mask):
        """
        Pick, per frame, ``x`` where ``x_mask`` is true and ``masked_x``
        elsewhere.

        x:        [B, (T, S), C]
        masked_x: [B, (T, S), C]
        x_mask:   [B, T]
        """
        per_frame = rearrange(x, "B (T S) C -> B T S C", T=self.d_t, S=self.d_s)
        per_frame_alt = rearrange(masked_x, "B (T S) C -> B T S C", T=self.d_t, S=self.d_s)
        chosen = torch.where(x_mask[:, :, None, None], per_frame, per_frame_alt)
        return rearrange(chosen, "B T S C -> B (T S) C")

    def forward(self, x, y, t, mask=None, tpe=None, x_mask=None, t0=None):
        """
        x: [B, (T, S), C] tokens; y / mask: forwarded to cross-attention;
        t: modulation signal; tpe: optional temporal positional embedding
        added before temporal attention; x_mask / t0: when x_mask is given,
        frames where it is false use the modulation derived from t0.
        """
        batch, _, _ = x.shape
        use_mask = x_mask is not None

        modulation = self.scale_shift_table[None] + t.reshape(batch, 6, -1)
        shift_attn, scale_attn, gate_attn, shift_ff, scale_ff, gate_ff = modulation.chunk(6, dim=1)

        hidden = t2i_modulate(self.norm1(x), shift_attn, scale_attn)
        if use_mask:
            modulation0 = self.scale_shift_table[None] + t0.reshape(batch, 6, -1)
            shift_attn0, scale_attn0, gate_attn0, shift_ff0, scale_ff0, gate_ff0 = modulation0.chunk(6, dim=1)
            hidden0 = t2i_modulate(self.norm1(x), shift_attn0, scale_attn0)
            hidden = self.t_mask_select(hidden, hidden0, x_mask)

        # spatial branch: attend over S within each frame
        spatial = rearrange(hidden, "B (T S) C -> (B T) S C", T=self.d_t, S=self.d_s)
        spatial = self.attn(spatial)
        spatial = rearrange(spatial, "(B T) S C -> B (T S) C", T=self.d_t, S=self.d_s)
        gated_spatial = gate_attn * spatial
        if use_mask:
            gated_spatial = self.t_mask_select(gated_spatial, gate_attn0 * spatial, x_mask)
        x = x + self.drop_path(gated_spatial)

        # temporal branch: attend over T at each spatial location
        temporal = rearrange(x, "B (T S) C -> (B S) T C", T=self.d_t, S=self.d_s)
        if tpe is not None:
            temporal = temporal + tpe
        temporal = self.attn_temp(temporal)
        temporal = rearrange(temporal, "(B S) T C -> B (T S) C", T=self.d_t, S=self.d_s)
        x = x + self.drop_path(gate_attn * temporal)

        # cross attn
        x = x + self.cross_attn(x, y, mask)

        # mlp
        ff_in = t2i_modulate(self.norm2(x), shift_ff, scale_ff)
        if use_mask:
            ff_in0 = t2i_modulate(self.norm2(x), shift_ff0, scale_ff0)
            ff_in = self.t_mask_select(ff_in, ff_in0, x_mask)

        ff_out = self.mlp(ff_in)
        gated_ff = gate_ff * ff_out
        if use_mask:
            gated_ff = self.t_mask_select(gated_ff, gate_ff0 * ff_out, x_mask)
        x = x + self.drop_path(gated_ff)

        return x
caption_channels=4096, model_max_length=120, dtype=torch.float32, space_scale=1.0, time_scale=1.0, freeze=None, enable_flash_attn=False, enable_layernorm_kernel=False, enable_sequence_parallelism=False, ): super().__init__() self.pred_sigma = pred_sigma self.in_channels = in_channels self.out_channels = in_channels * 2 if pred_sigma else in_channels self.hidden_size = hidden_size self.patch_size = patch_size self.input_size = input_size num_patches = np.prod([input_size[i] // patch_size[i] for i in range(3)]) self.num_patches = num_patches self.num_temporal = input_size[0] // patch_size[0] self.num_spatial = num_patches // self.num_temporal self.num_heads = num_heads self.dtype = dtype self.no_temporal_pos_emb = no_temporal_pos_emb self.depth = depth self.mlp_ratio = mlp_ratio self.enable_flash_attn = enable_flash_attn self.enable_layernorm_kernel = enable_layernorm_kernel self.space_scale = space_scale self.time_scale = time_scale self.register_buffer("pos_embed", self.get_spatial_pos_embed()) self.register_buffer("pos_embed_temporal", self.get_temporal_pos_embed()) self.x_embedder = PatchEmbed3D(patch_size, in_channels, hidden_size) self.t_embedder = TimestepEmbedder(hidden_size) self.t_block = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True)) self.y_embedder = CaptionEmbedder( in_channels=caption_channels, hidden_size=hidden_size, uncond_prob=class_dropout_prob, act_layer=approx_gelu, token_num=model_max_length, ) drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)] self.blocks = nn.ModuleList( [ STDiTBlock( self.hidden_size, self.num_heads, mlp_ratio=self.mlp_ratio, drop_path=drop_path[i], enable_flash_attn=self.enable_flash_attn, enable_layernorm_kernel=self.enable_layernorm_kernel, enable_sequence_parallelism=enable_sequence_parallelism, d_t=self.num_temporal, d_s=self.num_spatial, ) for i in range(self.depth) ] ) self.final_layer = T2IFinalLayer( hidden_size, np.prod(self.patch_size), self.out_channels, 
def forward(self, x, timestep, y, mask=None, x_mask=None, **kwargs):
    """
    Forward pass of STDiT.

    Args:
        x (torch.Tensor): latent representation of video; of shape [B, C, T, H, W]
        timestep (torch.Tensor): diffusion time steps; of shape [B]
        y (torch.Tensor): representation of prompts; of shape [B, 1, N_token, C]
        mask (torch.Tensor): mask for selecting prompt tokens; of shape [B, N_token]
        x_mask (torch.Tensor): optional per-frame mask forwarded to the blocks
            and the final layer together with the timestep-zero embedding

    Returns:
        x (torch.Tensor): output latent representation; of shape [B, C, T, H, W]
    """
    # Work in the dtype of the patch-embedding weights.
    work_dtype = self.x_embedder.proj.weight.dtype
    x = x.to(work_dtype)
    timestep = timestep.to(work_dtype)
    y = y.to(work_dtype)

    # embedding
    x = self.x_embedder(x)  # [B, N, C]
    x = rearrange(x, "B (T S) C -> B T S C", T=self.num_temporal, S=self.num_spatial)
    x = x + self.pos_embed
    x = rearrange(x, "B T S C -> B (T S) C")

    # shard over the sequence dim if sp is enabled
    if self.enable_sequence_parallelism:
        x = split_forward_gather_backward(x, get_sequence_parallel_group(), dim=1, grad_scale="down")

    t = self.t_embedder(timestep, dtype=x.dtype)  # [B, C]
    t_mlp = self.t_block(t)  # [B, C]

    # Embedding of timestep zero, only needed when a frame mask is given.
    t0 = None
    t0_mlp = None
    if x_mask is not None:
        zero_step = torch.zeros_like(timestep)
        t0 = self.t_embedder(zero_step, dtype=x.dtype)
        t0_mlp = self.t_block(t0)

    y = self.y_embedder(y, self.training)  # [B, 1, N_token, C]

    # Pack prompt tokens into one sequence; y_lens holds per-sample lengths.
    channels = x.shape[-1]
    if mask is None:
        y_lens = [y.shape[2]] * y.shape[0]
        y = y.squeeze(1).view(1, -1, channels)
    else:
        if mask.shape[0] != y.shape[0]:
            mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
        mask = mask.squeeze(1).squeeze(1)
        y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, channels)
        y_lens = mask.sum(dim=1).tolist()

    # blocks: only the first block receives the temporal positional embedding
    for idx, block in enumerate(self.blocks):
        tpe = None
        if idx == 0:
            tpe = self.pos_embed_temporal
            if self.enable_sequence_parallelism:
                world = dist.get_world_size(get_sequence_parallel_group())
                tpe = torch.chunk(self.pos_embed_temporal, world, dim=1)[self.sp_rank].contiguous()
        x = auto_grad_checkpoint(block, x, y, t_mlp, y_lens, tpe, x_mask, t0_mlp)

    if self.enable_sequence_parallelism:
        x = gather_forward_split_backward(x, get_sequence_parallel_group(), dim=1, grad_scale="up")
    # x.shape: [B, N, C]

    # final process
    x = self.final_layer(x, t, x_mask, t0)  # [B, N, C=T_p * H_p * W_p * C_out]
    x = self.unpatchify(x)  # [B, C_out, T, H, W]

    # cast to float32 for better accuracy
    return x.to(torch.float32)
@MODELS.register_module("STDiT-XL/2")
def STDiT_XL_2(from_pretrained=None, **kwargs):
    """
    Build the STDiT-XL/2 configuration (depth 28, hidden size 1152, patch
    size (1, 2, 2), 16 heads) and, when ``from_pretrained`` is given, pass
    the model and that value to ``load_checkpoint``.
    """
    net = STDiT(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
    if from_pretrained is None:
        return net
    load_checkpoint(net, from_pretrained)
    return net
class STDiT2Block(nn.Module):
    """
    STDiT2 transformer block: spatial attention, temporal attention (with
    its own norm and modulation table), cross-attention and an MLP.
    """

    def __init__(
        self,
        hidden_size,
        num_heads,
        mlp_ratio=4.0,
        drop_path=0.0,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
        rope=None,
        qk_norm=False,
        qk_norm_legacy=False,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.enable_flash_attn = enable_flash_attn
        self._enable_sequence_parallelism = enable_sequence_parallelism

        def make_norm():
            return get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)

        # Creation order is kept so parameter registration and random
        # initialisation match the previous implementation.

        # spatial branch
        self.norm1 = make_norm()
        self.attn = Attention(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            enable_flash_attn=enable_flash_attn,
            qk_norm=qk_norm,
            qk_norm_legacy=qk_norm_legacy,
        )
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5)

        # cross attn
        self.cross_attn = MultiHeadCrossAttention(hidden_size, num_heads)

        # mlp branch
        self.norm2 = make_norm()
        mlp_width = int(hidden_size * mlp_ratio)
        self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_width, act_layer=approx_gelu, drop=0)
        if drop_path > 0.0:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

        # temporal branch
        self.norm_temp = make_norm()
        self.attn_temp = Attention(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            enable_flash_attn=self.enable_flash_attn,
            rope=rope,
            qk_norm=qk_norm,
            qk_norm_legacy=qk_norm_legacy,
        )
        self.scale_shift_table_temporal = nn.Parameter(torch.randn(3, hidden_size) / hidden_size**0.5)

    def t_mask_select(self, x_mask, x, masked_x, T, S):
        """
        Pick, per frame, ``x`` where ``x_mask`` is true and ``masked_x``
        elsewhere.

        x:        [B, (T, S), C]
        masked_x: [B, (T, S), C]
        x_mask:   [B, T]
        """
        per_frame = rearrange(x, "B (T S) C -> B T S C", T=T, S=S)
        per_frame_alt = rearrange(masked_x, "B (T S) C -> B T S C", T=T, S=S)
        chosen = torch.where(x_mask[:, :, None, None], per_frame, per_frame_alt)
        return rearrange(chosen, "B T S C -> B (T S) C")

    def forward(self, x, y, t, t_tmp, mask=None, x_mask=None, t0=None, t0_tmp=None, T=None, S=None):
        """
        x: [B, (T, S), C] tokens; y / mask: forwarded to cross-attention;
        t / t_tmp: modulation signals for the 6-way and 3-way (temporal)
        tables; x_mask with t0 / t0_tmp: frames where x_mask is false use the
        modulation derived from t0 / t0_tmp; T, S: temporal and spatial
        lengths used to fold the token axis.
        """
        batch, _, _ = x.shape
        use_mask = x_mask is not None

        main_mod = self.scale_shift_table[None] + t.reshape(batch, 6, -1)
        shift_attn, scale_attn, gate_attn, shift_ff, scale_ff, gate_ff = main_mod.chunk(6, dim=1)
        temp_mod = self.scale_shift_table_temporal[None] + t_tmp.reshape(batch, 3, -1)
        shift_temp, scale_temp, gate_temp = temp_mod.chunk(3, dim=1)
        if use_mask:
            main_mod0 = self.scale_shift_table[None] + t0.reshape(batch, 6, -1)
            shift_attn0, scale_attn0, gate_attn0, shift_ff0, scale_ff0, gate_ff0 = main_mod0.chunk(6, dim=1)
            temp_mod0 = self.scale_shift_table_temporal[None] + t0_tmp.reshape(batch, 3, -1)
            shift_temp0, scale_temp0, gate_temp0 = temp_mod0.chunk(3, dim=1)

        # modulate
        hidden = t2i_modulate(self.norm1(x), shift_attn, scale_attn)
        if use_mask:
            hidden0 = t2i_modulate(self.norm1(x), shift_attn0, scale_attn0)
            hidden = self.t_mask_select(x_mask, hidden, hidden0, T, S)

        # spatial branch: attend over S within each frame
        spatial = rearrange(hidden, "B (T S) C -> (B T) S C", T=T, S=S)
        spatial = self.attn(spatial)
        spatial = rearrange(spatial, "(B T) S C -> B (T S) C", T=T, S=S)
        gated_spatial = gate_attn * spatial
        if use_mask:
            gated_spatial = self.t_mask_select(x_mask, gated_spatial, gate_attn0 * spatial, T, S)
        x = x + self.drop_path(gated_spatial)

        # modulate
        hidden = t2i_modulate(self.norm_temp(x), shift_temp, scale_temp)
        if use_mask:
            hidden0 = t2i_modulate(self.norm_temp(x), shift_temp0, scale_temp0)
            hidden = self.t_mask_select(x_mask, hidden, hidden0, T, S)

        # temporal branch: attend over T at each spatial location
        temporal = rearrange(hidden, "B (T S) C -> (B S) T C", T=T, S=S)
        temporal = self.attn_temp(temporal)
        temporal = rearrange(temporal, "(B S) T C -> B (T S) C", T=T, S=S)
        gated_temporal = gate_temp * temporal
        if use_mask:
            gated_temporal = self.t_mask_select(x_mask, gated_temporal, gate_temp0 * temporal, T, S)
        x = x + self.drop_path(gated_temporal)

        # cross attn
        x = x + self.cross_attn(x, y, mask)

        # modulate
        ff_in = t2i_modulate(self.norm2(x), shift_ff, scale_ff)
        if use_mask:
            ff_in0 = t2i_modulate(self.norm2(x), shift_ff0, scale_ff0)
            ff_in = self.t_mask_select(x_mask, ff_in, ff_in0, T, S)

        # mlp
        ff_out = self.mlp(ff_in)
        gated_ff = gate_ff * ff_out
        if use_mask:
            gated_ff = self.t_mask_select(x_mask, gated_ff, gate_ff0 * ff_out, T, S)
        x = x + self.drop_path(gated_ff)

        return x
qk_norm_legacy self.enable_flash_attn = enable_flash_attn self.enable_layernorm_kernel = enable_layernorm_kernel super().__init__(**kwargs) @MODELS.register_module() class STDiT2(PreTrainedModel): config_class = STDiT2Config def __init__(self, config): super().__init__(config) self.pred_sigma = config.pred_sigma self.in_channels = config.in_channels self.out_channels = config.in_channels * 2 if config.pred_sigma else config.in_channels self.hidden_size = config.hidden_size self.num_heads = config.num_heads self.no_temporal_pos_emb = config.no_temporal_pos_emb self.depth = config.depth self.mlp_ratio = config.mlp_ratio self.enable_flash_attn = config.enable_flash_attn self.enable_layernorm_kernel = config.enable_layernorm_kernel # support dynamic input self.patch_size = config.patch_size self.input_size = config.input_size self.input_sq_size = config.input_sq_size self.pos_embed = PositionEmbedding2D(config.hidden_size) self.x_embedder = PatchEmbed3D(config.patch_size, config.in_channels, config.hidden_size) self.t_embedder = TimestepEmbedder(config.hidden_size) self.t_block = nn.Sequential(nn.SiLU(), nn.Linear(config.hidden_size, 6 * config.hidden_size, bias=True)) self.t_block_temp = nn.Sequential( nn.SiLU(), nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=True) ) # new self.y_embedder = CaptionEmbedder( in_channels=config.caption_channels, hidden_size=config.hidden_size, uncond_prob=config.class_dropout_prob, act_layer=approx_gelu, token_num=config.model_max_length, ) drop_path = [x.item() for x in torch.linspace(0, config.drop_path, config.depth)] self.rope = RotaryEmbedding(dim=self.hidden_size // self.num_heads) # new self.blocks = nn.ModuleList( [ STDiT2Block( self.hidden_size, self.num_heads, mlp_ratio=self.mlp_ratio, drop_path=drop_path[i], enable_flash_attn=self.enable_flash_attn, enable_layernorm_kernel=self.enable_layernorm_kernel, rope=self.rope.rotate_queries_or_keys, qk_norm=config.qk_norm, qk_norm_legacy=config.qk_norm_legacy, ) for i in 
range(self.depth) ] ) self.final_layer = T2IFinalLayer(config.hidden_size, np.prod(self.patch_size), self.out_channels) # multi_res assert self.hidden_size % 3 == 0, "hidden_size must be divisible by 3" self.csize_embedder = SizeEmbedder(self.hidden_size // 3) self.ar_embedder = SizeEmbedder(self.hidden_size // 3) self.fl_embedder = SizeEmbedder(self.hidden_size) # new self.fps_embedder = SizeEmbedder(self.hidden_size) # new # init model self.initialize_weights() self.initialize_temporal() if config.freeze is not None: assert config.freeze in ["not_temporal", "text"] if config.freeze == "not_temporal": self.freeze_not_temporal() elif config.freeze == "text": self.freeze_text() def get_dynamic_size(self, x): _, _, T, H, W = x.size() if T % self.patch_size[0] != 0: T += self.patch_size[0] - T % self.patch_size[0] if H % self.patch_size[1] != 0: H += self.patch_size[1] - H % self.patch_size[1] if W % self.patch_size[2] != 0: W += self.patch_size[2] - W % self.patch_size[2] T = T // self.patch_size[0] H = H // self.patch_size[1] W = W // self.patch_size[2] return (T, H, W) def forward( self, x, timestep, y, mask=None, x_mask=None, num_frames=None, height=None, width=None, ar=None, fps=None ): """ Forward pass of STDiT. Args: x (torch.Tensor): latent representation of video; of shape [B, C, T, H, W] timestep (torch.Tensor): diffusion time steps; of shape [B] y (torch.Tensor): representation of prompts; of shape [B, 1, N_token, C] mask (torch.Tensor): mask for selecting prompt tokens; of shape [B, N_token] Returns: x (torch.Tensor): output latent representation; of shape [B, C, T, H, W] """ B = x.shape[0] dtype = self.x_embedder.proj.weight.dtype x = x.to(dtype) timestep = timestep.to(dtype) y = y.to(dtype) # === process data info === # 1. get dynamic size hw = torch.cat([height[:, None], width[:, None]], dim=1) rs = (height[0].item() * width[0].item()) ** 0.5 csize = self.csize_embedder(hw, B) # 2. 
get aspect ratio ar = ar.unsqueeze(1) ar = self.ar_embedder(ar, B) data_info = torch.cat([csize, ar], dim=1) # 3. get number of frames fl = num_frames.unsqueeze(1) fps = fps.unsqueeze(1) fl = self.fl_embedder(fl, B) fl = fl + self.fps_embedder(fps, B) # === get dynamic shape size === _, _, Tx, Hx, Wx = x.size() T, H, W = self.get_dynamic_size(x) S = H * W scale = rs / self.input_sq_size base_size = round(S**0.5) pos_emb = self.pos_embed(x, H, W, scale=scale, base_size=base_size) # embedding x = self.x_embedder(x) # [B, N, C] x = rearrange(x, "B (T S) C -> B T S C", T=T, S=S) x = x + pos_emb x = rearrange(x, "B T S C -> B (T S) C") # prepare adaIN t = self.t_embedder(timestep, dtype=x.dtype) # [B, C] t_spc = t + data_info # [B, C] t_tmp = t + fl # [B, C] t_spc_mlp = self.t_block(t_spc) # [B, 6*C] t_tmp_mlp = self.t_block_temp(t_tmp) # [B, 3*C] if x_mask is not None: t0_timestep = torch.zeros_like(timestep) t0 = self.t_embedder(t0_timestep, dtype=x.dtype) t0_spc = t0 + data_info t0_tmp = t0 + fl t0_spc_mlp = self.t_block(t0_spc) t0_tmp_mlp = self.t_block_temp(t0_tmp) else: t0_spc = None t0_tmp = None t0_spc_mlp = None t0_tmp_mlp = None # prepare y y = self.y_embedder(y, self.training) # [B, 1, N_token, C] if mask is not None: if mask.shape[0] != y.shape[0]: mask = mask.repeat(y.shape[0] // mask.shape[0], 1) mask = mask.squeeze(1).squeeze(1) y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1]) y_lens = mask.sum(dim=1).tolist() else: y_lens = [y.shape[2]] * y.shape[0] y = y.squeeze(1).view(1, -1, x.shape[-1]) # blocks for _, block in enumerate(self.blocks): x = auto_grad_checkpoint( block, x, y, t_spc_mlp, t_tmp_mlp, y_lens, x_mask, t0_spc_mlp, t0_tmp_mlp, T, S, ) # x.shape: [B, N, C] # final process x = self.final_layer(x, t, x_mask, t0_spc, T, S) # [B, N, C=T_p * H_p * W_p * C_out] x = self.unpatchify(x, T, H, W, Tx, Hx, Wx) # [B, C_out, T, H, W] # cast to float32 for better accuracy x = x.to(torch.float32) return x def unpatchify(self, x, 
N_t, N_h, N_w, R_t, R_h, R_w): """ Args: x (torch.Tensor): of shape [B, N, C] Return: x (torch.Tensor): of shape [B, C_out, T, H, W] """ # N_t, N_h, N_w = [self.input_size[i] // self.patch_size[i] for i in range(3)] T_p, H_p, W_p = self.patch_size x = rearrange( x, "B (N_t N_h N_w) (T_p H_p W_p C_out) -> B C_out (N_t T_p) (N_h H_p) (N_w W_p)", N_t=N_t, N_h=N_h, N_w=N_w, T_p=T_p, H_p=H_p, W_p=W_p, C_out=self.out_channels, ) # unpad x = x[:, :, :R_t, :R_h, :R_w] return x def unpatchify_old(self, x): c = self.out_channels t, h, w = [self.input_size[i] // self.patch_size[i] for i in range(3)] pt, ph, pw = self.patch_size x = x.reshape(shape=(x.shape[0], t, h, w, pt, ph, pw, c)) x = rearrange(x, "n t h w r p q c -> n c t r h p w q") imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw)) return imgs def get_spatial_pos_embed(self, H, W, scale=1.0, base_size=None): pos_embed = get_2d_sincos_pos_embed( self.hidden_size, (H, W), scale=scale, base_size=base_size, ) pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False) return pos_embed def freeze_not_temporal(self): for n, p in self.named_parameters(): if "attn_temp" not in n: p.requires_grad = False def freeze_text(self): for n, p in self.named_parameters(): if "cross_attn" in n: p.requires_grad = False def initialize_temporal(self): for block in self.blocks: nn.init.constant_(block.attn_temp.proj.weight, 0) nn.init.constant_(block.attn_temp.proj.bias, 0) def initialize_weights(self): # Initialize transformer layers: def _basic_init(module): if isinstance(module, nn.Linear): torch.nn.init.xavier_uniform_(module.weight) if module.bias is not None: nn.init.constant_(module.bias, 0) self.apply(_basic_init) # Initialize patch_embed like nn.Linear (instead of nn.Conv2d): w = self.x_embedder.proj.weight.data nn.init.xavier_uniform_(w.view([w.shape[0], -1])) # Initialize timestep embedding MLP: nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02) 
nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02) nn.init.normal_(self.t_block[1].weight, std=0.02) nn.init.normal_(self.t_block_temp[1].weight, std=0.02) # Initialize caption embedding MLP: nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02) nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02) # Zero-out adaLN modulation layers in PixArt blocks: for block in self.blocks: nn.init.constant_(block.cross_attn.proj.weight, 0) nn.init.constant_(block.cross_attn.proj.bias, 0) # Zero-out output layers: nn.init.constant_(self.final_layer.linear.weight, 0) nn.init.constant_(self.final_layer.linear.bias, 0) @MODELS.register_module("STDiT2-XL/2") def STDiT2_XL_2(from_pretrained=None, **kwargs): if from_pretrained is not None: if os.path.isdir(from_pretrained) or os.path.isfile(from_pretrained): # if it is a directory or a file, we load the checkpoint manually config = STDiT2Config(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs) model = STDiT2(config) load_checkpoint(model, from_pretrained) return model else: # otherwise, we load the model from hugging face hub return STDiT2.from_pretrained(from_pretrained) else: # create a new model config = STDiT2Config(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs) model = STDiT2(config) return model ================================================ FILE: Open-Sora/opensora/models/stdit/stdit3.py ================================================ import os import numpy as np import torch import torch.distributed as dist import torch.nn as nn import torch.nn.functional as F from einops import rearrange from rotary_embedding_torch import RotaryEmbedding from timm.models.layers import DropPath from timm.models.vision_transformer import Mlp from transformers import PretrainedConfig, PreTrainedModel from opensora.acceleration.checkpoint import auto_grad_checkpoint from opensora.acceleration.communications import gather_forward_split_backward, split_forward_gather_backward 
from opensora.acceleration.parallel_states import get_sequence_parallel_group from opensora.models.layers.blocks import ( Attention, CaptionEmbedder, MultiHeadCrossAttention, PatchEmbed3D, PositionEmbedding2D, SeqParallelAttention, SeqParallelMultiHeadCrossAttention, SizeEmbedder, T2IFinalLayer, TimestepEmbedder, approx_gelu, get_layernorm, t2i_modulate, ) from opensora.registry import MODELS from opensora.utils.ckpt_utils import load_checkpoint from ...models.cache_functions import global_force_fresh, cache_cutfresh, update_cache, force_init, score_evaluate class STDiT3Block(nn.Module): def __init__( self, hidden_size, num_heads, mlp_ratio=4.0, drop_path=0.0, rope=None, qk_norm=False, temporal=False, enable_flash_attn=False, enable_layernorm_kernel=False, enable_sequence_parallelism=False, ): super().__init__() self.temporal = temporal self.hidden_size = hidden_size self.enable_flash_attn = enable_flash_attn self.enable_sequence_parallelism = enable_sequence_parallelism if self.enable_sequence_parallelism and not temporal: attn_cls = SeqParallelAttention mha_cls = SeqParallelMultiHeadCrossAttention else: attn_cls = Attention mha_cls = MultiHeadCrossAttention self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel) self.attn = attn_cls( hidden_size, num_heads=num_heads, qkv_bias=True, qk_norm=qk_norm, rope=rope, enable_flash_attn=enable_flash_attn, ) self.cross_attn = mha_cls(hidden_size, num_heads) self.norm2 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel) self.mlp = Mlp( in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, drop=0 ) self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5) def t_mask_select(self, x_mask, x, masked_x, T, S): # x: [B, (T, S), C] # mased_x: [B, (T, S), C] # x_mask: [B, T] x = rearrange(x, "B (T S) C -> B T S 
C", T=T, S=S) masked_x = rearrange(masked_x, "B (T S) C -> B T S C", T=T, S=S) x = torch.where(x_mask[:, :, None, None], x, masked_x) x = rearrange(x, "B T S C -> B (T S) C") return x def forward( self, x, y, t, current, cache_dic, mask=None, # text mask x_mask=None, # temporal mask t0=None, # t with timestamp=0 T=None, # number of frames S=None, # number of pixel patches ): ''' Forward for video models. Note that the Force Activation Cycle is slightly different from DiT-ToCa and PixArt-alpha-ToCa. This is because of a discovery: The Force Activation Cycle of different modules can be different for OpenSora model. (This cause decrease in performance in DiT and PixArt). ''' # prepare modulate parameters B, N, C = x.shape shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = ( self.scale_shift_table[None] + t.reshape(B, 6, -1) ).chunk(6, dim=1) if x_mask is not None: shift_msa_zero, scale_msa_zero, gate_msa_zero, shift_mlp_zero, scale_mlp_zero, gate_mlp_zero = ( self.scale_shift_table[None] + t0.reshape(B, 6, -1) ).chunk(6, dim=1) if self.temporal: current['flag'] = -1 else: current['flag'] = 0 is_force_fresh = global_force_fresh(cache_dic, current) current['is_force_fresh'] = is_force_fresh # modulate (attention) current['module'] = 'attn' if is_force_fresh[current['module']]: x_m = t2i_modulate(self.norm1(x), shift_msa, scale_msa) if x_mask is not None: x_m_zero = t2i_modulate(self.norm1(x), shift_msa_zero, scale_msa_zero) x_m = self.t_mask_select(x_mask, x_m, x_m_zero, T, S) # attention if self.temporal: x_m = rearrange(x_m, "B (T S) C -> (B S) T C", T=T, S=S) x_m = self.attn(x_m) x_m = rearrange(x_m, "(B S) T C -> B (T S) C", T=T, S=S) else: x_m = rearrange(x_m, "B (T S) C -> (B T) S C", T=T, S=S) x_m = self.attn(x_m) x_m = rearrange(x_m, "(B T) S C -> B (T S) C", T=T, S=S) cache_dic['cache'][current['flag']][current['layer']][current['module']] = x_m force_init(cache_dic, current, x) else: x_m = 
cache_dic['cache'][current['flag']][current['layer']][current['module']] # modulate (attention) x_m_s = gate_msa * x_m if x_mask is not None: x_m_s_zero = gate_msa_zero * x_m x_m_s = self.t_mask_select(x_mask, x_m_s, x_m_s_zero, T, S) # residual x = x + self.drop_path(x_m_s) # cross attention current['module'] = 'cross-attn' if is_force_fresh[current['module']]: cache_dic['cache'][current['flag']][current['layer']][current['module']], cache_dic['cross_attn_map'][current['flag']][current['layer']] = self.cross_attn(x, y, mask) force_init(cache_dic, current, x) else: fresh_indices, fresh_tokens = cache_cutfresh(cache_dic, x, current) fresh_tokens, fresh_cross_attn_map = self.cross_attn(fresh_tokens, y, mask) update_cache(fresh_indices, fresh_tokens=fresh_tokens, cache_dic=cache_dic, current=current, fresh_attn_map=fresh_cross_attn_map) x = x + cache_dic['cache'][current['flag']][current['layer']][current['module']] # modulate (MLP) current['module'] = 'mlp' #mlp_tick.record() x_m = t2i_modulate(self.norm2(x), shift_mlp, scale_mlp) if x_mask is not None: x_m_zero = t2i_modulate(self.norm2(x), shift_mlp_zero, scale_mlp_zero) x_m = self.t_mask_select(x_mask, x_m, x_m_zero, T, S) # MLP if is_force_fresh[current['module']]: x_m = self.mlp(x_m) cache_dic['cache'][current['flag']][current['layer']][current['module']] = x_m force_init(cache_dic, current, x) else: fresh_indices, fresh_tokens = cache_cutfresh(cache_dic, x_m, current) fresh_tokens = self.mlp(fresh_tokens) update_cache(fresh_indices, fresh_tokens=fresh_tokens, cache_dic=cache_dic, current=current) # modulate (MLP) x_m_s = gate_mlp * cache_dic['cache'][current['flag']][current['layer']][current['module']] if x_mask is not None: x_m_s_zero = gate_mlp_zero * x_m x_m_s = self.t_mask_select(x_mask, x_m_s, x_m_s_zero, T, S) # residual x = x + self.drop_path(x_m_s) return x class STDiT3Config(PretrainedConfig): model_type = "STDiT3" def __init__( self, input_size=(None, None, None), input_sq_size=512, in_channels=4, 
patch_size=(1, 2, 2), hidden_size=1152, depth=28, num_heads=16, mlp_ratio=4.0, class_dropout_prob=0.1, pred_sigma=True, drop_path=0.0, caption_channels=4096, model_max_length=300, qk_norm=True, enable_flash_attn=False, enable_layernorm_kernel=False, enable_sequence_parallelism=False, only_train_temporal=False, freeze_y_embedder=False, skip_y_embedder=False, **kwargs, ): self.input_size = input_size self.input_sq_size = input_sq_size self.in_channels = in_channels self.patch_size = patch_size self.hidden_size = hidden_size self.depth = depth self.num_heads = num_heads self.mlp_ratio = mlp_ratio self.class_dropout_prob = class_dropout_prob self.pred_sigma = pred_sigma self.drop_path = drop_path self.caption_channels = caption_channels self.model_max_length = model_max_length self.qk_norm = qk_norm self.enable_flash_attn = enable_flash_attn self.enable_layernorm_kernel = enable_layernorm_kernel self.enable_sequence_parallelism = enable_sequence_parallelism self.only_train_temporal = only_train_temporal self.freeze_y_embedder = freeze_y_embedder self.skip_y_embedder = skip_y_embedder super().__init__(**kwargs) class STDiT3(PreTrainedModel): config_class = STDiT3Config def __init__(self, config): super().__init__(config) self.pred_sigma = config.pred_sigma self.in_channels = config.in_channels self.out_channels = config.in_channels * 2 if config.pred_sigma else config.in_channels # model size related self.depth = config.depth self.mlp_ratio = config.mlp_ratio self.hidden_size = config.hidden_size self.num_heads = config.num_heads # computation related self.drop_path = config.drop_path self.enable_flash_attn = config.enable_flash_attn self.enable_layernorm_kernel = config.enable_layernorm_kernel self.enable_sequence_parallelism = config.enable_sequence_parallelism # input size related self.patch_size = config.patch_size self.input_sq_size = config.input_sq_size self.pos_embed = PositionEmbedding2D(config.hidden_size) self.rope = RotaryEmbedding(dim=self.hidden_size // 
self.num_heads) # embedding self.x_embedder = PatchEmbed3D(config.patch_size, config.in_channels, config.hidden_size) self.t_embedder = TimestepEmbedder(config.hidden_size) self.fps_embedder = SizeEmbedder(self.hidden_size) self.t_block = nn.Sequential( nn.SiLU(), nn.Linear(config.hidden_size, 6 * config.hidden_size, bias=True), ) self.y_embedder = CaptionEmbedder( in_channels=config.caption_channels, hidden_size=config.hidden_size, uncond_prob=config.class_dropout_prob, act_layer=approx_gelu, token_num=config.model_max_length, ) # spatial blocks drop_path = [x.item() for x in torch.linspace(0, self.drop_path, config.depth)] self.spatial_blocks = nn.ModuleList( [ STDiT3Block( hidden_size=config.hidden_size, num_heads=config.num_heads, mlp_ratio=config.mlp_ratio, drop_path=drop_path[i], qk_norm=config.qk_norm, enable_flash_attn=config.enable_flash_attn, enable_layernorm_kernel=config.enable_layernorm_kernel, enable_sequence_parallelism=config.enable_sequence_parallelism, ) for i in range(config.depth) ] ) # temporal blocks drop_path = [x.item() for x in torch.linspace(0, self.drop_path, config.depth)] self.temporal_blocks = nn.ModuleList( [ STDiT3Block( hidden_size=config.hidden_size, num_heads=config.num_heads, mlp_ratio=config.mlp_ratio, drop_path=drop_path[i], qk_norm=config.qk_norm, enable_flash_attn=config.enable_flash_attn, enable_layernorm_kernel=config.enable_layernorm_kernel, enable_sequence_parallelism=config.enable_sequence_parallelism, # temporal temporal=True, rope=self.rope.rotate_queries_or_keys, ) for i in range(config.depth) ] ) # final layer self.final_layer = T2IFinalLayer(config.hidden_size, np.prod(self.patch_size), self.out_channels) self.initialize_weights() if config.only_train_temporal: for param in self.parameters(): param.requires_grad = False for block in self.temporal_blocks: for param in block.parameters(): param.requires_grad = True if config.freeze_y_embedder: for param in self.y_embedder.parameters(): param.requires_grad = False def 
initialize_weights(self): # Initialize transformer layers: def _basic_init(module): if isinstance(module, nn.Linear): torch.nn.init.xavier_uniform_(module.weight) if module.bias is not None: nn.init.constant_(module.bias, 0) self.apply(_basic_init) # Initialize fps_embedder nn.init.normal_(self.fps_embedder.mlp[0].weight, std=0.02) nn.init.constant_(self.fps_embedder.mlp[0].bias, 0) nn.init.constant_(self.fps_embedder.mlp[2].weight, 0) nn.init.constant_(self.fps_embedder.mlp[2].bias, 0) # Initialize timporal blocks for block in self.temporal_blocks: nn.init.constant_(block.attn.proj.weight, 0) nn.init.constant_(block.cross_attn.proj.weight, 0) nn.init.constant_(block.mlp.fc2.weight, 0) def get_dynamic_size(self, x): _, _, T, H, W = x.size() if T % self.patch_size[0] != 0: T += self.patch_size[0] - T % self.patch_size[0] if H % self.patch_size[1] != 0: H += self.patch_size[1] - H % self.patch_size[1] if W % self.patch_size[2] != 0: W += self.patch_size[2] - W % self.patch_size[2] T = T // self.patch_size[0] H = H // self.patch_size[1] W = W // self.patch_size[2] return (T, H, W) def encode_text(self, y, mask=None): y = self.y_embedder(y, self.training) # [B, 1, N_token, C] if mask is not None: if mask.shape[0] != y.shape[0]: mask = mask.repeat(y.shape[0] // mask.shape[0], 1) mask = mask.squeeze(1).squeeze(1) y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, self.hidden_size) y_lens = mask.sum(dim=1).tolist() else: y_lens = [y.shape[2]] * y.shape[0] y = y.squeeze(1).view(1, -1, self.hidden_size) return y, y_lens def forward(self, x, timestep, y, mask=None, x_mask=None, fps=None, height=None, width=None, cache_dic=None, current=None, **kwargs): dtype = self.x_embedder.proj.weight.dtype B = x.size(0) x = x.to(dtype) timestep = timestep.to(dtype) y = y.to(dtype) # === get pos embed === _, _, Tx, Hx, Wx = x.size() T, H, W = self.get_dynamic_size(x) cache_dic['dynamic_size'] = (B,T,H,W) # adjust for sequence parallelism # we need to ensure H * W is 
divisible by sequence parallel size # for simplicity, we can adjust the height to make it divisible if self.enable_sequence_parallelism: sp_size = dist.get_world_size(get_sequence_parallel_group()) if H % sp_size != 0: h_pad_size = sp_size - H % sp_size else: h_pad_size = 0 if h_pad_size > 0: hx_pad_size = h_pad_size * self.patch_size[1] # pad x along the H dimension H += h_pad_size x = F.pad(x, (0, 0, 0, hx_pad_size)) S = H * W base_size = round(S**0.5) resolution_sq = (height[0].item() * width[0].item()) ** 0.5 scale = resolution_sq / self.input_sq_size pos_emb = self.pos_embed(x, H, W, scale=scale, base_size=base_size) # === get timestep embed === t = self.t_embedder(timestep, dtype=x.dtype) # [B, C] fps = self.fps_embedder(fps.unsqueeze(1), B) t = t + fps t_mlp = self.t_block(t) t0 = t0_mlp = None if x_mask is not None: t0_timestep = torch.zeros_like(timestep) t0 = self.t_embedder(t0_timestep, dtype=x.dtype) t0 = t0 + fps t0_mlp = self.t_block(t0) # === get y embed === if self.config.skip_y_embedder: y_lens = mask if isinstance(y_lens, torch.Tensor): y_lens = y_lens.long().tolist() else: y, y_lens = self.encode_text(y, mask) # === get x embed === x = self.x_embedder(x) # [B, N, C] x = rearrange(x, "B (T S) C -> B T S C", T=T, S=S) x = x + pos_emb # shard over the sequence dim if sp is enabled if self.enable_sequence_parallelism: x = split_forward_gather_backward(x, get_sequence_parallel_group(), dim=2, grad_scale="down") S = S // dist.get_world_size(get_sequence_parallel_group()) x = rearrange(x, "B T S C -> B (T S) C", T=T, S=S) # === blocks === for i, (spatial_block, temporal_block) in enumerate(zip(self.spatial_blocks, self.temporal_blocks)): current['layer'] = i #x = auto_grad_checkpoint(spatial_block, x, y, t_mlp, current, cache_dic, y_lens, x_mask, t0_mlp, T, S) #x = auto_grad_checkpoint(temporal_block, x, y, t_mlp, current, cache_dic, y_lens, x_mask, t0_mlp, T, S) x = spatial_block(x, y, t_mlp, current, cache_dic, y_lens, x_mask, t0_mlp, T, S) x = 
temporal_block(x, y, t_mlp, current, cache_dic, y_lens, x_mask, t0_mlp, T, S) if self.enable_sequence_parallelism: x = rearrange(x, "B (T S) C -> B T S C", T=T, S=S) x = gather_forward_split_backward(x, get_sequence_parallel_group(), dim=2, grad_scale="up") S = S * dist.get_world_size(get_sequence_parallel_group()) x = rearrange(x, "B T S C -> B (T S) C", T=T, S=S) # === final layer === x = self.final_layer(x, t, x_mask, t0, T, S) x = self.unpatchify(x, T, H, W, Tx, Hx, Wx) # cast to float32 for better accuracy x = x.to(torch.float32) return x def unpatchify(self, x, N_t, N_h, N_w, R_t, R_h, R_w): """ Args: x (torch.Tensor): of shape [B, N, C] Return: x (torch.Tensor): of shape [B, C_out, T, H, W] """ # N_t, N_h, N_w = [self.input_size[i] // self.patch_size[i] for i in range(3)] T_p, H_p, W_p = self.patch_size x = rearrange( x, "B (N_t N_h N_w) (T_p H_p W_p C_out) -> B C_out (N_t T_p) (N_h H_p) (N_w W_p)", N_t=N_t, N_h=N_h, N_w=N_w, T_p=T_p, H_p=H_p, W_p=W_p, C_out=self.out_channels, ) # unpad x = x[:, :, :R_t, :R_h, :R_w] return x @MODELS.register_module("STDiT3-XL/2") def STDiT3_XL_2(from_pretrained=None, **kwargs): force_huggingface = kwargs.pop("force_huggingface", False) if force_huggingface or from_pretrained is not None and not os.path.exists(from_pretrained): model = STDiT3.from_pretrained(from_pretrained, **kwargs) else: config = STDiT3Config(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs) model = STDiT3(config) if from_pretrained is not None: load_checkpoint(model, from_pretrained) return model @MODELS.register_module("STDiT3-3B/2") def STDiT3_3B_2(from_pretrained=None, **kwargs): force_huggingface = kwargs.pop("force_huggingface", False) if force_huggingface or from_pretrained is not None and not os.path.exists(from_pretrained): model = STDiT3.from_pretrained(from_pretrained, **kwargs) else: config = STDiT3Config(depth=28, hidden_size=1872, patch_size=(1, 2, 2), num_heads=26, **kwargs) model = STDiT3(config) if from_pretrained 
is not None: load_checkpoint(model, from_pretrained) return model ================================================ FILE: Open-Sora/opensora/models/text_encoder/__init__.py ================================================ from .classes import ClassEncoder from .clip import ClipEncoder from .t5 import T5Encoder ================================================ FILE: Open-Sora/opensora/models/text_encoder/classes.py ================================================ import torch from opensora.registry import MODELS @MODELS.register_module("classes") class ClassEncoder: def __init__(self, num_classes, model_max_length=None, device="cuda", dtype=torch.float): self.num_classes = num_classes self.y_embedder = None self.model_max_length = model_max_length self.output_dim = None self.device = device def encode(self, text): return dict(y=torch.tensor([int(t) for t in text]).to(self.device)) def null(self, n): return torch.tensor([self.num_classes] * n).to(self.device) ================================================ FILE: Open-Sora/opensora/models/text_encoder/clip.py ================================================ # Copyright 2024 Vchitect/Latte # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.# Modified from Latte # # This file is adapted from the Latte project. # # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. 
# -------------------------------------------------------- # References: # Latte: https://github.com/Vchitect/Latte # DiT: https://github.com/facebookresearch/DiT/tree/main # -------------------------------------------------------- import torch import torch.nn as nn import transformers from transformers import CLIPTextModel, CLIPTokenizer from opensora.registry import MODELS transformers.logging.set_verbosity_error() class AbstractEncoder(nn.Module): def __init__(self): super().__init__() def encode(self, *args, **kwargs): raise NotImplementedError class FrozenCLIPEmbedder(AbstractEncoder): """Uses the CLIP transformer encoder for text (from Hugging Face)""" def __init__(self, path="openai/clip-vit-huge-patch14", device="cuda", max_length=77): super().__init__() self.tokenizer = CLIPTokenizer.from_pretrained(path) self.transformer = CLIPTextModel.from_pretrained(path) self.device = device self.max_length = max_length self._freeze() def _freeze(self): self.transformer = self.transformer.eval() for param in self.parameters(): param.requires_grad = False def forward(self, text): batch_encoding = self.tokenizer( text, truncation=True, max_length=self.max_length, return_length=True, return_overflowing_tokens=False, padding="max_length", return_tensors="pt", ) tokens = batch_encoding["input_ids"].to(self.device) outputs = self.transformer(input_ids=tokens) z = outputs.last_hidden_state pooled_z = outputs.pooler_output return z, pooled_z def encode(self, text): return self(text) @MODELS.register_module("clip") class ClipEncoder: """ Embeds text prompt into vector representations. Also handles text dropout for classifier-free guidance. 
""" def __init__( self, from_pretrained, model_max_length=77, device="cuda", dtype=torch.float, ): super().__init__() assert from_pretrained is not None, "Please specify the path to the T5 model" self.text_encoder = FrozenCLIPEmbedder(path=from_pretrained, max_length=model_max_length).to(device, dtype) self.y_embedder = None self.model_max_length = model_max_length self.output_dim = self.text_encoder.transformer.config.hidden_size def encode(self, text): _, pooled_embeddings = self.text_encoder.encode(text) y = pooled_embeddings.unsqueeze(1).unsqueeze(1) return dict(y=y) def null(self, n): null_y = self.y_embedder.y_embedding[None].repeat(n, 1, 1)[:, None] return null_y def to(self, dtype): self.text_encoder = self.text_encoder.to(dtype) return self ================================================ FILE: Open-Sora/opensora/models/text_encoder/t5.py ================================================ # Adapted from PixArt # # Copyright (C) 2023 PixArt-alpha/PixArt-alpha # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published # by the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. 
class T5Embedder:
    """Tokenizer plus frozen (eval-mode) ``T5EncoderModel`` used to embed text prompts.

    When ``t5_model_kwargs`` is not supplied, a default set is built: low CPU memory usage,
    the requested dtype and a ``device_map``. With ``use_offload_folder`` set, the first 12
    encoder blocks stay on ``device`` and the remaining blocks, the final layer norm and
    the dropout are mapped to ``"disk"``; otherwise the whole encoder lives on ``device``.

    NOTE(review): block nesting was reconstructed from whitespace-collapsed text; the
    offload branch is assumed to sit inside the ``t5_model_kwargs is None`` branch - confirm.
    """

    def __init__(
        self,
        device,
        from_pretrained=None,
        *,
        cache_dir=None,
        hf_token=None,
        use_text_preprocessing=True,
        t5_model_kwargs=None,
        torch_dtype=None,
        use_offload_folder=None,
        model_max_length=120,
        local_files_only=False,
    ):
        self.device = torch.device(device)
        self.torch_dtype = torch_dtype or torch.bfloat16
        self.cache_dir = cache_dir

        if t5_model_kwargs is None:
            t5_model_kwargs = {
                "low_cpu_mem_usage": True,
                "torch_dtype": self.torch_dtype,
            }

            if use_offload_folder is None:
                device_map = {
                    "shared": self.device,
                    "encoder": self.device,
                }
            else:
                t5_model_kwargs["offload_folder"] = use_offload_folder
                # Same key order as a hand-written map: embeddings, blocks 0-11 on the
                # device, blocks 12-23 on disk, then the trailing norm and dropout.
                device_map = {
                    "shared": self.device,
                    "encoder.embed_tokens": self.device,
                }
                for block_idx in range(12):
                    device_map[f"encoder.block.{block_idx}"] = self.device
                for block_idx in range(12, 24):
                    device_map[f"encoder.block.{block_idx}"] = "disk"
                device_map["encoder.final_layer_norm"] = "disk"
                device_map["encoder.dropout"] = "disk"
            t5_model_kwargs["device_map"] = device_map

        self.use_text_preprocessing = use_text_preprocessing
        self.hf_token = hf_token

        self.tokenizer = AutoTokenizer.from_pretrained(
            from_pretrained,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
        )
        encoder = T5EncoderModel.from_pretrained(
            from_pretrained,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
            **t5_model_kwargs,
        )
        self.model = encoder.eval()
        self.model_max_length = model_max_length

    def get_text_embeddings(self, texts):
        """Tokenize ``texts`` (padded/truncated to ``model_max_length``) and encode them.

        Returns:
            ``(embeddings, attention_mask)``: the detached ``last_hidden_state`` of the
            encoder and the attention mask, both on ``self.device``.
        """
        encoded = self.tokenizer(
            texts,
            max_length=self.model_max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )

        input_ids = encoded["input_ids"].to(self.device)
        attention_mask = encoded["attention_mask"].to(self.device)
        with torch.no_grad():
            model_out = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )
            text_encoder_embs = model_out["last_hidden_state"].detach()
        return text_encoder_embs, attention_mask
def clean_caption(caption):
    """Normalize a raw caption string before it is tokenized.

    The caption is URL-unquoted, lower-cased and then passed through a fixed sequence of
    regex substitutions that drop URLs, HTML markup, @-handles, CJK ranges, IP addresses,
    ids, file names and shop boilerplate, and that normalize dashes, quotes and whitespace.
    The order of the substitutions matters; later patterns assume earlier ones already ran.

    Three patterns are written out in their literal form (``<person>``, ``&quot;?`` and
    ``&amp``). Unescaped to ``""``, ``"?`` and ``&`` they would insert "person" between
    every character, be a syntax error, and strip every ampersand, respectively.

    Args:
        caption: any object; it is converted with ``str()`` first.

    Returns:
        The cleaned, stripped caption string.
    """
    import urllib.parse as ul

    from bs4 import BeautifulSoup

    caption = str(caption)
    caption = ul.unquote_plus(caption)
    caption = caption.strip().lower()
    caption = re.sub("<person>", "person", caption)
    # urls:
    caption = re.sub(
        r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    caption = re.sub(
        r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    # html:
    caption = BeautifulSoup(caption, features="html.parser").text

    # @<nickname>
    caption = re.sub(r"@[\w\d]+\b", "", caption)

    # 31C0—31EF CJK Strokes
    # 31F0—31FF Katakana Phonetic Extensions
    # 3200—32FF Enclosed CJK Letters and Months
    # 3300—33FF CJK Compatibility
    # 3400—4DBF CJK Unified Ideographs Extension A
    # 4DC0—4DFF Yijing Hexagram Symbols
    # 4E00—9FFF CJK Unified Ideographs
    caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
    caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
    caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
    caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
    caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
    caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
    caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
    #######################################################

    # all types of dash --> "-"
    caption = re.sub(
        r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
        "-",
        caption,
    )

    # normalize quotation marks to a single standard
    caption = re.sub(r"[`´«»“”¨]", '"', caption)
    caption = re.sub(r"[‘’]", "'", caption)

    # &quot;
    caption = re.sub(r"&quot;?", "", caption)
    # &amp
    caption = re.sub(r"&amp", "", caption)

    # ip adresses:
    caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)

    # article ids:
    caption = re.sub(r"\d:\d\d\s+$", "", caption)

    # \n
    caption = re.sub(r"\\n", " ", caption)

    # "#123"
    caption = re.sub(r"#\d{1,3}\b", "", caption)
    # "#12345.."
    caption = re.sub(r"#\d{5,}\b", "", caption)
    # "123456.."
    caption = re.sub(r"\b\d{6,}\b", "", caption)
    # filenames:
    caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)

    #
    caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
    caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""

    caption = re.sub(BAD_PUNCT_REGEX, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
    caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "

    # this-is-my-cute-cat / this_is_my_cute_cat
    regex2 = re.compile(r"(?:\-|\_)")
    if len(re.findall(regex2, caption)) > 3:
        caption = re.sub(regex2, " ", caption)

    caption = basic_clean(caption)

    caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
    caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
    caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231

    caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
    caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
    caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
    caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
    caption = re.sub(r"\bpage\s+\d+\b", "", caption)

    caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...

    caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)

    caption = re.sub(r"\b\s+\:\s+", r": ", caption)
    caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
    caption = re.sub(r"\s+", " ", caption)

    # NOTE: a bare `caption.strip()` whose result was discarded used to sit here. It is
    # dropped rather than "fixed" so the output stays identical to the established
    # cleaning; the patterns below therefore still see possible edge whitespace.

    caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
    caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
    caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
    caption = re.sub(r"^\.\S+$", "", caption)

    return caption.strip()
class BlurPool3D(nn.Module):
    """Anti-aliased 3D downsampling: pad, blur with a fixed binomial kernel, then stride.

    The blur kernel is the outer product of a 1D binomial filter with itself along the
    three trailing axes, normalized to sum to 1 and applied depth-wise (one copy per
    channel, ``groups = channels``).

    Args:
        channels: number of input channels (the filter is repeated once per channel).
        pad_type: ``"refl"``/``"reflect"``, ``"repl"``/``"replicate"`` or ``"zero"``.
        filt_size: binomial filter length, 1 to 7.
        stride: downsampling stride applied to all three trailing axes.
        pad_off: extra padding added on every side.
        device, dtype: placement of the registered ``filt`` buffer.

    Raises:
        ValueError: for an unsupported ``filt_size`` or ``pad_type``.
    """

    # 1D binomial taps per supported filter size.
    _BINOMIAL_TAPS = {
        1: [1.0],
        2: [1.0, 1.0],
        3: [1.0, 2.0, 1.0],
        4: [1.0, 3.0, 3.0, 1.0],
        5: [1.0, 4.0, 6.0, 4.0, 1.0],
        6: [1.0, 5.0, 10.0, 10.0, 5.0, 1.0],
        7: [1.0, 6.0, 15.0, 20.0, 15.0, 6.0, 1.0],
    }

    def __init__(
        self,
        channels,
        pad_type="reflect",
        filt_size=3,
        stride=2,
        pad_off=0,
        device="cpu",
        dtype=torch.bfloat16,
    ):
        super(BlurPool3D, self).__init__()
        if filt_size not in self._BINOMIAL_TAPS:
            # Without this check the tap array would simply be unbound further down.
            raise ValueError(f"filt_size must be one of {sorted(self._BINOMIAL_TAPS)}, got {filt_size}")

        self.filt_size = filt_size
        self.pad_off = pad_off
        pad_lo = int(1.0 * (filt_size - 1) / 2)
        pad_hi = int(np.ceil(1.0 * (filt_size - 1) / 2))
        # (lo, hi) for each of the three trailing axes, plus the optional offset.
        self.pad_sizes = [pad_size + pad_off for pad_size in (pad_lo, pad_hi) * 3]
        self.stride = stride
        self.off = int((self.stride - 1) / 2.0)
        self.channels = channels

        a = np.array(self._BINOMIAL_TAPS[filt_size])
        filt_2d = a[:, None] * a[None, :]
        filt_3d = torch.Tensor(a[:, None, None] * filt_2d[None, :, :]).to(device, dtype)

        filt = filt_3d / torch.sum(filt_3d)  # SCH: modified to it 3D
        self.register_buffer("filt", filt[None, None, :, :, :].repeat((self.channels, 1, 1, 1, 1)))

        self.pad = self._make_pad_layer(pad_type, tuple(self.pad_sizes))

    @staticmethod
    def _make_pad_layer(pad_type, pad_sizes):
        """Build the 3D padding module for ``pad_type``.

        Replaces a call to a ``get_pad_layer`` helper that this module neither defines
        nor imports, which made construction fail with NameError.
        """
        if pad_type in ("refl", "reflect"):
            return nn.ReflectionPad3d(pad_sizes)
        if pad_type in ("repl", "replicate"):
            return nn.ReplicationPad3d(pad_sizes)
        if pad_type == "zero":
            return nn.ConstantPad3d(pad_sizes, 0.0)
        raise ValueError(f"Pad type [{pad_type}] not recognized")

    def forward(self, inp):
        """Downsample ``inp`` (channels at dim 1, three trailing axes) by ``self.stride``."""
        if self.filt_size == 1:
            # A length-1 filter is the identity, so only pad (if requested) and stride.
            # All three trailing axes are strided, matching the conv3d path below.
            x = inp if self.pad_off == 0 else self.pad(inp)
            return x[:, :, :: self.stride, :: self.stride, :: self.stride]
        return F.conv3d(self.pad(inp), self.filt, stride=self.stride, groups=inp.shape[1])
class NLayerDiscriminator3D(nn.Module):
    """3D PatchGAN discriminator: a stack of ``Conv3d`` / ``BatchNorm3d`` / ``LeakyReLU`` layers."""

    def __init__(self, input_nc=1, ndf=64, n_layers=3, use_actnorm=False):
        """
        Build the convolutional stack stored in ``self.main``.

        Parameters:
            input_nc (int)     -- number of channels of the input volumes
            ndf (int)          -- number of filters produced by the first conv layer
            n_layers (int)     -- number of conv layers in the discriminator
            use_actnorm (bool) -- actnorm is not implemented; passing True raises
                                  NotImplementedError
        """
        super(NLayerDiscriminator3D, self).__init__()
        if use_actnorm:
            raise NotImplementedError("Not implemented.")
        norm_layer = nn.BatchNorm3d

        # Convs followed by BatchNorm3d skip their bias.
        norm_cls = norm_layer.func if type(norm_layer) == functools.partial else norm_layer
        use_bias = norm_cls != nn.BatchNorm3d

        kw = 4
        padw = 1
        layers = [
            nn.Conv3d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw),
            nn.LeakyReLU(0.2, True),
        ]

        # Channel multiplier doubles per layer, capped at 8.
        mult = 1
        for depth in range(1, n_layers):
            prev_mult, mult = mult, min(2**depth, 8)
            layers.append(
                nn.Conv3d(
                    ndf * prev_mult,
                    ndf * mult,
                    kernel_size=(kw, kw, kw),
                    stride=(1, 2, 2),
                    padding=padw,
                    bias=use_bias,
                )
            )
            layers.append(norm_layer(ndf * mult))
            layers.append(nn.LeakyReLU(0.2, True))

        prev_mult, mult = mult, min(2**n_layers, 8)
        layers.append(
            nn.Conv3d(
                ndf * prev_mult,
                ndf * mult,
                kernel_size=(kw, kw, kw),
                stride=1,
                padding=padw,
                bias=use_bias,
            )
        )
        layers.append(norm_layer(ndf * mult))
        layers.append(nn.LeakyReLU(0.2, True))

        # output 1 channel prediction map
        layers.append(nn.Conv3d(ndf * mult, 1, kernel_size=kw, stride=1, padding=padw))
        self.main = nn.Sequential(*layers)

    def forward(self, input):
        """Run ``input`` through the convolutional stack."""
        return self.main(input)
""" def __init__( self, image_size=(128, 128), num_frames=17, in_channels=3, filters=128, channel_multipliers=(2, 4, 4, 4, 4), num_groups=32, dtype=torch.bfloat16, device="cpu", ): super().__init__() self.dtype = dtype self.input_size = cast_tuple(image_size, 2) self.filters = filters self.activation_fn = nn.LeakyReLU(negative_slope=0.2) self.channel_multipliers = channel_multipliers self.conv1 = nn.Conv3d( in_channels, self.filters, (3, 3, 3), padding=1, device=device, dtype=dtype ) # NOTE: init to xavier_uniform prev_filters = self.filters # record in_channels self.num_blocks = len(self.channel_multipliers) self.res_block_list = nn.ModuleList([]) for i in range(self.num_blocks): filters = self.filters * self.channel_multipliers[i] self.res_block_list.append( ResBlockDown(prev_filters, filters, self.activation_fn, device=device, dtype=dtype).apply( xavier_uniform_weight_init ) ) prev_filters = filters # update in_channels self.conv2 = nn.Conv3d( prev_filters, prev_filters, (3, 3, 3), padding=1, device=device, dtype=dtype ) # NOTE: init to xavier_uniform # torch.nn.init.xavier_uniform_(self.conv2.weight) self.norm1 = nn.GroupNorm(num_groups, prev_filters, dtype=dtype, device=device) scale_factor = 2**self.num_blocks if num_frames % scale_factor != 0: # SCH: NOTE: has first frame which would be padded before usage time_scaled = num_frames // scale_factor + 1 else: time_scaled = num_frames / scale_factor assert ( self.input_size[0] % scale_factor == 0 ), f"image width {self.input_size[0]} is not divisible by scale factor {scale_factor}" assert ( self.input_size[1] % scale_factor == 0 ), f"image height {self.input_size[1]} is not divisible by scale factor {scale_factor}" w_scaled, h_scaled = self.input_size[0] / scale_factor, self.input_size[1] / scale_factor in_features = int(prev_filters * time_scaled * w_scaled * h_scaled) # (C*T*W*H) self.linear1 = nn.Linear(in_features, prev_filters, device=device, dtype=dtype) # NOTE: init to xavier_uniform self.linear2 = 
def load_checkpoint_with_inflation(model, ckpt_path):
    """
    pre-train using image, then inflate to 3D videos

    For ``.pt`` / ``.pth`` checkpoints, every 4D checkpoint tensor whose shape equals one
    temporal slice of the matching 5D model parameter is "centrally inflated": it is
    written into the middle temporal index of a zero tensor shaped like the model
    parameter. The resulting state dict is loaded non-strictly and the missing and
    unexpected keys are printed. Any other path is delegated to ``load_checkpoint``.

    This looks parameters up through ``model.state_dict()`` and stores each inflated
    tensor back into the checkpoint dict; testing ``key in model`` or indexing the module
    itself is not supported by ``nn.Module``, and an inflated tensor that is never stored
    has no effect.

    Args:
        model: the ``nn.Module`` to load into.
        ckpt_path: checkpoint path.
    """
    if not ckpt_path.endswith((".pt", ".pth")):
        load_checkpoint(model, ckpt_path)  # use the default function
        return

    # assumes find_model returns a mutable mapping of parameter name -> tensor; TODO confirm
    state_dict = find_model(ckpt_path)
    model_state = model.state_dict()
    with torch.no_grad():
        for key in list(state_dict):
            target = model_state.get(key)
            weight = state_dict[key]
            # Only a 2D weight (4 dims) paired with a 3D weight (5 dims) is inflated.
            if target is None or target.dim() != 5 or weight.dim() != 4:
                continue
            if weight.size() != target[:, :, 0, :, :].size():
                continue
            # central inflation along the temporal dimension
            inflated = torch.zeros_like(target)
            centre = target.size(2) // 2
            inflated[:, :, centre, :, :] = weight
            state_dict[key] = inflated

    missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
    print(f"Missing keys: {missing_keys}")
    print(f"Unexpected keys: {unexpected_keys}")
def sigmoid_cross_entropy_with_logits(labels, logits):
    """Element-wise sigmoid cross-entropy computed directly from logits.

    Uses the overflow-safe form ``max(x, 0) - x * z + log(1 + exp(-abs(x)))`` with
    ``x = logits`` and ``z = labels``. No reduction is applied.
    """
    non_negative = logits >= torch.zeros_like(logits, dtype=logits.dtype)
    # max(x, 0)
    positive_part = torch.where(non_negative, logits, torch.zeros_like(logits, dtype=logits.dtype))
    # -abs(x), so the exponential below never overflows
    minus_abs = torch.where(non_negative, -logits, logits)
    log_term = torch.log1p(torch.exp(minus_abs))
    return positive_part - logits * labels + log_term
gradient_penalty_fn(images, output): gradients = torch.autograd.grad( outputs=output, inputs=images, grad_outputs=torch.ones(output.size(), device=images.device), create_graph=True, retain_graph=True, only_inputs=True, )[0] gradients = rearrange(gradients, "b ... -> b (...)") return ((gradients.norm(2, dim=1) - 1) ** 2).mean() class VAELoss(nn.Module): def __init__( self, logvar_init=0.0, perceptual_loss_weight=0.1, kl_loss_weight=0.000001, device="cpu", dtype="bf16", ): super().__init__() if type(dtype) == str: if dtype == "bf16": dtype = torch.bfloat16 elif dtype == "fp16": dtype = torch.float16 else: raise NotImplementedError(f"dtype: {dtype}") # KL Loss self.kl_loss_weight = kl_loss_weight # Perceptual Loss self.perceptual_loss_fn = LPIPS().eval().to(device, dtype) self.perceptual_loss_weight = perceptual_loss_weight self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init) def forward( self, video, recon_video, posterior, nll_weights=None, no_perceptual=False, ): video = rearrange(video, "b c t h w -> (b t) c h w").contiguous() recon_video = rearrange(recon_video, "b c t h w -> (b t) c h w").contiguous() # reconstruction loss recon_loss = torch.abs(video - recon_video) # perceptual loss if self.perceptual_loss_weight is not None and self.perceptual_loss_weight > 0.0 and not no_perceptual: # handle channels channels = video.shape[1] assert channels in {1, 3} if channels == 1: input_vgg_input = repeat(video, "b 1 h w -> b c h w", c=3) recon_vgg_input = repeat(recon_video, "b 1 h w -> b c h w", c=3) else: input_vgg_input = video recon_vgg_input = recon_video perceptual_loss = self.perceptual_loss_fn(input_vgg_input, recon_vgg_input) recon_loss = recon_loss + self.perceptual_loss_weight * perceptual_loss nll_loss = recon_loss / torch.exp(self.logvar) + self.logvar weighted_nll_loss = nll_loss if nll_weights is not None: weighted_nll_loss = nll_weights * nll_loss weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0] nll_loss = 
def adopt_weight(weight, global_step, threshold=0, value=0.0):
    """Return ``value`` while ``global_step`` is below ``threshold``, otherwise ``weight``."""
    return value if global_step < threshold else weight
class LeCamEMA:
    """Holds exponential moving averages of the real and fake discriminator predictions."""

    def __init__(self, ema_real=0.0, ema_fake=0.0, decay=0.999, dtype=torch.bfloat16, device="cpu"):
        self.decay = decay
        # Both running values are stored as tensors on the requested device/dtype.
        self.ema_real = torch.tensor(ema_real).to(device, dtype)
        self.ema_fake = torch.tensor(ema_fake).to(device, dtype)

    def update(self, ema_real, ema_fake):
        """Blend the new observations into the running averages using ``decay``."""
        keep = self.decay
        fresh = 1 - self.decay
        self.ema_real = self.ema_real * keep + ema_real * fresh
        self.ema_fake = self.ema_fake * keep + ema_fake * fresh

    def get(self):
        """Return the current ``(ema_real, ema_fake)`` pair."""
        return (self.ema_real, self.ema_fake)
def md5_hash(path):
    """Return the hexadecimal MD5 digest of the file at ``path``.

    The file is hashed in fixed-size chunks, so memory use does not grow with file size.
    The digest is used as a download integrity check, not for security.
    """
    digest = hashlib.md5()
    with open(path, "rb") as f:
        # iter() with a sentinel stops at the first empty read, i.e. end of file.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()
class LPIPS(nn.Module):
    """Learned perceptual metric.

    Features from five ``vgg16`` slices are channel-normalized, the squared differences
    are passed through one ``NetLinLayer`` (1x1 conv) per slice, averaged spatially and
    summed. Weights come from the ``vgg_lpips`` checkpoint, and every parameter is frozen.
    """

    # Directory the LPIPS checkpoint is stored in / downloaded to.
    _CKPT_ROOT = "pretrained_models/taming/modules/autoencoder/lpips"

    def __init__(self, use_dropout=True):
        super().__init__()
        self.scaling_layer = ScalingLayer()
        self.chns = [64, 128, 256, 512, 512]  # vg16 features
        self.net = vgg16(pretrained=True, requires_grad=False)
        # Attribute names lin0..lin4 are part of the checkpoint's key layout; keep them.
        self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
        self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)
        self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)
        self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout)
        self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout)
        self.load_from_pretrained()
        for param in self.parameters():
            param.requires_grad = False

    def load_from_pretrained(self, name="vgg_lpips"):
        """Load the named checkpoint (fetched by ``get_ckpt_path`` if absent) non-strictly."""
        ckpt = get_ckpt_path(name, self._CKPT_ROOT)
        self.load_state_dict(torch.load(ckpt, map_location=torch.device("cpu")), strict=False)
        # print("loaded pretrained LPIPS loss from {}".format(ckpt))

    @classmethod
    def from_pretrained(cls, name="vgg_lpips"):
        """Alternate constructor returning a model with the ``name`` weights loaded.

        Raises:
            NotImplementedError: for any ``name`` other than ``"vgg_lpips"``.
        """
        if name != "vgg_lpips":
            raise NotImplementedError
        model = cls()
        # Reuses the instance loader so ``get_ckpt_path`` receives its required ``root``
        # argument; calling it with the name alone raises TypeError.
        model.load_from_pretrained(name)
        return model

    def forward(self, input, target):
        """Return the perceptual distance between ``input`` and ``target``.

        The result keeps the reduced spatial dims (``keepdim=True``).
        """
        in0_input, in1_input = (self.scaling_layer(input), self.scaling_layer(target))
        outs0, outs1 = self.net(in0_input), self.net(in1_input)
        lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4]

        res = []
        for kk in range(len(self.chns)):
            feat0, feat1 = normalize_tensor(outs0[kk]), normalize_tensor(outs1[kk])
            diff = (feat0 - feat1) ** 2
            res.append(spatial_average(lins[kk].model(diff), keepdim=True))

        # Sum the per-slice distances in slice order.
        val = res[0]
        for term in res[1:]:
            val = val + term
        return val
class DiagonalGaussianDistribution(object):
    """Diagonal Gaussian described by concatenated mean and log-variance.

    ``parameters`` is split in two equal halves along dim 1: the first half is
    the mean, the second the log-variance (clamped to ``[-30, 20]``).

    Args:
        parameters: Tensor whose dim 1 holds ``[mean, logvar]``.
        deterministic: When true, the variance and standard deviation are
            zeroed so that sampling returns the mean.
    """

    def __init__(
        self,
        parameters,
        deterministic=False,
    ):
        self.parameters = parameters
        self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
        self.deterministic = deterministic
        self.std = torch.exp(0.5 * self.logvar)
        self.var = torch.exp(self.logvar)
        if self.deterministic:
            self.var = self.std = torch.zeros_like(self.mean)

    def _non_batch_dims(self):
        """Return every dimension except the batch dimension."""
        return tuple(range(1, self.mean.ndim))

    def _zero(self):
        """Return a one-element zero tensor on the distribution's device/dtype."""
        return torch.zeros(1, device=self.mean.device, dtype=self.mean.dtype)

    def sample(self):
        """Draw ``mean + std * eps`` with ``eps`` standard normal."""
        # Noise is generated exactly as before (default generator, then moved)
        # so that seeded runs keep producing the same samples.
        noise = torch.randn(self.mean.shape).to(device=self.parameters.device, dtype=self.mean.dtype)
        return self.mean + self.std * noise

    def kl(self, other=None):
        """Return the KL divergence to ``other``, summed over non-batch dims.

        Args:
            other: Another ``DiagonalGaussianDistribution``; when ``None`` the
                reference is a standard normal distribution.

        Returns:
            One value per batch element, or a single zero when deterministic.
        """
        if self.deterministic:
            return self._zero()
        dims = self._non_batch_dims()
        if other is None:
            # SCH: assumes other is a standard normal distribution
            return 0.5 * torch.sum(torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, dim=dims)
        return 0.5 * torch.sum(
            torch.pow(self.mean - other.mean, 2) / other.var
            + self.var / other.var
            - 1.0
            - self.logvar
            + other.logvar,
            dim=dims,
        )

    def nll(self, sample, dims=None):
        """Return the negative log-likelihood of ``sample``.

        Args:
            sample: Tensor broadcastable against the mean.
            dims: Dimensions to sum over; defaults to all non-batch dims.

        Returns:
            The summed NLL, or a single zero when deterministic.
        """
        if self.deterministic:
            return self._zero()
        if dims is None:
            dims = self._non_batch_dims()
        # Plain Python float keeps the arithmetic in the tensor's own dtype.
        logtwopi = float(np.log(2.0 * np.pi))
        return 0.5 * torch.sum(logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, dim=tuple(dims))

    def mode(self):
        """Return the mode of the distribution, i.e. its mean."""
        return self.mean
@MODELS.register_module()
class VideoAutoencoderKL(nn.Module):
    """Apply a diffusers ``AutoencoderKL`` to every frame of a video.

    Inputs and outputs use the layout ``(B, C, T, H, W)``; internally the time
    axis is folded into the batch axis. When ``micro_batch_size`` is set the
    folded batch is processed in slices of that size.
    """

    def __init__(
        self,
        from_pretrained=None,
        micro_batch_size=None,
        cache_dir=None,
        local_files_only=False,
        subfolder=None,
        scaling_factor=0.18215,
    ):
        super().__init__()
        self.module = AutoencoderKL.from_pretrained(
            from_pretrained,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
            subfolder=subfolder,
        )
        self.out_channels = self.module.config.latent_channels
        self.patch_size = (1, 8, 8)
        self.micro_batch_size = micro_batch_size
        self.scaling_factor = scaling_factor

    def _encode_frames(self, frames):
        """Encode a ``(N, C, H, W)`` batch and scale the sampled latents in place."""
        return self.module.encode(frames).latent_dist.sample().mul_(self.scaling_factor)

    def _decode_frames(self, latents):
        """Decode a ``(N, C, H, W)`` batch of scaled latents."""
        return self.module.decode(latents / self.scaling_factor).sample

    def _apply_framewise(self, fn, x):
        """Fold time into batch, run ``fn`` (optionally in slices), unfold again."""
        batch = x.shape[0]
        frames = rearrange(x, "B C T H W -> (B T) C H W")
        step = self.micro_batch_size
        if step is None:
            out = fn(frames)
        else:
            # NOTE: cannot be used for training
            pieces = [fn(frames[start : start + step]) for start in range(0, frames.shape[0], step)]
            out = torch.cat(pieces, dim=0)
        return rearrange(out, "(B T) C H W -> B C T H W", B=batch)

    def encode(self, x):
        """Encode a video ``x`` of shape ``(B, C, T, H, W)`` into scaled latents."""
        return self._apply_framewise(self._encode_frames, x)

    def decode(self, x, **kwargs):
        """Decode latents ``x`` of shape ``(B, C, T, H, W)``; ``kwargs`` are ignored."""
        return self._apply_framewise(self._decode_frames, x)

    def get_latent_size(self, input_size):
        """Return ``input_size`` floor-divided by ``patch_size``, keeping ``None`` entries."""
        return [
            None if input_size[i] is None else input_size[i] // self.patch_size[i]
            for i in range(3)
        ]

    @property
    def device(self):
        """Device of the first parameter."""
        return next(self.parameters()).device

    @property
    def dtype(self):
        """Dtype of the first parameter."""
        return next(self.parameters()).dtype
class VideoAutoencoderPipeline(PreTrainedModel):
    """Two-stage video autoencoder: a spatial VAE followed by a temporal VAE.

    Both stages are built from the config through ``build_module``. When
    ``micro_frame_size`` is set, the temporal stage runs on chunks of that
    many frames along the time axis. Outside of loss computation, latents are
    normalised with the ``shift`` and ``scale`` buffers.
    """

    config_class = VideoAutoencoderPipelineConfig

    def __init__(self, config: VideoAutoencoderPipelineConfig):
        super().__init__(config=config)
        self.spatial_vae = build_module(config.vae_2d, MODELS)
        self.temporal_vae = build_module(config.vae_temporal, MODELS)
        self.cal_loss = config.cal_loss
        self.micro_frame_size = config.micro_frame_size
        # Number of latent frames that one micro chunk of input frames maps to.
        self.micro_z_frame_size = self.temporal_vae.get_latent_size([config.micro_frame_size, None, None])[0]

        if config.freeze_vae_2d:
            for param in self.spatial_vae.parameters():
                param.requires_grad = False

        self.out_channels = self.temporal_vae.out_channels

        # normalization parameters; per-channel values are reshaped so they
        # broadcast along dim 1 of 5-D latents.
        scale = torch.tensor(config.scale)
        shift = torch.tensor(config.shift)
        if len(scale.shape) > 0:
            scale = scale[None, :, None, None, None]
        if len(shift.shape) > 0:
            shift = shift[None, :, None, None, None]
        self.register_buffer("scale", scale)
        self.register_buffer("shift", shift)

    def encode(self, x):
        """Encode a video into temporal latents.

        Returns:
            ``(z, posterior, x_z)`` when ``cal_loss`` is set, otherwise the
            normalised latent ``(z - shift) / scale``.
        """
        x_z = self.spatial_vae.encode(x)

        if self.micro_frame_size is None:
            posterior = self.temporal_vae.encode(x_z)
            z = posterior.sample()
        else:
            z_list = []
            for i in range(0, x_z.shape[2], self.micro_frame_size):
                x_z_bs = x_z[:, :, i : i + self.micro_frame_size]
                posterior = self.temporal_vae.encode(x_z_bs)
                z_list.append(posterior.sample())
            z = torch.cat(z_list, dim=2)

        if self.cal_loss:
            # NOTE: in micro-frame mode ``posterior`` is that of the last chunk only.
            return z, posterior, x_z
        return (z - self.shift) / self.scale

    def decode(self, z, num_frames=None):
        """Decode latents back to a video.

        Args:
            z: Latent tensor; de-normalised first unless ``cal_loss`` is set.
            num_frames: Number of frames of the spatial latent to reconstruct.
                Required when ``micro_frame_size`` is set.

        Returns:
            ``(x, x_z)`` when ``cal_loss`` is set, otherwise ``x``.

        Raises:
            ValueError: If chunked decoding is requested without ``num_frames``.
        """
        if not self.cal_loss:
            z = z * self.scale.to(z.dtype) + self.shift.to(z.dtype)

        if self.micro_frame_size is None:
            x_z = self.temporal_vae.decode(z, num_frames=num_frames)
        else:
            if num_frames is None:
                raise ValueError("num_frames must be provided when micro_frame_size is set")
            remaining = num_frames  # frames still to be reconstructed
            x_z_list = []
            for i in range(0, z.size(2), self.micro_z_frame_size):
                z_bs = z[:, :, i : i + self.micro_z_frame_size]
                x_z_bs = self.temporal_vae.decode(z_bs, num_frames=min(self.micro_frame_size, remaining))
                x_z_list.append(x_z_bs)
                remaining -= self.micro_frame_size
            x_z = torch.cat(x_z_list, dim=2)
        x = self.spatial_vae.decode(x_z)

        if self.cal_loss:
            return x, x_z
        return x

    def forward(self, x):
        """Run encode then decode; only valid when ``cal_loss`` is set."""
        assert self.cal_loss, "This method is only available when cal_loss is True"
        z, posterior, x_z = self.encode(x)
        x_rec, x_z_rec = self.decode(z, num_frames=x_z.shape[2])
        return x_rec, x_z_rec, z, posterior, x_z

    def get_latent_size(self, input_size):
        """Return the latent size for ``input_size`` given as ``[T, H, W]``.

        With micro-frame chunking the temporal size is the per-chunk latent
        length times the number of full chunks, plus the latent length of any
        remaining partial chunk.
        """
        if self.micro_frame_size is None or input_size[0] is None:
            return self.temporal_vae.get_latent_size(self.spatial_vae.get_latent_size(input_size))

        sub_input_size = [self.micro_frame_size, input_size[1], input_size[2]]
        sub_latent_size = self.temporal_vae.get_latent_size(self.spatial_vae.get_latent_size(sub_input_size))
        sub_latent_size[0] = sub_latent_size[0] * (input_size[0] // self.micro_frame_size)
        remain_temporal_size = [input_size[0] % self.micro_frame_size, None, None]
        if remain_temporal_size[0] > 0:
            remain_size = self.temporal_vae.get_latent_size(remain_temporal_size)
            sub_latent_size[0] += remain_size[0]
        return sub_latent_size

    def get_temporal_last_layer(self):
        """Return the weight of the temporal decoder's output convolution."""
        return self.temporal_vae.decoder.conv_out.conv.weight

    @property
    def device(self):
        """Device of the first parameter."""
        return next(self.parameters()).device

    @property
    def dtype(self):
        """Dtype of the first parameter."""
        return next(self.parameters()).dtype
def pad_at_dim(t, pad, dim=-1):
    """Constant-pad tensor ``t`` along a single dimension.

    Args:
        t: Tensor to pad.
        pad: ``(before, after)`` padding amounts for dimension ``dim``.
        dim: Dimension to pad; negative values count from the end.

    Returns:
        The padded tensor.
    """
    # F.pad lists padding pairs starting from the last dimension, so every
    # dimension to the right of ``dim`` needs an explicit (0, 0) pair.
    if dim < 0:
        trailing_dims = -dim - 1
    else:
        trailing_dims = t.ndim - dim - 1
    padding = [0, 0] * trailing_dims
    padding.extend(pad)
    return F.pad(t, tuple(padding), mode="constant")
class Encoder(nn.Module):
    """Stack of causal 3D residual stages producing a latent feature map.

    One stage of ``num_res_blocks`` ``ResBlock`` modules is built per entry of
    ``channel_multipliers``. Between consecutive stages sits either a causal
    convolution with temporal stride 2 (where ``temporal_downsample`` is true)
    or an identity. A final group of residual blocks, a GroupNorm, the
    activation, and a 1x1x1 convolution to ``latent_embed_dim`` channels
    follow.
    """

    def __init__(
        self,
        in_out_channels=4,
        latent_embed_dim=512,  # num channels for latent vector
        filters=128,
        num_res_blocks=4,
        channel_multipliers=(1, 2, 2, 4),
        temporal_downsample=(False, True, True),
        num_groups=32,  # for nn.GroupNorm
        activation_fn="swish",
    ):
        super().__init__()
        self.filters = filters
        self.num_res_blocks = num_res_blocks
        self.num_blocks = len(channel_multipliers)
        self.channel_multipliers = channel_multipliers
        self.temporal_downsample = temporal_downsample
        self.num_groups = num_groups
        self.embedding_dim = latent_embed_dim

        self.activation_fn = get_activation_fn(activation_fn)
        self.activate = self.activation_fn()
        self.conv_fn = CausalConv3d
        self.block_args = dict(
            conv_fn=self.conv_fn,
            activation_fn=self.activation_fn,
            use_conv_shortcut=False,
            num_groups=self.num_groups,
        )

        # Input convolution.
        self.conv_in = self.conv_fn(
            in_out_channels,
            filters,
            kernel_size=(3, 3, 3),
            bias=False,
        )

        # Residual stages and the transitions between them.
        self.block_res_blocks = nn.ModuleList([])
        self.conv_blocks = nn.ModuleList([])

        in_ch = out_ch = self.filters  # in_ch tracks the running channel count
        last_stage = self.num_blocks - 1
        for stage, multiplier in enumerate(self.channel_multipliers):
            out_ch = self.filters * multiplier
            stage_blocks = nn.ModuleList([])
            for _ in range(self.num_res_blocks):
                stage_blocks.append(ResBlock(in_ch, out_ch, **self.block_args))
                in_ch = out_ch
            self.block_res_blocks.append(stage_blocks)

            if stage == last_stage:
                continue  # no transition after the final stage
            if self.temporal_downsample[stage]:
                transition = self.conv_fn(in_ch, out_ch, kernel_size=(3, 3, 3), strides=(2, 1, 1))
            else:
                # No temporal downsampling here: keep a placeholder so that
                # conv_blocks stays index-aligned with the stages.
                transition = nn.Identity(in_ch)
            self.conv_blocks.append(transition)
            in_ch = out_ch

        # Final residual blocks.
        self.res_blocks = nn.ModuleList([])
        for _ in range(self.num_res_blocks):
            self.res_blocks.append(ResBlock(in_ch, out_ch, **self.block_args))
            in_ch = out_ch

        # MAGVIT uses Group Normalization
        self.norm1 = nn.GroupNorm(self.num_groups, in_ch)
        self.conv2 = self.conv_fn(in_ch, self.embedding_dim, kernel_size=(1, 1, 1), padding="same")

    def forward(self, x):
        """Encode ``x`` through all stages and return the latent feature map."""
        h = self.conv_in(x)

        last_stage = self.num_blocks - 1
        for stage in range(self.num_blocks):
            stage_blocks = self.block_res_blocks[stage]
            for idx in range(self.num_res_blocks):
                h = stage_blocks[idx](h)
            if stage < last_stage:
                h = self.conv_blocks[stage](h)

        for idx in range(self.num_res_blocks):
            h = self.res_blocks[idx](h)

        h = self.activate(self.norm1(h))
        return self.conv2(h)
@MODELS.register_module()
class VAE_Temporal(nn.Module):
    """Temporal VAE built from a causal 3D ``Encoder`` and ``Decoder``.

    The time axis is compressed by ``2 ** sum(temporal_downsample)``. Inputs
    whose length is not a multiple of that factor are padded at the front
    before encoding, and the same number of leading frames is dropped again
    after decoding.
    """

    def __init__(
        self,
        in_out_channels=4,
        latent_embed_dim=4,
        embed_dim=4,
        filters=128,
        num_res_blocks=4,
        channel_multipliers=(1, 2, 2, 4),
        temporal_downsample=(True, True, False),
        num_groups=32,  # for nn.GroupNorm
        activation_fn="swish",
    ):
        super().__init__()

        self.time_downsample_factor = 2 ** sum(temporal_downsample)
        self.patch_size = (self.time_downsample_factor, 1, 1)
        self.out_channels = in_out_channels

        # NOTE: following MAGVIT, conv in bias=False in encoder first conv
        self.encoder = Encoder(
            in_out_channels=in_out_channels,
            latent_embed_dim=latent_embed_dim * 2,  # mean and log-variance
            filters=filters,
            num_res_blocks=num_res_blocks,
            channel_multipliers=channel_multipliers,
            temporal_downsample=temporal_downsample,
            num_groups=num_groups,  # for nn.GroupNorm
            activation_fn=activation_fn,
        )
        self.quant_conv = CausalConv3d(2 * latent_embed_dim, 2 * embed_dim, 1)

        self.post_quant_conv = CausalConv3d(embed_dim, latent_embed_dim, 1)
        self.decoder = Decoder(
            in_out_channels=in_out_channels,
            latent_embed_dim=latent_embed_dim,
            filters=filters,
            num_res_blocks=num_res_blocks,
            channel_multipliers=channel_multipliers,
            temporal_downsample=temporal_downsample,
            num_groups=num_groups,  # for nn.GroupNorm
            activation_fn=activation_fn,
        )

    def _time_padding(self, num_frames):
        """Return how many frames round ``num_frames`` up to a multiple of the factor."""
        return (-num_frames) % self.time_downsample_factor

    def get_latent_size(self, input_size):
        """Return the latent size for ``input_size`` given as ``[T, H, W]``.

        ``None`` entries are passed through; the temporal entry is rounded up
        to a multiple of the downsample factor before division.
        """
        latent_size = []
        for i in range(3):
            size = input_size[i]
            if size is None:
                lsize = None
            elif i == 0:
                lsize = (size + self._time_padding(size)) // self.patch_size[i]
            else:
                lsize = size // self.patch_size[i]
            latent_size.append(lsize)
        return latent_size

    def encode(self, x):
        """Encode ``x`` (time on dim 2) into a ``DiagonalGaussianDistribution``."""
        # Pad at the front so the length is divisible by the downsample factor.
        x = pad_at_dim(x, (self._time_padding(x.shape[2]), 0), dim=2)
        encoded_feature = self.encoder(x)
        moments = self.quant_conv(encoded_feature).to(x.dtype)
        return DiagonalGaussianDistribution(moments)

    def decode(self, z, num_frames=None):
        """Decode latents ``z`` and drop the frames introduced by front padding.

        Args:
            z: Latent tensor.
            num_frames: Frame count of the original input. When ``None`` the
                decoder output is returned without trimming.
        """
        z = self.post_quant_conv(z)
        x = self.decoder(z)
        if num_frames is None:
            return x
        return x[:, :, self._time_padding(num_frames) :]

    def forward(self, x, sample_posterior=True):
        """Encode and decode ``x``.

        Returns:
            ``(recon_video, posterior, z)`` where ``z`` is a sample from the
            posterior, or its mode when ``sample_posterior`` is false.
        """
        posterior = self.encode(x)
        if sample_posterior:
            z = posterior.sample()
        else:
            z = posterior.mode()
        recon_video = self.decode(z, num_frames=x.shape[2])
        return recon_video, posterior, z
def video_to_image(func):
    """Decorate a per-image method so that it also accepts video tensors.

    For a 5-D input ``(B, C, T, H, W)`` the time axis is folded into the batch
    axis, ``func`` is applied (in slices of ``self.micro_batch_size`` when
    that attribute is set), and the result is unfolded again. Any other input
    is forwarded to ``func`` unchanged.

    Args:
        func: Method with signature ``func(self, x, *args, **kwargs)``.

    Returns:
        The wrapped method.
    """
    import functools

    @functools.wraps(func)
    def wrapper(self, x, *args, **kwargs):
        if x.ndim != 5:
            # Image batches need no reshaping. Previously such inputs were
            # handed back without ever calling ``func``.
            return func(self, x, *args, **kwargs)

        B = x.shape[0]
        x = rearrange(x, 'B C T H W -> (B T) C H W')
        # A missing attribute is treated like None (no micro-batching).
        bs = getattr(self, 'micro_batch_size', None)
        if bs is None:
            x = func(self, x, *args, **kwargs)
        else:
            x_out = [func(self, x[i:i + bs], *args, **kwargs) for i in range(0, x.shape[0], bs)]
            x = torch.cat(x_out, dim=0)
        x = rearrange(x, '(B T) C H W -> B C T H W', B=B)
        return x

    return wrapper
class ResnetBlock3D(nn.Module):
    """
    Pre-activation residual block built from nn.Conv3d layers.
    Layout: GroupNorm -> SiLU -> Conv3d, twice, plus a skip connection.
    When the channel count changes, the skip path uses a 1x1x1 convolution.
    Make sure input tensor is of shape [B, C, T, H, W]
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: Optional[int] = None,
        norm_groups: int = 32,
        norm_eps: float = 1e-6,
    ):
        super().__init__()
        if out_channels is None:
            out_channels = in_channels
        self.in_channels = in_channels
        self.out_channels = out_channels

        self.norm1 = torch.nn.GroupNorm(num_groups=norm_groups, num_channels=in_channels, eps=norm_eps, affine=True)
        self.conv1 = nn.Conv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.norm2 = torch.nn.GroupNorm(num_groups=norm_groups, num_channels=out_channels, eps=norm_eps, affine=True)
        self.conv2 = nn.Conv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.act = nn.SiLU()

        # A projection on the skip path is only needed when channels differ.
        self.use_in_shortcut = self.in_channels != out_channels
        self.conv_shortcut = (
            nn.Conv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
            if self.use_in_shortcut
            else None
        )

    def forward(self, x):
        """Return ``shortcut(x) + residual(x)``."""
        hidden = self.conv1(self.act(self.norm1(x)))
        hidden = self.conv2(self.act(self.norm2(hidden)))
        skip = x if self.conv_shortcut is None else self.conv_shortcut(x)
        return skip + hidden
class SpatialUpsample2x(nn.Module):
    """
    Default upsample is F.interpolate(scale_factor=2) + Conv2d(stride=1)
    Accepts input of shape [B, C, T, H, W] or [B, C, H, W]
    Support micro_batch_size
    """

    def __init__(
        self,
        channels: int,
        use_interpolate=True,
        micro_batch_size=None,
    ):
        """
        Args:
            channels: Number of input and output channels.
            use_interpolate: Must be true; the alternative is not implemented.
            micro_batch_size: Optional slice size for the folded batch axis.

        Raises:
            NotImplementedError: If ``use_interpolate`` is false.
        """
        super().__init__()
        self.channels = channels
        self.use_interpolate = use_interpolate
        self.micro_batch_size = micro_batch_size
        if not use_interpolate:
            # The transposed-convolution variant was never reachable: the
            # original raised here before creating the layer.
            raise NotImplementedError("SpatialUpsample2x only supports use_interpolate=True")
        self.conv = nn.Conv2d(self.channels, self.channels, kernel_size=3, padding=1)

    def _forward_chunked(self, x):
        """Apply ``forward_BCHW`` to a 4-D batch, in micro-batches when configured."""
        bs = self.micro_batch_size
        if bs is None:
            return self.forward_BCHW(x)
        x_out = [self.forward_BCHW(x[i:i + bs]) for i in range(0, x.shape[0], bs)]
        return torch.cat(x_out, dim=0)

    def forward(self, x):
        """Upsample H and W by 2; the time axis of a 5-D input is preserved."""
        if x.ndim == 4:
            return self._forward_chunked(x)
        B = x.shape[0]
        x = rearrange(x, 'B C T H W -> (B T) C H W')
        x = self._forward_chunked(x)
        x = rearrange(x, '(B T) C H W -> B C T H W', B=B)
        return x

    def forward_BCHW(self, x):
        """Nearest-neighbour 2x upsampling of a [B, C, H, W] batch, then a 3x3 conv."""
        if not self.use_interpolate:
            raise NotImplementedError("SpatialUpsample2x only supports use_interpolate=True")
        # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
        if x.shape[0] >= 64:
            x = x.contiguous()
        # interpolate tensor of bfloat16 is fixed in pytorch 2.1. see https://github.com/pytorch/pytorch/issues/86679
        x = F.interpolate(x, scale_factor=2.0, mode="nearest")
        x = self.conv(x)
        return x
attention_head_dim, dim_head=attention_head_dim, # rescale_output_factor=output_scale_factor, rescale_output_factor=1.0, eps=norm_eps, norm_num_groups=attn_groups, # spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None, spatial_norm_dim=None, residual_connection=True, bias=True, upcast_softmax=True, _from_deprecated_attn_block=True, ) ) res_blocks.append( ResnetBlock2D( in_channels=in_channels, out_channels=in_channels, norm_eps=norm_eps, norm_groups=norm_groups, ) ) self.attn_blocks = nn.ModuleList(attn_blocks) self.res_blocks = nn.ModuleList(res_blocks) def forward(self, x): has_T = x.ndim == 5 if has_T: B = x.shape[0] x = rearrange(x, 'B C T H W -> (B T) C H W') x = self.res_blocks[0](x) for attn, res_block in zip(self.attn_blocks, self.res_blocks[1:]): if attn is not None: x = attn(x) x = res_block(x) if has_T: x = rearrange(x, '(B T) C H W -> B C T H W', B=B) return x class Encoder(nn.Module): """ default arch is conv_in + blocks + mid_block + out_block Make sure input tensor is of shape [B, C, T, H, W] """ def __init__( self, in_channels=3, out_channels=4, norm_groups=32, norm_eps=1e-6, double_z=True, micro_batch_size=None, ): super().__init__() in_channels_encoder = in_channels out_channels_encoder = out_channels block_out_channels = [128, 256, 512, 512] # conv_in self.conv_in = VideoConv2d( in_channels_encoder, block_out_channels[0], kernel_size=3, stride=1, padding=1, micro_batch_size=micro_batch_size, ) # blocks blocks = [] # the first block: ResnetBlock2D in_channels = block_out_channels[0] out_channels = block_out_channels[0] blocks.append( nn.Sequential( ResnetBlock2D( in_channels=in_channels, out_channels=out_channels, norm_groups=norm_groups, norm_eps=norm_eps, micro_batch_size=micro_batch_size, ), ResnetBlock2D( in_channels=out_channels, out_channels=out_channels, norm_groups=norm_groups, norm_eps=norm_eps, micro_batch_size=micro_batch_size, ), SpatialDownsample2x( channels=out_channels, use_conv=True, 
micro_batch_size=micro_batch_size, ), ) ) # the second block: ResnetBlock2D in_channels = block_out_channels[0] out_channels = block_out_channels[1] blocks.append( nn.Sequential( ResnetBlock2D( in_channels=in_channels, out_channels=out_channels, norm_groups=norm_groups, norm_eps=norm_eps, micro_batch_size=micro_batch_size, ), ResnetBlock2D( in_channels=out_channels, out_channels=out_channels, norm_groups=norm_groups, norm_eps=norm_eps, micro_batch_size=micro_batch_size, ), SpatialDownsample2x( channels=out_channels, use_conv=True, micro_batch_size=micro_batch_size, ), TemporalDownsample2x( channels=out_channels, use_conv=True, ) ) ) # the third block: ResnetBlock3D in_channels = block_out_channels[1] out_channels = block_out_channels[2] blocks.append( nn.Sequential( ResnetBlock3D( in_channels=in_channels, out_channels=out_channels, norm_groups=norm_groups, norm_eps=norm_eps, ), ResnetBlock3D( in_channels=out_channels, out_channels=out_channels, norm_groups=norm_groups, norm_eps=norm_eps, ), SpatialDownsample2x( channels=out_channels, use_conv=True, ), TemporalDownsample2x( channels=out_channels, use_conv=True, ) ) ) # the fourth block: ResnetBlock3D in_channels = block_out_channels[2] out_channels = block_out_channels[3] blocks.append( nn.Sequential( ResnetBlock3D( in_channels=in_channels, out_channels=out_channels, norm_groups=norm_groups, norm_eps=norm_eps, ), ResnetBlock3D( in_channels=out_channels, out_channels=out_channels, norm_groups=norm_groups, norm_eps=norm_eps, ), ) ) self.blocks = nn.ModuleList(blocks) # mid_block in_channels = block_out_channels[-1] self.mid_block = UNetMidBlock2D( in_channels=in_channels, num_layers=1, norm_groups=norm_groups, norm_eps=norm_eps, add_attention=True, attention_head_dim=in_channels, ) # out_block in_channels = block_out_channels[-1] out_channels = 2 * out_channels_encoder if double_z else out_channels_encoder self.out_block = nn.Sequential( nn.GroupNorm(num_channels=in_channels, num_groups=norm_groups, eps=norm_eps), 
class Decoder(nn.Module):
    """
    default arch is conv_in + mid_block + blocks + out_block

    Make sure input tensor is of shape [B, C, T, H, W]

    Args:
        in_channels: channels of the tensor fed to ``conv_in``.
        out_channels: channels produced by ``out_block``.
        norm_groups: group count used by every residual block, the mid block
            and the final GroupNorm.
        norm_eps: epsilon used by the same normalisation layers.
    """

    def __init__(
        self,
        in_channels=4,
        out_channels=3,
        norm_groups=32,
        norm_eps=1e-6,
    ):
        super().__init__()
        in_channels_decoder = in_channels
        out_channels_decoder = out_channels

        block_out_channels = [512, 512, 256, 128]

        # conv_in
        self.conv_in = nn.Conv3d(
            in_channels_decoder,
            block_out_channels[0],
            kernel_size=3,
            stride=1,
            padding=1,
        )

        # mid_block
        in_channels = block_out_channels[0]
        self.mid_block = UNetMidBlock2D(
            in_channels=in_channels,
            num_layers=1,
            norm_groups=norm_groups,
            norm_eps=norm_eps,
            add_attention=True,
            attention_head_dim=in_channels,
        )

        # blocks
        blocks = []
        layer_per_block = 3

        def res_layers(block_cls, in_ch, out_ch):
            # The first layer maps in_ch -> out_ch; the remaining ones keep out_ch.
            return [
                block_cls(
                    in_channels=in_ch if idx == 0 else out_ch,
                    out_channels=out_ch,
                    norm_groups=norm_groups,
                    norm_eps=norm_eps,
                )
                for idx in range(layer_per_block)
            ]

        # the first up block: ResnetBlock3D + spatial & temporal upsample
        in_channels = block_out_channels[0]
        out_channels = block_out_channels[0]
        seq = res_layers(ResnetBlock3D, in_channels, out_channels) + [
            SpatialUpsample2x(
                channels=out_channels,
                use_interpolate=True,
            ),
            TemporalUpsample2x(
                channels=out_channels,
            ),
        ]
        blocks.append(nn.Sequential(*seq))

        # the second up block: ResnetBlock3D + spatial & temporal upsample
        in_channels = block_out_channels[0]
        out_channels = block_out_channels[1]
        seq = res_layers(ResnetBlock3D, in_channels, out_channels) + [
            SpatialUpsample2x(
                channels=out_channels,
                use_interpolate=True,
            ),
            TemporalUpsample2x(
                channels=out_channels,
            ),
        ]
        blocks.append(nn.Sequential(*seq))

        # the third up block: ResnetBlock3D + spatial upsample only
        in_channels = block_out_channels[1]
        out_channels = block_out_channels[2]
        seq = res_layers(ResnetBlock3D, in_channels, out_channels) + [
            SpatialUpsample2x(
                channels=out_channels,
                use_interpolate=True,
            ),
        ]
        blocks.append(nn.Sequential(*seq))

        # the fourth up block: ResnetBlock2D, no upsampling
        in_channels = block_out_channels[2]
        out_channels = block_out_channels[3]
        seq = res_layers(ResnetBlock2D, in_channels, out_channels)
        blocks.append(nn.Sequential(*seq))

        self.blocks = nn.ModuleList(blocks)

        # out_block
        in_channels = block_out_channels[-1]
        out_channels = out_channels_decoder
        # NOTE(review): nn.Conv2d accepts only 3-D/4-D input; confirm that the
        # tensor leaving the last block is compatible with it when the decoder
        # is fed [B, C, T, H, W] data.
        self.out_block = nn.Sequential(
            nn.GroupNorm(num_channels=in_channels, num_groups=norm_groups, eps=norm_eps),
            nn.SiLU(),
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
        )

    def forward(self, x):
        """Run conv_in -> mid_block -> up blocks -> out_block and return the result."""
        # The per-stage `print(torch.cuda.memory_allocated() ...)` debugging
        # output was removed: it wrote to stdout on every forward pass.
        x = self.conv_in(x)
        x = self.mid_block(x)
        for block in self.blocks:
            x = block(x)
        x = self.out_block(x)
        return x
def build_module(module, builder, **kwargs):
    """Build module from config or return the module itself.

    Args:
        module (Union[dict, nn.Module, None]): The module to build. A dict is
            treated as a config, an ``nn.Module`` is returned unchanged and
            ``None`` yields ``None``.
        builder (Registry): The registry whose ``build`` method is called with
            the config.
        **kwargs: Extra entries written into a deep copy of the config before
            building (existing keys are overridden). Ignored when ``module``
            is not a dict.

    Returns:
        Any: The built module, the module itself, or ``None``.

    Raises:
        TypeError: If ``module`` is neither ``None``, a dict nor an ``nn.Module``.
    """
    if module is None:
        return None
    if isinstance(module, dict):
        # Work on a copy so the caller's config is never mutated.
        cfg = deepcopy(module)
        cfg.update(kwargs)
        return builder.build(cfg)
    if isinstance(module, nn.Module):
        return module
    raise TypeError(f"Only support dict and nn.Module, but got {type(module)}.")
def forward_with_dpmsolver(self, x, timestep, y, **kwargs):
    """
    Call ``self.forward`` and keep only the first half of its output along dim 1.

    DPM-Solver does not need the variance prediction, so the second chunk of
    the model output is dropped.
    """
    # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
    prediction, *_ = self.forward(x, timestep, y, **kwargs).chunk(2, dim=1)
    return prediction
def get_beta_schedule(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):
    """
    Create a beta array of length ``num_diffusion_timesteps`` for a named schedule.

    This is the deprecated API for creating beta schedules.
    See get_named_beta_schedule() for the new library of schedules.

    Supported names: "quad", "linear", "warmup10", "warmup50", "const", "jsd".
    Any other name raises NotImplementedError.
    """
    n = num_diffusion_timesteps
    if beta_schedule == "quad":
        # linear in sqrt(beta), then squared
        roots = np.linspace(beta_start**0.5, beta_end**0.5, n, dtype=np.float64)
        betas = roots**2
    elif beta_schedule == "linear":
        betas = np.linspace(beta_start, beta_end, n, dtype=np.float64)
    elif beta_schedule in ("warmup10", "warmup50"):
        warmup_frac = 0.1 if beta_schedule == "warmup10" else 0.5
        betas = _warmup_beta(beta_start, beta_end, n, warmup_frac)
    elif beta_schedule == "const":
        betas = beta_end * np.ones(n, dtype=np.float64)
    elif beta_schedule == "jsd":
        # 1/T, 1/(T-1), 1/(T-2), ..., 1
        betas = 1.0 / np.linspace(n, 1, n, dtype=np.float64)
    else:
        raise NotImplementedError(beta_schedule)

    assert betas.shape == (num_diffusion_timesteps,)
    return betas
def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
    """
    Discretize a cumulative-product function alpha_bar(t), t in [0, 1], into betas.

    For step i the interval [i / N, (i + 1) / N] is used and
    beta_i = min(1 - alpha_bar(t_end) / alpha_bar(t_start), max_beta).

    :param num_diffusion_timesteps: the number of betas to produce.
    :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
                      produces the cumulative product of (1-beta) up to that
                      part of the diffusion process.
    :param max_beta: the maximum beta to use; use values lower than 1 to
                     prevent singularities.
    :return: a numpy array with one beta per step.
    """
    total = num_diffusion_timesteps
    return np.array(
        [
            min(1 - alpha_bar((step + 1) / total) / alpha_bar(step / total), max_beta)
            for step in range(total)
        ]
    )
We further define lambda_t = log(alpha_t) - log(sigma_t), which is the half-logSNR (described in the DPM-Solver paper). Therefore, we implement the functions for computing alpha_t, sigma_t and lambda_t. For t in [0, T], we have: log_alpha_t = self.marginal_log_mean_coeff(t) sigma_t = self.marginal_std(t) lambda_t = self.marginal_lambda(t) Moreover, as lambda(t) is an invertible function, we also support its inverse function: t = self.inverse_lambda(lambda_t) =============================================================== We support both discrete-time DPMs (trained on n = 0, 1, ..., N-1) and continuous-time DPMs (trained on t in [t_0, T]). 1. For discrete-time DPMs: For discrete-time DPMs trained on n = 0, 1, ..., N-1, we convert the discrete steps to continuous time steps by: t_i = (i + 1) / N e.g. for N = 1000, we have t_0 = 1e-3 and T = t_{N-1} = 1. We solve the corresponding diffusion ODE from time T = 1 to time t_0 = 1e-3. Args: betas: A `torch.Tensor`. The beta array for the discrete-time DPM. (See the original DDPM paper for details) alphas_cumprod: A `torch.Tensor`. The cumprod alphas for the discrete-time DPM. (See the original DDPM paper for details) Note that we always have alphas_cumprod = cumprod(1 - betas). Therefore, we only need to set one of `betas` and `alphas_cumprod`. **Important**: Please pay special attention for the args for `alphas_cumprod`: The `alphas_cumprod` is the \hat{alpha_n} arrays in the notations of DDPM. Specifically, DDPMs assume that q_{t_n | 0}(x_{t_n} | x_0) = N ( \sqrt{\hat{alpha_n}} * x_0, (1 - \hat{alpha_n}) * I ). Therefore, the notation \hat{alpha_n} is different from the notation alpha_t in DPM-Solver. In fact, we have alpha_{t_n} = \sqrt{\hat{alpha_n}}, and log(alpha_{t_n}) = 0.5 * log(\hat{alpha_n}). 2. For continuous-time DPMs: We support the linear VPSDE for the continuous time setting. The hyperparameters for the noise schedule are the default settings in Yang Song's ScoreSDE: Args: beta_min: A `float` number. 
The smallest beta for the linear schedule. beta_max: A `float` number. The largest beta for the linear schedule. T: A `float` number. The ending time of the forward process. =============================================================== Args: schedule: A `str`. The noise schedule of the forward SDE. 'discrete' for discrete-time DPMs, 'linear' for continuous-time DPMs. Returns: A wrapper object of the forward SDE (VP type). =============================================================== Example: # For discrete-time DPMs, given betas (the beta array for n = 0, 1, ..., N - 1): >>> ns = NoiseScheduleVP('discrete', betas=betas) # For discrete-time DPMs, given alphas_cumprod (the \hat{alpha_n} array for n = 0, 1, ..., N - 1): >>> ns = NoiseScheduleVP('discrete', alphas_cumprod=alphas_cumprod) # For continuous-time DPMs (VPSDE), linear schedule: >>> ns = NoiseScheduleVP('linear', continuous_beta_0=0.1, continuous_beta_1=20.) """ if schedule not in ["discrete", "linear"]: raise ValueError(f"Unsupported noise schedule {schedule}. The schedule needs to be 'discrete' or 'linear'") self.schedule = schedule if schedule == "discrete": if betas is not None: log_alphas = 0.5 * torch.log(1 - betas).cumsum(dim=0) else: assert alphas_cumprod is not None log_alphas = 0.5 * torch.log(alphas_cumprod) self.T = 1.0 self.log_alpha_array = ( self.numerical_clip_alpha(log_alphas) .reshape( ( 1, -1, ) ) .to(dtype=dtype) ) self.total_N = self.log_alpha_array.shape[1] self.t_array = torch.linspace(0.0, 1.0, self.total_N + 1)[1:].reshape((1, -1)).to(dtype=dtype) else: self.T = 1.0 self.total_N = 1000 self.beta_0 = continuous_beta_0 self.beta_1 = continuous_beta_1 def numerical_clip_alpha(self, log_alphas, clipped_lambda=-5.1): """ For some beta schedules such as cosine schedule, the log-SNR has numerical isssues. We clip the log-SNR near t=T within -5.1 to ensure the stability. Such a trick is very useful for diffusion models with the cosine schedule, such as i-DDPM, guided-diffusion and GLIDE. 
""" log_sigmas = 0.5 * torch.log(1.0 - torch.exp(2.0 * log_alphas)) lambs = log_alphas - log_sigmas idx = torch.searchsorted(torch.flip(lambs, [0]), clipped_lambda) if idx > 0: log_alphas = log_alphas[:-idx] return log_alphas def marginal_log_mean_coeff(self, t): """ Compute log(alpha_t) of a given continuous-time label t in [0, T]. """ if self.schedule == "discrete": return interpolate_fn( t.reshape((-1, 1)), self.t_array.to(t.device), self.log_alpha_array.to(t.device) ).reshape((-1)) elif self.schedule == "linear": return -0.25 * t**2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0 def marginal_alpha(self, t): """ Compute alpha_t of a given continuous-time label t in [0, T]. """ return torch.exp(self.marginal_log_mean_coeff(t)) def marginal_std(self, t): """ Compute sigma_t of a given continuous-time label t in [0, T]. """ return torch.sqrt(1.0 - torch.exp(2.0 * self.marginal_log_mean_coeff(t))) def marginal_lambda(self, t): """ Compute lambda_t = log(alpha_t) - log(sigma_t) of a given continuous-time label t in [0, T]. """ log_mean_coeff = self.marginal_log_mean_coeff(t) log_std = 0.5 * torch.log(1.0 - torch.exp(2.0 * log_mean_coeff)) return log_mean_coeff - log_std def inverse_lambda(self, lamb): """ Compute the continuous-time label t in [0, T] of a given half-logSNR lambda_t. 
def model_wrapper(
    model,
    noise_schedule,
    model_type="noise",
    model_kwargs=None,
    guidance_type="uncond",
    condition=None,
    unconditional_condition=None,
    guidance_scale=1.0,
    classifier_fn=None,
    classifier_kwargs=None,
):
    """Create a wrapper function for the noise prediction model.

    DPM-Solver needs to solve the continuous-time diffusion ODEs. For DPMs trained on discrete-time labels, we need to
    first wrap the model function to a noise prediction model that accepts the continuous time as the input.

    We support four types of the diffusion model by setting `model_type`:

        1. "noise": noise prediction model. (Trained by predicting noise).

        2. "x_start": data prediction model. (Trained by predicting the data x_0 at time 0).

        3. "v": velocity prediction model. (Trained by predicting the velocity).
            The "v" prediction derivation is detailed in Appendix D of [1], and is used in Imagen-Video [2].

            [1] Salimans, Tim, and Jonathan Ho. "Progressive distillation for fast sampling of diffusion models."
                arXiv preprint arXiv:2202.00512 (2022).
            [2] Ho, Jonathan, et al. "Imagen Video: High Definition Video Generation with Diffusion Models."
                arXiv preprint arXiv:2210.02303 (2022).

        4. "score": marginal score function. (Trained by denoising score matching).
            Note that the score function and the noise prediction model follows a simple relationship:
            ```
                noise(x_t, t) = -sigma_t * score(x_t, t)
            ```

    We support three types of guided sampling by DPMs by setting `guidance_type`:
        1. "uncond": unconditional sampling by DPMs.
            The input `model` has the following format:
            ``
                model(x, t_input, **model_kwargs) -> noise | x_start | v | score
            ``

        2. "classifier": classifier guidance sampling [3] by DPMs and another classifier.
            The input `model` has the following format:
            ``
                model(x, t_input, **model_kwargs) -> noise | x_start | v | score
            ``

            The input `classifier_fn` has the following format:
            ``
                classifier_fn(x, t_input, cond, **classifier_kwargs) -> logits(x, t_input, cond)
            ``

            [3] P. Dhariwal and A. Q. Nichol, "Diffusion models beat GANs on image synthesis,"
                in Advances in Neural Information Processing Systems, vol. 34, 2021, pp. 8780-8794.

        3. "classifier-free": classifier-free guidance sampling by conditional DPMs.
            The input `model` has the following format:
            ``
                model(x, t_input, cond, **model_kwargs) -> noise | x_start | v | score
            ``
            And if cond == `unconditional_condition`, the model output is the unconditional DPM output.

            [4] Ho, Jonathan, and Tim Salimans. "Classifier-free diffusion guidance."
                arXiv preprint arXiv:2207.12598 (2022).


    The `t_input` is the time label of the model, which may be discrete-time labels (i.e. 0 to 999)
    or continuous-time labels (i.e. epsilon to T).

    We wrap the model function to accept only `x` and `t_continuous` as inputs, and outputs the predicted noise:
    ``
        def model_fn(x, t_continuous) -> noise:
            t_input = get_model_input_time(t_continuous)
            return noise_pred(model, x, t_input, **model_kwargs)
    ``
    where `t_continuous` is the continuous time labels (i.e. epsilon to T). And we use `model_fn` for DPM-Solver.

    ===============================================================

    Args:
        model: A diffusion model with the corresponding format described above.
        noise_schedule: A noise schedule object, such as NoiseScheduleVP.
        model_type: A `str`. The parameterization type of the diffusion model.
                    "noise" or "x_start" or "v" or "score".
        model_kwargs: A `dict` or None. A dict for the other inputs of the model function.
                    None (the default) means no extra inputs.
        guidance_type: A `str`. The type of the guidance for sampling.
                    "uncond" or "classifier" or "classifier-free".
        condition: A pytorch tensor. The condition for the guided sampling.
                    Only used for "classifier" or "classifier-free" guidance type.
        unconditional_condition: A pytorch tensor. The condition for the unconditional sampling.
                    Only used for "classifier-free" guidance type.
        guidance_scale: A `float`. The scale for the guided sampling.
        classifier_fn: A classifier function. Only used for the classifier guidance.
        classifier_kwargs: A `dict` or None. A dict for the other inputs of the classifier function.
                    None (the default) means no extra inputs.
    Returns:
        A noise prediction model that accepts the noised data and the continuous time as the inputs.
    Raises:
        AssertionError: if `model_type` or `guidance_type` is not one of the supported values.
    """
    # Avoid mutable default arguments: a `{}` default would be one dict object
    # shared by every call of `model_wrapper`.
    if model_kwargs is None:
        model_kwargs = {}
    if classifier_kwargs is None:
        classifier_kwargs = {}

    def get_model_input_time(t_continuous):
        """
        Convert the continuous-time `t_continuous` (in [epsilon, T]) to the model input time.
        For discrete-time DPMs, we convert `t_continuous` in [1 / N, 1] to `t_input` in [0, 1000 * (N - 1) / N].
        For continuous-time DPMs, we just use `t_continuous`.
        """
        if noise_schedule.schedule == "discrete":
            return (t_continuous - 1.0 / noise_schedule.total_N) * 1000.0
        else:
            return t_continuous

    def noise_pred_fn(x, t_continuous, cond=None):
        """Evaluate `model` and convert its output to a noise prediction according to `model_type`."""
        t_input = get_model_input_time(t_continuous)
        if cond is None:
            output = model(x, t_input, **model_kwargs)
        else:
            output = model(x, t_input, cond, **model_kwargs)
        if model_type == "noise":
            return output
        elif model_type == "x_start":
            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
            return (x - expand_dims(alpha_t, x.dim()) * output) / expand_dims(sigma_t, x.dim())
        elif model_type == "v":
            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
            return expand_dims(alpha_t, x.dim()) * output + expand_dims(sigma_t, x.dim()) * x
        elif model_type == "score":
            sigma_t = noise_schedule.marginal_std(t_continuous)
            return -expand_dims(sigma_t, x.dim()) * output

    def cond_grad_fn(x, t_input):
        """
        Compute the gradient of the classifier, i.e. nabla_{x} log p_t(cond | x_t).
        """
        with torch.enable_grad():
            x_in = x.detach().requires_grad_(True)
            log_prob = classifier_fn(x_in, t_input, condition, **classifier_kwargs)
            return torch.autograd.grad(log_prob.sum(), x_in)[0]

    def model_fn(x, t_continuous):
        """
        The noise prediction model function that is used for DPM-Solver.
        """
        if guidance_type == "uncond":
            return noise_pred_fn(x, t_continuous)
        elif guidance_type == "classifier":
            assert classifier_fn is not None
            t_input = get_model_input_time(t_continuous)
            cond_grad = cond_grad_fn(x, t_input)
            sigma_t = noise_schedule.marginal_std(t_continuous)
            noise = noise_pred_fn(x, t_continuous)
            return noise - guidance_scale * expand_dims(sigma_t, x.dim()) * cond_grad
        elif guidance_type == "classifier-free":
            if guidance_scale == 1.0 or unconditional_condition is None:
                return noise_pred_fn(x, t_continuous, cond=condition)
            # Evaluate the unconditional and conditional branches in one batched call.
            x_in = torch.cat([x] * 2)
            t_in = torch.cat([t_continuous] * 2)
            c_in = torch.cat([unconditional_condition, condition])
            noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in).chunk(2)
            return noise_uncond + guidance_scale * (noise - noise_uncond)

    assert model_type in ["noise", "x_start", "v", "score"]
    assert guidance_type in ["uncond", "classifier", "classifier-free"]
    return model_fn
Note that the thresholding method is **unsuitable** for latent-space DPMs (such as stable-diffusion). To support advanced algorithms in image-to-image applications, we also support corrector functions for both x0 and xt. Args: model_fn: A noise prediction model function which accepts the continuous-time input (t in [epsilon, T]): `` def model_fn(x, t_continuous): return noise `` The shape of `x` is `(batch_size, **shape)`, and the shape of `t_continuous` is `(batch_size,)`. noise_schedule: A noise schedule object, such as NoiseScheduleVP. algorithm_type: A `str`. Either "dpmsolver" or "dpmsolver++". correcting_x0_fn: A `str` or a function with the following format: ``` def correcting_x0_fn(x0, t): x0_new = ... return x0_new ``` This function is to correct the outputs of the data prediction model at each sampling step. e.g., ``` x0_pred = data_pred_model(xt, t) if correcting_x0_fn is not None: x0_pred = correcting_x0_fn(x0_pred, t) xt_1 = update(x0_pred, xt, t) ``` If `correcting_x0_fn="dynamic_thresholding"`, we use the dynamic thresholding proposed in Imagen[1]. correcting_xt_fn: A function with the following format: ``` def correcting_xt_fn(xt, t, step): x_new = ... return x_new ``` This function is to correct the intermediate samples xt at each sampling step. e.g., ``` xt = ... xt = correcting_xt_fn(xt, t, step) ``` thresholding_max_val: A `float`. The max value for thresholding. Valid only when use `dpmsolver++` and `correcting_x0_fn="dynamic_thresholding"`. dynamic_thresholding_ratio: A `float`. The ratio for dynamic thresholding (see Imagen[1] for details). Valid only when use `dpmsolver++` and `correcting_x0_fn="dynamic_thresholding"`. [1] Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily Denton, Seyed Kamyar Seyed Ghasemipour, Burcu Karagol Ayan, S Sara Mahdavi, Rapha Gontijo Lopes, et al. Photorealistic text-to-image diffusion models with deep language understanding. arXiv preprint arXiv:2205.11487, 2022b. 
""" self.model = lambda x, t: model_fn(x, t.expand((x.shape[0]))) self.noise_schedule = noise_schedule assert algorithm_type in ["dpmsolver", "dpmsolver++"] self.algorithm_type = algorithm_type if correcting_x0_fn == "dynamic_thresholding": self.correcting_x0_fn = self.dynamic_thresholding_fn else: self.correcting_x0_fn = correcting_x0_fn self.correcting_xt_fn = correcting_xt_fn self.dynamic_thresholding_ratio = dynamic_thresholding_ratio self.thresholding_max_val = thresholding_max_val def dynamic_thresholding_fn(self, x0, t): """ The dynamic thresholding method. """ dims = x0.dim() p = self.dynamic_thresholding_ratio s = torch.quantile(torch.abs(x0).reshape((x0.shape[0], -1)), p, dim=1) s = expand_dims(torch.maximum(s, self.thresholding_max_val * torch.ones_like(s).to(s.device)), dims) x0 = torch.clamp(x0, -s, s) / s return x0 def noise_prediction_fn(self, x, t): """ Return the noise prediction model. """ return self.model(x, t) def data_prediction_fn(self, x, t): """ Return the data prediction model (with corrector). """ noise = self.noise_prediction_fn(x, t) alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t) x0 = (x - sigma_t * noise) / alpha_t if self.correcting_x0_fn is not None: x0 = self.correcting_x0_fn(x0, t) return x0 def model_fn(self, x, t): """ Convert the model to the noise prediction model or the data prediction model. """ if self.algorithm_type == "dpmsolver++": return self.data_prediction_fn(x, t) else: return self.noise_prediction_fn(x, t) def get_time_steps(self, skip_type, t_T, t_0, N, device): """Compute the intermediate time steps for sampling. Args: skip_type: A `str`. The type for the spacing of the time steps. We support three types: - 'logSNR': uniform logSNR for the time steps. - 'time_uniform': uniform time for the time steps. (**Recommended for high-resolutional data**.) - 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolutional data.) t_T: A `float`. 
def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device):
    """
    Get the order of each step (and the outer time grid) for the singlestep DPM-Solver.

    We combine DPM-Solver-1,2,3 to use all the function evaluations, which is named as
    "DPM-Solver-fast". Given a fixed number of function evaluations `steps`:
        - If order == 1:
            - Denote K = steps. We take K steps of DPM-Solver-1 (i.e. DDIM).
        - If order == 2:
            - Denote K = (steps // 2) + (steps % 2).
            - If steps % 2 == 0, we use K steps of DPM-Solver-2.
            - If steps % 2 == 1, we use (K - 1) steps of DPM-Solver-2 and 1 step of DPM-Solver-1.
        - If order == 3:
            - Denote K = (steps // 3 + 1).
            - If steps % 3 == 0, we use (K - 2) steps of DPM-Solver-3, 1 step of DPM-Solver-2
              and 1 step of DPM-Solver-1.
            - If steps % 3 == 1, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-1.
            - If steps % 3 == 2, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-2.
    In every case `len(orders) == K` and `sum(orders) == steps` (for steps >= 1).

    Args:
        steps: A `int`. The total number of function evaluations (NFE).
        order: A `int`. The max order for the solver (1, 2 or 3).
        skip_type: A `str`. The type for the spacing of the time steps, forwarded to
            `self.get_time_steps` ('logSNR', 'time_uniform' or 'time_quadratic').
        t_T: A `float`. The starting time of the sampling.
        t_0: A `float`. The ending time of the sampling.
        device: A torch device.
    Returns:
        timesteps_outer: A pytorch tensor with `len(orders) + 1` entries: the boundaries of
            the outer solver steps.
        orders: A list of the solver order of each step.
    Raises:
        ValueError: if `order` is not 1, 2 or 3.
    """
    if order == 3:
        K = steps // 3 + 1
        remainder = steps % 3
        if remainder == 0:
            orders = [3] * (K - 2) + [2, 1]
        elif remainder == 1:
            orders = [3] * (K - 1) + [1]
        else:
            orders = [3] * (K - 1) + [2]
    elif order == 2:
        if steps % 2 == 0:
            K = steps // 2
            orders = [2] * K
        else:
            K = steps // 2 + 1
            orders = [2] * (K - 1) + [1]
    elif order == 1:
        # One outer interval per first-order step. The previous `K = 1` produced only two
        # grid points for `steps` orders when skip_type == "logSNR", so consumers indexing
        # timesteps_outer[step + 1] ran off the end for steps > 1.
        K = steps
        orders = [1] * steps
    else:
        raise ValueError("'order' must be '1' or '2' or '3'.")

    if skip_type == "logSNR":
        # To reproduce the results in DPM-Solver paper: K intervals spaced uniformly in logSNR.
        timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, K, device)
    else:
        # Take the fine grid with one point per NFE and keep only the points where an
        # outer step starts or ends (cumulative sum of the per-step orders).
        boundaries = torch.cumsum(torch.tensor([0] + orders), 0).to(device)
        timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[boundaries]
    return timesteps_outer, orders


def denoise_to_zero_fn(self, x, s):
    """
    Denoise at the final step, which is equivalent to solve the ODE from lambda_s to infty
    by first-order discretization. Returns `self.data_prediction_fn(x, s)`.
    """
    return self.data_prediction_fn(x, s)
def singlestep_dpm_solver_second_update(
    self, x, s, t, r1=0.5, model_s=None, return_intermediate=False, solver_type="dpmsolver"
):
    """
    Singlestep solver DPM-Solver-2 from time `s` to time `t`.

    One intermediate time `s1` is placed at `lambda_s + r1 * h` in logSNR, the model is
    evaluated there, and the difference to the value at `s` supplies the second-order term.

    Args:
        x: A pytorch tensor. The initial value at time `s`.
        s: A pytorch tensor. The starting time, with the shape (1,).
        t: A pytorch tensor. The ending time, with the shape (1,).
        r1: A `float`. The hyperparameter of the second-order solver (None means 0.5).
        model_s: A pytorch tensor. The model function evaluated at time `s`.
            If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
        return_intermediate: A `bool`. If true, also return the model value at time `s` and `s1`.
        solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
    Returns:
        x_t: A pytorch tensor. The approximated solution at time `t`; when `return_intermediate`
            is true, a tuple `(x_t, {"model_s": ..., "model_s1": ...})`.
    Raises:
        ValueError: if `solver_type` is neither 'dpmsolver' nor 'taylor'.
    """
    if solver_type not in ["dpmsolver", "taylor"]:
        raise ValueError(f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}")
    r1 = 0.5 if r1 is None else r1

    schedule = self.noise_schedule
    lambda_s = schedule.marginal_lambda(s)
    lambda_t = schedule.marginal_lambda(t)
    h = lambda_t - lambda_s
    s1 = schedule.inverse_lambda(lambda_s + r1 * h)

    log_alpha_s = schedule.marginal_log_mean_coeff(s)
    log_alpha_s1 = schedule.marginal_log_mean_coeff(s1)
    log_alpha_t = schedule.marginal_log_mean_coeff(t)
    sigma_s = schedule.marginal_std(s)
    sigma_s1 = schedule.marginal_std(s1)
    sigma_t = schedule.marginal_std(t)
    alpha_s1 = torch.exp(log_alpha_s1)
    alpha_t = torch.exp(log_alpha_t)

    use_taylor = solver_type == "taylor"
    if self.algorithm_type == "dpmsolver++":
        # Data-prediction parameterisation: exponentials of -h.
        phi_11 = torch.expm1(-r1 * h)
        phi_1 = torch.expm1(-h)
        if model_s is None:
            model_s = self.model_fn(x, s)
        x_s1 = (sigma_s1 / sigma_s) * x - (alpha_s1 * phi_11) * model_s
        model_s1 = self.model_fn(x_s1, s1)
        first_order = (sigma_t / sigma_s) * x - (alpha_t * phi_1) * model_s
        if use_taylor:
            x_t = first_order + (1.0 / r1) * (alpha_t * (phi_1 / h + 1.0)) * (model_s1 - model_s)
        else:
            x_t = first_order - (0.5 / r1) * (alpha_t * phi_1) * (model_s1 - model_s)
    else:
        # Noise-prediction parameterisation: exponentials of +h.
        phi_11 = torch.expm1(r1 * h)
        phi_1 = torch.expm1(h)
        if model_s is None:
            model_s = self.model_fn(x, s)
        x_s1 = torch.exp(log_alpha_s1 - log_alpha_s) * x - (sigma_s1 * phi_11) * model_s
        model_s1 = self.model_fn(x_s1, s1)
        first_order = torch.exp(log_alpha_t - log_alpha_s) * x - (sigma_t * phi_1) * model_s
        if use_taylor:
            x_t = first_order - (1.0 / r1) * (sigma_t * (phi_1 / h - 1.0)) * (model_s1 - model_s)
        else:
            x_t = first_order - (0.5 / r1) * (sigma_t * phi_1) * (model_s1 - model_s)

    if not return_intermediate:
        return x_t
    return x_t, {"model_s": model_s, "model_s1": model_s1}
/ 3.0, r2=2.0 / 3.0, model_s=None, model_s1=None, return_intermediate=False, solver_type="dpmsolver", ): """ Singlestep solver DPM-Solver-3 from time `s` to time `t`. Args: x: A pytorch tensor. The initial value at time `s`. s: A pytorch tensor. The starting time, with the shape (1,). t: A pytorch tensor. The ending time, with the shape (1,). r1: A `float`. The hyperparameter of the third-order solver. r2: A `float`. The hyperparameter of the third-order solver. model_s: A pytorch tensor. The model function evaluated at time `s`. If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it. model_s1: A pytorch tensor. The model function evaluated at time `s1` (the intermediate time given by `r1`). If `model_s1` is None, we evaluate the model at `s1`; otherwise we directly use it. return_intermediate: A `bool`. If true, also return the model value at time `s`, `s1` and `s2` (the intermediate times). solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers. The type slightly impacts the performance. We recommend to use 'dpmsolver' type. Returns: x_t: A pytorch tensor. The approximated solution at time `t`. 
""" if solver_type not in ["dpmsolver", "taylor"]: raise ValueError(f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}") if r1 is None: r1 = 1.0 / 3.0 if r2 is None: r2 = 2.0 / 3.0 ns = self.noise_schedule lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t) h = lambda_t - lambda_s lambda_s1 = lambda_s + r1 * h lambda_s2 = lambda_s + r2 * h s1 = ns.inverse_lambda(lambda_s1) s2 = ns.inverse_lambda(lambda_s2) log_alpha_s, log_alpha_s1, log_alpha_s2, log_alpha_t = ( ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(s1), ns.marginal_log_mean_coeff(s2), ns.marginal_log_mean_coeff(t), ) sigma_s, sigma_s1, sigma_s2, sigma_t = ( ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(s2), ns.marginal_std(t), ) alpha_s1, alpha_s2, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_s2), torch.exp(log_alpha_t) if self.algorithm_type == "dpmsolver++": phi_11 = torch.expm1(-r1 * h) phi_12 = torch.expm1(-r2 * h) phi_1 = torch.expm1(-h) phi_22 = torch.expm1(-r2 * h) / (r2 * h) + 1.0 phi_2 = phi_1 / h + 1.0 phi_3 = phi_2 / h - 0.5 if model_s is None: model_s = self.model_fn(x, s) if model_s1 is None: x_s1 = (sigma_s1 / sigma_s) * x - (alpha_s1 * phi_11) * model_s model_s1 = self.model_fn(x_s1, s1) x_s2 = ( (sigma_s2 / sigma_s) * x - (alpha_s2 * phi_12) * model_s + r2 / r1 * (alpha_s2 * phi_22) * (model_s1 - model_s) ) model_s2 = self.model_fn(x_s2, s2) if solver_type == "dpmsolver": x_t = ( (sigma_t / sigma_s) * x - (alpha_t * phi_1) * model_s + (1.0 / r2) * (alpha_t * phi_2) * (model_s2 - model_s) ) elif solver_type == "taylor": D1_0 = (1.0 / r1) * (model_s1 - model_s) D1_1 = (1.0 / r2) * (model_s2 - model_s) D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1) D2 = 2.0 * (D1_1 - D1_0) / (r2 - r1) x_t = ( (sigma_t / sigma_s) * x - (alpha_t * phi_1) * model_s + (alpha_t * phi_2) * D1 - (alpha_t * phi_3) * D2 ) else: phi_11 = torch.expm1(r1 * h) phi_12 = torch.expm1(r2 * h) phi_1 = torch.expm1(h) phi_22 = torch.expm1(r2 * h) / (r2 * h) - 1.0 
def multistep_dpm_solver_second_update(self, x, model_prev_list, t_prev_list, t, solver_type="dpmsolver"):
    """
    Multistep solver DPM-Solver-2 from time `t_prev_list[-1]` to time `t`.

    Only the last two entries of `model_prev_list` / `t_prev_list` are read; their
    difference, scaled by the ratio of logSNR step sizes, supplies the second-order term.

    Args:
        x: A pytorch tensor. The initial value at time `t_prev_list[-1]`.
        model_prev_list: A list of pytorch tensor. The previous computed model values.
        t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (1,)
        t: A pytorch tensor. The ending time, with the shape (1,).
        solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
    Returns:
        x_t: A pytorch tensor. The approximated solution at time `t`.
    Raises:
        ValueError: if `solver_type` is neither 'dpmsolver' nor 'taylor'.
    """
    if solver_type not in ["dpmsolver", "taylor"]:
        raise ValueError(f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}")

    schedule = self.noise_schedule
    older_model, newest_model = model_prev_list[-2], model_prev_list[-1]
    older_t, newest_t = t_prev_list[-2], t_prev_list[-1]

    lambda_older = schedule.marginal_lambda(older_t)
    lambda_newest = schedule.marginal_lambda(newest_t)
    lambda_t = schedule.marginal_lambda(t)
    log_alpha_newest = schedule.marginal_log_mean_coeff(newest_t)
    log_alpha_t = schedule.marginal_log_mean_coeff(t)
    sigma_newest = schedule.marginal_std(newest_t)
    sigma_t = schedule.marginal_std(t)
    alpha_t = torch.exp(log_alpha_t)

    h_prev = lambda_newest - lambda_older
    h = lambda_t - lambda_newest
    r0 = h_prev / h
    D1_0 = (1.0 / r0) * (newest_model - older_model)
    use_taylor = solver_type == "taylor"

    if self.algorithm_type == "dpmsolver++":
        phi_1 = torch.expm1(-h)
        first_order = (sigma_t / sigma_newest) * x - (alpha_t * phi_1) * newest_model
        if use_taylor:
            return first_order + (alpha_t * (phi_1 / h + 1.0)) * D1_0
        return first_order - 0.5 * (alpha_t * phi_1) * D1_0

    phi_1 = torch.expm1(h)
    first_order = (torch.exp(log_alpha_t - log_alpha_newest)) * x - (sigma_t * phi_1) * newest_model
    if use_taylor:
        return first_order - (sigma_t * (phi_1 / h - 1.0)) * D1_0
    return first_order - 0.5 * (sigma_t * phi_1) * D1_0
def singlestep_dpm_solver_update(
    self, x, s, t, order, return_intermediate=False, solver_type="dpmsolver", r1=None, r2=None
):
    """
    Singlestep DPM-Solver with the order `order` from time `s` to time `t`.

    Pure dispatcher: order 1 goes to `dpm_solver_first_update`, order 2 to
    `singlestep_dpm_solver_second_update` and order 3 to `singlestep_dpm_solver_third_update`.

    Args:
        x: A pytorch tensor. The initial value at time `s`.
        s: A pytorch tensor. The starting time, with the shape (1,).
        t: A pytorch tensor. The ending time, with the shape (1,).
        order: A `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3.
        return_intermediate: A `bool`. Forwarded to the selected update.
        solver_type: either 'dpmsolver' or 'taylor'. Forwarded for order 2 and 3 only.
        r1: A `float`. Forwarded for order 2 and 3 only.
        r2: A `float`. Forwarded for order 3 only.
    Returns:
        Whatever the selected update returns (x_t, or a tuple with intermediates).
    Raises:
        ValueError: if `order` is not 1, 2 or 3.
    """
    forwarded = {"return_intermediate": return_intermediate}
    if order == 1:
        return self.dpm_solver_first_update(x, s, t, **forwarded)

    # Higher orders additionally take the solver type and the intermediate-point ratio(s).
    forwarded["solver_type"] = solver_type
    forwarded["r1"] = r1
    if order == 2:
        return self.singlestep_dpm_solver_second_update(x, s, t, **forwarded)
    if order == 3:
        forwarded["r2"] = r2
        return self.singlestep_dpm_solver_third_update(x, s, t, **forwarded)
    raise ValueError(f"Solver order must be 1 or 2 or 3, got {order}")
def dpm_solver_adaptive(
    self, x, order, t_T, t_0, h_init=0.05, atol=0.0078, rtol=0.05, theta=0.9, t_err=1e-5, solver_type="dpmsolver"
):
    """
    The adaptive step size solver based on singlestep DPM-Solver.

    Each iteration proposes a step of size `h` in logSNR, computes a lower-order and a
    higher-order estimate of the solution, and accepts the higher-order one when the scaled
    difference between the two is at most 1. The step size is re-computed after every
    iteration, whether or not the step was accepted.

    Args:
        x: A pytorch tensor. The initial value at time `t_T`.
        order: A `int`. The (higher) order of the solver. We only support order == 2 or 3.
        t_T: A `float`. The starting time of the sampling (default is T).
        t_0: A `float`. The ending time of the sampling (default is epsilon).
        h_init: A `float`. The initial step size (for logSNR).
        atol: A `float`. The absolute tolerance of the solver. For image data, the default setting is 0.0078, followed [1].
        rtol: A `float`. The relative tolerance of the solver. The default setting is 0.05.
        theta: A `float`. The safety hyperparameter for adapting the step size. The default setting is 0.9, followed [1].
        t_err: A `float`. The tolerance for the time. We solve the diffusion ODE until the absolute error between the
            current time and `t_0` is less than `t_err`. The default setting is 1e-5.
        solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
            The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
    Returns:
        x_0: A pytorch tensor. The approximated solution at time `t_0`.
    Raises:
        ValueError: if `order` is not 2 or 3.

    [1] A. Jolicoeur-Martineau, K. Li, R. Piché-Taillefer, T. Kachman, and I. Mitliagkas, "Gotta go fast when generating data with score-based models," arXiv preprint arXiv:2105.14080, 2021.
    """
    ns = self.noise_schedule
    # `.to(x)` gives the time / step-size tensors the same dtype and device as `x`.
    s = t_T * torch.ones((1,)).to(x)
    lambda_s = ns.marginal_lambda(s)
    lambda_0 = ns.marginal_lambda(t_0 * torch.ones_like(s).to(x))
    h = h_init * torch.ones_like(s).to(x)
    x_prev = x
    nfe = 0
    if order == 2:
        r1 = 0.5
        # Lower estimate: first-order update; it also returns its model evaluation(s),
        # which are forwarded as keyword arguments to the higher-order update.
        lower_update = lambda x, s, t: self.dpm_solver_first_update(x, s, t, return_intermediate=True)
        higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_second_update(
            x, s, t, r1=r1, solver_type=solver_type, **kwargs
        )
    elif order == 3:
        r1, r2 = 1.0 / 3.0, 2.0 / 3.0
        # Lower estimate: second-order singlestep update with the same r1 as the third-order one.
        lower_update = lambda x, s, t: self.singlestep_dpm_solver_second_update(
            x, s, t, r1=r1, return_intermediate=True, solver_type=solver_type
        )
        higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_third_update(
            x, s, t, r1=r1, r2=r2, solver_type=solver_type, **kwargs
        )
    else:
        raise ValueError(f"For adaptive step size solver, order must be 2 or 3, got {order}")
    # Iterate until the current time `s` is within `t_err` of the target time `t_0`.
    while torch.abs((s - t_0)).mean() > t_err:
        t = ns.inverse_lambda(lambda_s + h)
        x_lower, lower_noise_kwargs = lower_update(x, s, t)
        x_higher = higher_update(x, s, t, **lower_noise_kwargs)
        # Element-wise tolerance: the larger of `atol` and `rtol` times the magnitude of
        # the current lower-order estimate / the previously stored one.
        delta = torch.max(torch.ones_like(x).to(x) * atol, rtol * torch.max(torch.abs(x_lower), torch.abs(x_prev)))
        # Root-mean-square over all non-batch elements, one value per batch entry.
        norm_fn = lambda v: torch.sqrt(torch.square(v.reshape((v.shape[0], -1))).mean(dim=-1, keepdim=True))
        # Scaled error between the two estimates; the maximum over the batch is used.
        E = norm_fn((x_higher - x_lower) / delta).max()
        if torch.all(E <= 1.0):
            # Accept the step: advance the state and the current time.
            x = x_higher
            s = t
            x_prev = x_lower
            lambda_s = ns.marginal_lambda(s)
        # New step size from the error estimate, capped at the remaining logSNR distance.
        h = torch.min(theta * h * torch.float_power(E, -1.0 / order).float(), lambda_0 - lambda_s)
        # `nfe` is increased by `order` on every iteration, accepted or rejected.
        nfe += order
    print("adaptive solver nfe", nfe)
    return x
def inverse(
    self,
    x,
    steps=20,
    t_start=None,
    t_end=None,
    order=2,
    skip_type="time_uniform",
    method="multistep",
    lower_order_final=True,
    denoise_to_zero=False,
    solver_type="dpmsolver",
    atol=0.0078,
    rtol=0.05,
    return_intermediate=False,
):
    """
    Inverse the sample `x` from time `t_start` to `t_end` by DPM-Solver.
    For discrete-time DPMs, we use `t_start=1/N`, where `N` is the total time steps during training.

    The defaults are resolved here (`t_start` -> 1 / noise_schedule.total_N,
    `t_end` -> noise_schedule.T) and everything else is forwarded unchanged to `self.sample`.

    Returns:
        Whatever `self.sample` returns for the resolved time range.
    """
    schedule = self.noise_schedule
    if t_start is None:
        t_0 = 1.0 / schedule.total_N
    else:
        t_0 = t_start
    if t_end is None:
        t_T = schedule.T
    else:
        t_T = t_end
    assert (
        t_0 > 0 and t_T > 0
    ), "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"
    sample_kwargs = dict(
        steps=steps,
        t_start=t_0,
        t_end=t_T,
        order=order,
        skip_type=skip_type,
        method=method,
        lower_order_final=lower_order_final,
        denoise_to_zero=denoise_to_zero,
        solver_type=solver_type,
        atol=atol,
        rtol=rtol,
        return_intermediate=return_intermediate,
    )
    return self.sample(x, **sample_kwargs)
We combine all the singlestep solvers with order <= `order` to use up all the function evaluations (steps). The total number of function evaluations (NFE) == `steps`. Given a fixed NFE == `steps`, the sampling procedure is: - If `order` == 1: - Denote K = steps. We use K steps of DPM-Solver-1 (i.e. DDIM). - If `order` == 2: - Denote K = (steps // 2) + (steps % 2). We take K intermediate time steps for sampling. - If steps % 2 == 0, we use K steps of singlestep DPM-Solver-2. - If steps % 2 == 1, we use (K - 1) steps of singlestep DPM-Solver-2 and 1 step of DPM-Solver-1. - If `order` == 3: - Denote K = (steps // 3 + 1). We take K intermediate time steps for sampling. - If steps % 3 == 0, we use (K - 2) steps of singlestep DPM-Solver-3, and 1 step of singlestep DPM-Solver-2 and 1 step of DPM-Solver-1. - If steps % 3 == 1, we use (K - 1) steps of singlestep DPM-Solver-3 and 1 step of DPM-Solver-1. - If steps % 3 == 2, we use (K - 1) steps of singlestep DPM-Solver-3 and 1 step of singlestep DPM-Solver-2. - 'multistep': Multistep DPM-Solver with the order of `order`. The total number of function evaluations (NFE) == `steps`. We initialize the first `order` values by lower order multistep solvers. Given a fixed NFE == `steps`, the sampling procedure is: Denote K = steps. - If `order` == 1: - We use K steps of DPM-Solver-1 (i.e. DDIM). - If `order` == 2: - We firstly use 1 step of DPM-Solver-1, then use (K - 1) step of multistep DPM-Solver-2. - If `order` == 3: - We firstly use 1 step of DPM-Solver-1, then 1 step of multistep DPM-Solver-2, then (K - 2) step of multistep DPM-Solver-3. - 'singlestep_fixed': Fixed order singlestep DPM-Solver (i.e. DPM-Solver-1 or singlestep DPM-Solver-2 or singlestep DPM-Solver-3). We use singlestep DPM-Solver-`order` for `order`=1 or 2 or 3, with total [`steps` // `order`] * `order` NFE. - 'adaptive': Adaptive step size DPM-Solver (i.e. "DPM-Solver-12" and "DPM-Solver-23" in the paper). 
We ignore `steps` and use adaptive step size DPM-Solver with a higher order of `order`. You can adjust the absolute tolerance `atol` and the relative tolerance `rtol` to balance the computatation costs (NFE) and the sample quality. - If `order` == 2, we use DPM-Solver-12 which combines DPM-Solver-1 and singlestep DPM-Solver-2. - If `order` == 3, we use DPM-Solver-23 which combines singlestep DPM-Solver-2 and singlestep DPM-Solver-3. ===================================================== Some advices for choosing the algorithm: - For **unconditional sampling** or **guided sampling with small guidance scale** by DPMs: Use singlestep DPM-Solver or DPM-Solver++ ("DPM-Solver-fast" in the paper) with `order = 3`. e.g., DPM-Solver: >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver") >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=3, skip_type='time_uniform', method='singlestep') e.g., DPM-Solver++: >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++") >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=3, skip_type='time_uniform', method='singlestep') - For **guided sampling with large guidance scale** by DPMs: Use multistep DPM-Solver with `algorithm_type="dpmsolver++"` and `order = 2`. e.g. >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++") >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=2, skip_type='time_uniform', method='multistep') We support three types of `skip_type`: - 'logSNR': uniform logSNR for the time steps. **Recommended for low-resolutional images** - 'time_uniform': uniform time for the time steps. **Recommended for high-resolutional images**. - 'time_quadratic': quadratic time for the time steps. ===================================================== Args: x: A pytorch tensor. The initial value at time `t_start` e.g. 
if `t_start` == T, then `x` is a sample from the standard normal distribution. steps: A `int`. The total number of function evaluations (NFE). t_start: A `float`. The starting time of the sampling. If `T` is None, we use self.noise_schedule.T (default is 1.0). t_end: A `float`. The ending time of the sampling. If `t_end` is None, we use 1. / self.noise_schedule.total_N. e.g. if total_N == 1000, we have `t_end` == 1e-3. For discrete-time DPMs: - We recommend `t_end` == 1. / self.noise_schedule.total_N. For continuous-time DPMs: - We recommend `t_end` == 1e-3 when `steps` <= 15; and `t_end` == 1e-4 when `steps` > 15. order: A `int`. The order of DPM-Solver. skip_type: A `str`. The type for the spacing of the time steps. 'time_uniform' or 'logSNR' or 'time_quadratic'. method: A `str`. The method for sampling. 'singlestep' or 'multistep' or 'singlestep_fixed' or 'adaptive'. denoise_to_zero: A `bool`. Whether to denoise to time 0 at the final step. Default is `False`. If `denoise_to_zero` is `True`, the total NFE is (`steps` + 1). This trick is firstly proposed by DDPM (https://arxiv.org/abs/2006.11239) and score_sde (https://arxiv.org/abs/2011.13456). Such trick can improve the FID for diffusion models sampling by diffusion SDEs for low-resolutional images (such as CIFAR-10). However, we observed that such trick does not matter for high-resolutional images. As it needs an additional NFE, we do not recommend it for high-resolutional images. lower_order_final: A `bool`. Whether to use lower order solvers at the final steps. Only valid for `method=multistep` and `steps < 15`. We empirically find that this trick is a key to stabilizing the sampling by DPM-Solver with very few steps (especially for steps <= 10). So we recommend to set it to be `True`. solver_type: A `str`. The taylor expansion type for the solver. `dpmsolver` or `taylor`. We recommend `dpmsolver`. atol: A `float`. The absolute tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'. 
rtol: A `float`. The relative tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'. return_intermediate: A `bool`. Whether to save the xt at each step. When set to `True`, method returns a tuple (x0, intermediates); when set to False, method returns only x0. Returns: x_end: A pytorch tensor. The approximated solution at time `t_end`. """ t_0 = 1.0 / self.noise_schedule.total_N if t_end is None else t_end t_T = self.noise_schedule.T if t_start is None else t_start assert ( t_0 > 0 and t_T > 0 ), "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array" if return_intermediate: assert method in [ "multistep", "singlestep", "singlestep_fixed", ], "Cannot use adaptive solver when saving intermediate values" if self.correcting_xt_fn is not None: assert method in [ "multistep", "singlestep", "singlestep_fixed", ], "Cannot use adaptive solver when correcting_xt_fn is not None" device = x.device intermediates = [] with torch.no_grad(): if method == "adaptive": x = self.dpm_solver_adaptive( x, order=order, t_T=t_T, t_0=t_0, atol=atol, rtol=rtol, solver_type=solver_type ) elif method == "multistep": assert steps >= order timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, device=device) assert timesteps.shape[0] - 1 == steps # Init the initial values. step = 0 t = timesteps[step] t_prev_list = [t] model_prev_list = [self.model_fn(x, t)] if self.correcting_xt_fn is not None: x = self.correcting_xt_fn(x, t, step) if return_intermediate: intermediates.append(x) # Init the first `order` values by lower order multistep DPM-Solver. 
for step in range(1, order): t = timesteps[step] x = self.multistep_dpm_solver_update( x, model_prev_list, t_prev_list, t, step, solver_type=solver_type ) if self.correcting_xt_fn is not None: x = self.correcting_xt_fn(x, t, step) if return_intermediate: intermediates.append(x) t_prev_list.append(t) model_prev_list.append(self.model_fn(x, t)) # Compute the remaining values by `order`-th order multistep DPM-Solver. progress_fn = tqdm if progress else lambda x: x for step in progress_fn(range(order, steps + 1)): t = timesteps[step] # We only use lower order for steps < 10 if lower_order_final: # recommended by Shuchen Xue step_order = min(order, steps + 1 - step) else: step_order = order x = self.multistep_dpm_solver_update( x, model_prev_list, t_prev_list, t, step_order, solver_type=solver_type ) if self.correcting_xt_fn is not None: x = self.correcting_xt_fn(x, t, step) if return_intermediate: intermediates.append(x) for i in range(order - 1): t_prev_list[i] = t_prev_list[i + 1] model_prev_list[i] = model_prev_list[i + 1] t_prev_list[-1] = t # We do not need to evaluate the final model value. 
def interpolate_fn(x, xp, yp):
    """
    A piecewise linear function y = f(x), using xp and yp as keypoints.
    The implementation only uses differentiable tensor ops (sort / gather / where), and f(x)
    is defined for every x: outside the range of xp the outermost segment is extended linearly.

    Args:
        x: PyTorch tensor with shape [N, C], where N is the batch size, C is the number of channels
            (we use C = 1 for DPM-Solver).
        xp: PyTorch tensor with shape [C, K], where K is the number of keypoints.
        yp: PyTorch tensor with shape [C, K].
    Returns:
        The function values f(x), with shape [N, C].
    """
    batch, num_keys = x.shape[0], xp.shape[1]
    device = x.device

    # Put every query value in front of its channel's keypoints and sort, so the rank of
    # the query inside the sorted row tells which segment it falls into.
    merged = torch.cat([x.unsqueeze(2), xp.unsqueeze(0).repeat((batch, 1, 1))], dim=2)
    merged_sorted, source_idx = torch.sort(merged, dim=2)
    rank = torch.argmin(source_idx, dim=2)  # position of source index 0, i.e. of the query
    left = rank - 1

    below_all = torch.eq(rank, 0)
    above_all = torch.eq(rank, num_keys)
    # Shared inner selection: clamp to the last segment when the query is past every keypoint.
    clamped_left = torch.where(above_all, torch.tensor(num_keys - 2, device=device), left)

    # Segment end points in the *sorted merged* row (the query itself occupies one slot,
    # so an interior query's right neighbour sits two positions after its left neighbour).
    x_lo_idx = torch.where(below_all, torch.tensor(1, device=device), clamped_left)
    x_hi_idx = torch.where(torch.eq(x_lo_idx, left), x_lo_idx + 2, x_lo_idx + 1)
    x_lo = torch.gather(merged_sorted, dim=2, index=x_lo_idx.unsqueeze(2)).squeeze(2)
    x_hi = torch.gather(merged_sorted, dim=2, index=x_hi_idx.unsqueeze(2)).squeeze(2)

    # Matching segment end points in yp (indexed in keypoint space, without the query).
    y_lo_idx = torch.where(below_all, torch.tensor(0, device=device), clamped_left)
    yp_batched = yp.unsqueeze(0).expand(batch, -1, -1)
    y_lo = torch.gather(yp_batched, dim=2, index=y_lo_idx.unsqueeze(2)).squeeze(2)
    y_hi = torch.gather(yp_batched, dim=2, index=(y_lo_idx + 1).unsqueeze(2)).squeeze(2)

    return y_lo + (x - x_lo) * (y_hi - y_lo) / (x_hi - x_lo)
def DPMS(
    model,
    condition,
    uncondition,
    cfg_scale,
    model_type="noise",
    noise_schedule="linear",
    guidance_type="classifier-free",
    model_kwargs=None,
    diffusion_steps=1000,
):
    """
    Build a DPM-Solver++ sampler around a discrete-time diffusion `model`.

    Args:
        model: The diffusion model, handed to `model_wrapper`.
        condition: Passed to `model_wrapper` as `condition`.
        uncondition: Passed to `model_wrapper` as `unconditional_condition`.
        cfg_scale: Passed to `model_wrapper` as `guidance_scale`.
        model_type: Passed to `model_wrapper` as `model_type` (default "noise").
        noise_schedule: Name of the beta schedule given to `get_named_beta_schedule`.
        guidance_type: Passed to `model_wrapper` as `guidance_type`.
        model_kwargs: Extra keyword arguments for the model; None means an empty dict.
        diffusion_steps: Number of diffusion steps given to `get_named_beta_schedule`.
    Returns:
        A `DPM_Solver` constructed with `algorithm_type="dpmsolver++"`.
    """
    extra_kwargs = {} if model_kwargs is None else model_kwargs

    # 1. Discrete-time VP noise schedule built from the named beta schedule.
    betas = torch.tensor(get_named_beta_schedule(noise_schedule, diffusion_steps))
    schedule = NoiseScheduleVP(schedule="discrete", betas=betas)

    # 2. Wrap the discrete-time model as a continuous-time model function with guidance.
    wrapped_model = model_wrapper(
        model,
        schedule,
        model_type=model_type,
        model_kwargs=extra_kwargs,
        guidance_type=guidance_type,
        condition=condition,
        unconditional_condition=uncondition,
        guidance_scale=cfg_scale,
    )

    # 3. Hand both to the solver.
    return DPM_Solver(wrapped_model, schedule, algorithm_type="dpmsolver++")
def forward_with_cfg(model, x, timestep, y, cfg_scale, cfg_channel=None, **kwargs):
    """
    Run `model` on a doubled batch and apply classifier-free guidance.

    The first half of `x` is duplicated so that both halves share the same
    input; the first half of the model output is treated as the conditional
    prediction and the second half as the unconditional one.

    :param model: object exposing `forward(x, timestep, y, **kwargs)`; it may
                  return a tensor or a dict holding the tensor under "x".
    :param x: input batch; only its first half is used.
    :param timestep: passed through to the model.
    :param y: conditioning passed through to the model.
    :param cfg_scale: guidance scale applied to (cond - uncond).
    :param cfg_channel: number of leading channels that receive guidance;
                        defaults to half of the model's output channels.
    :param kwargs: extra model arguments. A non-None "x_mask" whose length
                   differs from `len(x)` is concatenated with itself.
    :return: the guided channels (same values in both batch halves)
             concatenated with the untouched remaining channels.
    """
    # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
    batch = len(x)
    first_half = x[: batch // 2]
    doubled = torch.cat([first_half, first_half], dim=0)

    x_mask = kwargs.get("x_mask")
    if x_mask is not None and len(x_mask) != batch:
        kwargs["x_mask"] = torch.cat([x_mask, x_mask], dim=0)

    out = model.forward(doubled, timestep, y, **kwargs)
    if isinstance(out, dict):
        out = out["x"]

    if cfg_channel is None:
        cfg_channel = out.shape[1] // 2
    guided = out[:, :cfg_channel]
    passthrough = out[:, cfg_channel:]

    cond_eps, uncond_eps = torch.split(guided, len(guided) // 2, dim=0)
    mixed = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
    guided = torch.cat([mixed, mixed], dim=0)
    return torch.cat([guided, passthrough], dim=1)
def approx_standard_normal_cdf(x):
    """
    Fast tanh-based approximation of the standard normal cumulative
    distribution function, evaluated element-wise on the tensor `x`.
    """
    scale = np.sqrt(2.0 / torch.pi)
    cubic_term = x + 0.044715 * torch.pow(x, 3)
    return 0.5 * (1.0 + torch.tanh(scale * cubic_term))
def mean_flat(tensor: torch.Tensor, mask=None):
    """
    Average `tensor` over every dimension except the batch dimension.

    When `mask` is given, `tensor` must be 5-D ("b c t h w") and `mask` must
    have `tensor.shape[2]` entries along its second dimension. Each frame is
    weighted by its mask value and the sum is divided by
    (sum of mask per batch element) * (c * h * w).

    :return: a tensor with one value per batch element.
    """
    if mask is not None:
        assert tensor.dim() == 5
        assert tensor.shape[2] == mask.shape[1]
        per_frame = rearrange(tensor, "b c t h w -> b t (c h w)")
        denom = mask.sum(dim=1) * per_frame.shape[-1]
        weighted = per_frame * mask.unsqueeze(2)
        return weighted.sum(dim=1).sum(dim=1) / denom

    non_batch_dims = list(range(1, len(tensor.shape)))
    return tensor.mean(dim=non_batch_dims)
def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
    """
    Get a pre-defined beta schedule for the given name.

    The beta schedule library consists of beta schedules which remain similar
    in the limit of num_diffusion_timesteps.
    Beta schedules may be added, but should not be removed or changed once
    they are committed to maintain backwards compatibility.

    :param schedule_name: "linear" or "squaredcos_cap_v2".
    :param num_diffusion_timesteps: the number of betas to produce.
    :return: the result of get_beta_schedule() ("linear") or
             betas_for_alpha_bar() ("squaredcos_cap_v2").
    :raises NotImplementedError: for any other schedule name.
    """
    # Function-scope import: the cosine schedule needs `math`, which this
    # module's import block does not provide. The previous code referenced an
    # undefined name (`matorch`) here and raised NameError.
    import math

    if schedule_name == "linear":
        # Linear schedule from Ho et al, extended to work for any number of
        # diffusion steps.
        scale = 1000 / num_diffusion_timesteps
        return get_beta_schedule(
            "linear",
            beta_start=scale * 0.0001,
            beta_end=scale * 0.02,
            num_diffusion_timesteps=num_diffusion_timesteps,
        )
    if schedule_name == "squaredcos_cap_v2":
        return betas_for_alpha_bar(
            num_diffusion_timesteps,
            lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
        )
    raise NotImplementedError(f"unknown beta schedule: {schedule_name}")
self.betas = betas.to(self.device) assert len(self.betas.shape) == 1, "betas must be 1-D" assert (self.betas > 0).all() and (self.betas <= 1).all() self.num_timesteps = int(betas.shape[0]) alphas = 1.0 - self.betas self.alphas_cumprod = torch.cumprod(alphas, axis=0) self.alphas_cumprod_prev = torch.cat([torch.tensor([1.0], device=self.device), self.alphas_cumprod[:-1]]) self.alphas_cumprod_next = torch.cat([self.alphas_cumprod[1:], torch.tensor([0.0], device=self.device)]) assert self.alphas_cumprod_prev.shape == (self.num_timesteps,) # calculations for diffusion q(x_t | x_{t-1}) and others self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod) self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - self.alphas_cumprod) self.log_one_minus_alphas_cumprod = torch.log(1.0 - self.alphas_cumprod) self.sqrt_recip_alphas_cumprod = torch.sqrt(1.0 / self.alphas_cumprod) self.sqrt_recipm1_alphas_cumprod = torch.sqrt(1.0 / self.alphas_cumprod - 1) # calculations for posterior q(x_{t-1} | x_t, x_0) self.posterior_variance = self.betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod) # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain self.posterior_log_variance_clipped = ( torch.log(torch.cat([self.posterior_variance[1].unsqueeze(0), self.posterior_variance[1:]])) if len(self.posterior_variance) > 1 else torch.DoubleTensor([]) ) self.posterior_mean_coef1 = self.betas * torch.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod) self.posterior_mean_coef2 = (1.0 - self.alphas_cumprod_prev) * torch.sqrt(alphas) / (1.0 - self.alphas_cumprod) def q_mean_variance(self, x_start, t): """ Get the distribution q(x_t | x_0). :param x_start: the [N x C x ...] tensor of noiseless inputs. :param t: the number of diffusion steps (minus 1). Here, 0 means one step. :return: A tuple (mean, variance, log_variance), all of x_start's shape. 
def p_mean_variance(self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None):
    """
    Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
    the initial x, x_0.

    :param model: the model, which takes a signal and a batch of timesteps
                  as input. It may return a tensor or a (tensor, extra) tuple.
    :param x: the [N x C x ...] tensor at time t.
    :param t: a 1-D Tensor of timesteps.
    :param clip_denoised: if True, clip the denoised signal into [-1, 1].
    :param denoised_fn: if not None, a function which applies to the
        x_start prediction before it is used to sample. Applies before
        clip_denoised.
    :param model_kwargs: if not None, a dict of extra keyword arguments to
        pass to the model. This can be used for conditioning.
    :return: a dict with the following keys:
             - 'mean': the model mean output.
             - 'variance': the model variance output.
             - 'log_variance': the log of 'variance'.
             - 'pred_xstart': the prediction for x_0.
             - 'extra': the second element of a tuple model output, else None.
    """
    if model_kwargs is None:
        model_kwargs = {}

    B, C = x.shape[:2]
    assert t.shape == (B,)
    model_output = model(x, t, **model_kwargs)
    if isinstance(model_output, tuple):
        model_output, extra = model_output
    else:
        extra = None

    if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
        assert model_output.shape == (B, C * 2, *x.shape[2:])
        model_output, model_var_values = torch.split(model_output, C, dim=1)
        min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape)
        max_log = _extract_into_tensor(torch.log(self.betas), t, x.shape)
        # The model_var_values is [-1, 1] for [min_var, max_var].
        frac = (model_var_values + 1) / 2
        model_log_variance = frac * max_log + (1 - frac) * min_log
        model_variance = torch.exp(model_log_variance)
    else:
        # for fixedlarge, we set the initial (log-)variance like so
        # to get a better decoder log likelihood.
        # torch.cat expects a *sequence* of tensors; passing the two tensors
        # positionally (as before) raised TypeError for both fixed variance
        # types, because the dict below is built eagerly.
        fixed_large_variance = torch.cat([self.posterior_variance[1].unsqueeze(0), self.betas[1:]])
        model_variance, model_log_variance = {
            ModelVarType.FIXED_LARGE: (
                fixed_large_variance,
                torch.log(fixed_large_variance),
            ),
            ModelVarType.FIXED_SMALL: (
                self.posterior_variance,
                self.posterior_log_variance_clipped,
            ),
        }[self.model_var_type]
        model_variance = _extract_into_tensor(model_variance, t, x.shape)
        model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)

    def process_xstart(x):
        if denoised_fn is not None:
            x = denoised_fn(x)
        if clip_denoised:
            return x.clamp(-1, 1)
        return x

    if self.model_mean_type == ModelMeanType.START_X:
        pred_xstart = process_xstart(model_output)
    else:
        # Any other mean type is interpreted as an epsilon prediction.
        pred_xstart = process_xstart(self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output))
    model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t)

    assert model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
    return {
        "mean": model_mean,
        "variance": model_variance,
        "log_variance": model_log_variance,
        "pred_xstart": pred_xstart,
        "extra": extra,
    }
def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
    """
    Compute what the p_mean_variance output would have been, should the
    model's score function be conditioned by cond_fn.

    See condition_mean() for details on cond_fn.

    Unlike condition_mean(), this instead uses the conditioning strategy
    from Song et al (2020).

    :param cond_fn: callable invoked as cond_fn(x, t, **model_kwargs).
    :param p_mean_var: dict as returned by p_mean_variance(); it is copied,
                       not modified.
    :param x: the tensor at time t.
    :param t: a 1-D Tensor of timesteps.
    :param model_kwargs: if not None, a dict of extra keyword arguments
                         forwarded to cond_fn.
    :return: a shallow copy of p_mean_var with 'pred_xstart' and 'mean'
             recomputed from the conditioned epsilon.
    """
    # The default of None cannot be unpacked with `**`; treat it as "no extras".
    if model_kwargs is None:
        model_kwargs = {}

    alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)

    eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
    eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, t, **model_kwargs)

    out = p_mean_var.copy()
    out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
    out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t)
    return out
def p_sample_loop(
    self,
    model,
    shape,
    noise=None,
    clip_denoised=True,
    denoised_fn=None,
    cond_fn=None,
    model_kwargs=None,
    device=None,
    progress=False,
    mask=None,
):
    """
    Generate samples from the model.

    Drives p_sample_loop_progressive() to exhaustion and keeps only the last
    yielded step.

    :param model: the model module.
    :param shape: the shape of the samples, (N, C, H, W).
    :param noise: if specified, the noise from the encoder to sample.
                  Should be of the same shape as `shape`.
    :param clip_denoised: if True, clip x_start predictions to [-1, 1].
    :param denoised_fn: if not None, a function which applies to the
        x_start prediction before it is used to sample.
    :param cond_fn: if not None, this is a gradient function that acts
                    similarly to the model.
    :param model_kwargs: if not None, a dict of extra keyword arguments to
        pass to the model. This can be used for conditioning.
    :param device: if specified, the device to create the samples on.
                   If not specified, use a model parameter's device.
    :param progress: if True, show a tqdm progress bar.
    :param mask: forwarded unchanged to p_sample_loop_progressive().
    :return: the "sample" entry of the final step.
    """
    steps = self.p_sample_loop_progressive(
        model,
        shape,
        noise=noise,
        clip_denoised=clip_denoised,
        denoised_fn=denoised_fn,
        cond_fn=cond_fn,
        model_kwargs=model_kwargs,
        device=device,
        progress=progress,
        mask=mask,
    )
    last_step = None
    for last_step in steps:
        pass
    return last_step["sample"]
def ddim_reverse_sample(
    self,
    model,
    x,
    t,
    clip_denoised=True,
    denoised_fn=None,
    cond_fn=None,
    model_kwargs=None,
    eta=0.0,
):
    """
    Sample x_{t+1} from the model using DDIM reverse ODE.

    Only the deterministic path (eta == 0.0) is accepted.

    :return: a dict with 'sample' (the predicted x_{t+1}) and 'pred_xstart'.
    """
    assert eta == 0.0, "Reverse ODE only for deterministic path"
    prediction = self.p_mean_variance(
        model,
        x,
        t,
        clip_denoised=clip_denoised,
        denoised_fn=denoised_fn,
        model_kwargs=model_kwargs,
    )
    if cond_fn is not None:
        prediction = self.condition_score(cond_fn, prediction, x, t, model_kwargs=model_kwargs)
    x0_hat = prediction["pred_xstart"]

    # Usually our model outputs epsilon, but we re-derive it
    # in case we used x_start or x_prev prediction.
    recip = _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape)
    numerator = recip * x - x0_hat
    eps = numerator / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
    alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)

    # Equation 12. reversed
    next_mean = x0_hat * torch.sqrt(alpha_bar_next) + torch.sqrt(1 - alpha_bar_next) * eps

    return {"sample": next_mean, "pred_xstart": x0_hat}
def training_losses(self, model, x_start, model_kwargs=None, noise=None, mask=None, weights=None, t=None):
    """
    Compute training losses for a single timestep.

    :param model: the model to evaluate loss on.
    :param x_start: the [N x C x ...] tensor of inputs.
    :param model_kwargs: if not None, a dict of extra keyword arguments to
        pass to the model. This can be used for conditioning.
    :param noise: if specified, the specific Gaussian noise to try to remove.
    :param mask: if not None, a boolean tensor indexed as
        mask[:, None, :, None, None]; where it is True the sample noised at
        timestep t is used, elsewhere the sample noised at timestep 0.
        Not supported for the KL loss types.
        NOTE(review): presumably shaped [N x T] for 5-D inputs - confirm.
    :param weights: if not None, per-timestep weights looked up at t and
        multiplied into the squared error of the MSE term.
    :param t: if not None, a 1-D tensor of N timesteps to use; otherwise
        timesteps are drawn uniformly from [0, num_timesteps).
    :return: a dict with the key "loss" containing a tensor of shape [N].
             Some mean or variance settings may also have other keys.
    """
    # Only sample timesteps when the caller did not provide them; previously
    # a supplied `t` was silently overwritten.
    if t is None:
        t = torch.randint(0, self.num_timesteps, (x_start.shape[0],), device=x_start.device)

    if model_kwargs is None:
        model_kwargs = {}
    if noise is None:
        noise = torch.randn_like(x_start)
    x_t = self.q_sample(x_start, t, noise=noise)
    if mask is not None:
        t0 = torch.zeros_like(t)
        x_t0 = self.q_sample(x_start, t0, noise=noise)
        x_t = torch.where(mask[:, None, :, None, None], x_t, x_t0)

    terms = {}

    if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
        assert mask is None, "mask not supported for KL loss"
        terms["loss"] = self._vb_terms_bpd(
            model=model,
            x_start=x_start,
            x_t=x_t,
            t=t,
            clip_denoised=False,
            model_kwargs=model_kwargs,
        )["output"]
        if self.loss_type == LossType.RESCALED_KL:
            terms["loss"] *= self.num_timesteps
    elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
        model_output = model(x_t, t, **model_kwargs)

        if self.model_var_type in [
            ModelVarType.LEARNED,
            ModelVarType.LEARNED_RANGE,
        ]:
            B, C = x_t.shape[:2]
            assert model_output.shape == (B, C * 2, *x_t.shape[2:])
            model_output, model_var_values = torch.split(model_output, C, dim=1)
            # Learn the variance using the variational bound, but don't let
            # it affect our mean prediction.
            frozen_out = torch.cat([model_output.detach(), model_var_values], dim=1)
            terms["vb"] = self._vb_terms_bpd(
                # Stand-in "model" that returns the already computed output.
                model=lambda *args, r=frozen_out: r,
                x_start=x_start,
                x_t=x_t,
                t=t,
                clip_denoised=False,
                mask=mask,
            )["output"]
            if self.loss_type == LossType.RESCALED_MSE:
                # Divide by 1000 for equivalence with initial implementation.
                # Without a factor of 1/1000, the VB term hurts the MSE term.
                terms["vb"] *= self.num_timesteps / 1000.0

        target = {
            ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t)[0],
            ModelMeanType.START_X: x_start,
            ModelMeanType.EPSILON: noise,
        }[self.model_mean_type]
        assert model_output.shape == target.shape == x_start.shape
        if weights is None:
            terms["mse"] = mean_flat((target - model_output) ** 2, mask=mask)
        else:
            weight = _extract_into_tensor(weights, t, target.shape)
            terms["mse"] = mean_flat(weight * (target - model_output) ** 2, mask=mask)
        if "vb" in terms:
            terms["loss"] = terms["mse"] + terms["vb"]
        else:
            terms["loss"] = terms["mse"]
    else:
        raise NotImplementedError(self.loss_type)

    return terms
- xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep. - mse: an [N x T] tensor of epsilon MSEs for each timestep. """ device = x_start.device batch_size = x_start.shape[0] vb = [] xstart_mse = [] mse = [] for t in list(range(self.num_timesteps))[::-1]: t_batch = torch.tensor([t] * batch_size, device=device) noise = torch.randn_like(x_start) x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise) # Calculate VLB term at the current timestep with torch.no_grad(): out = self._vb_terms_bpd( model, x_start=x_start, x_t=x_t, t=t_batch, clip_denoised=clip_denoised, model_kwargs=model_kwargs, ) vb.append(out["output"]) xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2)) eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"]) mse.append(mean_flat((eps - noise) ** 2)) vb = torch.stack(vb, dim=1) xstart_mse = torch.stack(xstart_mse, dim=1) mse = torch.stack(mse, dim=1) prior_bpd = self._prior_bpd(x_start) total_bpd = vb.sum(dim=1) + prior_bpd return { "total_bpd": total_bpd, "prior_bpd": prior_bpd, "vb": vb, "xstart_mse": xstart_mse, "mse": mse, } def _extract_into_tensor(arr: torch.Tensor, timesteps: torch.Tensor, broadcast_shape: List[int]): """ Extract values from a 1-D numpy array for a batch of indices. :param arr: the 1-D numpy array. :param timesteps: a tensor of indices into the array to extract. :param broadcast_shape: a larger shape of K dimensions with the batch dimension equal to the length of timesteps. :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims. 
def space_timesteps(num_timesteps, section_counts):
    """
    Create a list of timesteps to use from an original diffusion process,
    given the number of timesteps we want to take from equally-sized portions
    of the original process.

    For example, if there's 300 timesteps and the section counts are [10,15,20]
    then the first 100 timesteps are strided to be 10 timesteps, the second 100
    are strided to be 15 timesteps, and the final 100 are strided to be 20.

    If the stride is a string starting with "ddim", then the fixed striding
    from the DDIM paper is used, and only one section is allowed.

    :param num_timesteps: the number of diffusion steps in the original
                          process to divide up.
    :param section_counts: either a list of numbers, or a string containing
                           comma-separated numbers, indicating the step count
                           per section. As a special case, use "ddimN" where N
                           is a number of steps to use the striding from the
                           DDIM paper.
    :return: a set of diffusion steps from the original process to use.
    :raises ValueError: if no integer stride yields exactly N steps for
                        "ddimN", or if a section has fewer steps than the
                        count requested from it.
    """
    if isinstance(section_counts, str):
        if section_counts.startswith("ddim"):
            desired_count = int(section_counts[len("ddim") :])
            for i in range(1, num_timesteps):
                if len(range(0, num_timesteps, i)) == desired_count:
                    return set(range(0, num_timesteps, i))
            # Report the count that was requested, not the size of the
            # original process.
            raise ValueError(f"cannot create exactly {desired_count} steps with an integer stride")
        section_counts = [int(x) for x in section_counts.split(",")]

    # Split the original process into len(section_counts) contiguous sections;
    # the first `extra` sections absorb the remainder, one step each.
    size_per = num_timesteps // len(section_counts)
    extra = num_timesteps % len(section_counts)
    start_idx = 0
    all_steps = []
    for i, section_count in enumerate(section_counts):
        size = size_per + (1 if i < extra else 0)
        if size < section_count:
            raise ValueError(f"cannot divide section of {size} steps into {section_count}")
        if section_count <= 1:
            frac_stride = 1
        else:
            # Fractional stride so the first and last step of the section
            # are both selected.
            frac_stride = (size - 1) / (section_count - 1)
        cur_idx = 0.0
        taken_steps = []
        for _ in range(section_count):
            taken_steps.append(start_idx + round(cur_idx))
            cur_idx += frac_stride
        all_steps += taken_steps
        start_idx += size
    return set(all_steps)
def __init__(self, use_timesteps, **kwargs):
    """
    Build a diffusion process restricted to the retained timesteps.

    :param use_timesteps: a collection of timestep indices from the original
                          process to keep.
    :param kwargs: keyword arguments for the base diffusion process; must
                   contain "betas", which is replaced by the respaced betas
                   before the parent constructor runs.
    """
    self.use_timesteps = set(use_timesteps)
    self.timestep_map = []
    self.original_num_steps = len(kwargs["betas"])

    # The full-length process is only needed for its cumulative alphas.
    base_diffusion = GaussianDiffusion(**kwargs)  # pylint: disable=missing-kwoa

    respaced_betas = []
    prev_alpha_bar = 1.0
    for step, alpha_bar in enumerate(base_diffusion.alphas_cumprod):
        if step not in self.use_timesteps:
            continue
        # Beta that carries the cumulative alpha from the previously retained
        # step to this one.
        respaced_betas.append(1 - alpha_bar / prev_alpha_bar)
        prev_alpha_bar = alpha_bar
        self.timestep_map.append(step)

    kwargs["betas"] = torch.FloatTensor(respaced_betas)
    super().__init__(**kwargs)
    # Tensor form of the new-index -> original-index map, used when wrapping models.
    self.map_tensor = torch.tensor(self.timestep_map, device=get_current_device())
@SCHEDULERS.register_module("iddpm-speed")
class SpeeDiffusion(SpacedDiffusion):
    """
    Training-only spaced diffusion that samples timesteps from a non-uniform
    distribution and applies per-timestep loss weights.
    """

    def __init__(
        self,
        num_sampling_steps=None,
        timestep_respacing=None,
        noise_schedule="linear",
        use_kl=False,
        sigma_small=False,
        predict_xstart=False,
        learn_sigma=True,
        rescale_learned_sigmas=False,
        diffusion_steps=1000,
        cfg_scale=4.0,
    ):
        """
        :param num_sampling_steps: if given, used as the respacing (as a
            string); mutually exclusive with timestep_respacing.
        :param timestep_respacing: section counts passed to space_timesteps;
            None or "" keeps all diffusion_steps.
        :param noise_schedule: name of the beta schedule.
        :param use_kl: select the rescaled KL loss.
        :param sigma_small: with learn_sigma False, use the small fixed variance.
        :param predict_xstart: predict x_0 instead of epsilon.
        :param learn_sigma: use the learned-range variance type.
        :param rescale_learned_sigmas: select the rescaled MSE loss (when
            use_kl is False).
        :param diffusion_steps: length of the original process.
        :param cfg_scale: stored on the instance as self.cfg_scale.
        """
        betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps)
        if use_kl:
            loss_type = gd.LossType.RESCALED_KL
        elif rescale_learned_sigmas:
            loss_type = gd.LossType.RESCALED_MSE
        else:
            loss_type = gd.LossType.MSE
        if num_sampling_steps is not None:
            assert timestep_respacing is None
            timestep_respacing = str(num_sampling_steps)
        if timestep_respacing is None or timestep_respacing == "":
            timestep_respacing = [diffusion_steps]
        super().__init__(
            use_timesteps=space_timesteps(diffusion_steps, timestep_respacing),
            betas=betas,
            model_mean_type=(gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X),
            model_var_type=(
                (gd.ModelVarType.FIXED_LARGE if not sigma_small else gd.ModelVarType.FIXED_SMALL)
                if not learn_sigma
                else gd.ModelVarType.LEARNED_RANGE
            ),
            loss_type=loss_type,
        )

        self.cfg_scale = cfg_scale
        # we fallback to numpy here as argmax_cuda is not implemented for Bool
        grad = np.gradient(self.sqrt_one_minus_alphas_cumprod.cpu())
        # One past the first index where the gradient drops below 5e-5.
        self.meaningful_steps = np.argmax(grad < 5e-5) + 1

        # p2 weighting from: Perception Prioritized Training of Diffusion Models
        self.p2_gamma = 1
        self.p2_k = 1
        self.snr = 1.0 / (1 - self.alphas_cumprod) - 1
        sqrt_one_minus_alphas_bar = self.sqrt_one_minus_alphas_cumprod
        p = torch.tanh(1e6 * (torch.gradient(sqrt_one_minus_alphas_bar)[0] - 1e-4)) + 1.5
        # L1-normalised so it can be used as a sampling distribution over timesteps.
        self.p = F.normalize(p, p=1, dim=0)
        self.weights = 1 / (self.p2_k + self.snr) ** self.p2_gamma

    def t_sample(self, n, device):
        """
        Draw n timesteps: roughly half from self.p, the rest as their
        reflections about self.meaningful_steps.

        :param n: number of timesteps to return.
        :param device: device the result is moved to.
        :return: a tensor of n timestep indices.
        """
        t = torch.multinomial(self.p, n // 2 + 1, replacement=True).to(device)
        dual_t = torch.where(t < self.meaningful_steps, self.meaningful_steps - t, t - self.meaningful_steps)
        t = torch.cat([t, dual_t], dim=0)[:n]
        return t

    def training_losses(self, model, x, *args, **kwargs):  # pylint: disable=signature-differs
        """
        Compute training losses with timesteps from t_sample and the p2 weights.

        :param model: the model to evaluate loss on.
        :param x: the batch of inputs.
        :param args: extra positional arguments forwarded unchanged.
        :param kwargs: extra keyword arguments forwarded unchanged.
        :return: whatever the parent training_losses returns.
        """
        t = self.t_sample(x.shape[0], x.device)
        # Pass `t` by keyword: the base signature is
        # (model, x_start, model_kwargs=None, noise=None, mask=None, weights=None, t=None),
        # so a positional `t` would bind to `model_kwargs` and shift the
        # caller's own positional arguments by one slot.
        return super().training_losses(model, x, *args, t=t, weights=self.weights, **kwargs)

    def sample(self, *args, **kwargs):
        """Sampling is unsupported; this scheduler is for training only."""
        raise NotImplementedError("SpeeDiffusion is only for training")
class ScheduleSampler(ABC):
    """
    A distribution over timesteps in the diffusion process, intended to reduce
    variance of the objective.

    By default, samplers perform unbiased importance sampling, in which the
    objective's mean is unchanged.
    However, subclasses may override sample() to change how the resampled
    terms are reweighted, allowing for actual changes in the objective.
    """

    @abstractmethod
    def weights(self):
        """
        Get a numpy array of weights, one per diffusion step.

        The weights needn't be normalized, but must be positive.
        """

    def sample(self, batch_size, device):
        """
        Importance-sample timesteps for a batch.

        :param batch_size: the number of timesteps.
        :param device: the torch device to save to.
        :return: a tuple (timesteps, weights):
                 - timesteps: a tensor of timestep indices.
                 - weights: a tensor of weights to scale the resulting losses.
        """
        raw = self.weights()
        probs = raw / np.sum(raw)
        num_steps = len(probs)

        picked = np.random.choice(num_steps, size=(batch_size,), p=probs)
        # Inverse-probability weights, scaled so uniform sampling gives 1.
        importance = 1 / (num_steps * probs[picked])

        timesteps = th.from_numpy(picked).long().to(device)
        scale = th.from_numpy(importance).float().to(device)
        return timesteps, scale
class LossSecondMomentResampler(LossAwareSampler):
    """
    Timestep sampler whose weights follow the root-mean-square of the most
    recent losses recorded for each timestep.
    """

    def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
        """
        :param diffusion: the diffusion object; its num_timesteps sizes the history.
        :param history_per_term: how many recent losses are kept per timestep.
        :param uniform_prob: probability mass mixed in uniformly over all timesteps.
        """
        self.diffusion = diffusion
        self.history_per_term = history_per_term
        self.uniform_prob = uniform_prob
        self._loss_history = np.zeros([diffusion.num_timesteps, history_per_term], dtype=np.float64)
        # `np.int` was only an alias of the builtin `int` and was removed in
        # NumPy 1.24; the builtin gives the same dtype on every NumPy version.
        self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=int)

    def weights(self):
        """
        Return one sampling weight per timestep.

        Until every timestep has a full history the weights are all ones.
        Afterwards they are the normalised RMS losses, blended with a
        uniform component of total mass uniform_prob.
        """
        if not self._warmed_up():
            return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
        weights = np.sqrt(np.mean(self._loss_history**2, axis=-1))
        weights /= np.sum(weights)
        weights *= 1 - self.uniform_prob
        weights += self.uniform_prob / len(weights)
        return weights

    def update_with_all_losses(self, ts, losses):
        """
        Record new losses in the per-timestep history.

        :param ts: a list of int timesteps.
        :param losses: a list of float losses, one per timestep.
        """
        for t, loss in zip(ts, losses):
            if self._loss_counts[t] == self.history_per_term:
                # Shift out the oldest loss term.
                self._loss_history[t, :-1] = self._loss_history[t, 1:]
                self._loss_history[t, -1] = loss
            else:
                self._loss_history[t, self._loss_counts[t]] = loss
                self._loss_counts[t] += 1

    def _warmed_up(self):
        """Return True once every timestep has history_per_term recorded losses."""
        return (self._loss_counts == self.history_per_term).all()
z.shape[0], device=device) for t in timesteps] if self.use_timestep_transform: timesteps = [timestep_transform(t, additional_args, num_timesteps=self.num_timesteps) for t in timesteps] if mask is not None: noise_added = torch.zeros_like(mask, dtype=torch.bool) noise_added = noise_added | (mask == 1) cache_dic_cal_flops, current_cal_flops = cache_init(model_kwargs=model_args, num_steps=self.num_sampling_steps) cache_dic, current = cache_init(model_kwargs=model_args, num_steps=self.num_sampling_steps) flops_sum = 0 cal_flops = False if cal_flops: from calflops import calculate_flops progress_wrap = tqdm if progress else (lambda x: x) for i, t in progress_wrap(enumerate(timesteps)): current['step'] = i current_cal_flops['step'] = i # mask for adding noise if mask is not None: mask_t = mask * self.num_timesteps x0 = z.clone() x_noise = self.scheduler.add_noise(x0, torch.randn_like(x0), t) mask_t_upper = mask_t >= t.unsqueeze(1) model_args["x_mask"] = mask_t_upper.repeat(2, 1) mask_add_noise = mask_t_upper & ~noise_added z = torch.where(mask_add_noise[:, None, :, None, None], x_noise, x0) noise_added = mask_t_upper # classifier-free guidance z_in = torch.cat([z, z], 0) t = torch.cat([t, t], 0) if cal_flops: flop_kwargs = model_args.copy() flop_kwargs['x'] = z_in.clone() flop_kwargs['timestep'] = t.clone() flop_kwargs['cache_dic'] = cache_dic_cal_flops flop_kwargs['current'] = current_cal_flops flops, macs, params = calculate_flops(model=model, kwargs = flop_kwargs, print_results=False) # 将字符串转换为浮点数 #flops = float(re.findall(r"[-+]?\d*\.\d+|\d+", flops)[0]) match = re.findall(r"([-+]?\d*\.\d+|\d+)\s*([GMTP]?)FLOPS", flops) flops_value = float(match[0][0]) # 提取数值部分 unit = match[0][1] # 提取量级部分,如 G 或 T if unit == 'G': flops = flops_value * 0.001 else: flops = flops_value flops_sum += flops else: pred = model(z_in, t, cache_dic=cache_dic, current=current, **model_args).chunk(2, dim=1)[0] pred_cond, pred_uncond = pred.chunk(2, dim=0) v_pred = pred_uncond + guidance_scale * 
class RFlowScheduler:
    """
    Rectified-flow scheduler: samples training timesteps, linearly mixes data
    with noise, and computes the velocity-prediction loss.
    """

    def __init__(
        self,
        num_timesteps=1000,
        num_sampling_steps=10,
        use_discrete_timesteps=False,
        sample_method="uniform",
        loc=0.0,
        scale=1.0,
        use_timestep_transform=False,
        transform_scale=1.0,
    ):
        """
        :param num_timesteps: scale of the timestep axis.
        :param num_sampling_steps: stored on the instance; not used in this class.
        :param use_discrete_timesteps: draw integer timesteps with randint.
        :param sample_method: "uniform" or "logit-normal".
        :param loc: location of the logit-normal distribution.
        :param scale: scale of the logit-normal distribution.
        :param use_timestep_transform: apply timestep_transform to sampled timesteps.
        :param transform_scale: scale forwarded to timestep_transform.
        """
        self.num_timesteps = num_timesteps
        self.num_sampling_steps = num_sampling_steps
        self.use_discrete_timesteps = use_discrete_timesteps

        # sample method
        assert sample_method in ("uniform", "logit-normal")
        assert not (
            use_discrete_timesteps and sample_method != "uniform"
        ), "Only uniform sampling is supported for discrete timesteps"
        self.sample_method = sample_method
        if sample_method == "logit-normal":
            self.distribution = LogisticNormal(torch.tensor([loc]), torch.tensor([scale]))
            # One draw per batch element; keep the first component of each sample.
            self.sample_t = lambda x: self.distribution.sample((x.shape[0],))[:, 0].to(x.device)

        # timestep transform
        self.use_timestep_transform = use_timestep_transform
        self.transform_scale = transform_scale

    def training_losses(self, model, x_start, model_kwargs=None, noise=None, mask=None, weights=None, t=None):
        """
        Compute the rectified-flow training loss for one batch.

        The argument layout mirrors
        opensora/schedulers/iddpm/gaussian_diffusion.py/training_losses.
        Note: t is int tensor and should be rescaled from [0, num_timesteps-1] to [1,0]

        :return: a dict with the single key "loss".
        """
        if t is None:
            n_batch = x_start.shape[0]
            if self.use_discrete_timesteps:
                t = torch.randint(0, self.num_timesteps, (n_batch,), device=x_start.device)
            elif self.sample_method == "uniform":
                t = torch.rand((n_batch,), device=x_start.device) * self.num_timesteps
            elif self.sample_method == "logit-normal":
                t = self.sample_t(x_start) * self.num_timesteps

            # NOTE(review): the transform is applied only to timesteps sampled
            # here, not to caller-supplied ones - confirm against the
            # un-collapsed source.
            if self.use_timestep_transform:
                t = timestep_transform(t, model_kwargs, scale=self.transform_scale, num_timesteps=self.num_timesteps)

        if model_kwargs is None:
            model_kwargs = {}
        if noise is None:
            noise = torch.randn_like(x_start)
        assert noise.shape == x_start.shape

        x_t = self.add_noise(x_start, noise, t)
        if mask is not None:
            # Positions where the mask is False use the sample at timestep 0.
            zero_t = torch.zeros_like(t)
            x_t0 = self.add_noise(x_start, noise, zero_t)
            x_t = torch.where(mask[:, None, :, None, None], x_t, x_t0)

        # Only the first half of the channel dimension is the velocity prediction.
        velocity_pred = model(x_t, t, **model_kwargs).chunk(2, dim=1)[0]
        if weights is None:
            loss = mean_flat((velocity_pred - (x_start - noise)).pow(2), mask=mask)
        else:
            per_step = _extract_into_tensor(weights, t, x_start.shape)
            loss = mean_flat(per_step * (velocity_pred - (x_start - noise)).pow(2), mask=mask)

        return {"loss": loss}

    def add_noise(
        self,
        original_samples: torch.FloatTensor,
        noise: torch.FloatTensor,
        timesteps: torch.IntTensor,
    ) -> torch.FloatTensor:
        """
        compatible with diffusers add_noise()

        Linearly interpolate between the samples and the noise: the sample
        fraction is 1 - timesteps / num_timesteps.
        """
        keep = 1 - timesteps.float() / self.num_timesteps  # [1,1/1000]

        # keep: (bsz,) -> broadcast to noise: (bsz, 4, frame, w, h)
        keep = keep[:, None, None, None, None]
        keep = keep.repeat(1, noise.shape[1], noise.shape[2], noise.shape[3], noise.shape[4])

        return keep * original_samples + (1 - keep) * noise
Open-Sora/opensora/utils/__init__.py ================================================ ================================================ FILE: Open-Sora/opensora/utils/ckpt_utils.py ================================================ import functools import json import operator import os from typing import Tuple import torch import torch.distributed as dist import torch.nn as nn from colossalai.booster import Booster from colossalai.checkpoint_io import GeneralCheckpointIO from torch.optim import Optimizer from torch.optim.lr_scheduler import _LRScheduler from torchvision.datasets.utils import download_url from .misc import get_logger hf_endpoint = os.environ.get("HF_ENDPOINT") if hf_endpoint is None: hf_endpoint = "https://huggingface.co" pretrained_models = { "DiT-XL-2-512x512.pt": "https://dl.fbaipublicfiles.com/DiT/models/DiT-XL-2-512x512.pt", "DiT-XL-2-256x256.pt": "https://dl.fbaipublicfiles.com/DiT/models/DiT-XL-2-256x256.pt", "Latte-XL-2-256x256-ucf101.pt": hf_endpoint + "/maxin-cn/Latte/resolve/main/ucf101.pt", "PixArt-XL-2-256x256.pth": hf_endpoint + "/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-256x256.pth", "PixArt-XL-2-SAM-256x256.pth": hf_endpoint + "/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-SAM-256x256.pth", "PixArt-XL-2-512x512.pth": hf_endpoint + "/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-512x512.pth", "PixArt-XL-2-1024-MS.pth": hf_endpoint + "/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-1024-MS.pth", "OpenSora-v1-16x256x256.pth": hf_endpoint + "/hpcai-tech/Open-Sora/resolve/main/OpenSora-v1-16x256x256.pth", "OpenSora-v1-HQ-16x256x256.pth": hf_endpoint + "/hpcai-tech/Open-Sora/resolve/main/OpenSora-v1-HQ-16x256x256.pth", "OpenSora-v1-HQ-16x512x512.pth": hf_endpoint + "/hpcai-tech/Open-Sora/resolve/main/OpenSora-v1-HQ-16x512x512.pth", "PixArt-Sigma-XL-2-256x256.pth": hf_endpoint + "/PixArt-alpha/PixArt-Sigma/resolve/main/PixArt-Sigma-XL-2-256x256.pth", "PixArt-Sigma-XL-2-512-MS.pth": hf_endpoint + 
def reparameter(ckpt, name=None, model=None):
    """
    Adapt a loaded checkpoint dict so its keys and shapes fit `model`.

    :param ckpt: the loaded checkpoint; it is modified in place, and for some
                 checkpoint names a nested dict ("ema" / "state_dict") is
                 taken from it first.
    :param name: checkpoint name or path; only its basename is matched
                 against the known checkpoint names. Although the default is
                 None, the value is passed straight to os.path.basename.
    :param model: the target model; read for `y_embedder.y_embedding` shapes
                  and for its class name.
    :return: the adapted state dict.
    """
    model_name = name
    name = os.path.basename(name)
    # Log only once when running distributed (rank 0) or when not distributed.
    if not dist.is_initialized() or dist.get_rank() == 0:
        get_logger().info("loading pretrained model: %s", model_name)
    if name in ["DiT-XL-2-512x512.pt", "DiT-XL-2-256x256.pt"]:
        # Insert a singleton dimension at index 2 of the patch-embedding weight
        # (presumably to fit a 3D projection - TODO confirm against the model).
        ckpt["x_embedder.proj.weight"] = ckpt["x_embedder.proj.weight"].unsqueeze(2)
        del ckpt["pos_embed"]
    if name in ["Latte-XL-2-256x256-ucf101.pt"]:
        # This checkpoint stores the weights under the "ema" key.
        ckpt = ckpt["ema"]
        ckpt["x_embedder.proj.weight"] = ckpt["x_embedder.proj.weight"].unsqueeze(2)
        del ckpt["pos_embed"]
        del ckpt["temp_embed"]
    if name in [
        "PixArt-XL-2-256x256.pth",
        "PixArt-XL-2-SAM-256x256.pth",
        "PixArt-XL-2-512x512.pth",
        "PixArt-XL-2-1024-MS.pth",
        "PixArt-Sigma-XL-2-256x256.pth",
        "PixArt-Sigma-XL-2-512-MS.pth",
        "PixArt-Sigma-XL-2-1024-MS.pth",
        "PixArt-Sigma-XL-2-2K-MS.pth",
    ]:
        # These checkpoints store the weights under the "state_dict" key.
        ckpt = ckpt["state_dict"]
        ckpt["x_embedder.proj.weight"] = ckpt["x_embedder.proj.weight"].unsqueeze(2)
        if "pos_embed" in ckpt:
            del ckpt["pos_embed"]
    if name in [
        "PixArt-1B-2.pth",
    ]:
        ckpt = ckpt["state_dict"]
        if "pos_embed" in ckpt:
            del ckpt["pos_embed"]

    # Positional embeddings are dropped from every checkpoint, named or not.
    if "pos_embed_temporal" in ckpt:
        del ckpt["pos_embed_temporal"]
    if "pos_embed" in ckpt:
        del ckpt["pos_embed"]
    # Different text length: resize the y_embedding rows to match the model.
    if "y_embedder.y_embedding" in ckpt:
        if ckpt["y_embedder.y_embedding"].shape[0] < model.y_embedder.y_embedding.shape[0]:
            get_logger().info(
                "Extend y_embedding from %s to %s",
                ckpt["y_embedder.y_embedding"].shape[0],
                model.y_embedder.y_embedding.shape[0],
            )
            additional_length = model.y_embedder.y_embedding.shape[0] - ckpt["y_embedder.y_embedding"].shape[0]
            new_y_embedding = torch.zeros(additional_length, model.y_embedder.y_embedding.shape[1])
            # Fill the extra rows with copies of the checkpoint's last row.
            new_y_embedding[:] = ckpt["y_embedder.y_embedding"][-1]
            ckpt["y_embedder.y_embedding"] = torch.cat([ckpt["y_embedder.y_embedding"], new_y_embedding], dim=0)
        elif ckpt["y_embedder.y_embedding"].shape[0] > model.y_embedder.y_embedding.shape[0]:
            get_logger().info(
                "Shrink y_embedding from %s to %s",
                ckpt["y_embedder.y_embedding"].shape[0],
                model.y_embedder.y_embedding.shape[0],
            )
            # Keep only the leading rows the model has room for.
            ckpt["y_embedder.y_embedding"] = ckpt["y_embedder.y_embedding"][: model.y_embedder.y_embedding.shape[0]]
    # stdit3 special case: rename "blocks." keys to "spatial_blocks.".
    if type(model).__name__ == "STDiT3" and "PixArt-Sigma" in name:
        # Snapshot the keys first because the dict is mutated inside the loop.
        ckpt_keys = list(ckpt.keys())
        for key in ckpt_keys:
            if "blocks." in key:
                ckpt[key.replace("blocks.", "spatial_blocks.")] = ckpt[key]
                del ckpt[key]

    return ckpt
def remove_padding(tensor: torch.Tensor, original_shape: Tuple) -> torch.Tensor:
    """
    Trim trailing padding from a tensor.

    :param tensor: the padded tensor; it is sliced along its first dimension.
    :param original_shape: the unpadded shape; the product of its entries is
                           the number of leading elements to keep.
    :return: the leading slice of `tensor` with that many elements.
    """
    element_count = functools.reduce(operator.mul, original_shape)
    return tensor[:element_count]
def load_checkpoint(model, ckpt_path, save_as_pt=False, model_name="model.safetensors", strict=False):
    """Load weights into ``model`` from a checkpoint file or directory.

    The loader is selected from ``ckpt_path``:

    * ``*.pt`` / ``*.pth``: state dict obtained through ``find_model``.
    * ``*.safetensors``: state dict read with ``safetensors.torch.load_file``.
    * a directory: sharded checkpoint loaded through
      ``load_from_sharded_state_dict``; optionally re-saved as a single
      ``.pt`` file inside that directory.

    Args:
        model: Module whose ``load_state_dict`` receives the weights.
        ckpt_path: Path to a checkpoint file or a checkpoint directory.
        save_as_pt: Directory checkpoints only. When true, the loaded state
            dict is written to ``<ckpt_path>/<model_name>_ckpt.pt``.
        model_name: File name of the weights inside a checkpoint directory.
        strict: Forwarded to the underlying state-dict loading call.

    Raises:
        ValueError: If ``ckpt_path`` is neither a supported file type nor an
            existing directory.
    """
    if ckpt_path.endswith((".pt", ".pth")):
        state_dict = find_model(ckpt_path, model=model)
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=strict)
        get_logger().info("Missing keys: %s", missing_keys)
        get_logger().info("Unexpected keys: %s", unexpected_keys)
    elif ckpt_path.endswith(".safetensors"):
        # Imported lazily so safetensors is only required for this format.
        from safetensors.torch import load_file

        state_dict = load_file(ckpt_path)
        # Honour the caller's ``strict`` flag and report through the logger,
        # matching the .pt/.pth branch (this branch used to hard-code
        # strict=False and print to stdout).
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=strict)
        get_logger().info("Missing keys: %s", missing_keys)
        get_logger().info("Unexpected keys: %s", unexpected_keys)
    elif os.path.isdir(ckpt_path):
        load_from_sharded_state_dict(model, ckpt_path, model_name, strict=strict)
        get_logger().info("Model checkpoint loaded from %s", ckpt_path)
        if save_as_pt:
            save_path = os.path.join(ckpt_path, model_name + "_ckpt.pt")
            torch.save(model.state_dict(), save_path)
            get_logger().info("Model checkpoint saved to %s", save_path)
    else:
        raise ValueError(f"Invalid checkpoint path: {ckpt_path}")
def parse_args(training=False):
    """Build the command-line parser and parse ``sys.argv``.

    Every optional value defaults to ``None`` (flags using ``store_true``
    default to ``False``), which lets the caller tell "not given on the
    command line" apart from an explicit value.

    Args:
        training: When false, the inference-only options are registered;
            when true, the training-only options are registered instead.

    Returns:
        argparse.Namespace: The parsed arguments.
    """
    parser = argparse.ArgumentParser()

    # model config
    parser.add_argument("config", help="model config file path")

    # ======================================================
    # General
    # ======================================================
    parser.add_argument("--seed", default=None, type=int, help="seed for reproducibility")
    parser.add_argument(
        "--ckpt-path",
        default=None,
        type=str,
        help="path to model ckpt; will overwrite cfg.model.from_pretrained if specified",
    )
    parser.add_argument("--batch-size", default=None, type=int, help="batch size")
    parser.add_argument("--outputs", default=None, type=str, help="the dir to save model weights")
    parser.add_argument("--flash-attn", default=None, type=str2bool, help="enable flash attention")
    parser.add_argument("--layernorm-kernel", default=None, type=str2bool, help="enable layernorm kernel")
    parser.add_argument("--resolution", default=None, type=str, help="multi resolution")
    parser.add_argument("--data-path", default=None, type=str, help="path to data csv")
    parser.add_argument("--dtype", default=None, type=str, help="data type")

    # ======================================================
    # Inference
    # ======================================================
    if not training:
        # output
        parser.add_argument("--save-dir", default=None, type=str, help="path to save generated samples")
        parser.add_argument("--sample-name", default=None, type=str, help="sample name, default is sample_idx")
        parser.add_argument("--start-index", default=None, type=int, help="start index for sample name")
        parser.add_argument("--end-index", default=None, type=int, help="end index for sample name")
        parser.add_argument("--num-sample", default=None, type=int, help="number of samples to generate for one prompt")
        parser.add_argument("--prompt-as-path", action="store_true", help="use prompt as path to save samples")
        parser.add_argument("--verbose", default=None, type=int, help="verbose level")

        # prompt
        parser.add_argument("--prompt-path", default=None, type=str, help="path to prompt txt file")
        parser.add_argument("--prompt", default=None, type=str, nargs="+", help="prompt list")
        parser.add_argument("--llm-refine", default=None, type=str2bool, help="enable LLM refine")
        parser.add_argument("--prompt-generator", default=None, type=str, help="prompt generator")

        # image/video
        parser.add_argument("--num-frames", default=None, type=str, help="number of frames")
        parser.add_argument("--fps", default=None, type=int, help="fps")
        parser.add_argument("--save-fps", default=None, type=int, help="save fps")
        parser.add_argument("--image-size", default=None, type=int, nargs=2, help="image size")
        parser.add_argument("--frame-interval", default=None, type=int, help="frame interval")
        parser.add_argument("--aspect-ratio", default=None, type=str, help="aspect ratio (h:w)")
        parser.add_argument("--watermark", default=None, type=str2bool, help="watermark video")

        # hyperparameters
        parser.add_argument("--num-sampling-steps", default=None, type=int, help="sampling steps")
        parser.add_argument("--cfg-scale", default=None, type=float, help="balance between cond & uncond")

        # reference
        parser.add_argument("--loop", default=None, type=int, help="loop")
        parser.add_argument("--condition-frame-length", default=None, type=int, help="condition frame length")
        parser.add_argument("--reference-path", default=None, type=str, nargs="+", help="reference path")
        parser.add_argument("--mask-strategy", default=None, type=str, nargs="+", help="mask strategy")
        parser.add_argument("--aes", default=None, type=float, help="aesthetic score")
        parser.add_argument("--flow", default=None, type=float, help="flow score")
        parser.add_argument("--camera-motion", default=None, type=str, help="camera motion")
    # ======================================================
    # Training
    # ======================================================
    else:
        parser.add_argument("--lr", default=None, type=float, help="learning rate")
        # str2bool, not bool: argparse applies the type to the raw string and
        # bool("False") is True, so `--wandb False` would enable wandb.
        parser.add_argument("--wandb", default=None, type=str2bool, help="enable wandb")
        parser.add_argument("--load", default=None, type=str, help="path to continue training")
        parser.add_argument("--start-from-scratch", action="store_true", help="start training from scratch")
        parser.add_argument("--warmup-steps", default=None, type=int, help="warmup steps")
        parser.add_argument("--record-time", default=False, action="store_true", help="record time of each part")

    return parser.parse_args()
""" # Make outputs folder (holds all experiment subfolders) os.makedirs(cfg.outputs, exist_ok=True) experiment_index = len(glob(f"{cfg.outputs}/*")) if get_last_workspace: experiment_index -= 1 # Create an experiment folder model_name = cfg.model["type"].replace("/", "-") exp_name = f"{experiment_index:03d}-{model_name}" exp_dir = f"{cfg.outputs}/{exp_name}" return exp_name, exp_dir def save_training_config(cfg, experiment_dir): with open(f"{experiment_dir}/config.txt", "w") as f: json.dump(cfg, f, indent=4) def str2bool(v): if isinstance(v, bool): return v if v.lower() in ("yes", "true", "t", "y", "1"): return True elif v.lower() in ("no", "false", "f", "n", "0"): return False else: raise argparse.ArgumentTypeError("Boolean value expected.") ================================================ FILE: Open-Sora/opensora/utils/inference_utils.py ================================================ import json import os import re import torch from opensora.datasets import IMG_FPS from opensora.datasets.utils import read_from_path def prepare_multi_resolution_info(info_type, batch_size, image_size, num_frames, fps, device, dtype): if info_type is None: return dict() elif info_type == "PixArtMS": hw = torch.tensor([image_size], device=device, dtype=dtype).repeat(batch_size, 1) ar = torch.tensor([[image_size[0] / image_size[1]]], device=device, dtype=dtype).repeat(batch_size, 1) return dict(ar=ar, hw=hw) elif info_type in ["STDiT2", "OpenSora"]: fps = fps if num_frames > 1 else IMG_FPS fps = torch.tensor([fps], device=device, dtype=dtype).repeat(batch_size) height = torch.tensor([image_size[0]], device=device, dtype=dtype).repeat(batch_size) width = torch.tensor([image_size[1]], device=device, dtype=dtype).repeat(batch_size) num_frames = torch.tensor([num_frames], device=device, dtype=dtype).repeat(batch_size) ar = torch.tensor([image_size[0] / image_size[1]], device=device, dtype=dtype).repeat(batch_size) return dict(height=height, width=width, num_frames=num_frames, ar=ar, 
def get_save_path_name(
    save_dir,
    sample_name=None,  # prefix
    sample_idx=None,  # sample index
    prompt=None,  # used prompt
    prompt_as_path=False,  # use prompt as path
    num_sample=1,  # number of samples to generate for one prompt
    k=None,  # kth sample
):
    """Compose the output path (without extension) for one generated sample.

    The file stem is ``<prefix><suffix>``. The suffix is the prompt itself
    when ``prompt_as_path`` is set, otherwise ``_<sample_idx>`` zero-padded
    to four digits. The prefix is ``sample_name``; when that is ``None`` it
    falls back to an empty string (prompt mode) or ``"sample"`` (index mode).
    When several samples are produced per prompt, ``-<k>`` is appended.
    """
    if prompt_as_path:
        fallback_prefix = ""
        suffix = prompt
    else:
        fallback_prefix = "sample"
        suffix = f"_{sample_idx:04d}"

    prefix = fallback_prefix if sample_name is None else sample_name
    target = os.path.join(save_dir, f"{prefix}{suffix}")
    return target if num_sample == 1 else f"{target}-{k}"
def extract_prompts_loop(prompts, num_loop):
    """Pick, for every prompt, the text that applies to loop ``num_loop``.

    A prompt starting with ``|0|`` is a schedule of the form
    ``|start_0|text_0|start_1|text_1|...``: each text is active from its own
    start index up to (but excluding) the next start index, and the last text
    stays active through ``num_loop``. Any other prompt is returned unchanged.
    """

    def _text_for_loop(raw):
        if not raw.startswith("|0|"):
            return raw

        # Drop the empty piece before the first "|"; what remains alternates
        # between a start index and its text.
        fields = raw.split("|")[1:]
        total = len(fields)

        timeline = []
        pos = 0
        while pos < total:
            begin = int(fields[pos])
            segment = fields[pos + 1]
            upcoming = pos + 2
            end = int(fields[upcoming]) if upcoming < total else num_loop + 1
            # One timeline entry per loop index covered by this segment.
            timeline += [segment] * (end - begin)
            pos = upcoming
        return timeline[num_loop]

    return [_text_for_loop(item) for item in prompts]
def merge_prompt(text_list, loop_idx_list=None):
    """Join prompt texts back into a single prompt string.

    Without loop indices, the first text is returned as is. With loop
    indices, each text is emitted as ``|<loop index>|<text>`` and the pieces
    are concatenated in order.
    """
    if loop_idx_list is None:
        return text_list[0]

    pieces = (f"|{loop_idx_list[pos]}|{text}" for pos, text in enumerate(text_list))
    return "".join(pieces)
def append_generated(vae, generated_video, refs_x, mask_strategy, loop_i, condition_frame_length, condition_frame_edit):
    """Register the freshly generated clip as a reference for the next loop.

    The clip is encoded with ``vae.encode`` and, per batch item ``j``, the
    encoding is appended to ``refs_x[j]``. A mask-strategy entry pointing at
    that new reference is then appended to ``mask_strategy[j]`` in the
    comma-separated form
    ``loop_i,ref_index,-condition_frame_length,0,condition_frame_length,condition_frame_edit``,
    with entries separated by ``;``.

    Both ``refs_x`` and ``mask_strategy`` are modified in place and also
    returned.

    Args:
        vae: Object exposing ``encode``; the result is indexed per batch item.
        generated_video: Input passed to ``vae.encode``.
        refs_x: Per-item list of reference encodings; an item may be ``None``.
        mask_strategy: Per-item mask strategy string; ``None`` or ``""`` means
            no strategy yet.
        loop_i: Loop index written into the new entry.
        condition_frame_length: Number of conditioning frames for the entry.
        condition_frame_edit: Edit ratio written into the new entry.

    Returns:
        Tuple ``(refs_x, mask_strategy)``.
    """
    ref_x = vae.encode(generated_video)
    for j, refs in enumerate(refs_x):
        if refs is None:
            # Rebind the local as well: it is used below to compute the index
            # of the new reference, and len(None) would raise TypeError.
            refs = [ref_x[j]]
            refs_x[j] = refs
        else:
            refs.append(ref_x[j])

        if mask_strategy[j] is None or mask_strategy[j] == "":
            mask_strategy[j] = ""
        else:
            mask_strategy[j] += ";"
        mask_strategy[
            j
        ] += f"{loop_i},{len(refs)-1},-{condition_frame_length},0,{condition_frame_length},{condition_frame_edit}"
    return refs_x, mask_strategy
""" def get_openai_response(sys_prompt, usr_prompt, model="gpt-4o"): global OPENAI_CLIENT if OPENAI_CLIENT is None: from openai import OpenAI OPENAI_CLIENT = OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) completion = OPENAI_CLIENT.chat.completions.create( model=model, messages=[ { "role": "system", "content": sys_prompt, }, # <-- This is the system message that provides context to the model { "role": "user", "content": usr_prompt, }, # <-- This is the user message for which the model will generate a response ], ) return completion.choices[0].message.content def get_random_prompt_by_openai(): global RANDOM_PROMPTS if RANDOM_PROMPTS is None: examples = load_prompts(REFINE_PROMPTS_PATH) RANDOM_PROMPTS = RANDOM_PROMPTS_TEMPLATE.format("\n".join(examples)) response = get_openai_response(RANDOM_PROMPTS, "Generate one example.") return response def refine_prompt_by_openai(prompt): global REFINE_PROMPTS if REFINE_PROMPTS is None: examples = load_prompts(REFINE_PROMPTS_PATH) REFINE_PROMPTS = REFINE_PROMPTS_TEMPLATE.format("\n".join(examples)) response = get_openai_response(REFINE_PROMPTS, prompt) return response def has_openai_key(): return "OPENAI_API_KEY" in os.environ def refine_prompts_by_openai(prompts): new_prompts = [] for prompt in prompts: try: if prompt.strip() == "": new_prompt = get_random_prompt_by_openai() print(f"[Info] Empty prompt detected, generate random prompt: {new_prompt}") else: new_prompt = refine_prompt_by_openai(prompt) print(f"[Info] Refine prompt: {prompt} -> {new_prompt}") new_prompts.append(new_prompt) except Exception as e: print(f"[Warning] Failed to refine prompt: {prompt} due to {e}") new_prompts.append(prompt) return new_prompts def add_watermark( input_video_path, watermark_image_path="./assets/images/watermark/watermark.png", output_video_path=None ): # execute this command in terminal with subprocess # return if the process is successful if output_video_path is None: output_video_path = input_video_path.replace(".mp4", 
"_watermark.mp4") cmd = f'ffmpeg -y -i {input_video_path} -i {watermark_image_path} -filter_complex "[1][0]scale2ref=oh*mdar:ih*0.1[logo][video];[video][logo]overlay" {output_video_path}' exit_code = os.system(cmd) is_success = exit_code == 0 return is_success ================================================ FILE: Open-Sora/opensora/utils/lr_scheduler.py ================================================ from torch.optim.lr_scheduler import _LRScheduler class LinearWarmupLR(_LRScheduler): """Linearly warmup learning rate and then linearly decay. Args: optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer. warmup_steps (int, optional): Number of warmup steps, defaults to 0 last_step (int, optional): The index of last step, defaults to -1. When last_step=-1, the schedule is started from the beginning or When last_step=-1, sets initial lr as lr. """ def __init__(self, optimizer, warmup_steps: int = 0, last_epoch: int = -1): self.warmup_steps = warmup_steps super().__init__(optimizer, last_epoch=last_epoch) def get_lr(self): if self.last_epoch < self.warmup_steps: return [(self.last_epoch + 1) / (self.warmup_steps + 1) * lr for lr in self.base_lrs] else: return self.base_lrs ================================================ FILE: Open-Sora/opensora/utils/misc.py ================================================ import collections import importlib import logging import os import time from collections import OrderedDict from collections.abc import Sequence from itertools import repeat from typing import Optional, Tuple import numpy as np import torch import torch.distributed as dist from colossalai.cluster.dist_coordinator import DistCoordinator # ====================================================== # Logging # ====================================================== def is_distributed(): return os.environ.get("WORLD_SIZE", None) is not None def is_main_process(): return not is_distributed() or dist.get_rank() == 0 def get_world_size(): if is_distributed(): return 
def format_numel_str(numel: int) -> str:
    """Render a count with a ``B``/``M``/``K`` suffix.

    Units are powers of 1024. The largest unit not exceeding ``numel`` is
    used with two decimals; counts below 1024 are printed as they are.
    """
    units = ((1024**3, "B"), (1024**2, "M"), (1024, "K"))
    for threshold, suffix in units:
        if numel >= threshold:
            return f"{numel / threshold:.2f} {suffix}"
    return f"{numel}"
def get_model_numel(model: torch.nn.Module) -> Tuple[int, int]:
    """Count the parameters of ``model``.

    Returns:
        ``(total, trainable)``: the number of elements over all parameters,
        and over only those with ``requires_grad`` set.
    """
    sizes = [(weight.numel(), weight.requires_grad) for weight in model.parameters()]
    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, needs_grad in sizes if needs_grad)
    return total, trainable
""" if isinstance(data, torch.Tensor): return data elif isinstance(data, np.ndarray): return torch.from_numpy(data) elif isinstance(data, Sequence) and not isinstance(data, str): return torch.tensor(data) elif isinstance(data, int): return torch.LongTensor([data]) elif isinstance(data, float): return torch.FloatTensor([data]) else: raise TypeError(f"type {type(data)} cannot be converted to tensor.") def to_ndarray(data): if isinstance(data, torch.Tensor): return data.numpy() elif isinstance(data, np.ndarray): return data elif isinstance(data, Sequence): return np.array(data) elif isinstance(data, int): return np.ndarray([data], dtype=int) elif isinstance(data, float): return np.array([data], dtype=float) else: raise TypeError(f"type {type(data)} cannot be converted to ndarray.") def to_torch_dtype(dtype): if isinstance(dtype, torch.dtype): return dtype elif isinstance(dtype, str): dtype_mapping = { "float64": torch.float64, "float32": torch.float32, "float16": torch.float16, "fp32": torch.float32, "fp16": torch.float16, "half": torch.float16, "bf16": torch.bfloat16, } if dtype not in dtype_mapping: raise ValueError dtype = dtype_mapping[dtype] return dtype else: raise ValueError def _ntuple(n): def parse(x): if isinstance(x, collections.abc.Iterable) and not isinstance(x, str): return x return tuple(repeat(x, n)) return parse to_1tuple = _ntuple(1) to_2tuple = _ntuple(2) to_3tuple = _ntuple(3) to_4tuple = _ntuple(4) to_ntuple = _ntuple def convert_SyncBN_to_BN2d(model_cfg): for k in model_cfg: v = model_cfg[k] if k == "norm_cfg" and v["type"] == "SyncBN": v["type"] = "BN2d" elif isinstance(v, dict): convert_SyncBN_to_BN2d(v) def get_topk(x, dim=4, k=5): x = to_tensor(x) inds = x[..., dim].topk(k)[1] return x[inds] def param_sigmoid(x, alpha): ret = 1 / (1 + (-alpha * x).exp()) return ret def inverse_param_sigmoid(x, alpha, eps=1e-5): x = x.clamp(min=0, max=1) x1 = x.clamp(min=eps) x2 = (1 - x).clamp(min=eps) return torch.log(x1 / x2) / alpha def inverse_sigmoid(x, 
def transpose(x):
    """
    Swap rows and columns of a list of lists.

    Rows are zipped together, so the result is cut to the shortest row.

    Args:
        x (list[list]): rows to transpose.

    Returns:
        list[list]: one list per column of ``x``.
    """
    return [list(column) for column in zip(*x)]
def create_colossalai_plugin(plugin, dtype, grad_clip, sp_size, reduce_bucket_size_in_m: int = 20):
    """Instantiate the ColossalAI booster plugin selected by name.

    ``"zero2"`` builds a ``LowLevelZeroPlugin`` and registers the world group
    as the data-parallel group; it requires ``sp_size == 1``.
    ``"zero2-seq"`` builds a ``ZeroSeqParallelPlugin`` and registers its
    sequence- and data-parallel groups; it requires ``sp_size > 1``.

    Raises:
        ValueError: If ``plugin`` is not one of the two supported names.
    """
    if plugin not in ("zero2", "zero2-seq"):
        raise ValueError(f"Unknown plugin {plugin}")

    # Settings common to both plugin flavours.
    shared = dict(
        stage=2,
        precision=dtype,
        initial_scale=2**16,
        max_norm=grad_clip,
        reduce_bucket_size_in_m=reduce_bucket_size_in_m,
    )

    if plugin == "zero2":
        assert sp_size == 1, "Zero2 plugin does not support sequence parallelism"
        built = LowLevelZeroPlugin(**shared)
        set_data_parallel_group(dist.group.WORLD)
        return built

    assert sp_size > 1, "Zero2-seq plugin requires sequence parallelism"
    built = ZeroSeqParallelPlugin(sp_size=sp_size, **shared)
    set_sequence_parallel_group(built.sp_group)
    set_data_parallel_group(built.dp_group)
    return built
model: torch.nn.Module, optimizer=None, decay: float = 0.9999, sharded: bool = True ) -> None: """ Step the EMA model towards the current model. """ ema_params = OrderedDict(ema_model.named_parameters()) model_params = OrderedDict(model.named_parameters()) for name, param in model_params.items(): if name == "pos_embed": continue if not param.requires_grad: continue if not sharded: param_data = param.data ema_params[name].mul_(decay).add_(param_data, alpha=1 - decay) else: if param.data.dtype != torch.float32: param_id = id(param) master_param = optimizer._param_store.working_to_master_param[param_id] param_data = master_param.data else: param_data = param.data ema_params[name].mul_(decay).add_(param_data, alpha=1 - decay) class MaskGenerator: def __init__(self, mask_ratios): valid_mask_names = [ "identity", "quarter_random", "quarter_head", "quarter_tail", "quarter_head_tail", "image_random", "image_head", "image_tail", "image_head_tail", "random", "intepolate", ] assert all( mask_name in valid_mask_names for mask_name in mask_ratios.keys() ), f"mask_name should be one of {valid_mask_names}, got {mask_ratios.keys()}" assert all( mask_ratio >= 0 for mask_ratio in mask_ratios.values() ), f"mask_ratio should be greater than or equal to 0, got {mask_ratios.values()}" assert all( mask_ratio <= 1 for mask_ratio in mask_ratios.values() ), f"mask_ratio should be less than or equal to 1, got {mask_ratios.values()}" # sum of mask_ratios should be 1 if "identity" not in mask_ratios: mask_ratios["identity"] = 1.0 - sum(mask_ratios.values()) assert math.isclose( sum(mask_ratios.values()), 1.0, abs_tol=1e-6 ), f"sum of mask_ratios should be 1, got {sum(mask_ratios.values())}" get_logger().info("mask ratios: %s", mask_ratios) self.mask_ratios = mask_ratios def get_mask(self, x): mask_type = random.random() mask_name = None prob_acc = 0.0 for mask, mask_ratio in self.mask_ratios.items(): prob_acc += mask_ratio if mask_type < prob_acc: mask_name = mask break num_frames = x.shape[2] 
# Hardcoded condition_frames condition_frames_max = num_frames // 4 mask = torch.ones(num_frames, dtype=torch.bool, device=x.device) if num_frames <= 1: return mask if mask_name == "quarter_random": random_size = random.randint(1, condition_frames_max) random_pos = random.randint(0, x.shape[2] - random_size) mask[random_pos : random_pos + random_size] = 0 elif mask_name == "image_random": random_size = 1 random_pos = random.randint(0, x.shape[2] - random_size) mask[random_pos : random_pos + random_size] = 0 elif mask_name == "quarter_head": random_size = random.randint(1, condition_frames_max) mask[:random_size] = 0 elif mask_name == "image_head": random_size = 1 mask[:random_size] = 0 elif mask_name == "quarter_tail": random_size = random.randint(1, condition_frames_max) mask[-random_size:] = 0 elif mask_name == "image_tail": random_size = 1 mask[-random_size:] = 0 elif mask_name == "quarter_head_tail": random_size = random.randint(1, condition_frames_max) mask[:random_size] = 0 mask[-random_size:] = 0 elif mask_name == "image_head_tail": random_size = 1 mask[:random_size] = 0 mask[-random_size:] = 0 elif mask_name == "intepolate": random_start = random.randint(0, 1) mask[random_start::2] = 0 elif mask_name == "random": mask_ratio = random.uniform(0.1, 0.9) mask = torch.rand(num_frames, device=x.device) > mask_ratio # if mask is all False, set the last frame to True if not mask.any(): mask[-1] = 1 return mask def get_masks(self, x): masks = [] for _ in range(len(x)): mask = self.get_mask(x) masks.append(mask) masks = torch.stack(masks, dim=0) return masks ================================================ FILE: Open-Sora/opensora.egg-info/PKG-INFO ================================================ Metadata-Version: 2.1 Name: opensora Version: 1.2.0 Summary: Democratizing Efficient Video Production for All Home-page: https://github.com/hpcaitech/Open-Sora License: Apache Software License 2.0 Project-URL: Bug Tracker, https://github.com/hpcaitech/Open-Sora/issues 
Project-URL: Examples, https://hpcaitech.github.io/Open-Sora/ Project-URL: Documentation, https://github.com/hpcaitech/Open-Sora?tab=readme-ov-file Project-URL: Github, https://github.com/hpcaitech/Open-Sora Classifier: Programming Language :: Python :: 3 Classifier: License :: OSI Approved :: Apache Software License Classifier: Environment :: GPU :: NVIDIA CUDA Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence Classifier: Topic :: System :: Distributed Computing Requires-Python: >=3.6 Description-Content-Type: text/markdown License-File: LICENSE Requires-Dist: colossalai>=0.4.0 Requires-Dist: mmengine>=0.10.3 Requires-Dist: pandas>=2.0.3 Requires-Dist: timm==0.9.16 Requires-Dist: rotary_embedding_torch==0.5.3 Requires-Dist: ftfy>=6.2.0 Requires-Dist: diffusers==0.27.2 Requires-Dist: accelerate==0.29.2 Requires-Dist: av>=12.0.0 Requires-Dist: numpy<2.0.0 Requires-Dist: gradio>=4.26.0 Requires-Dist: spaces>=0.28.3 Requires-Dist: ipykernel>=6.29.4 Requires-Dist: ipywidgets>=8.1.2 Requires-Dist: wandb>=0.17.0 Requires-Dist: tensorboard>=2.14.0 Requires-Dist: pandarallel>=1.6.5 Requires-Dist: pyarrow>=16.1.0 Requires-Dist: pre-commit>=3.5.0 Requires-Dist: openai Provides-Extra: data Requires-Dist: gdown>=5.2.0; extra == "data" Requires-Dist: ninja>=1.11.1.1; extra == "data" Requires-Dist: shortuuid>=1.0.13; extra == "data" Requires-Dist: markdown2[all]; extra == "data" Requires-Dist: scikit-learn>=1.4.2; extra == "data" Requires-Dist: einops-exts>=0.0.4; extra == "data" Requires-Dist: decord==0.6.0; extra == "data" Requires-Dist: ptvsd==4.3.2; extra == "data" Requires-Dist: imageio-ffmpeg>=0.4.9; extra == "data" Requires-Dist: ffmpeg-python==0.2.0; extra == "data" Requires-Dist: lingua-language-detector==2.0.2; extra == "data" Requires-Dist: imageio>=2.34.1; extra == "data" Requires-Dist: setuptools==68.2.2; extra == "data" Requires-Dist: clip@ git+https://github.com/openai/CLIP.git ; extra == "data" Requires-Dist: mmcv==2.1.0; extra == "data" 
Requires-Dist: mmdet==3.1.0; extra == "data" Requires-Dist: mmocr==1.0.1; extra == "data" Requires-Dist: detectron2@ git+https://github.com/facebookresearch/detectron2.git@ff53992 ; extra == "data" Provides-Extra: eval Requires-Dist: detectron2@ git+https://github.com/facebookresearch/detectron2.git@ff53992 ; extra == "eval" Requires-Dist: imageio>=2.34.1; extra == "eval" Requires-Dist: pyiqa==0.1.10; extra == "eval" Requires-Dist: scikit-learn>=1.4.2; extra == "eval" Requires-Dist: scikit-image>=0.20.0; extra == "eval" Requires-Dist: lvis==0.5.3; extra == "eval" Requires-Dist: boto3>=1.34.113; extra == "eval" Requires-Dist: easydict>=1.9; extra == "eval" Requires-Dist: fairscale>=0.4.13; extra == "eval" Requires-Dist: decord==0.6.0; extra == "eval" Requires-Dist: pytorchvideo==0.1.5; extra == "eval" Requires-Dist: lpips==0.1.4; extra == "eval" Provides-Extra: vae Requires-Dist: beartype==0.18.5; extra == "vae" Requires-Dist: einops==0.8.0; extra == "vae" Requires-Dist: einops-exts==0.0.4; extra == "vae" Requires-Dist: opencv-python==4.9.0.80; extra == "vae" Requires-Dist: pillow==10.3.0; extra == "vae" Provides-Extra: full Requires-Dist: gdown>=5.2.0; extra == "full" Requires-Dist: ninja>=1.11.1.1; extra == "full" Requires-Dist: shortuuid>=1.0.13; extra == "full" Requires-Dist: markdown2[all]; extra == "full" Requires-Dist: scikit-learn>=1.4.2; extra == "full" Requires-Dist: einops-exts>=0.0.4; extra == "full" Requires-Dist: decord==0.6.0; extra == "full" Requires-Dist: ptvsd==4.3.2; extra == "full" Requires-Dist: imageio-ffmpeg>=0.4.9; extra == "full" Requires-Dist: ffmpeg-python==0.2.0; extra == "full" Requires-Dist: lingua-language-detector==2.0.2; extra == "full" Requires-Dist: imageio>=2.34.1; extra == "full" Requires-Dist: setuptools==68.2.2; extra == "full" Requires-Dist: clip@ git+https://github.com/openai/CLIP.git ; extra == "full" Requires-Dist: mmcv==2.1.0; extra == "full" Requires-Dist: mmdet==3.1.0; extra == "full" Requires-Dist: mmocr==1.0.1; extra 
== "full" Requires-Dist: detectron2@ git+https://github.com/facebookresearch/detectron2.git@ff53992 ; extra == "full" Requires-Dist: detectron2@ git+https://github.com/facebookresearch/detectron2.git@ff53992 ; extra == "full" Requires-Dist: imageio>=2.34.1; extra == "full" Requires-Dist: pyiqa==0.1.10; extra == "full" Requires-Dist: scikit-learn>=1.4.2; extra == "full" Requires-Dist: scikit-image>=0.20.0; extra == "full" Requires-Dist: lvis==0.5.3; extra == "full" Requires-Dist: boto3>=1.34.113; extra == "full" Requires-Dist: easydict>=1.9; extra == "full" Requires-Dist: fairscale>=0.4.13; extra == "full" Requires-Dist: decord==0.6.0; extra == "full" Requires-Dist: pytorchvideo==0.1.5; extra == "full" Requires-Dist: lpips==0.1.4; extra == "full"

## Open-Sora: Democratizing Efficient Video Production for All We design and implement **Open-Sora**, an initiative dedicated to **efficiently** producing high-quality video. We hope to make the model, tools and all details accessible to all. By embracing **open-source** principles, Open-Sora not only democratizes access to advanced video generation techniques, but also offers a streamlined and user-friendly platform that simplifies the complexities of video generation. With Open-Sora, our goal is to foster innovation, creativity, and inclusivity within the field of content creation. [[中文文档](/docs/zh_CN/README.md)] [[潞晨云](https://cloud.luchentech.com/)|[OpenSora镜像](https://cloud.luchentech.com/doc/docs/image/open-sora/)|[视频教程](https://www.bilibili.com/video/BV1ow4m1e7PX/?vd_source=c6b752764cd36ff0e535a768e35d98d2)] ## 📰 News - **[2024.06.17]** 🔥 We released **Open-Sora 1.2**, which includes **3D-VAE**, **rectified flow**, and **score condition**. The video quality is greatly improved. [[checkpoints]](#open-sora-10-model-weights) [[report]](/docs/report_03.md) [[blog]](https://hpc-ai.com/blog/open-sora-from-hpc-ai-tech-team-continues-open-source-generate-any-16-second-720p-hd-video-with-one-click-model-weights-ready-to-use) - **[2024.04.25]** 🤗 We released the [Gradio demo for Open-Sora](https://huggingface.co/spaces/hpcai-tech/open-sora) on Hugging Face Spaces. - **[2024.04.25]** We released **Open-Sora 1.1**, which supports **2s~15s, 144p to 720p, any aspect ratio** text-to-image, **text-to-video, image-to-video, video-to-video, infinite time** generation. In addition, a full video processing pipeline is released. [[checkpoints]]() [[report]](/docs/report_02.md) - **[2024.03.18]** We released **Open-Sora 1.0**, a fully open-source project for video generation. Open-Sora 1.0 supports a full pipeline of video data preprocessing, training with acceleration, inference, and more. Our model can produce 2s 512x512 videos with only 3 days training. 
[[checkpoints]](#open-sora-10-model-weights) [[blog]](https://hpc-ai.com/blog/open-sora-v1.0) [[report]](/docs/report_01.md) - **[2024.03.04]** Open-Sora provides training with 46% cost reduction. [[blog]](https://hpc-ai.com/blog/open-sora) ## 🎥 Latest Demo 🔥 You can experience Open-Sora on our [🤗 Gradio application on Hugging Face](https://huggingface.co/spaces/hpcai-tech/open-sora). More samples and corresponding prompts are available in our [Gallery](https://hpcaitech.github.io/Open-Sora/). | **4s 720×1280** | **4s 720×1280** | **4s 720×1280** | | ---------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- | | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/7895aab6-ed23-488c-8486-091480c26327) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/20f07c7b-182b-4562-bbee-f1df74c86c9a) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/3d897e0d-dc21-453a-b911-b3bda838acc2) | | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/644bf938-96ce-44aa-b797-b3c0b513d64c) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/272d88ac-4b4a-484d-a665-8d07431671d0) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/ebbac621-c34e-4bb4-9543-1c34f8989764) | | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/a1e3a1a3-4abd-45f5-8df2-6cced69da4ca) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/d6ce9c13-28e1-4dff-9644-cc01f5f11926) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/561978f8-f1b0-4f4d-ae7b-45bec9001b4a) |
OpenSora 1.1 Demo | **2s 240×426** | **2s 240×426** | | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/f7ce4aaa-528f-40a8-be7a-72e61eaacbbd) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/5d58d71e-1fda-4d90-9ad3-5f2f7b75c6a9) | | **2s 426×240** | **4s 480×854** | | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/34ecb4a0-4eef-4286-ad4c-8e3a87e5a9fd) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c1619333-25d7-42ba-a91c-18dbc1870b18) | | **16s 320×320** | **16s 224×448** | **2s 426×240** | | ------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------- | | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/3cab536e-9b43-4b33-8da8-a0f9cf842ff2) | 
[](https://github.com/hpcaitech/Open-Sora/assets/99191637/9fb0b9e0-c6f4-4935-b29e-4cac10b373c4) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/3e892ad2-9543-4049-b005-643a4c1bf3bf) |
OpenSora 1.0 Demo | **2s 512×512** | **2s 512×512** | **2s 512×512** | | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- | | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/de1963d3-b43b-4e68-a670-bb821ebb6f80) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/13f8338f-3d42-4b71-8142-d234fbd746cc) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/fa6a65a6-e32a-4d64-9a9e-eabb0ebb8c16) | | A serene night scene in a forested area. [...] The video is a time-lapse, capturing the transition from day to night, with the lake and forest serving as a constant backdrop. | A soaring drone footage captures the majestic beauty of a coastal cliff, [...] The water gently laps at the rock base and the greenery that clings to the top of the cliff. | The majestic beauty of a waterfall cascading down a cliff into a serene lake. [...] The camera angle provides a bird's eye view of the waterfall. | | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/64232f84-1b36-4750-a6c0-3e610fa9aa94) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/983a1965-a374-41a7-a76b-c07941a6c1e9) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/ec10c879-9767-4c31-865f-2e8d6cf11e65) | | A bustling city street at night, filled with the glow of car headlights and the ambient light of streetlights. [...] | The vibrant beauty of a sunflower field. The sunflowers are arranged in neat rows, creating a sense of order and symmetry. [...] | A serene underwater scene featuring a sea turtle swimming through a coral reef. 
The turtle, with its greenish-brown shell [...] | Videos are downsampled to `.gif` for display. Click for original videos. Prompts are trimmed for display, see [here](/assets/texts/t2v_samples.txt) for full prompts.
## 🔆 New Features/Updates - 📍 **Open-Sora 1.2** released. Model weights are available [here](#model-weights). See our **[report 1.2](/docs/report_03.md)** for more details. - ✅ Support rectified flow scheduling. - ✅ Support more conditioning including fps, aesthetic score, motion strength and camera motion. - ✅ Trained our 3D-VAE for temporal dimension compression. - 📍 **Open-Sora 1.1** released. Model weights are available [here](#model-weights). It is trained on **0s~15s, 144p to 720p, various aspect ratios** videos. See our **[report 1.1](/docs/report_02.md)** for more discussions. - 🔧 **Data processing pipeline v1.1** is released. An automatic [processing pipeline](#data-processing) from raw videos to (text, video clip) pairs is provided, including scene cutting $\rightarrow$ filtering(aesthetic, optical flow, OCR, etc.) $\rightarrow$ captioning $\rightarrow$ managing. With this tool, you can easily build your video dataset.
View more - ✅ Improved ST-DiT architecture includes rope positional encoding, qk norm, longer text length, etc. - ✅ Support training with any resolution, aspect ratio, and duration (including images). - ✅ Support image and video conditioning and video editing, and thus support animating images, connecting videos, etc. - 📍 **Open-Sora 1.0** released. Model weights are available [here](#model-weights). With only 400K video clips and 200 H800 days (compared with 152M samples in Stable Video Diffusion), we are able to generate 2s 512×512 videos. See our **[report 1.0](docs/report_01.md)** for more discussions. - ✅ Three-stage training from an image diffusion model to a video diffusion model. We provide the weights for each stage. - ✅ Support training acceleration including accelerated transformer, faster T5 and VAE, and sequence parallelism. Open-Sora improves **55%** training speed when training on 64x512x512 videos. Details locates at [acceleration.md](docs/acceleration.md). - 🔧 **Data preprocessing pipeline v1.0**, including [downloading](tools/datasets/README.md), [video cutting](tools/scene_cut/README.md), and [captioning](tools/caption/README.md) tools. Our data collection plan can be found at [datasets.md](docs/datasets.md). - ✅ We find VQ-VAE from [VideoGPT](https://wilson1yan.github.io/videogpt/index.html) has a low quality and thus adopt a better VAE from [Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original). We also find patching in the time dimension deteriorates the quality. See our **[report](docs/report_01.md)** for more discussions. - ✅ We investigate different architectures including DiT, Latte, and our proposed STDiT. Our **STDiT** achieves a better trade-off between quality and speed. See our **[report](docs/report_01.md)** for more discussions. - ✅ Support clip and T5 text conditioning. - ✅ By viewing images as one-frame videos, our project supports training DiT on both images and videos (e.g., ImageNet & UCF101). 
See [commands.md](docs/commands.md) for more instructions. - ✅ Support inference with official weights from [DiT](https://github.com/facebookresearch/DiT), [Latte](https://github.com/Vchitect/Latte), and [PixArt](https://pixart-alpha.github.io/). - ✅ Refactor the codebase. See [structure.md](docs/structure.md) to learn the project structure and how to use the config files.
### TODO list sorted by priority
View more - [x] Training Video-VAE and adapt our model to new VAE. - [x] Scaling model parameters and dataset size. - [x] Incorporate a better scheduler (rectified flow). - [x] Evaluation pipeline. - [x] Complete the data processing pipeline (including dense optical flow, aesthetics scores, text-image similarity, etc.). See [the dataset](/docs/datasets.md) for more information. - [x] Support image and video conditioning. - [x] Support variable aspect ratios, resolutions, durations.
## Contents - [Installation](#installation) - [Model Weights](#model-weights) - [Gradio Demo](#gradio-demo) - [Inference](#inference) - [Data Processing](#data-processing) - [Training](#training) - [Evaluation](#evaluation) - [VAE Training & Evaluation](#vae-training--evaluation) - [Contribution](#contribution) - [Citation](#citation) - [Acknowledgement](#acknowledgement) Other useful documents and links are listed below. - Report: each version is trained from an image base separately (not continuously trained), while a newer version will incorporate the techniques from the previous version. - [report 1.2](docs/report_03.md): rectified flow, 3d-VAE, score condition, evaluation, etc. - [report 1.1](docs/report_02.md): multi-resolution/length/aspect-ratio, image/video conditioning/editing, data preprocessing, etc. - [report 1.0](docs/report_01.md): architecture, captioning, etc. - [acceleration.md](docs/acceleration.md) - Repo structure: [structure.md](docs/structure.md) - Config file explanation: [config.md](docs/config.md) - Useful commands: [commands.md](docs/commands.md) - Data processing pipeline and dataset: [datasets.md](docs/datasets.md) - Each data processing tool's README: [dataset conventions and management](/tools/datasets/README.md), [scene cutting](/tools/scene_cut/README.md), [scoring](/tools/scoring/README.md), [caption](/tools/caption/README.md) - Evaluation: [eval/README.md](/eval/README.md) - Gallery: [gallery](https://hpcaitech.github.io/Open-Sora/) ## Installation ### Install from Source For CUDA 12.1, you can install the dependencies with the following commands. Otherwise, please refer to [Installation Documentation](docs/installation.md) for more instructions on different CUDA versions, and additional dependencies for data preprocessing, VAE, and model evaluation.
```bash # create a virtual env and activate (conda as an example) conda create -n opensora python=3.9 conda activate opensora # download the repo git clone https://github.com/hpcaitech/Open-Sora cd Open-Sora # install torch, torchvision and xformers pip install -r requirements/requirements-cu121.txt # the default installation is for inference only pip install -v . # for development mode, `pip install -v -e .` ``` (Optional, recommended for fast speed, especially for training) To enable `layernorm_kernel` and `flash_attn`, you need to install `apex` and `flash-attn` with the following commands. ```bash # install flash attention # set enable_flash_attn=False in config to disable flash attention pip install packaging ninja pip install flash-attn --no-build-isolation # install apex # set enable_layernorm_kernel=False in config to disable apex pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git ``` ### Use Docker Run the following command to build a docker image from the provided Dockerfile. ```bash docker build -t opensora . ``` Run the following command to start the docker container in interactive mode. ```bash docker run -ti --gpus all -v .:/workspace/Open-Sora opensora ``` ## Model Weights ### Open-Sora 1.2 Model Weights | Model | Model Size | Data | #iterations | Batch Size | URL | | --------- | ---------- | ---- | ----------- | ---------- | ------------------------------------------------------------- | | Diffusion | 1.1B | 30M | 70k | Dynamic | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v3) | | VAE | 384M | 3M | 1M | 8 | [:link:](https://huggingface.co/hpcai-tech/OpenSora-VAE-v1.2) | See our **[report 1.2](docs/report_03.md)** for more information. Weights will be automatically downloaded when you run the inference script.
> For users from mainland China, try `export HF_ENDPOINT=https://hf-mirror.com` to successfully download the weights. ### Open-Sora 1.1 Model Weights
View more | Resolution | Model Size | Data | #iterations | Batch Size | URL | | ------------------ | ---------- | -------------------------- | ----------- | ------------------------------------------------- | -------------------------------------------------------------------- | | mainly 144p & 240p | 700M | 10M videos + 2M images | 100k | [dynamic](/configs/opensora-v1-1/train/stage2.py) | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage2) | | 144p to 720p | 700M | 500K HQ videos + 1M images | 4k | [dynamic](/configs/opensora-v1-1/train/stage3.py) | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage3) | See our **[report 1.1](docs/report_02.md)** for more information. :warning: **LIMITATION**: This version contains known issues which we are going to fix in the next version (as we save computation resources for the next release). In addition, the video generation may fail for long duration, and high resolution will have noisy results due to this problem.
### Open-Sora 1.0 Model Weights
View more | Resolution | Model Size | Data | #iterations | Batch Size | GPU days (H800) | URL | | ---------- | ---------- | ------ | ----------- | ---------- | --------------- | --------------------------------------------------------------------------------------------- | | 16×512×512 | 700M | 20K HQ | 20k | 2×64 | 35 | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x512x512.pth) | | 16×256×256 | 700M | 20K HQ | 24k | 8×64 | 45 | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x256x256.pth) | | 16×256×256 | 700M | 366K | 80k | 8×64 | 117 | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-16x256x256.pth) | Training orders: 16x256x256 $\rightarrow$ 16x256x256 HQ $\rightarrow$ 16x512x512 HQ. Our model's weight is partially initialized from [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha). The number of parameters is 724M. More information about training can be found in our **[report](/docs/report_01.md)**. More about the dataset can be found in [datasets.md](/docs/datasets.md). HQ means high quality. :warning: **LIMITATION**: Our model is trained on a limited budget. The quality and text alignment is relatively poor. The model performs badly, especially on generating human beings and cannot follow detailed instructions. We are working on improving the quality and text alignment.
## Gradio Demo 🔥 You can experience Open-Sora on our [🤗 Gradio application](https://huggingface.co/spaces/hpcai-tech/open-sora) on Hugging Face online. ### Local Deployment If you want to deploy gradio locally, we have also provided a [Gradio application](./gradio) in this repository, you can use the following command to start an interactive web application to experience video generation with Open-Sora. ```bash pip install gradio spaces python gradio/app.py ``` This will launch a Gradio application on your localhost. If you want to know more about the Gradio application, you can refer to the [Gradio README](./gradio/README.md). To enable prompt enhancement and other language input (e.g., 中文输入), you need to set the `OPENAI_API_KEY` in the environment. Check [OpenAI's documentation](https://platform.openai.com/docs/quickstart) to get your API key. ```bash export OPENAI_API_KEY=YOUR_API_KEY ``` ### Getting Started In the Gradio application, the basic options are as follows: ![Gradio Demo](assets/readme/gradio_basic.png) The easiest way to generate a video is to input a text prompt and click the "**Generate video**" button (scroll down if you cannot find it). The generated video will be displayed in the right panel. Checking the "**Enhance prompt with GPT4o**" will use GPT-4o to refine the prompt, while "**Random Prompt**" button will generate a random prompt by GPT-4o for you. Due to OpenAI's API limits, the prompt refinement result has some randomness. Then, you can choose the **resolution**, **duration**, and **aspect ratio** of the generated video. Different resolution and video length will affect the video generation speed.
On a 80G H100 GPU, the generation speed (with `num_sampling_step=30`) and peak memory usage is: | | Image | 2s | 4s | 8s | 16s | | ---- | ------- | -------- | --------- | --------- | --------- | | 360p | 3s, 24G | 18s, 27G | 31s, 27G | 62s, 28G | 121s, 33G | | 480p | 2s, 24G | 29s, 31G | 55s, 30G | 108s, 32G | 219s, 36G | | 720p | 6s, 27G | 68s, 41G | 130s, 39G | 260s, 45G | 547s, 67G | Note that besides text to video, you can also use **image to video generation**. You can upload an image and then click the "**Generate video**" button to generate a video with the image as the first frame. Or you can fill in the text prompt and click the "**Generate image**" button to generate an image with the text prompt, and then click the "**Generate video**" button to generate a video with the image generated with the same model. ![Gradio Demo](assets/readme/gradio_option.png) Then you can specify more options, including "**Motion Strength**", "**Aesthetic**" and "**Camera Motion**". If "Enable" not checked or the choice is "none", the information is not passed to the model. Otherwise, the model will generate videos with the specified motion strength, aesthetic score, and camera motion. For the **aesthetic score**, we recommend using values higher than 6. For **motion strength**, a smaller value will lead to a smoother but less dynamic video, while a larger value will lead to a more dynamic but likely more blurry video. Thus, you can try without it and then adjust it according to the generated video. For the **camera motion**, sometimes the model cannot follow the instruction well, and we are working on improving it. You can also adjust the "**Sampling steps**", this is directly related to the generation speed as it is the number of denoising. A number smaller than 30 usually leads to a poor generation results, while a number larger than 100 usually has no significant improvement. 
The "**Seed**" is used for reproducibility, you can set it to a fixed number to generate the same video. The "**CFG Scale**" controls how much the model follows the text prompt, a smaller value will lead to a more random video, while a larger value will lead to a more text-following video (7 is recommended). For more advanced usage, you can refer to [Gradio README](./gradio/README.md#advanced-usage). ## Inference ### Open-Sora 1.2 Command Line Inference The basic command line inference is as follows: ```bash # text to video python scripts/inference.py configs/opensora-v1-2/inference/sample.py \ --num-frames 4s --resolution 720p --aspect-ratio 9:16 \ --prompt "a beautiful waterfall" ``` You can add more options to the command line to customize the generation. ```bash python scripts/inference.py configs/opensora-v1-2/inference/sample.py \ --num-frames 4s --resolution 720p --aspect-ratio 9:16 \ --num-sampling-steps 30 --flow 5 --aes 6.5 \ --prompt "a beautiful waterfall" ``` For image to video generation and other functionalities, the API is compatible with Open-Sora 1.1. See [here](docs/commands.md) for more instructions. If your installation does not contain `apex` and `flash-attn`, you need to disable them in the config file, or via the following command. ```bash python scripts/inference.py configs/opensora-v1-2/inference/sample.py \ --num-frames 4s --resolution 720p \ --layernorm-kernel False --flash-attn False \ --prompt "a beautiful waterfall" ``` ### Sequence Parallelism Inference To enable sequence parallelism, you need to use `torchrun` to run the inference script. The following command will run the inference with 2 GPUs. ```bash # text to video CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node 2 scripts/inference.py configs/opensora-v1-2/inference/sample.py \ --num-frames 4s --resolution 720p --aspect-ratio 9:16 \ --prompt "a beautiful waterfall" ``` :warning: **LIMITATION**: The sequence parallelism is not supported for gradio deployment.
For now, sequence parallelism is only supported when the dimension can be divided by the number of GPUs. Thus, it may fail in some cases. We tested 4 GPUs for 720p and 2 GPUs for 480p. ### GPT-4o Prompt Refinement We find that GPT-4o can refine the prompt and improve the quality of the generated video. With this feature, you can also use other languages (e.g., Chinese) as the prompt. To enable this feature, you need to prepare your OpenAI API key in the environment: ```bash export OPENAI_API_KEY=YOUR_API_KEY ``` Then you can run inference with `--llm-refine True` to enable the GPT-4o prompt refinement, or leave the prompt empty to get a random prompt generated by GPT-4o. ```bash python scripts/inference.py configs/opensora-v1-2/inference/sample.py \ --num-frames 4s --resolution 720p --llm-refine True ``` ### Open-Sora 1.1 Command Line Inference
View more Since Open-Sora 1.1 supports inference with dynamic input size, you can pass the input size as an argument. ```bash # text to video python scripts/inference.py configs/opensora-v1-1/inference/sample.py --prompt "A beautiful sunset over the city" --num-frames 32 --image-size 480 854 ``` If your installation does not contain `apex` and `flash-attn`, you need to disable them in the config file, or via the following command. ```bash python scripts/inference.py configs/opensora-v1-1/inference/sample.py --prompt "A beautiful sunset over the city" --num-frames 32 --image-size 480 854 --layernorm-kernel False --flash-attn False ``` See [here](docs/commands.md#inference-with-open-sora-11) for more instructions including text-to-image, image-to-video, video-to-video, and infinite time generation.
### Open-Sora 1.0 Command Line Inference
View more We have also provided an offline inference script. Run the following commands to generate samples, the required model weights will be automatically downloaded. To change sampling prompts, modify the txt file passed to `--prompt-path`. See [here](docs/structure.md#inference-config-demos) to customize the configuration. ```bash # Sample 16x512x512 (20s/sample, 100 time steps, 24 GB memory) torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x512x512.py --ckpt-path OpenSora-v1-HQ-16x512x512.pth --prompt-path ./assets/texts/t2v_samples.txt # Sample 16x256x256 (5s/sample, 100 time steps, 22 GB memory) torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path OpenSora-v1-HQ-16x256x256.pth --prompt-path ./assets/texts/t2v_samples.txt # Sample 64x512x512 (40s/sample, 100 time steps) torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/64x512x512.py --ckpt-path ./path/to/your/ckpt.pth --prompt-path ./assets/texts/t2v_samples.txt # Sample 64x512x512 with sequence parallelism (30s/sample, 100 time steps) # sequence parallelism is enabled automatically when nproc_per_node is larger than 1 torchrun --standalone --nproc_per_node 2 scripts/inference.py configs/opensora/inference/64x512x512.py --ckpt-path ./path/to/your/ckpt.pth --prompt-path ./assets/texts/t2v_samples.txt ``` The speed is tested on H800 GPUs. For inference with other models, see [here](docs/commands.md) for more instructions. To lower the memory usage, set a smaller `vae.micro_batch_size` in the config (slightly lower sampling speed).
## Data Processing High-quality data is crucial for training good generation models. To this end, we establish a complete pipeline for data processing, which can seamlessly convert raw videos to high-quality video-text pairs. The pipeline is shown below. For detailed information, please refer to [data processing](docs/data_processing.md). Also check out the [datasets](docs/datasets.md) we use. ![Data Processing Pipeline](assets/readme/report_data_pipeline.png) ## Training ### Open-Sora 1.2 Training The training process is the same as for Open-Sora 1.1. ```bash # one node torchrun --standalone --nproc_per_node 8 scripts/train.py \ configs/opensora-v1-2/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT # multiple nodes colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py \ configs/opensora-v1-2/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT ``` ### Open-Sora 1.1 Training
View more Once you prepare the data in a `csv` file, run the following commands to launch training on a single node. ```bash # one node torchrun --standalone --nproc_per_node 8 scripts/train.py \ configs/opensora-v1-1/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT # multiple nodes colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py \ configs/opensora-v1-1/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT ```
### Open-Sora 1.0 Training
View more Once you prepare the data in a `csv` file, run the following commands to launch training on a single node. ```bash # 1 GPU, 16x256x256 torchrun --nnodes=1 --nproc_per_node=1 scripts/train.py configs/opensora/train/16x256x256.py --data-path YOUR_CSV_PATH # 8 GPUs, 64x512x512 torchrun --nnodes=1 --nproc_per_node=8 scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT ``` To launch training on multiple nodes, prepare a hostfile according to [ColossalAI](https://colossalai.org/docs/basics/launch_colossalai/#launch-with-colossal-ai-cli), and run the following commands. ```bash colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT ``` For training other models and advanced usage, see [here](docs/commands.md) for more instructions.
## Evaluation We support evaluation based on: - Validation loss - [VBench](https://github.com/Vchitect/VBench/tree/master) score - VBench-i2v score - Batch generation for human evaluation All the evaluation code is released in the `eval` folder. Check the [README](/eval/README.md) for more details. Our [report](/docs/report_03.md#evaluation) also provides more information about the evaluation during training. The following table shows that Open-Sora 1.2 greatly improves on Open-Sora 1.0. | Model | Total Score | Quality Score | Semantic Score | | -------------- | ----------- | ------------- | -------------- | | Open-Sora V1.0 | 75.91% | 78.81% | 64.28% | | Open-Sora V1.2 | 79.23% | 80.71% | 73.30% | ## VAE Training & Evaluation We train a VAE pipeline that consists of a spatial VAE followed by a temporal VAE. For more details, refer to [VAE Documentation](docs/vae.md). Before you run the following commands, follow our [Installation Documentation](docs/installation.md) to install the required dependencies for VAE and Evaluation. If you want to train your own VAE, you need to prepare the data in a csv file following the [data processing](#data-processing) pipeline, and then run the following commands. Note that you need to adjust the number of training epochs (`epochs`) in the config file according to the size of your own csv data.
```bash # stage 1 training, 380k steps, 8 GPUs torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage1.py --data-path YOUR_CSV_PATH # stage 2 training, 260k steps, 8 GPUs torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage2.py --data-path YOUR_CSV_PATH # stage 3 training, 540k steps, 24 GPUs torchrun --nnodes=3 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage3.py --data-path YOUR_CSV_PATH ``` To evaluate the VAE performance, you need to run VAE inference first to generate the videos, then calculate scores on the generated videos: ```bash # video generation torchrun --standalone --nnodes=1 --nproc_per_node=1 scripts/inference_vae.py configs/vae/inference/video.py --ckpt-path YOUR_VAE_CKPT_PATH --data-path YOUR_CSV_PATH --save-dir YOUR_VIDEO_DIR # the original videos will be saved to `YOUR_VIDEO_DIR_ori` # the reconstructed videos through the pipeline will be saved to `YOUR_VIDEO_DIR_rec` # the reconstructed videos through the spatial VAE only will be saved to `YOUR_VIDEO_DIR_spatial` # score calculation python eval/vae/eval_common_metric.py --batch_size 2 --real_video_dir YOUR_VIDEO_DIR_ori --generated_video_dir YOUR_VIDEO_DIR_rec --device cuda --sample_fps 24 --crop_size 256 --resolution 256 --num_frames 17 --sample_rate 1 --metric ssim psnr lpips flolpips ``` ## Contribution Thanks goes to these wonderful contributors: If you wish to contribute to this project, please refer to the [Contribution Guideline](./CONTRIBUTING.md). ## Acknowledgement Here we only list a few of the projects. For other works and datasets, please refer to our report. - [ColossalAI](https://github.com/hpcaitech/ColossalAI): A powerful large model parallel acceleration and optimization system. - [DiT](https://github.com/facebookresearch/DiT): Scalable Diffusion Models with Transformers. - [OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT): An acceleration for DiT training. 
We adopt valuable acceleration strategies for the training process from OpenDiT. - [PixArt](https://github.com/PixArt-alpha/PixArt-alpha): An open-source DiT-based text-to-image model. - [Latte](https://github.com/Vchitect/Latte): An attempt to efficiently train DiT for video. - [StabilityAI VAE](https://huggingface.co/stabilityai/sd-vae-ft-mse-original): A powerful image VAE model. - [CLIP](https://github.com/openai/CLIP): A powerful text-image embedding model. - [T5](https://github.com/google-research/text-to-text-transfer-transformer): A powerful text encoder. - [LLaVA](https://github.com/haotian-liu/LLaVA): A powerful image captioning model based on [Mistral-7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) and [Yi-34B](https://huggingface.co/01-ai/Yi-34B). - [PLLaVA](https://github.com/magic-research/PLLaVA): A powerful video captioning model. - [MiraData](https://github.com/mira-space/MiraData): A large-scale video dataset with long durations and structured captions. We are grateful for their exceptional work and generous contributions to open source. Special thanks go to the authors of [MiraData](https://github.com/mira-space/MiraData) and [Rectified Flow](https://github.com/gnobitab/RectifiedFlow) for their valuable advice and help. We wish to express gratitude towards AK for sharing this project on social media and Hugging Face for providing free GPU resources for our online Gradio demo.
## Citation ```bibtex @software{opensora, author = {Zangwei Zheng and Xiangyu Peng and Tianji Yang and Chenhui Shen and Shenggui Li and Hongxin Liu and Yukun Zhou and Tianyi Li and Yang You}, title = {Open-Sora: Democratizing Efficient Video Production for All}, month = {March}, year = {2024}, url = {https://github.com/hpcaitech/Open-Sora} } ``` ## Star History [![Star History Chart](https://api.star-history.com/svg?repos=hpcaitech/Open-Sora&type=Date)](https://star-history.com/#hpcaitech/Open-Sora&Date) ================================================ FILE: Open-Sora/opensora.egg-info/SOURCES.txt ================================================ LICENSE README.md pyproject.toml setup.py opensora/__init__.py opensora/registry.py opensora.egg-info/PKG-INFO opensora.egg-info/SOURCES.txt opensora.egg-info/dependency_links.txt opensora.egg-info/requires.txt opensora.egg-info/top_level.txt opensora/acceleration/__init__.py opensora/acceleration/checkpoint.py opensora/acceleration/communications.py opensora/acceleration/parallel_states.py opensora/acceleration/plugin.py opensora/acceleration/shardformer/__init__.py opensora/acceleration/shardformer/modeling/__init__.py opensora/acceleration/shardformer/modeling/t5.py opensora/acceleration/shardformer/policy/__init__.py opensora/acceleration/shardformer/policy/t5_encoder.py opensora/datasets/__init__.py opensora/datasets/aspect.py opensora/datasets/bucket.py opensora/datasets/dataloader.py opensora/datasets/datasets.py opensora/datasets/read_video.py opensora/datasets/sampler.py opensora/datasets/utils.py opensora/datasets/video_transforms.py opensora/models/__init__.py opensora/models/cache_functions/__init__.py opensora/models/cache_functions/attention.py opensora/models/cache_functions/cache_cutfresh.py opensora/models/cache_functions/cache_init.py opensora/models/cache_functions/force_init.py opensora/models/cache_functions/force_scheduler.py opensora/models/cache_functions/fresh_ratio_scheduler.py 
opensora/models/cache_functions/global_force_fresh.py opensora/models/cache_functions/score_evaluate.py opensora/models/cache_functions/scores.py opensora/models/cache_functions/token_merge.py opensora/models/cache_functions/update_cache.py opensora/models/dit/__init__.py opensora/models/dit/dit.py opensora/models/latte/__init__.py opensora/models/latte/latte.py opensora/models/layers/__init__.py opensora/models/layers/blocks.py opensora/models/pixart/__init__.py opensora/models/pixart/pixart.py opensora/models/pixart/pixart_sigma.py opensora/models/stdit/__init__.py opensora/models/stdit/stdit.py opensora/models/stdit/stdit2.py opensora/models/stdit/stdit3 copy.py opensora/models/stdit/stdit3.py opensora/models/text_encoder/__init__.py opensora/models/text_encoder/classes.py opensora/models/text_encoder/clip.py opensora/models/text_encoder/t5.py opensora/models/vae/__init__.py opensora/models/vae/discriminator.py opensora/models/vae/losses.py opensora/models/vae/lpips.py opensora/models/vae/utils.py opensora/models/vae/vae.py opensora/models/vae/vae_temporal.py opensora/schedulers/__init__.py opensora/schedulers/dpms/__init__.py opensora/schedulers/dpms/dpm_solver.py opensora/schedulers/iddpm/__init__.py opensora/schedulers/iddpm/diffusion_utils.py opensora/schedulers/iddpm/gaussian_diffusion.py opensora/schedulers/iddpm/respace.py opensora/schedulers/iddpm/speed.py opensora/schedulers/iddpm/timestep_sampler.py opensora/schedulers/rf/__init__.py opensora/schedulers/rf/rectified_flow.py opensora/utils/__init__.py opensora/utils/ckpt_utils.py opensora/utils/config_utils.py opensora/utils/inference_utils.py opensora/utils/lr_scheduler.py opensora/utils/misc.py opensora/utils/train_utils.py tests/test_attn.py tests/test_lr_scheduler.py tests/test_np_torch.py tests/test_pos_emb.py tests/test_seq_parallel_attention.py tests/test_stdit3_sequence_parallelism.py tests/test_t5_shardformer.py tools/caption/__init__.py tools/caption/camera_motion_detect.py 
tools/caption/caption_gpt4.py tools/caption/caption_llama3.py tools/caption/caption_llava.py tools/caption/utils.py tools/caption/acceleration/__init__.py tools/caption/acceleration/llava/__init__.py tools/caption/acceleration/llava/policies/__init__.py tools/caption/acceleration/llava/policies/llama.py tools/caption/acceleration/llava/policies/mistral.py tools/caption/camera_motion/__init__.py tools/caption/camera_motion/camera_motion.py tools/caption/camera_motion/detect.py tools/caption/camera_motion/utils.py tools/caption/camera_motion/visualizer.py tools/datasets/__init__.py tools/datasets/analyze.py tools/datasets/convert.py tools/datasets/datautil.py tools/datasets/filter_panda10m.py tools/datasets/split.py tools/datasets/transform.py tools/datasets/utils.py tools/frame_interpolation/__init__.py tools/frame_interpolation/interpolation.py tools/frame_interpolation/networks/__init__.py tools/frame_interpolation/networks/amt_g.py tools/frame_interpolation/networks/blocks/__init__.py tools/frame_interpolation/networks/blocks/feat_enc.py tools/frame_interpolation/networks/blocks/ifrnet.py tools/frame_interpolation/networks/blocks/multi_flow.py tools/frame_interpolation/networks/blocks/raft.py tools/frame_interpolation/utils/__init__.py tools/frame_interpolation/utils/dist_utils.py tools/frame_interpolation/utils/flow_utils.py tools/frame_interpolation/utils/utils.py tools/scene_cut/__init__.py tools/scene_cut/convert_id_to_path.py tools/scene_cut/cut.py tools/scene_cut/scene_detect.py tools/scoring/__init__.py tools/scoring/aesthetic/__init__.py tools/scoring/aesthetic/inference.py tools/scoring/matching/__init__.py tools/scoring/matching/inference.py tools/scoring/ocr/__init__.py tools/scoring/ocr/dbnetpp.py tools/scoring/ocr/inference.py tools/scoring/optical_flow/__init__.py tools/scoring/optical_flow/inference.py tools/scoring/optical_flow/unimatch/__init__.py tools/scoring/optical_flow/unimatch/attention.py tools/scoring/optical_flow/unimatch/backbone.py 
tools/scoring/optical_flow/unimatch/geometry.py tools/scoring/optical_flow/unimatch/matching.py tools/scoring/optical_flow/unimatch/position.py tools/scoring/optical_flow/unimatch/reg_refine.py tools/scoring/optical_flow/unimatch/transformer.py tools/scoring/optical_flow/unimatch/trident_conv.py tools/scoring/optical_flow/unimatch/unimatch.py tools/scoring/optical_flow/unimatch/utils.py ================================================ FILE: Open-Sora/opensora.egg-info/dependency_links.txt ================================================ ================================================ FILE: Open-Sora/opensora.egg-info/requires.txt ================================================ colossalai>=0.4.0 mmengine>=0.10.3 pandas>=2.0.3 timm==0.9.16 rotary_embedding_torch==0.5.3 ftfy>=6.2.0 diffusers==0.27.2 accelerate==0.29.2 av>=12.0.0 numpy<2.0.0 gradio>=4.26.0 spaces>=0.28.3 ipykernel>=6.29.4 ipywidgets>=8.1.2 wandb>=0.17.0 tensorboard>=2.14.0 pandarallel>=1.6.5 pyarrow>=16.1.0 pre-commit>=3.5.0 openai [data] gdown>=5.2.0 ninja>=1.11.1.1 shortuuid>=1.0.13 markdown2[all] scikit-learn>=1.4.2 einops-exts>=0.0.4 decord==0.6.0 ptvsd==4.3.2 imageio-ffmpeg>=0.4.9 ffmpeg-python==0.2.0 lingua-language-detector==2.0.2 imageio>=2.34.1 setuptools==68.2.2 clip@ git+https://github.com/openai/CLIP.git mmcv==2.1.0 mmdet==3.1.0 mmocr==1.0.1 detectron2@ git+https://github.com/facebookresearch/detectron2.git@ff53992 [eval] detectron2@ git+https://github.com/facebookresearch/detectron2.git@ff53992 imageio>=2.34.1 pyiqa==0.1.10 scikit-learn>=1.4.2 scikit-image>=0.20.0 lvis==0.5.3 boto3>=1.34.113 easydict>=1.9 fairscale>=0.4.13 decord==0.6.0 pytorchvideo==0.1.5 lpips==0.1.4 [full] gdown>=5.2.0 ninja>=1.11.1.1 shortuuid>=1.0.13 markdown2[all] scikit-learn>=1.4.2 einops-exts>=0.0.4 decord==0.6.0 ptvsd==4.3.2 imageio-ffmpeg>=0.4.9 ffmpeg-python==0.2.0 lingua-language-detector==2.0.2 imageio>=2.34.1 setuptools==68.2.2 clip@ git+https://github.com/openai/CLIP.git mmcv==2.1.0 mmdet==3.1.0 
mmocr==1.0.1 detectron2@ git+https://github.com/facebookresearch/detectron2.git@ff53992 pyiqa==0.1.10 scikit-image>=0.20.0 lvis==0.5.3 boto3>=1.34.113 easydict>=1.9 fairscale>=0.4.13 pytorchvideo==0.1.5 lpips==0.1.4 [vae] beartype==0.18.5 einops==0.8.0 einops-exts==0.0.4 opencv-python==4.9.0.80 pillow==10.3.0 ================================================ FILE: Open-Sora/opensora.egg-info/top_level.txt ================================================ opensora tools ================================================ FILE: Open-Sora/pyproject.toml ================================================ [tool.autoflake] remove-unused-variables = true remove-all-unused-imports = true ignore-init-module-imports = true [tool.isort] line_length = 120 multi_line_output = 3 include_trailing_comma = true ignore_comments = true profile = "black" honor_noqa = true [tool.black] line-length = 120 target-version = ["py37", "py38", "py39", "py310"] ================================================ FILE: Open-Sora/requirements/requirements-cu121.txt ================================================ torch==2.2.2 --index-url https://download.pytorch.org/whl/cu121 torchvision==0.17.2 --index-url https://download.pytorch.org/whl/cu121 xformers==0.0.25.post1 --index-url https://download.pytorch.org/whl/cu121 ================================================ FILE: Open-Sora/requirements/requirements-data.txt ================================================ gdown>=5.2.0 # [caption llava] ninja>=1.11.1.1 shortuuid>=1.0.13 markdown2[all] scikit-learn>=1.4.2 einops-exts>=0.0.4 # [camera_motion] decord==0.6.0 ptvsd==4.3.2 imageio-ffmpeg>=0.4.9 # [datasets] ffmpeg-python==0.2.0 lingua-language-detector==2.0.2 # [frame interpolation] imageio>=2.34.1 # [aesthetic] setuptools==68.2.2 clip @ git+https://github.com/openai/CLIP.git # [ocr] mmcv==2.1.0 mmdet==3.1.0 mmocr==1.0.1 detectron2 @ git+https://github.com/facebookresearch/detectron2.git@ff53992 ================================================ FILE: 
Open-Sora/requirements/requirements-eval.txt ================================================ # [vbench] detectron2 @ git+https://github.com/facebookresearch/detectron2.git@ff53992 imageio>=2.34.1 pyiqa==0.1.10 scikit-learn>=1.4.2 scikit-image>=0.20.0 lvis==0.5.3 boto3>=1.34.113 easydict>=1.9 fairscale>=0.4.13 # [vae] decord==0.6.0 pytorchvideo==0.1.5 lpips==0.1.4 ================================================ FILE: Open-Sora/requirements/requirements-pllava.txt ================================================ absl-py==2.1.0 accelerate==0.26.1 addict==2.4.0 aiofiles==23.2.1 aliyun-python-sdk-core==2.15.0 aliyun-python-sdk-kms==2.16.2 altair==5.2.0 annotated-types==0.6.0 antlr4-python3-runtime==4.9.3 anyio==4.3.0 anykeystore==0.2 apex==0.9.10.dev0 appdirs==1.4.4 argcomplete==3.2.3 attrs==23.2.0 av==10.0.0 beautifulsoup4==4.12.3 blessed==1.20.0 blessings==1.7 boto3==1.34.63 botocore==1.34.63 Brotli==1.1.0 cachetools==5.3.3 certifi==2024.2.2 cffi==1.16.0 charset-normalizer==3.3.2 click==8.1.7 colorama==0.4.6 contourpy==1.2.0 crcmod==1.7 cryptacular==1.6.2 cryptography==42.0.5 cycler==0.12.1 dacite==1.7.0 decorator==4.4.2 decord==0.6.0 deepspeed==0.14.0 defusedxml==0.7.1 Deprecated==1.2.14 dill==0.3.8 distro==1.9.0 dnspython==2.6.1 docker-pycreds==0.4.0 einops==0.6.1 exceptiongroup==1.2.0 fastapi==0.110.0 ffmpeg==1.4 ffmpy==0.3.2 fiftyone==0.23.6 fiftyone-brain==0.16.1 fiftyone_db==1.1.2 filelock==3.9.0 fonttools==4.49.0 fsspec==2024.2.0 ftfy==6.1.3 future==1.0.0 fvcore==0.1.5.post20221221 gdown==5.1.0 gitdb==4.0.11 GitPython==3.1.42 glob2==0.7 google-auth==2.28.2 google-auth-oauthlib==1.2.0 gpustat==1.1.1 gradio==4.21.0 gradio_client==0.12.0 graphql-core==3.2.3 greenlet==3.0.3 grpcio==1.62.1 h11==0.14.0 h2==4.1.0 hjson==3.1.0 hpack==4.0.0 httpcore==1.0.4 httpx==0.27.0 huggingface-hub==0.21.4 humanize==4.9.0 hupper==1.12.1 Hypercorn==0.16.0 hyperframe==6.0.1 idna==3.6 idscheck==2.3.0 imageio==2.27.0 imageio-ffmpeg==0.4.9 importlib_metadata==7.0.2 
importlib_resources==6.3.0 inflate64==1.0.0 iopath==0.1.10 Jinja2==3.1.2 jmespath==0.10.0 joblib==1.3.2 jsonlines==4.0.0 jsonschema==4.21.1 jsonschema-specifications==2023.12.1 kaleido==0.2.1 kiwisolver==1.4.5 lazy_loader==0.3 Markdown==3.6 markdown-it-py==3.0.0 MarkupSafe==2.1.3 matplotlib==3.8.3 mdurl==0.1.2 mmcv-full==1.7.2 model-index==0.1.11 mongoengine==0.24.2 motor==3.3.2 moviepy==1.0.3 mpmath==1.3.0 multivolumefile==0.2.3 networkx==3.2.1 ninja==1.11.1.1 numpy==1.23.5 nvidia-ml-py==12.535.133 nvidia-ml-py3==7.352.0 oauthlib==3.2.2 omegaconf==2.3.0 openai==1.14.0 opencv-python==4.9.0.80 opencv-python-headless==4.9.0.80 opendatalab==0.0.10 openmim==0.3.9 openxlab==0.0.36 ordered-set==4.1.0 orjson==3.9.15 oss2==2.17.0 packaging==24.0 pandas==1.5.3 PasteDeploy==3.1.0 pathtools==0.1.2 pbkdf2==1.3 peft==0.10.0 pillow==10.2.0 plaster==1.1.2 plaster-pastedeploy==1.0.1 platformdirs==4.2.0 plotly==5.20.0 portalocker==2.8.2 pprintpp==0.4.0 priority==2.0.0 proglog==0.1.10 protobuf==4.23.4 psutil==5.9.4 py-cpuinfo==9.0.0 py7zr==0.21.0 pyasn1==0.5.1 pyasn1-modules==0.3.0 pybcj==1.0.2 pycparser==2.21 pycryptodome==3.20.0 pycryptodomex==3.20.0 pydantic==2.6.4 pydantic_core==2.16.3 pydub==0.25.1 Pygments==2.17.2 pymongo==4.6.2 pynvml==11.5.0 pyparsing==3.1.2 pyppmd==1.1.0 pyramid==2.0.2 pyramid-mailer==0.15.1 PySocks==1.7.1 python-dateutil==2.9.0.post0 python-multipart==0.0.9 python3-openid==3.2.0 pytz==2023.4 PyYAML==6.0 pyzstd==0.15.9 rarfile==4.1 referencing==0.33.0 regex==2023.12.25 repoze.sendmail==4.4.1 requests==2.28.2 requests-oauthlib==1.4.0 retrying==1.3.4 rich==13.4.2 rpds-py==0.18.0 rsa==4.9 ruff==0.3.2 s3transfer==0.10.1 safetensors==0.4.2 scikit-image==0.22.0 scikit-learn==1.4.1.post1 scipy==1.10.1 semantic-version==2.10.0 sentencepiece==0.2.0 sentry-sdk==1.42.0 setproctitle==1.3.3 shellingham==1.5.4 six==1.16.0 smmap==5.0.1 sniffio==1.3.1 sortedcontainers==2.4.0 soupsieve==2.5 SQLAlchemy==2.0.28 sse-starlette==0.10.3 sseclient-py==1.8.0 starlette==0.36.3 
strawberry-graphql==0.138.1 sympy==1.12 tabulate==0.9.0 taskgroup==0.0.0a4 tenacity==8.2.3 tensorboard==2.15.1 tensorboard-data-server==0.7.2 tensorboardX==2.6.2.2 termcolor==2.3.0 texttable==1.7.0 threadpoolctl==3.3.0 tifffile==2024.2.12 timm==0.6.12 tokenizers==0.15.2 tomli==2.0.1 tomlkit==0.12.0 toolz==0.12.1 torch==2.2.2 torchaudio torchvision==0.17.2 tqdm==4.65.2 transaction==4.0 transformers==4.37.1 translationstring==1.4 triton==2.2.0 typer==0.9.0 typing_extensions==4.8.0 tzdata==2024.1 tzlocal==5.2 universal-analytics-python3==1.1.1 urllib3==1.26.18 uvicorn==0.28.0 velruse==1.1.1 venusian==3.1.0 voxel51-eta==0.12.6 wandb==0.14.0 wcwidth==0.2.13 WebOb==1.8.7 websockets==11.0.3 Werkzeug==3.0.1 wrapt==1.16.0 wsproto==1.2.0 WTForms==3.1.2 wtforms-recaptcha==0.3.2 xmltodict==0.13.0 yacs==0.1.8 yapf==0.40.2 zipp==3.18.1 zope.deprecation==5.0 zope.interface==6.2 zope.sqlalchemy==3.1 ================================================ FILE: Open-Sora/requirements/requirements-vae.txt ================================================ beartype==0.18.5 einops==0.8.0 einops-exts==0.0.4 opencv-python==4.9.0.80 pillow==10.3.0 ================================================ FILE: Open-Sora/requirements/requirements.txt ================================================ colossalai>=0.4.0 mmengine>=0.10.3 pandas>=2.0.3 timm==0.9.16 rotary_embedding_torch==0.5.3 ftfy>=6.2.0 # for t5 diffusers==0.27.2 # for vae accelerate==0.29.2 # for t5 av>=12.0.0 # for video loading numpy<2.0.0 # [gradio] gradio>=4.26.0 spaces>=0.28.3 # [notebook] ipykernel>=6.29.4 ipywidgets>=8.1.2 # [training] wandb>=0.17.0 tensorboard>=2.14.0 pandarallel>=1.6.5 pyarrow>=16.1.0 # for parquet # [dev] pre-commit>=3.5.0 openai ================================================ FILE: Open-Sora/scripts/inference.py ================================================ import os import time from pprint import pformat import colossalai import torch import torch.distributed as dist from colossalai.cluster import 
DistCoordinator from mmengine.runner import set_random_seed from tqdm import tqdm from opensora.acceleration.parallel_states import set_sequence_parallel_group from opensora.datasets import save_sample from opensora.datasets.aspect import get_image_size, get_num_frames from opensora.models.text_encoder.t5 import text_preprocessing from opensora.registry import MODELS, SCHEDULERS, build_module from opensora.utils.config_utils import parse_configs from opensora.utils.inference_utils import ( add_watermark, append_generated, append_score_to_prompts, apply_mask_strategy, collect_references_batch, dframe_to_frame, extract_json_from_prompts, extract_prompts_loop, get_save_path_name, load_prompts, merge_prompt, prepare_multi_resolution_info, refine_prompts_by_openai, split_prompt, ) from opensora.utils.misc import all_exists, create_logger, is_distributed, is_main_process, to_torch_dtype def main(): torch.set_grad_enabled(False) # ====================================================== # configs & runtime variables # ====================================================== # == parse configs == cfg = parse_configs(training=False) # == device and dtype == device = "cuda" if torch.cuda.is_available() else "cpu" cfg_dtype = cfg.get("dtype", "fp32") assert cfg_dtype in ["fp16", "bf16", "fp32"], f"Unknown mixed precision {cfg_dtype}" dtype = to_torch_dtype(cfg.get("dtype", "bf16")) torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cudnn.allow_tf32 = True # == init distributed env == if is_distributed(): colossalai.launch_from_torch({}) coordinator = DistCoordinator() enable_sequence_parallelism = coordinator.world_size > 1 if enable_sequence_parallelism: set_sequence_parallel_group(dist.group.WORLD) else: coordinator = None enable_sequence_parallelism = False set_random_seed(seed=cfg.get("seed", 1024)) # == init logger == logger = create_logger() logger.info("Inference configuration:\n %s", pformat(cfg.to_dict())) verbose = cfg.get("verbose", 1) progress_wrap = tqdm if 
verbose == 1 else (lambda x: x) # ====================================================== # build model & load weights # ====================================================== logger.info("Building models...") # == build text-encoder and vae == text_encoder = build_module(cfg.text_encoder, MODELS, device=device) vae = build_module(cfg.vae, MODELS).to(device, dtype).eval() # == prepare video size == image_size = cfg.get("image_size", None) if image_size is None: resolution = cfg.get("resolution", None) aspect_ratio = cfg.get("aspect_ratio", None) assert ( resolution is not None and aspect_ratio is not None ), "resolution and aspect_ratio must be provided if image_size is not provided" image_size = get_image_size(resolution, aspect_ratio) num_frames = get_num_frames(cfg.num_frames) # == build diffusion model == input_size = (num_frames, *image_size) latent_size = vae.get_latent_size(input_size) model = ( build_module( cfg.model, MODELS, input_size=latent_size, in_channels=vae.out_channels, caption_channels=text_encoder.output_dim, model_max_length=text_encoder.model_max_length, enable_sequence_parallelism=enable_sequence_parallelism, ) .to(device, dtype) .eval() ) text_encoder.y_embedder = model.y_embedder # HACK: for classifier-free guidance # == build scheduler == scheduler = build_module(cfg.scheduler, SCHEDULERS) # ====================================================== # inference # ====================================================== # == load prompts == prompts = cfg.get("prompt", None) start_idx = cfg.get("start_index", 0) if prompts is None: if cfg.get("prompt_path", None) is not None: prompts = load_prompts(cfg.prompt_path, start_idx, cfg.get("end_index", None)) else: prompts = [cfg.get("prompt_generator", "")] * 1_000_000 # endless loop #print(start_idx, cfg.get("end_index", None)) # == prepare reference == reference_path = cfg.get("reference_path", [""] * len(prompts)) mask_strategy = cfg.get("mask_strategy", [""] * len(prompts)) assert 
len(reference_path) == len(prompts), "Length of reference must be the same as prompts" assert len(mask_strategy) == len(prompts), "Length of mask_strategy must be the same as prompts" # == prepare arguments == fps = cfg.fps save_fps = cfg.get("save_fps", fps // cfg.get("frame_interval", 1)) multi_resolution = cfg.get("multi_resolution", None) batch_size = cfg.get("batch_size", 1) num_sample = cfg.get("num_sample", 1) loop = cfg.get("loop", 1) condition_frame_length = cfg.get("condition_frame_length", 5) condition_frame_edit = cfg.get("condition_frame_edit", 0.0) align = cfg.get("align", None) save_dir = cfg.save_dir os.makedirs(save_dir, exist_ok=True) sample_name = cfg.get("sample_name", None) prompt_as_path = cfg.get("prompt_as_path", False) # == Iter over all samples == for i in progress_wrap(range(0, len(prompts), batch_size)): # == prepare batch prompts == batch_prompts = prompts[i : i + batch_size] ms = mask_strategy[i : i + batch_size] refs = reference_path[i : i + batch_size] # == get json from prompts == batch_prompts, refs, ms = extract_json_from_prompts(batch_prompts, refs, ms) original_batch_prompts = batch_prompts # == get reference for condition == refs = collect_references_batch(refs, vae, image_size) # == multi-resolution info == model_args = prepare_multi_resolution_info( multi_resolution, len(batch_prompts), image_size, num_frames, fps, device, dtype ) model_args['cache_type'] = 'attention' model_args['ratio_scheduler'] = 'ToCa' model_args['fresh_ratio'] = 0.1 model_args['fresh_threshold'] = 3 # Note this does not decide the force activatioin cycles, see more details in Open-Sora\opensora\models\cache_functions\force_scheduler.py model_args['force_fresh'] = 'global' model_args['soft_fresh_weight'] = 0.25 # == Iter over number of sampling for one prompt == for k in range(num_sample): # == prepare save paths == save_paths = [ get_save_path_name( save_dir, sample_name=sample_name, sample_idx=start_idx + idx, prompt=original_batch_prompts[idx], 
prompt_as_path=prompt_as_path, num_sample=num_sample, k=k, ) for idx in range(len(batch_prompts)) ] # NOTE: Skip if the sample already exists # This is useful for resuming sampling VBench if prompt_as_path and all_exists(save_paths): continue # == process prompts step by step == # 0. split prompt # each element in the list is [prompt_segment_list, loop_idx_list] batched_prompt_segment_list = [] batched_loop_idx_list = [] for prompt in batch_prompts: prompt_segment_list, loop_idx_list = split_prompt(prompt) batched_prompt_segment_list.append(prompt_segment_list) batched_loop_idx_list.append(loop_idx_list) # 1. refine prompt by openai if cfg.get("llm_refine", False): # only call openai API when # 1. seq parallel is not enabled # 2. seq parallel is enabled and the process is rank 0 if not enable_sequence_parallelism or (enable_sequence_parallelism and is_main_process()): for idx, prompt_segment_list in enumerate(batched_prompt_segment_list): batched_prompt_segment_list[idx] = refine_prompts_by_openai(prompt_segment_list) # sync the prompt if using seq parallel if enable_sequence_parallelism: coordinator.block_all() prompt_segment_length = [ len(prompt_segment_list) for prompt_segment_list in batched_prompt_segment_list ] # flatten the prompt segment list batched_prompt_segment_list = [ prompt_segment for prompt_segment_list in batched_prompt_segment_list for prompt_segment in prompt_segment_list ] # create a list of size equal to world size broadcast_obj_list = [batched_prompt_segment_list] * coordinator.world_size dist.broadcast_object_list(broadcast_obj_list, 0) # recover the prompt list batched_prompt_segment_list = [] segment_start_idx = 0 all_prompts = broadcast_obj_list[0] for num_segment in prompt_segment_length: batched_prompt_segment_list.append( all_prompts[segment_start_idx : segment_start_idx + num_segment] ) segment_start_idx += num_segment # 2. 
append score for idx, prompt_segment_list in enumerate(batched_prompt_segment_list): batched_prompt_segment_list[idx] = append_score_to_prompts( prompt_segment_list, aes=cfg.get("aes", None), flow=cfg.get("flow", None), camera_motion=cfg.get("camera_motion", None), ) # 3. clean prompt with T5 for idx, prompt_segment_list in enumerate(batched_prompt_segment_list): batched_prompt_segment_list[idx] = [text_preprocessing(prompt) for prompt in prompt_segment_list] # 4. merge to obtain the final prompt batch_prompts = [] for prompt_segment_list, loop_idx_list in zip(batched_prompt_segment_list, batched_loop_idx_list): batch_prompts.append(merge_prompt(prompt_segment_list, loop_idx_list)) # == Iter over loop generation == video_clips = [] for loop_i in range(loop): # == get prompt for loop i == batch_prompts_loop = extract_prompts_loop(batch_prompts, loop_i) # == add condition frames for loop == if loop_i > 0: refs, ms = append_generated( vae, video_clips[-1], refs, ms, loop_i, condition_frame_length, condition_frame_edit ) # == sampling == torch.manual_seed(1024 + k) # should set diffrent seed for different samples z = torch.randn(len(batch_prompts), vae.out_channels, *latent_size, device=device, dtype=dtype) masks = apply_mask_strategy(z, refs, ms, loop_i, align=align) samples = scheduler.sample( model, text_encoder, z=z, prompts=batch_prompts_loop, device=device, additional_args=model_args, progress=verbose >= 2, mask=masks, ) samples = vae.decode(samples.to(dtype), num_frames=num_frames) video_clips.append(samples) # == save samples == if is_main_process(): for idx, batch_prompt in enumerate(batch_prompts): if verbose >= 2: logger.info("Prompt: %s", batch_prompt) save_path = save_paths[idx] video = [video_clips[i][idx] for i in range(loop)] for i in range(1, loop): video[i] = video[i][:, dframe_to_frame(condition_frame_length) :] video = torch.cat(video, dim=1) save_path = save_sample( video, fps=save_fps, save_path=save_path, verbose=verbose >= 2, ) if 
def main():
    """Reconstruct a video dataset with the VAE and report reconstruction losses.

    The function parses the inference config, builds the dataset, dataloader,
    VAE model and ``VAELoss``, then for every batch:

    * encodes and decodes the video with the full VAE and decodes the
      intermediate latent ``x_z`` with ``model.spatial_vae`` only,
    * checks the latent shape against ``model.get_latent_size``,
    * optionally accumulates latent statistics (config key ``cal_stats``),
    * updates running averages of the VAE, NLL and latent-NLL losses,
    * optionally saves the original / reconstructed / spatially decoded clips
      under ``<save_dir>_ori``, ``<save_dir>_rec`` and ``<save_dir>_spatial``
      (main process only, when ``save_dir`` is configured).

    The final running losses are written to the logger.
    """
    torch.set_grad_enabled(False)
    # ======================================================
    # configs & runtime variables
    # ======================================================
    # == parse configs ==
    cfg = parse_configs(training=False)

    # == device and dtype ==
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Validate the very value that gets converted. Previously the check used
    # an "fp32" default while the conversion used a "bf16" default; "bf16" is
    # the dtype that was effectively used when the key was absent, so keep it.
    cfg_dtype = cfg.get("dtype", "bf16")
    assert cfg_dtype in ["fp16", "bf16", "fp32"], f"Unknown mixed precision {cfg_dtype}"
    dtype = to_torch_dtype(cfg_dtype)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # == init distributed env ==
    if is_distributed():
        colossalai.launch_from_torch({})
    set_random_seed(seed=cfg.get("seed", 1024))

    # == init logger ==
    logger = create_logger()
    logger.info("Inference configuration:\n %s", pformat(cfg.to_dict()))
    verbose = cfg.get("verbose", 1)

    # ======================================================
    # build dataset and dataloader
    # ======================================================
    logger.info("Building reconstruction dataset...")
    dataset = build_module(cfg.dataset, DATASETS)
    batch_size = cfg.get("batch_size", 1)
    dataloader, _ = prepare_dataloader(
        dataset,
        batch_size=batch_size,
        num_workers=cfg.get("num_workers", 4),
        shuffle=False,
        drop_last=False,
        pin_memory=True,
        process_group=get_data_parallel_group(),
    )
    logger.info("Dataset %s contains %s videos.", cfg.dataset.data_path, len(dataset))
    total_batch_size = batch_size * get_world_size()
    logger.info("Total batch size: %s", total_batch_size)

    total_steps = len(dataloader)
    sample_limit = cfg.get("num_samples", None)
    if sample_limit is not None:
        # Use the same batch size the dataloader was built with (defaults to 1)
        # instead of reading cfg.batch_size directly.
        limit_steps = int(sample_limit // batch_size)
        total_steps = min(limit_steps, total_steps)
        logger.info("limiting test dataset to %s", limit_steps * batch_size)
    dataiter = iter(dataloader)

    # ======================================================
    # build model & loss
    # ======================================================
    logger.info("Building models...")
    model = build_module(cfg.model, MODELS).to(device, dtype).eval()
    vae_loss_fn = VAELoss(
        logvar_init=cfg.get("logvar_init", 0.0),
        perceptual_loss_weight=cfg.get("perceptual_loss_weight", 0.1),
        kl_loss_weight=cfg.get("kl_loss_weight", 1e-6),
        device=device,
        dtype=dtype,
    )

    # ======================================================
    # inference
    # ======================================================
    # == global variables ==
    running_loss = running_nll = running_nll_z = 0.0
    loss_steps = 0

    cal_stats = cfg.get("cal_stats", False)
    if cal_stats:
        num_samples = 0
        running_sum = running_var = 0.0
        running_sum_c = torch.zeros(model.out_channels, dtype=torch.float, device=device)
        running_var_c = torch.zeros(model.out_channels, dtype=torch.float, device=device)
    log_stats_every = cfg.get("log_stats_every", 100)

    # prepare arguments
    save_fps = cfg.get("fps", 24) // cfg.get("frame_interval", 1)

    # == prepare output directories once, not on every step ==
    save_dir = cfg.get("save_dir", None)
    save_enabled = is_main_process() and save_dir is not None
    if save_enabled:
        ori_dir = f"{save_dir}_ori"
        rec_dir = f"{save_dir}_rec"
        ref_dir = f"{save_dir}_spatial"
        os.makedirs(ori_dir, exist_ok=True)
        os.makedirs(rec_dir, exist_ok=True)
        os.makedirs(ref_dir, exist_ok=True)

    # Iter over the dataset
    with tqdm(
        range(total_steps),
        disable=not is_main_process() or verbose < 1,
        total=total_steps,
        initial=0,
    ) as pbar:
        for step in pbar:
            batch = next(dataiter)
            x = batch["video"].to(device, dtype)  # [B, C, T, H, W]

            # == vae encoding & decoding ===
            z, posterior, x_z = model.encode(x)
            x_rec, x_z_rec = model.decode(z, num_frames=x.size(2))
            x_ref = model.spatial_vae.decode(x_z)

            # == check z shape ==
            input_size = x.shape[2:]
            latent_size = model.get_latent_size(input_size)
            assert list(z.shape[2:]) == latent_size, f"z shape: {z.shape}, latent_size: {latent_size}"

            # == calculate stats ==
            if cal_stats:
                num_samples += 1
                running_sum += z.mean().item()
                # Squared deviation is measured against the running mean seen
                # so far, so this is a streaming estimate, not an exact variance.
                running_var += (z - running_sum / num_samples).pow(2).mean().item()
                running_sum_c += z.mean(dim=(0, 2, 3, 4)).float()
                running_var_c += (
                    (z - running_sum_c[None, :, None, None, None] / num_samples).pow(2).mean(dim=(0, 2, 3, 4)).float()
                )
                if verbose >= 1:
                    pbar.set_postfix(
                        {
                            "mean": running_sum / num_samples,
                            "std": (running_var / num_samples) ** 0.5,
                        }
                    )
                if num_samples % log_stats_every == 0:
                    # The second value is a square root, i.e. a standard
                    # deviation; the message used to call it "var".
                    logger.info(
                        "VAE feature per channel stats: mean %s, std %s",
                        (running_sum_c / num_samples).cpu().tolist(),
                        (running_var_c / num_samples).sqrt().cpu().tolist(),
                    )

            # == loss calculation ==
            nll_loss, weighted_nll_loss, weighted_kl_loss = vae_loss_fn(x, x_rec, posterior)
            nll_loss_z, _, _ = vae_loss_fn(x_z, x_z_rec, posterior, no_perceptual=True)
            vae_loss = weighted_nll_loss + weighted_kl_loss
            loss_steps += 1
            # Incremental mean: new_mean = value / n + old_mean * (n - 1) / n
            running_loss = vae_loss.item() / loss_steps + running_loss * ((loss_steps - 1) / loss_steps)
            running_nll = nll_loss.item() / loss_steps + running_nll * ((loss_steps - 1) / loss_steps)
            running_nll_z = nll_loss_z.item() / loss_steps + running_nll_z * ((loss_steps - 1) / loss_steps)

            # == save samples ==
            if save_enabled:
                for idx, vid in enumerate(x):
                    pos = step * batch_size + idx
                    save_sample(vid, fps=save_fps, save_path=f"{ori_dir}/{pos:03d}", verbose=verbose >= 2)
                    save_sample(x_rec[idx], fps=save_fps, save_path=f"{rec_dir}/{pos:03d}", verbose=verbose >= 2)
                    save_sample(x_ref[idx], fps=save_fps, save_path=f"{ref_dir}/{pos:03d}", verbose=verbose >= 2)

    logger.info("VAE loss: %s", running_loss)
    logger.info("VAE nll loss: %s", running_nll)
    logger.info("VAE nll_z loss: %s", running_nll_z)
cfg_dtype = cfg.get("dtype", "bf16") assert cfg_dtype in ["fp16", "bf16"], f"Unknown mixed precision {cfg_dtype}" dtype = to_torch_dtype(cfg.get("dtype", "bf16")) # == colossalai init distributed training == device = "cuda" if torch.cuda.is_available() else "cpu" cfg_dtype = cfg.get("dtype", "fp32") assert cfg_dtype in ["fp16", "bf16", "fp32"], f"Unknown mixed precision {cfg_dtype}" dtype = to_torch_dtype(cfg.get("dtype", "bf16")) torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cudnn.allow_tf32 = True colossalai.launch_from_torch({}) set_data_parallel_group(dist.group.WORLD) # == init logger, tensorboard & wandb == logger = create_logger() logger.info("Configuration:\n %s", pformat(cfg.to_dict())) # ====================================================== # 2. build dataset and dataloader # ====================================================== logger.info("Building dataset...") # == build dataset == dataset = build_module(cfg.dataset, DATASETS) logger.info("Dataset contains %s samples.", len(dataset)) # == build dataloader == dataloader_args = dict( dataset=dataset, batch_size=cfg.get("batch_size", None), num_workers=cfg.get("num_workers", 4), seed=cfg.get("seed", 1024), shuffle=True, drop_last=True, pin_memory=True, process_group=get_data_parallel_group(), ) dataloader, _ = prepare_dataloader( bucket_config=cfg.get("bucket_config", None), num_bucket_build_workers=cfg.get("num_bucket_build_workers", 1), **dataloader_args, ) num_steps_per_epoch = len(dataloader) # ====================================================== # 3. 
build model # ====================================================== logger.info("Building models...") # == build text-encoder and vae == text_encoder = build_module(cfg.text_encoder, MODELS, device=device, dtype=dtype) vae = build_module(cfg.vae, MODELS).to(device, dtype).eval() # == build diffusion model == input_size = (dataset.num_frames, *dataset.image_size) latent_size = vae.get_latent_size(input_size) model = ( build_module( cfg.model, MODELS, input_size=latent_size, in_channels=vae.out_channels, caption_channels=text_encoder.output_dim, model_max_length=text_encoder.model_max_length, ) .to(device, dtype) .train() ) model_numel, model_numel_trainable = get_model_numel(model) logger.info( "[Diffusion] Trainable model params: %s, Total model params: %s", format_numel_str(model_numel_trainable), format_numel_str(model_numel), ) # ======================================================= # 5. training loop # ======================================================= # == global variables == bin_size = cfg.bin_size save_text_features = cfg.get("save_text_features", False) save_compressed_text_features = cfg.get("save_compressed_text_features", False) # == number of bins == num_bin = num_steps_per_epoch // bin_size logger.info("Number of batches: %s", num_steps_per_epoch) logger.info("Bin size: %s", bin_size) logger.info("Number of bins: %s", num_bin) # resume from a specific batch index start_index = cfg.get("start_index", 0) end_index = cfg.get("end_index", num_bin) dataloader.batch_sampler.load_state_dict({"last_micro_batch_access_index": start_index}) num_bin_to_process = min(num_bin, end_index) - start_index logger.info("Start index: %s", start_index) logger.info("End index: %s", end_index) logger.info("Number of batches to process: %s", num_bin_to_process) # create save directory assert cfg.get("save_dir", None) is not None, "Please specify the save_dir in the config file." 
save_dir = os.path.join(cfg.save_dir, f"s{start_index}_e{end_index}") os.makedirs(save_dir, exist_ok=True) save_training_config(cfg.to_dict(), save_dir) logger.info("Saving features to %s", save_dir) saver = FeatureSaver(save_dir, bin_size, start_bin=start_index) # == training loop in an epoch == dataloader_iter = iter(dataloader) log_time = cfg.get("log_time", False) for i in tqdm(range(0, num_bin_to_process * bin_size)): with Timer("step", log=log_time): with Timer("data loading", log=log_time): batch = next(dataloader_iter) x = batch.pop("video").to(device, dtype) # [B, C, T, H, W] y = batch.pop("text") with Timer("vae", log=log_time): x = vae.encode(x) with Timer("feature to cpu", log=log_time): x = x.cpu() batch_dict = { "x": x, "text": y, "fps": batch["fps"].to(dtype), "height": batch["height"].to(dtype), "width": batch["width"].to(dtype), "num_frames": batch["num_frames"].to(dtype), } if save_text_features: with Timer("text", log=log_time): text_infos = text_encoder.encode(y) y_feat = text_infos["y"] y_mask = text_infos["mask"] if save_compressed_text_features: y_feat, y_mask = model.encode_text(y_feat, y_mask) y_mask = torch.tensor(y_mask) with Timer("feature to cpu", log=log_time): y_feat = y_feat.cpu() y_mask = y_mask.cpu() batch_dict.update({"y": y_feat, "mask": y_mask}) saver.update(batch_dict) if __name__ == "__main__": main() ================================================ FILE: Open-Sora/scripts/misc/launch_extract_feat.sh ================================================ #!/bin/bash set -x set -e START_SPLIT=0 NUM_SPLIT=10 DATA_PATH=$1 SAVE_PATH=$2 DATA_ARG="--data-path $DATA_PATH" SAVE_ARG="--save-dir $SAVE_PATH" CMD="torchrun --standalone --nproc_per_node 1 scripts/misc/extract_feat.py configs/opensora-v1-2/misc/extract.py $DATA_ARG $SAVE_ARG" declare -a GPUS=(0 1 2 3 4 5 6 7) mkdir -p logs/extract_feat for i in "${GPUS[@]}"; do CUDA_VISIBLE_DEVICES=$i $CMD --start-index $(($START_SPLIT + i * $NUM_SPLIT)) --end-index $(($START_SPLIT + (i + 1) * 
def fetch_requirements(paths) -> List[str]:
    """
    This function reads one or more requirements files.

    Args:
        paths (str or list of str): a single path, or a list/tuple of paths,
            to the requirements file(s).

    Returns:
        The requirement lines of all files, in file order, with surrounding
        whitespace stripped. Blank lines and full-line ``#`` comments are
        dropped because they are not requirement specifiers.
    """
    # Accept a tuple as well as a list; anything else is treated as one path.
    if not isinstance(paths, (list, tuple)):
        paths = [paths]
    requirements = []
    for path in paths:
        # Explicit encoding keeps the result independent of the platform
        # default, matching how fetch_readme() opens README.md.
        with open(path, "r", encoding="utf-8") as fd:
            for line in fd:
                entry = line.strip()
                if entry and not entry.startswith("#"):
                    requirements.append(entry)
    return requirements


def fetch_readme() -> str:
    """
    This function reads the README.md file in the current directory.

    Returns:
        The full text of the README file as a single string.
    """
    with open("README.md", encoding="utf-8") as f:
        return f.read()
def run_attn(enable_flash_attn: bool):
    """Run one forward/backward pass through ``Attention`` and print peak memory.

    The peak-memory counter of the current accelerator is reset first, then a
    bfloat16 ``Attention`` layer with rotary embeddings is applied to a random
    ``(B, S, H)`` input, the mean of the output is back-propagated, and the
    peak allocated memory is printed in MB.

    Args:
        enable_flash_attn: forwarded to ``Attention(enable_flash_attn=...)``.
    """
    get_accelerator().reset_peak_memory_stats()

    # Every module and tensor below lives on the same device in bfloat16.
    placement = {"device": get_current_device(), "dtype": torch.bfloat16}

    rotary = RotaryEmbedding(D).to(**placement)
    attention = Attention(
        H,
        N,
        qkv_bias=True,
        rope=rotary.rotate_queries_or_keys,
        enable_flash_attn=enable_flash_attn,
    ).to(**placement)

    inputs = torch.randn(B, S, H, **placement).requires_grad_()
    outputs = attention(inputs)
    outputs.mean().backward()

    peak_mb = get_accelerator().max_memory_allocated() / 1024**2
    print(f"Peak memory: {peak_mb:.2f} MB")
out.mean().backward() optimizer.step() scheduler.step() if i >= warmup_steps: assert scheduler.get_lr()[0] == 0.01 else: assert scheduler.get_lr()[0] > current_lr, f"{scheduler.get_lr()[0]} <= {current_lr}" current_lr = scheduler.get_lr()[0] if __name__ == "__main__": test_lr_scheduler() ================================================ FILE: Open-Sora/tools/__init__.py ================================================ ================================================ FILE: Open-Sora/tools/caption/README.md ================================================ # Video Captioning Human labeling of videos is expensive and time-consuming. We adopt powerful image captioning models to generate captions for videos. Although GPT-4V achieves a better performance, its 20s/sample speed is too slow for us. As for our v1.2 model, we captioned our training videos with the [PLLaVA](https://github.com/magic-research/PLLaVA) model. PLLaVA performs highly competitively on multiple video-based text generation benchmarks including [MVbench](https://paperswithcode.com/sota/video-question-answering-on-mvbench?p=pllava-parameter-free-llava-extension-from-1). ## PLLaVA Captioning To balance captioning speed and performance, we chose the 13B version of PLLaVA configured with 2*2 spatial pooling. We feed it with 4 frames evenly extracted from the video. We accelerate its inference via (1) batching and (2) offload frame extraction to a separate process such that the GPU computations and frame extraction happen in parallel. ### Installation Install the required dependancies by following our [installation instructions](../../docs/installation.md)'s "Data Dependencies" and "PLLaVA Captioning" sections. ### Usage Since PLLaVA is not fashioned as a package, we will use PYTHONPATH to use it. ```bash cd .. 
# step back to pllava_dir CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ PYTHONPATH='$PYTHONPATH:OPEN_SORA_HOME/tools/caption/pllava_dir/PLLaVA' \ nohup python caption_pllava.py \ --pretrained_model_name_or_path PLLaVA/MODELS/pllava-13b \ --use_lora \ --lora_alpha 4 \ --num_frames 4 \ --weight_dir PLLaVA/MODELS/pllava-13b \ --csv_path meta.csv \ --pooling_shape 4-12-12 \ > pllava_caption.out 2>&1 & ``` ### PLLaVA vs. LLaVA In our previous releases, we used [LLaVA](#llava-captioning) for video captioning. Qualitatively speaking, we observe that PLLaVA has a somewhat higher chance of accurately capturing the details in the video than LLaVA. See below for their comparison on a video sample.
LLaVA vs PLLaVA
| LLaVA | PLLaVA | |----------|----------| | The video is a close-up shot of two gold wedding rings. The rings are placed on a (black surface)✅, casting a soft shadow beneath them. The rings are positioned in such a way that (they are facing each other)❌, creating a mirror image effect. The rings are (identical in size and design)✅, suggesting they are a pair. The lighting in the video is soft and diffused, highlighting the gold color of the rings and creating a warm and inviting atmosphere. The overall style of the video is minimalist and elegant, focusing solely on the rings and their reflection. | The video shows a pair of gold wedding rings on a (reflective surface)✅. The rings are placed one on top of the other, (with the top ring slightly tilted to the left)✅. The rings have a (shiny, metallic finish)✅ and are the main focus of the image. The background is a gradient of dark to light gray, providing a neutral backdrop that highlights the rings. There are no texts or other objects in the image. The style of the video is a simple product display with a focus on the rings, likely intended for promotional or sales purposes. The lighting and shadows suggest a soft, even light source, (possibly a studio light)✅, which creates a reflective surface beneath the rings.| ## LLaVA Captioning We extract three frames from the video for captioning. With batch inference, we can achieve a 10 times speedup. With approximately 720p resolution and 1 frame, the speed is 2~3 videos/s on 8 GPUs. If we resize the smaller side to 336, the speed can be 8 videos/s. In Open-Sora v1.1, to lower the cost, we use the 7B model. ### Installation Install the required dependencies by following our [installation instructions](../../docs/installation.md)'s "Data Dependencies" and "LLaVA Captioning" sections. ### Usage Prepare a csv file for processing. The csv file can be generated by `convert_dataset.py` according to its [documentation](/tools/datasets/README.md).
Then, run the following command to generate captions for videos/images with Llava: ```bash # caption with mistral-7B torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava DATA.csv --dp-size 8 --tp-size 1 --model-path liuhaotian/llava-v1.6-mistral-7b --prompt video # caption with llava-34B # NOTE: remember to enable flash attention for this model torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava DATA.csv --dp-size 4 --tp-size 2 --model-path liuhaotian/llava-v1.6-34b --prompt image-3ex --flash-attention # we run this on 8xH800 GPUs torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava DATA.csv --tp-size 2 --dp-size 4 --bs 16 # at least two 80G GPUs are required torchrun --nproc_per_node 2 --standalone -m tools.caption.caption_llava DATA.csv --tp-size 2 --dp-size 1 --bs 16 # can also caption images torchrun --nproc_per_node 2 --standalone -m tools.caption.caption_llava DATA.csv --tp-size 2 --dp-size 1 --bs 16 --prompt image-3ex ``` Please note that you should add the `--flash-attention` flag when running with Llama-based Llava models as it provides speedup but do turn it off for mistral-based ones. Reasons can be found in [this issue](https://discuss.huggingface.co/t/flash-attention-has-no-effect-on-inference/73453). After running the script, with `dp-size=N`, you will get `N` parts of csv files. Run the following command to merge them: ```bash python -m tools.datasets.datautil DATA_caption_part*.csv --output DATA_caption.csv ``` ### Resume Sometimes the process may be interrupted. We can resume the process by running the following command: ```bash # merge generated results python -m tools.datasets.datautil DATA_caption_part*.csv --output DATA_caption.csv # get the remaining videos python -m tools.datasets.datautil DATA.csv --difference DATA_caption.csv --output DATA_remaining.csv ``` Then use the output csv file to resume the process. 
## GPT-4V Captioning Run the following command to generate captions for videos with GPT-4V: ```bash # output: DATA_caption.csv python -m tools.caption.caption_gpt4 DATA.csv --key $OPENAI_API_KEY ``` The cost is approximately $0.01 per video (3 frames per video). ## Camera Motion Detection Install required packages with `pip install -v .[data]` (See [installation.md](../../docs/installation.md)). Run the following command to classify camera motion: ```bash # output: meta_cmotion.csv python -m tools.caption.camera_motion.detect tools/caption/camera_motion/meta.csv ``` You may additionally specify `threshold` to indicate how "sensitive" the detection should be as below. For example `threshold = 0.2` means that the video is only counted as `tilt_up` when the pixels moved down by `>20%` of video height between the starting and ending frames. ```bash # output: meta_cmotion.csv python -m tools.caption.camera_motion.detect tools/caption/camera_motion/meta.csv --threshold 0.2 ``` Each video is classified according to 8 categories: `pan_right, pan_left, tilt_up, tilt_down, zoom_in, zoom_out, static, unclassified`. Categories of `tilt`, `pan` and `zoom` can overlap with each other. ## Tagging with Llama3 To understand the overall category distribution of our training dataset, we use Llama3 to generate tags based on the video captions. After obtaining Llama3 usage permission from huggingface/meta, you may generate tags based on the captions using Llama3 like this: ```bash torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llama3 meta.csv --key objects --output_prefix meta ``` This will generate tags based on the `text` column of `meta.csv` and put the results to `output_prefix + key.csv`. Currently the prompts for `objects` and `actions` are supported. 
class LlavaLlamaPolicy(Policy):
    """Shardformer policy for the Llama decoder layers of a LLaVA model.

    When tensor parallelism is enabled in ``shard_config``, every
    ``LlamaDecoderLayer`` gets its attention size attributes divided by the
    tensor-parallel size and its projection layers replaced by
    ``Linear1D_Col`` / ``Linear1D_Row``.
    """

    def config_sanity_check(self):
        """Perform no configuration checks."""
        pass

    def preprocess(self):
        """Return the model unchanged.

        Returns:
            ``self.model`` as is.
        """
        # NOTE: the embedding-resize step (pad vocab_size up to a multiple of
        # the tensor-parallel size via resize_token_embeddings) is commented
        # out in this policy. The bare attribute reads that were left behind
        # had no effect and have been removed.
        return self.model

    def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
        """Build the module-replacement policy.

        Returns:
            An empty dict when tensor parallelism is disabled; otherwise a dict
            mapping ``LlamaDecoderLayer`` to its ``ModulePolicyDescription``.
        """
        from transformers.models.llama.modeling_llama import LlamaDecoderLayer

        policy = {}
        if not self.shard_config.enable_tensor_parallelism:
            return policy

        config = self.model.config
        tp_size = self.shard_config.tensor_parallel_size

        # Per-rank sizes of the attention module after sharding.
        decoder_attribute_replacement = {
            "self_attn.hidden_size": config.hidden_size // tp_size,
            "self_attn.num_heads": config.num_attention_heads // tp_size,
        }
        # Only set when the config defines a truthy num_key_value_heads.
        if getattr(config, "num_key_value_heads", False):
            decoder_attribute_replacement["self_attn.num_key_value_heads"] = config.num_key_value_heads // tp_size

        # (sub-module suffix, replacement class) in the original order.
        sharded_linears = [
            ("self_attn.q_proj", Linear1D_Col),
            ("self_attn.k_proj", Linear1D_Col),
            ("self_attn.v_proj", Linear1D_Col),
            ("self_attn.o_proj", Linear1D_Row),
            ("mlp.gate_proj", Linear1D_Col),
            ("mlp.up_proj", Linear1D_Col),
            ("mlp.down_proj", Linear1D_Row),
        ]
        policy[LlamaDecoderLayer] = ModulePolicyDescription(
            attribute_replacement=decoder_attribute_replacement,
            sub_module_replacement=[
                SubModuleReplacementDescription(suffix=suffix, target_module=target_module)
                for suffix, target_module in sharded_linears
            ],
        )
        return policy

    def postprocess(self):
        """Return the model unchanged."""
        return self.model


class LlavaLlamaForCausalLMPolicy(LlavaLlamaPolicy):
    """Extend ``LlavaLlamaPolicy`` with a replacement for the ``lm_head``."""

    def module_policy(self):
        """Return the parent policy plus an ``lm_head`` entry for ``LlamaForCausalLM``.

        The entry is added only when tensor parallelism is enabled; ``lm_head``
        is replaced by ``Linear1D_Col`` with ``gather_output=True``.
        """
        from transformers import LlamaForCausalLM

        policy = super().module_policy()
        if self.shard_config.enable_tensor_parallelism:
            # add a new item for causal LM
            new_item = {
                LlamaForCausalLM: ModulePolicyDescription(
                    sub_module_replacement=[
                        SubModuleReplacementDescription(
                            suffix="lm_head", target_module=Linear1D_Col, kwargs={"gather_output": True}
                        )
                    ],
                )
            }
            policy.update(new_item)
        return policy
["LlavaMistralPolicy", "LlavaMistralForCausalLMPolicy"] class LlavaMistralPolicy(Policy): def config_sanity_check(self): pass def preprocess(self): if self.shard_config.enable_tensor_parallelism: # Resize embedding vocab_size = self.model.config.vocab_size world_size = self.shard_config.tensor_parallel_size if vocab_size % world_size != 0: new_vocab_size = vocab_size + world_size - vocab_size % world_size self.model.resize_token_embeddings(new_vocab_size) return self.model def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]: from transformers.models.mistral.modeling_mistral import MistralDecoderLayer, MistralModel policy = {} if self.shard_config.enable_sequence_parallelism: self.shard_config.enable_sequence_parallelism = False warnings.warn( "Mistral doesn't support sequence parallelism now, will ignore the sequence parallelism flag." ) if self.shard_config.enable_tensor_parallelism: decoder_attribute_replacement = { "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size, "self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size, "self_attn.num_key_value_heads": self.model.config.num_key_value_heads // self.shard_config.tensor_parallel_size, } policy[MistralDecoderLayer] = ModulePolicyDescription( attribute_replacement=decoder_attribute_replacement, sub_module_replacement=[ SubModuleReplacementDescription( suffix="self_attn.q_proj", target_module=Linear1D_Col, ), SubModuleReplacementDescription( suffix="self_attn.k_proj", target_module=Linear1D_Col, ), SubModuleReplacementDescription( suffix="self_attn.v_proj", target_module=Linear1D_Col, ), SubModuleReplacementDescription( suffix="self_attn.o_proj", target_module=Linear1D_Row, ), SubModuleReplacementDescription( suffix="mlp.gate_proj", target_module=Linear1D_Col, ), SubModuleReplacementDescription( suffix="mlp.up_proj", target_module=Linear1D_Col, ), SubModuleReplacementDescription( 
def transform(vector):
    """Average a collection of 2-D vectors component-wise.

    Args:
        vector: Iterable of ``(x, y)`` pairs.

    Returns:
        A two-element list ``[mean_x, mean_y]``.
    """
    x = np.mean([item[0] for item in vector])
    y = np.mean([item[1] for item in vector])
    return [x, y]


class CameraPredict:
    """Classify camera motion from point tracks produced by a ``torch.hub`` model.

    Tracks of the first and last frame are compared on the four edges of the
    tracking grid; the per-edge displacement directions are then mapped to
    labels such as ``pan_left``, ``tilt_up`` or ``zoom_in``.
    """

    # "<direction of first edge>_<direction of second edge>" -> motion label.
    _TOP_DOWN_MAPPING = {
        "left_left": "pan_right",
        "right_right": "pan_left",
        "down_down": "tilt_up",
        "up_up": "tilt_down",
        "up_down": "zoom_in",
        "down_up": "zoom_out",
        "static_static": "static",
    }
    _LEFT_RIGHT_MAPPING = {
        "left_left": "pan_right",
        "right_right": "pan_left",
        "down_down": "tilt_up",
        "up_up": "tilt_down",
        "left_right": "zoom_in",
        "right_left": "zoom_out",
        "static_static": "static",
    }

    def __init__(self, device, submodules_list, factor=0.25):
        """Load the tracking model.

        Args:
            device: Device the model and the input video are moved to.
            submodules_list: Mapping with the ``"repo"`` and ``"model"`` keys
                passed to ``torch.hub.load``.
            factor: Fraction of the shorter video side a mean displacement
                must exceed to count as motion.
        """
        self.device = device
        self.grid_size = 10
        self.factor = factor
        try:
            self.model = torch.hub.load(submodules_list["repo"], submodules_list["model"]).to(self.device)
        except Exception:
            # Workaround for CERTIFICATE_VERIFY_FAILED
            # (see: https://github.com/pytorch/pytorch/issues/33288#issuecomment-954160699).
            # ``except Exception`` (not a bare ``except``) so Ctrl-C / SystemExit
            # are not swallowed and retried.
            # NOTE(review): this disables HTTPS certificate verification for the
            # whole process, whatever the original failure was.
            import ssl

            ssl._create_default_https_context = ssl._create_unverified_context
            self.model = torch.hub.load(submodules_list["repo"], submodules_list["model"]).to(self.device)

    def infer(self, video_path, save_video=False, save_dir="./saved_videos"):
        """Run the tracker on a video and return its tracks for the first batch item.

        Also stores ``min(height, width)`` of the video in ``self.scale``.

        Args:
            video_path: Path handed to ``load_video``.
            save_video: When true, render the tracks with ``Visualizer``.
            save_dir: Output directory used when ``save_video`` is set.

        Returns:
            Integer NumPy array of the predicted tracks (``T x N x 2`` per the
            model-output comment below).
        """
        # load video
        video = load_video(video_path, return_tensor=False)
        # set scale
        height, width = video.shape[1], video.shape[2]
        self.scale = min(height, width)
        video = torch.from_numpy(video).permute(0, 3, 1, 2)[None].float().to(self.device)  # B T C H W
        pred_tracks, pred_visibility = self.model(video, grid_size=self.grid_size)  # B T N 2,  B T N 1

        if save_video:
            # splitext instead of [:-4] so extensions of any length are stripped.
            video_name = os.path.splitext(os.path.basename(video_path))[0]
            vis = Visualizer(save_dir=save_dir, pad_value=120, linewidth=3)
            vis.visualize(video, pred_tracks, pred_visibility, filename=video_name)

        return pred_tracks[0].long().detach().cpu().numpy()

    def transform_class(self, vector, min_reso):
        """Turn a mean displacement into direction labels.

        Args:
            vector: ``(x, y)`` displacement.
            min_reso: Reference resolution; the threshold is ``min_reso * self.factor``.

        Returns:
            A list with up to one horizontal (``"right"``/``"left"``) and one
            vertical (``"down"``/``"up"``) label, or ``["static"]``.
        """
        scale = min_reso * self.factor
        x, y = vector
        direction = []

        if x > scale:
            direction.append("right")
        elif x < -scale:
            direction.append("left")

        if y > scale:
            direction.append("down")
        elif y < -scale:
            direction.append("up")

        return direction if direction else ["static"]

    def get_edge_point(self, track):
        """Pick the four central grid points on each edge of ``track``.

        Returns:
            ``(top, down, left, right)`` lists of points.
        """
        middle = self.grid_size // 2
        central = range(middle - 2, middle + 2)
        top = [list(track[0, i, :]) for i in central]
        down = [list(track[self.grid_size - 1, i, :]) for i in central]
        left = [list(track[i, 0, :]) for i in central]
        right = [list(track[i, self.grid_size - 1, :]) for i in central]
        return top, down, left, right

    def get_edge_direction(self, track1, track2):
        """Return direction labels for the top, down, left and right edges.

        Displacements are measured from ``track1`` to ``track2``, averaged per
        edge, and thresholded against ``self.scale`` (set by ``infer``).
        """
        edge_points1 = self.get_edge_point(track1)
        edge_points2 = self.get_edge_point(track2)

        vector_results = []
        for points1, points2 in zip(edge_points1, edge_points2):
            vectors = [[end[0] - start[0], end[1] - start[1]] for start, end in zip(points1, points2)]
            vector_results.append(vectors)
        vector_results = list(map(transform, vector_results))
        class_results = [self.transform_class(vector, min_reso=self.scale) for vector in vector_results]

        return class_results

    @staticmethod
    def _classify_pairs(first, second, mapping):
        """Map every ``first`` x ``second`` direction pair through ``mapping``."""
        results = [
            mapping[f"{item_a}_{item_b}"]
            for item_a in first
            for item_b in second
            if f"{item_a}_{item_b}" in mapping
        ]
        return results if results else ["None"]

    def classify_top_down(self, top, down):
        """Classify motion from the top and bottom edge directions."""
        return self._classify_pairs(top, down, self._TOP_DOWN_MAPPING)

    def classify_left_right(self, left, right):
        """Classify motion from the left and right edge directions."""
        return self._classify_pairs(left, right, self._LEFT_RIGHT_MAPPING)

    def camera_classify(self, track1, track2):
        """Combine both edge-pair classifications into a list of motion labels.

        ``"None"`` and ``"static"`` are dropped whenever another label is
        present; a lone ``"None"`` becomes ``"Undetermined"``.

        Returns:
            Alphabetically sorted list of unique labels. Sorting makes the
            output reproducible; plain ``set`` order varies between runs.
        """
        top, down, left, right = self.get_edge_direction(track1, track2)

        top_results = self.classify_top_down(top, down)
        left_results = self.classify_left_right(left, right)

        results = sorted(set(top_results + left_results))
        if "None" in results and len(results) > 1:
            results.remove("None")
        if "static" in results and len(results) > 1:
            results.remove("static")
        if len(results) == 1 and results[0] == "None":  # Tom added this to deal with edge cases
            results = ["Undetermined"]
        return results

    def predict(self, video_path):
        """Return the camera-motion labels of the video at ``video_path``.

        Compares the tracks of the first and the last frame.
        NOTE(review): the reshape assumes the model returns exactly
        ``grid_size ** 2`` tracks per frame.
        """
        pred_track = self.infer(video_path)
        track1 = pred_track[0].reshape((self.grid_size, self.grid_size, 2))
        track2 = pred_track[-1].reshape((self.grid_size, self.grid_size, 2))
        results = self.camera_classify(track1, track2)

        return results


def compute_camera_motion(device, submodules_dict, video_paths, factor):
    """Predict camera motion for several videos with one shared model.

    Args:
        device: Device passed to ``CameraPredict``.
        submodules_dict: ``torch.hub`` ``"repo"``/``"model"`` mapping.
        video_paths: Iterable of video paths.
        factor: Motion threshold factor passed to ``CameraPredict``.

    Returns:
        One string per video: its motion labels joined with ``"+"``.
    """
    camera = CameraPredict(device, submodules_dict, factor)
    all_predictions = []
    for video_path in video_paths:
        camera_motion_types = camera.predict(video_path)
        all_predictions.append("+".join(camera_motion_types))
    return all_predictions
def get_frame_indices(num_frames, vlen, sample="rand", fix_start=None, input_fps=1, max_num_frames=-1):
    """Choose which frame indices of a ``vlen``-frame video to sample.

    Args:
        num_frames: Number of indices wanted (``"rand"``/``"middle"`` modes).
        vlen: Number of frames in the video.
        sample: ``"rand"`` (one random frame per interval), ``"middle"``
            (centre of each interval) or ``"fps<rate>"`` (e.g. ``"fps0.5"``,
            sequential sampling at that rate).
        fix_start: For ``"middle"``, when not ``None`` use
            ``interval_start + fix_start`` instead of the interval centre.
        input_fps: Frame rate of the video (``"fps"`` mode only).
        max_num_frames: If positive, cap on the number of indices in
            ``"fps"`` mode.

    Returns:
        List of frame indices. In ``"rand"``/``"middle"`` mode the list is
        padded with its last index up to ``num_frames`` entries.

    Raises:
        ValueError: If ``sample`` is not a recognised mode.
    """
    if sample in ["rand", "middle"]:  # uniform sampling
        acc_samples = min(num_frames, vlen)
        # split the video into `acc_samples` intervals, and sample from each interval.
        intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
        ranges = [(lo, hi - 1) for lo, hi in zip(intervals[:-1], intervals[1:])]
        if sample == "rand":
            # Local import: the module never imported ``random``, so this branch
            # used to raise NameError, which a bare ``except`` silently turned
            # into the fallback below on every call.
            import random

            try:
                frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
            except IndexError:
                # An interval holding a single frame gives an empty range;
                # fall back to a sorted random subset of all frames.
                frame_indices = np.random.permutation(vlen)[:acc_samples]
                frame_indices.sort()
                frame_indices = list(frame_indices)
        elif fix_start is not None:
            frame_indices = [x[0] + fix_start for x in ranges]
        elif sample == "middle":
            frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
        else:
            raise NotImplementedError

        if len(frame_indices) < num_frames:  # padded with last frame
            padded_frame_indices = [frame_indices[-1]] * num_frames
            padded_frame_indices[: len(frame_indices)] = frame_indices
            frame_indices = padded_frame_indices
    elif "fps" in sample:  # fps0.5, sequentially sample frames at 0.5 fps
        output_fps = float(sample[3:])
        duration = float(vlen) / input_fps
        delta = 1 / output_fps  # gap between frames, this is also the clip length each frame represents
        frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
        frame_indices = np.around(frame_seconds * input_fps).astype(int)
        frame_indices = [e for e in frame_indices if e < vlen]
        if max_num_frames > 0 and len(frame_indices) > max_num_frames:
            frame_indices = frame_indices[:max_num_frames]
    else:
        raise ValueError
    return frame_indices


def load_video(video_path, data_transform=None, num_frames=None, return_tensor=True, width=None, height=None):
    """Load a ``.gif``, ``.png`` or ``.mp4`` file as a stack of RGB frames.

    The format is chosen from the (case-sensitive) file extension: GIF frames
    are iterated and converted to RGB, a PNG yields a single frame, and MP4
    files are decoded with decord's ``VideoReader``.

    Args:
        video_path (str): Path of the video or image to load.
        data_transform (callable, optional): Applied to the ``(T, H, W, C)``
            uint8 NumPy frame array; its result is returned unchanged.
        num_frames (int, optional): If truthy, keep this many frames, chosen
            with ``get_frame_indices(..., sample="middle")``.
        return_tensor (bool): Only used when ``data_transform`` is not given.
            If true, return a ``torch.Tensor`` permuted to ``(T, C, H, W)``;
            otherwise return the ``(T, H, W, C)`` uint8 NumPy array.
        width, height (int, optional): Passed to ``VideoReader`` for MP4
            input; only used when ``width`` is truthy.

    Returns:
        The transformed frames, a ``(T, C, H, W)`` tensor, or a
        ``(T, H, W, C)`` array, as described above.

    Raises:
        NotImplementedError: If the file extension is not supported.
    """
    if video_path.endswith(".gif"):
        frame_ls = []
        img = Image.open(video_path)
        for frame in ImageSequence.Iterator(img):
            frame = frame.convert("RGB")
            frame = np.array(frame).astype(np.uint8)
            frame_ls.append(frame)
        buffer = np.array(frame_ls).astype(np.uint8)
    elif video_path.endswith(".png"):
        frame = Image.open(video_path)
        frame = frame.convert("RGB")
        frame = np.array(frame).astype(np.uint8)
        frame_ls = [frame]
        buffer = np.array(frame_ls)
    elif video_path.endswith(".mp4"):
        import decord

        decord.bridge.set_bridge("native")
        if width:
            video_reader = VideoReader(video_path, width=width, height=height, num_threads=1)
        else:
            video_reader = VideoReader(video_path, num_threads=1)
        frames = video_reader.get_batch(range(len(video_reader)))  # (T, H, W, C)
        buffer = frames.asnumpy().astype(np.uint8)
    else:
        raise NotImplementedError(f"Unsupported video format: {video_path}")

    frames = buffer
    if num_frames:
        frame_indices = get_frame_indices(num_frames, len(frames), sample="middle")
        frames = frames[frame_indices]

    if data_transform:
        frames = data_transform(frames)
    elif return_tensor:
        # torch.Tensor(...) builds a default-dtype (float) tensor, not uint8.
        frames = torch.Tensor(frames)
        frames = frames.permute(0, 3, 1, 2)  # (T, C, H, W)

    return frames
def read_video_from_path(path):
    """Read all frames of the video at ``path`` into one stacked array.

    Returns:
        ``np.stack`` of the frames, or ``None`` (after printing the error)
        when the reader cannot be opened.
    """
    try:
        reader = imageio.get_reader(path)
    except Exception as e:
        print("Error opening video file: ", e)
        return None
    frames = [np.array(im) for im in reader]
    return np.stack(frames)


def draw_circle(rgb, coord, radius, color=(255, 0, 0), visible=True):
    """Draw a circle on the PIL image ``rgb`` and return the image.

    The circle is filled with ``color`` when ``visible`` is truthy and only
    outlined otherwise.
    """
    # Create a draw object
    draw = ImageDraw.Draw(rgb)
    # Calculate the bounding box of the circle
    left_up_point = (coord[0] - radius, coord[1] - radius)
    right_down_point = (coord[0] + radius, coord[1] + radius)
    # Draw the circle
    draw.ellipse(
        [left_up_point, right_down_point],
        fill=tuple(color) if visible else None,
        outline=tuple(color),
    )
    return rgb


def draw_line(rgb, coord_y, coord_x, color, linewidth):
    """Draw a line from ``coord_y`` to ``coord_x`` on the PIL image ``rgb``."""
    draw = ImageDraw.Draw(rgb)
    draw.line(
        (coord_y[0], coord_y[1], coord_x[0], coord_x[1]),
        fill=tuple(color),
        width=linewidth,
    )
    return rgb


def add_weighted(rgb, alpha, original, beta, gamma):
    """Return ``rgb * alpha + original * beta + gamma`` cast to ``uint8``."""
    return (rgb * alpha + original * beta + gamma).astype("uint8")


class Visualizer:
    """Render point tracks on top of a video and optionally save the result."""

    def __init__(
        self,
        save_dir: str = "./results",
        grayscale: bool = False,
        pad_value: int = 0,
        fps: int = 10,
        mode: str = "rainbow",  # 'cool', 'optical_flow'
        linewidth: int = 2,
        show_first_frame: int = 10,
        tracks_leave_trace: int = 0,  # -1 for infinite
    ):
        """Store the rendering options.

        A matplotlib colour map is looked up for the ``"rainbow"`` and
        ``"cool"`` modes only; other modes leave ``self.color_map`` unset.
        """
        self.mode = mode
        self.save_dir = save_dir
        if mode == "rainbow":
            self.color_map = cm.get_cmap("gist_rainbow")
        elif mode == "cool":
            self.color_map = cm.get_cmap(mode)
        self.show_first_frame = show_first_frame
        self.grayscale = grayscale
        self.tracks_leave_trace = tracks_leave_trace
        self.pad_value = pad_value
        self.linewidth = linewidth
        self.fps = fps

    def visualize(
        self,
        video: torch.Tensor,  # (B,T,C,H,W)
        tracks: torch.Tensor,  # (B,T,N,2)
        visibility: torch.Tensor = None,  # (B, T, N, 1) bool
        gt_tracks: torch.Tensor = None,  # (B,T,N,2)
        segm_mask: torch.Tensor = None,  # (B,1,H,W)
        filename: str = "video",
        writer=None,  # tensorboard Summary Writer, used for visualization during training
        step: int = 0,
        query_frame: int = 0,
        save_video: bool = True,
        compensate_for_camera_motion: bool = False,
    ):
        """Pad the video, draw the tracks on it and optionally save it.

        The video is padded on every side with ``self.pad_value`` pixels of
        value 255 and the track coordinates are shifted by the same amount.
        ``compensate_for_camera_motion`` requires ``segm_mask``.

        Returns:
            The rendered video returned by ``draw_tracks_on_video``.
        """
        if compensate_for_camera_motion:
            assert segm_mask is not None
        if segm_mask is not None:
            # Sample the mask at the query-frame track positions.
            coords = tracks[0, query_frame].round().long()
            segm_mask = segm_mask[0, query_frame][coords[:, 1], coords[:, 0]].long()

        video = F.pad(
            video,
            (self.pad_value, self.pad_value, self.pad_value, self.pad_value),
            "constant",
            255,
        )
        print("video shape after pad is: ", video.shape)
        tracks = tracks + self.pad_value
        print("tracks shape after pad is: ", tracks.shape)

        if self.grayscale:
            transform = transforms.Grayscale()
            video = transform(video)
            video = video.repeat(1, 1, 3, 1, 1)

        res_video = self.draw_tracks_on_video(
            video=video,
            tracks=tracks,
            visibility=visibility,
            segm_mask=segm_mask,
            gt_tracks=gt_tracks,
            query_frame=query_frame,
            compensate_for_camera_motion=compensate_for_camera_motion,
        )
        if save_video:
            self.save_video(res_video, filename=filename, writer=writer, step=step)
        return res_video

    def save_video(self, video, filename, writer=None, step=0):
        """Log ``video`` to ``writer`` or write it to ``<save_dir>/<filename>.mp4``.

        When writing a file, the first two frames and the last frame are
        skipped (``wide_list[2:-1]``).
        """
        if writer is not None:
            writer.add_video(
                filename,
                video.to(torch.uint8),
                global_step=step,
                fps=self.fps,
            )
        else:
            os.makedirs(self.save_dir, exist_ok=True)
            wide_list = list(video.unbind(1))
            wide_list = [wide[0].permute(1, 2, 0).cpu().numpy() for wide in wide_list]

            # Prepare the video file path
            save_path = os.path.join(self.save_dir, f"{filename}.mp4")

            # Create a writer object
            video_writer = imageio.get_writer(save_path, fps=self.fps)
            try:
                # Write frames to the video file
                for frame in wide_list[2:-1]:
                    video_writer.append_data(frame)
            finally:
                # Close the file even if a frame fails to encode.
                video_writer.close()

            print(f"Video saved to {save_path}")

    def draw_tracks_on_video(
        self,
        video: torch.Tensor,
        tracks: torch.Tensor,
        visibility: torch.Tensor = None,
        segm_mask: torch.Tensor = None,
        gt_tracks=None,
        query_frame: int = 0,
        compensate_for_camera_motion=False,
    ):
        """Draw track traces and track points on every frame from ``query_frame`` on.

        Only the first batch item is rendered.

        Returns:
            ``uint8`` tensor of shape ``(1, T', C, H, W)``; when
            ``show_first_frame > 0`` the first frame is repeated that many
            times at the start.
        """
        B, T, C, H, W = video.shape
        _, _, N, D = tracks.shape

        assert D == 2
        assert C == 3
        video = video[0].permute(0, 2, 3, 1).byte().detach().cpu().numpy()  # S, H, W, C
        tracks = tracks[0].long().detach().cpu().numpy()  # S, N, 2
        if gt_tracks is not None:
            gt_tracks = gt_tracks[0].detach().cpu().numpy()

        # process input video
        res_video = [rgb.copy() for rgb in video]
        vector_colors = np.zeros((T, N, 3))

        if self.mode == "optical_flow":
            import flow_vis

            vector_colors = flow_vis.flow_to_color(tracks - tracks[query_frame][None])
        elif segm_mask is None:
            if self.mode == "rainbow":
                # colour depends on the y position in the query frame
                y_min, y_max = (
                    tracks[query_frame, :, 1].min(),
                    tracks[query_frame, :, 1].max(),
                )
                norm = plt.Normalize(y_min, y_max)
                for n in range(N):
                    color = self.color_map(norm(tracks[query_frame, n, 1]))
                    color = np.array(color[:3])[None] * 255
                    vector_colors[:, n] = np.repeat(color, T, axis=0)
            else:
                # color changes with time
                for t in range(T):
                    color = np.array(self.color_map(t / T)[:3])[None] * 255
                    vector_colors[t] = np.repeat(color, N, axis=0)
        else:
            if self.mode == "rainbow":
                vector_colors[:, segm_mask <= 0, :] = 255

                y_min, y_max = (
                    tracks[0, segm_mask > 0, 1].min(),
                    tracks[0, segm_mask > 0, 1].max(),
                )
                norm = plt.Normalize(y_min, y_max)
                for n in range(N):
                    if segm_mask[n] > 0:
                        color = self.color_map(norm(tracks[0, n, 1]))
                        color = np.array(color[:3])[None] * 255
                        vector_colors[:, n] = np.repeat(color, T, axis=0)
            else:
                # color changes with segm class
                segm_mask = segm_mask.cpu()
                color = np.zeros((segm_mask.shape[0], 3), dtype=np.float32)
                color[segm_mask > 0] = np.array(self.color_map(1.0)[:3]) * 255.0
                color[segm_mask <= 0] = np.array(self.color_map(0.0)[:3]) * 255.0
                vector_colors = np.repeat(color[None], T, axis=0)

        # draw tracks
        if self.tracks_leave_trace != 0:
            for t in range(query_frame + 1, T):
                first_ind = max(0, t - self.tracks_leave_trace) if self.tracks_leave_trace >= 0 else 0
                curr_tracks = tracks[first_ind : t + 1]
                curr_colors = vector_colors[first_ind : t + 1]
                if compensate_for_camera_motion:
                    # subtract the mean displacement of the points outside the mask
                    diff = (tracks[first_ind : t + 1, segm_mask <= 0] - tracks[t : t + 1, segm_mask <= 0]).mean(1)[
                        :, None
                    ]

                    curr_tracks = curr_tracks - diff
                    curr_tracks = curr_tracks[:, segm_mask > 0]
                    curr_colors = curr_colors[:, segm_mask > 0]

                res_video[t] = self._draw_pred_tracks(
                    res_video[t],
                    curr_tracks,
                    curr_colors,
                )
                if gt_tracks is not None:
                    res_video[t] = self._draw_gt_tracks(res_video[t], gt_tracks[first_ind : t + 1])

        # draw points
        for t in range(query_frame, T):
            img = Image.fromarray(np.uint8(res_video[t]))
            for i in range(N):
                coord = (tracks[t, i, 0], tracks[t, i, 1])
                visibile = True
                if visibility is not None:
                    visibile = visibility[0, t, i]
                # points with a zero coordinate are skipped
                if coord[0] != 0 and coord[1] != 0:
                    if not compensate_for_camera_motion or (compensate_for_camera_motion and segm_mask[i] > 0):
                        img = draw_circle(
                            img,
                            coord=coord,
                            radius=int(self.linewidth * 2),
                            color=vector_colors[t, i].astype(int),
                            visible=visibile,
                        )
            res_video[t] = np.array(img)

        # construct the final rgb sequence
        if self.show_first_frame > 0:
            res_video = [res_video[0]] * self.show_first_frame + res_video[1:]
        return torch.from_numpy(np.stack(res_video)).permute(0, 3, 1, 2)[None].byte()

    def _draw_pred_tracks(
        self,
        rgb: np.ndarray,  # H x W x 3
        tracks: np.ndarray,  # T x 2
        vector_colors: np.ndarray,
        alpha: float = 0.5,
    ):
        """Draw line segments between consecutive track positions on ``rgb``.

        The ``alpha`` argument is overwritten per step with ``(s / T) ** 2``;
        blending with the previous image only happens when
        ``self.tracks_leave_trace > 0``.
        """
        T, N, _ = tracks.shape
        rgb = Image.fromarray(np.uint8(rgb))
        for s in range(T - 1):
            vector_color = vector_colors[s]
            original = rgb.copy()
            alpha = (s / T) ** 2
            for i in range(N):
                coord_y = (int(tracks[s, i, 0]), int(tracks[s, i, 1]))
                coord_x = (int(tracks[s + 1, i, 0]), int(tracks[s + 1, i, 1]))
                if coord_y[0] != 0 and coord_y[1] != 0:
                    rgb = draw_line(
                        rgb,
                        coord_y,
                        coord_x,
                        vector_color[i].astype(int),
                        self.linewidth,
                    )
            if self.tracks_leave_trace > 0:
                rgb = Image.fromarray(np.uint8(add_weighted(np.array(rgb), alpha, np.array(original), 1 - alpha, 0)))
        rgb = np.array(rgb)
        return rgb

    def _draw_gt_tracks(
        self,
        rgb: np.ndarray,  # H x W x 3,
        gt_tracks: np.ndarray,  # T x 2
    ):
        """Mark every ground-truth point with strictly positive coordinates by a red cross."""
        T, N, _ = gt_tracks.shape
        color = np.array((211, 0, 0))
        rgb = Image.fromarray(np.uint8(rgb))
        for t in range(T):
            for i in range(N):
                # Use a separate name: rebinding ``gt_tracks`` here would
                # replace the whole array with one point after the first step.
                point = gt_tracks[t][i]
                # draw a red cross
                if point[0] > 0 and point[1] > 0:
                    length = self.linewidth * 3
                    coord_y = (int(point[0]) + length, int(point[1]) + length)
                    coord_x = (int(point[0]) - length, int(point[1]) - length)
                    rgb = draw_line(
                        rgb,
                        coord_y,
                        coord_x,
                        color,
                        self.linewidth,
                    )
                    coord_y = (int(point[0]) - length, int(point[1]) + length)
                    coord_x = (int(point[0]) + length, int(point[1]) - length)
                    rgb = draw_line(
                        rgb,
                        coord_y,
                        coord_x,
                        color,
                        self.linewidth,
                    )
        rgb = np.array(rgb)
        return rgb
def get_video_type(frame_types):
    """Summarise per-frame motion labels into one label for the whole video.

    Args:
        frame_types: Sequence of per-frame labels.

    Returns:
        * ``"unknown"`` when ``frame_types`` is empty (nothing was analysed);
        * the label that makes up a strict majority, if one exists;
        * ``"unknown"`` when there is no majority and ``"static"`` occurs;
        * ``"pan/tilt"`` when there is no majority and no zoom label occurs;
        * ``"dynamic"`` otherwise.
    """
    from collections import Counter

    if not frame_types:
        # Without this guard an empty list fell through to "pan/tilt".
        return "unknown"

    counts = Counter(frame_types)
    # A strict majority, when it exists, is unique, so tie order is irrelevant.
    max_type, max_count = counts.most_common(1)[0]
    if max_count > len(frame_types) / 2:
        return max_type
    if "static" in counts:
        return "unknown"
    if "zoom in" not in counts and "zoom out" not in counts:
        return "pan/tilt"
    return "dynamic"


def process(path: str, frame_interval=15) -> str:
    """Classify the camera motion of the video at ``path``.

    Every ``frame_interval``-th frame is converted to grayscale; Farneback
    optical flow is computed against the reference frame and summarised by
    ``get_type``; the per-frame labels are combined by ``get_video_type``.

    NOTE(review): ``prvs`` is only assigned for the first frame, so flow is
    always measured against frame 0 rather than the previous sample —
    confirm this is intended.

    Args:
        path: Video file path.
        frame_interval: Step, in frames, between analysed frames.

    Returns:
        The video-level motion label.
    """
    cap = cv2.VideoCapture(path)
    count = 0
    prvs = None
    frame_types = []
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            if count == 0:
                prvs = frame
                h, w = frame.shape
                # pixel coordinate grid and each pixel's distance to the image centre
                empty = make_empty(w, h)
                empty_dists = np.sqrt(
                    np.square(empty.ravel()[::2] - (w / 2)) + np.square(empty.ravel()[1::2] - (h / 2))
                )
            else:
                flow = cv2.calcOpticalFlowFarneback(prvs, frame, None, 0.5, 3, 15, 3, 5, 1.2, 0)
                mag, ang = cv2.cartToPolar(flow[..., 0], flow[..., 1], angleInDegrees=True)
                mean_mag = np.median(mag)
                mean_ang = np.median(ang)

                # fraction of pixels whose displaced position is at least as
                # far from the centre as before
                flow_coords = flow + empty
                xvals = flow_coords.ravel()[::2] - (w / 2)
                yvals = flow_coords.ravel()[1::2] - (h / 2)
                dists = np.sqrt(np.square(xvals) + np.square(yvals))
                dist_diff = dists >= empty_dists
                zoom_in_factor = np.count_nonzero(dist_diff) / len(dist_diff)
                frame_types.append(get_type(mean_mag, mean_ang, zoom_in_factor))
            count += frame_interval
            cap.set(cv2.CAP_PROP_POS_FRAMES, count)
    finally:
        # Release the capture on every exit path, including exceptions.
        cap.release()
    video_type = get_video_type(frame_types)
    return video_type
def to_base64(image):
    """Save ``image`` as JPEG into memory and return it base64-encoded as text."""
    jpeg_bytes = BytesIO()
    image.save(jpeg_bytes, format="JPEG")
    encoded = base64.b64encode(jpeg_bytes.getvalue())
    return encoded.decode("utf-8")


def get_caption(frame, prompt, api_key):
    """Request a caption for three base64 JPEG frames from the chat-completions API.

    Args:
        frame: Indexable holding at least three base64-encoded JPEG strings;
            entries 0, 1 and 2 are sent.
        prompt: Text prompt sent before the images.
        api_key: Bearer token for the ``Authorization`` header.

    Returns:
        The content of the first choice's message, with newlines replaced by
        spaces.
    """
    auth_headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}

    # One image_url entry per frame, in order 0, 1, 2.
    image_parts = [
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame[idx]}"}} for idx in range(3)
    ]
    text_part = {
        "type": "text",
        "text": prompt,
    }
    request_body = {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "user",
                "content": [text_part, *image_parts],
            }
        ],
        "max_tokens": 300,
    }

    reply = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=auth_headers,
        json=request_body,
        timeout=60,
    )
    text = reply.json()["choices"][0]["message"]["content"]
    return text.replace("\n", " ")
class CSVTextDataset(Dataset):
    """Dataset over the rows of a CSV file that has a ``text`` column.

    ``set_rank_and_world_size`` restricts the dataset to the contiguous slice
    of rows that belongs to one rank.
    """

    def __init__(self, csv_path):
        self.df = pd.read_csv(csv_path)
        # assert text is in the columns
        assert "text" in self.df.columns, "text column not found in the csv file"
        # Un-sharded defaults so ``write_to_csv`` also works when
        # ``set_rank_and_world_size`` was never called.
        self.rank = 0
        self.world_size = 1
        self.data_per_gpu = len(self.df)
        self.start_index = 0
        self.end_index = len(self.df)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return row ``idx`` (positional, within this rank's slice) as a Series."""
        if idx < 0 or idx >= len(self.df):
            raise IndexError
        return self.df.iloc[idx]

    def set_rank_and_world_size(self, rank, world_size):
        """Keep only the rows assigned to ``rank`` out of ``world_size`` ranks.

        Every rank gets ``len(self) // world_size`` rows; the last rank also
        takes the remainder.
        """
        self.rank = rank
        self.world_size = world_size
        self.data_per_gpu = len(self) // world_size
        self.start_index = rank * self.data_per_gpu
        self.end_index = (rank + 1) * self.data_per_gpu if rank != world_size - 1 else len(self)
        self.df = self.df.iloc[self.start_index : self.end_index]

    def write_to_csv(self, output_file, data, new_key):
        """Write this rank's rows plus one new column to ``output_file``.

        Args:
            output_file: Destination CSV path (overwritten).
            data: Sequence of new-column values, one per row of this rank, in
                row order.
            new_key: Header of the new column.
        """
        # A plain list: ``Index + [new_key]`` is an element-wise operation,
        # not an append.
        columns = [*self.df.columns, new_key]
        # ``newline=""`` as the csv module requires; the ``with`` block closes
        # the file (``csv.writer`` objects have no ``close`` method).
        with open(output_file, "w", newline="") as handle:
            writer = csv.writer(handle)
            writer.writerow(columns)
            for index, row in self.df.iterrows():
                # index labels are the row positions in the original file
                if index < self.start_index or index >= self.end_index:
                    continue
                writer.writerow([*row, data[index - self.start_index]])


def pad_left(sequences, padding_value=0):
    """Left-pad 1-D tensors to a common length and stack them into a batch.

    Args:
        sequences: Non-empty list of 1-D tensors.
        padding_value: Value used for the padding elements.

    Returns:
        Tensor of shape ``(len(sequences), max_len)``.
    """
    # Determine the maximum length of the sequences
    max_len = max(s.size(0) for s in sequences)

    padded_sequences = []
    for sequence in sequences:
        # Number of padding elements needed in front of this sequence
        num_padding = max_len - sequence.size(0)
        padding = torch.full((num_padding,), padding_value, dtype=sequence.dtype, device=sequence.device)
        # Concatenate the padding and the sequence to pad on the left
        padded_sequences.append(torch.cat([padding, sequence], dim=0))

    # Stack the padded sequences into a batch
    return torch.stack(padded_sequences)
init environment # ====================================================== dist.init_process_group(backend="nccl", timeout=timedelta(hours=24)) torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count()) # ====================================================== # 2. Prep rank-wise dataloader # ====================================================== dataframe = read_file(args.input) print("read data from {}".format(args.input)) dataset = CSVTextDataset(args.input) dataset.set_rank_and_world_size(dist.get_rank(), dist.get_world_size()) import os if os.getenv("DEBUG_ADDRESS") != None and dist.get_rank() == 2: import ptvsd print("waiting for debugger attachment") ptvsd.enable_attach(address=("localhost", int(os.getenv("DEBUG_ADDRESS"))), redirect_output=True) ptvsd.wait_for_attach() output_file = args.output_prefix + f"_rank{dist.get_rank()}" + f"_{args.key}.csv" output_file_handle = open(output_file, "w") writer = csv.writer(output_file_handle) columns = list(dataframe.columns) + [args.key] writer.writerow(columns) # add a new key named summary, write in csv file print("the processed data saved on this rank will be saved to {}".format(output_file)) def collate_fn(batch): return batch dataloader = torch.utils.data.DataLoader( dataset, # num_workers=2, batch_size=args.batch_size, collate_fn=collate_fn, shuffle=False, ) # ====================================================== # 2. 
process using llama3 and prompt # ====================================================== print("Using model with the id {}".format(args.model_id)) model_id = args.model_id tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left") model = AutoModelForCausalLM.from_pretrained( model_id, torch_dtype=torch.bfloat16, device_map=dist.get_rank() % torch.cuda.device_count(), ) # .to(dist.get_rank() % torch.cuda.device_count()) dist.barrier() print("======== Process data using LLAMA3 ========") def extract_batch(texts, prompt): input_ids_list = [ tokenizer.apply_chat_template( [{"role": "system", "content": prompt}, {"role": "user", "content": text}], add_generation_prompt=True, return_tensors="pt", ).to(model.device)[0] for text in texts ] attention_mask_list = [ torch.ones(input_ids.shape, dtype=torch.long, device=model.device) for input_ids in input_ids_list ] # input_ids_batch = pad_left( # input_ids_list, padding_value=tokenizer.eos_token_id # ) input_ids_batch = torch.nn.utils.rnn.pad_sequence( input_ids_list, batch_first=True, padding_value=tokenizer.eos_token_id ) attention_mask_batch = torch.nn.utils.rnn.pad_sequence(attention_mask_list, batch_first=True, padding_value=0) # attention_mask_batch = pad_left( # attention_mask_list, padding_value=0 # ) terminators = [ tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>"), ] outputs = model.generate( input_ids_batch, max_new_tokens=512, attention_mask=attention_mask_batch, pad_token_id=tokenizer.eos_token_id, eos_token_id=terminators, # do_sample=True, # temperature=0.6, # top_p=0.9, ) responses = [] for i in range(len(texts)): response = outputs[i][input_ids_list[i].shape[-1] :] response = tokenizer.decode(response, skip_special_tokens=True) responses.append(response) return responses print("Processing starting...") if args.prompt == "" and args.key == "objects": prompt = ( "You are a AI assistant to extract objects from user's text. 
" "For example: user: 'In this video a dog is running around. In addition, a person is laughing at the dog.', you produce a list of objects separated by ',' and wrapped by '[' and ']': '[dog, person]' " ) elif args.prompt == "" and args.key == "actions": prompt = ( "You are a AI assistant to extract actions from user's text. " "For example: user: 'In this video a dog is running around. In addition, a person is laughing at the dog.', you produce a list of actions separated by ',' and wrapped by '[' and ']': '[run, laugh]' " ) else: prompt = args.prompt print("Prompt: {}".format(prompt)) args.batch_size # for i in tqdm(range(0, len(dataframe), batch_size)): for _, batch in enumerate(tqdm(dataloader)): # get the text column from the batch texts = [batch[i]["text"] for i in range(len(batch))] list_keywords = extract_batch(texts, prompt) for idx, keywords in enumerate(list_keywords): try: keywords_start = keywords.find("[") keywords_end = keywords.find("]") keywords = keywords[keywords_start + 1 : keywords_end] if ( "\n" in keywords or len(keywords.strip()) == 0 ): # we empirically observe that it produces newlines when no keywords are found keywords = "NONE_FOUND" except: keywords = "NONE_FOUND" row = batch[idx] writer.writerow([*row, keywords]) output_file_handle.close() dist.barrier() if dist.get_rank() == 0: collated_file = args.output_prefix + f"_{args.key}.csv" print("All ranks are finished. 
Collating the processed data to {}".format(collated_file)) import pandas as pd csv_files = [args.output_prefix + f"_rank{i}" + f"_{args.key}.csv" for i in range(dist.get_world_size())] # List to hold DataFrames dataframes = [] # Read each CSV into a DataFrame and append to list for file in csv_files: df = pd.read_csv(file) # scan each line in the df, if the ``key`` column is NaN, replace it with "NONE_FOUND" df[args.key] = df[args.key].fillna("NONE_FOUND") dataframes.append(df) # Concatenate all DataFrames combined_df = pd.concat(dataframes, ignore_index=True) # Save the combined DataFrame to a new CSV file combined_df.to_csv(collated_file, index=False) print("Collated data saved to {}".format(collated_file)) # terminate distributed env dist.destroy_process_group() if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--model-id", default="meta-llama/Meta-Llama-3-8B-Instruct") parser.add_argument("input", type=str, help="Path to the input CSV file") parser.add_argument("--output_prefix", type=str, help="Path to the output CSV file") parser.add_argument("--prompt", type=str, default="") parser.add_argument("--batch_size", type=int, default=32) parser.add_argument("--key", type=str) args = parser.parse_args() main(args) ================================================ FILE: Open-Sora/tools/caption/caption_llava.py ================================================ import argparse import csv import time import warnings from datetime import timedelta import torch import torch.distributed as dist from colossalai.cluster import DistCoordinator, ProcessGroupMesh from colossalai.shardformer import ShardConfig, ShardFormer from colossalai.utils import get_current_device, set_seed from llava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX from llava.conversation import conv_templates from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token from llava.model.builder import load_pretrained_model from 
class NoPaddingDistributedSampler(DistributedSampler):
    """Distributed sampler that neither pads nor drops samples.

    Rank ``r`` receives the indices ``r, r + num_replicas, ...``; when the
    dataset size is not divisible by ``num_replicas`` the first
    ``len(dataset) % num_replicas`` ranks get one extra sample, so every
    sample is visited exactly once.

    NOTE: ``shuffle`` and ``drop_last`` are accepted for signature
    compatibility only; the parent is always initialised with
    ``shuffle=False`` and ``drop_last=False``.
    """

    def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True, seed=0, drop_last=False):
        super().__init__(
            dataset=dataset, num_replicas=num_replicas, rank=rank, seed=seed, shuffle=False, drop_last=False
        )
        dataset_len = len(self.dataset)
        remainder = dataset_len % self.num_replicas
        # the leftover items are allocated to the first `remainder` ranks
        self.num_samples = dataset_len // self.num_replicas + (1 if self.rank < remainder else 0)
        self.total_size = dataset_len

    def __iter__(self):
        if self.shuffle:
            # deterministically shuffle based on epoch and seed
            g = torch.Generator()
            g.manual_seed(self.seed + self.epoch)
            indices = torch.randperm(len(self.dataset), generator=g).tolist()  # type: ignore[arg-type]
        else:
            indices = list(range(len(self.dataset)))  # type: ignore[arg-type]

        indices = indices[: self.total_size]

        # subsample: every num_replicas-th index, starting at this rank
        indices = indices[self.rank : self.total_size : self.num_replicas]
        assert len(indices) == self.num_samples

        return iter(indices)


@torch.inference_mode()
def main(args):
    """Caption every sample listed in ``args.input`` with a LLaVA model.

    The world is split into ``dp_size`` data-parallel groups of ``tp_size``
    tensor-parallel ranks. The rank with TP index 0 of every DP group writes
    its captions to ``<input>_caption_part<dp_rank>.csv``.

    Args:
        args: Parsed command-line namespace (see the ``__main__`` section of
            the script for the available options).
    """
    # ======================================================
    # 1. init environment
    # ======================================================
    # we set a very large timeout to avoid some processes exit early
    dist.init_process_group(backend="nccl", timeout=timedelta(hours=24))
    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
    set_seed(1024)
    coordinator = DistCoordinator()

    # prepare the dp and tp groups
    assert (
        args.dp_size * args.tp_size == coordinator.world_size
    ), f"DP size {args.dp_size} * TP size {args.tp_size} must equal to world size {coordinator.world_size}"
    mesh = ProcessGroupMesh(args.dp_size, args.tp_size)
    dp_group = mesh.get_group_along_axis(0)
    tp_group = mesh.get_group_along_axis(1)

    # ======================================================
    # 2. load model
    # ======================================================
    model_path = args.model_path
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")  # Pytorch non-meta copying warning fills out the console
        tokenizer, model, image_processor, context_len = load_pretrained_model(
            model_path=model_path,
            model_base=None,
            model_name=get_model_name_from_path(model_path),
            device=get_current_device(),
            torch_dtype=torch.float16,
            attn_implementation="flash_attention_2" if args.flash_attention else "eager",
        )
    dist.barrier()

    # ======================================================
    # 3. Apply system optimization
    # ======================================================
    tp_size = dist.get_world_size(tp_group)
    shard_config = ShardConfig(
        tensor_parallel_process_group=tp_group if tp_size > 1 else None,
        enable_tensor_parallelism=tp_size > 1,
    )
    shard_former = ShardFormer(shard_config=shard_config)

    # check the model type
    model_name = model.__class__.__name__
    print(model_name)
    if model_name == "LlavaLlamaForCausalLM":
        model = shard_former.optimize(model, policy=LlavaLlamaForCausalLMPolicy())[0].cuda()
    elif model_name == "LlavaMistralForCausalLM":
        model = shard_former.optimize(model, policy=LlavaMistralForCausalLMPolicy())[0].cuda()
    else:
        print(f"The shardformer policy for {model_name} is not implemented, skip")
    torch.cuda.empty_cache()

    # ======================================================
    # 4. Prepare dataloader
    # ======================================================
    # prepare prompt
    query = PROMPTS[args.prompt]["text"]
    if dist.get_rank() == 0:
        print(f"Prompt: {query}")

    def build_input_ids(query_text):
        """Tokenize a chat prompt holding ``args.num_frames`` image placeholders."""
        conv = conv_templates["chatml_direct"].copy()
        conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\n" + query_text)
        prompt = conv.get_prompt()
        # Replace the single image placeholder inserted above by one placeholder
        # per frame. Splitting on the token itself (only once) keeps the rest of
        # the prompt intact; an empty separator would raise ValueError.
        head, tail = prompt.split(DEFAULT_IMAGE_TOKEN, 1)
        prompt = head + DEFAULT_IMAGE_TOKEN * args.num_frames + tail
        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
        return input_ids.unsqueeze(0)

    if "text" in args.prompt:
        # the prompt template has a '{}' slot that is filled per sample

        def get_text_input_ids(text):
            return build_input_ids(query.format(text))

    else:
        # the prompt is identical for every sample, so tokenize it only once
        input_ids = build_input_ids(query)

        def get_text_input_ids(*_unused):
            return input_ids

    # build dataset
    def transform(imgs):
        imgs = process_images(imgs, image_processor, model.config)
        imgs = imgs.to(dtype=torch.float16)
        return imgs

    dataset = VideoTextDataset(
        args.input,
        transform=transform,
        num_frames=args.num_frames,
        get_text_input_ids=get_text_input_ids,
        resize=args.resize,
    )

    # make sure that the prompt type matches the data type
    data_extension = "." + dataset.data["path"].iloc[0].split(".")[-1]
    prompt_type = PROMPTS[args.prompt]["type"]
    if prompt_type == "image":
        assert (
            data_extension.lower() in IMG_EXTENSIONS
        ), f"The prompt is suitable for an image dataset but the data is not image. The first data is of format {data_extension}"
    elif prompt_type == "video":
        assert (
            data_extension.lower() in VID_EXTENSIONS
        ), f"The prompt is suitable for a video dataset but the data is not video. The first data is of format {data_extension}"
    else:
        raise ValueError(f"Found invalid prompt type {prompt_type}")

    total_num_videos = len(dataset)

    # build sampler
    dp_rank = dist.get_rank(dp_group)
    dp_size = dist.get_world_size(dp_group)
    sampler = NoPaddingDistributedSampler(dataset, rank=dp_rank, num_replicas=dp_size)

    # build dataloader
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.bs,
        shuffle=False,
        num_workers=args.num_workers,
        pin_memory=True,
        prefetch_factor=args.prefetch_factor,
        sampler=sampler,
        collate_fn=collate_fn,
    )

    # prepare output file
    output_file = args.input.replace(".csv", "_caption.csv")

    # create csv writer
    has_dp_writter = dist.get_rank(tp_group) == 0
    dp_file = None
    dp_writer = None
    if has_dp_writter:
        # the dp writer takes care of the files processed on the current dp rank
        # so we use write mode
        output_file_split = output_file.replace(".csv", f"_part{dp_rank}.csv")
        dp_file = open(output_file_split, "w", newline="")
        dp_writer = csv.writer(dp_file)
        dp_writer.writerow(["path", "text", "num_frames"])

    # ======================================================
    # 5. generate captions
    # ======================================================
    if dist.get_rank(tp_group) == 0:
        pbar = tqdm(dataloader, position=dp_rank, desc=f"Data Parallel Rank {dist.get_rank(dp_group)}")
    else:
        pbar = dataloader

    if args.profile:
        encode_time = []
        generate_time = []
        output_length = []
        total_time = []

    try:
        for i, batch in enumerate(pbar):
            # measure time
            if args.profile:
                torch.cuda.synchronize()
                start_time = time.time()

            video_files, frames, video_lengths, img_size_list, texts = batch

            # encode the batch of inputs
            with Timer() as encode_timer:
                samples = []
                for imgs, imgs_size, input_ids in zip(frames, img_size_list, texts):
                    imgs = imgs.cuda()
                    input_ids = input_ids.cuda()
                    _, _, _, _, inputs_embeds, _ = model.prepare_inputs_labels_for_multimodal(
                        input_ids, None, None, None, None, images=imgs, image_sizes=imgs_size
                    )
                    samples.append(inputs_embeds)

                # left-pad every sample to the longest one in the batch
                max_len = max(sample.shape[1] for sample in samples)
                attention_mask = torch.tensor(
                    [[0] * (max_len - sample.shape[1]) + [1] * sample.shape[1] for sample in samples]
                ).to(model.device)
                inputs_embeds = torch.cat(
                    [
                        torch.cat(
                            [
                                torch.zeros(
                                    (1, max_len - sample.shape[1], sample.shape[-1]),
                                    device=model.device,
                                    dtype=torch.float16,
                                ),
                                sample,
                            ],
                            dim=1,
                        )
                        for sample in samples
                    ],
                    dim=0,
                )

            # generate outputs
            with Timer() as generate_timer:
                # call generate() of the parent class of the model's class, since
                # the embeddings were already prepared above
                output_ids = super(type(model), model).generate(
                    inputs_embeds=inputs_embeds,
                    attention_mask=attention_mask,
                    do_sample=False,  # sampling is not deterministic and may cause TP to hang
                    max_new_tokens=args.max_tokens,
                    use_cache=True,
                )

            # skip warmup and add profiling data
            if args.profile and i >= args.profile_warmup:
                output_length.append(output_ids.size(0) * output_ids.size(1))

            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
            outputs = [output.replace("\n", " ").strip() for output in outputs]

            # skip warmup and add profiling data
            if args.profile and i >= args.profile_warmup:
                # measure time
                torch.cuda.synchronize()
                time_taken = time.time() - start_time

                total_time.append(time_taken)
                encode_time.append(encode_timer.time_taken)
                generate_time.append(generate_timer.time_taken)

            # save results
            if has_dp_writter:
                for record_row in zip(video_files, outputs, video_lengths):
                    dp_writer.writerow(record_row)
    finally:
        # close the part file even if captioning fails half-way, so the rows
        # written so far are flushed to disk
        if dp_file is not None:
            dp_file.close()

    # display profiling info
    if args.profile:
        print(output_length)
        num_samples_after_warmup = total_num_videos - args.bs * args.profile_warmup * dp_size
        if total_time and num_samples_after_warmup > 0:
            print(f"throughput (samples/s): {num_samples_after_warmup / sum(total_time)}")
            print(f"average encode time per sample: {sum(encode_time) / num_samples_after_warmup}")
            print(f"average generate time per sample: {sum(generate_time) / num_samples_after_warmup}")
            print(f"average number of tokens characters per sample: {sum(output_length) / num_samples_after_warmup}")
        else:
            # every batch fell into the warmup window; there is nothing to average
            print("no batches were measured after warmup, skip profiling statistics")
        print(f"Max GPU allocated / GB: {torch.cuda.max_memory_allocated() / 1024**3}")
        print(f"Max GPU reserved / GB: {torch.cuda.max_memory_reserved() / 1024**3}")

    # ======================================================
    # 6. shutdown
    # ======================================================
    dist.barrier()

    # terminate distributed env
    dist.destroy_process_group()
You can turn on this flag for llama model and off for mistral model.", ) # debug related parser.add_argument("--profile", action="store_true") parser.add_argument("--profile-warmup", type=int, default=1) args = parser.parse_args() main(args) ================================================ FILE: Open-Sora/tools/caption/pllava_dir/caption_pllava.py ================================================ import sys import os import os from pathlib import Path current_file = Path(__file__) # Gets the path of the current file fourth_level_parent = current_file.parents[3] datasets_dir = os.path.join(fourth_level_parent, "opensora/datasets") import sys sys.path.append(datasets_dir) from read_video import read_video_av sys.path.remove(datasets_dir) import itertools import logging import multiprocessing as mp from argparse import ArgumentParser from multiprocessing import Process, Queue import numpy as np import pandas as pd import torch import torchvision import transformers from decord import VideoReader, cpu from PIL import Image from tasks.eval.eval_utils import Conversation from tasks.eval.model_utils import load_pllava from torch.utils.data import Dataset from tqdm import tqdm from transformers.feature_extraction_utils import BatchFeature conv_template = Conversation( system="Describe this video. Pay attention to all objects in the video. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. 
def pllava_answer(
    conv: Conversation,
    model,
    processor,
    video_list,
    do_sample=True,
    max_new_tokens=200,
    num_beams=1,
    min_length=1,
    top_p=0.9,
    repetition_penalty=1.0,
    length_penalty=1,
    temperature=1.0,
    stop_criteria_keywords=None,
    print_res=False,
):
    """Generate one answer per video for the prompt held by ``conv``.

    The same prompt is paired with every entry of ``video_list``; the
    per-video processor outputs are concatenated into one batch and passed to
    ``model.generate``. Each decoded text is reduced to the part after the
    last role tag, stripped of the conversation's end separator and of
    newlines.

    Returns:
        ``(output_texts, conv)`` where ``output_texts`` is the list of cleaned
        answers and ``conv`` is the same conversation object, whose last
        message slot holds the last answer.
    """
    prompt = conv.get_prompt()
    per_video_inputs = [processor(text=prompt, images=video, return_tensors="pt") for video in video_list]

    # add batch dimension by cat
    batched_inputs = {
        input_type: torch.cat([single[input_type] for single in per_video_inputs])
        for input_type in list(per_video_inputs[0].keys())
    }
    batched_inputs = BatchFeature(batched_inputs, tensor_type="pt").to(model.device)

    with torch.no_grad():
        generated = model.generate(
            **batched_inputs,
            media_type="video",
            do_sample=do_sample,
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
            min_length=min_length,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            length_penalty=length_penalty,
            temperature=temperature,
        )
    output_texts = processor.batch_decode(generated, skip_special_tokens=True, clean_up_tokenization_spaces=False)

    for position, raw_text in enumerate(output_texts):
        if print_res:  # debug usage
            print("### PROMPTING LM WITH: ", prompt)
            print("### LM OUTPUT TEXT: ", raw_text)

        last_role = conv.roles[-1]
        # this role tag is split on with an extra space after the marker
        split_tag = "<|im_start|> assistant\n" if last_role == "<|im_start|>assistant\n" else last_role
        answer = raw_text.split(split_tag)[-1]

        ending = conv.sep if isinstance(conv.sep, str) else conv.sep[1]
        answer = answer.removesuffix(ending).strip().replace("\n", " ")

        output_texts[position] = answer
        conv.messages[-1][1] = answer

    return output_texts, conv


def get_index(num_frames, num_segments):
    """Return ``num_segments`` evenly spread frame indices for a video.

    The video is cut into ``num_segments`` spans of ``(num_frames - 1) /
    num_segments`` frames; the k-th index is half a span (truncated) plus the
    rounded start of span k.
    """
    segment_span = float(num_frames - 1) / num_segments
    first_offset = int(segment_span / 2)
    return np.array([first_offset + int(np.round(segment_span * k)) for k in range(num_segments)])
def load_video(video_path, num_frames, return_msg=False, resolution=336):
    """Decode a video and return ``num_frames`` evenly spread, resized frames.

    Args:
        video_path: Path of the video file to decode.
        num_frames: Number of frames to sample from the video.
        return_msg: Not supported; kept for signature compatibility.
        resolution: ``size`` argument forwarded to ``torchvision.transforms.Resize``.

    Returns:
        List of ``num_frames`` resized PIL images.

    Raises:
        NotImplementedError: If ``return_msg`` is true.
        ValueError: If the video contains no frame.
    """
    if return_msg:
        # Fail before the (expensive) decoding, and with a regular exception:
        # exit() would raise SystemExit and tear down the calling process.
        raise NotImplementedError("return_msg not implemented yet")

    transforms = torchvision.transforms.Resize(size=resolution)
    vframes, aframes, info = read_video_av(video_path, pts_unit="sec", output_format="THWC")

    total_num_frames = len(vframes)
    if total_num_frames == 0:
        # get_index() would produce negative indices for an empty video
        raise ValueError(f"no frames could be decoded from {video_path}")

    frame_indices = get_index(total_num_frames, num_frames)
    return [transforms(Image.fromarray(vframes[frame_index].numpy())) for frame_index in frame_indices]


def collate_fn(batch):
    """Return the batch unchanged (a plain list of per-video frame lists)."""
    return batch


class CSVDataset(Dataset):
    """Dataset yielding the sampled frames of every video listed in a CSV file.

    The CSV must have a ``path`` column. ``__getitem__`` returns ``None`` for
    a video that cannot be loaded, so one broken file does not abort the run.
    """

    def __init__(self, csv_path, num_frames):
        self.df = pd.read_csv(csv_path)
        self.data_list = self.df["path"].tolist()
        self.num_frames = num_frames

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        if idx < 0 or idx >= len(self.data_list):
            raise IndexError
        try:
            return load_video(self.data_list[idx], self.num_frames, resolution=RESOLUTION)
        except Exception as exc:
            # Catch Exception rather than everything, so KeyboardInterrupt and
            # SystemExit still propagate.
            logging.getLogger(__name__).warning("failed to load video %s: %s", self.data_list[idx], exc)
            return None

    def set_rank_and_world_size(self, rank, world_size):
        """Restrict the dataset to the contiguous shard owned by ``rank``.

        Every rank gets ``len(self) // world_size`` items; the last rank also
        takes the remainder.
        """
        self.rank = rank
        self.world_size = world_size
        self.data_per_gpu = len(self) // world_size
        start_index = rank * self.data_per_gpu
        end_index = (rank + 1) * self.data_per_gpu if rank != world_size - 1 else len(self)
        self.data_list = self.data_list[start_index:end_index]
def load_model_and_dataset(
    rank,
    world_size,
    pretrained_model_name_or_path,
    num_frames,
    use_lora,
    lora_alpha,
    weight_dir,
    csv_path,
    pooling_shape=(16, 12, 12),
):
    """Load the PLLaVA model onto device ``rank`` and build this rank's dataset shard.

    Returns:
        ``(model, processor, dataset)`` with the model in eval mode and the
        dataset restricted to the rows owned by ``rank``.
    """
    # remind that, once the model goes larger (30B+) may cause the memory to be heavily used up. Even Tearing Nodes.
    model, processor = load_pllava(
        pretrained_model_name_or_path,
        num_frames=num_frames,
        use_lora=use_lora,
        weight_dir=weight_dir,
        lora_alpha=lora_alpha,
        pooling_shape=pooling_shape,
    )
    logger.info("done loading llava")

    # position embedding
    model = model.to(torch.device(rank))
    model = model.eval()

    dataset = CSVDataset(csv_path, num_frames)
    dataset.set_rank_and_world_size(rank, world_size)
    return model, processor, dataset


def infer(
    model,
    processor,
    video_list,
    conv_mode,
    print_res=False,
):
    """Caption a batch of videos and return one description per video.

    ``conv_mode`` is accepted but not used; the module-level ``conv_template``
    is always the conversation template.

    Raises:
        Exception: If any entry of ``video_list`` is ``None`` (failed to load).
    """
    # a single unloadable video invalidates the whole batch
    if any(video is None for video in video_list):
        raise Exception("Video not loaded properly")

    conv = conv_template.copy()
    conv.user_query("Describe the video in details.", is_mm=True)

    llm_responses, conv = pllava_answer(
        conv=conv,
        model=model,
        processor=processor,
        video_list=video_list,
        max_new_tokens=256,
        do_sample=False,
        print_res=print_res,
    )
    return llm_responses


def run(rank, args, world_size, output_queue=None):
    """Caption the dataset shard of ``rank`` on GPU ``rank``.

    Args:
        rank: Index of this worker; also used as the CUDA device index.
        args: Parsed command-line namespace from ``parse_args``.
        world_size: Total number of workers the dataset is split across.
        output_queue: Optional queue; when given, ``(rank, result_list)`` is
            put on it so the parent process can collect the results.

    Returns:
        List with one caption (or ``args.error_message``) per video of the shard.
    """
    if rank == 0:
        import os

        if os.getenv("DEBUG_ADDRESS") is not None:
            import ptvsd

            ptvsd.enable_attach(address=("localhost", int(os.getenv("DEBUG_ADDRESS"))), redirect_output=True)
            # announce before blocking, otherwise the message only shows up
            # once the debugger is already attached
            print("waiting for debugger attachment")
            ptvsd.wait_for_attach()

    if rank != 0:
        transformers.utils.logging.set_verbosity_error()
        logger.setLevel(transformers.logging.ERROR)

    print_res = False
    conv_mode = args.conv_mode

    # Only override the pooling shape when it was given on the command line;
    # otherwise load_model_and_dataset falls back to its own default.
    pooling_kwargs = {}
    if args.pooling_shape is not None:
        pooling_kwargs["pooling_shape"] = tuple(int(x) for x in args.pooling_shape.split("-"))

    logger.info(f"loading model and constructing dataset to gpu {rank}...")
    model, processor, dataset = load_model_and_dataset(
        rank,
        world_size,
        pretrained_model_name_or_path=args.pretrained_model_name_or_path,
        num_frames=args.num_frames,
        use_lora=args.use_lora,
        lora_alpha=args.lora_alpha,
        weight_dir=args.weight_dir,
        csv_path=args.csv_path,
        **pooling_kwargs,
    )
    logger.info("done model and dataset...")
    logger.info("constructing dataset...")
    logger.info("single test...")

    dataloader = torch.utils.data.DataLoader(
        dataset,
        num_workers=2,
        batch_size=args.batch_size,
        collate_fn=collate_fn,
        shuffle=False,
    )

    result_list = []
    print(len(dataset))
    for batch in tqdm(dataloader):
        try:
            preds = infer(
                model,
                processor,
                batch,
                conv_mode=conv_mode,
                print_res=print_res,
            )
        except Exception as e:
            logger.error(f"error in {batch}: {str(e)}")
            # mark every video of the failed batch so the rows stay aligned with the csv
            preds = [args.error_message] * len(batch)
        result_list.extend(preds)

    if output_queue is not None:
        output_queue.put((rank, result_list))
    return result_list


def main():
    """Caption all videos of ``--csv_path`` on every visible GPU and save ``*_text.csv``.

    One worker process per GPU captions a contiguous shard of the CSV; the
    shards are concatenated in rank order, stored in a new ``text`` column, and
    rows whose captioning failed are dropped.
    """
    import queue

    multiprocess = True
    mp.set_start_method("spawn")
    args = parse_args()

    if multiprocess:
        world_size = torch.cuda.device_count()
        print(f"world_size: {world_size}")
        if world_size == 0:
            raise RuntimeError("no CUDA device is available, cannot caption any video")

        # Create a queue to collect results from each process
        output_queue = Queue()
        processes = []
        for i in range(world_size):
            # Each process also takes the output queue as an argument
            p = Process(target=run, args=(i, args, world_size, output_queue))
            p.daemon = False
            processes.append(p)
            p.start()

        poll_seconds = 30
        results_by_rank = {}
        while len(results_by_rank) < world_size:
            try:
                rank, results = output_queue.get(timeout=poll_seconds)  # Retrieve results as they finish
            except queue.Empty:
                # A worker that died with a non-zero exit code never delivers its
                # result; fail loudly instead of waiting on the queue forever.
                crashed = [
                    i
                    for i, p in enumerate(processes)
                    if i not in results_by_rank and p.exitcode not in (0, None)
                ]
                if crashed:
                    for p in processes:
                        if p.is_alive():
                            p.terminate()
                    raise RuntimeError(f"worker process(es) of rank(s) {crashed} exited without returning results")
                continue
            results_by_rank[rank] = results
            print(f"Results received from rank {rank}")

        logger.info("finished running")
        for p in processes:
            p.join()

        # order the results by rank so they line up with the rows of the csv
        results_list = list(itertools.chain.from_iterable(results_by_rank[i] for i in range(world_size)))
    else:
        results_list = run(0, world_size=1, args=args)

    # debug
    print(results_list)

    df = pd.read_csv(args.csv_path)
    # add a new column to the dataframe
    df["text"] = results_list

    drop_failed = True
    if drop_failed:
        # delete the entire row if captioning failed
        df = df[df["text"] != args.error_message]

    # write the dataframe to a new csv file called '*_text.csv'
    new_csv_path = args.csv_path.replace(".csv", "_text.csv")
    df.to_csv(new_csv_path, index=False)
    print(f"Results saved to {new_csv_path}")
yet informative description. Pay attention to all objects in the image. The description should be useful for AI to re-generate the image. The description should be no more than five sentences. Remember do not exceed 5 sentences.", "type": "image", }, "image-text": { "text": "Describe this image and its style in a very detailed manner. Pay attention to all objects in the image. The description should be useful for AI to re-generate the image. The description should be no more than six sentences. Some information about the image is '{}'.", "type": "image", }, "image-3ex": { "text": "An image is given. Describe this image and its style to generate a succinct yet informative description. Pay attention to all objects in the image. The description should be useful for AI to re-generate the video. The description should be no more than five sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick and walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field. 3. Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. 
A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.", "type": "image", }, "video": { "text": "Describe this video and its style in a very detailed manner. Pay attention to all objects in the video. The description should be useful for AI to re-generate the video. The description should be no more than six sentences.", "type": "video", }, "video-text": { "text": "Describe this video and its style in a very detailed manner. Some information about the image is '{}'. Pay attention to all objects in the video. The description should be useful for AI to re-generate the video. The description should be no more than six sentences.", "type": "video", }, "video-f1-detail-3ex": { "text": "A video is given by providing the middle frame. Describe this video and its style to generate a description. Pay attention to all objects in the video. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. 
Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field. 3. Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.", "type": "video", }, "video-f1-detail-2ex-text": { "text": "A video is given by providing the middle frame. Some information about the image is '{}'. Describe this video and its style to generate a description. Pay attention to all objects in the video. Do not describe each frame individually. Do not reply with words like 'first frame'. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. 
Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field.", "type": "video", }, "video-f3-detail-3ex": { "text": "A video is given by providing three frames in chronological order. Describe this video and its style to generate a description. Pay attention to all objects in the video. Do not describe each frame individually. Do not reply with words like 'first frame'. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field. 3. Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. 
The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.", "type": "video", }, "video-f3-detail-2ex-text": { "text": "A video is given by providing three frames in chronological order. Some information about the image is '{}'. Describe this video and its style to generate a description. Pay attention to all objects in the video. Do not describe each frame individually. Do not reply with words like 'first frame'. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. 
def read_file(input_path):
    """Load a dataset table from a ``.csv`` or ``.parquet`` file.

    Args:
        input_path: Path of the file; the reader is chosen from its suffix.

    Returns:
        The ``pandas.DataFrame`` read from ``input_path``.

    Raises:
        NotImplementedError: If the path ends with neither ``.csv`` nor
            ``.parquet``.
    """
    # Suffix -> pandas reader, tried in order.
    readers = (
        (".csv", pd.read_csv),
        (".parquet", pd.read_parquet),
    )
    for suffix, reader in readers:
        if input_path.endswith(suffix):
            return reader(input_path)
    raise NotImplementedError(f"Unsupported file format: {input_path}")
def collate_fn(batch):
    """Regroup a list of per-sample dicts into per-field lists.

    Args:
        batch: Sequence of dicts, each providing the keys ``"path"``,
            ``"image"``, ``"length"``, ``"img_size"`` and ``"text"``.

    Returns:
        A 5-tuple ``(paths, images, lengths, img_sizes, texts)`` of lists,
        each holding that field for every sample in batch order. Values are
        passed through untouched (nothing is stacked or converted).
    """
    fields = ("path", "image", "length", "img_size", "text")
    paths, images, lengths, img_sizes, texts = (
        [sample[key] for sample in batch] for key in fields
    )
    return paths, images, lengths, img_sizes, texts
- `text`: the caption or description of the image or video. Required for training. - `num_frames`: the number of frames in the video. Required for training. - `width`: the width of the video frame. Required for dynamic bucket. - `height`: the height of the video frame. Required for dynamic bucket. - `aspect_ratio`: the aspect ratio of the video frame (height / width). Required for dynamic bucket. - `resolution`: height x width. For analysis. - `text_len`: the number of tokens in the text. For analysis. - `aes`: aesthetic score calculated by [aesthetic scorer](/tools/aesthetic/README.md). For filtering. - `flow`: optical flow score calculated by [UniMatch](/tools/scoring/README.md). For filtering. - `match`: matching score of an image-text/video-text pair calculated by [CLIP](/tools/scoring/README.md). For filtering. - `fps`: the frame rate of the video. Optional. - `cmotion`: the camera motion. An example ready for training: ```csv path, text, num_frames, width, height, aspect_ratio /absolute/path/to/image1.jpg, caption, 1, 1280, 720, 0.5625 /absolute/path/to/video1.mp4, caption, 120, 1280, 720, 0.5625 /absolute/path/to/video2.mp4, caption, 20, 256, 256, 1 ``` We use pandas to manage the `.csv` or `.parquet` files. The following code is for reading and writing files: ```python df = pd.read_csv(input_path) df.to_csv(output_path, index=False) # or use parquet, which is smaller df = pd.read_parquet(input_path) df.to_parquet(output_path, index=False) ``` ## Dataset to CSV As a starting point, `convert.py` is used to convert the dataset to a CSV file.
You can use the following commands to convert the dataset to a CSV file: ```bash python -m tools.datasets.convert DATASET-TYPE DATA_FOLDER # general video folder python -m tools.datasets.convert video VIDEO_FOLDER --output video.csv # general image folder python -m tools.datasets.convert image IMAGE_FOLDER --output image.csv # imagenet python -m tools.datasets.convert imagenet IMAGENET_FOLDER --split train # ucf101 python -m tools.datasets.convert ucf101 UCF101_FOLDER --split videos # vidprom python -m tools.datasets.convert vidprom VIDPROM_FOLDER --info VidProM_semantic_unique.csv ``` ## Manage datasets Use `datautil` to manage the dataset. ### Requirement Follow our [installation guide](../../docs/installation.md)'s "Data Dependencies" and "Datasets" sections to install the required packages. ### Basic Usage You can use the following commands to process the `csv` or `parquet` files. The output file will be saved in the same directory as the input, with a suffix indicating the processing applied. ```bash # datautil takes multiple CSV files as input and merges them into one CSV file # output: DATA1+DATA2.csv python -m tools.datasets.datautil DATA1.csv DATA2.csv # shard CSV files into multiple CSV files # output: DATA1_0.csv, DATA1_1.csv, ... python -m tools.datasets.datautil DATA1.csv --shard 10 # filter frames between 128 and 256, with captions # output: DATA_fmin_128_fmax_256.csv python -m tools.datasets.datautil DATA.csv --fmin 128 --fmax 256 # Disable parallel processing python -m tools.datasets.datautil DATA.csv --fmin 128 --fmax 256 --disable-parallel # Compute num_frames, height, width, fps, aspect_ratio for videos or images # output: IMG_DATA+VID_DATA_vinfo.csv python -m tools.datasets.datautil IMG_DATA.csv VID_DATA.csv --video-info # You can run multiple operations at the same time.
python -m tools.datasets.datautil DATA.csv --video-info --remove-empty-caption --remove-url --lang en ``` ### Score filtering To examine and filter the quality of the dataset by aesthetic score and clip score, you can use the following commands: ```bash # sort the dataset by aesthetic score # output: DATA_sort.csv python -m tools.datasets.datautil DATA.csv --sort aesthetic_score # View examples of high aesthetic score head -n 10 DATA_sort.csv # View examples of low aesthetic score tail -n 10 DATA_sort.csv # sort the dataset by clip score # output: DATA_sort.csv python -m tools.datasets.datautil DATA.csv --sort clip_score # filter the dataset by aesthetic score # output: DATA_aesmin_0.5.csv python -m tools.datasets.datautil DATA.csv --aesmin 0.5 # filter the dataset by clip score # output: DATA_matchmin_0.5.csv python -m tools.datasets.datautil DATA.csv --matchmin 0.5 ``` ### Documentation You can also use `python -m tools.datasets.datautil --help` to see usage. | Args | File suffix | Description | | --------------------------- | -------------- | ------------------------------------------------------------- | | `--output OUTPUT` | | Output path | | `--format FORMAT` | | Output format (csv, parquet, parquet.gzip) | | `--disable-parallel` | | Disable `pandarallel` | | `--seed SEED` | | Random seed | | `--shard SHARD` | `_0`,`_1`, ... 
| Shard the dataset | | `--sort KEY` | `_sort` | Sort the dataset by KEY | | `--sort-descending KEY` | `_sort` | Sort the dataset by KEY in descending order | | `--difference DATA.csv` | | Remove the paths in DATA.csv from the dataset | | `--intersection DATA.csv` | | Keep the paths in DATA.csv from the dataset and merge columns | | `--info` | `_info` | Get the basic information of each video and image (cv2) | | `--ext` | `_ext` | Remove rows if the file does not exist | | `--relpath` | `_relpath` | Modify the path to relative path by root given | | `--abspath` | `_abspath` | Modify the path to absolute path by root given | | `--remove-empty-caption` | `_noempty` | Remove rows with empty caption | | `--remove-url` | `_nourl` | Remove rows with url in caption | | `--lang LANG` | `_lang` | Remove rows with other language | | `--remove-path-duplication` | `_noduppath` | Remove rows with duplicated path | | `--remove-text-duplication` | `_noduptext` | Remove rows with duplicated caption | | `--refine-llm-caption` | `_llm` | Modify the caption generated by LLM | | `--clean-caption MODEL` | `_clean` | Modify the caption according to T5 pipeline to suit training | | `--unescape` | `_unescape` | Unescape the caption | | `--merge-cmotion` | `_cmotion` | Merge the camera motion to the caption | | `--count-num-token` | `_ntoken` | Count the number of tokens in the caption | | `--load-caption EXT` | `_load` | Load the caption from the file | | `--fmin FMIN` | `_fmin` | Filter the dataset by minimum number of frames | | `--fmax FMAX` | `_fmax` | Filter the dataset by maximum number of frames | | `--hwmax HWMAX` | `_hwmax` | Filter the dataset by maximum height x width | | `--aesmin AESMIN` | `_aesmin` | Filter the dataset by minimum aesthetic score | | `--matchmin MATCHMIN` | `_matchmin` | Filter the dataset by minimum clip score | | `--flowmin FLOWMIN` | `_flowmin` | Filter the dataset by minimum optical flow score | ## Transform datasets The `tools.datasets.transform` module 
provides a set of tools to transform the dataset. The general usage is as follows: ```bash python -m tools.datasets.transform TRANSFORM_TYPE META.csv ORIGINAL_DATA_FOLDER DATA_FOLDER_TO_SAVE_RESULTS --additional-args ``` ### Resize Sometimes you may need to resize the images or videos to a specific resolution. You can use the following commands to resize the dataset: ```bash python -m tools.datasets.transform meta.csv /path/to/raw/data /path/to/new/data --length 2160 ``` ### Frame extraction To extract frames from videos, you can use the following commands: ```bash python -m tools.datasets.transform vid_frame_extract meta.csv /path/to/raw/data /path/to/new/data --points 0.1 0.5 0.9 ``` ### Crop Midjourney 4 grid Randomly select one of the 4 images in the 4-image grid generated by Midjourney. ```bash python -m tools.datasets.transform img_rand_crop meta.csv /path/to/raw/data /path/to/new/data ``` ## Analyze datasets You can easily get basic information about a `.csv` dataset by using the following commands: ```bash # examine the first 10 rows of the CSV file head -n 10 DATA1.csv # count the number of rows in the CSV file (approximately) wc -l DATA1.csv ``` For the dataset provided in a `.csv` or `.parquet` file, you can easily analyze the dataset using the following commands. Plots will be automatically saved (to `samples/infos/` by default; use `--save-img` to change the directory). ```bash python -m tools.datasets.analyze DATA_info.csv ``` ## Data Process Pipeline ```bash # Suppose the videos and images are under ~/dataset/ # 1. Convert dataset to CSV python -m tools.datasets.convert video ~/dataset --output meta.csv # 2. Get video information python -m tools.datasets.datautil meta.csv --info --fmin 1 # 3. Get caption # 3.1.
def read_file(input_path):
    """Read the dataset table stored at ``input_path``.

    Args:
        input_path: Path ending in ``.csv`` or ``.parquet``.

    Returns:
        The table loaded by ``pd.read_csv`` or ``pd.read_parquet``.

    Raises:
        NotImplementedError: For any other file suffix.
    """
    is_csv = input_path.endswith(".csv")
    is_parquet = input_path.endswith(".parquet")
    # Reject unknown formats up front so the happy path stays flat.
    if not (is_csv or is_parquet):
        raise NotImplementedError(f"Unsupported file format: {input_path}")
    if is_csv:
        return pd.read_csv(input_path)
    return pd.read_parquet(input_path)
def plot_data(data, column, bins, name):
    """Save a histogram of one column of ``data`` as an image file.

    Args:
        data: Table exposing a pandas-style ``hist(column=..., bins=...)``
            method.
        column: Name of the column to plot.
        bins: Bin specification forwarded to ``data.hist``.
        name: Output image path. A missing parent directory is created.
    """
    plt.clf()
    data.hist(column=column, bins=bins)
    # A bare file name has an empty dirname and os.makedirs("") raises
    # FileNotFoundError, so only create the parent when there is one.
    parent = os.path.dirname(name)
    if parent:
        os.makedirs(parent, exist_ok=True)
    plt.savefig(name)
    print(f"Saved {name}")
def split_by_capital(name):
    """Insert a space before every uppercase character except the first.

    Example: ``"BoxingPunchingBag"`` -> ``"Boxing Punching Bag"``.

    Args:
        name: The string to split.

    Returns:
        ``name`` with a single space inserted in front of each uppercase
        character that is not at index 0.
    """
    return "".join(
        " " + char if position and char.isupper() else char
        for position, char in enumerate(name)
    )
def process_general_images(root, output):
    """Index every image under ``root`` and write the listing to a CSV file.

    The CSV has the columns ``id`` (file name without extension) and
    ``path``.

    Args:
        root: Directory to scan recursively; ``~`` is expanded. If it does
            not exist the function returns without writing anything.
        output: Destination CSV path. Its parent directory is created when
            missing.
    """
    root = os.path.expanduser(root)
    if not os.path.exists(root):
        return
    path_list = get_filelist(root, IMG_EXTENSIONS)
    fname_list = [os.path.splitext(os.path.basename(x))[0] for x in path_list]
    df = pd.DataFrame(dict(id=fname_list, path=path_list))
    # dirname is "" for a bare file name such as "image.csv", and
    # os.makedirs("") raises FileNotFoundError.
    output_dir = os.path.dirname(output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output, index=False)
    print(f"Saved {len(df)} samples to {output}.")


def process_general_videos(root, output):
    """Index every video under ``root`` and write the listing to a CSV file.

    The CSV has the columns ``path``, ``id`` (file name without extension)
    and ``relpath`` (path relative to ``root``).

    Args:
        root: Directory to scan recursively; ``~`` is expanded. If it does
            not exist the function returns without writing anything.
        output: Destination CSV path. Its parent directory is created when
            missing.
    """
    root = os.path.expanduser(root)
    if not os.path.exists(root):
        return
    path_list = get_filelist(root, VID_EXTENSIONS)
    # Remove duplicates; sorting keeps the row order reproducible across
    # runs (plain set iteration order is not).
    path_list = sorted(set(path_list))
    fname_list = [os.path.splitext(os.path.basename(x))[0] for x in path_list]
    relpath_list = [os.path.relpath(x, root) for x in path_list]
    df = pd.DataFrame(dict(path=path_list, id=fname_list, relpath=relpath_list))
    # See process_general_images: a bare file name has no parent to create.
    output_dir = os.path.dirname(output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output, index=False)
    print(f"Saved {len(df)} samples to {output}.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("dataset", type=str, choices=["imagenet", "ucf101", "vidprom", "image", "video"])
    parser.add_argument("root", type=str)
    parser.add_argument("--split", type=str, default="train")
    parser.add_argument("--info", type=str, default=None)
    # Only the generic image/video converters write to --output; the other
    # datasets pick their own file name, so the flag cannot be required
    # unconditionally.
    parser.add_argument("--output", type=str, default=None, help="Output path (required for image/video)")

    args = parser.parse_args()
    if args.dataset in ("image", "video") and args.output is None:
        parser.error("--output is required when dataset is 'image' or 'video'")
    if args.dataset == "vidprom" and args.info is None:
        parser.error("--info is required when dataset is 'vidprom'")

    if args.dataset == "imagenet":
        process_imagenet(args.root, args.split)
    elif args.dataset == "ucf101":
        process_ucf101(args.root, args.split)
    elif args.dataset == "vidprom":
        process_vidprom(args.root, args.info)
    elif args.dataset == "image":
        process_general_images(args.root, args.output)
    elif args.dataset == "video":
        process_general_videos(args.root, args.output)
    else:
        raise ValueError("Invalid dataset")
def get_image_info(path, backend="pillow"):
    """Read the basic geometry of an image file.

    Args:
        path: Path of the image file.
        backend: ``"pillow"`` (default) or ``"cv2"``.

    Returns:
        A tuple ``(num_frames, height, width, aspect_ratio, fps, hw)``. For
        a readable image ``num_frames`` is 1, ``fps`` is NaN,
        ``aspect_ratio`` is ``height / width`` (NaN when ``width`` is 0) and
        ``hw`` is ``height * width``. When the file cannot be read the tuple
        is ``(0, 0, 0, nan, nan, nan)``.

    Raises:
        ValueError: If ``backend`` is neither ``"pillow"`` nor ``"cv2"``.
    """
    if backend not in ("pillow", "cv2"):
        raise ValueError(f"Unsupported backend: {backend}")

    failed = (0, 0, 0, np.nan, np.nan, np.nan)
    try:
        if backend == "pillow":
            with open(path, "rb") as f:
                img = Image.open(f)
                # NOTE(review): presumably kept to force a full decode so
                # corrupt files are reported as failures - confirm.
                img = img.convert("RGB")
            width, height = img.size
        else:
            im = cv2.imread(path)
            if im is None:
                return failed
            height, width = im.shape[:2]
        num_frames, fps = 1, np.nan
        hw = height * width
        aspect_ratio = height / width if width > 0 else np.nan
        return num_frames, height, width, aspect_ratio, fps, hw
    except Exception:
        # Best effort: any read/decode error marks the file as unreadable.
        # Unlike a bare ``except``, this lets KeyboardInterrupt/SystemExit
        # propagate so a long scan can still be stopped.
        return failed
# Boilerplate openings to strip from captions. Matching is tried in list
# order, against both the exact and the lower-cased spelling.
LLAVA_PREFIX = [
    "The video shows",
    "The video captures",
    "The video features",
    "The video depicts",
    "The video presents",
    "The video is ",
    "In the video,",
    "The image shows",
    "The image captures",
    "The image features",
    "The image depicts",
    "The image presents",
    "The image is ",
    "The image portrays",
    "In the image,",
]


def remove_caption_prefix(caption):
    """Strip a leading boilerplate phrase (see ``LLAVA_PREFIX``) from a caption.

    Args:
        caption: The caption string.

    Returns:
        If ``caption`` starts with one of the prefixes (as written or
        lower-cased), the remainder with surrounding whitespace removed and
        its first character upper-cased; this is ``""`` when nothing follows
        the prefix. Otherwise ``caption`` unchanged.
    """
    for prefix in LLAVA_PREFIX:
        if caption.startswith((prefix, prefix.lower())):
            remainder = caption[len(prefix) :].strip()
            # The caption may be the prefix alone; indexing [0] on the empty
            # remainder used to raise IndexError.
            if remainder and remainder[0].islower():
                remainder = remainder[0].upper() + remainder[1:]
            return remainder
    return caption
# Matches runs of symbol/bracket characters; clean_caption replaces each run
# with a single space. Written as one raw string: the former non-raw pieces
# ("\)", "\/", ...) were invalid escape sequences, which newer Python versions
# warn about. The compiled pattern is unchanged.
BAD_PUNCT_REGEX = re.compile(r"[#®•©™&@·º½¾¿¡§~\)\(\]\[\}\{\|\\/\*]{1,}")  # noqa
Ideographs caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) caption = re.sub(r"[\u3200-\u32ff]+", "", caption) caption = re.sub(r"[\u3300-\u33ff]+", "", caption) caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) ####################################################### # все виды тире / all types of dash --> "-" caption = re.sub( r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa "-", caption, ) # кавычки к одному стандарту caption = re.sub(r"[`´«»“”¨]", '"', caption) caption = re.sub(r"[‘’]", "'", caption) # " caption = re.sub(r""?", "", caption) # & caption = re.sub(r"&", "", caption) # ip adresses: caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) # article ids: caption = re.sub(r"\d:\d\d\s+$", "", caption) # \n caption = re.sub(r"\\n", " ", caption) # "#123" caption = re.sub(r"#\d{1,3}\b", "", caption) # "#12345.." caption = re.sub(r"#\d{5,}\b", "", caption) # "123456.." caption = re.sub(r"\b\d{6,}\b", "", caption) # filenames: caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) # caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" caption = re.sub(BAD_PUNCT_REGEX, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT caption = re.sub(r"\s+\.\s+", r" ", caption) # " . 
" # this-is-my-cute-cat / this_is_my_cute_cat regex2 = re.compile(r"(?:\-|\_)") if len(re.findall(regex2, caption)) > 3: caption = re.sub(regex2, " ", caption) caption = basic_clean(caption) caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) caption = re.sub(r"\bpage\s+\d+\b", "", caption) caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) caption = re.sub(r"\b\s+\:\s+", r": ", caption) caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) caption = re.sub(r"\s+", " ", caption) caption.strip() caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) caption = re.sub(r"^\.\S+$", "", caption) return caption.strip() def text_preprocessing(text, use_text_preprocessing: bool = True): if use_text_preprocessing: # The exact text cleaning as was in the training stage: text = clean_caption(text) text = clean_caption(text) return text else: return text.lower().strip() # ====================================================== # load caption # ====================================================== def load_caption(path, ext): try: assert ext in ["json"] json_path = path.split(".")[0] + ".json" with open(json_path, "r") as f: data = json.load(f) caption = data["caption"] return caption except: return "" # ====================================================== # --clean-caption # ====================================================== DROP_SCORE_PROB 
def save_file(data, output_path):
    """Write ``data`` to ``output_path`` as CSV or Parquet.

    The format is chosen from the file extension and validated *before* any
    directory is created, so an unsupported path leaves nothing behind. The
    parent directory is created with ``exist_ok=True`` so that concurrent
    writers (e.g. several shards) cannot race between an existence check and
    the creation.

    Args:
        data (pandas.DataFrame): table to write (without its index).
        output_path (str): destination ending in ``.csv`` or ``.parquet``.

    Returns:
        Whatever the pandas writer returns (``None`` when writing to a path).

    Raises:
        NotImplementedError: if the extension is neither ``.csv`` nor ``.parquet``.
    """
    if output_path.endswith(".csv"):
        writer = data.to_csv
    elif output_path.endswith(".parquet"):
        writer = data.to_parquet
    else:
        raise NotImplementedError(f"Unsupported file format: {output_path}")

    output_dir = os.path.dirname(output_path)
    if output_dir != "":
        os.makedirs(output_dir, exist_ok=True)
    return writer(output_path, index=False)
def main(args):
    """Run the dataset-metadata pipeline selected by the CLI flags.

    Stages, in order: load and concatenate the input tables, apply
    ``--difference`` / ``--intersection``, gather IO information, filter and
    rewrite paths and captions, sort, apply score/size filters, shuffle /
    head / column selection, and finally save (optionally sharded).

    Args:
        args (argparse.Namespace): the options produced by ``parse_args``.
    """
    # reading data
    data, input_name = read_data(args.input)

    # make difference
    if args.difference is not None:
        data_diff = pd.read_csv(args.difference)
        print(f"Difference csv contains {len(data_diff)} samples.")
        data = data[~data["path"].isin(data_diff["path"])]
        input_name += f"-{os.path.basename(args.difference).split('.')[0]}"
        print(f"Filtered number of samples: {len(data)}.")

    # make intersection
    if args.intersection is not None:
        data_new = pd.read_csv(args.intersection)
        print(f"Intersection csv contains {len(data_new)} samples.")
        # only bring in columns that `data` does not already have
        cols_to_use = data_new.columns.difference(data.columns)

        col_on = "path"
        # if 'id' in data.columns and 'id' in data_new.columns:
        #     col_on = 'id'
        cols_to_use = cols_to_use.insert(0, col_on)
        data = pd.merge(data, data_new[cols_to_use], on=col_on, how="inner")
        print(f"Intersection number of samples: {len(data)}.")

    # get output path
    output_path = get_output_path(args, input_name)

    # preparation
    if args.lang is not None:
        detect_lang = build_lang_detector(args.lang)
    if args.count_num_token == "t5":
        from transformers import AutoTokenizer

        tokenizer = AutoTokenizer.from_pretrained("DeepFloyd/t5-v1_1-xxl")

    # IO-related
    if args.load_caption is not None:
        assert "path" in data.columns
        data["text"] = apply(data["path"], load_caption, ext=args.load_caption)
    if args.info:
        info = apply(data["path"], get_info)
        (
            data["num_frames"],
            data["height"],
            data["width"],
            data["aspect_ratio"],
            data["fps"],
            data["resolution"],
        ) = zip(*info)
    if args.video_info:
        info = apply(data["path"], get_video_info)
        (
            data["num_frames"],
            data["height"],
            data["width"],
            data["aspect_ratio"],
            data["fps"],
            data["resolution"],
        ) = zip(*info)
    if args.ext:
        assert "path" in data.columns
        data = data[apply(data["path"], os.path.exists)]

    # filtering
    if args.remove_url:
        assert "text" in data.columns
        # The previous pattern "(?Phttps?://...)" is not a valid regex (its
        # named-group header was lost) and raised at runtime. No group is
        # needed for a containment test. `na=False` keeps rows whose caption
        # is missing instead of failing on the `~` of a NaN.
        data = data[~data["text"].str.contains(r"https?://[^\s]+", regex=True, na=False)]
    if args.lang is not None:
        assert "text" in data.columns
        data = data[data["text"].progress_apply(detect_lang)]  # cannot parallelize
    if args.remove_empty_path:
        assert "path" in data.columns
        data = data[data["path"].str.len() > 0]
        data = data[~data["path"].isna()]
    if args.remove_empty_caption:
        assert "text" in data.columns
        data = data[data["text"].str.len() > 0]
        data = data[~data["text"].isna()]
    if args.remove_path_duplication:
        assert "path" in data.columns
        data = data.drop_duplicates(subset=["path"])
    if args.path_subset:
        data = data[data["path"].str.contains(args.path_subset)]

    # processing
    if args.relpath is not None:
        data["path"] = apply(data["path"], lambda x: os.path.relpath(x, args.relpath))
    if args.abspath is not None:
        data["path"] = apply(data["path"], lambda x: os.path.join(args.abspath, x))
    if args.path_to_id:
        data["id"] = apply(data["path"], lambda x: os.path.splitext(os.path.basename(x))[0])
    if args.merge_cmotion:
        data["text"] = apply(data, lambda x: merge_cmotion(x["text"], x["cmotion"]), axis=1)
    if args.refine_llm_caption:
        assert "text" in data.columns
        data["text"] = apply(data["text"], remove_caption_prefix)
    if args.append_text is not None:
        assert "text" in data.columns
        data["text"] = data["text"] + args.append_text
    if args.score_to_text:
        data["text"] = apply(data, score_to_text, axis=1)
    if args.clean_caption:
        assert "text" in data.columns
        data["text"] = apply(
            data["text"],
            partial(text_preprocessing, use_text_preprocessing=True),
        )
    if args.count_num_token is not None:
        assert "text" in data.columns
        data["text_len"] = apply(data["text"], lambda x: len(tokenizer(x)["input_ids"]))
    if args.update_text is not None:
        data_new = pd.read_csv(args.update_text)
        num_updated = data.path.isin(data_new.path).sum()
        print(f"Number of updated samples: {num_updated}.")
        # DataFrame.update aligns on the index, so index both tables by path
        data = data.set_index("path")
        data_new = data_new[["path", "text"]].set_index("path")
        data.update(data_new)
        data = data.reset_index()

    # sort
    if args.sort is not None:
        data = data.sort_values(by=args.sort, ascending=False)
    if args.sort_ascending is not None:
        data = data.sort_values(by=args.sort_ascending, ascending=True)

    # filtering
    if args.filesize:
        assert "path" in data.columns
        # size in MB
        data["filesize"] = apply(data["path"], lambda x: os.stat(x).st_size / 1024 / 1024)
    if args.fsmax is not None:
        assert "filesize" in data.columns
        data = data[data["filesize"] <= args.fsmax]
    # Second pass of the empty-caption filter; presumably because the caption
    # processing steps above can leave a caption empty.
    if args.remove_empty_caption:
        assert "text" in data.columns
        data = data[data["text"].str.len() > 0]
        data = data[~data["text"].isna()]
    if args.fmin is not None:
        assert "num_frames" in data.columns
        data = data[data["num_frames"] >= args.fmin]
    if args.fmax is not None:
        assert "num_frames" in data.columns
        data = data[data["num_frames"] <= args.fmax]
    if args.fpsmax is not None:
        assert "fps" in data.columns
        # rows without an fps value (NaN) are kept
        data = data[(data["fps"] <= args.fpsmax) | np.isnan(data["fps"])]
    if args.hwmax is not None:
        if "resolution" not in data.columns:
            height = data["height"]
            width = data["width"]
            data["resolution"] = height * width
        data = data[data["resolution"] <= args.hwmax]
    if args.aesmin is not None:
        assert "aes" in data.columns
        data = data[data["aes"] >= args.aesmin]
    if args.matchmin is not None:
        assert "match" in data.columns
        data = data[data["match"] >= args.matchmin]
    if args.flowmin is not None:
        assert "flow" in data.columns
        data = data[data["flow"] >= args.flowmin]
    if args.remove_text_duplication:
        data = data.drop_duplicates(subset=["text"], keep="first")
    if args.img_only:
        data = data[data["path"].str.lower().str.endswith(IMG_EXTENSIONS)]
    if args.vid_only:
        data = data[~data["path"].str.lower().str.endswith(IMG_EXTENSIONS)]

    # process data
    if args.shuffle:
        data = data.sample(frac=1).reset_index(drop=True)  # shuffle
    if args.head is not None:
        data = data.head(args.head)

    # train columns
    if args.train_column:
        all_columns = data.columns
        columns_to_drop = all_columns.difference(TRAIN_COLUMNS)
        data = data.drop(columns=columns_to_drop)

    print(f"Filtered number of samples: {len(data)}.")

    # shard data
    if args.shard is not None:
        sharded_data = np.array_split(data, args.shard)
        for i in range(args.shard):
            # "name.ext" -> "name_<i>.ext"
            output_path_part = output_path.split(".")
            output_path_s = ".".join(output_path_part[:-1]) + f"_{i}." + output_path_part[-1]
            save_file(sharded_data[i], output_path_s)
            print(f"Saved {len(sharded_data[i])} samples to {output_path_s}.")
    else:
        save_file(data, output_path)
        print(f"Saved {len(data)} samples to {output_path}.")
action="store_true", help="get the basic information of each video and image") parser.add_argument("--video-info", action="store_true", help="get the basic information of each video") parser.add_argument("--ext", action="store_true", help="check if the file exists") parser.add_argument( "--load-caption", type=str, default=None, choices=["json", "txt"], help="load the caption from json or txt" ) # path processing parser.add_argument("--relpath", type=str, default=None, help="modify the path to relative path by root given") parser.add_argument("--abspath", type=str, default=None, help="modify the path to absolute path by root given") parser.add_argument("--path-to-id", action="store_true", help="add id based on path") parser.add_argument( "--path-subset", type=str, default=None, help="extract a subset data containing the given `path-subset` value" ) parser.add_argument( "--remove-empty-path", action="store_true", help="remove rows with empty path", # caused by transform, cannot read path ) # caption filtering parser.add_argument( "--remove-empty-caption", action="store_true", help="remove rows with empty caption", ) parser.add_argument("--remove-url", action="store_true", help="remove rows with url in caption") parser.add_argument("--lang", type=str, default=None, help="remove rows with other language") parser.add_argument("--remove-path-duplication", action="store_true", help="remove rows with duplicated path") parser.add_argument("--remove-text-duplication", action="store_true", help="remove rows with duplicated caption") # caption processing parser.add_argument("--refine-llm-caption", action="store_true", help="modify the caption generated by LLM") parser.add_argument( "--clean-caption", action="store_true", help="modify the caption according to T5 pipeline to suit training" ) parser.add_argument("--merge-cmotion", action="store_true", help="merge the camera motion to the caption") parser.add_argument( "--count-num-token", type=str, choices=["t5"], default=None, 
def get_output_path(args, input_name):
    """Derive the output file path from the requested operations.

    An explicit ``args.output`` is returned untouched. Otherwise the file is
    placed next to the first input and named ``input_name`` followed by one
    suffix per active option (in a fixed order), with ``args.format`` as the
    extension.

    Args:
        args (argparse.Namespace): parsed command-line options.
        input_name (str): base name built from the input files.

    Returns:
        str: the output path.

    Raises:
        AssertionError: if both ``--sort`` and ``--sort-ascending`` are set.
    """
    if args.output is not None:
        return args.output

    target_dir = os.path.dirname(args.input[0])

    # the two sort options are mutually exclusive and share one suffix
    sort_desc = args.sort is not None
    sort_asc = args.sort_ascending is not None
    assert not (sort_desc and sort_asc)

    # the seed is only looked up when shuffling was requested
    shuffle_suffix = f"_shuffled_seed{args.seed}" if args.shuffle else ""

    # (is the option active?, suffix) in the order the suffixes are emitted
    rules = [
        # sort
        (sort_desc or sort_asc, "_sort"),
        # IO-related
        # for IO-related, the function must be wrapped in try-except
        (args.info, "_info"),
        (args.video_info, "_vinfo"),
        (args.ext, "_ext"),
        (args.load_caption, f"_load{args.load_caption}"),
        # path processing
        (args.relpath is not None, "_relpath"),
        (args.abspath is not None, "_abspath"),
        (args.remove_empty_path, "_noemptypath"),
        # caption filtering
        (args.remove_empty_caption, "_noempty"),
        (args.remove_url, "_nourl"),
        (args.lang is not None, f"_{args.lang}"),
        (args.remove_path_duplication, "_noduppath"),
        (args.remove_text_duplication, "_noduptext"),
        (args.path_subset, "_subset"),
        # caption processing
        (args.refine_llm_caption, "_llm"),
        (args.clean_caption, "_clean"),
        (args.merge_cmotion, "_cmcaption"),
        (args.count_num_token, "_ntoken"),
        (args.append_text is not None, "_appendtext"),
        (args.score_to_text, "_score2text"),
        (args.update_text is not None, "_update"),
        # score filtering
        (args.filesize, "_filesize"),
        (args.fsmax is not None, f"_fsmax{args.fsmax}"),
        (args.fmin is not None, f"_fmin{args.fmin}"),
        (args.fmax is not None, f"_fmax{args.fmax}"),
        (args.fpsmax is not None, f"_fpsmax{args.fpsmax}"),
        (args.hwmax is not None, f"_hwmax{args.hwmax}"),
        (args.aesmin is not None, f"_aesmin{args.aesmin}"),
        (args.matchmin is not None, f"_matchmin{args.matchmin}"),
        (args.flowmin is not None, f"_flowmin{args.flowmin}"),
        (args.img_only, "_img"),
        (args.vid_only, "_vid"),
        # processing
        (bool(shuffle_suffix), shuffle_suffix),
        (args.head is not None, f"_first_{args.head}_data"),
    ]

    file_stem = input_name + "".join(suffix for active, suffix in rules if active)
    return os.path.join(target_dir, f"{file_stem}.{args.format}")
def clean_caption(caption):
    """Clean a raw caption the same way the T5 training captions were cleaned.

    In order: URL-unquote and lower-case the text; remove URLs, HTML markup,
    @mentions and CJK character blocks; map every dash and quote variant to a
    single form; drop HTML-entity remnants, ip addresses, ids, file names and
    e-commerce boilerplate; finally tidy punctuation and whitespace.

    Three regex patterns whose markup had been lost are restored:
    ``"<person>"`` (an empty pattern would insert "person" between all
    characters), ``r"&quot;?"`` (was an unterminated literal, i.e. a syntax
    error) and ``r"&amp"`` (was a bare ``&``).

    Args:
        caption: any object; converted with ``str()`` before cleaning.

    Returns:
        str: the cleaned caption with surrounding whitespace removed.
    """
    import urllib.parse as ul

    from bs4 import BeautifulSoup

    caption = str(caption)
    caption = ul.unquote_plus(caption)
    caption = caption.strip().lower()
    caption = re.sub("<person>", "person", caption)
    # urls:
    caption = re.sub(
        r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    caption = re.sub(
        r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    # html:
    caption = BeautifulSoup(caption, features="html.parser").text

    # @mentions
    caption = re.sub(r"@[\w\d]+\b", "", caption)

    # 31C0—31EF CJK Strokes
    # 31F0—31FF Katakana Phonetic Extensions
    # 3200—32FF Enclosed CJK Letters and Months
    # 3300—33FF CJK Compatibility
    # 3400—4DBF CJK Unified Ideographs Extension A
    # 4DC0—4DFF Yijing Hexagram Symbols
    # 4E00—9FFF CJK Unified Ideographs
    caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
    caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
    caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
    caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
    caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
    caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
    caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
    #######################################################

    # all types of dash --> "-"
    caption = re.sub(
        r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
        "-",
        caption,
    )

    # bring all quote characters to one standard
    caption = re.sub(r"[`´«»“”¨]", '"', caption)
    caption = re.sub(r"[‘’]", "'", caption)

    # "&quot;" entity remnants (semicolon optional)
    caption = re.sub(r"&quot;?", "", caption)
    # "&amp" entity remnants
    caption = re.sub(r"&amp", "", caption)

    # ip addresses:
    caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)

    # article ids:
    caption = re.sub(r"\d:\d\d\s+$", "", caption)

    # literal "\n" sequences
    caption = re.sub(r"\\n", " ", caption)

    # "#123"
    caption = re.sub(r"#\d{1,3}\b", "", caption)
    # "#12345.."
    caption = re.sub(r"#\d{5,}\b", "", caption)
    # "123456.."
    caption = re.sub(r"\b\d{6,}\b", "", caption)
    # filenames:
    caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)

    # runs of quotes / dots
    caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
    caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""

    caption = re.sub(BAD_PUNCT_REGEX, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
    caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "

    # this-is-my-cute-cat / this_is_my_cute_cat
    regex2 = re.compile(r"(?:\-|\_)")
    if len(re.findall(regex2, caption)) > 3:
        caption = re.sub(regex2, " ", caption)

    caption = basic_clean(caption)

    caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
    caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
    caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231

    caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
    caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
    caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
    caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
    caption = re.sub(r"\bpage\s+\d+\b", "", caption)

    caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...

    # sizes such as "1920x1080" (latin x, cyrillic х or ×)
    caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)

    caption = re.sub(r"\b\s+\:\s+", r": ", caption)
    caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
    caption = re.sub(r"\s+", " ", caption)

    # NOTE: a bare `caption.strip()` used to stand here with its result thrown
    # away. It is intentionally not applied, so the anchored patterns below
    # behave exactly as before and previously produced captions still match.

    caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
    caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
    caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
    caption = re.sub(r"^\.\S+$", "", caption)

    return caption.strip()
def filter_panda10m_timestamp(meta_path):
    """Keep only the clips of ``meta_path`` that belong to the Panda 10M split.

    A row is kept when the video id parsed from its file name (everything
    before the last ``_``) is listed in the 10M meta file and the row's
    ``timestamp`` string equals one of that video's timestamps. The result
    is written next to the input as ``<name>_filter-10m<ext>``.

    Args:
        meta_path (str): CSV with at least ``path`` and ``timestamp`` columns.
    """
    import ast

    meta_path_10m = "/mnt/hdd/data/Panda-70M/raw/meta/train/panda70m_training_10m.csv"
    meta_10m = pd.read_csv(meta_path_10m)

    # video id -> set of timestamps, each rendered as str(tuple(...)) so it can
    # be compared with the string stored in the `timestamp` column below.
    id2t = {}
    for video_id, raw_timestamps in tqdm(
        zip(meta_10m["videoID"], meta_10m["timestamp"]), total=len(meta_10m)
    ):
        # literal_eval only accepts Python literals; unlike eval() it cannot
        # execute code embedded in a CSV cell.
        id2t[video_id] = {str(tuple(x)) for x in ast.literal_eval(raw_timestamps)}

    print(f"==> Loaded meta_10m from '{meta_path_10m}'")

    def process_single_row(row):
        fname = os.path.basename(row["path"])
        video_id, sep, _ = fname.rpartition("_")
        if not sep:
            # no "_" in the file name: it cannot be a Panda clip, drop the row
            # instead of crashing the whole run
            return False
        return row["timestamp"] in id2t.get(video_id, ())

    meta = pd.read_csv(meta_path)
    ret = apply(meta, process_single_row, axis=1)

    meta = meta[ret]
    wo_ext, ext = os.path.splitext(meta_path)
    out_path = f"{wo_ext}_filter-10m{ext}"
    meta.to_csv(out_path, index=False)
    print(f"New meta (shape={meta.shape}) saved to '{out_path}'.")
def split_by_bucket(
    bucket: "Bucket",
    input_files: List[str],
    output_path: str,
    limit: int,
    frame_interval: int,
):
    """Select at most ``limit`` samples per bucket and write them to one CSV.

    Rows are visited in file order, then row order. A row is kept when
    ``bucket.get_bucket_id`` assigns it a bucket that is not full yet.
    Scanning stops as soon as ``len(bucket) * limit`` rows were selected.

    Selected rows are collected per file and concatenated once at the end,
    rather than growing a DataFrame row by row (which is quadratic and turns
    every column into ``object`` dtype).

    Args:
        bucket: object providing ``ar_criteria``, ``get_bucket_id`` and ``len()``.
        input_files: CSV files with ``num_frames``, ``height`` and ``width`` columns.
        output_path: destination CSV path.
        limit: maximum number of samples per bucket.
        frame_interval: forwarded to ``bucket.get_bucket_id``.
    """
    print(f"Split {len(input_files)} files into {len(bucket)} buckets")
    total_limit = len(bucket) * limit

    # one counter per (resolution, num_frames, aspect_ratio) bucket id
    bucket_cnt = {
        (hw_id, t_id, ar_id): 0
        for hw_id, d in bucket.ar_criteria.items()
        for t_id, v in d.items()
        for ar_id in v
    }

    first_columns = None
    selected_parts = []
    num_selected = 0

    # split files
    for path in input_files:
        df = pd.read_csv(path)
        if first_columns is None:
            first_columns = df.columns

        keep = []  # positions of the rows of this file that are selected
        for pos, (t, h, w) in enumerate(zip(df["num_frames"], df["height"], df["width"])):
            bucket_id = bucket.get_bucket_id(t, h, w, frame_interval)
            if bucket_id is None:
                continue
            if bucket_cnt[bucket_id] < limit:
                bucket_cnt[bucket_id] += 1
                keep.append(pos)
                num_selected += 1
                if num_selected >= total_limit:
                    break
        if keep:
            selected_parts.append(df.iloc[keep])
        if num_selected >= total_limit:
            break

    if selected_parts:
        output_df = pd.concat(selected_parts, ignore_index=True)
    else:
        # nothing selected (or no input at all): still emit a header-only file
        output_df = pd.DataFrame(columns=first_columns)

    assert len(output_df) <= total_limit
    if len(output_df) == total_limit:
        print(f"All buckets are full ({total_limit} samples)")
    else:
        print(f"Only {len(output_df)} files are used")
    output_df.to_csv(output_path, index=False)
def get_new_path(path, input_dir, output):
    """Map ``path`` from the ``input_dir`` tree onto the ``output`` tree.

    The location of ``path`` relative to ``input_dir`` is re-rooted under
    ``output``; the parent directory of the result is created if missing.

    Returns:
        str: the mirrored path (the file itself is not created).
    """
    relative = os.path.relpath(path, input_dir)
    destination = os.path.join(output, relative)
    parent_dir, _ = os.path.split(destination)
    os.makedirs(parent_dir, exist_ok=True)
    return destination
def main(args):
    """Run the selected transform over every row of ``args.input``.

    Reads the CSV at ``args.input``, rewrites its ``path`` column with the
    output of the chosen method and saves the result next to the input with a
    method-specific suffix.

    Raises:
        ValueError: if ``vid_frame_extract`` is requested without any
            ``--points`` / ``--points_index`` values, or the method is unknown.
    """
    data = pd.read_csv(args.input)
    if args.method == "img_rand_crop":
        data["path"] = apply(data["path"], lambda x: rand_crop(x, args.input_dir, args.output))
        output_csv = args.input.replace(".csv", "_rand_crop.csv")
    elif args.method == "img_resize":
        data["path"] = apply(data["path"], lambda x: resize(x, args.length, args.input_dir, args.output))
        output_csv = args.input.replace(".csv", f"_resized{args.length}.csv")
    elif args.method == "vid_frame_extract":
        points = args.points if args.points is not None else args.points_index
        if not points:
            raise ValueError("vid_frame_extract requires --points or --points_index")
        num_points = len(points)
        # Each source row is repeated once per requested point. The repeat count
        # must equal the stride used below, otherwise points are assigned to
        # the wrong copies (the count used to be hard-coded to 3).
        data = pd.DataFrame(np.repeat(data.values, num_points, axis=0), columns=data.columns)
        data["point"] = np.nan
        for i, point in enumerate(points):
            if isinstance(point, int):
                # Integer points are used as-is.
                data.loc[i::num_points, "point"] = point
            else:
                # Non-integer points are scaled by the row's ``num_frames``.
                data.loc[i::num_points, "point"] = data.loc[i::num_points, "num_frames"] * point
        # NOTE(review): ``extract_frames`` in tools/datasets/utils.py is declared as
        # (video_path, frame_inds, points, backend, ...); the positional arguments
        # passed here do not match that signature - confirm the intended helper.
        data["path"] = apply(data, lambda x: extract_frames(x["path"], args.input_dir, args.output, x["point"]), axis=1)
        output_csv = args.input.replace(".csv", "_vid_frame_extract.csv")
    else:
        raise ValueError(f"Unknown method: {args.method}")

    data.to_csv(output_csv, index=False)
    print(f"Saved to {output_csv}")
def extract_frames(
    video_path,
    frame_inds=None,
    points=None,
    backend="opencv",
    return_length=False,
    num_frames=None,
):
    """Extract selected frames from a video as PIL images.

    Args:
        video_path (str): path to video
        frame_inds (List[int]): indices of frames to extract
        points (List[float]): values within [0, 1); multiply #frames to get frame indices
        backend (str): one of "av", "opencv", "decord"
        return_length (bool): also return the total number of frames
        num_frames (int): if given, used as the total frame count instead of
            querying the video

    At most one of ``frame_inds`` and ``points`` may be given. Indices at or
    beyond the total frame count are clamped to the last frame.

    Return:
        List[PIL.Image], or (List[PIL.Image], total_frames) when
        ``return_length`` is true
    """
    assert backend in ["av", "opencv", "decord"]
    assert (frame_inds is None) or (points is None)

    if backend == "av":
        import av

        container = av.open(video_path)
        # Close the container even if seeking/decoding raises.
        try:
            if num_frames is not None:
                total_frames = num_frames
            else:
                total_frames = container.streams.video[0].frames

            if points is not None:
                frame_inds = [int(p * total_frames) for p in points]

            frames = []
            for idx in frame_inds:
                if idx >= total_frames:
                    idx = total_frames - 1
                target_timestamp = int(idx * av.time_base / container.streams.video[0].average_rate)
                container.seek(target_timestamp)
                frame = next(container.decode(video=0)).to_image()
                frames.append(frame)
        finally:
            container.close()

        if return_length:
            return frames, total_frames
        return frames

    elif backend == "decord":
        import decord

        container = decord.VideoReader(video_path, num_threads=1)
        if num_frames is not None:
            total_frames = num_frames
        else:
            total_frames = len(container)

        if points is not None:
            frame_inds = [int(p * total_frames) for p in points]

        frame_inds = np.array(frame_inds).astype(np.int32)
        frame_inds[frame_inds >= total_frames] = total_frames - 1
        frames = container.get_batch(frame_inds).asnumpy()  # [N, H, W, C]
        frames = [Image.fromarray(x) for x in frames]

        if return_length:
            return frames, total_frames
        return frames

    elif backend == "opencv":
        cap = cv2.VideoCapture(video_path)
        # Release the capture handle on every exit path.
        try:
            if num_frames is not None:
                total_frames = num_frames
            else:
                total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

            if points is not None:
                frame_inds = [int(p * total_frames) for p in points]

            frames = []
            for idx in frame_inds:
                if idx >= total_frames:
                    idx = total_frames - 1

                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)

                # HACK: sometimes OpenCV fails to read frames, return a black frame instead
                try:
                    ret, frame = cap.read()
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frame = Image.fromarray(frame)
                except Exception as e:
                    print(f"[Warning] Error reading frame {idx} from {video_path}: {e}")
                    # First, try to read the first frame
                    try:
                        print("[Warning] Try reading first frame.")
                        cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
                        ret, frame = cap.read()
                        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                        frame = Image.fromarray(frame)
                    # If that fails, return a black frame
                    except Exception as e:
                        print(f"[Warning] Error in reading first frame from {video_path}: {e}")
                        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                        frame = Image.new("RGB", (width, height), (0, 0, 0))

                # HACK: if height or width is 0, return a black frame instead
                if frame.height == 0 or frame.width == 0:
                    height = width = 256
                    frame = Image.new("RGB", (width, height), (0, 0, 0))

                frames.append(frame)
        finally:
            cap.release()

        if return_length:
            return frames, total_frames
        return frames
    else:
        raise ValueError
## Requirement Install the required dependencies by following the "Data Dependencies" and "Frame Interpolation" sections of our [installation instructions](../../docs/installation.md). ## Model We use **AMT** as our frame interpolation model. After sampling, you can use the frame interpolation model to interpolate your video smoothly. ## Usage The ckpt file will be automatically downloaded to the user's `.cache` directory. You can apply frame interpolation to a single video file or to a folder of videos. 1. Process a video file ```python python -m tools.frame_interpolation.interpolation your_video.mp4 ``` 2. Process all video files in the target directory ```python python -m tools.frame_interpolation.interpolation your_video_dir --output_path samples/interpolation ``` The output video will be stored at `output_path`, and its duration equals `the total number of frames after frame interpolation / the frame rate`. ### Command Line Arguments * `input`: Path of the input video. **Video path** or **Folder path (with --folder)** * `--ckpt`: Pretrained model of [AMT](https://github.com/MCG-NKU/AMT). Default path: `~/.cache/amt-g.pth`. * `--niters`: Iterations of interpolation. With $m$ input frames, `[N_ITER]` $=n$ corresponds to $2^n\times (m-1)+1$ output frames. * `--fps`: Frame rate of the input video. (Default: 8) * `--output_path`: **Folder Path** of the output video. 
def get_input_video_from_path(input_path):
    """
    Load every frame of the video at input_path as a padded tensor.

    params:
        input_path: str, the path of the input video.
    returns:
        inputs: list, the list of the input frames (padded tensors on `device`).
        scale: float, the scale of the input frames.
        padder: InputPadder, the padder to pad the input frames.
    raises:
        TypeError: if the extension of input_path is not in VID_EXT.
        RuntimeError: if the video reports a zero width or height.
    """
    anchor_resolution, anchor_memory, anchor_memory_bias, vram_avail = init()

    if osp.splitext(input_path)[-1].lower() not in VID_EXT:
        raise TypeError("Input should be a video.")

    vcap = cv2.VideoCapture(input_path)
    # Release the capture handle on every exit path.
    try:
        inputs = []
        w = int(vcap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(vcap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        if h == 0 or w == 0:
            # Without this check the scale computation below would fail with
            # an uninformative ZeroDivisionError.
            raise RuntimeError(f"Failed to read video size from {input_path}")

        # Down-scale factor derived from the frame area and the memory budget
        # returned by init(); capped at 1 so frames are never up-scaled.
        scale = anchor_resolution / (h * w) * np.sqrt((vram_avail - anchor_memory_bias) / anchor_memory)
        scale = 1 if scale > 1 else scale
        scale = 1 / np.floor(1 / np.sqrt(scale) * 16) * 16
        if scale < 1:
            print(f"Due to the limited VRAM, the video will be scaled by {scale:.2f}")
        padding = int(16 / scale)
        padder = InputPadder((h, w), padding)

        while True:
            ret, frame = vcap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_t = img2tensor(frame).to(device)
            frame_t = padder.pad(frame_t)
            inputs.append(frame_t)
    finally:
        vcap.release()

    print(f"Loading the [video] from {input_path}, the number of frames [{len(inputs)}]")
    return inputs, scale, padder
def write(outputs, input_path, output_path, fps=30):
    """
    write results to the output_path.

    params:
        outputs: sequence of frame tensors; the last two dims of the first
            frame's shape, reversed, are used as the video frame size.
        input_path: str, path of the source video; its base name (without
            extension) is reused in the output file name.
        output_path: str, directory that receives the video (created if missing).
        fps: frame rate written into the output video.
    """
    # exist_ok avoids the race between an existence check and the creation.
    os.makedirs(output_path, exist_ok=True)

    size = outputs[0].shape[2:][::-1]

    file_name = osp.splitext(osp.basename(input_path))[0]
    save_video_path = f"{output_path}/fps{fps}_{file_name}.mp4"

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    writer = cv2.VideoWriter(save_video_path, fourcc, fps, size)
    # Release the writer even if converting or writing a frame fails.
    try:
        for imgt_pred in outputs:
            frame = tensor2img(imgt_pred)
            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
            writer.write(frame)
    finally:
        writer.release()

    # Report success only after the writer has been released.
    print(f"Demo video is saved to [{save_video_path}]")
class Model(nn.Module):
    """Frame interpolation network that predicts an intermediate frame between two images.

    The forward pass builds a four-level feature pyramid for each input frame,
    decodes bidirectional flows from the coarsest level to the finest, refines
    them at each level with correlation-lookup update blocks, and finally
    merges ``num_flows`` warped predictions into one image.
    NOTE(review): the file is named ``amt_g`` - presumably the AMT-G variant; confirm.
    """

    def __init__(self, corr_radius=3, corr_lvls=4, num_flows=5, channels=[84, 96, 112, 128], skip_channels=84):
        """Build the encoders, the four decoders, the update blocks and the combine block.

        Args:
            corr_radius: radius passed to the correlation block and update blocks.
            corr_lvls: number of correlation levels.
            num_flows: number of flow pairs predicted by the last decoder.
            channels: channel count of each of the four pyramid levels.
            skip_channels: side-channel count forwarded to the decoders.
        """
        super(Model, self).__init__()
        self.radius = corr_radius
        self.corr_levels = corr_lvls
        self.num_flows = num_flows

        self.feat_encoder = LargeEncoder(output_dim=128, norm_fn="instance", dropout=0.0)
        self.encoder = Encoder(channels, large=True)
        self.decoder4 = InitDecoder(channels[3], channels[2], skip_channels)
        self.decoder3 = IntermediateDecoder(channels[2], channels[1], skip_channels)
        self.decoder2 = IntermediateDecoder(channels[1], channels[0], skip_channels)
        self.decoder1 = MultiFlowDecoder(channels[0], skip_channels, num_flows)

        # NOTE(review): the cdim values below are literals that equal the default
        # channels[2], channels[1] and channels[0]; a non-default ``channels``
        # argument is not propagated here.
        self.update4 = self._get_updateblock(112, None)
        self.update3_low = self._get_updateblock(96, 2.0)
        self.update2_low = self._get_updateblock(84, 4.0)

        self.update3_high = self._get_updateblock(96, None)
        self.update2_high = self._get_updateblock(84, None)

        # Maps the stacked num_flows RGB predictions (3 * num_flows channels) to one RGB residual.
        self.comb_block = nn.Sequential(
            nn.Conv2d(3 * self.num_flows, 6 * self.num_flows, 7, 1, 3),
            nn.PReLU(6 * self.num_flows),
            nn.Conv2d(6 * self.num_flows, 3, 7, 1, 3),
        )

    def _get_updateblock(self, cdim, scale_factor=None):
        """Create a BasicUpdateBlock for ``cdim`` feature channels with this model's correlation settings."""
        return BasicUpdateBlock(
            cdim=cdim,
            hidden_dim=192,
            flow_dim=64,
            corr_dim=256,
            corr_dim2=192,
            fc_dim=188,
            scale_factor=scale_factor,
            corr_levels=self.corr_levels,
            radius=self.radius,
        )

    def _corr_scale_lookup(self, corr_fn, coord, flow0, flow1, embt, downsample=1):
        """Look up correlations at the coordinates implied by the current flows.

        Returns ``(corr, flow)``: the two looked-up correlation tensors and the
        two (optionally downsampled) flows, each pair concatenated on dim 1.
        """
        # convert t -> 0 to 0 -> 1 | convert t -> 1 to 1 -> 0
        # based on linear assumption
        t1_scale = 1.0 / embt
        t0_scale = 1.0 / (1.0 - embt)
        if downsample != 1:
            # Shrink the flow field spatially and scale its values by the same factor.
            inv = 1 / downsample
            flow0 = inv * resize(flow0, scale_factor=inv)
            flow1 = inv * resize(flow1, scale_factor=inv)

        corr0, corr1 = corr_fn(coord + flow1 * t1_scale, coord + flow0 * t0_scale)
        corr = torch.cat([corr0, corr1], dim=1)
        flow = torch.cat([flow0, flow1], dim=1)
        return corr, flow

    def forward(self, img0, img1, embt, scale_factor=1.0, eval=False, **kwargs):
        """Predict the frame between ``img0`` and ``img1`` at time ``embt``.

        Args:
            img0, img1: the two input frames (4-D tensors; the resized shape is
                unpacked as ``b, _, h, w``).
            embt: time embedding tensor; ``1 / embt`` and ``1 / (1 - embt)`` are
                used to rescale the flows for the correlation lookup.
            scale_factor: if not 1.0, the network runs on resized inputs and the
                final flows, mask and residual are resized back by its inverse.
            eval: when true only the predicted image is returned.

        Returns:
            dict with key ``"imgt_pred"`` (clamped to [0, 1]); when ``eval`` is
            false also ``"flow0_pred"``, ``"flow1_pred"`` and ``"ft_pred"`` lists.
        """
        # One mean value per batch item, shared by both frames (the frames are
        # concatenated on dim 2 and averaged over dims 1, 2 and 3).
        mean_ = torch.cat([img0, img1], 2).mean(1, keepdim=True).mean(2, keepdim=True).mean(3, keepdim=True)
        img0 = img0 - mean_
        img1 = img1 - mean_
        img0_ = resize(img0, scale_factor) if scale_factor != 1.0 else img0
        img1_ = resize(img1, scale_factor) if scale_factor != 1.0 else img1
        b, _, h, w = img0_.shape
        coord = coords_grid(b, h // 8, w // 8, img0.device)

        fmap0, fmap1 = self.feat_encoder([img0_, img1_])  # [1, 128, H//8, W//8]
        corr_fn = BidirCorrBlock(fmap0, fmap1, radius=self.radius, num_levels=self.corr_levels)

        # f0_1: [1, c0, H//2, W//2] | f0_2: [1, c1, H//4, W//4]
        # f0_3: [1, c2, H//8, W//8] | f0_4: [1, c3, H//16, W//16]
        f0_1, f0_2, f0_3, f0_4 = self.encoder(img0_)
        f1_1, f1_2, f1_3, f1_4 = self.encoder(img1_)

        ######################################### the 4th decoder #########################################
        up_flow0_4, up_flow1_4, ft_3_ = self.decoder4(f0_4, f1_4, embt)
        corr_4, flow_4 = self._corr_scale_lookup(corr_fn, coord, up_flow0_4, up_flow1_4, embt, downsample=1)

        # residue update with lookup corr
        delta_ft_3_, delta_flow_4 = self.update4(ft_3_, flow_4, corr_4)
        delta_flow0_4, delta_flow1_4 = torch.chunk(delta_flow_4, 2, 1)
        up_flow0_4 = up_flow0_4 + delta_flow0_4
        up_flow1_4 = up_flow1_4 + delta_flow1_4
        ft_3_ = ft_3_ + delta_ft_3_

        ######################################### the 3rd decoder #########################################
        up_flow0_3, up_flow1_3, ft_2_ = self.decoder3(ft_3_, f0_3, f1_3, up_flow0_4, up_flow1_4)
        corr_3, flow_3 = self._corr_scale_lookup(corr_fn, coord, up_flow0_3, up_flow1_3, embt, downsample=2)

        # residue update with lookup corr
        delta_ft_2_, delta_flow_3 = self.update3_low(ft_2_, flow_3, corr_3)
        delta_flow0_3, delta_flow1_3 = torch.chunk(delta_flow_3, 2, 1)
        up_flow0_3 = up_flow0_3 + delta_flow0_3
        up_flow1_3 = up_flow1_3 + delta_flow1_3
        ft_2_ = ft_2_ + delta_ft_2_

        # residue update with lookup corr (hr)
        corr_3 = resize(corr_3, scale_factor=2.0)
        up_flow_3 = torch.cat([up_flow0_3, up_flow1_3], dim=1)
        delta_ft_2_, delta_up_flow_3 = self.update3_high(ft_2_, up_flow_3, corr_3)
        ft_2_ += delta_ft_2_
        up_flow0_3 += delta_up_flow_3[:, 0:2]
        up_flow1_3 += delta_up_flow_3[:, 2:4]

        ######################################### the 2nd decoder #########################################
        up_flow0_2, up_flow1_2, ft_1_ = self.decoder2(ft_2_, f0_2, f1_2, up_flow0_3, up_flow1_3)
        corr_2, flow_2 = self._corr_scale_lookup(corr_fn, coord, up_flow0_2, up_flow1_2, embt, downsample=4)

        # residue update with lookup corr
        delta_ft_1_, delta_flow_2 = self.update2_low(ft_1_, flow_2, corr_2)
        delta_flow0_2, delta_flow1_2 = torch.chunk(delta_flow_2, 2, 1)
        up_flow0_2 = up_flow0_2 + delta_flow0_2
        up_flow1_2 = up_flow1_2 + delta_flow1_2
        ft_1_ = ft_1_ + delta_ft_1_

        # residue update with lookup corr (hr)
        corr_2 = resize(corr_2, scale_factor=4.0)
        up_flow_2 = torch.cat([up_flow0_2, up_flow1_2], dim=1)
        delta_ft_1_, delta_up_flow_2 = self.update2_high(ft_1_, up_flow_2, corr_2)
        ft_1_ += delta_ft_1_
        up_flow0_2 += delta_up_flow_2[:, 0:2]
        up_flow1_2 += delta_up_flow_2[:, 2:4]

        ######################################### the 1st decoder #########################################
        up_flow0_1, up_flow1_1, mask, img_res = self.decoder1(ft_1_, f0_1, f1_1, up_flow0_2, up_flow1_2)

        if scale_factor != 1.0:
            # Undo the input resize: flows are resized and their values rescaled,
            # mask and residual are only resized.
            up_flow0_1 = resize(up_flow0_1, scale_factor=(1.0 / scale_factor)) * (1.0 / scale_factor)
            up_flow1_1 = resize(up_flow1_1, scale_factor=(1.0 / scale_factor)) * (1.0 / scale_factor)
            mask = resize(mask, scale_factor=(1.0 / scale_factor))
            img_res = resize(img_res, scale_factor=(1.0 / scale_factor))

        # Merge multiple predictions
        imgt_pred = multi_flow_combine(self.comb_block, img0, img1, up_flow0_1, up_flow1_1, mask, img_res, mean_)
        imgt_pred = torch.clamp(imgt_pred, 0, 1)

        if eval:
            return {
                "imgt_pred": imgt_pred,
            }
        else:
            # NOTE(review): h and w come from the resized inputs, while the flows
            # were resized back above when scale_factor != 1.0 - confirm this
            # branch is only used with scale_factor == 1.0.
            up_flow0_1 = up_flow0_1.reshape(b, self.num_flows, 2, h, w)
            up_flow1_1 = up_flow1_1.reshape(b, self.num_flows, 2, h, w)
            return {
                "imgt_pred": imgt_pred,
                "flow0_pred": [up_flow0_1, up_flow0_2, up_flow0_3, up_flow0_4],
                "flow1_pred": [up_flow1_1, up_flow1_2, up_flow1_3, up_flow1_4],
                "ft_pred": [ft_1_, ft_2_, ft_3_],
            }
class BottleneckBlock(nn.Module):
    """Residual block with a 1x1 -> 3x3 -> 1x1 convolution stack.

    The two inner convolutions work on ``planes // 4`` channels. When
    ``stride`` is not 1 the shortcut is a strided 1x1 convolution followed by
    its own normalisation layer; otherwise the input is added unchanged.
    ``norm_fn`` selects the normalisation: "group", "batch", "instance" or "none".
    """

    def __init__(self, in_planes, planes, norm_fn="group", stride=1):
        super().__init__()
        inner = planes // 4
        strided = stride != 1

        self.conv1 = nn.Conv2d(in_planes, inner, kernel_size=1, padding=0)
        self.conv2 = nn.Conv2d(inner, inner, kernel_size=3, padding=1, stride=stride)
        self.conv3 = nn.Conv2d(inner, planes, kernel_size=1, padding=0)
        self.relu = nn.ReLU(inplace=True)

        num_groups = planes // 8

        # One factory per supported norm name; each takes the channel count.
        norm_factories = {
            "group": lambda ch: nn.GroupNorm(num_groups=num_groups, num_channels=ch),
            "batch": lambda ch: nn.BatchNorm2d(ch),
            "instance": lambda ch: nn.InstanceNorm2d(ch),
            "none": lambda ch: nn.Sequential(),
        }
        make_norm = norm_factories.get(norm_fn)
        # An unrecognised name leaves the norm attributes undefined.
        if make_norm is not None:
            self.norm1 = make_norm(inner)
            self.norm2 = make_norm(inner)
            self.norm3 = make_norm(planes)
            if strided:
                self.norm4 = make_norm(planes)

        if strided:
            self.downsample = nn.Sequential(nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm4)
        else:
            self.downsample = None

    def forward(self, x):
        out = self.relu(self.norm1(self.conv1(x)))
        out = self.relu(self.norm2(self.conv2(out)))
        out = self.relu(self.norm3(self.conv3(out)))

        shortcut = x if self.downsample is None else self.downsample(x)
        return self.relu(shortcut + out)
class BasicEncoder(nn.Module):
    """Convolutional feature encoder built from three stages of residual blocks.

    A strided 7x7 stem is followed by stages of 64, 72 and 128 channels (the
    last two strided) and a 1x1 projection to ``output_dim`` channels.
    A list or tuple of inputs is concatenated on the batch dimension, encoded
    in one pass and split back into two chunks of the first input's batch size.
    """

    def __init__(self, output_dim=128, norm_fn="batch", dropout=0.0):
        super().__init__()
        self.norm_fn = norm_fn

        # Stem normalisation; an unrecognised name leaves ``norm1`` undefined.
        stem_norms = {
            "group": lambda: nn.GroupNorm(num_groups=8, num_channels=64),
            "batch": lambda: nn.BatchNorm2d(64),
            "instance": lambda: nn.InstanceNorm2d(64),
            "none": lambda: nn.Sequential(),
        }
        if self.norm_fn in stem_norms:
            self.norm1 = stem_norms[self.norm_fn]()

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        # ``in_planes`` is advanced by each _make_layer call.
        self.in_planes = 64
        self.layer1 = self._make_layer(64, stride=1)
        self.layer2 = self._make_layer(72, stride=2)
        self.layer3 = self._make_layer(128, stride=2)

        # output convolution
        self.conv2 = nn.Conv2d(128, output_dim, kernel_size=1)

        self.dropout = nn.Dropout2d(p=dropout) if dropout > 0 else None

        # Kaiming init for every convolution, unit weight / zero bias for norms
        # that carry affine parameters.
        norm_types = (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(module, norm_types):
                if module.weight is not None:
                    nn.init.constant_(module.weight, 1)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

    def _make_layer(self, dim, stride=1):
        """Return two stacked residual blocks; only the first one may be strided."""
        blocks = [
            ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride),
            ResidualBlock(dim, dim, self.norm_fn, stride=1),
        ]
        self.in_planes = dim
        return nn.Sequential(*blocks)

    def forward(self, x):
        # if input is list, combine batch dimension
        paired = isinstance(x, (tuple, list))
        if paired:
            batch_dim = x[0].shape[0]
            x = torch.cat(x, dim=0)

        x = self.relu1(self.norm1(self.conv1(x)))

        for stage in (self.layer1, self.layer2, self.layer3):
            x = stage(x)

        x = self.conv2(x)

        if self.training and self.dropout is not None:
            x = self.dropout(x)

        if paired:
            x = torch.split(x, [batch_dim, batch_dim], dim=0)

        return x
class ResBlock(nn.Module):
    """Residual block whose last ``side_channels`` channels get extra convolutions.

    After each of the first two full-width convolutions, the trailing
    ``side_channels`` channels are passed through their own conv + PReLU and
    concatenated back before the next full-width convolution. The result of
    the fifth convolution is added to the input and passed through a PReLU.
    """

    def __init__(self, in_channels, side_channels, bias=True):
        super().__init__()
        self.side_channels = side_channels

        def conv_prelu(width):
            # 3x3 same-size convolution followed by a per-channel PReLU.
            return nn.Sequential(
                nn.Conv2d(width, width, kernel_size=3, stride=1, padding=1, bias=bias),
                nn.PReLU(width),
            )

        self.conv1 = conv_prelu(in_channels)
        self.conv2 = conv_prelu(side_channels)
        self.conv3 = conv_prelu(in_channels)
        self.conv4 = conv_prelu(side_channels)
        self.conv5 = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1, bias=bias)
        self.prelu = nn.PReLU(in_channels)

    def _refine_side(self, feat, side_conv):
        """Apply ``side_conv`` to the trailing side channels and re-attach them."""
        kept = feat[:, : -self.side_channels, ...]
        side = side_conv(feat[:, -self.side_channels :, :, :])
        return torch.cat([kept, side], 1)

    def forward(self, x):
        hidden = self._refine_side(self.conv1(x), self.conv2)
        hidden = self._refine_side(self.conv3(hidden), self.conv4)
        residual = self.conv5(hidden)
        return self.prelu(x + residual)
def multi_flow_combine(comb_block, img0, img1, flow0, flow1, mask=None, img_res=None, mean=None):
    """
    A parallel implementation of multiple flow field warping.

    Every flow pair is folded into the batch dimension so that all
    ``num_flows`` warps run in a single ``warp`` call per image.

    Args:
        comb_block: An nn.Sequential object applied to the stacked warped
            candidates viewed as [b, 3 * num_flows, h, w].
        img0, img1: Images of shape [b, 3, h, w].
        flow0, flow1: Flows of shape [b, 2 * num_flows, h, w].
        mask (opt): Blending weights, reshaped to [b, num_flows, 1, h, w].
            If 'mask' is None, the function conducts a simple average of the
            two warped images.
        img_res (opt): Residual of shape [b, 3 * num_flows, h, w].
            If 'img_res' is None, the function adds zero instead.
        mean (opt): Offset added to every candidate (reshaped to
            [-1, 1, 1, 1], so presumably one scalar per batch item -- confirm
            against the caller). If 'mean' is None, the function adds zero.

    Returns:
        The mean of the warped candidates over the flow axis plus the
        output of ``comb_block``.
    """
    b, c, h, w = flow0.shape
    num_flows = c // 2
    flow0 = flow0.reshape(b, num_flows, 2, h, w).reshape(-1, 2, h, w)
    flow1 = flow1.reshape(b, num_flows, 2, h, w).reshape(-1, 2, h, w)

    # Without a mask both warped images get equal weight (the documented
    # "simple average"); the previous `None` fallback crashed on `None * tensor`.
    mask = mask.reshape(b, num_flows, 1, h, w).reshape(-1, 1, h, w) if mask is not None else 0.5
    img_res = img_res.reshape(b, num_flows, 3, h, w).reshape(-1, 3, h, w) if img_res is not None else 0
    img0 = torch.stack([img0] * num_flows, 1).reshape(-1, 3, h, w)
    img1 = torch.stack([img1] * num_flows, 1).reshape(-1, 3, h, w)
    mean = torch.stack([mean] * num_flows, 1).reshape(-1, 1, 1, 1) if mean is not None else 0

    img0_warp = warp(img0, flow0)
    img1_warp = warp(img1, flow1)
    img_warps = mask * img0_warp + (1 - mask) * img1_warp + mean + img_res
    img_warps = img_warps.reshape(b, num_flows, 3, h, w)
    imgt_pred = img_warps.mean(1) + comb_block(img_warps.view(b, -1, h, w))
    return imgt_pred
def bilinear_sampler(img, coords, mask=False):
    """Sample ``img`` at pixel-space ``coords`` through ``F.grid_sample``.

    The last axis of ``coords`` holds ``(x, y)`` in pixel units; both are
    rescaled to the [-1, 1] range expected by ``grid_sample`` with
    ``align_corners=True``.

    Returns:
        The sampled tensor, or ``(sampled, validity)`` when ``mask`` is
        truthy, where ``validity`` is a float mask that is 1 only for
        coordinates strictly inside the image.
    """
    height, width = img.shape[-2:]
    px, py = coords.split([1, 1], dim=-1)
    norm_x = 2 * px / (width - 1) - 1
    norm_y = 2 * py / (height - 1) - 1
    sampled = F.grid_sample(img, torch.cat([norm_x, norm_y], dim=-1), align_corners=True)
    if not mask:
        return sampled
    inside = (norm_x > -1) & (norm_y > -1) & (norm_x < 1) & (norm_y < 1)
    return sampled, inside.float()
class BasicUpdateBlock(nn.Module):
    """Update block that predicts feature and flow residuals.

    Correlation features (``2 * corr_levels * (2 * radius + 1) ** 2``
    channels) and the 4-channel flow are encoded separately, fused, joined
    with the raw flow and the ``cdim``-channel feature ``net``, and passed
    through a small conv stack (``gru``). Two heads then produce
    ``delta_net`` (``cdim`` channels) and ``delta_flow`` (``4 * out_num``
    channels). When ``scale_factor`` is set, ``net`` is resized by its
    inverse on entry and both outputs are resized back, with the flow
    magnitude multiplied by the same factor.
    """

    def __init__(
        self,
        cdim,
        hidden_dim,
        flow_dim,
        corr_dim,
        corr_dim2,
        fc_dim,
        corr_levels=4,
        radius=3,
        scale_factor=None,
        out_num=1,
    ):
        super().__init__()
        window = 2 * radius + 1
        cor_planes = corr_levels * window**2
        self.scale_factor = scale_factor

        def conv_lrelu_conv(in_ch, out_ch):
            # 3x3 conv -> LeakyReLU(0.1) -> 3x3 conv, shared by gru and both heads.
            return nn.Sequential(
                nn.Conv2d(in_ch, hidden_dim, 3, padding=1),
                nn.LeakyReLU(negative_slope=0.1, inplace=True),
                nn.Conv2d(hidden_dim, out_ch, 3, padding=1),
            )

        # Correlation encoder.
        self.convc1 = nn.Conv2d(2 * cor_planes, corr_dim, 1, padding=0)
        self.convc2 = nn.Conv2d(corr_dim, corr_dim2, 3, padding=1)
        # Flow encoder.
        self.convf1 = nn.Conv2d(4, flow_dim * 2, 7, padding=3)
        self.convf2 = nn.Conv2d(flow_dim * 2, flow_dim, 3, padding=1)
        # Fusion of both encodings.
        self.conv = nn.Conv2d(flow_dim + corr_dim2, fc_dim, 3, padding=1)

        self.gru = conv_lrelu_conv(fc_dim + 4 + cdim, hidden_dim)
        self.feat_head = conv_lrelu_conv(hidden_dim, cdim)
        self.flow_head = conv_lrelu_conv(hidden_dim, 4 * out_num)

        self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)

    def forward(self, net, flow, corr):
        """Return ``(delta_net, delta_flow)`` for the given features, flow and correlation."""
        scale = self.scale_factor
        if scale is not None:
            net = resize(net, 1 / scale)

        corr_feat = self.lrelu(self.convc2(self.lrelu(self.convc1(corr))))
        flow_feat = self.lrelu(self.convf2(self.lrelu(self.convf1(flow))))
        fused = self.lrelu(self.conv(torch.cat([corr_feat, flow_feat], dim=1)))

        hidden = self.gru(torch.cat([fused, flow, net], dim=1))
        delta_net = self.feat_head(hidden)
        delta_flow = self.flow_head(hidden)

        if scale is None:
            return delta_net, delta_flow

        delta_net = resize(delta_net, scale_factor=scale)
        delta_flow = scale * resize(delta_flow, scale_factor=scale)
        return delta_net, delta_flow
def get_world_size():
    """Find the world size without calling MPI functions.

    ``PMI_SIZE`` is consulted first, then ``OMPI_COMM_WORLD_SIZE``; a
    variable that is set but empty counts as 1. When neither is set the
    number of visible CUDA devices is returned.

    :rtype: int
    """
    for variable in ("PMI_SIZE", "OMPI_COMM_WORLD_SIZE"):
        raw = os.environ.get(variable)
        if raw is not None:
            return int(raw or 1)
    return torch.cuda.device_count()
def warp(img, flow):
    """Backward-warp ``img`` with a pixel-unit ``flow`` field.

    A normalized identity grid in [-1, 1] is built from the spatial size of
    ``flow``; channel 0 of ``flow`` is added as the x displacement and
    channel 1 as the y displacement after conversion from pixels to
    normalized units. Sampling is bilinear with border padding and
    ``align_corners=True``.
    """
    batch, _, height, width = flow.shape
    xs = torch.linspace(-1.0, 1.0, width).view(1, 1, 1, width).expand(batch, -1, height, -1)
    ys = torch.linspace(-1.0, 1.0, height).view(1, 1, height, 1).expand(batch, -1, -1, width)
    base_grid = torch.cat([xs, ys], 1).to(img)

    # Half extents convert a displacement in pixels into normalized units.
    half_w = (width - 1.0) / 2.0
    half_h = (height - 1.0) / 2.0
    dx = flow[:, 0:1, :, :] / half_w
    dy = flow[:, 1:2, :, :] / half_h

    sample_grid = (base_grid + torch.cat([dx, dy], 1)).permute(0, 2, 3, 1)
    return F.grid_sample(input=img, grid=sample_grid, mode="bilinear", padding_mode="border", align_corners=True)
def flow_uv_to_colors(u, v, convert_to_bgr=False):
    """
    Applies the flow color wheel to (possibly clipped) flow components u and v.

    The flow direction selects a position on the color wheel (linearly
    interpolated between the two neighbouring entries) and the magnitude
    controls saturation: magnitudes up to 1 fade towards white at 0, larger
    magnitudes are dimmed by a factor of 0.75.

    According to the C++ source code of Daniel Scharstein
    According to the Matlab source code of Deqing Sun

    Args:
        u (np.ndarray): Input horizontal flow of shape [H,W]
        v (np.ndarray): Input vertical flow of shape [H,W]
        convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.

    Returns:
        np.ndarray: Flow visualization image of shape [H,W,3]
    """
    image = np.zeros((u.shape[0], u.shape[1], 3), np.uint8)
    wheel = make_colorwheel()  # shape [55x3]
    n_bins = wheel.shape[0]

    magnitude = np.sqrt(np.square(u) + np.square(v))
    angle = np.arctan2(-v, -u) / np.pi
    position = (angle + 1) / 2 * (n_bins - 1)
    lower = np.floor(position).astype(np.int32)
    upper = lower + 1
    upper[upper == n_bins] = 0  # wrap around the wheel
    frac = position - lower
    in_range = magnitude <= 1

    for channel, column in enumerate(wheel.T):
        blended = (1 - frac) * (column[lower] / 255.0) + frac * (column[upper] / 255.0)
        blended = np.where(in_range, 1 - magnitude * (1 - blended), blended * 0.75)
        # Note the 2-channel => BGR instead of RGB
        target = 2 - channel if convert_to_bgr else channel
        image[:, :, target] = np.floor(255 * blended)
    return image
class AverageMeter:
    """Track the most recent value and the running weighted average of a metric.

    Attributes (all reset by ``reset``):
        val: last value passed to ``update``.
        sum: weighted sum of all values seen.
        count: total weight seen.
        avg: ``sum / count``, or 0.0 while ``count`` is still zero.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics."""
        self.val = 0.0
        self.avg = 0.0
        self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` (e.g. the batch size)."""
        self.val = val
        self.sum += val * n
        self.count += n
        # A zero-weight update on an empty meter (e.g. an empty batch) must not
        # raise ZeroDivisionError; the average simply stays at 0.0.
        self.avg = self.sum / self.count if self.count else 0.0
def read(file):
    """Load ``file`` with the reader that matches its extension.

    ``.float3`` -> ``readFloat``, ``.flo`` -> ``readFlow``,
    ``.ppm``/``.pgm``/``.png``/``.jpg`` -> ``readImage``, and ``.pfm`` ->
    the data part of ``readPFM``. Any other extension raises ``Exception``.
    """
    if file.endswith(".float3"):
        return readFloat(file)
    if file.endswith(".flo"):
        return readFlow(file)
    if file.endswith((".ppm", ".pgm", ".png", ".jpg")):
        return readImage(file)
    if file.endswith(".pfm"):
        return readPFM(file)[0]
    raise Exception("don't know how to read %s" % file)
def writePFM(file, image, scale=1):
    """Write a float32 image to ``file`` in PFM format.

    Args:
        file: Destination path.
        image (np.ndarray): float32 array of shape H x W x 3 (written with
            the colour header ``PF``), or H x W x 1 / H x W (written with
            the grayscale header ``Pf``).
        scale: Scale stored in the header; it is written negated when the
            data is little-endian, which is how PFM encodes byte order.

    Raises:
        Exception: if the dtype is not float32 or the shape is unsupported.
    """
    # Validate before opening so that bad input never truncates an existing file.
    if image.dtype.name != "float32":
        raise Exception("Image dtype must be float32.")

    if len(image.shape) == 3 and image.shape[2] == 3:
        color = True
    elif len(image.shape) == 2 or (len(image.shape) == 3 and image.shape[2] == 1):
        color = False
    else:
        raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.")

    # PFM stores rows bottom-to-top.
    image = np.flipud(image)

    endian = image.dtype.byteorder
    if endian == "<" or (endian == "=" and sys.byteorder == "little"):
        scale = -scale

    with open(file, "wb") as handle:
        # Both headers must be bytes; previously only the "Pf" branch was
        # encoded, so colour images raised TypeError on write.
        handle.write(b"PF\n" if color else b"Pf\n")
        handle.write(b"%d %d\n" % (image.shape[1], image.shape[0]))
        handle.write(b"%f\n" % scale)
        image.tofile(handle)
def writeFloat(name, data):
    """Write an array of up to three dimensions to ``name`` in the ``float`` format.

    The ASCII header is ``float``, the number of dimensions, then the sizes:
    for 1-D data just the length; otherwise ``shape[1]``, ``shape[0]`` and
    (for 3-D) ``shape[2]``. The payload is float32; 3-D data is stored with
    the last axis moved to the front.

    Raises:
        Exception: if ``data`` has more than three dimensions.
    """
    dim = len(data.shape)
    # Validate before opening so that bad input never truncates an existing file.
    if dim > 3:
        raise Exception("bad float file dimension: %d" % dim)

    header = ["float\n", "%d\n" % dim]
    if dim == 1:
        header.append("%d\n" % data.shape[0])
    else:
        header.append("%d\n" % data.shape[1])
        header.append("%d\n" % data.shape[0])
        header.extend("%d\n" % data.shape[i] for i in range(2, dim))

    data = data.astype(np.float32)

    with open(name, "wb") as f:
        f.write("".join(header).encode("ascii"))
        if dim == 3:
            np.transpose(data, (2, 0, 1)).tofile(f)
        else:
            # 1-D and 2-D data are written as-is; 1-D input used to reach the
            # 3-axis transpose and fail.
            data.tofile(f)
Install the required dependencies by following the "Data Dependencies" and "Scene Detection" sections of our [installation instructions](../../docs/installation.md).
def is_intact_video(video_path, mode="moviepy", verbose=False, logger=None):
    """Check whether the video at ``video_path`` can be opened.

    Args:
        video_path: Path of the video file.
        mode: ``"moviepy"`` opens the file with ``VideoFileClip``;
            ``"cv2"`` opens it with ``cv2.VideoCapture``.
        verbose: When true, the outcome is reported through ``print_log``.
        logger: Forwarded to ``print_log``.

    Returns:
        bool: ``True`` if the file exists and could be opened, else ``False``.

    Raises:
        ValueError: if the file exists and ``mode`` is not supported.
    """
    if not os.path.exists(video_path):
        if verbose:
            print_log(f"Could not find '{video_path}'", logger=logger)
        return False

    if mode == "moviepy":
        try:
            # Close immediately: only the ability to open matters, and an
            # unclosed clip keeps its reader alive for every checked file.
            VideoFileClip(video_path).close()
            intact = True
        except Exception as e:
            if verbose:
                print_log(f"Error: {e}", logger=logger)
                print_log(f"The video file '{video_path}' is not intact.", logger=logger)
            return False
    elif mode == "cv2":
        try:
            cap = cv2.VideoCapture(video_path)
            try:
                # An unopened capture used to fall through and return None.
                intact = bool(cap.isOpened())
            finally:
                cap.release()
        except Exception as e:
            if verbose:
                print_log(f"Error: {e}", logger=logger)
                print_log(f"The video file '{video_path}' is not intact.", logger=logger)
            return False
    else:
        raise ValueError(f"Unsupported mode: {mode!r}")

    if verbose:
        state = "intact" if intact else "not intact"
        print_log(f"The video file '{video_path}' is {state}.", logger=logger)
    return intact
def print_log(s, logger=None):
    """Send ``s`` to ``logger.info`` when a logger is given; otherwise ``print`` it."""
    if logger is None:
        print(s)
        return
    logger.info(s)
def split_video(
    video_path,
    scene_list,
    save_dir,
    min_seconds=2,
    max_seconds=15,
    target_fps=30,
    shorter_size=None,
    verbose=False,
    logger=None,
):
    """
    Cut ``video_path`` into one clip per entry of ``scene_list`` using ffmpeg.

    Scenes shorter than min_seconds will be ignored;
    scenes longer than max_seconds will be cut to keep only the first max_seconds.
    A ``None`` entry in ``scene_list`` re-encodes the whole video.
    The saved file name pattern is f'{fname}_scene-{idx}.mp4'; a clip whose
    file already exists is skipped.

    Args:
        video_path (str): source video.
        scene_list (List[Tuple[FrameTimecode, FrameTimecode] | None]): each element is (s, t): start and end of a scene.
        save_dir (str): directory the clips are written to.
        min_seconds (float | None)
        max_seconds (float | None)
        target_fps (int | None)
        shorter_size (int | None): target length of the shorter side; the aspect ratio is kept.
        verbose (bool): log every saved clip.
        logger: forwarded to ``print_log``.

    Returns:
        List[str]: paths of the clips written by this call.
    """
    FFMPEG_PATH = get_ffmpeg_exe()
    # The output stem depends only on the input, so compute it once.
    fname_wo_ext = os.path.splitext(os.path.basename(video_path))[0]

    save_path_list = []
    for idx, scene in enumerate(scene_list):
        if scene is not None:
            s, t = scene  # FrameTimecode
            if min_seconds is not None and (t - s).get_seconds() < min_seconds:
                continue

            duration = t - s
            if max_seconds is not None:
                max_duration = FrameTimecode(max_seconds, fps=s.framerate)
                duration = min(max_duration, duration)

        # TODO: fname pattern
        save_path = os.path.join(save_dir, f"{fname_wo_ext}_scene-{idx}.mp4")
        if os.path.exists(save_path):
            continue

        # Note: -ss after -i is very slow; put -ss before -i !!!
        cmd = [FFMPEG_PATH, "-nostdin", "-y"]
        if scene is None:
            cmd += ["-i", video_path]
        else:
            cmd += ["-ss", str(s.get_seconds()), "-i", video_path, "-t", str(duration.get_seconds())]

        # target fps
        if target_fps is not None:
            cmd += ["-r", f"{target_fps}"]

        # aspect ratio
        if shorter_size is not None:
            cmd += ["-vf", f"scale='if(gt(iw,ih),-2,{shorter_size})':'if(gt(iw,ih),{shorter_size},-2)'"]

        cmd += ["-map", "0:v", save_path]

        proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        if proc.returncode != 0:
            # Do not report a clip that ffmpeg failed to produce.
            print_log(f"ffmpeg exited with code {proc.returncode} for '{save_path}'", logger=logger)
            continue

        # Record the clip that was written (previously the source path was appended).
        save_path_list.append(save_path)
        if verbose:
            print_log(f"Video clip saved to '{save_path}'", logger=logger)

    return save_path_list
def process_single_row(row):
    """Run adaptive scene detection on the video at ``row["path"]``.

    Returns:
        ``(True, str(timestamps))`` on success, where ``timestamps`` is a list of
        ``(start_timecode, end_timecode)`` pairs, one per detected scene;
        ``(False, "")`` when detection raises (the error is printed).
    """
    video_path = row["path"]

    # Built outside the try block: a failure to construct the detector propagates.
    detector = AdaptiveDetector(adaptive_threshold=3.0)

    try:
        scenes = detect(video_path, detector, start_in_scene=True)
        timecodes = []
        for start, end in scenes:
            timecodes.append((start.get_timecode(), end.get_timecode()))
        return True, str(timecodes)
    except Exception as e:
        print(f"Video '{video_path}' with error {e}")
        return False, ""
def main():
    """Detect scenes for every row of the meta CSV and save ``<meta>_timestamp<ext>``.

    Rows whose detection failed are dropped from the saved file.
    """
    args = parse_args()
    meta_path = args.meta_path
    if not os.path.exists(meta_path):
        print(f"Meta file '{meta_path}' not found. Exit.")
        exit()

    # Only pass nb_workers when the user asked for a specific worker count.
    init_kwargs = {"progress_bar": True}
    if args.num_workers is not None:
        init_kwargs["nb_workers"] = args.num_workers
    pandarallel.initialize(**init_kwargs)

    meta = pd.read_csv(meta_path)
    outcomes = meta.parallel_apply(process_single_row, axis=1)

    # Each outcome is a (success, timestamp-string) pair; split them into two columns.
    succ, timestamps = zip(*outcomes)
    meta["timestamp"] = timestamps
    meta = meta[np.array(succ)]

    stem, suffix = os.path.splitext(meta_path)
    out_path = f"{stem}_timestamp{suffix}"
    meta.to_csv(out_path, index=False)
    print(f"New meta (shape={meta.shape}) with timestamp saved to '{out_path}'.")
First, install the required packages following our [installation instructions](../../docs/installation.md)'s "Data Dependencies". Next, download the scoring model to `./pretrained_models/aesthetic.pth`. ```bash wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth ``` Then, run the following command. **Make sure** the meta file has column `path` (path to the sample). ```bash torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference /path/to/meta.csv --bs 1024 --num_workers 16 ``` This will generate multiple part files, each corresponding to one process (rank). Run `python -m tools.datasets.datautil /path/to/meta_aes_part*.csv --output /path/to/meta_aes.csv` to merge them. ## Optical Flow Score Optical flow scores are used to assess the motion of a video. Higher optical flow scores indicate larger movement. We use the [UniMatch](https://github.com/autonomousvision/unimatch) model for this task. First, install the required packages following our [installation instructions](../../docs/installation.md)'s "Data Dependencies". Next, download the pretrained model to `./pretrained_models/unimatch/`. ```bash wget https://s3.eu-central-1.amazonaws.com/avg-projects/unimatch/pretrained/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth -P ./pretrained_models/unimatch/ ``` Then, run the following command. **Make sure** the meta file has column `path` (path to the sample). ```bash torchrun --standalone --nproc_per_node 8 tools/scoring/optical_flow/inference.py /path/to/meta.csv ``` This should output `/path/to/meta_flow.csv` with column `flow`. ## OCR Some videos contain dense-text scenes, such as news broadcasts and advertisements, which are not desired for training. We apply Optical Character Recognition (OCR) to detect text and drop samples with dense text. Here, we use the [DBNet++](https://arxiv.org/abs/2202.10304) model implemented by [MMOCR](https://github.com/open-mmlab/mmocr/).
First, install the required packages following our [installation instructions](../../docs/installation.md)'s "Data Dependencies" and "OCR" sections. Then, run the following command. **Make sure** the meta file has column `path` (path to the sample). ```bash torchrun --standalone --nproc_per_node 8 -m tools.scoring.ocr.inference /path/to/meta.csv ``` This should output `/path/to/meta_ocr.csv` with column `ocr`, indicating the number of text regions with detection confidence > 0.3. ## Matching Score Matching scores are calculated to evaluate the alignment between an image/video and its caption. Here, we use the [CLIP](https://github.com/openai/CLIP) model, which is trained on image-text pairs. We use the cosine similarity between the image and text features, scaled by CLIP's learned logit scale, as the matching score. For videos, we extract the middle frame and compare it with the caption. First, install OpenAI CLIP. ```bash pip install git+https://github.com/openai/CLIP.git ``` Then, run the following command. **Make sure** the meta file has columns `path` (path to the sample) and `text` (caption of the sample). ```bash torchrun --standalone --nproc_per_node 8 tools/scoring/matching/inference.py /path/to/meta.csv ``` This should output `/path/to/meta_match.csv` with column `match`. Higher matching scores indicate better image-text/video-text alignment. ## Filtering Once scores are obtained, it is simple to filter samples based on these scores. Here is an example that removes samples with an aesthetic score < 5.0.
def merge_scores(gathered_list: list, meta: pd.DataFrame, column):
    """Write gathered per-rank scores into ``meta[column]`` and keep only the scored rows.

    Args:
        gathered_list: one ``(indices, scores)`` pair per rank (parallel sequences).
        meta: frame whose rows are addressed by label via ``.loc``; modified in place.
        column: name of the column that receives the scores.

    Returns:
        The rows of ``meta`` whose index was scored, ordered by ascending index.
    """
    per_rank_indices = [pair[0] for pair in gathered_list]
    per_rank_scores = [pair[1] for pair in gathered_list]

    # Interleave the ranks round-robin; zip stops at the shortest rank's list.
    flat_indices = np.array([idx for step in zip(*per_rank_indices) for idx in step])
    flat_scores = np.array([val for step in zip(*per_rank_scores) for val in step])

    # np.unique sorts the indices and reports the first occurrence of each one,
    # so an index that appears more than once keeps its first score.
    kept, first_pos = np.unique(flat_indices, return_index=True)
    meta.loc[kept, column] = flat_scores[first_pos]

    # Rows that never received a score are dropped from the returned frame.
    return meta.loc[kept]
class MLP(nn.Module):
    """Stack of linear layers (with dropout, no activations) mapping a feature vector to one value.

    The dropout modules occupy ``nn.Sequential`` slots, so the parameters live
    under ``layers.0``, ``layers.2``, ``layers.4``, ``layers.6`` and ``layers.7``.
    """

    def __init__(self, input_size):
        super().__init__()
        self.input_size = input_size

        # Layer widths from input to output, and the dropout (if any) that follows each linear layer.
        widths = (self.input_size, 1024, 128, 64, 16, 1)
        dropouts = (0.2, 0.2, 0.1, None, None)

        modules = []
        for fan_in, fan_out, p in zip(widths, widths[1:], dropouts):
            modules.append(nn.Linear(fan_in, fan_out))
            if p is not None:
                modules.append(nn.Dropout(p))
        self.layers = nn.Sequential(*modules)

    def forward(self, x):
        """Apply the layer stack to ``x``."""
        out = self.layers(x)
        return out
not os.path.exists(meta_path): print(f"Meta file '{meta_path}' not found. Exit.") exit() wo_ext, ext = os.path.splitext(meta_path) out_path = f"{wo_ext}_aes{ext}" if args.skip_if_existing and os.path.exists(out_path): print(f"Output meta file '{out_path}' already exists. Exit.") exit() dist.init_process_group(backend="nccl", timeout=timedelta(hours=24)) torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count()) # build model device = "cuda" if torch.cuda.is_available() else "cpu" model = AestheticScorer(768, device) model.mlp.load_state_dict(torch.load("pretrained_models/aesthetic.pth", map_location=device)) preprocess = model.preprocess # build dataset dataset = VideoTextDataset(args.meta_path, transform=preprocess, num_frames=args.num_frames) dataloader = DataLoader( dataset, batch_size=args.bs, num_workers=args.num_workers, sampler=DistributedSampler( dataset, num_replicas=dist.get_world_size(), rank=dist.get_rank(), shuffle=False, drop_last=False, ), ) # compute aesthetic scores indices_list = [] scores_list = [] model.eval() for batch in tqdm(dataloader, disable=dist.get_rank() != 0): indices = batch["index"] images = batch["images"].to(device, non_blocking=True) B = images.shape[0] images = rearrange(images, "B N C H W -> (B N) C H W") # compute score with torch.no_grad(): scores = model(images) scores = rearrange(scores, "(B N) 1 -> B N", B=B) scores = scores.mean(dim=1) scores_np = scores.to(torch.float32).cpu().numpy() indices_list.extend(indices.tolist()) scores_list.extend(scores_np.tolist()) # save local results meta_local = merge_scores([(indices_list, scores_list)], dataset.meta, column="aes") save_dir_local = os.path.join(os.path.dirname(out_path), "parts") os.makedirs(save_dir_local, exist_ok=True) out_path_local = os.path.join( save_dir_local, os.path.basename(out_path).replace(".csv", f"_part_{dist.get_rank()}.csv") ) meta_local.to_csv(out_path_local, index=False) # wait for all ranks to finish data processing dist.barrier() 
def merge_scores(gathered_list: list, meta: pd.DataFrame, column):
    """Store gathered per-rank scores in ``meta[column]`` and return ``meta`` itself.

    Args:
        gathered_list: one ``(indices, scores)`` pair per rank (parallel sequences).
        meta: frame whose rows are addressed by label via ``.loc``; modified in place.
        column: name of the column that receives the scores.

    Returns:
        The same ``meta`` object; rows that were never scored are kept.
    """
    rank_indices = []
    rank_scores = []
    for indices, scores in gathered_list:
        rank_indices.append(indices)
        rank_scores.append(scores)

    # Round-robin interleave across ranks (zip truncates to the shortest rank).
    flat_indices = np.array([i for group in zip(*rank_indices) for i in group])
    flat_scores = np.array([s for group in zip(*rank_scores) for s in group])

    # De-duplicate: np.unique yields sorted indices plus the position of each first occurrence.
    unique_indices, first_positions = np.unique(flat_indices, return_index=True)
    meta.loc[unique_indices, column] = flat_scores[first_positions]
    return meta
def main():
    """Compute CLIP matching scores for every row of the meta CSV.

    Each rank scores its shard of the dataset; the ``(indices, scores)`` lists
    are gathered on all ranks and rank 0 writes ``<meta>_match<ext>`` with the
    new ``match`` column.
    """
    args = parse_args()

    meta_path = args.meta_path
    if not os.path.exists(meta_path):
        print(f"Meta file '{meta_path}' not found. Exit.")
        exit()

    wo_ext, ext = os.path.splitext(meta_path)
    out_path = f"{wo_ext}_match{ext}"
    if args.skip_if_existing and os.path.exists(out_path):
        print(f"Output meta file '{out_path}' already exists. Exit.")
        exit()

    colossalai.launch_from_torch({})

    # build model
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model, preprocess = clip.load("ViT-L/14", device=device)
    logit_scale = model.logit_scale.exp().item()

    # build dataset
    dataset = VideoTextDataset(meta_path=meta_path, transform=preprocess)
    dataloader = DataLoader(
        dataset,
        batch_size=args.bs,
        num_workers=args.num_workers,
        sampler=DistributedSampler(
            dataset,
            num_replicas=dist.get_world_size(),
            rank=dist.get_rank(),
            shuffle=False,
            drop_last=False,
        ),
    )

    # compute scores
    indices_list = []
    scores_list = []
    model.eval()
    for imgs, text, indices in tqdm(dataloader, disable=dist.get_rank() != 0):
        imgs = imgs.to(device)
        text = text.to(device)

        with torch.no_grad():
            feat_img = F.normalize(model.encode_image(imgs), dim=1)
            feat_text = F.normalize(model.encode_text(text), dim=1)
            # Cosine similarity of the normalized features, scaled by CLIP's logit scale.
            clip_scores = logit_scale * (feat_img * feat_text).sum(dim=1)

        # `indices` arrives as a tensor batch; store plain ints so the gathered
        # object holds Python numbers rather than 0-d tensors.
        indices_list.extend(indices.tolist())
        scores_list.extend(clip_scores.cpu().tolist())

    gathered_list = [None] * dist.get_world_size()
    dist.all_gather_object(gathered_list, (indices_list, scores_list))
    if dist.get_rank() == 0:
        meta_new = merge_scores(gathered_list, dataset.meta, column="match")
        meta_new.to_csv(out_path, index=False)
        print(f"New meta with matching scores saved to '{out_path}'.")
def merge_scores(gathered_list: list, meta: pd.DataFrame):
    """Write gathered per-rank scores into ``meta["ocr"]`` in place (returns ``None``).

    Args:
        gathered_list: one ``(indices, scores)`` pair per rank (parallel sequences).
        meta: frame whose rows are addressed by label via ``.loc``.
    """
    all_indices = [entry[0] for entry in gathered_list]
    all_scores = [entry[1] for entry in gathered_list]

    # Interleave the ranks round-robin; zip stops at the shortest rank's list.
    flat_indices = np.array([i for chunk in zip(*all_indices) for i in chunk])
    flat_scores = np.array([s for chunk in zip(*all_scores) for s in chunk])

    # Keep one score per index: np.unique reports the first occurrence of each.
    unique_indices, first_seen = np.unique(flat_indices, return_index=True)
    meta.loc[unique_indices, "ocr"] = flat_scores[first_seen]
def parse_args():
    """Parse the script's command line (read from ``sys.argv``) and return the namespace."""
    cli = argparse.ArgumentParser()
    cli.add_argument("meta_path", type=str, help="Path to the input CSV file")
    # Integer options: (flag, default, help text)
    for flag, default, text in (
        ("--bs", 16, "Batch size"),
        ("--num_workers", 16, "Number of workers"),
    ):
        cli.add_argument(flag, type=int, default=default, help=text)
    cli.add_argument("--skip_if_existing", action="store_true")
    return cli.parse_args()
Exit.") exit() cfg = Config.fromfile("./tools/scoring/ocr/dbnetpp.py") colossalai.launch_from_torch({}) device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") DefaultScope.get_instance("ocr", scope_name="mmocr") # use mmocr Registry as default # build model model = MODELS.build(cfg.model) model.init_weights() model.to(device) # set data_preprocessor._device print("==> Model built.") # build dataset transform = Compose(cfg.test_pipeline) dataset = VideoTextDataset(meta_path=meta_path, transform=transform) dataloader = DataLoader( dataset, batch_size=args.bs, num_workers=args.num_workers, sampler=DistributedSampler( dataset, num_replicas=dist.get_world_size(), rank=dist.get_rank(), shuffle=False, drop_last=False, ), collate_fn=default_collate, ) print("==> Dataloader built.") # compute scores dataset.meta["ocr"] = np.nan indices_list = [] scores_list = [] model.eval() for data in tqdm(dataloader, disable=dist.get_rank() != 0): indices_i = data["index"] indices_list.extend(indices_i.tolist()) del data["index"] pred = model.test_step(data) # this line will cast data to device num_texts_i = [(x.pred_instances.scores > 0.3).sum().item() for x in pred] scores_list.extend(num_texts_i) gathered_list = [None] * dist.get_world_size() dist.all_gather_object(gathered_list, (indices_list, scores_list)) if dist.get_rank() == 0: merge_scores(gathered_list, dataset.meta) dataset.meta.to_csv(out_path, index=False) print(f"New meta (shape={dataset.meta.shape}) with ocr results saved to '{out_path}'.") if __name__ == "__main__": main() ================================================ FILE: Open-Sora/tools/scoring/optical_flow/__init__.py ================================================ ================================================ FILE: Open-Sora/tools/scoring/optical_flow/inference.py ================================================ import cv2 # isort:skip import argparse import gc import os from datetime import timedelta import numpy as np import 
class VideoTextDataset(torch.utils.data.Dataset):
    """Dataset yielding a fixed set of frames per video for optical-flow scoring.

    Args:
        meta_path: CSV file; ``__getitem__`` reads its ``path`` column.
        frame_inds: indices of the frames to extract from each video.
    """

    def __init__(self, meta_path, frame_inds=(0, 10, 20, 30)):
        self.meta_path = meta_path
        self.meta = pd.read_csv(meta_path)
        # Copy into a fresh list so instances never share (or mutate) the default.
        self.frame_inds = list(frame_inds)

    def __getitem__(self, index):
        """Return ``dict(index=index, images=float tensor [N, C, 320, 576])`` for one row."""
        sample = self.meta.iloc[index]
        path = sample["path"]

        # extract frames
        images = extract_frames(path, frame_inds=self.frame_inds, backend="opencv")

        # stack into [N, C, H, W]; dtype: torch.uint8, then cast for interpolation
        images = torch.stack([pil_to_tensor(x) for x in images])
        images = images.float()

        # Swap the spatial axes of portrait clips so every sample is landscape before resizing.
        H, W = images.shape[-2:]
        if H > W:
            images = rearrange(images, "N C H W -> N C W H")
        images = F.interpolate(images, size=(320, 576), mode="bilinear", align_corners=True)

        ret = dict(index=index, images=images)
        return ret

    def __len__(self):
        return len(self.meta)
def main():
    """Compute UniMatch optical-flow scores for every row of the meta CSV.

    Each rank scores its shard, writes its own part file under ``parts/`` next
    to the output, then all ranks gather the results and rank 0 writes
    ``<meta>_flow<ext>`` with the new ``flow`` column.
    """
    args = parse_args()

    meta_path = args.meta_path
    if not os.path.exists(meta_path):
        print(f"Meta file '{meta_path}' not found. Exit.")
        exit()

    wo_ext, ext = os.path.splitext(meta_path)
    out_path = f"{wo_ext}_flow{ext}"
    if args.skip_if_existing and os.path.exists(out_path):
        print(f"Output meta file '{out_path}' already exists. Exit.")
        exit()

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    dist.init_process_group(backend="nccl", timeout=timedelta(hours=24))
    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

    # build model
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = UniMatch(
        feature_channels=128,
        num_scales=2,
        upsample_factor=4,
        num_head=1,
        ffn_dim_expansion=4,
        num_transformer_layers=6,
        reg_refine=True,
        task="flow",
    )
    # Load onto CPU first: without map_location the tensors are restored to the
    # device they were saved from, regardless of this rank's device.
    ckpt = torch.load(
        "./pretrained_models/unimatch/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth",
        map_location="cpu",
    )
    model.load_state_dict(ckpt["model"])
    model = model.to(device)

    # build dataset
    dataset = VideoTextDataset(meta_path=meta_path, frame_inds=[0, 10, 20, 30])
    dataloader = DataLoader(
        dataset,
        batch_size=args.bs,
        num_workers=args.num_workers,
        sampler=DistributedSampler(
            dataset,
            num_replicas=dist.get_world_size(),
            rank=dist.get_rank(),
            shuffle=False,
            drop_last=False,
        ),
    )

    # compute optical flow scores
    indices_list = []
    scores_list = []
    model.eval()
    for batch in tqdm(dataloader, disable=dist.get_rank() != 0):
        indices = batch["index"]
        images = batch["images"].to(device, non_blocking=True)

        B = images.shape[0]
        # Pair every frame with its successor: frames [0..N-2] vs frames [1..N-1].
        batch_0 = rearrange(images[:, :-1], "B N C H W -> (B N) C H W").contiguous()
        batch_1 = rearrange(images[:, 1:], "B N C H W -> (B N) C H W").contiguous()

        with torch.no_grad():
            res = model(
                batch_0,
                batch_1,
                attn_type="swin",
                attn_splits_list=[2, 8],
                corr_radius_list=[-1, 4],
                prop_radius_list=[-1, 1],
                num_reg_refine=6,
                task="flow",
                pred_bidir_flow=False,
            )
            flow_maps = res["flow_preds"][-1].cpu()  # [B * (N-1), 2, H, W]
            flow_maps = rearrange(flow_maps, "(B N) C H W -> B N H W C", B=B)
            # One score per sample: mean absolute flow over pairs, pixels and both components.
            flow_scores = flow_maps.abs().mean(dim=[1, 2, 3, 4])
            flow_scores = flow_scores.tolist()

        indices_list.extend(indices.tolist())
        scores_list.extend(flow_scores)

    # save local results
    meta_local = merge_scores([(indices_list, scores_list)], dataset.meta, column="flow")
    save_dir_local = os.path.join(os.path.dirname(out_path), "parts")
    os.makedirs(save_dir_local, exist_ok=True)
    # Build the part name from the split extension so each rank gets a distinct
    # file even when the meta file does not end in ".csv".
    part_stem, part_ext = os.path.splitext(os.path.basename(out_path))
    out_path_local = os.path.join(save_dir_local, f"{part_stem}_part_{dist.get_rank()}{part_ext}")
    meta_local.to_csv(out_path_local, index=False)

    # wait for all ranks to finish data processing
    dist.barrier()

    torch.cuda.empty_cache()
    gc.collect()
    gathered_list = [None] * dist.get_world_size()
    dist.all_gather_object(gathered_list, (indices_list, scores_list))
    if dist.get_rank() == 0:
        meta_new = merge_scores(gathered_list, dataset.meta, column="flow")
        meta_new.to_csv(out_path, index=False)
        print(f"New meta with optical flow scores saved to '{out_path}'.")
def single_head_full_attention_1d(
    q,
    k,
    v,
    h=None,
    w=None,
):
    """Single-head attention restricted to each image row.

    ``q``, ``k`` and ``v`` are ``[B, L, C]`` with ``L == h * w``. They are
    viewed as ``[B, H, W, C]`` so that every position attends only to the
    ``W`` positions of its own row. Returns a ``[B, H*W, C]`` tensor.
    """
    assert h is not None and w is not None
    assert q.size(1) == h * w

    batch, _, channels = q.size()

    rows_q = q.view(batch, h, w, channels)  # [B, H, W, C]
    rows_k = k.view(batch, h, w, channels)
    rows_v = v.view(batch, h, w, channels)

    # Scaled dot product between the positions of one row: [B, H, W, W]
    logits = torch.matmul(rows_q, rows_k.transpose(-1, -2)) / channels**0.5
    weights = torch.softmax(logits, dim=-1)

    return torch.matmul(weights, rows_v).view(batch, -1, channels)  # [B, H*W, C]
class SelfAttnPropagation(nn.Module):
    """Propagate a flow field with single-head self-attention over features.

    Queries and keys are derived from ``feature0``; the values are ``flow``.
    """

    def __init__(
        self,
        in_channels,
        **kwargs,
    ):
        super().__init__()

        self.q_proj = nn.Linear(in_channels, in_channels)
        self.k_proj = nn.Linear(in_channels, in_channels)

        # Xavier-initialise the weight matrices; 1-D biases keep their defaults.
        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)

    def forward(
        self,
        feature0,
        flow,
        local_window_attn=False,
        local_window_radius=1,
        **kwargs,
    ):
        """Attend globally (default) or inside a local window.

        feature0: [B, C, H, W], flow: [B, 2, H, W]. Returns a tensor shaped like ``flow``.
        """
        if local_window_attn:
            return self.forward_local_window_attn(feature0, flow, local_window_radius=local_window_radius)

        batch, channels, height, width = feature0.size()
        num_pixels = height * width

        tokens = feature0.view(batch, channels, num_pixels).permute(0, 2, 1)  # [B, H*W, C]

        # NOTE: the key is intentionally projected from the already-projected
        # query rather than from the raw tokens. The upstream authors observed
        # this while cleaning up; since both projections are linear they can be
        # merged, so it is kept as is to stay compatible with trained weights.
        query = self.q_proj(tokens)  # [B, H*W, C]
        key = self.k_proj(query)  # [B, H*W, C]

        value = flow.view(batch, flow.size(1), num_pixels).permute(0, 2, 1)  # [B, H*W, 2]

        logits = torch.matmul(query, key.permute(0, 2, 1)) / (channels**0.5)  # [B, H*W, H*W]
        weights = torch.softmax(logits, dim=-1)

        propagated = torch.matmul(weights, value)  # [B, H*W, 2]
        return propagated.view(batch, height, width, value.size(-1)).permute(0, 3, 1, 2)  # [B, 2, H, W]

    def forward_local_window_attn(
        self,
        feature0,
        flow,
        local_window_radius=1,
    ):
        """Attend only to the (2R+1) x (2R+1) neighbourhood of every pixel."""
        assert flow.size(1) == 2 or flow.size(1) == 1  # flow or disparity or depth
        assert local_window_radius > 0

        batch, channels, height, width = feature0.size()
        num_pixels = batch * height * width
        value_channels = flow.size(1)
        window = 2 * local_window_radius + 1
        window_area = window**2

        tokens = feature0.view(batch, channels, -1).permute(0, 2, 1)  # [B, H*W, C]

        # One query vector per pixel: [B*H*W, 1, C]
        queries = self.q_proj(tokens).reshape(num_pixels, 1, channels)

        # Keys are projected from the raw tokens here, then laid back out as an image
        # so that unfold can gather each pixel's neighbourhood (zero padded at borders).
        key_map = self.k_proj(tokens).permute(0, 2, 1).reshape(batch, channels, height, width)
        key_windows = F.unfold(key_map, kernel_size=window, padding=local_window_radius)  # [B, C*(2R+1)^2, H*W]
        key_windows = (
            key_windows.view(batch, channels, window_area, height, width)
            .permute(0, 3, 4, 1, 2)
            .reshape(num_pixels, channels, window_area)
        )  # [B*H*W, C, (2R+1)^2]

        value_windows = F.unfold(flow, kernel_size=window, padding=local_window_radius)  # [B, 2*(2R+1)^2, H*W]
        value_windows = (
            value_windows.view(batch, value_channels, window_area, height, width)
            .permute(0, 3, 4, 2, 1)
            .reshape(num_pixels, window_area, value_channels)
        )  # [B*H*W, (2R+1)^2, 2]

        logits = torch.matmul(queries, key_windows) / (channels**0.5)  # [B*H*W, 1, (2R+1)^2]
        weights = torch.softmax(logits, dim=-1)

        propagated = torch.matmul(weights, value_windows)
        return propagated.view(batch, height, width, value_channels).permute(0, 3, 1, 2).contiguous()  # [B, 2, H, W]
def coords_grid(b, h, w, homogeneous=False, device=None):
    """Build a batch of pixel-coordinate grids.

    Args:
        b: batch size; the same grid is repeated ``b`` times.
        h: grid height.
        w: grid width.
        homogeneous: when true, append a third channel filled with ones.
        device: optional device on which the grid is created.

    Returns:
        Float tensor of shape ``[B, 2, H, W]`` with channels ``(x, y)``, or
        ``[B, 3, H, W]`` with channels ``(x, y, 1)`` when ``homogeneous``.
    """
    # indexing="ij" is the layout the previous implicit default produced;
    # stating it explicitly avoids the torch.meshgrid deprecation warning.
    # Creating the ranges on `device` avoids a CPU build followed by a copy.
    y, x = torch.meshgrid(
        torch.arange(h, device=device),
        torch.arange(w, device=device),
        indexing="ij",
    )  # each [H, W]

    stacks = [x, y]
    if homogeneous:
        stacks.append(torch.ones_like(x))  # [H, W]

    grid = torch.stack(stacks, dim=0).float()  # [2, H, W] or [3, H, W]

    return grid[None].repeat(b, 1, 1, 1)  # [B, 2, H, W] or [B, 3, H, W]
def camera_transform(points_ref, extrinsics_ref=None, extrinsics_tgt=None, extrinsics_rel=None):
    """Move 3D points from the reference camera frame into the target camera frame.

    points_ref: [B, 3, H, W]
    extrinsics_ref, extrinsics_tgt: [B, 4, 4], only used when ``extrinsics_rel`` is None
    extrinsics_rel: [B, 4, 4] relative pose; when None it is computed as
        ``extrinsics_tgt @ inverse(extrinsics_ref)``

    Returns the transformed points as [B, 3, H, W].
    """
    batch, _, height, width = points_ref.shape

    relative_pose = extrinsics_rel
    if relative_pose is None:
        relative_pose = torch.bmm(extrinsics_tgt, torch.inverse(extrinsics_ref))  # [B, 4, 4]

    rotation = relative_pose[:, :3, :3]  # [B, 3, 3]
    translation = relative_pose[:, :3, -1:]  # [B, 3, 1], broadcast over all points
    flat_points = points_ref.view(batch, 3, -1)  # [B, 3, H*W]

    moved = torch.bmm(rotation, flat_points) + translation  # [B, 3, H*W]

    return moved.view(batch, 3, height, width)  # [B, 3, H, W]
def compute_flow_with_depth_pose(
    depth_ref, intrinsics, extrinsics_ref=None, extrinsics_tgt=None, extrinsics_rel=None, return_mask=False
):
    """Compute the flow obtained by reprojecting the reference pixels with depth and pose.

    The result is the reprojected coordinates (from ``reproject_coords``) minus
    the initial pixel grid. When ``return_mask`` is truthy the mask returned by
    ``reproject_coords`` is passed through as a second return value.
    """
    b, h, w = depth_ref.shape
    coords_init = coords_grid(b, h, w, device=depth_ref.device)  # [B, 2, H, W]

    reprojected = reproject_coords(
        depth_ref,
        intrinsics,
        extrinsics_ref,
        extrinsics_tgt,
        extrinsics_rel=extrinsics_rel,
        return_mask=return_mask,
    )

    if return_mask:
        # reproject_coords hands back a (coords, mask) pair in this mode
        reproj_coords, mask = reprojected  # [B, 2, H, W], mask
        return reproj_coords - coords_init, mask

    return reprojected - coords_init  # [B, 2, H, W]
def local_correlation_softmax(
    feature0,
    feature1,
    local_radius,
    padding_mode="zeros",
):
    """Estimate flow from a softmax over a local correlation window.

    For every pixel of ``feature0`` the features of ``feature1`` are sampled on
    a (2R+1) x (2R+1) window centred on the same pixel (R = ``local_radius``),
    correlated with the pixel's own feature, and turned into matching
    probabilities. The flow is the probability-weighted window coordinate
    minus the pixel's own coordinate.

    Args:
        feature0: feature map unpacked as (b, c, h, w).
        feature1: feature map sampled with ``F.grid_sample``.
        local_radius: window radius R.
        padding_mode: forwarded to ``F.grid_sample``.

    Returns:
        (flow, match_prob): flow is [B, 2, H, W]; match_prob is
        [B, H*W, (2R+1)^2].
    """
    b, c, h, w = feature0.size()
    coords_init = coords_grid(b, h, w).to(feature0.device)  # [B, 2, H, W]
    coords = coords_init.view(b, 2, -1).permute(0, 2, 1)  # [B, H*W, 2]

    local_h = 2 * local_radius + 1
    local_w = 2 * local_radius + 1

    # offsets of the window positions relative to the centre pixel
    window_grid = generate_window_grid(
        -local_radius, local_radius, -local_radius, local_radius, local_h, local_w, device=feature0.device
    )  # [2R+1, 2R+1, 2]
    window_grid = window_grid.reshape(-1, 2).repeat(b, 1, 1, 1)  # [B, 1, (2R+1)^2, 2]
    sample_coords = coords.unsqueeze(-2) + window_grid  # [B, H*W, (2R+1)^2, 2]

    # un-normalized (pixel-scale) coordinates, used later for the expected correspondence
    sample_coords_softmax = sample_coords

    # exclude coords that are out of image space
    valid_x = (sample_coords[:, :, :, 0] >= 0) & (sample_coords[:, :, :, 0] < w)  # [B, H*W, (2R+1)^2]
    valid_y = (sample_coords[:, :, :, 1] >= 0) & (sample_coords[:, :, :, 1] < h)  # [B, H*W, (2R+1)^2]

    valid = valid_x & valid_y  # [B, H*W, (2R+1)^2], used to mask out invalid values when softmax

    # normalize coordinates to [-1, 1]
    sample_coords_norm = normalize_coords(sample_coords, h, w)  # [-1, 1]
    window_feature = F.grid_sample(feature1, sample_coords_norm, padding_mode=padding_mode, align_corners=True).permute(
        0, 2, 1, 3
    )  # [B, H*W, C, (2R+1)^2]
    feature0_view = feature0.permute(0, 2, 3, 1).view(b, h * w, 1, c)  # [B, H*W, 1, C]

    # scaled dot product between each pixel and every feature in its window
    corr = torch.matmul(feature0_view, window_feature).view(b, h * w, -1) / (c**0.5)  # [B, H*W, (2R+1)^2]

    # mask invalid locations
    # (a large negative score drives their softmax weight towards zero)
    corr[~valid] = -1e9

    prob = F.softmax(corr, -1)  # [B, H*W, (2R+1)^2]

    # expected matching coordinate under the softmax distribution
    correspondence = (
        torch.matmul(prob.unsqueeze(-2), sample_coords_softmax).squeeze(-2).view(b, h, w, 2).permute(0, 3, 1, 2)
    )  # [B, 2, H, W]

    flow = correspondence - coords_init
    match_prob = prob

    return flow, match_prob
def global_correlation_softmax_stereo(
    feature0,
    feature1,
):
    """Estimate disparity from a row-wise global correlation softmax.

    Every pixel of ``feature0`` is correlated with all pixels of the same row
    in ``feature1``. Candidates to the right of the pixel are masked out so
    the resulting disparity is non-negative.

    Returns ``(disparity, prob)`` with disparity ``[B, 1, H, W]`` and
    prob ``[B, H, W, W]``, both at feature resolution.
    """
    b, c, h, w = feature0.shape

    columns = torch.linspace(0, w - 1, w, device=feature0.device)  # [W]

    left = feature0.permute(0, 2, 3, 1)  # [B, H, W, C]
    right = feature1.permute(0, 2, 1, 3)  # [B, H, C, W]

    correlation = torch.matmul(left, right) / (c**0.5)  # [B, H, W, W]

    # mask subsequent positions to make disparity positive
    upper = torch.triu(torch.ones((w, w)), diagonal=1).type_as(left)  # [W, W]
    allowed = (upper == 0)[None, None].repeat(b, h, 1, 1)  # [B, H, W, W]
    correlation[~allowed] = -1e9

    prob = F.softmax(correlation, dim=-1)  # [B, H, W, W]

    # expected matching column of every pixel
    expected_column = (columns.view(1, 1, 1, w) * prob).sum(-1)  # [B, H, W]

    # NOTE: unlike flow, disparity is typically positive
    disparity = columns.view(1, 1, w).repeat(b, h, 1) - expected_column  # [B, H, W]

    return disparity.unsqueeze(1), prob  # feature resolution
def correlation_softmax_depth(
    feature0,
    feature1,
    intrinsics,
    pose,
    depth_candidates,
    depth_from_argmax=False,
    pred_bidir_depth=False,
):
    """Pick a depth per pixel from a softmax over depth-candidate correlations.

    ``feature1`` is warped towards ``feature0`` once per candidate via
    ``warp_with_pose_depth_candidates`` and correlated channel-wise with
    ``feature0``. With ``pred_bidir_depth`` both directions are processed by
    stacking the swapped inputs (and the inverted pose) along the batch axis.

    Returns ``(depth, match_prob)``; ``match_prob`` is the softmax over the
    candidate dimension and ``depth`` keeps that dimension with size 1.
    """
    _, c, _, _ = feature0.size()
    assert depth_candidates.dim() == 4  # [B, D, H, W]
    scale = c**0.5

    src, tgt = feature0, feature1
    if pred_bidir_depth:
        # second half of the batch holds the reverse direction
        src = torch.cat((feature0, feature1), dim=0)
        tgt = torch.cat((feature1, feature0), dim=0)
        intrinsics = intrinsics.repeat(2, 1, 1)
        pose = torch.cat((pose, torch.inverse(pose)), dim=0)
        depth_candidates = depth_candidates.repeat(2, 1, 1, 1)

    # depth candidates are actually inverse depth
    warped_tgt = warp_with_pose_depth_candidates(
        tgt,
        intrinsics,
        pose,
        1.0 / depth_candidates,
    )  # [B, C, D, H, W]

    correlation = (src.unsqueeze(2) * warped_tgt).sum(1) / scale  # [B, D, H, W]
    match_prob = F.softmax(correlation, dim=1)  # [B, D, H, W]

    # for cross-task transfer (flow -> depth), extract depth with argmax at test time
    if depth_from_argmax:
        best = torch.argmax(match_prob, dim=1, keepdim=True)
        return torch.gather(depth_candidates, dim=1, index=best), match_prob

    # otherwise take the probability-weighted average of the candidates
    depth = (match_prob * depth_candidates).sum(dim=1, keepdim=True)  # [B, 1, H, W]
    return depth, match_prob
class PositionEmbeddingSine(nn.Module):
    """
    Sinusoidal 2D position embedding, in the spirit of the encoding from
    "Attention is all you need" but generalized to image grids.

    The output has ``2 * num_pos_feats`` channels: the first half encodes the
    row position, the second half the column position.
    """

    def __init__(self, num_pos_feats=64, temperature=10000, normalize=True, scale=None):
        super().__init__()
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        self.scale = 2 * math.pi if scale is None else scale

    def forward(self, x):
        """Return the embedding for a ``[B, C, H, W]`` input as ``[B, 2*num_pos_feats, H, W]``."""
        batch, _, height, width = x.size()

        # No padding mask is used here: every location counts as valid.
        ones = torch.ones((batch, height, width), device=x.device)  # [B, H, W]
        row_pos = ones.cumsum(1, dtype=torch.float32)  # 1..H down the rows
        col_pos = ones.cumsum(2, dtype=torch.float32)  # 1..W along the columns

        if self.normalize:
            eps = 1e-6
            row_pos = row_pos / (row_pos[:, -1:, :] + eps) * self.scale
            col_pos = col_pos / (col_pos[:, :, -1:] + eps) * self.scale

        freq_index = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        divisor = self.temperature ** (2 * (freq_index // 2) / self.num_pos_feats)

        def encode(position):
            # sin on even feature slots, cos on odd ones, interleaved back together
            angles = position[:, :, :, None] / divisor
            pair = (angles[:, :, :, 0::2].sin(), angles[:, :, :, 1::2].cos())
            return torch.stack(pair, dim=4).flatten(3)

        return torch.cat((encode(row_pos), encode(col_pos)), dim=3).permute(0, 3, 1, 2)
class SepConvGRU(nn.Module):
    """Convolutional GRU whose update is split into a horizontal and a vertical pass.

    The first pass uses (1 x k) convolutions, the second (k x 1) convolutions;
    each pass is a full GRU update of the hidden state ``h`` given input ``x``.
    """

    def __init__(
        self,
        hidden_dim=128,
        input_dim=192 + 128,
        kernel_size=5,
    ):
        super().__init__()
        pad = (kernel_size - 1) // 2
        in_channels = hidden_dim + input_dim  # gates see [h, x] concatenated on channels

        horizontal = {"kernel_size": (1, kernel_size), "padding": (0, pad)}
        vertical = {"kernel_size": (kernel_size, 1), "padding": (pad, 0)}

        self.convz1 = nn.Conv2d(in_channels, hidden_dim, **horizontal)
        self.convr1 = nn.Conv2d(in_channels, hidden_dim, **horizontal)
        self.convq1 = nn.Conv2d(in_channels, hidden_dim, **horizontal)

        self.convz2 = nn.Conv2d(in_channels, hidden_dim, **vertical)
        self.convr2 = nn.Conv2d(in_channels, hidden_dim, **vertical)
        self.convq2 = nn.Conv2d(in_channels, hidden_dim, **vertical)

    @staticmethod
    def _gru_pass(h, x, conv_z, conv_r, conv_q):
        """One GRU update of ``h`` using the given update/reset/candidate convolutions."""
        hx = torch.cat([h, x], dim=1)
        update = torch.sigmoid(conv_z(hx))
        reset = torch.sigmoid(conv_r(hx))
        candidate = torch.tanh(conv_q(torch.cat([reset * h, x], dim=1)))
        return (1 - update) * h + update * candidate

    def forward(self, h, x):
        # horizontal
        h = self._gru_pass(h, x, self.convz1, self.convr1, self.convq1)
        # vertical
        return self._gru_pass(h, x, self.convz2, self.convr2, self.convq2)
F.relu(self.convf2(flo)) cor_flo = torch.cat([cor, flo], dim=1) out = F.relu(self.conv(cor_flo)) return torch.cat([out, flow], dim=1) class BasicUpdateBlock(nn.Module): def __init__( self, corr_channels=324, hidden_dim=128, context_dim=128, downsample_factor=8, flow_dim=2, bilinear_up=False, ): super(BasicUpdateBlock, self).__init__() self.encoder = BasicMotionEncoder( corr_channels=corr_channels, flow_channels=flow_dim, ) self.gru = SepConvGRU(hidden_dim=hidden_dim, input_dim=context_dim + hidden_dim) self.flow_head = FlowHead( hidden_dim, hidden_dim=256, out_dim=flow_dim, ) if bilinear_up: self.mask = None else: self.mask = nn.Sequential( nn.Conv2d(hidden_dim, 256, 3, padding=1), nn.ReLU(inplace=True), nn.Conv2d(256, downsample_factor**2 * 9, 1, padding=0), ) def forward(self, net, inp, corr, flow): motion_features = self.encoder(flow, corr) inp = torch.cat([inp, motion_features], dim=1) net = self.gru(net, inp) delta_flow = self.flow_head(net) if self.mask is not None: mask = self.mask(net) else: mask = None return net, mask, delta_flow ================================================ FILE: Open-Sora/tools/scoring/optical_flow/unimatch/transformer.py ================================================ import torch import torch.nn as nn from .attention import ( single_head_full_attention, single_head_full_attention_1d, single_head_split_window_attention, single_head_split_window_attention_1d, ) from .utils import generate_shift_window_attn_mask, generate_shift_window_attn_mask_1d class TransformerLayer(nn.Module): def __init__( self, d_model=128, nhead=1, no_ffn=False, ffn_dim_expansion=4, ): super(TransformerLayer, self).__init__() self.dim = d_model self.nhead = nhead self.no_ffn = no_ffn # multi-head attention self.q_proj = nn.Linear(d_model, d_model, bias=False) self.k_proj = nn.Linear(d_model, d_model, bias=False) self.v_proj = nn.Linear(d_model, d_model, bias=False) self.merge = nn.Linear(d_model, d_model, bias=False) self.norm1 = nn.LayerNorm(d_model) # no 
ffn after self-attn, with ffn after cross-attn if not self.no_ffn: in_channels = d_model * 2 self.mlp = nn.Sequential( nn.Linear(in_channels, in_channels * ffn_dim_expansion, bias=False), nn.GELU(), nn.Linear(in_channels * ffn_dim_expansion, d_model, bias=False), ) self.norm2 = nn.LayerNorm(d_model) def forward( self, source, target, height=None, width=None, shifted_window_attn_mask=None, shifted_window_attn_mask_1d=None, attn_type="swin", with_shift=False, attn_num_splits=None, ): # source, target: [B, L, C] query, key, value = source, target, target # for stereo: 2d attn in self-attn, 1d attn in cross-attn is_self_attn = (query - key).abs().max() < 1e-6 # single-head attention query = self.q_proj(query) # [B, L, C] key = self.k_proj(key) # [B, L, C] value = self.v_proj(value) # [B, L, C] if attn_type == "swin" and attn_num_splits > 1: # self, cross-attn: both swin 2d if self.nhead > 1: # we observe that multihead attention slows down the speed and increases the memory consumption # without bringing obvious performance gains and thus the implementation is removed raise NotImplementedError else: message = single_head_split_window_attention( query, key, value, num_splits=attn_num_splits, with_shift=with_shift, h=height, w=width, attn_mask=shifted_window_attn_mask, ) elif attn_type == "self_swin2d_cross_1d": # self-attn: swin 2d, cross-attn: full 1d if self.nhead > 1: raise NotImplementedError else: if is_self_attn: if attn_num_splits > 1: message = single_head_split_window_attention( query, key, value, num_splits=attn_num_splits, with_shift=with_shift, h=height, w=width, attn_mask=shifted_window_attn_mask, ) else: # full 2d attn message = single_head_full_attention(query, key, value) # [N, L, C] else: # cross attn 1d message = single_head_full_attention_1d( query, key, value, h=height, w=width, ) elif attn_type == "self_swin2d_cross_swin1d": # self-attn: swin 2d, cross-attn: swin 1d if self.nhead > 1: raise NotImplementedError else: if is_self_attn: if 
class TransformerBlock(nn.Module):
    """Self-attention layer (no FFN) followed by a cross-attention layer with FFN."""

    def __init__(
        self,
        d_model=128,
        nhead=1,
        ffn_dim_expansion=4,
    ):
        super().__init__()

        layer_kwargs = dict(d_model=d_model, nhead=nhead, ffn_dim_expansion=ffn_dim_expansion)
        # First layer: attention only; second layer: attention + FFN (the
        # TransformerLayer default).
        self.self_attn = TransformerLayer(no_ffn=True, **layer_kwargs)
        self.cross_attn_ffn = TransformerLayer(**layer_kwargs)

    def forward(
        self,
        source,
        target,
        height=None,
        width=None,
        shifted_window_attn_mask=None,
        shifted_window_attn_mask_1d=None,
        attn_type="swin",
        with_shift=False,
        attn_num_splits=None,
    ):
        """Attend ``source`` to itself, then to ``target``; return the updated ``source``.

        ``source`` and ``target`` are [B, L, C] per the original inline note.
        """
        # Arguments that both layers receive unchanged.
        shared = dict(
            height=height,
            width=width,
            shifted_window_attn_mask=shifted_window_attn_mask,
            attn_type=attn_type,
            with_shift=with_shift,
            attn_num_splits=attn_num_splits,
        )

        # Self-attention: the 1d mask is deliberately not forwarded here.
        source = self.self_attn(source, source, **shared)

        # Cross-attention + FFN: additionally receives the 1d mask.
        return self.cross_attn_ffn(
            source,
            target,
            shifted_window_attn_mask_1d=shifted_window_attn_mask_1d,
            **shared,
        )
class MultiScaleTridentConv(nn.Module):
    """Convolution whose single weight tensor is shared by several branches.

    ``forward`` takes a list of inputs and returns a list of outputs.  Every
    branch uses the same weight, bias, ``dilation`` and ``groups`` but its own
    stride and padding taken from ``strides`` / ``paddings``.

    Args:
        in_channels, out_channels, kernel_size, groups, bias: shape the shared
            weight / bias exactly as the corresponding ``nn.Conv2d`` arguments.
        stride: stored as ``self.stride``; ``forward`` does not read it.
        strides: int (broadcast to every branch) or per-branch sequence.
        paddings: int (broadcast to every branch) or per-branch sequence.
        dilations: stored as ``self.dilations``; ``forward`` does not read it.
        dilation: dilation applied in every branch.
        num_branch: number of branches.
        test_branch_idx: ``-1`` runs every branch in eval mode; any other value
            selects the single branch whose stride/padding is used in eval
            mode (training always runs every branch).
        norm, activation: optional callables applied to each output.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        strides=1,
        paddings=0,
        dilations=1,
        dilation=1,
        groups=1,
        num_branch=1,
        test_branch_idx=-1,
        bias=False,
        norm=None,
        activation=None,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = _pair(kernel_size)
        self.num_branch = num_branch
        self.stride = _pair(stride)
        self.groups = groups
        self.with_bias = bias
        self.dilation = dilation

        # Scalars are broadcast to one entry per branch.
        if isinstance(paddings, int):
            paddings = [paddings] * self.num_branch
        if isinstance(dilations, int):
            dilations = [dilations] * self.num_branch
        if isinstance(strides, int):
            strides = [strides] * self.num_branch
        self.paddings = [_pair(p) for p in paddings]
        self.dilations = [_pair(d) for d in dilations]
        self.strides = [_pair(s) for s in strides]

        self.test_branch_idx = test_branch_idx
        self.norm = norm
        self.activation = activation

        # Per-branch lists must all have exactly num_branch entries.
        assert len({self.num_branch, len(self.paddings), len(self.strides)}) == 1

        # torch.empty: storage is fully overwritten by the initialisers below.
        self.weight = nn.Parameter(torch.empty(out_channels, in_channels // groups, *self.kernel_size))
        self.bias = nn.Parameter(torch.empty(out_channels)) if bias else None

        nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")
        if self.bias is not None:
            nn.init.constant_(self.bias, 0)

    def forward(self, inputs):
        """Apply the shared convolution to ``inputs`` (a list of tensors).

        In training, or when ``test_branch_idx == -1``, ``inputs`` must hold
        one tensor per branch.  Otherwise it must hold exactly one tensor,
        which is convolved with the stride/padding of branch
        ``test_branch_idx``.

        Returns:
            list of output tensors, one per processed input.
        """
        run_all_branches = self.training or self.test_branch_idx == -1
        expected_inputs = self.num_branch if run_all_branches else 1
        assert len(inputs) == expected_inputs

        if run_all_branches:
            outputs = [
                F.conv2d(x, self.weight, self.bias, stride, padding, self.dilation, self.groups)
                for x, stride, padding in zip(inputs, self.strides, self.paddings)
            ]
        else:
            # Previously this path always used the LAST branch's stride and
            # padding (its `idx == -1` guard can never be true here), which
            # ignored test_branch_idx.  Index by the requested branch instead.
            branch = self.test_branch_idx
            outputs = [
                F.conv2d(
                    inputs[0],
                    self.weight,
                    self.bias,
                    self.strides[branch],
                    self.paddings[branch],
                    self.dilation,
                    self.groups,
                )
            ]

        if self.norm is not None:
            outputs = [self.norm(x) for x in outputs]
        if self.activation is not None:
            outputs = [self.activation(x) for x in outputs]
        return outputs
def extract_feature(self, img0, img1):
    """Run the backbone once on both images and split the result per image.

    The two images are stacked along the batch dimension, passed through
    ``self.backbone`` (which returns a list of feature maps), the list is
    reversed, and every map is split back into its two batch halves.

    Returns:
        ``(feature0, feature1)``: two lists, one entry per backbone output,
        in reversed backbone order.
    """
    stacked = torch.cat((img0, img1), dim=0)  # [2B, C, H, W]

    # Backbone order is high -> low resolution (per the original note);
    # reverse so the lists run low -> high.
    pyramid = self.backbone(stacked)[::-1]

    feature0, feature1 = [], []
    for level in pyramid:
        halves = torch.chunk(level, 2, 0)  # tuple of batch halves
        feature0.append(halves[0])
        feature1.append(halves[1])

    return feature0, feature1
mode="bilinear", align_corners=True) * multiplier ) else: concat = torch.cat((flow, feature), dim=1) mask = self.upsampler(concat) up_flow = upsample_flow_with_mask(flow, mask, upsample_factor=self.upsample_factor, is_depth=is_depth) return up_flow def forward( self, img0, img1, attn_type=None, attn_splits_list=None, corr_radius_list=None, prop_radius_list=None, num_reg_refine=1, pred_bidir_flow=False, task="flow", intrinsics=None, pose=None, # relative pose transform min_depth=1.0 / 0.5, # inverse depth range max_depth=1.0 / 10, num_depth_candidates=64, depth_from_argmax=False, pred_bidir_depth=False, **kwargs, ): if pred_bidir_flow: assert task == "flow" if task == "depth": assert self.num_scales == 1 # multi-scale depth model is not supported yet results_dict = {} flow_preds = [] if task == "flow": # stereo and depth tasks have normalized img in dataloader img0, img1 = normalize_img(img0, img1) # [B, 3, H, W] # list of features, resolution low to high feature0_list, feature1_list = self.extract_feature(img0, img1) # list of features flow = None if task != "depth": assert len(attn_splits_list) == len(corr_radius_list) == len(prop_radius_list) == self.num_scales else: assert len(attn_splits_list) == len(prop_radius_list) == self.num_scales == 1 for scale_idx in range(self.num_scales): feature0, feature1 = feature0_list[scale_idx], feature1_list[scale_idx] if pred_bidir_flow and scale_idx > 0: # predicting bidirectional flow with refinement feature0, feature1 = torch.cat((feature0, feature1), dim=0), torch.cat((feature1, feature0), dim=0) feature0_ori, feature1_ori = feature0, feature1 upsample_factor = self.upsample_factor * (2 ** (self.num_scales - 1 - scale_idx)) if task == "depth": # scale intrinsics intrinsics_curr = intrinsics.clone() intrinsics_curr[:, :2] = intrinsics_curr[:, :2] / upsample_factor if scale_idx > 0: assert task != "depth" # not supported for multi-scale depth model flow = F.interpolate(flow, scale_factor=2, mode="bilinear", 
align_corners=True) * 2 if flow is not None: assert task != "depth" flow = flow.detach() if task == "stereo": # construct flow vector for disparity # flow here is actually disparity zeros = torch.zeros_like(flow) # [B, 1, H, W] # NOTE: reverse disp, disparity is positive displace = torch.cat((-flow, zeros), dim=1) # [B, 2, H, W] feature1 = flow_warp(feature1, displace) # [B, C, H, W] elif task == "flow": feature1 = flow_warp(feature1, flow) # [B, C, H, W] else: raise NotImplementedError attn_splits = attn_splits_list[scale_idx] if task != "depth": corr_radius = corr_radius_list[scale_idx] prop_radius = prop_radius_list[scale_idx] # add position to features feature0, feature1 = feature_add_position(feature0, feature1, attn_splits, self.feature_channels) # Transformer feature0, feature1 = self.transformer( feature0, feature1, attn_type=attn_type, attn_num_splits=attn_splits, ) # correlation and softmax if task == "depth": # first generate depth candidates b, _, h, w = feature0.size() depth_candidates = torch.linspace(min_depth, max_depth, num_depth_candidates).type_as(feature0) depth_candidates = depth_candidates.view(1, num_depth_candidates, 1, 1).repeat( b, 1, h, w ) # [B, D, H, W] flow_pred = correlation_softmax_depth( feature0, feature1, intrinsics_curr, pose, depth_candidates=depth_candidates, depth_from_argmax=depth_from_argmax, pred_bidir_depth=pred_bidir_depth, )[0] else: if corr_radius == -1: # global matching if task == "flow": flow_pred = global_correlation_softmax(feature0, feature1, pred_bidir_flow)[0] elif task == "stereo": flow_pred = global_correlation_softmax_stereo(feature0, feature1)[0] else: raise NotImplementedError else: # local matching if task == "flow": flow_pred = local_correlation_softmax(feature0, feature1, corr_radius)[0] elif task == "stereo": flow_pred = local_correlation_softmax_stereo(feature0, feature1, corr_radius)[0] else: raise NotImplementedError # flow or residual flow flow = flow + flow_pred if flow is not None else flow_pred 
if task == "stereo": flow = flow.clamp(min=0) # positive disparity # upsample to the original resolution for supervison at training time only if self.training: flow_bilinear = self.upsample_flow( flow, None, bilinear=True, upsample_factor=upsample_factor, is_depth=task == "depth" ) flow_preds.append(flow_bilinear) # flow propagation with self-attn if (pred_bidir_flow or pred_bidir_depth) and scale_idx == 0: feature0 = torch.cat((feature0, feature1), dim=0) # [2*B, C, H, W] for propagation flow = self.feature_flow_attn( feature0, flow.detach(), local_window_attn=prop_radius > 0, local_window_radius=prop_radius, ) # bilinear exclude the last one if self.training and scale_idx < self.num_scales - 1: flow_up = self.upsample_flow( flow, feature0, bilinear=True, upsample_factor=upsample_factor, is_depth=task == "depth" ) flow_preds.append(flow_up) if scale_idx == self.num_scales - 1: if not self.reg_refine: # upsample to the original image resolution if task == "stereo": flow_pad = torch.cat((-flow, torch.zeros_like(flow)), dim=1) # [B, 2, H, W] flow_up_pad = self.upsample_flow(flow_pad, feature0) flow_up = -flow_up_pad[:, :1] # [B, 1, H, W] elif task == "depth": depth_pad = torch.cat((flow, torch.zeros_like(flow)), dim=1) # [B, 2, H, W] depth_up_pad = self.upsample_flow(depth_pad, feature0, is_depth=True).clamp( min=min_depth, max=max_depth ) flow_up = depth_up_pad[:, :1] # [B, 1, H, W] else: flow_up = self.upsample_flow(flow, feature0) flow_preds.append(flow_up) else: # task-specific local regression refinement # supervise current flow if self.training: flow_up = self.upsample_flow( flow, feature0, bilinear=True, upsample_factor=upsample_factor, is_depth=task == "depth" ) flow_preds.append(flow_up) assert num_reg_refine > 0 for refine_iter_idx in range(num_reg_refine): flow = flow.detach() if task == "stereo": zeros = torch.zeros_like(flow) # [B, 1, H, W] # NOTE: reverse disp, disparity is positive displace = torch.cat((-flow, zeros), dim=1) # [B, 2, H, W] correlation 
= local_correlation_with_flow( feature0_ori, feature1_ori, flow=displace, local_radius=4, ) # [B, (2R+1)^2, H, W] elif task == "depth": if pred_bidir_depth and refine_iter_idx == 0: intrinsics_curr = intrinsics_curr.repeat(2, 1, 1) pose = torch.cat((pose, torch.inverse(pose)), dim=0) feature0_ori, feature1_ori = torch.cat((feature0_ori, feature1_ori), dim=0), torch.cat( (feature1_ori, feature0_ori), dim=0 ) flow_from_depth = compute_flow_with_depth_pose( 1.0 / flow.squeeze(1), intrinsics_curr, extrinsics_rel=pose, ) correlation = local_correlation_with_flow( feature0_ori, feature1_ori, flow=flow_from_depth, local_radius=4, ) # [B, (2R+1)^2, H, W] else: correlation = local_correlation_with_flow( feature0_ori, feature1_ori, flow=flow, local_radius=4, ) # [B, (2R+1)^2, H, W] proj = self.refine_proj(feature0) net, inp = torch.chunk(proj, chunks=2, dim=1) net = torch.tanh(net) inp = torch.relu(inp) net, up_mask, residual_flow = self.refine( net, inp, correlation, flow.clone(), ) if task == "depth": flow = (flow - residual_flow).clamp(min=min_depth, max=max_depth) else: flow = flow + residual_flow if task == "stereo": flow = flow.clamp(min=0) # positive if self.training or refine_iter_idx == num_reg_refine - 1: if task == "depth": if refine_iter_idx < num_reg_refine - 1: # bilinear upsampling flow_up = self.upsample_flow( flow, feature0, bilinear=True, upsample_factor=upsample_factor, is_depth=True ) else: # last one convex upsampling # NOTE: clamp depth due to the zero padding in the unfold in the convex upsampling # pad depth to 2 channels as flow depth_pad = torch.cat((flow, torch.zeros_like(flow)), dim=1) # [B, 2, H, W] depth_up_pad = self.upsample_flow(depth_pad, feature0, is_depth=True).clamp( min=min_depth, max=max_depth ) flow_up = depth_up_pad[:, :1] # [B, 1, H, W] else: flow_up = upsample_flow_with_mask( flow, up_mask, upsample_factor=self.upsample_factor, is_depth=task == "depth" ) flow_preds.append(flow_up) if task == "stereo": for i in 
def generate_window_grid(h_min, h_max, w_min, w_max, len_h, len_w, device=None):
    """Build a regular 2-D coordinate grid.

    Args:
        h_min, h_max: first and last coordinate along the height axis.
        w_min, w_max: first and last coordinate along the width axis.
        len_h, len_w: number of samples along height / width.
        device: device for the result; required (must not be ``None``).

    Returns:
        Float tensor of shape ``[len_h, len_w, 2]`` where ``[..., 0]`` is the
        width (x) coordinate and ``[..., 1]`` is the height (y) coordinate.
    """
    assert device is not None

    xs = torch.linspace(w_min, w_max, len_w, device=device)
    ys = torch.linspace(h_min, h_max, len_h, device=device)

    # "ij" is the layout the un-annotated call has always produced; passing it
    # explicitly silences the deprecation warning and protects against the
    # announced change of the default.
    x, y = torch.meshgrid(xs, ys, indexing="ij")  # each [len_w, len_h]

    grid = torch.stack((x, y), -1).transpose(0, 1).float()  # [H, W, 2]
    return grid
def merge_splits(
    splits,
    num_splits=2,
    channel_last=False,
):
    """Reassemble ``num_splits x num_splits`` windows stacked in the batch dim.

    Args:
        splits: ``[B*K*K, C, H/K, W/K]`` (or ``[B*K*K, H/K, W/K, C]`` when
            ``channel_last``), with ``K = num_splits``.
        num_splits: number of windows along each spatial axis.
        channel_last: whether the channel dimension is last.

    Returns:
        ``[B, C, H, W]`` (or ``[B, H, W, C]`` when ``channel_last``).
    """
    k = num_splits

    if channel_last:
        total, win_h, win_w, channels = splits.size()
        batch = total // k // k
        # [B, K, K, h, w, C] -> [B, K, h, K, w, C] -> [B, K*h, K*w, C]
        tiles = splits.view(batch, k, k, win_h, win_w, channels)
        return tiles.permute(0, 1, 3, 2, 4, 5).reshape(batch, k * win_h, k * win_w, channels)

    total, channels, win_h, win_w = splits.size()
    batch = total // k // k
    # [B, K, K, C, h, w] -> [B, C, K, h, K, w] -> [B, C, K*h, K*w]
    tiles = splits.view(batch, k, k, channels, win_h, win_w)
    return tiles.permute(0, 3, 1, 4, 2, 5).reshape(batch, channels, k * win_h, k * win_w)
def upsample_flow_with_mask(flow, up_mask, upsample_factor, is_depth=False):
    """Convex upsampling: each fine pixel is a softmax-weighted mix of a 3x3 coarse patch.

    Args:
        flow: ``[B, C, H, W]`` coarse field.
        up_mask: tensor viewable as ``[B, 1, 9, K, K, H, W]`` with
            ``K = upsample_factor``; softmax is taken over the 9 neighbours.
        upsample_factor: integer factor ``K``.
        is_depth: when False the values are multiplied by ``K`` before
            mixing; when True they are left unscaled.

    Returns:
        ``[B, C, K*H, K*W]`` upsampled field.
    """
    batch, channels, height, width = flow.shape
    k = upsample_factor

    # Normalised mixing weights over the 9 neighbours of every coarse pixel.
    weights = up_mask.view(batch, 1, 9, k, k, height, width).softmax(dim=2)

    scale = 1 if is_depth else k
    # 3x3 neighbourhoods (zero padded at the border): [B, C, 9, 1, 1, H, W]
    patches = F.unfold(scale * flow, [3, 3], padding=1).view(batch, channels, 9, 1, 1, height, width)

    mixed = (weights * patches).sum(dim=2)  # [B, C, K, K, H, W]
    interleaved = mixed.permute(0, 1, 4, 2, 5, 3)  # [B, C, H, K, W, K]
    return interleaved.reshape(batch, channels, k * height, k * width)
def generate_shift_window_attn_mask_1d(input_w, window_size_w, shift_size_w, device=torch.device("cuda")):
    """Build the additive attention mask for shifted 1-D windows.

    Positions along the width are labelled with one of three region ids, the
    labelled row is cut into windows of ``window_size_w`` and, inside every
    window, pairs of positions with different ids get ``-100.0`` while pairs
    with the same id get ``0.0``.

    Returns:
        Tensor of shape ``[num_windows, window_size_w, window_size_w]``.
    """
    region_ids = torch.zeros((1, input_w, 1)).to(device)  # [1, W, 1]

    # Three width ranges: everything before the last window, the unshifted
    # part of the last window, and the shifted tail.
    regions = (
        slice(0, -window_size_w),
        slice(-window_size_w, -shift_size_w),
        slice(-shift_size_w, None),
    )
    for label, region in enumerate(regions):
        region_ids[:, region, :] = label

    windows = window_partition_1d(region_ids, window_size_w).view(-1, window_size_w)

    # Pairwise id difference inside each window; non-zero means the two
    # positions belong to different regions.
    diff = windows.unsqueeze(1) - windows.unsqueeze(2)
    return diff.masked_fill(diff != 0, float(-100.0)).masked_fill(diff == 0, float(0.0))

###
👉 PixArt-α: Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis
###
ICLR 2024 Spotlight
--- This repo contains PyTorch model definitions, pre-trained weights and inference/sampling code for our paper exploring Fast training diffusion models with transformers. You can find more visualizations on our [project page](https://pixart-alpha.github.io/). **PixArt-α Community**: Join our PixArt-α discord channels for discussions. Coders are welcome to contribute. > [**PixArt-α: Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis**](https://pixart-alpha.github.io/)
> [Junsong Chen*](https://lawrence-cj.github.io/), [Jincheng Yu*](https://lovesykun.cn/about.html), > [Chongjian Ge*](https://chongjiange.github.io/), [Lewei Yao*](https://scholar.google.com/citations?user=hqDyTg8AAAAJ&hl=zh-CN&oi=ao), > [Enze Xie](https://xieenze.github.io/)†, > [Yue Wu](https://yuewuhkust.github.io/), [Zhongdao Wang](https://zhongdao.github.io/), > [James Kwok](https://www.cse.ust.hk/~jamesk/), [Ping Luo](http://luoping.me/), > [Huchuan Lu](https://scholar.google.com/citations?hl=en&user=D3nE0agAAAAJ), > [Zhenguo Li](https://scholar.google.com/citations?user=XboZC1AAAAAJ) >
Huawei Noah’s Ark Lab, Dalian University of Technology, HKU, HKUST
> [**PIXART-δ: Fast and Controllable Image Generation with Latent Consistency Models**](https://pixart-alpha.github.io/)
> [Junsong Chen](https://lawrence-cj.github.io/), [Yue Wu](https://yuewuhkust.github.io/), [Simian Luo](https://luosiallen.github.io/), [Enze Xie](https://xieenze.github.io/)†, > [Sayak Paul](https://sayak.dev/), [Ping Luo](http://luoping.me/), [Hang Zhao](), [Zhenguo Li](https://scholar.google.com/citations?user=XboZC1AAAAAJ) >
Huawei Noah’s Ark Lab, DLUT, Tsinghua University, HKU, Hugging Face
--- ## Breaking News 🔥🔥!! - (🔥 New) Apr. 12, 2024. 💥 A better version of [PixArt-Σ](https://github.com/PixArt-alpha/PixArt-sigma) training & inference code, checkpoints are all released!!! Welcome to collaborate and contribute. Star 🌟us if you think it is helpful!! - (🔥 New) Jan. 19, 2024. 💥 [PixArt-δ](https://arxiv.org/abs/2401.05252) ControlNet [app_controlnet.py](app/app_controlnet.py) and [Checkpoint](https://huggingface.co/PixArt-alpha/PixArt-ControlNet/tree/main) are released!!! - (🔥 New) Jan. 16, 2024. 💥 Glad to announce that [PixArt-α](https://arxiv.org/abs/2310.00426) is accepted by ICLR 2024 (Spotlight). - (🔥 New) Dec. 17, 2023. 💥 PixArt supports [ComfyUI](https://github.com/comfyanonymous/ComfyUI#manual-install-windows-linux). Thanks to [@city96](https://github.com/city96/ComfyUI_ExtraModels) with his great work. - (🔥 New) Nov. 30, 2023. 💥 PixArt collaborates with [LCMs](https://github.com/luosiallen/latent-consistency-model) team to make the **fastest** [Training & Inference Text-to-Image Generation System](https://github.com/PixArt-alpha/PixArt-alpha). Here, [Training code](train_scripts/train_pixart_lcm.py) & [Inference code](scripts/inference_lcm.py) & [Weights](https://huggingface.co/PixArt-alpha/PixArt-LCM-XL-2-1024-MS) & [HF Demo](https://huggingface.co/spaces/PixArt-alpha/PixArt-LCM) [OpenXLab Demo](https://openxlab.org.cn/apps/detail/houshaowei/PixArt-LCM) are all released, we hope users will enjoy them. Detailed **inference speed** and **code guidance** can be found in [docs](asset/docs/pixart_lcm.md). At the same time, we update the codebase for better user experience and fix some bugs in the newest version. --- ## 🚩 **New Features/Updates** - ✅ Jan. 11, 2024. 💥 [PixArt-δ](https://arxiv.org/abs/2401.05252): We are excited to announce the release of the [PixArt-δ](https://arxiv.org/abs/2401.05252) technical report!!! This report offers valuable insights into the training of LCM and ControlNet-like modules in Transformer Models. 
Along with the report, we have also released all the training and inference code for LCM & ControlNet [in this repository](https://github.com/PixArt-alpha/PixArt-alpha). We encourage you to try them out and warmly welcome any Pull Requests from our users. Your contributions and feedback are highly appreciated! - ✅ Feb. 07, 2024. [train_diffusers.py](train_scripts/train_diffusers.py) can directly train with a diffusers model and visualize during training. - ✅ Jan. 26, 2024. 💥 All checkpoints of [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha), including the 256px checkpoints, are available here: [Download Models](#-download-models). - ✅ Jan. 19, 2024. 💥 [PixArt-δ](https://arxiv.org/abs/2401.05252) ControlNet [app_controlnet.py](app/app_controlnet.py) and [Checkpoint](https://huggingface.co/PixArt-alpha/PixArt-ControlNet/tree/main) are released!!! - ✅ Jan. 12, 2024. 💥 We release the [SAM-LLaVA-Captions](https://huggingface.co/datasets/PixArt-alpha/SAM-LLaVA-Captions10M) used in PixArt-α training. - ✅ Dec. 27, 2023. [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha) is integrated into [ControlLLM](https://github.com/OpenGVLab/ControlLLM)! - ✅ Dec. 17, 2023. [PixArt-LCM-Lora](train_scripts/train_pixart_lcm_lora.py) & [PixArt-Lora](train_scripts/train_pixart_lora_hf.py) training scripts in Hugging Face style are released. - ✅ Dec. 13, 2023. Add multi-scale vae feature extraction in [tools/extract_features.py](https://github.com/PixArt-alpha/PixArt-alpha/blob/3b4f0afdbe39def80b41ab05c664c963edeebbcd/tools/extract_features.py#L276). - ✅ Dec. 01, 2023. Add a [Notebook folder](./notebooks) to help users get started with PixArt quickly! Thanks to [@kopyl](https://github.com/kopyl) for his contribution! - ✅ Nov. 27, 2023. 💥 **PixArt-α Community**: Join our PixArt-α discord channels for discussions. Coders are welcome to contribute. - ✅ Nov. 21, 2023. 💥 [SA-Solver](https://arxiv.org/abs/2309.05019) official code is first released [here](asset/docs/sasolver.md). - ✅ Nov.
19, 2023. Release `PixArt + Dreambooth` training scripts. - ✅ Nov. 16, 2023. Diffusers support `random resolution` and `batch images` generation now. Besides, running `Pixart` in under 8GB GPU VRAM is available in 🧨 [diffusers](https://huggingface.co/docs/diffusers/main/en/api/pipelines/pixart). - ✅ Nov. 10, 2023. Support DALL-E 3 Consistency Decoder in 🧨 diffusers. - ✅ Nov. 06, 2023. Release pretrained weights with 🧨 diffusers integration, Hugging Face demo, and Google Colab example. - ✅ Nov. 03, 2023. Release the LLaVA-captioning inference code. - ✅ Oct. 27, 2023. Release the training & feature extraction code. - ✅ Oct. 20, 2023. Collaborate with Hugging Face & Diffusers team to co-release the code and weights. (plz stay tuned.) - ✅ Oct. 15, 2023. Release the inference code. --- ## Contents * [Training](#-how-to-train) * [Inference](#-how-to-test) * [Download Models](#-download-models) * [Use diffusers](#1---using-in--diffusers) * [Data Processing](#-how-to-extract-t5-and-vae-features) * [PixArt-**α** Demo](#3---gradio-with-diffusers--faster-) * [PixArt-**α** 8GB VRAM](asset/docs/pixart.md) * [PixArt-**δ** (LCM)](asset/docs/pixart_lcm.md) * [PixArt-**δ** (ControlNet)](asset/docs/pixart_controlnet.md) * [PixArt-**δ** (Dreambooth)](asset/docs/pixart-dreambooth.md) * [Acknowledgement](#acknowledgements) * [Citation](#bibtex) * [PixArt-**Σ** Releasing](https://github.com/PixArt-alpha/PixArt-sigma) --- ## 🐱 Abstract TL; DR: PixArt-α is a Transformer-based T2I diffusion model whose image generation quality is competitive with state-of-the-art image generators (e.g., Imagen, SDXL, and even Midjourney), and the training speed markedly surpasses existing large-scale T2I models, e.g., PixArt-α only takes 10.8% of Stable Diffusion v1.5's training time (675 vs. 6,250 A100 GPU days).
CLICK for the full abstract The most advanced text-to-image (T2I) models require significant training costs (e.g., millions of GPU hours), seriously hindering the fundamental innovation for the AIGC community while increasing CO2 emissions. This paper introduces PixArt-α, a Transformer-based T2I diffusion model whose image generation quality is competitive with state-of-the-art image generators (e.g., Imagen, SDXL, and even Midjourney), reaching near-commercial application standards. Additionally, it supports high-resolution image synthesis up to 1024px resolution with low training cost. To achieve this goal, three core designs are proposed: (1) Training strategy decomposition: We devise three distinct training steps that separately optimize pixel dependency, text-image alignment, and image aesthetic quality; (2) Efficient T2I Transformer: We incorporate cross-attention modules into Diffusion Transformer (DiT) to inject text conditions and streamline the computation-intensive class-condition branch; (3) High-informative data: We emphasize the significance of concept density in text-image pairs and leverage a large Vision-Language model to auto-label dense pseudo-captions to assist text-image alignment learning. As a result, PixArt-α's training speed markedly surpasses existing large-scale T2I models, e.g., PixArt-α only takes 10.8% of Stable Diffusion v1.5's training time (675 vs. 6,250 A100 GPU days), saving nearly $300,000 ($26,000 vs. $320,000) and reducing 90% CO2 emissions. Moreover, compared with a larger SOTA model, RAPHAEL, our training cost is merely 1%. Extensive experiments demonstrate that PixArt-α excels in image quality, artistry, and semantic control. We hope PixArt-α will provide new insights to the AIGC community and startups to accelerate building their own high-quality yet low-cost generative models from scratch.
--- ![A small cactus with a happy face in the Sahara desert.](asset/images/teaser.png) --- # 🔥🔥🔥 Why PixArt-α? ## Training Efficiency PixArt-α only takes 12% of Stable Diffusion v1.5's training time (753 vs. 6,250 A100 GPU days), saving nearly $300,000 ($28,000 vs. $320,000) and reducing 90% CO2 emissions. Moreover, compared with a larger SOTA model, RAPHAEL, our training cost is merely 1%. ![Training Efficiency.](asset/images/efficiency.png) | Method | Type | #Params | #Images| FID-30K ↓ | A100 GPU days | |-----------|------|---------|--------|------------------|---------------| | DALL·E | Diff | 12.0B | 250M | 27.50 | | | GLIDE | Diff | 5.0B | 250M | 12.24 | | | LDM | Diff | 1.4B | 400M | 12.64 | | | DALL·E 2 | Diff | 6.5B | 650M | 10.39 | 41,667 | | SDv1.5 | Diff | 0.9B | 2000M | 9.62 | 6,250 | | GigaGAN | GAN | 0.9B | 2700M | 9.09 | 4,783 | | Imagen | Diff | 3.0B | 860M | 7.27 | 7,132 | | RAPHAEL | Diff | 3.0B | 5000M+ | 6.61 | 60,000 | | PixArt-α | Diff | 0.6B | 25M | 7.32 (zero-shot) | 753 | | PixArt-α | Diff | 0.6B | 25M | 5.51 (COCO FT) | 753 | ## Inference Efficiency PIXART-δ successfully generates **1024x1024 high resolution** images within **0.5 seconds** on an A100. With the implementation of 8-bit inference technology, PIXART-δ requires **less than 8GB of GPU VRAM**. Let us stress again how liberating it is to explore image generation so easily with PixArt-LCM. | Hardware | PIXART-δ (4 steps) | SDXL LoRA LCM (4 steps) | PixArt-α (14 steps) | SDXL standard (25 steps) | |-----------------------------|--------------------|-------------------------|---------------------|---------------------------| | T4 (Google Colab Free Tier) | 3.3s | 8.4s | 16.0s | 26.5s | | V100 (32 GB) | 0.8s | 1.2s | 5.5s | 7.7s | | A100 (80 GB) | 0.51s | 1.2s | 2.2s | 3.8s | These tests were run with a batch size of 1 in all cases. 
For cards with a lot of capacity, such as A100, performance increases significantly when generating multiple images at once, which is usually the case for production workloads. ## High-quality Generation from PixArt-α - More samples
- PixArt + [Dreambooth](https://dreambooth.github.io/)
- PixArt + [ControlNet](https://github.com/lllyasviel/ControlNet)
# 🔧 Dependencies and Installation - Python >= 3.9 (Recommend to use [Anaconda](https://www.anaconda.com/download/#linux) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html)) - [PyTorch >= 1.13.0+cu11.7](https://pytorch.org/) ```bash conda create -n pixart python=3.9 conda activate pixart pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cu118 git clone https://github.com/PixArt-alpha/PixArt-alpha.git cd PixArt-alpha pip install -r requirements.txt ``` # ⏬ Download Models All models will be automatically downloaded. You can also choose to download manually from this [url](https://huggingface.co/PixArt-alpha/PixArt-alpha). | Model | #Params | url | Download in OpenXLab | |:----------------------------|:--------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------| | T5 | 4.3B | [T5](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) | [T5](https://download.openxlab.org.cn/models/PixArt-alpha/PixArt-alpha/weight/t5-v1_1-xxl.zip) | | VAE | 80M | [VAE](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/sd-vae-ft-ema) | [VAE](https://download.openxlab.org.cn/models/PixArt-alpha/PixArt-alpha/weight/sd-vae-ft-ema.zip) | | PixArt-α-SAM-256 | 0.6B | [PixArt-XL-2-SAM-256x256.pth](https://huggingface.co/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-SAM-256x256.pth) or [diffusers version](https://huggingface.co/PixArt-alpha/PixArt-XL-2-SAM-256x256) | [256-SAM](https://download.openxlab.org.cn/models/PixArt-alpha/PixArt-alpha/weight/PixArt-XL-2-SAM-256x256.pth) | | PixArt-α-256 | 0.6B | [PixArt-XL-2-256x256.pth](https://huggingface.co/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-256x256.pth) or [diffusers 
version](https://huggingface.co/PixArt-alpha/PixArt-XL-2-256x256) | [256](https://download.openxlab.org.cn/models/PixArt-alpha/PixArt-alpha/weight/PixArt-XL-2-256x256.pth) | | PixArt-α-256-MSCOCO-FID7.32 | 0.6B | [PixArt-XL-2-256x256.pth](https://huggingface.co/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-256x256-MSCOCO-FID732.pth) | [256]() | | PixArt-α-512 | 0.6B | [PixArt-XL-2-512x512.pth](https://huggingface.co/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-512x512.pth) or [diffusers version](https://huggingface.co/PixArt-alpha/PixArt-XL-2-512x512) | [512](https://download.openxlab.org.cn/models/PixArt-alpha/PixArt-alpha/weight/PixArt-XL-2-512x512.pth) | | PixArt-α-1024 | 0.6B | [PixArt-XL-2-1024-MS.pth](https://huggingface.co/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-1024-MS.pth) or [diffusers version](https://huggingface.co/PixArt-alpha/PixArt-XL-2-1024-MS) | [1024](https://download.openxlab.org.cn/models/PixArt-alpha/PixArt-alpha/weight/PixArt-XL-2-1024-MS.pth) | | PixArt-δ-1024-LCM | 0.6B | [diffusers version](https://huggingface.co/PixArt-alpha/PixArt-LCM-XL-2-1024-MS) | | | ControlNet-HED-Encoder | 30M | [ControlNetHED.pth](https://huggingface.co/PixArt-alpha/PixArt-alpha/blob/main/ControlNetHED.pth) | | | PixArt-δ-512-ControlNet | 0.9B | [PixArt-XL-2-512-ControlNet.pth](https://huggingface.co/PixArt-alpha/PixArt-ControlNet/tree/main) | [512](https://openxlab.org.cn/models/detail/PixArt-alpha/PixArt-ControlNet) | | PixArt-δ-1024-ControlNet | 0.9B | [PixArt-XL-2-1024-ControlNet.pth](https://huggingface.co/PixArt-alpha/PixArt-ControlNet/tree/main) | [1024](https://openxlab.org.cn/models/detail/PixArt-alpha/PixArt-ControlNet) | ALSO find all models in [OpenXLab_PixArt-alpha](https://openxlab.org.cn/models/detail/PixArt-alpha/PixArt-alpha) # 🔥 How to Train ## 1. 
PixArt Training **First of all.** Thanks to [@kopyl](https://github.com/kopyl), you can reproduce the full fine-tune training flow on the [Pokemon dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) from Hugging Face with notebooks: 1. Train with [notebooks/train.ipynb](https://github.com/PixArt-alpha/PixArt-alpha/blob/53dac066f60fe5fdbdde4f0360145ca96d4cc38c/notebooks/train.ipynb). 2. Convert to Diffusers with [notebooks/convert-checkpoint-to-diffusers.ipynb](https://github.com/PixArt-alpha/PixArt-alpha/blob/master/notebooks/convert-checkpoint-to-diffusers.ipynb). 3. Run the inference with the checkpoint converted in step 2 using [notebooks/infer.ipynb](https://github.com/PixArt-alpha/PixArt-alpha/blob/master/notebooks/infer.ipynb). **Then, for more details.** Here we take SAM dataset training config as an example, but of course, you can also prepare your own dataset following this method. You **ONLY** need to change the **config** file in [config](./configs/pixart_config) and **dataloader** in [dataset](./diffusion/data/datasets). ```bash python -m torch.distributed.launch --nproc_per_node=2 --master_port=12345 train_scripts/train.py configs/pixart_config/PixArt_xl2_img256_SAM.py --work-dir output/train_SAM_256 ``` The directory structure for SAM dataset is: ``` cd ./data SA1B ├──images/ (images are saved here) │ ├──sa_xxxxx.jpg │ ├──sa_xxxxx.jpg │ ├──...... ├──captions/ (corresponding captions are saved here, same name as images) │ ├──sa_xxxxx.txt │ ├──sa_xxxxx.txt ├──partition/ (all image names are stored in txt files, where each line is an image name) │ ├──part0.txt │ ├──part1.txt │ ├──...... ├──caption_feature_wmask/ (run tools/extract_caption_feature.py to generate caption T5 features, same name as images except .npz extension) │ ├──sa_xxxxx.npz │ ├──sa_xxxxx.npz │ ├──...... 
├──img_vae_feature/ (run tools/extract_img_vae_feature.py to generate image VAE features, same name as images except .npy extension) │ ├──train_vae_256/ │ │ ├──noflip/ │ │ │ ├──sa_xxxxx.npy │ │ │ ├──sa_xxxxx.npy │ │ │ ├──...... ``` **Here we prepare data_toy for better understanding** ```bash cd ./data git lfs install git clone https://huggingface.co/datasets/PixArt-alpha/data_toy ``` Then, [Here](https://huggingface.co/datasets/PixArt-alpha/data_toy/blob/main/part0.txt) is an example of partition/part0.txt file. --- Besides, for json file guided [training](https://github.com/PixArt-alpha/PixArt-alpha/blob/fe0cb78065d64c18ecd8955a04e4f29138d47946/configs/pixart_config/PixArt_xl2_img1024_internalms.py#L3C2-L3C2), [here](https://huggingface.co/datasets/PixArt-alpha/data_toy/blob/main/data_info.json) is a toy json file for better understanding. --- ## 2. PixArt + DreamBooth Training Follow the `Pixart + DreamBooth` [training guidance](asset/docs/pixart-dreambooth.md) ## 3. PixArt + LCM / LCM-LoRA Training Follow the `PixArt + LCM` [training guidance](asset/docs/pixart_lcm.md) ## 4. PixArt + ControlNet Training Follow the `PixArt + ControlNet` [training guidance](asset/docs/pixart_controlnet.md) ## 5. 
PixArt + LoRA Training ```bash pip install peft==0.6.2 accelerate launch --num_processes=1 --main_process_port=36667 train_scripts/train_pixart_lora_hf.py --mixed_precision="fp16" \ --pretrained_model_name_or_path=PixArt-alpha/PixArt-XL-2-1024-MS \ --dataset_name=lambdalabs/pokemon-blip-captions --caption_column="text" \ --resolution=1024 --random_flip \ --train_batch_size=16 \ --num_train_epochs=200 --checkpointing_steps=100 \ --learning_rate=1e-06 --lr_scheduler="constant" --lr_warmup_steps=0 \ --seed=42 \ --output_dir="pixart-pokemon-model" \ --validation_prompt="cute dragon creature" --report_to="tensorboard" \ --gradient_checkpointing --checkpoints_total_limit=10 --validation_epochs=5 \ --rank=16 ``` # 💻 How to Test Inference requires at least `23GB` of GPU memory using this repo, while `11GB and 8GB` using in 🧨 [diffusers](#using-in--diffusers). Currently support: - [x] [IDDPM](https://arxiv.org/abs/2102.09672) - [x] [DPM-Solver](https://arxiv.org/abs/2206.00927) - [x] [SA-Solver](https://arxiv.org/abs/2309.05019) - [ ] [DPM-Solver-v3](https://arxiv.org/abs/2310.13268v2) ## 1. Quick start with [Gradio](https://www.gradio.app/guides/quickstart) To get started, first install the required dependencies. Make sure you've downloaded the [models](https://huggingface.co/PixArt-alpha/PixArt-alpha) to the output/pretrained_models folder, and then run on your local machine: ```bash DEMO_PORT=12345 python app/app.py ``` As an alternative, a sample [Dockerfile](Dockerfile) is provided to make a runtime container that starts the Gradio app. ```bash docker build . -t pixart docker run --gpus all -it -p 12345:12345 -v :/root/.cache/huggingface pixart ``` Or use docker-compose. Note, if you want to change context from the 1024 to 512 or LCM version of the app just change the APP_CONTEXT env variable in the docker-compose.yml file. 
The default is 1024 ```bash docker compose build docker compose up ``` Let's have a look at a simple example using the `http://your-server-ip:12345`. ## 2. Integration in diffusers ### 1). Using in 🧨 diffusers Make sure you have the updated versions of the following libraries: ```bash pip install -U transformers accelerate diffusers SentencePiece ftfy beautifulsoup4 ``` And then: ```python import torch from diffusers import PixArtAlphaPipeline, ConsistencyDecoderVAE, AutoencoderKL device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # You can replace the checkpoint id with "PixArt-alpha/PixArt-XL-2-512x512" too. pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16, use_safetensors=True) # If use DALL-E 3 Consistency Decoder # pipe.vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=torch.float16) # If use SA-Solver sampler # from diffusion.sa_solver_diffusers import SASolverScheduler # pipe.scheduler = SASolverScheduler.from_config(pipe.scheduler.config, algorithm_type='data_prediction') # If loading a LoRA model # transformer = Transformer2DModel.from_pretrained("PixArt-alpha/PixArt-LCM-XL-2-1024-MS", subfolder="transformer", torch_dtype=torch.float16) # transformer = PeftModel.from_pretrained(transformer, "Your-LoRA-Model-Path") # pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-LCM-XL-2-1024-MS", transformer=transformer, torch_dtype=torch.float16, use_safetensors=True) # del transformer # Enable memory optimizations. # pipe.enable_model_cpu_offload() pipe.to(device) prompt = "A small cactus with a happy face in the Sahara desert." image = pipe(prompt).images[0] image.save("./catcus.png") ``` Check out the [documentation](./asset/docs/sasolver.md) for more information about SA-Solver Sampler. This integration allows running the pipeline with a batch size of 4 under 11 GBs of GPU VRAM. 
Check out the [documentation](https://huggingface.co/docs/diffusers/main/en/api/pipelines/pixart) to learn more. ### 2). Running the `PixArtAlphaPipeline` in under 8GB GPU VRAM GPU VRAM consumption under 8 GB is supported now, please refer to [documentation](asset/docs/pixart.md) for more information. ### 3). Gradio with diffusers (Faster) To get started, first install the required dependencies, then run on your local machine: ```bash # diffusers version DEMO_PORT=12345 python app/app.py ``` Let's have a look at a simple example using the `http://your-server-ip:12345`. You can also click [here](https://colab.research.google.com/drive/1jZ5UZXk7tcpTfVwnX33dDuefNMcnW9ME?usp=sharing) to have a free trial on Google Colab. ### 4). Convert .pth checkpoint into diffusers version ```bash python tools/convert_pixart_alpha_to_diffusers.py --image_size your_img_size --multi_scale_train (True if you use PixArtMS else False) --orig_ckpt_path path/to/pth --dump_path path/to/diffusers --only_transformer=True ``` ## 3. Online Demo [![Hugging Face PixArt](https://img.shields.io/static/v1?label=Demo&message=HuggingFace%20Gradio&color=orange)](https://huggingface.co/spaces/PixArt-alpha/PixArt-alpha) ![Online Demo sample](asset/images/sample.png) # ✏️ How to LLaVA captioning Thanks to the code base of [LLaVA-Lightning-MPT](https://huggingface.co/liuhaotian/LLaVA-Lightning-MPT-7B-preview), we can caption the LAION and SAM dataset with the following launching code: ```bash python tools/VLM_caption_lightning.py --output output/dir/ --data-root data/root/path --index path/to/data.json ``` We present auto-labeling with custom prompts for LAION (left) and SAM (right). The words highlighted in green represent the original caption in LAION, while those marked in red indicate the detailed captions labeled by LLaVA. 
![Dialog with LLaVA.](asset/images/LLaVA-dialog.png) # ✏️ How to extract T5 and VAE features Preparing T5 text features and VAE image features in advance will speed up the training process and save GPU memory. ```bash python tools/extract_features.py --img_size=1024 \ --json_path "data/data_info.json" \ --t5_save_root "data/SA1B/caption_feature_wmask" \ --vae_save_root "data/SA1B/img_vae_features" \ --pretrained_models_dir "output/pretrained_models" \ --dataset_root "data/SA1B/Images/" ``` ## 💪To-Do List (Congratulations🎉) - [x] Inference code - [x] Training code - [x] T5 & VAE feature extraction code - [x] LLaVA captioning code - [x] Model zoo - [x] Diffusers version & Hugging Face demo - [x] Google Colab example - [x] DALLE3 VAE integration - [x] Inference under 8GB GPU VRAM with diffusers - [x] Dreambooth Training code - [x] SA-Solver code - [x] PixArt-α-LCM will release soon - [x] Multi-scale vae feature extraction code - [x] PixArt-α-LCM-LoRA scripts will release soon - [x] PixArt-α-LoRA training scripts will release soon - [x] ControlNet code will be released - [x] SAM-LLaVA caption dataset - [x] ControlNet checkpoint - [x] 256px pre-trained models - [x] PixArt-Σ: Next version model with much better ability is training! # Other Source We make a video comparing PixArt with current most powerful Text-to-Image models. 
[![Watch the video](https://img.youtube.com/vi/7_6KsIITgWY/maxresdefault.jpg)](https://www.youtube.com/watch?v=7_6KsIITgWY) # 📖BibTeX @misc{chen2023pixartalpha, title={PixArt-$\alpha$: Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis}, author={Junsong Chen and Jincheng Yu and Chongjian Ge and Lewei Yao and Enze Xie and Yue Wu and Zhongdao Wang and James Kwok and Ping Luo and Huchuan Lu and Zhenguo Li}, year={2023}, eprint={2310.00426}, archivePrefix={arXiv}, primaryClass={cs.CV} } @misc{chen2024pixartdelta, title={PIXART-{\delta}: Fast and Controllable Image Generation with Latent Consistency Models}, author={Junsong Chen and Yue Wu and Simian Luo and Enze Xie and Sayak Paul and Ping Luo and Hang Zhao and Zhenguo Li}, year={2024}, eprint={2401.05252}, archivePrefix={arXiv}, primaryClass={cs.CV} } # 🤗Acknowledgements - Thanks to [Diffusers](https://github.com/huggingface/diffusers) for their wonderful technical support and awesome collaboration! - Thanks to [Hugging Face](https://github.com/huggingface) for sponsoring the nice demo! - Thanks to [DiT](https://github.com/facebookresearch/DiT) for their wonderful work and codebase! 
## Star History [![Star History Chart](https://api.star-history.com/svg?repos=PixArt-alpha/PixArt-alpha&type=Date)](https://star-history.com/#PixArt-alpha/PixArt-alpha&Date) ================================================ FILE: PixArt-alpha-ToCa/app/app.py ================================================ #!/usr/bin/env python from __future__ import annotations import os import sys from pathlib import Path current_file_path = Path(__file__).resolve() sys.path.insert(0, str(current_file_path.parent.parent)) import random import gradio as gr import numpy as np import uuid from diffusers import ConsistencyDecoderVAE, PixArtAlphaPipeline, DPMSolverMultistepScheduler import torch from typing import Tuple from datetime import datetime from diffusion.sa_solver_diffusers import SASolverScheduler DESCRIPTION = """![Logo](https://raw.githubusercontent.com/PixArt-alpha/PixArt-alpha.github.io/master/static/images/logo.png) # PixArt-Alpha 1024px #### [PixArt-Alpha 1024px](https://github.com/PixArt-alpha/PixArt-alpha) is a transformer-based text-to-image diffusion system trained on text embeddings from T5. This demo uses the [PixArt-alpha/PixArt-XL-2-1024-MS](https://huggingface.co/PixArt-alpha/PixArt-XL-2-1024-MS) checkpoint. #### English prompts ONLY; 提示词仅限英文 Don't want to queue? Try [OpenXLab](https://openxlab.org.cn/apps/detail/PixArt-alpha/PixArt-alpha) or [Google Colab Demo](https://colab.research.google.com/drive/1jZ5UZXk7tcpTfVwnX33dDuefNMcnW9ME?usp=sharing). ### You may change the DPM-Solver inference steps from 14 to 20, if you didn't get satisfied results. """ if not torch.cuda.is_available(): DESCRIPTION += "\n

Running on CPU 🥶 This demo does not work on CPU.

" MAX_SEED = np.iinfo(np.int32).max CACHE_EXAMPLES = torch.cuda.is_available() and os.getenv("CACHE_EXAMPLES", "1") == "1" MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "2048")) USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1" ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1" PORT = int(os.getenv("DEMO_PORT", "15432")) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") style_list = [ { "name": "(No style)", "prompt": "{prompt}", "negative_prompt": "", }, { "name": "Cinematic", "prompt": "cinematic still {prompt} . emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy", "negative_prompt": "anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured", }, { "name": "Photographic", "prompt": "cinematic photo {prompt} . 35mm photograph, film, bokeh, professional, 4k, highly detailed", "negative_prompt": "drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly", }, { "name": "Anime", "prompt": "anime artwork {prompt} . anime style, key visual, vibrant, studio anime, highly detailed", "negative_prompt": "photo, deformed, black and white, realism, disfigured, low contrast", }, { "name": "Manga", "prompt": "manga style {prompt} . vibrant, high-energy, detailed, iconic, Japanese comic style", "negative_prompt": "ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style", }, { "name": "Digital Art", "prompt": "concept art {prompt} . digital artwork, illustrative, painterly, matte painting, highly detailed", "negative_prompt": "photo, photorealistic, realism, ugly", }, { "name": "Pixel art", "prompt": "pixel-art {prompt} . 
low-res, blocky, pixel art style, 8-bit graphics", "negative_prompt": "sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic", }, { "name": "Fantasy art", "prompt": "ethereal fantasy concept art of {prompt} . magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy", "negative_prompt": "photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, sloppy, duplicate, mutated, black and white", }, { "name": "Neonpunk", "prompt": "neonpunk style {prompt} . cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, ultra detailed, intricate, professional", "negative_prompt": "painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured", }, { "name": "3D Model", "prompt": "professional 3d model {prompt} . 
octane render, highly detailed, volumetric, dramatic lighting", "negative_prompt": "ugly, deformed, noisy, low poly, blurry, painting", }, ] styles = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list} STYLE_NAMES = list(styles.keys()) DEFAULT_STYLE_NAME = "(No style)" SCHEDULE_NAME = ["DPM-Solver", "SA-Solver"] DEFAULT_SCHEDULE_NAME = "DPM-Solver" NUM_IMAGES_PER_PROMPT = 1 def apply_style(style_name: str, positive: str, negative: str = "") -> Tuple[str, str]: p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME]) if not negative: negative = "" return p.replace("{prompt}", positive), n + negative if torch.cuda.is_available(): pipe = PixArtAlphaPipeline.from_pretrained( "PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16, use_safetensors=True, ) if os.getenv('CONSISTENCY_DECODER', False): print("Using DALL-E 3 Consistency Decoder") pipe.vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=torch.float16) if ENABLE_CPU_OFFLOAD: pipe.enable_model_cpu_offload() else: pipe.to(device) print("Loaded on Device!") # speed-up T5 pipe.text_encoder.to_bettertransformer() if USE_TORCH_COMPILE: pipe.transformer = torch.compile(pipe.transformer, mode="reduce-overhead", fullgraph=True) print("Model Compiled!") def save_image(img): unique_name = f'{str(uuid.uuid4())}.png' save_path = os.path.join(f'output/online_demo_img/{datetime.now().date()}') os.makedirs(save_path, exist_ok=True) unique_name = os.path.join(save_path, unique_name) img.save(unique_name) return unique_name def randomize_seed_fn(seed: int, randomize_seed: bool) -> int: if randomize_seed: seed = random.randint(0, MAX_SEED) return seed def generate( prompt: str, negative_prompt: str = "", style: str = DEFAULT_STYLE_NAME, use_negative_prompt: bool = False, seed: int = 0, width: int = 1024, height: int = 1024, schedule: str = 'DPM-Solver', dpms_guidance_scale: float = 4.5, sas_guidance_scale: float = 3, dpms_inference_steps: int = 20, sas_inference_steps: 
int = 25, randomize_seed: bool = False, use_resolution_binning: bool = True, progress=gr.Progress(track_tqdm=True), ): seed = int(randomize_seed_fn(seed, randomize_seed)) generator = torch.Generator().manual_seed(seed) if schedule == 'DPM-Solver': if not isinstance(pipe.scheduler, DPMSolverMultistepScheduler): pipe.scheduler = DPMSolverMultistepScheduler() num_inference_steps = dpms_inference_steps guidance_scale = dpms_guidance_scale elif schedule == "SA-Solver": if not isinstance(pipe.scheduler, SASolverScheduler): pipe.scheduler = SASolverScheduler.from_config(pipe.scheduler.config, algorithm_type='data_prediction', tau_func=lambda t: 1 if 200 <= t <= 800 else 0, predictor_order=2, corrector_order=2) num_inference_steps = sas_inference_steps guidance_scale = sas_guidance_scale else: raise ValueError(f"Unknown schedule: {schedule}") if not use_negative_prompt: negative_prompt = None # type: ignore prompt, negative_prompt = apply_style(style, prompt, negative_prompt) images = pipe( prompt=prompt, width=width, height=height, negative_prompt=negative_prompt, guidance_scale=guidance_scale, num_inference_steps=num_inference_steps, generator=generator, num_images_per_prompt=NUM_IMAGES_PER_PROMPT, use_resolution_binning=use_resolution_binning, output_type="pil", ).images image_paths = [save_image(img) for img in images] print(image_paths) return image_paths, seed examples = [ "A small cactus with a happy face in the Sahara desert.", "an astronaut sitting in a diner, eating fries, cinematic, analog film", "Pirate ship trapped in a cosmic maelstrom nebula, rendered in cosmic beach whirlpool engine, volumetric lighting, spectacular, ambient lights, light pollution, cinematic atmosphere, art nouveau style, illustration art artwork by SenseiJaye, intricate detail.", "stars, water, brilliantly, gorgeous large scale scene, a little girl, in the style of dreamy realism, light gold and amber, blue and pink, brilliantly illuminated in the background.", "professional portrait 
photo of an anthropomorphic cat wearing fancy gentleman hat and jacket walking in autumn forest.", "beautiful lady, freckles, big smile, blue eyes, short ginger hair, dark makeup, wearing a floral blue vest top, soft light, dark grey background", "Spectacular Tiny World in the Transparent Jar On the Table, interior of the Great Hall, Elaborate, Carved Architecture, Anatomy, Symetrical, Geometric and Parameteric Details, Precision Flat line Details, Pattern, Dark fantasy, Dark errie mood and ineffably mysterious mood, Technical design, Intricate Ultra Detail, Ornate Detail, Stylized and Futuristic and Biomorphic Details, Architectural Concept, Low contrast Details, Cinematic Lighting, 8k, by moebius, Fullshot, Epic, Fullshot, Octane render, Unreal ,Photorealistic, Hyperrealism", "anthropomorphic profile of the white snow owl Crystal priestess , art deco painting, pretty and expressive eyes, ornate costume, mythical, ethereal, intricate, elaborate, hyperrealism, hyper detailed, 3D, 8K, Ultra Realistic, high octane, ultra resolution, amazing detail, perfection, In frame, photorealistic, cinematic lighting, visual clarity, shading , Lumen Reflections, Super-Resolution, gigapixel, color grading, retouch, enhanced, PBR, Blender, V-ray, Procreate, zBrush, Unreal Engine 5, cinematic, volumetric, dramatic, neon lighting, wide angle lens ,no digital painting blur", "The parametric hotel lobby is a sleek and modern space with plenty of natural light. The lobby is spacious and open with a variety of seating options. The front desk is a sleek white counter with a parametric design. The walls are a light blue color with parametric patterns. The floor is a light wood color with a parametric design. There are plenty of plants and flowers throughout the space. The overall effect is a calm and relaxing space. 
occlusion, moody, sunset, concept art, octane rendering, 8k, highly detailed, concept art, highly detailed, beautiful scenery, cinematic, beautiful light, hyperreal, octane render, hdr, long exposure, 8K, realistic, fog, moody, fire and explosions, smoke, 50mm f2.8", ] with gr.Blocks(css="app/style.css") as demo: gr.Markdown(DESCRIPTION) gr.DuplicateButton( value="Duplicate Space for private use", elem_id="duplicate-button", visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1", ) with gr.Group(): with gr.Row(): prompt = gr.Text( label="Prompt", show_label=False, max_lines=1, placeholder="Enter your prompt", container=False, ) run_button = gr.Button("Run", scale=0) result = gr.Gallery(label="Result", columns=NUM_IMAGES_PER_PROMPT, show_label=False) with gr.Accordion("Advanced options", open=False): with gr.Row(): use_negative_prompt = gr.Checkbox(label="Use negative prompt", value=False, visible=True) schedule = gr.Radio( show_label=True, container=True, interactive=True, choices=SCHEDULE_NAME, value=DEFAULT_SCHEDULE_NAME, label="Sampler Schedule", visible=True, ) style_selection = gr.Radio( show_label=True, container=True, interactive=True, choices=STYLE_NAMES, value=DEFAULT_STYLE_NAME, label="Image Style", ) negative_prompt = gr.Text( label="Negative prompt", max_lines=1, placeholder="Enter a negative prompt", visible=True, ) seed = gr.Slider( label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0, ) randomize_seed = gr.Checkbox(label="Randomize seed", value=True) with gr.Row(visible=True): width = gr.Slider( label="Width", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=1024, ) height = gr.Slider( label="Height", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=1024, ) with gr.Row(): dpms_guidance_scale = gr.Slider( label="DPM-Solver Guidance scale", minimum=1, maximum=10, step=0.1, value=4.5, ) dpms_inference_steps = gr.Slider( label="DPM-Solver inference steps", minimum=5, maximum=40, step=1, value=14, ) with gr.Row(): sas_guidance_scale = gr.Slider( 
label="SA-Solver Guidance scale", minimum=1, maximum=10, step=0.1, value=3, ) sas_inference_steps = gr.Slider( label="SA-Solver inference steps", minimum=10, maximum=40, step=1, value=25, ) gr.Examples( examples=examples, inputs=prompt, outputs=[result, seed], fn=generate, cache_examples=CACHE_EXAMPLES, ) use_negative_prompt.change( fn=lambda x: gr.update(visible=x), inputs=use_negative_prompt, outputs=negative_prompt, api_name=False, ) gr.on( triggers=[ prompt.submit, negative_prompt.submit, run_button.click, ], fn=generate, inputs=[ prompt, negative_prompt, style_selection, use_negative_prompt, seed, width, height, schedule, dpms_guidance_scale, sas_guidance_scale, dpms_inference_steps, sas_inference_steps, randomize_seed, ], outputs=[result, seed], api_name="run", ) if __name__ == "__main__": demo.queue(max_size=20).launch(server_name="0.0.0.0", server_port=PORT, debug=True) ================================================ FILE: PixArt-alpha-ToCa/app/app_512.py ================================================ #!/usr/bin/env python from __future__ import annotations import os import sys from pathlib import Path current_file_path = Path(__file__).resolve() sys.path.insert(0, str(current_file_path.parent.parent)) import random import gradio as gr import numpy as np import uuid from diffusers import PixArtAlphaPipeline, ConsistencyDecoderVAE, DPMSolverMultistepScheduler import torch from typing import Tuple from datetime import datetime from diffusion.data.datasets import ASPECT_RATIO_512_TEST from diffusion.model.utils import resize_and_crop_img from diffusion.sa_solver_diffusers import SASolverScheduler DESCRIPTION = """![Logo](https://raw.githubusercontent.com/PixArt-alpha/PixArt-alpha.github.io/master/static/images/logo.png) # PixArt-Alpha 512px #### [PixArt-Alpha 512px](https://github.com/PixArt-alpha/PixArt-alpha) is a transformer-based text-to-image diffusion system trained on text embeddings from T5. 
This demo uses the [PixArt-alpha/PixArt-XL-2-512x512](https://huggingface.co/PixArt-alpha/PixArt-XL-2-512x512) checkpoint. #### English prompts ONLY; 提示词仅限英文 Don't want to queue? Try [OpenXLab](https://openxlab.org.cn/apps/detail/PixArt-alpha/PixArt-alpha) or [Google Colab Demo](https://colab.research.google.com/drive/1jZ5UZXk7tcpTfVwnX33dDuefNMcnW9ME?usp=sharing). """ if not torch.cuda.is_available(): DESCRIPTION += "\n

Running on CPU 🥶 This demo does not work on CPU.

" MAX_SEED = np.iinfo(np.int32).max CACHE_EXAMPLES = torch.cuda.is_available() and os.getenv("CACHE_EXAMPLES", "1") == "1" MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "1024")) USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1" ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1" PORT = int(os.getenv("DEMO_PORT", "15432")) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") style_list = [ { "name": "(No style)", "prompt": "{prompt}", "negative_prompt": "", }, { "name": "Cinematic", "prompt": "cinematic still {prompt} . emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy", "negative_prompt": "anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured", }, { "name": "Photographic", "prompt": "cinematic photo {prompt} . 35mm photograph, film, bokeh, professional, 4k, highly detailed", "negative_prompt": "drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly", }, { "name": "Anime", "prompt": "anime artwork {prompt} . anime style, key visual, vibrant, studio anime, highly detailed", "negative_prompt": "photo, deformed, black and white, realism, disfigured, low contrast", }, { "name": "Manga", "prompt": "manga style {prompt} . vibrant, high-energy, detailed, iconic, Japanese comic style", "negative_prompt": "ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style", }, { "name": "Digital Art", "prompt": "concept art {prompt} . digital artwork, illustrative, painterly, matte painting, highly detailed", "negative_prompt": "photo, photorealistic, realism, ugly", }, { "name": "Pixel art", "prompt": "pixel-art {prompt} . 
low-res, blocky, pixel art style, 8-bit graphics", "negative_prompt": "sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic", }, { "name": "Fantasy art", "prompt": "ethereal fantasy concept art of {prompt} . magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy", "negative_prompt": "photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, sloppy, duplicate, mutated, black and white", }, { "name": "Neonpunk", "prompt": "neonpunk style {prompt} . cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, ultra detailed, intricate, professional", "negative_prompt": "painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured", }, { "name": "3D Model", "prompt": "professional 3d model {prompt} . 
octane render, highly detailed, volumetric, dramatic lighting", "negative_prompt": "ugly, deformed, noisy, low poly, blurry, painting", }, ] styles = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list} STYLE_NAMES = list(styles.keys()) DEFAULT_STYLE_NAME = "(No style)" SCHEDULE_NAME = ["DPM-Solver", "SA-Solver"] DEFAULT_SCHEDULE_NAME = "DPM-Solver" NUM_IMAGES_PER_PROMPT = 2 def apply_style(style_name: str, positive: str, negative: str = "") -> Tuple[str, str]: p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME]) if not negative: negative = "" return p.replace("{prompt}", positive), n + negative if torch.cuda.is_available(): pipe = PixArtAlphaPipeline.from_pretrained( "PixArt-alpha/PixArt-XL-2-512x512", torch_dtype=torch.float16, variant="fp16", use_safetensors=True, ) if os.getenv('CONSISTENCY_DECODER', False): print("Using DALL-E 3 Consistency Decoder") pipe.vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=torch.float16) if ENABLE_CPU_OFFLOAD: pipe.enable_model_cpu_offload() else: pipe.to(device) print("Loaded on Device!") # speed-up T5 pipe.text_encoder.to_bettertransformer() if USE_TORCH_COMPILE: pipe.transformer = torch.compile(pipe.transformer, mode="reduce-overhead", fullgraph=True) print("Model Compiled!") def prepare_prompt_hw(height, width, ratios): ar = float(height/width) closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - ar)) default_hw = ratios[closest_ratio] return int(default_hw[0]), int(default_hw[1]) def save_image(img): unique_name = f'{str(uuid.uuid4())}.png' save_path = os.path.join(f'output/online_demo_img512/{datetime.now().date()}') os.makedirs(save_path, exist_ok=True) unique_name = os.path.join(save_path, unique_name) img.save(unique_name) return unique_name def randomize_seed_fn(seed: int, randomize_seed: bool) -> int: if randomize_seed: seed = random.randint(0, MAX_SEED) return seed def classify_height_width_bin(height: int, width: int, ratios: dict): ar = 
float(height / width) closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - ar)) default_hw = ratios[closest_ratio] return int(default_hw[0]), int(default_hw[1]) def generate( prompt: str, negative_prompt: str = "", style: str = DEFAULT_STYLE_NAME, use_negative_prompt: bool = False, seed: int = 0, width: int = 512, height: int = 512, schedule: str = 'DPM-Solver', dpms_guidance_scale: float = 4.5, sas_guidance_scale: float = 3, dpms_inference_steps: int = 20, sas_inference_steps: int = 25, randomize_seed: bool = False, use_resolution_binning: bool = True, progress=gr.Progress(track_tqdm=True), ): seed = int(randomize_seed_fn(seed, randomize_seed)) generator = torch.Generator().manual_seed(seed) if schedule == 'DPM-Solver': if not isinstance(pipe.scheduler, DPMSolverMultistepScheduler): pipe.scheduler = DPMSolverMultistepScheduler() num_inference_steps = dpms_inference_steps guidance_scale = dpms_guidance_scale elif schedule == "SA-Solver": if not isinstance(pipe.scheduler, SASolverScheduler): pipe.scheduler = SASolverScheduler.from_config(pipe.scheduler.config, algorithm_type='data_prediction', tau_func=lambda t: 1 if 200 <= t <= 800 else 0, predictor_order=2, corrector_order=2) num_inference_steps = sas_inference_steps guidance_scale = sas_guidance_scale else: raise ValueError(f"Unknown schedule: {schedule}") if not use_negative_prompt: negative_prompt = None # type: ignore prompt, negative_prompt = apply_style(style, prompt, negative_prompt) if use_resolution_binning: orig_height, orig_width = height, width height, width = classify_height_width_bin(height, width, ratios=ASPECT_RATIO_512_TEST) images = pipe( prompt=prompt, width=width, height=height, negative_prompt=negative_prompt, guidance_scale=guidance_scale, num_inference_steps=num_inference_steps, generator=generator, use_resolution_binning=False, num_images_per_prompt=NUM_IMAGES_PER_PROMPT, output_type="pil", ).images if use_resolution_binning: images = [resize_and_crop_img(img, orig_width, 
orig_height) for img in images] image_paths = [save_image(img) for img in images] print(image_paths) return image_paths, seed examples = [ "A small cactus with a happy face in the Sahara desert.", "an astronaut sitting in a diner, eating fries, cinematic, analog film", "Pirate ship trapped in a cosmic maelstrom nebula, rendered in cosmic beach whirlpool engine, volumetric lighting, spectacular, ambient lights, light pollution, cinematic atmosphere, art nouveau style, illustration art artwork by SenseiJaye, intricate detail.", "stars, water, brilliantly, gorgeous large scale scene, a little girl, in the style of dreamy realism, light gold and amber, blue and pink, brilliantly illuminated in the background.", "professional portrait photo of an anthropomorphic cat wearing fancy gentleman hat and jacket walking in autumn forest.", "beautiful lady, freckles, big smile, blue eyes, short ginger hair, dark makeup, wearing a floral blue vest top, soft light, dark grey background", "Spectacular Tiny World in the Transparent Jar On the Table, interior of the Great Hall, Elaborate, Carved Architecture, Anatomy, Symetrical, Geometric and Parameteric Details, Precision Flat line Details, Pattern, Dark fantasy, Dark errie mood and ineffably mysterious mood, Technical design, Intricate Ultra Detail, Ornate Detail, Stylized and Futuristic and Biomorphic Details, Architectural Concept, Low contrast Details, Cinematic Lighting, 8k, by moebius, Fullshot, Epic, Fullshot, Octane render, Unreal ,Photorealistic, Hyperrealism", "anthropomorphic profile of the white snow owl Crystal priestess , art deco painting, pretty and expressive eyes, ornate costume, mythical, ethereal, intricate, elaborate, hyperrealism, hyper detailed, 3D, 8K, Ultra Realistic, high octane, ultra resolution, amazing detail, perfection, In frame, photorealistic, cinematic lighting, visual clarity, shading , Lumen Reflections, Super-Resolution, gigapixel, color grading, retouch, enhanced, PBR, Blender, V-ray, 
Procreate, zBrush, Unreal Engine 5, cinematic, volumetric, dramatic, neon lighting, wide angle lens ,no digital painting blur", "The parametric hotel lobby is a sleek and modern space with plenty of natural light. The lobby is spacious and open with a variety of seating options. The front desk is a sleek white counter with a parametric design. The walls are a light blue color with parametric patterns. The floor is a light wood color with a parametric design. There are plenty of plants and flowers throughout the space. The overall effect is a calm and relaxing space. occlusion, moody, sunset, concept art, octane rendering, 8k, highly detailed, concept art, highly detailed, beautiful scenery, cinematic, beautiful light, hyperreal, octane render, hdr, long exposure, 8K, realistic, fog, moody, fire and explosions, smoke, 50mm f2.8", ] with gr.Blocks(css="scripts/style.css") as demo: gr.Markdown(DESCRIPTION) gr.DuplicateButton( value="Duplicate Space for private use", elem_id="duplicate-button", visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1", ) with gr.Group(): with gr.Row(): prompt = gr.Text( label="Prompt", show_label=False, max_lines=1, placeholder="Enter your prompt", container=False, ) run_button = gr.Button("Run", scale=0) result = gr.Gallery(label="Result", columns=NUM_IMAGES_PER_PROMPT, show_label=False) with gr.Accordion("Advanced options", open=False): with gr.Row(): use_negative_prompt = gr.Checkbox(label="Use negative prompt", value=False, visible=False) schedule = gr.Radio( show_label=True, container=True, interactive=True, choices=SCHEDULE_NAME, value=DEFAULT_SCHEDULE_NAME, label="Sampler Schedule", visible=True, ) style_selection = gr.Radio( show_label=True, container=True, interactive=True, choices=STYLE_NAMES, value=DEFAULT_STYLE_NAME, label="Image Style", ) negative_prompt = gr.Text( label="Negative prompt (no use now)", max_lines=1, placeholder="Enter a negative prompt", visible=False, ) seed = gr.Slider( label="Seed", minimum=0, maximum=MAX_SEED, 
step=1, value=0, ) randomize_seed = gr.Checkbox(label="Randomize seed", value=True) with gr.Row(visible=True): width = gr.Slider( label="Width", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=512, ) height = gr.Slider( label="Height", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=512, ) with gr.Row(): dpms_guidance_scale = gr.Slider( label="DPM-Solver Guidance scale", minimum=1, maximum=10, step=0.1, value=4.5, ) dpms_inference_steps = gr.Slider( label="DPM-Solver inference steps", minimum=5, maximum=40, step=1, value=20, ) with gr.Row(): sas_guidance_scale = gr.Slider( label="SA-Solver Guidance scale", minimum=1, maximum=10, step=0.1, value=3, ) sas_inference_steps = gr.Slider( label="SA-Solver inference steps", minimum=10, maximum=40, step=1, value=25, ) gr.Examples( examples=examples, inputs=prompt, outputs=[result, seed], fn=generate, cache_examples=CACHE_EXAMPLES, ) use_negative_prompt.change( fn=lambda x: gr.update(visible=x), inputs=use_negative_prompt, outputs=negative_prompt, api_name=False, ) gr.on( triggers=[ prompt.submit, negative_prompt.submit, run_button.click, ], fn=generate, inputs=[ prompt, negative_prompt, style_selection, use_negative_prompt, seed, width, height, schedule, dpms_guidance_scale, sas_guidance_scale, dpms_inference_steps, sas_inference_steps, randomize_seed, ], outputs=[result, seed], api_name="run", ) if __name__ == "__main__": demo.queue(max_size=20).launch(server_name="0.0.0.0", server_port=PORT, debug=True) ================================================ FILE: PixArt-alpha-ToCa/app/app_controlnet.py ================================================ #!/usr/bin/env python from __future__ import annotations import argparse import os import random import sys import uuid from datetime import datetime from pathlib import Path from typing import List, Tuple, Union current_file_path = Path(__file__).resolve() sys.path.insert(0, str(current_file_path.parent.parent)) import gradio as gr import numpy as np import torch from PIL 
import Image as PILImage import torchvision.transforms as T import torchvision.transforms.functional as TF from torchvision.utils import _log_api_usage_once, make_grid, save_image from diffusers import PixArtAlphaPipeline from diffusion import DPMS, SASolverSampler from diffusion.data.datasets import * from diffusion.model.hed import HEDdetector from diffusion.model.nets import PixArt_XL_2, PixArtMS_XL_2, ControlPixArtHalf, ControlPixArtMSHalf from diffusion.model.utils import resize_and_crop_tensor from diffusion.utils.misc import read_config from tools.download import find_model DESCRIPTION = """![Logo](https://raw.githubusercontent.com/PixArt-alpha/PixArt-alpha.github.io/master/static/images/logo.png) # PixArt-Delta (ControlNet) #### [PixArt-Alpha 1024px](https://github.com/PixArt-alpha/PixArt-alpha) is a transformer-based text-to-image diffusion system trained on text embeddings from T5. #### This demo uses the [PixArt-alpha/PixArt-XL-2-1024-ControlNet](https://huggingface.co/PixArt-alpha/PixArt-ControlNet/tree/main) checkpoint. #### This demo uses the [PixArt-alpha/PixArt-XL-2-512-ControlNet](https://huggingface.co/PixArt-alpha/PixArt-ControlNet/tree/main) checkpoint. #### English prompts ONLY; 提示词仅限英文 ### Please use the image size corresponding to the model as input to get the best performance. (eg. 1024px for PixArt-XL-2-1024-ControlNet.pth) """ if not torch.cuda.is_available(): DESCRIPTION += "\n

Running on CPU 🥶 This demo does not work on CPU.

" MAX_SEED = np.iinfo(np.int32).max CACHE_EXAMPLES = torch.cuda.is_available() and os.getenv("CACHE_EXAMPLES", "1") == "1" MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "2048")) USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1" ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1" PORT = int(os.getenv("DEMO_PORT", "15432")) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") @torch.no_grad() def ndarr_image(tensor: Union[torch.Tensor, List[torch.Tensor]], **kwargs, ) -> None: if not torch.jit.is_scripting() and not torch.jit.is_tracing(): _log_api_usage_once(save_image) grid = make_grid(tensor, **kwargs) ndarr = grid.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy() return ndarr style_list = [ { "name": "(No style)", "prompt": "{prompt}", "negative_prompt": "", }, { "name": "Cinematic", "prompt": "cinematic still {prompt} . emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy", "negative_prompt": "anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured", }, { "name": "Photographic", "prompt": "cinematic photo {prompt} . 35mm photograph, film, bokeh, professional, 4k, highly detailed", "negative_prompt": "drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly", }, { "name": "Anime", "prompt": "anime artwork {prompt} . anime style, key visual, vibrant, studio anime, highly detailed", "negative_prompt": "photo, deformed, black and white, realism, disfigured, low contrast", }, { "name": "Manga", "prompt": "manga style {prompt} . vibrant, high-energy, detailed, iconic, Japanese comic style", "negative_prompt": "ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style", }, { "name": "Digital Art", "prompt": "concept art {prompt} . 
digital artwork, illustrative, painterly, matte painting, highly detailed", "negative_prompt": "photo, photorealistic, realism, ugly", }, { "name": "Pixel art", "prompt": "pixel-art {prompt} . low-res, blocky, pixel art style, 8-bit graphics", "negative_prompt": "sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic", }, { "name": "Fantasy art", "prompt": "ethereal fantasy concept art of {prompt} . magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy", "negative_prompt": "photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, sloppy, duplicate, mutated, black and white", }, { "name": "Neonpunk", "prompt": "neonpunk style {prompt} . cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, ultra detailed, intricate, professional", "negative_prompt": "painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured", }, { "name": "3D Model", "prompt": "professional 3d model {prompt} . 
octane render, highly detailed, volumetric, dramatic lighting", "negative_prompt": "ugly, deformed, noisy, low poly, blurry, painting", }, ] styles = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list} STYLE_NAMES = list(styles.keys()) DEFAULT_STYLE_NAME = "(No style)" SCHEDULE_NAME = ["DPM-Solver", "SA-Solver"] DEFAULT_SCHEDULE_NAME = "DPM-Solver" def apply_style(style_name: str, positive: str, negative: str = "") -> Tuple[str, str]: p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME]) if not negative: negative = "" return p.replace("{prompt}", positive), n + negative def save_image(img): unique_name = str(uuid.uuid4()) + '.png' save_path = os.path.join(f'output/online_demo_img/{datetime.now().date()}') os.makedirs(save_path, exist_ok=True) unique_name = os.path.join(save_path, unique_name) img.save(unique_name) return unique_name def randomize_seed_fn(seed: int, randomize_seed: bool) -> int: if randomize_seed: seed = random.randint(0, MAX_SEED) return seed @torch.inference_mode() def generate( prompt: str, given_image = None, negative_prompt: str = "", style: str = DEFAULT_STYLE_NAME, use_negative_prompt: bool = False, seed: int = 0, width: int = 1024, height: int = 1024, schedule: str = 'DPM-Solver', dpms_guidance_scale: float = 4.5, sas_guidance_scale: float = 3, dpms_inference_steps: int = 14, sas_inference_steps: int = 25, randomize_seed: bool = False, ): seed = int(randomize_seed_fn(seed, randomize_seed)) torch.manual_seed(seed) torch.cuda.empty_cache() strength = 1.0 c_vis = given_image if not use_negative_prompt: negative_prompt = None # type: ignore prompt, negative_prompt = apply_style(style, prompt, negative_prompt) prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask\ = pipe.encode_prompt(prompt=prompt, negative_prompt=negative_prompt) prompt_embeds, negative_prompt_embeds = prompt_embeds[:, None], negative_prompt_embeds[:, None] torch.cuda.empty_cache() # condition process if given_image is 
not None: ar = torch.tensor([given_image.size[1] / given_image.size[0]], device=device)[None] custom_hw = torch.tensor([given_image.size[1], given_image.size[0]], device=device)[None] closest_hw = base_ratios[min(base_ratios.keys(), key=lambda ratio: abs(float(ratio) - ar))] hw = torch.tensor(closest_hw, device=device)[None] condition_transform = T.Compose([ T.Lambda(lambda img: img.convert('RGB')), T.Resize(int(min(closest_hw))), T.CenterCrop([int(closest_hw[0]), int(closest_hw[1])]), T.ToTensor(), ]) given_image = condition_transform(given_image).unsqueeze(0).to(device) hed_edge = hed(given_image) * strength hed_edge = TF.normalize(hed_edge, [.5], [.5]) hed_edge = hed_edge.repeat(1, 3, 1, 1).to(weight_dtype) posterior = vae.encode(hed_edge).latent_dist condition = posterior.sample() c = condition * config.scale_factor c_vis = vae.decode(condition)['sample'] c_vis = torch.clamp(127.5 * c_vis + 128.0, 0, 255).permute(0, 2, 3, 1).to("cpu", dtype=torch.uint8).numpy()[0] else: c = None ar = torch.tensor([int(height) / int(width)], device=device)[None] custom_hw = torch.tensor([int(height), int(width)], device=device)[None] closest_hw = base_ratios[min(base_ratios.keys(), key=lambda ratio: abs(float(ratio) - ar))] hw = torch.tensor(closest_hw, device=device)[None] latent_size_h, latent_size_w = int(hw[0, 0] // 8), int(hw[0, 1] // 8) # Sample images: if schedule == 'DPM-Solver': # Create sampling noise: n = prompt_embeds.shape[0] z = torch.randn(n, 4, latent_size_h, latent_size_w, device=device) model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=prompt_attention_mask, c=c) dpm_solver = DPMS(model.forward_with_dpmsolver, condition=prompt_embeds, uncondition=negative_prompt_embeds, cfg_scale=dpms_guidance_scale, model_kwargs=model_kwargs) samples = dpm_solver.sample( z, steps=dpms_inference_steps, order=2, skip_type="time_uniform", method="multistep", ).to(weight_dtype) elif schedule == "SA-Solver": # Create sampling noise: n = prompt_embeds.shape[0] 
model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=prompt_attention_mask, c=c) sas_solver = SASolverSampler(model.forward_with_dpmsolver, device=device) samples = sas_solver.sample( S=sas_inference_steps, batch_size=n, shape=(4, latent_size_h, latent_size_w), eta=1, conditioning=prompt_embeds, unconditional_conditioning=negative_prompt_embeds, unconditional_guidance_scale=sas_guidance_scale, model_kwargs=model_kwargs, )[0].to(weight_dtype) samples = vae.decode(samples / config.scale_factor).sample torch.cuda.empty_cache() samples = resize_and_crop_tensor(samples, custom_hw[0, 1], custom_hw[0, 0]) samples = PILImage.fromarray(ndarr_image(samples, normalize=True, value_range=(-1, 1))) image_paths = [save_image(samples)] c_vis = PILImage.fromarray(c_vis) if c_vis is not None else samples c_paths = [save_image(c_vis)] print(image_paths) return image_paths, c_paths, seed def get_args(): parser = argparse.ArgumentParser() parser.add_argument("config", type=str, help="config") parser.add_argument('--image_size', default=1024, type=int) parser.add_argument('--model_path', type=str) return parser.parse_args() args = get_args() config = read_config(args.config) device = "cuda" if torch.cuda.is_available() else "cpu" assert args.image_size in [512, 1024], "We only provide pre-trained models for 512x512 and 1024x1024 resolutions." 
lewei_scale = {512: 1, 1024: 2} latent_size = args.image_size // 8 weight_dtype = torch.float16 print(f"Inference with {weight_dtype}") if torch.cuda.is_available(): hed = HEDdetector(False).to(device) pipe = PixArtAlphaPipeline.from_pretrained( "PixArt-alpha/PixArt-XL-2-1024-MS", transformer=None, torch_dtype=weight_dtype, use_safetensors=True, ) pipe.to(device) print("Loaded on Device!") vae = pipe.vae text_encoder = pipe.text_encoder tokenizer = pipe.tokenizer assert args.image_size == config.image_size if config.image_size == 512: model = PixArt_XL_2(input_size=latent_size, lewei_scale=lewei_scale[config.image_size]) print('model architecture ControlPixArtHalf and image size is 512') model = ControlPixArtHalf(model).to(device) elif config.image_size == 1024: model = PixArtMS_XL_2(input_size=latent_size, lewei_scale=lewei_scale[config.image_size]) print('model architecture ControlPixArtMSHalf and image size is 1024') model = ControlPixArtMSHalf(model).to(device) state_dict = find_model(args.model_path)['state_dict'] if 'pos_embed' in state_dict: del state_dict['pos_embed'] elif 'base_model.pos_embed' in state_dict: del state_dict['base_model.pos_embed'] missing, unexpected = model.load_state_dict(state_dict, strict=False) print('Missing keys (missing pos_embed is normal): ', missing) print('Unexpected keys', unexpected) model.eval() model.to(weight_dtype) base_ratios = eval(f'ASPECT_RATIO_{args.image_size}_TEST') with gr.Blocks(css="app/style_controlnet.css") as demo: gr.Markdown(DESCRIPTION) gr.DuplicateButton( value="Duplicate Space for private use", elem_id="duplicate-button", visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1", ) image_input = gr.Image( label="Image", height=360, width=360, show_label=False, sources="upload", type="pil", ) with gr.Group(): with gr.Row(): prompt = gr.Text( label="Prompt", show_label=False, max_lines=1, placeholder="Enter your prompt", container=False, ) run_button = gr.Button("Run", scale=0) with gr.Group(): with gr.Row(): 
hed_result = gr.Gallery(label="Hed Result", show_label=False) result = gr.Gallery(label="Result", show_label=False) with gr.Accordion("Advanced options", open=False): with gr.Row(): use_negative_prompt = gr.Checkbox(label="Use negative prompt", value=False, visible=True) schedule = gr.Radio( show_label=True, container=True, interactive=True, choices=SCHEDULE_NAME, value=DEFAULT_SCHEDULE_NAME, label="Sampler Schedule", visible=True, ) style_selection = gr.Radio( show_label=True, container=True, interactive=True, choices=STYLE_NAMES, value=DEFAULT_STYLE_NAME, label="Image Style", ) negative_prompt = gr.Text( label="Negative prompt", max_lines=1, placeholder="Enter a negative prompt", visible=True, ) seed = gr.Slider( label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0, ) randomize_seed = gr.Checkbox(label="Randomize seed", value=True) with gr.Row(visible=True): width = gr.Slider( label="Width", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=config.image_size, ) height = gr.Slider( label="Height", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=config.image_size, ) with gr.Row(): dpms_guidance_scale = gr.Slider( label="DPM-Solver Guidance scale", minimum=1, maximum=10, step=0.1, value=4.5, ) dpms_inference_steps = gr.Slider( label="DPM-Solver inference steps", minimum=5, maximum=40, step=1, value=14, ) with gr.Row(): sas_guidance_scale = gr.Slider( label="SA-Solver Guidance scale", minimum=1, maximum=10, step=0.1, value=3, ) sas_inference_steps = gr.Slider( label="SA-Solver inference steps", minimum=10, maximum=40, step=1, value=25, ) gr.Examples( examples=[ [ "anime superman in action", "asset/images/controlnet/0_0.png", ], [ "illustration of A loving couple standing in the open kitchen of the living room, cooking ,Couples have a full body, with characters accounting for a quarter of the screen, and the composition of the living room has a large perspective, resulting in a larger space.", "asset/images/controlnet/0_3.png", ], [ "A Electric 4 seats 
mini VAN,simple design stylel,led headlight,front 45 angle view,sunlight,clear sky.", "asset/images/controlnet/0_2.png", ], ], inputs=[prompt, image_input], outputs=[result, hed_result, seed], fn=generate, cache_examples=CACHE_EXAMPLES, ) use_negative_prompt.change( fn=lambda x: gr.update(visible=x), inputs=use_negative_prompt, outputs=negative_prompt, api_name=False, ) gr.on( triggers=[ prompt.submit, negative_prompt.submit, run_button.click, ], fn=generate, inputs=[ prompt, image_input, negative_prompt, style_selection, use_negative_prompt, seed, width, height, schedule, dpms_guidance_scale, sas_guidance_scale, dpms_inference_steps, sas_inference_steps, randomize_seed, ], outputs=[result, hed_result, seed], api_name="run", ) if __name__ == "__main__": demo.queue(max_size=20).launch(server_name="0.0.0.0", server_port=PORT, debug=True) ================================================ FILE: PixArt-alpha-ToCa/app/app_lcm.py ================================================ #!/usr/bin/env python from __future__ import annotations import os import sys from pathlib import Path current_file_path = Path(__file__).resolve() sys.path.insert(0, str(current_file_path.parent.parent)) import random import gradio as gr import numpy as np import uuid from diffusers import PixArtAlphaPipeline, Transformer2DModel from peft import PeftModel import torch from typing import Tuple from datetime import datetime import argparse DESCRIPTION = """![Logo](https://raw.githubusercontent.com/PixArt-alpha/PixArt-alpha.github.io/master/static/images/pixart-lcm.png) # PixArt-LCM 1024px #### [PixArt-Alpha 1024px](https://github.com/PixArt-alpha/PixArt-alpha) is a transformer-based text-to-image diffusion system trained on text embeddings from T5. This demo uses the [PixArt-alpha/PixArt-LCM-XL-2-1024-MS](https://huggingface.co/PixArt-alpha/PixArt-LCM-XL-2-1024-MS) checkpoint. 
#### [LCMs](https://github.com/luosiallen/latent-consistency-model) is a diffusion distillation method which predict PF-ODE's solution directly in latent space, achieving super fast inference with few steps. #### English prompts ONLY; 提示词仅限英文 Don't want to queue? Try [OpenXLab](https://openxlab.org.cn/apps/detail/PixArt-alpha/PixArt-alpha) or [Google Colab Demo](https://colab.research.google.com/drive/1jZ5UZXk7tcpTfVwnX33dDuefNMcnW9ME?usp=sharing). """ if not torch.cuda.is_available(): DESCRIPTION += "\n

Running on CPU 🥶 This demo does not work on CPU.

" MAX_SEED = np.iinfo(np.int32).max CACHE_EXAMPLES = torch.cuda.is_available() and os.getenv("CACHE_EXAMPLES", "1") == "1" MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "2048")) USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1" ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1" PORT = int(os.getenv("DEMO_PORT", "15432")) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") style_list = [ { "name": "(No style)", "prompt": "{prompt}", "negative_prompt": "", }, { "name": "Cinematic", "prompt": "cinematic still {prompt} . emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy", "negative_prompt": "anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured", }, { "name": "Photographic", "prompt": "cinematic photo {prompt} . 35mm photograph, film, bokeh, professional, 4k, highly detailed", "negative_prompt": "drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly", }, { "name": "Anime", "prompt": "anime artwork {prompt} . anime style, key visual, vibrant, studio anime, highly detailed", "negative_prompt": "photo, deformed, black and white, realism, disfigured, low contrast", }, { "name": "Manga", "prompt": "manga style {prompt} . vibrant, high-energy, detailed, iconic, Japanese comic style", "negative_prompt": "ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style", }, { "name": "Digital Art", "prompt": "concept art {prompt} . digital artwork, illustrative, painterly, matte painting, highly detailed", "negative_prompt": "photo, photorealistic, realism, ugly", }, { "name": "Pixel art", "prompt": "pixel-art {prompt} . 
low-res, blocky, pixel art style, 8-bit graphics", "negative_prompt": "sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic", }, { "name": "Fantasy art", "prompt": "ethereal fantasy concept art of {prompt} . magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy", "negative_prompt": "photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, sloppy, duplicate, mutated, black and white", }, { "name": "Neonpunk", "prompt": "neonpunk style {prompt} . cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, ultra detailed, intricate, professional", "negative_prompt": "painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured", }, { "name": "3D Model", "prompt": "professional 3d model {prompt} . 
def apply_style(style_name: str, positive: str, negative: str = "") -> Tuple[str, str]:
    """Expand a prompt pair with the template of the selected style.

    Args:
        style_name: Key into the module-level ``styles`` mapping. Unknown
            names fall back to the ``DEFAULT_STYLE_NAME`` entry.
        positive: User prompt; substituted for every ``{prompt}``
            placeholder in the style's prompt template.
        negative: Optional user negative prompt. ``None`` and ``""`` are
            both treated as "no user negative prompt".

    Returns:
        A ``(prompt, negative_prompt)`` tuple. The negative prompt is the
        style's negative prompt followed by the user's, separated by
        ``", "`` when both are non-empty.
    """
    template, style_negative = styles.get(style_name, styles[DEFAULT_STYLE_NAME])

    # Join only the non-empty parts. Plain concatenation fused the last
    # style term with the first user term ("...disfigured" + "blurry").
    negative_parts = [part for part in (style_negative, negative) if part]

    return template.replace("{prompt}", positive), ", ".join(negative_parts)
def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
    """Pick the seed to use for a generation request.

    Args:
        seed: Seed supplied by the caller.
        randomize_seed: When truthy, ``seed`` is ignored and a new value is
            drawn uniformly from ``[0, MAX_SEED]`` (both ends inclusive).

    Returns:
        ``seed`` unchanged, or the freshly drawn random seed.
    """
    if not randomize_seed:
        return seed
    return random.randint(0, MAX_SEED)
Transparent Jar On the Table, interior of the Great Hall, Elaborate, Carved Architecture, Anatomy, Symetrical, Geometric and Parameteric Details, Precision Flat line Details, Pattern, Dark fantasy, Dark errie mood and ineffably mysterious mood, Technical design, Intricate Ultra Detail, Ornate Detail, Stylized and Futuristic and Biomorphic Details, Architectural Concept, Low contrast Details, Cinematic Lighting, 8k, by moebius, Fullshot, Epic, Fullshot, Octane render, Unreal ,Photorealistic, Hyperrealism", "anthropomorphic profile of the white snow owl Crystal priestess , art deco painting, pretty and expressive eyes, ornate costume, mythical, ethereal, intricate, elaborate, hyperrealism, hyper detailed, 3D, 8K, Ultra Realistic, high octane, ultra resolution, amazing detail, perfection, In frame, photorealistic, cinematic lighting, visual clarity, shading , Lumen Reflections, Super-Resolution, gigapixel, color grading, retouch, enhanced, PBR, Blender, V-ray, Procreate, zBrush, Unreal Engine 5, cinematic, volumetric, dramatic, neon lighting, wide angle lens ,no digital painting blur", "The parametric hotel lobby is a sleek and modern space with plenty of natural light. The lobby is spacious and open with a variety of seating options. The front desk is a sleek white counter with a parametric design. The walls are a light blue color with parametric patterns. The floor is a light wood color with a parametric design. There are plenty of plants and flowers throughout the space. The overall effect is a calm and relaxing space. 
occlusion, moody, sunset, concept art, octane rendering, 8k, highly detailed, concept art, highly detailed, beautiful scenery, cinematic, beautiful light, hyperreal, octane render, hdr, long exposure, 8K, realistic, fog, moody, fire and explosions, smoke, 50mm f2.8", ] with gr.Blocks(css="scripts/style.css") as demo: gr.Markdown(DESCRIPTION) gr.DuplicateButton( value="Duplicate Space for private use", elem_id="duplicate-button", visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1", ) with gr.Group(): with gr.Row(): prompt = gr.Text( label="Prompt", show_label=False, max_lines=1, placeholder="Enter your prompt", container=False, ) run_button = gr.Button("Run", scale=0) result = gr.Gallery(label="Result", columns=NUM_IMAGES_PER_PROMPT, show_label=False) with gr.Accordion("Advanced options", open=False): with gr.Row(): use_negative_prompt = gr.Checkbox(label="Use negative prompt", value=False, visible=True) negative_prompt = gr.Text( label="Negative prompt", max_lines=1, placeholder="Enter a negative prompt", visible=True, ) style_selection = gr.Radio( show_label=True, container=True, interactive=True, choices=STYLE_NAMES, value=DEFAULT_STYLE_NAME, label="Image Style", ) seed = gr.Slider( label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0, ) randomize_seed = gr.Checkbox(label="Randomize seed", value=True) with gr.Row(visible=True): width = gr.Slider( label="Width", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=1024, ) height = gr.Slider( label="Height", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=1024, ) with gr.Row(): inference_steps = gr.Slider( label="LCM inference steps", minimum=1, maximum=30, step=1, value=4, ) gr.Examples( examples=examples, inputs=prompt, outputs=[result, seed], fn=generate, cache_examples=CACHE_EXAMPLES, ) use_negative_prompt.change( fn=lambda x: gr.update(visible=x), inputs=use_negative_prompt, outputs=negative_prompt, api_name=False, ) gr.on( triggers=[ prompt.submit, negative_prompt.submit, run_button.click, ], 
fn=generate, inputs=[ prompt, negative_prompt, style_selection, use_negative_prompt, seed, width, height, inference_steps, randomize_seed, ], outputs=[result, seed], api_name="run", ) if __name__ == "__main__": demo.queue(max_size=20).launch(server_name="0.0.0.0", server_port=PORT, debug=True) ================================================ FILE: PixArt-alpha-ToCa/app/style.css ================================================ .gradio-container{width:680px!important} ================================================ FILE: PixArt-alpha-ToCa/app/style_controlnet.css ================================================ .gradio-container{width:768px!important} ================================================ FILE: PixArt-alpha-ToCa/asset/docs/pixart-dreambooth.md ================================================ # 🔥 How to Train PixArt + Dreambooth - PixArt + [Dreambooth](https://dreambooth.github.io/)
You **ONLY** need to change the **config** file in [config](../../configs/pixart_app_config/PixArt_xl2_img1024_dreambooth.py) and **dataloader** in [dataset](../../diffusion/data/datasets/Dreambooth.py). The directory structure for Dreambooth dataset is: ``` cd ./data/dreambooth dataset ├──dog6/ │ ├──00.jpg │ ├──01.jpg │ ├──...... ├──cat/ │ ├──00.jpg │ ├──01.jpg │ ├──...... ``` To get started, first install the required dependencies, then run on your local machine: ```bash cd data/ git clone https://github.com/google/dreambooth.git python -m torch.distributed.launch --nproc_per_node=1 --master_port=26666 train_scripts/train_dreambooth.py configs/pixart_app_config/PixArt_xl2_img1024_dreambooth.py --work-dir output/path ``` ================================================ FILE: PixArt-alpha-ToCa/asset/docs/pixart.md ================================================ [//]: # ((reference from [hugging Face](https://github.com/huggingface/diffusers/blob/docs/8bit-inference-pixart/docs/source/en/api/pipelines/pixart.md))) ## Running the `PixArtAlphaPipeline` in under 8GB GPU VRAM It is possible to run the [`PixArtAlphaPipeline`] under 8GB GPU VRAM by loading the text encoder in 8-bit numerical precision. Let's walk through a full-fledged example. 
First, install the `bitsandbytes` library: ```bash pip install -U bitsandbytes ``` Then load the text encoder in 8-bit: ```python from transformers import T5EncoderModel from diffusers import PixArtAlphaPipeline text_encoder = T5EncoderModel.from_pretrained( "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder", load_in_8bit=True, device_map="auto", ) pipe = PixArtAlphaPipeline.from_pretrained( "PixArt-alpha/PixArt-XL-2-1024-MS", text_encoder=text_encoder, transformer=None, device_map="auto" ) ``` Now, use the `pipe` to encode a prompt: ```python with torch.no_grad(): prompt = "cute cat" prompt_embeds, prompt_attention_mask, negative_embeds, negative_prompt_attention_mask = pipe.encode_prompt(prompt) del text_encoder del pipe flush() ``` `flush()` is just a utility function to clear the GPU VRAM and is implemented like so: ```python import gc def flush(): gc.collect() torch.cuda.empty_cache() ``` Then compute the latents providing the prompt embeddings as inputs: ```python pipe = PixArtAlphaPipeline.from_pretrained( "PixArt-alpha/PixArt-XL-2-1024-MS", text_encoder=None, torch_dtype=torch.float16, ).to("cuda") latents = pipe( negative_prompt=None, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, prompt_attention_mask=prompt_attention_mask, negative_prompt_attention_mask=negative_prompt_attention_mask, num_images_per_prompt=1, output_type="latent", ).images del pipe.transformer flush() ``` Notice that while initializing `pipe`, you're setting `text_encoder` to `None` so that it's not loaded. Once the latents are computed, pass it off the VAE to decode into a real image: ```python with torch.no_grad(): image = pipe.vae.decode(latents / pipe.vae.config.scaling_factor, return_dict=False)[0] image = pipe.image_processor.postprocess(image, output_type="pil") image.save("cat.png") ``` All of this, put together, should allow you to run [`PixArtAlphaPipeline`] under 8GB GPU VRAM. 
![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/pixart/8bits_cat.png) Find the script [here](https://gist.github.com/sayakpaul/3ae0f847001d342af27018a96f467e4e) that can be run end-to-end to report the memory being used. Text embeddings computed in 8-bit can have an impact on the quality of the generated images because of the information loss in the representation space induced by the reduced precision. It's recommended to compare the outputs with and without 8-bit. ================================================ FILE: PixArt-alpha-ToCa/asset/docs/pixart_comfyui.md ================================================ ## 🔥 How to use PixArt in ComfyUI ### 1. Prepare the PixArt running environment ```bash cd /workspace conda create -n pixart python==3.9.0 conda activate pixart pip install torch==2.0.0+cu117 torchvision==0.15.1+cu117 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu117 git clone https://github.com/PixArt-alpha/PixArt-alpha.git cd PixArt-alpha pip install -r requirements.txt ``` ### 2. Install ComfyUI related dependencies ```bash cd /workspace git clone https://github.com/comfyanonymous/ComfyUI.git cd ComfyUI git clone https://github.com/city96/ComfyUI_ExtraModels custom_nodes/ComfyUI_ExtraModels ``` ### 3. Download all the checkpoints (PixArt, VAE, T5) with the script ```bash cd /workspace/PixArt-alpha python tools/download.py --model_names "PixArt-XL-2-1024-MS.pth" ``` or download them from these URLs: [PixArt ckpt](https://huggingface.co/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-1024-MS.pth), [VAE ckpt](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/sd-vae-ft-ema), [T5 ckpt](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl). ### 4. Put the checkpoints into the corresponding folders ```bash cd /workspace/ComfyUI mv /path/to/PixArt-XL-2-1024-MS.pth ./models/checkpoints/ mv /path/to/sd-vae-ft-ema ./models/VAE/ mv /path/to/t5-v1_1-xxl ./models/t5/ ``` ### 5.
Run the ComfyUI website ```bash cd /workspace/ComfyUI python main.py --port 11111 --listen 0.0.0.0 ``` Open http://your-server-ip:11111 to play with PixArt. ### 6. Create your own custom nodes Here we prepare two examples for better understanding: 1) [PixArt Text-to-Image workflow](https://huggingface.co/PixArt-alpha/PixArt-alpha/blob/main/PixArt-image-to-image-workflow.json) 2) [PixArt Image-to-Image workflow](https://huggingface.co/PixArt-alpha/PixArt-alpha/blob/main/PixArt-image-to-image-workflow.json) Once you have downloaded these JSON files, open your server website at `http://your-server-ip:11111` and drop a JSON file into the browser window to begin the PixArt-ComfyUI playground. ================================================ FILE: PixArt-alpha-ToCa/asset/docs/pixart_controlnet.md ================================================ ## 🔥 ControlNet We incorporate a [ControlNet](https://github.com/lllyasviel/ControlNet)-like module that enables fine-grained control over text-to-image diffusion models. We introduce a novel ControlNet-Transformer architecture, specifically tailored for Transformers, achieving explicit controllability alongside high-quality image generation. For more details about PixArt-ControlNet, please check the technical report [PixArt-δ](https://arxiv.org/abs/2401.05252).

## Training the `PixArt + ControlNet` on your machine ```bash # Train on 1024px python -m torch.distributed.launch --nproc_per_node=2 --master_port=12345 train_scripts/train_controlnet.py configs/pixart_app_config/PixArt_xl2_img1024_controlHed.py --work-dir output/pixartcontrolnet-xl2-img1024 # Train on 512px python -m torch.distributed.launch --nproc_per_node=2 --master_port=12345 train_scripts/train_controlnet.py configs/pixart_app_config/PixArt_xl2_img512_controlHed.py --work-dir output/pixartcontrolnet-xl2-img512 ``` ## Testing the `PixArt + ControlNet` ```bash # Test on 1024px DEMO_PORT=12345 python app/app_controlnet.py configs/pixart_app_config/PixArt_xl2_img1024_controlHed.py --model_path path/to/1024px/PixArt-XL-2-1024-ControlNet.pth # Test on 512px DEMO_PORT=12345 python app/app_controlnet.py configs/pixart_app_config/PixArt_xl2_img512_controlHed.py --model_path path/to/512px/pixart_controlnet_ckpt ``` Then open http://your-server-ip:12345 to try a simple example. ================================================ FILE: PixArt-alpha-ToCa/asset/docs/pixart_inpaint.md ================================================ ```python import torch from scripts.pipeline_pixart_inpaint import PixArtAlphaInpaintPipeline from PIL import Image pipe = PixArtAlphaInpaintPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16) prompt = "" image = Image.open('') mask_image = Image.open('') out = pipe(prompt, image=image, mask_image=mask_image, strength=1.0).images[0] out.save('./cactus_removed.png') ``` ================================================ FILE: PixArt-alpha-ToCa/asset/docs/pixart_lcm.md ================================================

## 🔥 Why We Need PixArt-LCM Following [LCM LoRA](https://huggingface.co/blog/lcm_lora), we illustrate the generation speed we achieve on various computers. Let us stress again how liberating it is to explore image generation so easily with PixArt-LCM. | Hardware | PixArt-LCM (4 steps) | SDXL LoRA LCM (4 steps) | PixArt standard (14 steps) | SDXL standard (25 steps) | |-----------------------------|----------------------|-------------------------|----------------------------|---------------------------| | T4 (Google Colab Free Tier) | 3.3s | 8.4s | 16.0s | 26.5s | | A100 (80 GB) | 0.51s | 1.2s | 2.2s | 3.8s | | V100 (32 GB) | 0.8s | 1.2s | 5.5s | 7.7s | These tests were run with a batch size of 1 in all cases. For cards with a lot of capacity, such as A100, performance increases significantly when generating multiple images at once, which is usually the case for production workloads. ## Training the `PixArt + LCM` on your machine ```bash python -m torch.distributed.launch --nproc_per_node=2 --master_port=12345 train_scripts/train_pixart_lcm.py configs/pixart_config/PixArt_xl2_img1024_lcm.py --work-dir output/pixartlcm-xl2-img1024_ft ``` ## Training the `PixArt + LCM-LoRA` ```bash python -m torch.distributed.launch --nproc_per_node=2 --master_port=12345 train_scripts/train_pixart_lcm_lora.py configs/pixart_config/PixArt_xl2_img1024_lcm.py --work-dir output/pixartlcm-lora-xl2-img1024_ft ``` ## Testing the `PixArt + LCM` on your machine ```bash DEMO_PORT=12345 python app/app_lcm.py ``` Then have a look at a simple example at http://your-server-ip:12345 ## Testing the `PixArt + LCM-LoRA` ```bash DEMO_PORT=12345 python app/app_lcm.py --is_lora --lora_repo_id output/pixartlcm-lora-xl2-img1024_ft/checkpoint-xxx ``` Then have a look at a simple example at http://your-server-ip:12345 ## Integration in diffusers ### Using in 🧨 diffusers Make sure you have the updated versions of the following libraries: ```bash pip install -U transformers accelerate diffusers
``` And then: ```python import torch from diffusers import PixArtAlphaPipeline, AutoencoderKL # for PixArt-LCM pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-LCM-XL-2-1024-MS", torch_dtype=torch.float16, use_safetensors=True) # for PixArt-LCM-LoRA # transformer = Transformer2DModel.from_pretrained("PixArt-alpha/PixArt-LCM-XL-2-1024-MS", subfolder="transformer", torch_dtype=torch.float16) # transformer = PeftModel.from_pretrained(transformer, "PixArt-alpha/PixArt-LCM-LoRA-XL-2-1024-MS") # pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-LCM-XL-2-1024-MS", transformer=transformer, torch_dtype=torch.float16, use_safetensors=True) # del transformer # Enable memory optimizations. pipe.enable_model_cpu_offload() prompt = "A small cactus with a happy face in the Sahara desert." image = pipe(prompt, guidance_scale=0., num_inference_steps=4).images[0] ``` This integration allows running the pipeline with a batch size of 4 under 11 GBs of GPU VRAM. Check out the [documentation](https://huggingface.co/docs/diffusers/main/en/api/pipelines/pixart) to learn more. # Keeping updating ================================================ FILE: PixArt-alpha-ToCa/asset/docs/sasolver.md ================================================ ## SA-Solver: Stochastic Adams Solver for Fast Sampling of Diffusion Models (Neurips 2023)
> [**SA-Solver: Stochastic Adams Solver for Fast Sampling of Diffusion Models (Neurips 2023)**](https://arxiv.org/pdf/2309.05019.pdf)
> [Shuchen Xue*](https://github.com/scxue), [Mingyang Yi]()†, > [Weijian Luo](), [Shifeng Zhang](), [Jiacheng Sun](), > [Zhenguo Li](https://scholar.google.com/citations?user=XboZC1AAAAAJ), > [Zhi-Ming Ma]() >
University of Chinese Academy of Sciences, Huawei Noah’s Ark Lab, Peking University
--- ## 🐱 Abstract SA-Solver is a stochastic diffusion sampler based on the Stochastic Adams Method. It is training-free and can be applied to pretrained diffusion models. It is a multistep SDE solver that can do fast stochastic sampling. 1. The parameter 'tau function' controls the stochasticity in the sampling process. Inspired by EDM, we choose the 'tau function' to be a piecewise constant function that is greater than 0 in the middle stage of the sampling process and equals zero in the start and end stages. Specifically, we choose the default value of this parameter to be ```python tau_func = lambda t: 1 if t >= 200 and t <= 800 else 0 ``` in the diffusers library and ```python tau_t = lambda t: eta if 0.2 <= t <= 0.8 else 0 ``` in the ldm library. (The difference is due to the time rescaling by a factor of 1000.) The value '1' represents the magnitude of stochasticity. Higher values are recommended with more NFEs. If you want to employ deterministic sampling (solving the diffusion ODE) in SA-Solver, please set ```python tau_func = lambda t: 0 ``` If you want to employ the original stochastic sampling (solving the original diffusion SDE) in SA-Solver, please set ```python tau_func = lambda t: 1 ``` 2. The parameters 'predictor_order' and 'corrector_order' control the specific orders of 'SA-Predictor' and 'SA-Corrector'. For unconditional generation and conditional generation with a small classifier-free guidance scale, the recommended orders are 'predictor_order = 3' and 'corrector_order = 4'; for conditional generation with a large classifier-free guidance scale (e.g. t2i), the recommended orders are 'predictor_order = 2' and 'corrector_order = 2'.
================================================ FILE: PixArt-alpha-ToCa/asset/examples.py ================================================ examples = [ [ "A small cactus with a happy face in the Sahara desert.", "dpm-solver", 20, 4.5, "https://github.com/PixArt-alpha/PixArt-alpha.github.io/blob/master/static/images/carousel/carousel1.png", "Prompt: A small cactus with a happy face in the Sahara desert. \nSize: --ar 1:1.", "Model path: PixArt-XL-2-1024x1024.pt.\nBase image size: 1024, \nSampling Algo: dpm-solver"], [ "Pirate ship trapped in a cosmic maelstrom nebula, rendered in cosmic beach whirlpool engine, volumetric lighting, " "spectacular, ambient lights, light pollution, cinematic atmosphere, art nouveau style, illustration art artwork by SenseiJaye, " "intricate detail. --ar 6144:4096.", "dpm-solver", 20, 4.5, "https://github.com/PixArt-alpha/PixArt-alpha.github.io/blob/master/static/images/samples/15.png", "Prompt: Pirate ship trapped in a cosmic maelstrom nebula, rendered in cosmic beach whirlpool engine, volumetric lighting, " "spectacular, ambient lights, light pollution, cinematic atmosphere, art nouveau style, illustration art artwork by SenseiJaye, " "intricate detail.\nSize: --ar 6144:4096.", "Model path: PixArt-XL-2-1024x1024.pt.\nBase image size: 1024, \nSampling Algo: dpm-solver"], [ "stars, water, brilliantly, gorgeous large scale scene, a little girl, in the style of dreamy realism, light gold and amber, " "blue and pink, brilliantly illuminated in the background.", "dpm-solver", 20, 4.5, "https://github.com/PixArt-alpha/PixArt-alpha.github.io/blob/master/static/images/samples/13.png", "stars, water, brilliantly, gorgeous large scale scene, a little girl, in the style of dreamy realism, light gold and amber, blue and pink, brilliantly illuminated in the background.", "Model path: PixArt-XL-2-1024x1024.pt.\nBase image size: 1024, \nSampling Algo: dpm-solver"], [ "nature vs human nature, surreal, UHD, 8k, hyper details, rich colors, photograph.", 
"dpm-solver", 20, 4.5, "https://github.com/PixArt-alpha/PixArt-alpha.github.io/blob/master/static/images/samples/14.png", "nature vs human nature, surreal, UHD, 8k, hyper details, rich colors, photograph.", "Model path: PixArt-XL-2-1024x1024.pt.\nBase image size: 1024, \nSampling Algo: dpm-solver"], ] ================================================ FILE: PixArt-alpha-ToCa/asset/samples.txt ================================================ A small cactus with a happy face in the Sahara desert. Pirate ship trapped in a cosmic maelstrom nebula, rendered in cosmic beach whirlpool engine, volumetric lighting, spectacular, ambient lights, light pollution, cinematic atmosphere, art nouveau style, illustration art artwork by SenseiJaye, intricate detail. beautiful lady, freckles, big smile, blue eyes, short ginger hair, dark makeup, wearing a floral blue vest top, soft light, dark grey background stars, water, brilliantly, gorgeous large scale scene, a little girl, in the style of dreamy realism, light gold and amber, blue and pink, brilliantly illuminated in the background. nature vs human nature, surreal, UHD, 8k, hyper details, rich colors, photograph. 
Spectacular Tiny World in the Transparent Jar On the Table, interior of the Great Hall, Elaborate, Carved Architecture, Anatomy, Symetrical, Geometric and Parameteric Details, Precision Flat line Details, Pattern, Dark fantasy, Dark errie mood and ineffably mysterious mood, Technical design, Intricate Ultra Detail, Ornate Detail, Stylized and Futuristic and Biomorphic Details, Architectural Concept, Low contrast Details, Cinematic Lighting, 8k, by moebius, Fullshot, Epic, Fullshot, Octane render, Unreal ,Photorealistic, Hyperrealism anthropomorphic profile of the white snow owl Crystal priestess , art deco painting, pretty and expressive eyes, ornate costume, mythical, ethereal, intricate, elaborate, hyperrealism, hyper detailed, 3D, 8K, Ultra Realistic, high octane, ultra resolution, amazing detail, perfection, In frame, photorealistic, cinematic lighting, visual clarity, shading , Lumen Reflections, Super-Resolution, gigapixel, color grading, retouch, enhanced, PBR, Blender, V-ray, Procreate, zBrush, Unreal Engine 5, cinematic, volumetric, dramatic, neon lighting, wide angle lens ,no digital painting blur The parametric hotel lobby is a sleek and modern space with plenty of natural light. The lobby is spacious and open with a variety of seating options. The front desk is a sleek white counter with a parametric design. The walls are a light blue color with parametric patterns. The floor is a light wood color with a parametric design. There are plenty of plants and flowers throughout the space. The overall effect is a calm and relaxing space. occlusion, moody, sunset, concept art, octane rendering, 8k, highly detailed, concept art, highly detailed, beautiful scenery, cinematic, beautiful light, hyperreal, octane render, hdr, long exposure, 8K, realistic, fog, moody, fire and explosions, smoke, 50mm f2.8 Bright scene, aerial view, ancient city, fantasy, gorgeous light, mirror reflection, high detail, wide angle lens. 
8k uhd A man looks up at the starry sky, lonely and ethereal, Minimalism, Chaotic composition Op Art A middle-aged woman of Asian descent, her dark hair streaked with silver, appears fractured and splintered, intricately embedded within a sea of broken porcelain. The porcelain glistens with splatter paint patterns in a harmonious blend of glossy and matte blues, greens, oranges, and reds, capturing her dance in a surreal juxtaposition of movement and stillness. Her skin tone, a light hue like the porcelain, adds an almost mystical quality to her form. A 4k dslr image of a lemur wearing a red magician hat and a blue coat performing magic tricks with cards in a garden. A alpaca made of colorful building blocks, cyberpunk A baby painter trying to draw very simple picture, white background A boy and a girl fall in love A dog that has been meditating all the time A man is sitting in a chair with his chin resting on his hand. The chair, along with the man's feet, are submerged in the sea. Strikingly, the man's back is on fire. A painter study hard to learn how to draw with many concepts in the air, white background A painter with low quality, white background, pixel art A person standing on the desert, desert waves, gossip illustration, half red, half blue, abstract image of sand, clear style, trendy illustration, outdoor, top view, clear style, precision art, ultra high definition image A silhouette of a grand piano overlooking a dusky cityscape viewed from a top-floor penthouse, rendered in the bold and vivid sytle of a vintage travel poster. A sureal parallel world where mankind avoid extinction by preserving nature, epic trees, water streams, various flowers, intricate details, rich colors, rich vegetation, cinematic, symmetrical, beautiful lighting, V-Ray render, sun rays, magical lights, photography A woman is shopping for fresh produce at the farmer's market. 
A worker that looks like a mixture of cow and horse is working hard to type code A young man dressed in ancient Chinese clothing, Asian people, White robe, Handsome, Hand gestures forming a spell, Martial arts and fairy-like vibe, Carrying a legendary-level giant sword on the back, Game character, Surrounded by runes, Cyberpunk style, neon lights, best quality, masterpiece, cg, hdr, high-definition, extremely detailed, photorealistic, epic, character design, detailed face, superhero, hero, detailed UHD, real-time, vfx, 3D rendering, 8k An alien octopus floats through a protal reading a newspaper An epressive oil painting of a basketbal player dunking, depicted as an explosion of a nebula art collection style and fashion shoot, in the style of made of glass, dark blue and light pink, paul rand, solarpunk, camille vivier, beth didonato hair, barbiecore, hyper-realistic artistic beautiful secen Crocodile in a sweater Design a letter A, 3D stereoscopic Ice material Interior light blue Conceptual product design Futuristic Blind box toy Handcrafted Exquisite 3D effect Full body display Ultra-high precision Ultra-detailed Perfect lighting OC Renderer Blender 8k Ultra-sharp Ultra-noise reduction Floating,colossal,futuristic statue in the sky, awe-inspiring and serenein the style of Stuart Lippincott:2with detailed composition and subtle geometric elements.This sanctuary-ike atmosphere features crisp clarity and soft amber tones.In contrasttiny human figures surround the statueThe pieceincorporates flowing draperiesreminiscent of Shwedoff and Philip McKay's stylesemphasizing thejuxtaposition between the powerful presence of the statue and thevulnerability of the minuscule human figuresshwedoff knolling of a drawing tools for painter Leonardo da Vinci's Last Supper content, Van Goph's Starry Night Style Luffy from ONEPIECE, handsome face, fantasy photography shot through an outdoor window of a coffee shop with neon sign lighting, window glares and reflections, depth of 
field, {little girl with red hair sitting at a table, portrait, kodak portra 800,105 mm f1.8 poster of a mechanical cat, techical Schematics viewed from front and side view on light white blueprint paper, illustartion drafting style, illustation, typography, conceptual art, dark fantasy steampunk, cinematic, dark fantasy The girl in the car is filled with goldfish and flowers, goldfish can fly, Kawaguchi Renko's art, natural posture, holiday dadcore, youthful energy and pressure, body stretching, goldfish simulation movies in the sky, super details, and dreamy high photography. Colorful. Covered by water and goldfish, indoor scene, close-up shot in XT4 movie The image features a woman wearing a red shirt with an icon. She appears to be posing for the camera, and her outfit includes a pair of jeans. The woman seems to be in a good mood, as she is smiling. The background of the image is blurry, focusing more on the woman and her attire. The towel was on top of the hard counter. A vast landscape made entirely of various meats spreads out before the viewer. tender, succulent hills of roast beef, chicken drumstick trees, bacon rivers, and ham boulders create a surreal, yet appetizing scene. the sky is adorned with pepperoni sun and salami clouds. I want to supplement vitamin c, please help me paint related food. A vibrant yellow banana-shaped couch sits in a cozy living room, its curve cradling a pile of colorful cushions. on the wooden floor, a patterned rug adds a touch of eclectic charm, and a potted plant sits in the corner, reaching towards the sunlight filtering through the window. A transparent sculpture of a duck made out of glass. The sculpture is in front of a painting of a landscape. A blue jay standing on a large basket of rainbow macarons. A bucket bag made of blue suede. The bag is decorated with intricate golden paisley patterns. The handle of the bag is made of rubies and pearls. An alien octopus floats through a portal reading a newspaper. 
bird's eye view of a city. beautiful scene A 2D animation of a folk music band composed of anthropomorphic autumn leaves, each playing traditional bluegrass instruments, amidst a rustic forest setting dappled with the soft light of a harvest moon. In front of a deep black backdrop, a figure of middle years, her Tongan skin rich and glowing, is captured mid-twirl, her curly hair flowing like a storm behind her. Her attire resembles a whirlwind of marble and porcelain fragments. Illuminated by the gleam of scattered porcelain shards, creating a dreamlike atmosphere, the dancer manages to appear fragmented, yet maintains a harmonious and fluid form. Digital illustration of a beach scene crafted from yarn. The sandy beach is depicted with beige yarn, waves are made of blue and white yarn crashing onto the shore. A yarn sun sets on the horizon, casting a warm glow. Yarn palm trees sway gently, and little yarn seashells dot the shoreline. Illustration of a chic chair with a design reminiscent of a pumpkin’s form, with deep orange cushioning, in a stylish loft setting. A detailed oil painting of an old sea captain, steering his ship through a storm. Saltwater is splashing against his weathered face, determination in his eyes. Twirling malevolent clouds are seen above and stern waves threaten to submerge the ship while seagulls dive and twirl through the chaotic landscape. Thunder and lights embark in the distance, illuminating the scene with an eerie green glow. An illustration of a human heart made of translucent glass, standing on a pedestal amidst a stormy sea. Rays of sunlight pierce the clouds, illuminating the heart, revealing a tiny universe within. The quote 'Find the universe within you' is etched in bold letters across the horizon. A modern architectural building with large glass windows, situated on a cliff overlooking a serene ocean at sunset photo of an ancient shipwreck nestled on the ocean floor. 
Marine plants have claimed the wooden structure, and fish swim in and out of its hollow spaces. Sunken treasures and old cannons are scattered around, providing a glimpse into the past A 3D render of a coffee mug placed on a window sill during a stormy day. The storm outside the window is reflected in the coffee, with miniature lightning bolts and turbulent waves seen inside the mug. The room is dimly lit, adding to the dramatic atmosphere.A minimap diorama of a cafe adorned with indoor plants. Wooden beams crisscross above, and a cold brew station stands out with tiny bottles and glasses. An antique botanical illustration drawn with fine lines and a touch of watercolour whimsy, depicting a strange lily crossed with a Venus flytrap, its petals poised as if ready to snap shut on any unsuspecting insects.An illustration inspired by old-world botanical sketches blends a cactus with lilac blooms into a Möbius strip, using detailed lines and subtle watercolor touches to capture nature's diverse beauty and mathematical intrigue. An ink sketch style illustration of a small hedgehog holding a piece of watermelon with its tiny paws, taking little bites with its eyes closed in delight.Photo of a lychee-inspired spherical chair, with a bumpy white exterior and plush interior, set against a tropical wallpaper. 3d digital art of an adorable ghost, glowing within, holding a heart shaped pumpkin, Halloween, super cute, spooky haunted house background professional portrait photo of an anthropomorphic cat wearing fancy gentleman hat and jacket walking in autumn forest. 
# Base PixArt-XL/2 training config for the 'InternalData' dataset type.
# Other configs in this directory tree list this file in `_base_` and override
# individual values.
data_root = '/data/data'
data = dict(type='InternalData', root='images', image_list_json=['data_info.json'], transform='default_train', load_vae_feat=True)
image_size = 256  # resolution of the generated images
train_batch_size = 32
eval_batch_size = 16
use_fsdp=False  # whether to use FSDP mode
valid_num=0  # an aspect ratio is treated as valid when its sample count >= valid_num

# model settings
model = 'PixArt_XL_2'
aspect_ratio_type = None  # base aspect-ratio table [ASPECT_RATIO_512 or ASPECT_RATIO_256]
multi_scale = False  # whether to train on the multi-scale dataset
lewei_scale = 1.0  # lewei_scale for positional embedding interpolation

# training settings
num_workers=4
train_sampling_steps = 1000
eval_sampling_steps = 250
model_max_length = 120
lora_rank = 4

num_epochs = 80
gradient_accumulation_steps = 1
grad_checkpointing = False
gradient_clip = 1.0
gc_step = 1
auto_lr = dict(rule='sqrt')

# We use a different weight decay from the official implementation because it gives better results.
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=3e-2, eps=1e-10)
lr_schedule = 'constant'
lr_schedule_args = dict(num_warmup_steps=500)

# checkpoint / sample-image saving cadence
save_image_epochs = 1
save_model_epochs = 1
save_model_steps=1000000

sample_posterior = True
mixed_precision = 'fp16'
scale_factor = 0.18215
ema_rate = 0.9999
tensorboard_mox_interval = 50
log_interval = 50
cfg_scale = 4
mask_type='null'
num_group_tokens=0
mask_loss_coef=0.
load_mask_index=False  # load the prepared mask_type index

# model loading settings
vae_pretrained = "/cache/pretrained_models/sd-vae-ft-ema"
load_from = None
resume_from = dict(checkpoint=None, load_ema=False, resume_optimizer=True, resume_lr_scheduler=True)
snr_loss=False

# work dir settings
work_dir = '/cache/exps/'
s3_work_dir = None

seed = 43
load_mask_index=False # load prepared mask_type index # load model settings vae_pretrained = "/cache/pretrained_models/sd-vae-ft-ema" load_from = None resume_from = dict(checkpoint=None, load_ema=False, resume_optimizer=True, resume_lr_scheduler=True) snr_loss=False # work dir settings work_dir = '/cache/exps/' s3_work_dir = None seed = 43 ================================================ FILE: PixArt-alpha-ToCa/configs/pixart_app_config/PixArt_xl2_img1024_controlHed.py ================================================ _base_ = ['../PixArt_xl2_internal.py'] data_root = 'data' image_list_json = ['data_info.json',] data = dict(type='InternalDataHed', root='InternData', image_list_json=image_list_json, transform='default_train', load_vae_feat=True) image_size = 1024 # model setting model = 'PixArtMS_XL_2' fp32_attention = False # Set to True if you got NaN loss load_from = 'path-to-pixart-checkpoints' vae_pretrained = "output/pretrained_models/sd-vae-ft-ema" window_block_indexes = [] window_size=0 use_rel_pos=False lewei_scale = 2.0 # training setting num_workers=10 train_batch_size = 4 # set the batch size according to your VRAM num_epochs = 10 # 3 gradient_accumulation_steps = 4 grad_checkpointing = True gradient_clip = 0.01 optimizer = dict(type='AdamW', lr=2e-5, weight_decay=3e-2, eps=1e-10) lr_schedule_args = dict(num_warmup_steps=0) save_model_epochs=5 save_model_steps=1000 log_interval = 20 eval_sampling_steps = 200 work_dir = 'output_debug/debug' # controlnet related params copy_blocks_num = 13 class_dropout_prob = 0.5 train_ratio = 1 ================================================ FILE: PixArt-alpha-ToCa/configs/pixart_app_config/PixArt_xl2_img1024_dreambooth.py ================================================ _base_ = ['../PixArt_xl2_internal.py'] data_root = 'data/dreambooth/dataset' data = dict(type='DreamBooth', root='dog6', prompt=['a photo of sks dog'], transform='default_train', load_vae_feat=True) image_size = 1024 # model setting model = 
'PixArtMS_XL_2' # model for multi-scale training fp32_attention = True load_from = 'Path/to/PixArt-XL-2-1024-MS.pth' vae_pretrained = "output/pretrained_models/sd-vae-ft-ema" window_block_indexes = [] window_size=0 use_rel_pos=False aspect_ratio_type = 'ASPECT_RATIO_1024' # base aspect ratio [ASPECT_RATIO_512 or ASPECT_RATIO_256] multi_scale = True # if use multiscale dataset model training lewei_scale = 2.0 # training setting num_workers=1 train_batch_size = 1 num_epochs = 200 gradient_accumulation_steps = 1 grad_checkpointing = True gradient_clip = 0.01 optimizer = dict(type='AdamW', lr=5e-6, weight_decay=3e-2, eps=1e-10) lr_schedule_args = dict(num_warmup_steps=0) auto_lr = None log_interval = 1 save_model_epochs=10000 save_model_steps=100 work_dir = 'output/debug' ================================================ FILE: PixArt-alpha-ToCa/configs/pixart_app_config/PixArt_xl2_img512_controlHed.py ================================================ _base_ = ['../PixArt_xl2_internal.py'] data_root = 'data' image_list_json = ['data_info.json',] data = dict(type='InternalDataHed', root='InternData', image_list_json=image_list_json, transform='default_train', load_vae_feat=True) image_size = 512 # model setting model = 'PixArt_XL_2' fp32_attention = False # Set to True if you got NaN loss load_from = 'path-to-pixart-checkpoints' vae_pretrained = "output/pretrained_models/sd-vae-ft-ema" window_block_indexes = [] window_size=0 use_rel_pos=False lewei_scale = 1.0 # training setting num_workers=10 train_batch_size = 12 # 32 # max 96 for DiT-L/4 when grad_checkpoint num_epochs = 1000 # 3 gradient_accumulation_steps = 4 grad_checkpointing = True gradient_clip = 0.01 optimizer = dict(type='AdamW', lr=2e-5, weight_decay=3e-2, eps=1e-10) lr_schedule_args = dict(num_warmup_steps=0) save_model_epochs=5 save_model_steps=1000 log_interval = 20 eval_sampling_steps = 200 work_dir = 'output_debug/debug' # controlnet related params copy_blocks_num = 13 class_dropout_prob = 0.5 train_ratio 
= 0.1 ================================================ FILE: PixArt-alpha-ToCa/configs/pixart_config/PixArt_xl2_img1024_internal.py ================================================ _base_ = ['../PixArt_xl2_internal.py'] data_root = 'data' image_list_json = ['data_info.json',] data = dict(type='InternalData', root='InternData', image_list_json=image_list_json, transform='default_train', load_vae_feat=True) image_size = 1024 # model setting window_block_indexes = [] window_size=0 use_rel_pos=False model = 'PixArt_XL_2' fp32_attention = True load_from = None vae_pretrained = "output/pretrained_models/sd-vae-ft-ema" lewei_scale = 2.0 # training setting num_workers=10 train_batch_size = 2 # 32 num_epochs = 200 # 3 gradient_accumulation_steps = 1 grad_checkpointing = True gradient_clip = 0.01 optimizer = dict(type='AdamW', lr=2e-5, weight_decay=3e-2, eps=1e-10) lr_schedule_args = dict(num_warmup_steps=1000) eval_sampling_steps = 200 log_interval = 20 save_model_epochs=1 save_model_steps=2000 work_dir = 'output/debug' ================================================ FILE: PixArt-alpha-ToCa/configs/pixart_config/PixArt_xl2_img1024_internalms.py ================================================ _base_ = ['../PixArt_xl2_internal.py'] data_root = 'data' image_list_json = ['data_info.json',] data = dict(type='InternalDataMS', root='InternData', image_list_json=image_list_json, transform='default_train', load_vae_feat=True) image_size = 1024 # model setting model = 'PixArtMS_XL_2' # model for multi-scale training fp32_attention = True load_from = None vae_pretrained = "output/pretrained_models/sd-vae-ft-ema" window_block_indexes = [] window_size=0 use_rel_pos=False aspect_ratio_type = 'ASPECT_RATIO_1024' # base aspect ratio [ASPECT_RATIO_512 or ASPECT_RATIO_256] multi_scale = True # if use multiscale dataset model training lewei_scale = 2.0 # training setting num_workers=10 train_batch_size = 12 # max 14 for PixArt-xL/2 when grad_checkpoint num_epochs = 10 # 3 
gradient_accumulation_steps = 1 grad_checkpointing = True gradient_clip = 0.01 optimizer = dict(type='AdamW', lr=2e-5, weight_decay=3e-2, eps=1e-10) lr_schedule_args = dict(num_warmup_steps=1000) save_model_epochs=1 save_model_steps=2000 log_interval = 20 eval_sampling_steps = 200 work_dir = 'output/debug' ================================================ FILE: PixArt-alpha-ToCa/configs/pixart_config/PixArt_xl2_img1024_lcm.py ================================================ _base_ = ['../PixArt_xl2_internal.py'] data_root = 'data' image_list_json = ['data_info.json',] data = dict(type='InternalDataMS', root='InternData', image_list_json=image_list_json, transform='default_train', load_vae_feat=True) image_size = 1024 # model setting model = 'PixArtMS_XL_2' # model for multi-scale training fp32_attention = False # Set to True if you got NaN loss load_from = None vae_pretrained = "output/pretrained_models/sd-vae-ft-ema" window_block_indexes = [] window_size=0 use_rel_pos=False aspect_ratio_type = 'ASPECT_RATIO_1024' # base aspect ratio [ASPECT_RATIO_512 or ASPECT_RATIO_256] multi_scale = True # if use multiscale dataset model training lewei_scale = 2.0 # training setting num_workers=4 train_batch_size = 16 # max 12 for PixArt-xL/2 when grad_checkpoint 16 for LCM-LoRA num_epochs = 10 # 3 gradient_accumulation_steps = 1 grad_checkpointing = True gradient_clip = 0.01 optimizer = dict(type='AdamW', lr=2e-5, weight_decay=0.0, eps=1e-10) # optimizer = dict(type='CAMEWrapper', lr=1e-7, weight_decay=0.0, betas=(0.9, 0.999, 0.9999), eps=(1e-30, 1e-16)) lr_schedule_args = dict(num_warmup_steps=100) save_model_epochs=1 save_model_steps=200 valid_num=0 # take as valid aspect-ratio when sample number >= valid_num log_interval = 10 eval_sampling_steps = 200 work_dir = 'output/debug' # LCM loss_type = 'huber' huber_c = 0.001 num_ddim_timesteps=50 w_max = 15.0 w_min = 3.0 ema_decay = 0.95 cfg_scale = 4.5 class_dropout_prob = 0. 
# 256px PixArt-XL/2 training on the SAM dataset type; values not set here come
# from the base config listed in `_base_`.
_base_ = ['../PixArt_xl2_sam.py']
data_root = 'data'

# Partition list files part0.txt ... part31.txt (32 files, in numeric order).
image_list_txt = [f'part{part_id}.txt' for part_id in range(32)]

data = dict(
    type='SAM',
    root='SA1B',
    image_list_txt=image_list_txt,
    transform='default_train',
    load_vae_feat=True,
)
image_size = 256

# ---- model ----
window_block_indexes = []
window_size = 0
use_rel_pos = False
model = 'PixArt_XL_2'
fp32_attention = True
load_from = None
vae_pretrained = "output/pretrained_models/sd-vae-ft-ema"

# ---- training ----
use_fsdp = False  # whether to use FSDP mode
num_workers = 10
train_batch_size = 176  # 32
num_epochs = 200  # 3
gradient_accumulation_steps = 1
grad_checkpointing = True
gradient_clip = 0.01
optimizer = dict(
    type='AdamW',
    lr=2e-5,
    weight_decay=3e-2,
    eps=1e-10,
)
lr_schedule_args = dict(num_warmup_steps=1000)

# ---- evaluation / logging / checkpoints ----
eval_sampling_steps = 200
log_interval = 20
save_model_epochs = 2
save_model_steps = 20000
work_dir = 'output/debug'
vae_pretrained = "output/pretrained_models/sd-vae-ft-ema" # training setting eval_sampling_steps = 200 num_workers=10 train_batch_size = 176 # 32 # max 96 for PixArt-L/4 when grad_checkpoint num_epochs = 200 # 3 gradient_accumulation_steps = 1 grad_checkpointing = True gradient_clip = 0.01 optimizer = dict(type='AdamW', lr=2e-5, weight_decay=3e-2, eps=1e-10) lr_schedule_args = dict(num_warmup_steps=1000) log_interval = 20 save_model_epochs=5 work_dir = 'output/debug' ================================================ FILE: PixArt-alpha-ToCa/configs/pixart_config/PixArt_xl2_img512_internal.py ================================================ _base_ = ['../PixArt_xl2_internal.py'] data_root = 'data' image_list_json = ['data_info.json',] data = dict(type='InternalData', root='InternData', image_list_json=image_list_json, transform='default_train', load_vae_feat=True) image_size = 512 # model setting window_block_indexes = [] window_size=0 use_rel_pos=False model = 'PixArt_XL_2' fp32_attention = True load_from = None vae_pretrained = "output/pretrained_models/sd-vae-ft-ema" lewei_scale = 1.0 # training setting use_fsdp=False # if use FSDP mode num_workers=10 train_batch_size = 38 # 32 num_epochs = 200 # 3 gradient_accumulation_steps = 1 grad_checkpointing = True gradient_clip = 0.01 optimizer = dict(type='AdamW', lr=2e-5, weight_decay=3e-2, eps=1e-10) lr_schedule_args = dict(num_warmup_steps=1000) eval_sampling_steps = 200 log_interval = 20 save_model_epochs=1 work_dir = 'output/debug' ================================================ FILE: PixArt-alpha-ToCa/configs/pixart_config/PixArt_xl2_img512_internalms.py ================================================ _base_ = ['../PixArt_xl2_internal.py'] data_root = 'data' image_list_json = ['data_info.json',] data = dict(type='InternalDataMS', root='InternData', image_list_json=image_list_json, transform='default_train', load_vae_feat=True) image_size = 512 # model setting model = 'PixArtMS_XL_2' # model for multi-scale training 
fp32_attention = True load_from = None vae_pretrained = "output/pretrained_models/sd-vae-ft-ema" window_block_indexes = [] window_size=0 use_rel_pos=False aspect_ratio_type = 'ASPECT_RATIO_512' # base aspect ratio [ASPECT_RATIO_512 or ASPECT_RATIO_256] multi_scale = True # if use multiscale dataset model training lewei_scale = 1.0 # training setting num_workers=10 train_batch_size = 40 # max 40 for PixArt-xL/2 when grad_checkpoint num_epochs = 20 # 3 gradient_accumulation_steps = 1 grad_checkpointing = True gradient_clip = 0.01 optimizer = dict(type='AdamW', lr=2e-5, weight_decay=3e-2, eps=1e-10) lr_schedule_args = dict(num_warmup_steps=1000) save_model_epochs=1 save_model_steps=2000 log_interval = 20 eval_sampling_steps = 200 work_dir = 'output/debug' ================================================ FILE: PixArt-alpha-ToCa/diffusion/__init__.py ================================================ # Modified from OpenAI's diffusion repos # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py from .iddpm import IDDPM from .dpm_solver import DPMS from .sa_sampler import SASolverSampler ================================================ FILE: PixArt-alpha-ToCa/diffusion/data/__init__.py ================================================ from .datasets import * from .transforms import get_transform ================================================ FILE: PixArt-alpha-ToCa/diffusion/data/builder.py ================================================ import os import time from mmcv import Registry, build_from_cfg from torch.utils.data import DataLoader from diffusion.data.transforms import get_transform from diffusion.utils.logger import get_root_logger DATASETS = Registry('datasets') DATA_ROOT = '/cache/data' def set_data_root(data_root): global DATA_ROOT DATA_ROOT 
@DATASETS.register_module()
class DreamBooth(Dataset):
    """Small in-memory dataset for DreamBooth fine-tuning.

    Every image found directly under ``root`` is loaded, transformed and
    VAE-encoded once in the constructor; ``__getitem__`` then only indexes the
    cached latents.

    Args:
        root: Dataset directory, resolved through ``get_data_path``.
        transform: Callable applied to each loaded image. It must return
            tensors of identical shape, because the results are stacked into
            one batch before encoding.
        resolution: Nominal image resolution, reported in ``data_info`` and by
            ``get_data_info``.
        **kwargs: Must contain ``aspect_ratio_type``, the name of an
            aspect-ratio table (the configs use e.g. ``'ASPECT_RATIO_1024'``).

    Raises:
        KeyError: If ``aspect_ratio_type`` is missing from ``kwargs``.
        ValueError: If no image is found under ``root`` or ``transform`` is
            ``None``.
    """

    def __init__(self, root, transform=None, resolution=1024, **kwargs):
        self.root = get_data_path(root)
        path = pathlib.Path(self.root)

        self.transform = transform
        self.resolution = resolution
        self.img_samples = sorted(
            [file for ext in IMAGE_EXTENSIONS for file in path.glob(f'*.{ext}')]
        )
        self.ori_imgs_nums = len(self)
        self.loader = default_loader

        # Fail fast with a readable message. Previously both situations fell
        # through to ``torch.stack([])`` and raised an opaque error, and only
        # after the VAE weights had already been loaded.
        if not self.img_samples:
            raise ValueError(
                f"DreamBooth: no images with extensions {sorted(IMAGE_EXTENSIONS)} found in '{self.root}'."
            )
        if self.transform is None:
            raise ValueError(
                "DreamBooth: a transform producing equally sized tensors is required to encode the images."
            )

        # The trailing number of the table name is used as the base size.
        self.base_size = int(kwargs['aspect_ratio_type'].split('_')[-1])
        # NOTE(review): eval() on a config-provided string. The name is presumably
        # one of the ASPECT_RATIO_* tables brought in by the star import from
        # diffusion.data.datasets.utils; only pass trusted config values here.
        self.aspect_ratio = eval(kwargs.pop('aspect_ratio_type'))  # base aspect ratio
        # Per-ratio sample counters, used by the batch sampler.
        self.ratio_nums = {float(ratio): 0 for ratio in self.aspect_ratio}
        self.data_info = {'img_hw': torch.tensor([resolution, resolution], dtype=torch.float32), 'aspect_ratio': 1.}

        # Encode all images once and keep only the sampled latents.
        with torch.inference_mode():
            vae = AutoencoderKL.from_pretrained("output/pretrained_models/sd-vae-ft-ema")
            imgs = []
            for img_path in self.img_samples:
                img = self.loader(img_path)
                # Every image is counted under the square (1.0) ratio.
                self.ratio_nums[1.0] += 1
                imgs.append(self.transform(img))
            imgs = torch.stack(imgs, dim=0)
            self.img_vae = vae.encode(imgs).latent_dist.sample()
            del vae  # the encoder is not needed after construction

    def __getitem__(self, index):
        """Return ``(latent, data_info)``; ``data_info`` is the same dict for every index."""
        return self.img_vae[index], self.data_info

    @staticmethod
    def vae_feat_loader(path):
        """Load a saved ``[mean, std]`` array from ``path`` and draw ``mean + std * noise``."""
        # [mean, std]
        mean, std = torch.from_numpy(np.load(path)).chunk(2)
        sample = randn_tensor(mean.shape, generator=None, device=mean.device, dtype=mean.dtype)
        return mean + std * sample

    def load_ori_img(self, img_path):
        """Open ``img_path`` and return it as a 256x256 center-cropped tensor."""
        # Load the image and convert it to a tensor.
        transform = T.Compose([
            T.Resize(256),  # Image.BICUBIC
            T.CenterCrop(256),
            T.ToTensor(),
        ])
        return transform(Image.open(img_path))

    def __len__(self):
        return len(self.img_samples)

    def __getattr__(self, name):
        # Only called for attributes that normal lookup did not find.
        # ``set_epoch`` is answered with a no-op so callers can invoke it unconditionally.
        if name == "set_epoch":
            return lambda epoch: None
        raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")

    def get_data_info(self, idx):
        """Return the nominal size of sample ``idx`` (identical for all samples)."""
        return {'height': self.resolution, 'width': self.resolution}
@DATASETS.register_module()
class InternalData(Dataset):
    """Dataset of images (or pre-computed VAE features) paired with pre-computed caption features.

    Sample metadata is read from JSON files under ``<root>/partition``. Entries
    whose ``ratio`` exceeds 4 are dropped. For every kept entry the image path,
    caption-feature path (.npz), VAE-feature path (.npy) and prompt are stored
    in parallel lists that share one index space.

    Args:
        root: Dataset directory, resolved through ``get_data_path``.
        image_list_json: One JSON file name or a list of them.
        transform: Image transform; ignored (reset to ``None``) when
            ``load_vae_feat`` is true.
        resolution: Resolution used in the VAE-feature directory name and
            reported in ``data_info``.
        sample_subset: Optional fraction of the dataset to keep (local debugging).
        load_vae_feat: Load pre-computed VAE features instead of images.
        input_size, patch_size: Only used to compute ``self.N``.
        mask_ratio, load_mask_index: Stored on the instance.
        max_length: Caption features are padded to this token length.
        config: Optional config object; when given, ``config.work_dir`` decides
            where the log file is written.
    """

    def __init__(self,
                 root,
                 image_list_json='data_info.json',
                 transform=None,
                 resolution=256,
                 sample_subset=None,
                 load_vae_feat=False,
                 input_size=32,
                 patch_size=2,
                 mask_ratio=0.0,
                 load_mask_index=False,
                 max_length=120,
                 config=None,
                 **kwargs):
        self.root = get_data_path(root)
        self.transform = transform
        self.load_vae_feat = load_vae_feat
        self.ori_imgs_nums = 0
        self.resolution = resolution
        self.N = int(resolution // (input_size // patch_size))
        self.mask_ratio = mask_ratio
        self.load_mask_index = load_mask_index
        self.max_lenth = max_length  # (sic) attribute name is also read by subclasses
        self.meta_data_clean = []
        self.img_samples = []
        self.txt_feat_samples = []
        self.vae_feat_samples = []
        self.mask_index_samples = []
        self.prompt_samples = []

        image_list_json = image_list_json if isinstance(image_list_json, list) else [image_list_json]
        for json_file in image_list_json:
            meta_data = self.load_json(os.path.join(self.root, 'partition', json_file))
            self.ori_imgs_nums += len(meta_data)
            meta_data_clean = [item for item in meta_data if item['ratio'] <= 4]
            self.meta_data_clean.extend(meta_data_clean)
            self.img_samples.extend([os.path.join(self.root.replace('InternData', "InternImgs"), item['path']) for item in meta_data_clean])
            # Feature file names flatten the last path separator: 'a/b.png' -> 'a_b.npz' / 'a_b.npy'.
            self.txt_feat_samples.extend([os.path.join(self.root, 'caption_feature_wmask', '_'.join(item['path'].rsplit('/', 1)).replace('.png', '.npz')) for item in meta_data_clean])
            self.vae_feat_samples.extend([os.path.join(self.root, f'img_vae_features_{resolution}resolution/noflip', '_'.join(item['path'].rsplit('/', 1)).replace('.png', '.npy')) for item in meta_data_clean])
            self.prompt_samples.extend([item['prompt'] for item in meta_data_clean])

        # Set loader and extensions
        if load_vae_feat:
            self.transform = None
            self.loader = self.vae_feat_loader
        else:
            self.loader = default_loader

        if sample_subset is not None:
            self.sample_subset(sample_subset)  # sample dataset for local debug
        logger = get_root_logger() if config is None else get_root_logger(os.path.join(config.work_dir, 'train_log.log'))
        logger.info(f"T5 max token length: {self.max_lenth}")

    def getdata(self, index):
        """Load one sample.

        Returns:
            ``(img, txt_fea, attention_mask, data_info)``. The caption feature
            and its mask are padded along the token axis up to
            ``self.max_lenth``; ``data_info`` carries ``img_hw``,
            ``aspect_ratio`` and ``prompt``.
        """
        img_path = self.img_samples[index]
        npz_path = self.txt_feat_samples[index]
        npy_path = self.vae_feat_samples[index]
        prompt = self.prompt_samples[index]
        data_info = {
            'img_hw': torch.tensor([torch.tensor(self.resolution), torch.tensor(self.resolution)], dtype=torch.float32),
            'aspect_ratio': torch.tensor(1.)
        }

        img = self.loader(npy_path) if self.load_vae_feat else self.loader(img_path)
        txt_info = np.load(npz_path)
        txt_fea = torch.from_numpy(txt_info['caption_feature'])  # 1xTx4096
        attention_mask = torch.ones(1, 1, txt_fea.shape[1])  # 1x1xT
        if 'attention_mask' in txt_info.keys():
            attention_mask = torch.from_numpy(txt_info['attention_mask'])[None]
        if txt_fea.shape[1] != self.max_lenth:
            # Pad features by repeating the last token; pad the mask with zeros.
            txt_fea = torch.cat([txt_fea, txt_fea[:, -1:].repeat(1, self.max_lenth-txt_fea.shape[1], 1)], dim=1)
            attention_mask = torch.cat([attention_mask, torch.zeros(1, 1, self.max_lenth-attention_mask.shape[-1])], dim=-1)

        if self.transform:
            img = self.transform(img)

        data_info['prompt'] = prompt
        return img, txt_fea, attention_mask, data_info

    def __getitem__(self, idx):
        """Return ``getdata(idx)``; on failure retry with random indices, up to 20 attempts.

        Raises:
            RuntimeError: If all 20 attempts fail.
        """
        for _ in range(20):
            try:
                return self.getdata(idx)
            except Exception as e:
                # Best-effort: report the bad sample and try a different one.
                print(f"Error details: {str(e)}")
                idx = np.random.randint(len(self))
        raise RuntimeError('Too many bad data.')

    def get_data_info(self, idx):
        """Return the ``height`` and ``width`` recorded in the metadata of sample ``idx``."""
        data_info = self.meta_data_clean[idx]
        return {'height': data_info['height'], 'width': data_info['width']}

    @staticmethod
    def vae_feat_loader(path):
        """Load a saved ``[mean, std]`` array from ``path`` and draw ``mean + std * noise``."""
        # [mean, std]
        mean, std = torch.from_numpy(np.load(path)).chunk(2)
        sample = randn_tensor(mean.shape, generator=None, device=mean.device, dtype=mean.dtype)
        return mean + std * sample

    def load_ori_img(self, img_path):
        """Open ``img_path`` and return it as a 256x256 center-cropped tensor."""
        # Load the image and convert it to a tensor.
        transform = T.Compose([
            T.Resize(256),  # Image.BICUBIC
            T.CenterCrop(256),
            T.ToTensor(),
        ])
        return transform(Image.open(img_path))

    def load_json(self, file_path):
        """Parse the JSON file at ``file_path`` and return its content."""
        with open(file_path, 'r') as f:
            meta_data = json.load(f)
        return meta_data

    def sample_subset(self, ratio):
        """Keep a random ``ratio`` fraction of the dataset (for local debugging).

        All per-sample lists are re-indexed with the same sampled indices.
        Subsetting only ``img_samples`` (the previous behaviour) left the
        caption, VAE-feature, prompt and metadata lists at their old length and
        order, so index ``i`` no longer referred to the same sample in each.

        Raises:
            ValueError: From ``random.sample`` when ``ratio`` is negative or above 1.
        """
        total = len(self)
        sampled_idx = random.sample(list(range(total)), int(total * ratio))
        parallel_lists = (
            'meta_data_clean',
            'img_samples',
            'txt_feat_samples',
            'vae_feat_samples',
            'mask_index_samples',
            'prompt_samples',
        )
        for name in parallel_lists:
            # Subclasses may not define every list, and some lists may be unpopulated;
            # only touch those that really are aligned with the samples.
            samples = getattr(self, name, None)
            if isinstance(samples, list) and len(samples) == total:
                setattr(self, name, [samples[i] for i in sampled_idx])

    def __len__(self):
        return len(self.img_samples)

    def __getattr__(self, name):
        # Only called for attributes that normal lookup did not find.
        # ``set_epoch`` is answered with a no-op so callers can invoke it unconditionally.
        if name == "set_epoch":
            return lambda epoch: None
        raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
load_mask_index self.mask_type = mask_type self.base_size = int(kwargs['aspect_ratio_type'].split('_')[-1]) self.max_lenth = max_length self.aspect_ratio = eval(kwargs.pop('aspect_ratio_type')) # base aspect ratio self.meta_data_clean = [] self.img_samples = [] self.txt_feat_samples = [] self.vae_feat_samples = [] self.mask_index_samples = [] self.ratio_index = {} self.ratio_nums = {} for k, v in self.aspect_ratio.items(): self.ratio_index[float(k)] = [] # used for self.getitem self.ratio_nums[float(k)] = 0 # used for batch-sampler image_list_json = image_list_json if isinstance(image_list_json, list) else [image_list_json] for json_file in image_list_json: meta_data = self.load_json(os.path.join(self.root, 'partition_filter', json_file)) self.ori_imgs_nums += len(meta_data) meta_data_clean = [item for item in meta_data if item['ratio'] <= 4] self.meta_data_clean.extend(meta_data_clean) self.img_samples.extend([os.path.join(self.root.replace('InternData', "InternImgs"), item['path']) for item in meta_data_clean]) self.txt_feat_samples.extend([os.path.join(self.root, 'caption_feature_wmask', '_'.join(item['path'].rsplit('/', 1)).replace('.png', '.npz')) for item in meta_data_clean]) self.vae_feat_samples.extend([os.path.join(self.root, f'img_vae_fatures_{resolution}_multiscale/ms', '_'.join(item['path'].rsplit('/', 1)).replace('.png', '.npy')) for item in meta_data_clean]) # Set loader and extensions if load_vae_feat: self.transform = None self.loader = self.vae_feat_loader else: self.loader = default_loader if sample_subset is not None: self.sample_subset(sample_subset) # sample dataset for local debug # scan the dataset for ratio static for i, info in enumerate(self.meta_data_clean[:len(self.meta_data_clean)//3]): ori_h, ori_w = info['height'], info['width'] closest_size, closest_ratio = get_closest_ratio(ori_h, ori_w, self.aspect_ratio) self.ratio_nums[closest_ratio] += 1 if len(self.ratio_index[closest_ratio]) == 0: self.ratio_index[closest_ratio].append(i) # 
def getdata(self, index):
    """Load one sample and its caption feature for the multi-scale dataset.

    Args:
        index: Position in the parallel sample lists
            (``img_samples``, ``txt_feat_samples``, ``vae_feat_samples``,
            ``meta_data_clean``).

    Returns:
        ``(img, txt_fea, attention_mask, data_info)`` where ``data_info``
        holds ``img_hw`` (original height/width), ``aspect_ratio`` (the
        selected bucket ratio) and ``mask_type``.

    Side effects:
        Stores the selected bucket ratio in ``self.closest_ratio`` (read by
        ``__getitem__`` when it needs a replacement index), appends ``index``
        to ``self.ratio_index`` after a successful VAE-feature load, and, when
        images are loaded, overwrites ``self.transform`` with a per-sample
        resize/crop pipeline.
    """
    img_path = self.img_samples[index]
    npz_path = self.txt_feat_samples[index]
    npy_path = self.vae_feat_samples[index]
    meta = self.meta_data_clean[index]
    ori_h, ori_w = meta['height'], meta['width']

    # Calculate the closest aspect ratio and resize & crop image[w, h]
    closest_size, closest_ratio = get_closest_ratio(ori_h, ori_w, self.aspect_ratio)
    closest_size = [int(x) for x in closest_size]
    self.closest_ratio = closest_ratio

    if self.load_vae_feat:
        try:
            img = self.loader(npy_path)
            # Remember indices known to load, so failures can fall back to them.
            if index not in self.ratio_index[closest_ratio]:
                self.ratio_index[closest_ratio].append(index)
        except Exception:
            # Fall back to a previously seen index from the same ratio bucket.
            index = random.choice(self.ratio_index[closest_ratio])
            return self.getdata(index)
        h, w = (img.shape[1], img.shape[2])
        # NOTE(review): as written this only asserts that `h` is truthy; the
        # comparison after the comma is the assertion *message*. Left unchanged
        # on purpose: confirm what size the stored features really have before
        # turning it into `(h, w) == (...)`, otherwise valid samples may start
        # being rejected.
        assert h, w == (ori_h//8, ori_w//8)
    else:
        img = self.loader(img_path)
        h, w = (img.size[1], img.size[0])
        # NOTE(review): same caveat as above - only `h` is actually checked.
        assert h, w == (ori_h, ori_w)

    data_info = {'img_hw': torch.tensor([ori_h, ori_w], dtype=torch.float32)}
    data_info['aspect_ratio'] = closest_ratio
    data_info["mask_type"] = self.mask_type

    # np.load on an .npz returns an NpzFile holding an open file handle;
    # the context manager closes it once the arrays have been read.
    with np.load(npz_path) as txt_info:
        txt_fea = torch.from_numpy(txt_info['caption_feature'])
        attention_mask = torch.ones(1, 1, txt_fea.shape[1])
        if 'attention_mask' in txt_info.keys():
            attention_mask = torch.from_numpy(txt_info['attention_mask'])[None]

    if not self.load_vae_feat:
        # Scale by the larger of the two factors so the resized image covers
        # the bucket size, then center-crop to the bucket.
        if closest_size[0] / ori_h > closest_size[1] / ori_w:
            resize_size = closest_size[0], int(ori_w * closest_size[0] / ori_h)
        else:
            resize_size = int(ori_h * closest_size[1] / ori_w), closest_size[1]
        self.transform = T.Compose([
            T.Lambda(lambda img: img.convert('RGB')),
            T.Resize(resize_size, interpolation=InterpolationMode.BICUBIC),
            T.CenterCrop(closest_size),
            T.ToTensor(),
            T.Normalize([.5], [.5]),
        ])

    if self.transform:
        img = self.transform(img)

    return img, txt_fea, attention_mask, data_info
def sample_subset(self, ratio):
    """Shrink the dataset to a random fraction of itself (for local debugging).

    The same random indices are applied to every per-sample list so that
    image paths, caption-feature paths and VAE-feature paths stay aligned.
    Previously ``vae_feat_samples`` was left untouched, so after subsetting
    index ``i`` referred to a different sample there than in the other lists.

    Args:
        ratio: Fraction of samples to keep; ``int(len(self) * ratio)`` samples
            are drawn without replacement.
    """
    total = len(self)
    sampled_idx = random.sample(range(total), int(total * ratio))
    for attr in ('img_samples', 'txt_feat_samples', 'vae_feat_samples'):
        samples = getattr(self, attr)
        # Only re-index lists that are aligned with img_samples; a list of a
        # different length cannot be indexed by the same positions.
        if len(samples) == total:
            setattr(self, attr, [samples[i] for i in sampled_idx])
def __init__(self,
             root,
             image_list_json='data_info.json',
             transform=None,
             resolution=256,
             sample_subset=None,
             load_vae_feat=False,
             input_size=32,
             patch_size=2,
             mask_ratio=0.0,
             load_mask_index=False,
             train_ratio=1.0,
             mode='train',
             **kwargs):
    """Index the image / caption / VAE / HED feature paths listed in JSON metadata.

    Args:
        root: Dataset root, resolved through ``get_data_path``.
        image_list_json: One JSON file name or a list of them, looked up under
            ``<root>/partition_filter``.
        transform: Transform applied to loaded images; discarded when
            ``load_vae_feat`` is true.
        resolution: Used to build the VAE / HED feature directory names.
        sample_subset: If not ``None``, ratio passed to ``self.sample_subset``.
        load_vae_feat: Load pre-computed VAE features instead of images.
        input_size, patch_size: Used only to derive ``self.N``.
        mask_ratio, load_mask_index: Stored on the instance.
        train_ratio: Fraction of samples to use.
        mode: ``'train'`` keeps the first ``train_ratio`` share of the samples,
            any other value keeps the last ``train_ratio`` share.
        **kwargs: Accepted and ignored.
    """
    self.root = get_data_path(root)
    self.transform = transform
    self.load_vae_feat = load_vae_feat
    self.ori_imgs_nums = 0
    self.resolution = resolution
    self.N = int(resolution // (input_size // patch_size))
    self.mask_ratio = mask_ratio
    self.load_mask_index = load_mask_index
    self.meta_data_clean = []
    self.img_samples = []
    self.txt_feat_samples = []
    self.vae_feat_samples = []
    self.hed_feat_samples = []
    self.prompt_samples = []

    image_list_json = image_list_json if isinstance(image_list_json, list) else [image_list_json]
    for json_file in image_list_json:
        meta_data = self.load_json(os.path.join(self.root, 'partition_filter', json_file))
        self.ori_imgs_nums += len(meta_data)
        # Drop entries whose recorded ratio exceeds 4.
        meta_data_clean = [item for item in meta_data if item['ratio'] <= 4]
        self.meta_data_clean.extend(meta_data_clean)
        self.img_samples.extend([os.path.join(self.root.replace('InternData', "InternImgs"), item['path']) for item in meta_data_clean])
        self.txt_feat_samples.extend([os.path.join(self.root, 'caption_features', '_'.join(item['path'].rsplit('/', 1)).replace('.png', '.npz')) for item in meta_data_clean])
        self.vae_feat_samples.extend([os.path.join(self.root, f'img_vae_features_{resolution}resolution/noflip', '_'.join(item['path'].rsplit('/', 1)).replace('.png', '.npy')) for item in meta_data_clean])
        self.hed_feat_samples.extend([os.path.join(self.root, f'hed_feature_{resolution}', item['path'].replace('.png', '.npz')) for item in meta_data_clean])
        self.prompt_samples.extend([item['prompt'] for item in meta_data_clean])

    total_sample = len(self.img_samples)
    used_sample_num = int(total_sample * train_ratio)
    print("using mode", mode)
    if mode == 'train':
        window = slice(None, used_sample_num)
    elif used_sample_num:
        window = slice(-used_sample_num, None)
    else:
        # `lst[-0:]` is the whole list, so "keep the last 0 samples" has to be
        # spelled out explicitly as an empty window.
        window = slice(0, 0)
    # Apply one window to every per-sample list, including the metadata read by
    # get_data_info, so that a given index means the same sample everywhere.
    self.meta_data_clean = self.meta_data_clean[window]
    self.img_samples = self.img_samples[window]
    self.txt_feat_samples = self.txt_feat_samples[window]
    self.vae_feat_samples = self.vae_feat_samples[window]
    self.hed_feat_samples = self.hed_feat_samples[window]
    self.prompt_samples = self.prompt_samples[window]

    # Set loader and extensions
    if load_vae_feat:
        self.transform = None
        self.loader = self.vae_feat_loader
    else:
        self.loader = default_loader

    if sample_subset is not None:
        self.sample_subset(sample_subset)  # sample dataset for local debug
def sample_subset(self, ratio):
    """Shrink the dataset to a random fraction of itself (for local debugging).

    ``getdata`` indexes the image, caption, VAE, HED and prompt lists with the
    same position and ``get_data_info`` indexes ``meta_data_clean``, so all of
    them are re-indexed with one shared random sample. Previously only
    ``img_samples`` was subset, which left every other list misaligned.

    Args:
        ratio: Fraction of samples to keep; ``int(len(self) * ratio)`` samples
            are drawn without replacement.
    """
    total = len(self)
    sampled_idx = random.sample(range(total), int(total * ratio))
    parallel_lists = (
        'img_samples',
        'txt_feat_samples',
        'vae_feat_samples',
        'hed_feat_samples',
        'prompt_samples',
        'meta_data_clean',
    )
    for attr in parallel_lists:
        samples = getattr(self, attr, None)
        # Skip anything missing or not aligned with img_samples; it cannot be
        # indexed by the same positions.
        if samples is not None and len(samples) == total:
            setattr(self, attr, [samples[i] for i in sampled_idx])
PixArt-alpha-ToCa/diffusion/data/datasets/utils.py ================================================ ASPECT_RATIO_1024 = { '0.25': [512., 2048.], '0.26': [512., 1984.], '0.27': [512., 1920.], '0.28': [512., 1856.], '0.32': [576., 1792.], '0.33': [576., 1728.], '0.35': [576., 1664.], '0.4': [640., 1600.], '0.42': [640., 1536.], '0.48': [704., 1472.], '0.5': [704., 1408.], '0.52': [704., 1344.], '0.57': [768., 1344.], '0.6': [768., 1280.], '0.68': [832., 1216.], '0.72': [832., 1152.], '0.78': [896., 1152.], '0.82': [896., 1088.], '0.88': [960., 1088.], '0.94': [960., 1024.], '1.0': [1024., 1024.], '1.07': [1024., 960.], '1.13': [1088., 960.], '1.21': [1088., 896.], '1.29': [1152., 896.], '1.38': [1152., 832.], '1.46': [1216., 832.], '1.67': [1280., 768.], '1.75': [1344., 768.], '2.0': [1408., 704.], '2.09': [1472., 704.], '2.4': [1536., 640.], '2.5': [1600., 640.], '2.89': [1664., 576.], '3.0': [1728., 576.], '3.11': [1792., 576.], '3.62': [1856., 512.], '3.75': [1920., 512.], '3.88': [1984., 512.], '4.0': [2048., 512.], } ASPECT_RATIO_512 = { '0.25': [256.0, 1024.0], '0.26': [256.0, 992.0], '0.27': [256.0, 960.0], '0.28': [256.0, 928.0], '0.32': [288.0, 896.0], '0.33': [288.0, 864.0], '0.35': [288.0, 832.0], '0.4': [320.0, 800.0], '0.42': [320.0, 768.0], '0.48': [352.0, 736.0], '0.5': [352.0, 704.0], '0.52': [352.0, 672.0], '0.57': [384.0, 672.0], '0.6': [384.0, 640.0], '0.68': [416.0, 608.0], '0.72': [416.0, 576.0], '0.78': [448.0, 576.0], '0.82': [448.0, 544.0], '0.88': [480.0, 544.0], '0.94': [480.0, 512.0], '1.0': [512.0, 512.0], '1.07': [512.0, 480.0], '1.13': [544.0, 480.0], '1.21': [544.0, 448.0], '1.29': [576.0, 448.0], '1.38': [576.0, 416.0], '1.46': [608.0, 416.0], '1.67': [640.0, 384.0], '1.75': [672.0, 384.0], '2.0': [704.0, 352.0], '2.09': [736.0, 352.0], '2.4': [768.0, 320.0], '2.5': [800.0, 320.0], '2.89': [832.0, 288.0], '3.0': [864.0, 288.0], '3.11': [896.0, 288.0], '3.62': [928.0, 256.0], '3.75': [960.0, 256.0], '3.88': [992.0, 256.0], '4.0': 
[1024.0, 256.0] } ASPECT_RATIO_256 = { '0.25': [128.0, 512.0], '0.26': [128.0, 496.0], '0.27': [128.0, 480.0], '0.28': [128.0, 464.0], '0.32': [144.0, 448.0], '0.33': [144.0, 432.0], '0.35': [144.0, 416.0], '0.4': [160.0, 400.0], '0.42': [160.0, 384.0], '0.48': [176.0, 368.0], '0.5': [176.0, 352.0], '0.52': [176.0, 336.0], '0.57': [192.0, 336.0], '0.6': [192.0, 320.0], '0.68': [208.0, 304.0], '0.72': [208.0, 288.0], '0.78': [224.0, 288.0], '0.82': [224.0, 272.0], '0.88': [240.0, 272.0], '0.94': [240.0, 256.0], '1.0': [256.0, 256.0], '1.07': [256.0, 240.0], '1.13': [272.0, 240.0], '1.21': [272.0, 224.0], '1.29': [288.0, 224.0], '1.38': [288.0, 208.0], '1.46': [304.0, 208.0], '1.67': [320.0, 192.0], '1.75': [336.0, 192.0], '2.0': [352.0, 176.0], '2.09': [368.0, 176.0], '2.4': [384.0, 160.0], '2.5': [400.0, 160.0], '2.89': [416.0, 144.0], '3.0': [432.0, 144.0], '3.11': [448.0, 144.0], '3.62': [464.0, 128.0], '3.75': [480.0, 128.0], '3.88': [496.0, 128.0], '4.0': [512.0, 128.0] } ASPECT_RATIO_256_TEST = { '0.25': [128.0, 512.0], '0.28': [128.0, 464.0], '0.32': [144.0, 448.0], '0.33': [144.0, 432.0], '0.35': [144.0, 416.0], '0.4': [160.0, 400.0], '0.42': [160.0, 384.0], '0.48': [176.0, 368.0], '0.5': [176.0, 352.0], '0.52': [176.0, 336.0], '0.57': [192.0, 336.0], '0.6': [192.0, 320.0], '0.68': [208.0, 304.0], '0.72': [208.0, 288.0], '0.78': [224.0, 288.0], '0.82': [224.0, 272.0], '0.88': [240.0, 272.0], '0.94': [240.0, 256.0], '1.0': [256.0, 256.0], '1.07': [256.0, 240.0], '1.13': [272.0, 240.0], '1.21': [272.0, 224.0], '1.29': [288.0, 224.0], '1.38': [288.0, 208.0], '1.46': [304.0, 208.0], '1.67': [320.0, 192.0], '1.75': [336.0, 192.0], '2.0': [352.0, 176.0], '2.09': [368.0, 176.0], '2.4': [384.0, 160.0], '2.5': [400.0, 160.0], '3.0': [432.0, 144.0], '4.0': [512.0, 128.0] } ASPECT_RATIO_512_TEST = { '0.25': [256.0, 1024.0], '0.28': [256.0, 928.0], '0.32': [288.0, 896.0], '0.33': [288.0, 864.0], '0.35': [288.0, 832.0], '0.4': [320.0, 800.0], '0.42': [320.0, 768.0], 
def get_chunks(lst, n):
    """Yield consecutive slices of ``lst`` that are ``n`` items long.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    starts = range(0, len(lst), n)
    yield from (lst[begin:begin + n] for begin in starts)
@register_transform
def default_train(n_px):
    """Return the default training transform steps for a square ``n_px`` input.

    The steps are returned as a plain list, in application order:
    convert to RGB, resize to ``n_px``, center-crop to ``n_px``, convert to a
    tensor, then normalize with mean 0.5 and std 0.5.
    """
    to_rgb = T.Lambda(lambda img: img.convert('RGB'))
    resize = T.Resize(n_px)
    center_crop = T.CenterCrop(n_px)
    to_tensor = T.ToTensor()
    normalize = T.Normalize([0.5], [0.5])
    # No random horizontal flip is part of this pipeline.
    return [to_rgb, resize, center_crop, to_tensor, normalize]
def IDDPM(
        timestep_respacing,
        noise_schedule="linear",
        use_kl=False,
        sigma_small=False,
        predict_xstart=False,
        learn_sigma=True,
        pred_sigma=True,
        rescale_learned_sigmas=False,
        diffusion_steps=1000,
        snr=False,
        return_startx=False,
):
    """Build a ``SpacedDiffusion`` from flag-style options.

    Args:
        timestep_respacing: Passed to ``space_timesteps``; ``None`` or ``""``
            is replaced by ``[diffusion_steps]``.
        noise_schedule: Name handed to ``gd.get_named_beta_schedule``.
        use_kl: Select ``LossType.RESCALED_KL`` (takes precedence over
            ``rescale_learned_sigmas``).
        sigma_small: With ``learn_sigma`` off, choose ``FIXED_SMALL`` instead
            of ``FIXED_LARGE``.
        predict_xstart: Use ``ModelMeanType.START_X`` instead of ``EPSILON``.
        learn_sigma: Use ``ModelVarType.LEARNED_RANGE``.
        pred_sigma: When false, ``model_var_type`` is ``None``.
        rescale_learned_sigmas: Select ``LossType.RESCALED_MSE`` when
            ``use_kl`` is off.
        diffusion_steps: Number of steps of the base schedule.
        snr, return_startx: Forwarded unchanged to ``SpacedDiffusion``.

    Returns:
        The configured ``SpacedDiffusion`` instance.
    """
    betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps)

    # Loss selection: KL first, then rescaled MSE, plain MSE otherwise.
    loss_type = (
        gd.LossType.RESCALED_KL if use_kl
        else gd.LossType.RESCALED_MSE if rescale_learned_sigmas
        else gd.LossType.MSE
    )

    if timestep_respacing is None or timestep_respacing == "":
        timestep_respacing = [diffusion_steps]

    if predict_xstart:
        model_mean_type = gd.ModelMeanType.START_X
    else:
        model_mean_type = gd.ModelMeanType.EPSILON

    if not pred_sigma:
        model_var_type = None
    elif learn_sigma:
        model_var_type = gd.ModelVarType.LEARNED_RANGE
    elif sigma_small:
        model_var_type = gd.ModelVarType.FIXED_SMALL
    else:
        model_var_type = gd.ModelVarType.FIXED_LARGE

    use_timesteps = space_timesteps(diffusion_steps, timestep_respacing)
    return SpacedDiffusion(
        use_timesteps=use_timesteps,
        betas=betas,
        model_mean_type=model_mean_type,
        model_var_type=model_var_type,
        loss_type=loss_type,
        snr=snr,
        return_startx=return_startx,
    )
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion # and https://github.com/hojonathanho/diffusion import math from dataclasses import dataclass from typing import List, Optional, Tuple, Union import numpy as np import torch from diffusers import ConfigMixin, SchedulerMixin from diffusers.configuration_utils import register_to_config from diffusers.utils import BaseOutput @dataclass # Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM class LCMSchedulerOutput(BaseOutput): """ Output class for the scheduler's `step` function output. Args: prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. 
def betas_for_alpha_bar(
        num_diffusion_timesteps,
        max_beta=0.999,
        alpha_transform_type="cosine",
):
    """
    Discretize an ``alpha_bar(t)`` curve on ``t`` in [0, 1] into a beta schedule.

    ``alpha_bar(t)`` is the cumulative product of ``(1 - beta)`` up to time
    ``t``; each beta is ``1 - alpha_bar(t2) / alpha_bar(t1)`` for consecutive
    grid points, capped at ``max_beta``.

    Args:
        num_diffusion_timesteps (`int`): the number of betas to produce.
        max_beta (`float`): upper cap applied to every beta; use values lower
            than 1 to prevent singularities.
        alpha_transform_type (`str`, *optional*, default to `cosine`):
            which ``alpha_bar`` curve to use, `cosine` or `exp`.

    Returns:
        betas (`torch.Tensor`): float32 tensor of length
        ``num_diffusion_timesteps``.

    Raises:
        ValueError: if ``alpha_transform_type`` is neither `cosine` nor `exp`.
    """

    def cosine_alpha_bar(t):
        return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

    def exp_alpha_bar(t):
        return math.exp(t * -12.0)

    if alpha_transform_type == "cosine":
        alpha_bar_fn = cosine_alpha_bar
    elif alpha_transform_type == "exp":
        alpha_bar_fn = exp_alpha_bar
    else:
        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")

    steps = num_diffusion_timesteps
    schedule = [
        min(1 - alpha_bar_fn((idx + 1) / steps) / alpha_bar_fn(idx / steps), max_beta)
        for idx in range(steps)
    ]
    return torch.tensor(schedule, dtype=torch.float32)
alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone() alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone() # Shift so the last timestep is zero. alphas_bar_sqrt -= alphas_bar_sqrt_T # Scale so the first timestep is back to the old value. alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T) # Convert alphas_bar_sqrt to betas alphas_bar = alphas_bar_sqrt ** 2 # Revert sqrt alphas = alphas_bar[1:] / alphas_bar[:-1] # Revert cumprod alphas = torch.cat([alphas_bar[:1], alphas]) betas = 1 - alphas return betas class LCMScheduler(SchedulerMixin, ConfigMixin): """ `LCMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with non-Markovian guidance. This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic methods the library implements for all schedulers such as loading and saving. Args: num_train_timesteps (`int`, defaults to 1000): The number of diffusion steps to train the model. beta_start (`float`, defaults to 0.0001): The starting `beta` value of inference. beta_end (`float`, defaults to 0.02): The final `beta` value. beta_schedule (`str`, defaults to `"linear"`): The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from `linear`, `scaled_linear`, or `squaredcos_cap_v2`. trained_betas (`np.ndarray`, *optional*): Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. clip_sample (`bool`, defaults to `True`): Clip the predicted sample for numerical stability. clip_sample_range (`float`, defaults to 1.0): The maximum magnitude for sample clipping. Valid only when `clip_sample=True`. set_alpha_to_one (`bool`, defaults to `True`): Each diffusion step uses the alphas product value at that step and at the previous one. For the final step there is no previous alpha. 
When this option is `True` the previous alpha product is fixed to `1`, otherwise it uses the alpha value at step 0. steps_offset (`int`, defaults to 0): An offset added to the inference steps. You can use a combination of `offset=1` and `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable Diffusion. prediction_type (`str`, defaults to `epsilon`, *optional*): Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen Video](https://imagen.research.google/video/paper.pdf) paper). thresholding (`bool`, defaults to `False`): Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such as Stable Diffusion. dynamic_thresholding_ratio (`float`, defaults to 0.995): The ratio for the dynamic thresholding method. Valid only when `thresholding=True`. sample_max_value (`float`, defaults to 1.0): The threshold value for dynamic thresholding. Valid only when `thresholding=True`. timestep_spacing (`str`, defaults to `"leading"`): The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. rescale_betas_zero_snr (`bool`, defaults to `False`): Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and dark samples instead of limiting it to samples with medium brightness. Loosely related to [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). 
""" # _compatibles = [e.name for e in KarrasDiffusionSchedulers] order = 1 @register_to_config def __init__( self, num_train_timesteps: int = 1000, beta_start: float = 0.0001, beta_end: float = 0.02, beta_schedule: str = "linear", trained_betas: Optional[Union[np.ndarray, List[float]]] = None, clip_sample: bool = True, set_alpha_to_one: bool = True, steps_offset: int = 0, prediction_type: str = "epsilon", thresholding: bool = False, dynamic_thresholding_ratio: float = 0.995, clip_sample_range: float = 1.0, sample_max_value: float = 1.0, timestep_spacing: str = "leading", rescale_betas_zero_snr: bool = False, ): if trained_betas is not None: self.betas = torch.tensor(trained_betas, dtype=torch.float32) elif beta_schedule == "linear": self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. self.betas = ( torch.linspace(beta_start ** 0.5, beta_end ** 0.5, num_train_timesteps, dtype=torch.float32) ** 2 ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") # Rescale for zero SNR if rescale_betas_zero_snr: self.betas = rescale_zero_terminal_snr(self.betas) self.alphas = 1.0 - self.betas self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) # At every step in ddim, we are looking into the previous alphas_cumprod # For the final step, there is no previous alphas_cumprod because we are already at 0 # `set_alpha_to_one` decides whether we set this parameter simply to one or # whether we use the final alpha of the "non-previous" one. 
self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0] # standard deviation of the initial noise distribution self.init_noise_sigma = 1.0 # setable values self.num_inference_steps = None self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: sample (`torch.FloatTensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: `torch.FloatTensor`: A scaled input sample. """ return sample def _get_variance(self, timestep, prev_timestep): alpha_prod_t = self.alphas_cumprod[timestep] alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod beta_prod_t = 1 - alpha_prod_t beta_prod_t_prev = 1 - alpha_prod_t_prev return (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing pixels from saturation at each step. We find that dynamic thresholding results in significantly better photorealism as well as better image-text alignment, especially when using very large guidance weights." 
def set_timesteps(self, num_inference_steps: int, lcm_origin_steps: int, device: Union[str, torch.device] = None):
    """
    Sets the discrete timesteps used for the diffusion chain (to be run before inference).

    Args:
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model.
        lcm_origin_steps (`int`):
            The number of evenly spaced steps of the LCM training schedule from which the inference
            steps are sub-sampled.
        device (`str` or `torch.device`, *optional*):
            The device the resulting `self.timesteps` tensor is moved to.

    Raises:
        ValueError:
            If `num_inference_steps` exceeds the number of training timesteps, if `lcm_origin_steps`
            is not in `[1, num_train_timesteps]`, or if `num_inference_steps` is not in
            `[1, lcm_origin_steps]`.
    """
    num_train_timesteps = self.config.num_train_timesteps

    if num_inference_steps > num_train_timesteps:
        raise ValueError(
            f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
            f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
            f" maximal {self.config.num_train_timesteps} timesteps."
        )
    # Without this check `c` below becomes 0 and every timestep silently collapses to -1
    # (or the integer division itself fails for `lcm_origin_steps == 0`).
    if lcm_origin_steps < 1 or lcm_origin_steps > num_train_timesteps:
        raise ValueError(
            f"`lcm_origin_steps`: {lcm_origin_steps} must be between 1 and"
            f" `self.config.num_train_timesteps`: {num_train_timesteps}."
        )
    # Without this check `skipping_step` below becomes 0 and the slice fails with an
    # unrelated "slice step cannot be zero" error.
    if num_inference_steps < 1 or num_inference_steps > lcm_origin_steps:
        raise ValueError(
            f"`num_inference_steps`: {num_inference_steps} must be between 1 and"
            f" `lcm_origin_steps`: {lcm_origin_steps}."
        )

    self.num_inference_steps = num_inference_steps

    # LCM Timesteps Setting:  # Linear Spacing
    c = num_train_timesteps // lcm_origin_steps
    lcm_origin_timesteps = np.arange(1, lcm_origin_steps + 1) * c - 1  # LCM Training Steps Schedule
    skipping_step = len(lcm_origin_timesteps) // num_inference_steps
    # Walk the training schedule backwards (high noise first) and keep the first
    # `num_inference_steps` entries.
    timesteps = lcm_origin_timesteps[::-skipping_step][:num_inference_steps]  # LCM Inference Steps Schedule

    self.timesteps = torch.from_numpy(timesteps.copy()).to(device)
If no clipping has happened, "corrected" `model_output` would coincide with the one provided as input and `use_clipped_model_output` has no effect. generator (`torch.Generator`, *optional*): A random number generator. variance_noise (`torch.FloatTensor`): Alternative to generating noise with `generator` by directly providing the noise for the variance itself. Useful for methods such as [`CycleDiffusion`]. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] or `tuple`. Returns: [`~schedulers.scheduling_utils.LCMSchedulerOutput`] or `tuple`: If return_dict is `True`, [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] is returned, otherwise a tuple is returned where the first element is the sample tensor. """ if self.num_inference_steps is None: raise ValueError( "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" ) # 1. get previous step value prev_timeindex = timeindex + 1 if prev_timeindex < len(self.timesteps): prev_timestep = self.timesteps[prev_timeindex] else: prev_timestep = timestep # 2. compute alphas, betas alpha_prod_t = self.alphas_cumprod[timestep] alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod beta_prod_t = 1 - alpha_prod_t beta_prod_t_prev = 1 - alpha_prod_t_prev # 3. Get scalings for boundary conditions c_skip, c_out = self.get_scalings_for_boundary_condition_discrete(timestep) # 4. Different Parameterization: parameterization = self.config.prediction_type if parameterization == "epsilon": # noise-prediction pred_x0 = (sample - beta_prod_t.sqrt() * model_output) / alpha_prod_t.sqrt() elif parameterization == "sample": # x-prediction pred_x0 = model_output elif parameterization == "v_prediction": # v-prediction pred_x0 = alpha_prod_t.sqrt() * sample - beta_prod_t.sqrt() * model_output # 4. 
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
def add_noise(
    self,
    original_samples: torch.FloatTensor,
    noise: torch.FloatTensor,
    timesteps: torch.IntTensor,
) -> torch.FloatTensor:
    """
    Mix `original_samples` with `noise` using the cumulative alpha products selected by `timesteps`.

    Returns `sqrt(alpha_cumprod[t]) * original_samples + sqrt(1 - alpha_cumprod[t]) * noise`, where the
    per-timestep coefficients are broadcast over the trailing axes of `original_samples`.
    """
    target = original_samples
    # Bring the schedule and the timestep indices onto the samples' device (and dtype for the schedule).
    schedule = self.alphas_cumprod.to(device=target.device, dtype=target.dtype)
    steps = timesteps.to(target.device)

    def _as_broadcastable(coeff):
        # One coefficient per selected timestep, padded with trailing singleton axes
        # until its rank matches the rank of the samples.
        coeff = coeff.flatten()
        missing_axes = len(target.shape) - len(coeff.shape)
        for _ in range(max(missing_axes, 0)):
            coeff = coeff.unsqueeze(-1)
        return coeff

    signal_scale = _as_broadcastable(schedule[steps] ** 0.5)
    noise_scale = _as_broadcastable((1 - schedule[steps]) ** 0.5)

    return signal_scale * target + noise_scale * noise
def build_model(cfg, use_grad_checkpoint=False, use_fp32_attention=False, gc_step=1, **kwargs):
    """
    Build a model through the `MODELS` registry.

    `cfg` may be a config mapping or just the registered type name as a string; a string is wrapped
    into `dict(type=cfg)`. Extra keyword arguments are forwarded to the registry as `default_args`.
    When `use_grad_checkpoint` is truthy, `set_grad_checkpoint` is applied to the built model with
    the given `use_fp32_attention` and `gc_step` before it is returned.
    """
    model_cfg = dict(type=cfg) if isinstance(cfg, str) else cfg
    model = MODELS.build(model_cfg, default_args=kwargs)

    if not use_grad_checkpoint:
        return model

    set_grad_checkpoint(model, use_fp32_attention=use_fp32_attention, gc_step=gc_step)
    return model
def cached_attention_forward(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_bias: Optional[Union[torch.Tensor, "BlockDiagonalMask"]] = None,
    p: float = 0.0,
    scale: Optional[float] = None,
) -> "tuple[torch.Tensor, torch.Tensor]":
    """
    Plain (non-fused) attention that also returns the attention map so it can be cached.

    Axes 1 and 2 of `query`, `key` and `value` are swapped before the matrix products and swapped
    back on the output, so the inputs are presumably laid out as (batch, tokens, heads, head_dim),
    the xformers convention; confirm against the caller.

    Args:
        query, key, value: attention inputs (see layout note above).
        attn_bias: optional additive bias. A `torch.Tensor` is added to the logits as is; any other
            object is expected to expose `materialize(shape=..., dtype=..., device=...)` and is
            materialized to the shape of the logits first.
        p: dropout probability applied to the attention weights. `F.dropout` is called with its
            default `training=True`, so dropout is active whenever `p > 0`.
        scale: factor applied to `query` before the logits are computed. Defaults to
            `1 / sqrt(query.shape[-1])` when `None`.

    Returns:
        A tuple `(out, attn_map)`: the attention output with axes 1 and 2 swapped back and made
        contiguous, and the softmaxed attention weights (before dropout) averaged over axis 1.
    """
    # Honour an explicitly supplied scale; fall back to the usual 1/sqrt(d) only when none is given.
    if scale is None:
        scale = 1.0 / query.shape[-1] ** 0.5

    query = (query * scale).transpose(1, 2)
    key = key.transpose(1, 2)
    value = value.transpose(1, 2)

    attn = query @ key.transpose(-2, -1)
    if attn_bias is not None:
        # Only mask objects need materializing; a tensor bias can be added directly.
        if not isinstance(attn_bias, torch.Tensor):
            attn_bias = attn_bias.materialize(shape=attn.shape, dtype=attn.dtype, device=attn.device)
        attn = attn + attn_bias

    attn_map = attn.softmax(-1)
    attn = F.dropout(attn_map, p) @ value
    return attn.transpose(1, 2).contiguous(), attn_map.mean(dim=1)
def local_selection_with_bonus(score, bonus_ratio, grid_size=2):
    '''
    Boost the best-scoring token of every `grid_size` x `grid_size` spatial block.

    The `(batch_size, num_tokens)` score is interpreted as a square token grid and split into
    non-overlapping blocks. In each block the token with the maximum score receives an extra
    `bonus_ratio * max_score`; every other token is left unchanged.

    score: torch.Tensor of shape (batch_size, num_tokens); num_tokens must be a perfect square.
    bonus_ratio: multiplier applied to the block maximum to obtain the bonus.
    grid_size: side length of a block; must divide the side length of the token grid.

    Returns a new tensor of shape (batch_size, num_tokens).
    Raises ValueError when the token count cannot be arranged into such blocks.
    '''
    import math

    if score.dim() != 2:
        raise ValueError(f"score must have shape (batch_size, num_tokens), got {tuple(score.shape)}")

    batch_size, num_tokens = score.shape
    image_size = math.isqrt(num_tokens)
    if image_size * image_size != num_tokens:
        raise ValueError(f"The number of tokens ({num_tokens}) must be a perfect square.")
    if grid_size <= 0 or image_size % grid_size != 0:
        raise ValueError(
            f"The token grid side ({image_size}) must be divisible by the grid size ({grid_size})."
        )

    blocks_per_side = image_size // grid_size
    block_size = grid_size * grid_size

    # Step 1: group the scores by block -> [batch_size, num_blocks, block_size]
    blocks = (
        score.reshape(batch_size, blocks_per_side, grid_size, blocks_per_side, grid_size)
        .permute(0, 1, 3, 2, 4)
        .reshape(batch_size, -1, block_size)
    )

    # Step 2: find the max token in each block -> [batch_size, num_blocks, 1]
    max_scores, max_indices = blocks.max(dim=-1, keepdim=True)

    # Step 3: one-hot mask marking the max token of every block
    mask = torch.zeros_like(blocks)
    mask.scatter_(-1, max_indices, 1)

    # Step 4: apply the bonus only to the max tokens
    blocks = blocks + (mask * max_scores * bonus_ratio)

    # Step 5: undo the block grouping to restore the original token order
    return (
        blocks.reshape(batch_size, blocks_per_side, blocks_per_side, grid_size, grid_size)
        .permute(0, 1, 3, 2, 4)
        .reshape(batch_size, num_tokens)
    )
''' cache_dic = {} cache = {} cache_index = {} cache[-1]={} cache_index[-1]={} cache_index['layer_index']={} cache_dic['attn_map'] = {} cache_dic['attn_map'][-1] = {} cache_dic['cross_attn_map'] = {} cache_dic['cross_attn_map'][-1] = {} for j in range(28): cache[-1][j] = {} cache_index[-1][j] = {} cache_dic['attn_map'][-1][j] = {} cache_dic['cross_attn_map'][-1][j] = {} cache_dic['cache_type'] = model_kwargs['cache_type'] cache_dic['cache_index'] = cache_index cache_dic['cache'] = cache cache_dic['fresh_ratio_schedule'] = model_kwargs['ratio_scheduler'] cache_dic['fresh_ratio'] = model_kwargs['fresh_ratio'] cache_dic['fresh_threshold'] = model_kwargs['fresh_threshold'] cache_dic['force_fresh'] = model_kwargs['force_fresh'] cache_dic['soft_fresh_weight'] = model_kwargs['soft_fresh_weight'] #cache_dic['merge_weight'] = merge_weight current = {} current['num_steps'] = num_steps return cache_dic, current ================================================ FILE: PixArt-alpha-ToCa/diffusion/model/cache_functions/force_init.py ================================================ import torch from .force_scheduler import force_scheduler def force_init(cache_dic, current, tokens): ''' Initialization for Force Activation step. 
def force_scheduler(cache_dic, current):
    '''
    Compute the step-dependent force-refresh threshold and store it in cache_dic['cal_threshold'].

    cache_dic: dict, reads 'fresh_ratio' and 'fresh_threshold', writes 'cal_threshold'.
    current: dict, reads 'step' and 'num_steps'.

    The base threshold is divided by a factor that grows linearly with the step, from
    (1 - w) to (1 + w), then rounded. w is 0 when fresh_ratio == 0 (FORA) and 0.2
    otherwise (TokenCache). Nothing is returned.
    '''
    linear_step_weight = 0.0 if cache_dic['fresh_ratio'] == 0 else 0.2

    progress_term = 2 * linear_step_weight * current['step'] / current['num_steps']
    step_factor = torch.tensor(1 - linear_step_weight + progress_term)

    # Upstream note: no force constraint is applied to the sensitive steps, since results
    # were considered good enough without one.
    cache_dic['cal_threshold'] = torch.round(cache_dic['fresh_threshold'] / step_factor)
''' fresh_ratio = cache_dic['fresh_ratio'] fresh_ratio_schedule = cache_dic['fresh_ratio_schedule'] step = current['step'] num_steps = current['num_steps'] threshold = cache_dic['fresh_threshold'] weight = 0.9 if fresh_ratio_schedule == 'constant': return fresh_ratio elif fresh_ratio_schedule == 'linear': return fresh_ratio * (1 + weight - 2 * weight * step / num_steps) elif fresh_ratio_schedule == 'exp': #return 0.5 * (0.052 ** (step/num_steps)) return fresh_ratio * (weight ** (step / num_steps)) elif fresh_ratio_schedule == 'linear-mode': mode = (step % threshold)/threshold - 0.5 mode_weight = 0.1 return fresh_ratio * (1 + weight - 2 * weight * step / num_steps + mode_weight * mode) elif fresh_ratio_schedule == 'layerwise': return fresh_ratio * (1 + weight - 2 * weight * current['layer'] / 27) elif fresh_ratio_schedule == 'linear-layerwise': step_weight = -0.9 #0.9 step_factor = 1 - step_weight + 2 * step_weight * step / num_steps #if current['layer'] == 2: # return 1.0 #sigmoid #sigmoid_weight = 0.13 #layer_factor = 2 * torch.sigmoid(torch.tensor([sigmoid_weight * (13.5 - current['layer'])])) layer_weight = 0.6 layer_factor = 1 + layer_weight - 2 * layer_weight * current['layer'] / 27 module_weight = 1.0 #TokenCache N=8 2.5 N=6 2.5 #N=4 2.1 module_time_weight = 0.6 module_factor = (1 - (1-module_time_weight) * module_weight) if current['module']=='cross-attn' else (1 + module_time_weight * module_weight) return fresh_ratio * layer_factor * step_factor * module_factor elif fresh_ratio_schedule == 'ToCa': step_weight = -0.9 #0.9 step_factor = 1 - step_weight + 2 * step_weight * step / num_steps layer_weight = 0.6 layer_factor = 1 + layer_weight - 2 * layer_weight * current['layer'] / 27 module_weight = 1.0 module_time_weight = 0.6 # this means 60*x% cross-attn computation, and 160*x% mlp computation. This is designed for cross-attn has best temporal redundancy, and mlp has worse. # so cross-attn compute less and mlp compute more. 
def score_evaluate(cache_dic, tokens, current) -> torch.Tensor:
    '''
    Return the score tensor (B, N) for the given tokens.

    cache_dic: dict, reads 'cache_type' and 'force_fresh'; with force_fresh == 'global' also
        'cache_index', 'fresh_threshold' and 'soft_fresh_weight'; the 'compress' type reads 'attn_map'.
    tokens: torch.Tensor, only its first two dimensions (B, N) and its device are used here.
    current: dict with the current 'layer' and 'module' (not needed for 'random' / 'straight'
        unless force_fresh == 'global').

    Raises ValueError for an unrecognized cache type.
    '''
    cache_type = cache_dic['cache_type']
    batch_size, num_tokens = tokens.shape[0], tokens.shape[1]

    if cache_type == 'random':
        # One random score per half batch, duplicated so both halves select the same tokens
        # (presumably the cfg / no-cfg halves; verify against the caller).
        half = torch.rand(int(batch_size * 0.5), num_tokens, device=tokens.device)
        score = torch.cat([half, half], dim=0).to(tokens.device)

    elif cache_type == 'straight':
        score = torch.ones(batch_size, num_tokens).to(tokens.device)

    elif cache_type == 'attention':
        # cache_dic['attn_map'][step][layer] (B, N, N), the last dimension has been softmaxed
        score = attn_score(cache_dic, current)

    elif cache_type == 'similarity':
        score = similarity_score(cache_dic, current, tokens)

    elif cache_type == 'norm':
        score = norm_score(cache_dic, current, tokens)

    elif cache_type == 'compress':
        half = torch.rand(int(batch_size * 0.5), num_tokens)
        score1 = torch.cat([half, half], dim=0).to(tokens.device)
        score2 = cache_dic['attn_map'][-1][current['layer']].sum(dim=1)  # (B, N)
        # normalize by the per-sample maximum
        score2 = score2 / score2.max(dim=1, keepdim=True)[0]
        score = 0.5 * score1 + 0.5 * score2

    else:
        # Previously an unknown type fell through and failed later on an unbound `score`.
        raise ValueError("unrecognized cache type", cache_type)

    # NOTE: the 'local' force-fresh strategy is an abandoned branch and is not applied here;
    # see the DiT-ToCa version for more explanation if needed.
    if cache_dic['force_fresh'] == 'global':
        # Cache-frequency term: tokens that have stayed stale longer get a higher score.
        stale_count = cache_dic['cache_index'][-1][current['layer']][current['module']]
        soft_step_score = stale_count.float() / cache_dic['fresh_threshold']
        score = score + cache_dic['soft_fresh_weight'] * soft_step_score

    return score.to(tokens.device)
def similarity_score(cache_dic, current, tokens):
    '''
    Score tokens by how much they differ from their cached version.

    Computes 1 - cosine similarity (over the last dimension) between `tokens` and
    cache_dic['cache'][-1][layer][module], then L2-normalizes the result along the last dimension.
    '''
    cached_tokens = cache_dic['cache'][-1][current['layer']][current['module']]
    dissimilarity = 1 - F.cosine_similarity(tokens, cached_tokens, dim=-1)
    return F.normalize(dissimilarity, dim=-1, p=2)


def norm_score(cache_dic, current, tokens):
    '''
    Score tokens by their L2 norm over the last dimension, L2-normalized along the remaining
    last dimension. `cache_dic` and `current` are accepted for a uniform signature but not read.
    '''
    token_magnitude = tokens.norm(dim=-1, p=2)
    return F.normalize(token_magnitude, dim=-1, p=2)
def update_cache(fresh_indices, fresh_tokens, cache_dic, current, fresh_attn_map=None):
    '''
    Write freshly computed values back into the cache, in place, at the fresh token positions.

    fresh_indices: torch.Tensor (B, K), token positions (along dim 1) that were recomputed.
    fresh_tokens: torch.Tensor, values scattered into cache_dic['cache'] for the 'mlp' module.
    cache_dic: dict holding 'cache', 'attn_map' and 'cross_attn_map'.
    current: dict with the current 'step', 'layer' and 'module'.
    fresh_attn_map: torch.Tensor, rows scattered into the attention maps for 'attn' / 'cross-attn'.

    Any other module name leaves the cache untouched.
    '''
    step, layer, module = current['step'], current['layer'], current['module']

    # Pick the cache tensor to overwrite and the tensor providing the new rows.
    if module == 'attn':
        # Not used in the final version; kept for exploring partial refresh of self-attention
        # (upstream warns it probably has a few bugs).
        target, source = cache_dic['attn_map'][-1][layer], fresh_attn_map
    elif module == 'cross-attn':
        target, source = cache_dic['cross_attn_map'][-1][layer], fresh_attn_map
    elif module == 'mlp':
        target, source = cache_dic['cache'][-1][layer][module], fresh_tokens
    else:
        return

    # Broadcast each token index across the feature dimension so whole rows are replaced.
    row_index = fresh_indices.unsqueeze(-1).expand(-1, -1, source.shape[-1])
    target.scatter_(dim=1, index=row_index, src=source)
""" return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3)))) def continuous_gaussian_log_likelihood(x, *, means, log_scales): """ Compute the log-likelihood of a continuous Gaussian distribution. :param x: the targets :param means: the Gaussian mean Tensor. :param log_scales: the Gaussian log stddev Tensor. :return: a tensor like x of log probabilities (in nats). """ centered_x = x - means inv_stdv = th.exp(-log_scales) normalized_x = centered_x * inv_stdv return th.distributions.Normal(th.zeros_like(x), th.ones_like(x)).log_prob( normalized_x ) def discretized_gaussian_log_likelihood(x, *, means, log_scales): """ Compute the log-likelihood of a Gaussian distribution discretizing to a given image. :param x: the target images. It is assumed that this was uint8 values, rescaled to the range [-1, 1]. :param means: the Gaussian mean Tensor. :param log_scales: the Gaussian log stddev Tensor. :return: a tensor like x of log probabilities (in nats). """ assert x.shape == means.shape == log_scales.shape centered_x = x - means inv_stdv = th.exp(-log_scales) plus_in = inv_stdv * (centered_x + 1.0 / 255.0) cdf_plus = approx_standard_normal_cdf(plus_in) min_in = inv_stdv * (centered_x - 1.0 / 255.0) cdf_min = approx_standard_normal_cdf(min_in) log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12)) log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12)) cdf_delta = cdf_plus - cdf_min log_probs = th.where( x < -0.999, log_cdf_plus, th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))), ) assert log_probs.shape == x.shape return log_probs ================================================ FILE: PixArt-alpha-ToCa/diffusion/model/dpm_solver.py ================================================ import torch from tqdm import tqdm from ..model.cache_functions import cache_init class NoiseScheduleVP: def __init__( self, schedule='discrete', betas=None, alphas_cumprod=None, continuous_beta_0=0.1, continuous_beta_1=20., 
dtype=torch.float32, ): """Create a wrapper class for the forward SDE (VP type). *** Update: We support discrete-time diffusion models by implementing a picewise linear interpolation for log_alpha_t. We recommend to use schedule='discrete' for the discrete-time diffusion models, especially for high-resolution images. *** The forward SDE ensures that the condition distribution q_{t|0}(x_t | x_0) = N ( alpha_t * x_0, sigma_t^2 * I ). We further define lambda_t = log(alpha_t) - log(sigma_t), which is the half-logSNR (described in the DPM-Solver paper). Therefore, we implement the functions for computing alpha_t, sigma_t and lambda_t. For t in [0, T], we have: log_alpha_t = self.marginal_log_mean_coeff(t) sigma_t = self.marginal_std(t) lambda_t = self.marginal_lambda(t) Moreover, as lambda(t) is an invertible function, we also support its inverse function: t = self.inverse_lambda(lambda_t) =============================================================== We support both discrete-time DPMs (trained on n = 0, 1, ..., N-1) and continuous-time DPMs (trained on t in [t_0, T]). 1. For discrete-time DPMs: For discrete-time DPMs trained on n = 0, 1, ..., N-1, we convert the discrete steps to continuous time steps by: t_i = (i + 1) / N e.g. for N = 1000, we have t_0 = 1e-3 and T = t_{N-1} = 1. We solve the corresponding diffusion ODE from time T = 1 to time t_0 = 1e-3. Args: betas: A `torch.Tensor`. The beta array for the discrete-time DPM. (See the original DDPM paper for details) alphas_cumprod: A `torch.Tensor`. The cumprod alphas for the discrete-time DPM. (See the original DDPM paper for details) Note that we always have alphas_cumprod = cumprod(1 - betas). Therefore, we only need to set one of `betas` and `alphas_cumprod`. **Important**: Please pay special attention for the args for `alphas_cumprod`: The `alphas_cumprod` is the \hat{alpha_n} arrays in the notations of DDPM. 
Specifically, DDPMs assume that q_{t_n | 0}(x_{t_n} | x_0) = N ( \sqrt{\hat{alpha_n}} * x_0, (1 - \hat{alpha_n}) * I ). Therefore, the notation \hat{alpha_n} is different from the notation alpha_t in DPM-Solver. In fact, we have alpha_{t_n} = \sqrt{\hat{alpha_n}}, and log(alpha_{t_n}) = 0.5 * log(\hat{alpha_n}). 2. For continuous-time DPMs: We support the linear VPSDE for the continuous time setting. The hyperparameters for the noise schedule are the default settings in Yang Song's ScoreSDE: Args: beta_min: A `float` number. The smallest beta for the linear schedule. beta_max: A `float` number. The largest beta for the linear schedule. T: A `float` number. The ending time of the forward process. =============================================================== Args: schedule: A `str`. The noise schedule of the forward SDE. 'discrete' for discrete-time DPMs, 'linear' for continuous-time DPMs. Returns: A wrapper object of the forward SDE (VP type). =============================================================== Example: # For discrete-time DPMs, given betas (the beta array for n = 0, 1, ..., N - 1): >>> ns = NoiseScheduleVP('discrete', betas=betas) # For discrete-time DPMs, given alphas_cumprod (the \hat{alpha_n} array for n = 0, 1, ..., N - 1): >>> ns = NoiseScheduleVP('discrete', alphas_cumprod=alphas_cumprod) # For continuous-time DPMs (VPSDE), linear schedule: >>> ns = NoiseScheduleVP('linear', continuous_beta_0=0.1, continuous_beta_1=20.) """ if schedule not in ['discrete', 'linear']: raise ValueError( f"Unsupported noise schedule {schedule}. The schedule needs to be 'discrete' or 'linear'" ) self.schedule = schedule if schedule == 'discrete': if betas is not None: log_alphas = 0.5 * torch.log(1 - betas).cumsum(dim=0) else: assert alphas_cumprod is not None log_alphas = 0.5 * torch.log(alphas_cumprod) self.T = 1. 
def numerical_clip_alpha(self, log_alphas, clipped_lambda=-5.1):
    """
    Drop the trailing schedule entries whose half-logSNR falls below `clipped_lambda`.

    For some beta schedules such as the cosine schedule, the log-SNR has numerical issues
    near t=T. Clipping it at -5.1 keeps sampling stable; the trick is useful for models
    trained with the cosine schedule, such as i-DDPM, guided-diffusion and GLIDE.

    Args:
        log_alphas: A 1-D `torch.Tensor` of log(alpha_n) values.
            NOTE(review): the search below assumes the half-logSNR decreases along dim 0.
        clipped_lambda: A `float`. The smallest half-logSNR that is kept.
    Returns:
        `log_alphas`, with the tail entries below the threshold removed (unchanged if none are).
    """
    half_log_snr = log_alphas - 0.5 * torch.log(1. - torch.exp(2. * log_alphas))
    # searchsorted needs an ascending sequence, so look at the schedule back-to-front;
    # the insertion point is then the number of tail entries below the threshold.
    ascending = torch.flip(half_log_snr, [0])
    num_clipped = torch.searchsorted(ascending, clipped_lambda)
    if num_clipped > 0:
        return log_alphas[:-num_clipped]
    return log_alphas
""" if self.schedule == 'linear': tmp = 2. * (self.beta_1 - self.beta_0) * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb)) Delta = self.beta_0 ** 2 + tmp return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0) elif self.schedule == 'discrete': log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2. * lamb) t = interpolate_fn(log_alpha.reshape((-1, 1)), torch.flip(self.log_alpha_array.to(lamb.device), [1]), torch.flip(self.t_array.to(lamb.device), [1])) return t.reshape((-1,)) def model_wrapper( model, noise_schedule, model_type="noise", model_kwargs={}, guidance_type="uncond", condition=None, unconditional_condition=None, guidance_scale=1., classifier_fn=None, classifier_kwargs={}, ): """Create a wrapper function for the noise prediction model. DPM-Solver needs to solve the continuous-time diffusion ODEs. For DPMs trained on discrete-time labels, we need to firstly wrap the model function to a noise prediction model that accepts the continuous time as the input. We support four types of the diffusion model by setting `model_type`: 1. "noise": noise prediction model. (Trained by predicting noise). 2. "x_start": data prediction model. (Trained by predicting the data x_0 at time 0). 3. "v": velocity prediction model. (Trained by predicting the velocity). The "v" prediction is derivation detailed in Appendix D of [1], and is used in Imagen-Video [2]. [1] Salimans, Tim, and Jonathan Ho. "Progressive distillation for fast sampling of diffusion models." arXiv preprint arXiv:2202.00512 (2022). [2] Ho, Jonathan, et al. "Imagen Video: High Definition Video Generation with Diffusion Models." arXiv preprint arXiv:2210.02303 (2022). 4. "score": marginal score function. (Trained by denoising score matching). Note that the score function and the noise prediction model follows a simple relationship: ``` noise(x_t, t) = -sigma_t * score(x_t, t) ``` We support three types of guided sampling by DPMs by setting `guidance_type`: 1. 
"uncond": unconditional sampling by DPMs. The input `model` has the following format: `` model(x, t_input, **model_kwargs) -> noise | x_start | v | score `` 2. "classifier": classifier guidance sampling [3] by DPMs and another classifier. The input `model` has the following format: `` model(x, t_input, **model_kwargs) -> noise | x_start | v | score `` The input `classifier_fn` has the following format: `` classifier_fn(x, t_input, cond, **classifier_kwargs) -> logits(x, t_input, cond) `` [3] P. Dhariwal and A. Q. Nichol, "Diffusion models beat GANs on image synthesis," in Advances in Neural Information Processing Systems, vol. 34, 2021, pp. 8780-8794. 3. "classifier-free": classifier-free guidance sampling by conditional DPMs. The input `model` has the following format: `` model(x, t_input, cond, **model_kwargs) -> noise | x_start | v | score `` And if cond == `unconditional_condition`, the model output is the unconditional DPM output. [4] Ho, Jonathan, and Tim Salimans. "Classifier-free diffusion guidance." arXiv preprint arXiv:2207.12598 (2022). The `t_input` is the time label of the model, which may be discrete-time labels (i.e. 0 to 999) or continuous-time labels (i.e. epsilon to T). We wrap the model function to accept only `x` and `t_continuous` as inputs, and outputs the predicted noise: `` def model_fn(x, t_continuous) -> noise: t_input = get_model_input_time(t_continuous) return noise_pred(model, x, t_input, **model_kwargs) `` where `t_continuous` is the continuous time labels (i.e. epsilon to T). And we use `model_fn` for DPM-Solver. =============================================================== Args: model: A diffusion model with the corresponding format described above. noise_schedule: A noise schedule object, such as NoiseScheduleVP. model_type: A `str`. The parameterization type of the diffusion model. "noise" or "x_start" or "v" or "score". model_kwargs: A `dict`. A dict for the other inputs of the model function. guidance_type: A `str`. 
The type of the guidance for sampling. "uncond" or "classifier" or "classifier-free". condition: A pytorch tensor. The condition for the guided sampling. Only used for "classifier" or "classifier-free" guidance type. unconditional_condition: A pytorch tensor. The condition for the unconditional sampling. Only used for "classifier-free" guidance type. guidance_scale: A `float`. The scale for the guided sampling. classifier_fn: A classifier function. Only used for the classifier guidance. classifier_kwargs: A `dict`. A dict for the other inputs of the classifier function. Returns: A noise prediction model that accepts the noised data and the continuous time as the inputs. """ def get_model_input_time(t_continuous): """ Convert the continuous-time `t_continuous` (in [epsilon, T]) to the model input time. For discrete-time DPMs, we convert `t_continuous` in [1 / N, 1] to `t_input` in [0, 1000 * (N - 1) / N]. For continuous-time DPMs, we just use `t_continuous`. """ if noise_schedule.schedule == 'discrete': return (t_continuous - 1. / noise_schedule.total_N) * 1000. 
else: return t_continuous def noise_pred_fn(x, t_continuous, current, cache_dic, cond=None): t_input = get_model_input_time(t_continuous) if cond is None: output = model(x, t_input, current, cache_dic, **model_kwargs) else: output = model(x, t_input, current, cache_dic, cond, **model_kwargs) if model_type == "noise": return output elif model_type == "x_start": alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous) return (x - expand_dims(alpha_t, x.dim()) * output) / expand_dims(sigma_t, x.dim()) elif model_type == "v": alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous) return expand_dims(alpha_t, x.dim()) * output + expand_dims(sigma_t, x.dim()) * x elif model_type == "score": sigma_t = noise_schedule.marginal_std(t_continuous) return -expand_dims(sigma_t, x.dim()) * output def cond_grad_fn(x, t_input): """ Compute the gradient of the classifier, i.e. nabla_{x} log p_t(cond | x_t). """ with torch.enable_grad(): x_in = x.detach().requires_grad_(True) log_prob = classifier_fn(x_in, t_input, condition, **classifier_kwargs) return torch.autograd.grad(log_prob.sum(), x_in)[0] def model_fn(x, t_continuous, current, cache_dic): """ The noise predicition model function that is used for DPM-Solver. """ if guidance_type == "uncond": return noise_pred_fn(x, t_continuous) elif guidance_type == "classifier": assert classifier_fn is not None t_input = get_model_input_time(t_continuous) cond_grad = cond_grad_fn(x, t_input) sigma_t = noise_schedule.marginal_std(t_continuous) noise = noise_pred_fn(x, t_continuous) return noise - guidance_scale * expand_dims(sigma_t, x.dim()) * cond_grad elif guidance_type == "classifier-free": if guidance_scale == 1. 
or unconditional_condition is None: return noise_pred_fn(x, t_continuous, cond=condition) x_in = torch.cat([x] * 2) t_in = torch.cat([t_continuous] * 2) c_in = torch.cat([unconditional_condition, condition]) noise_uncond, noise = noise_pred_fn(x_in, t_in, current, cache_dic, cond=c_in).chunk(2) return noise_uncond + guidance_scale * (noise - noise_uncond) assert model_type in ["noise", "x_start", "v", "score"] assert guidance_type in ["uncond", "classifier", "classifier-free"] return model_fn class DPM_Solver: def __init__( self, model_fn, noise_schedule, algorithm_type="dpmsolver++", correcting_x0_fn=None, correcting_xt_fn=None, thresholding_max_val=1., dynamic_thresholding_ratio=0.995, ): """Construct a DPM-Solver. We support both DPM-Solver (`algorithm_type="dpmsolver"`) and DPM-Solver++ (`algorithm_type="dpmsolver++"`). We also support the "dynamic thresholding" method in Imagen[1]. For pixel-space diffusion models, you can set both `algorithm_type="dpmsolver++"` and `correcting_x0_fn="dynamic_thresholding"` to use the dynamic thresholding. The "dynamic thresholding" can greatly improve the sample quality for pixel-space DPMs with large guidance scales. Note that the thresholding method is **unsuitable** for latent-space DPMs (such as stable-diffusion). To support advanced algorithms in image-to-image applications, we also support corrector functions for both x0 and xt. Args: model_fn: A noise prediction model function which accepts the continuous-time input (t in [epsilon, T]): `` def model_fn(x, t_continuous): return noise `` The shape of `x` is `(batch_size, **shape)`, and the shape of `t_continuous` is `(batch_size,)`. noise_schedule: A noise schedule object, such as NoiseScheduleVP. algorithm_type: A `str`. Either "dpmsolver" or "dpmsolver++". correcting_x0_fn: A `str` or a function with the following format: ``` def correcting_x0_fn(x0, t): x0_new = ... 
return x0_new ``` This function is to correct the outputs of the data prediction model at each sampling step. e.g., ``` x0_pred = data_pred_model(xt, t) if correcting_x0_fn is not None: x0_pred = correcting_x0_fn(x0_pred, t) xt_1 = update(x0_pred, xt, t) ``` If `correcting_x0_fn="dynamic_thresholding"`, we use the dynamic thresholding proposed in Imagen[1]. correcting_xt_fn: A function with the following format: ``` def correcting_xt_fn(xt, t, step): x_new = ... return x_new ``` This function is to correct the intermediate samples xt at each sampling step. e.g., ``` xt = ... xt = correcting_xt_fn(xt, t, step) ``` thresholding_max_val: A `float`. The max value for thresholding. Valid only when use `dpmsolver++` and `correcting_x0_fn="dynamic_thresholding"`. dynamic_thresholding_ratio: A `float`. The ratio for dynamic thresholding (see Imagen[1] for details). Valid only when use `dpmsolver++` and `correcting_x0_fn="dynamic_thresholding"`. [1] Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily Denton, Seyed Kamyar Seyed Ghasemipour, Burcu Karagol Ayan, S Sara Mahdavi, Rapha Gontijo Lopes, et al. Photorealistic text-to-image diffusion models with deep language understanding. arXiv preprint arXiv:2205.11487, 2022b. """ self.model = lambda x, t, current, cache_dic: model_fn(x, t.expand((x.shape[0])), current, cache_dic) self.noise_schedule = noise_schedule assert algorithm_type in ["dpmsolver", "dpmsolver++"] self.algorithm_type = algorithm_type if correcting_x0_fn == "dynamic_thresholding": self.correcting_x0_fn = self.dynamic_thresholding_fn else: self.correcting_x0_fn = correcting_x0_fn self.correcting_xt_fn = correcting_xt_fn self.dynamic_thresholding_ratio = dynamic_thresholding_ratio self.thresholding_max_val = thresholding_max_val def dynamic_thresholding_fn(self, x0, t): """ The dynamic thresholding method. 
""" dims = x0.dim() p = self.dynamic_thresholding_ratio s = torch.quantile(torch.abs(x0).reshape((x0.shape[0], -1)), p, dim=1) s = expand_dims(torch.maximum(s, self.thresholding_max_val * torch.ones_like(s).to(s.device)), dims) x0 = torch.clamp(x0, -s, s) / s return x0 def noise_prediction_fn(self, x, t, current, cache_dic): """ Return the noise prediction model. """ return self.model(x, t, current, cache_dic) def data_prediction_fn(self, x, t, current, cache_dic): """ Return the data prediction model (with corrector). """ noise = self.noise_prediction_fn(x, t, current, cache_dic) alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t) x0 = (x - sigma_t * noise) / alpha_t if self.correcting_x0_fn is not None: x0 = self.correcting_x0_fn(x0, t) return x0 def model_fn(self, x, t, current, cache_dic): """ Convert the model to the noise prediction model or the data prediction model. """ if self.algorithm_type == "dpmsolver++": return self.data_prediction_fn(x, t, current, cache_dic) else: return self.noise_prediction_fn(x, t, current, cache_dic) def get_time_steps(self, skip_type, t_T, t_0, N, device): """Compute the intermediate time steps for sampling. Args: skip_type: A `str`. The type for the spacing of the time steps. We support three types: - 'logSNR': uniform logSNR for the time steps. - 'time_uniform': uniform time for the time steps. (**Recommended for high-resolutional data**.) - 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolutional data.) t_T: A `float`. The starting time of the sampling (default is T). t_0: A `float`. The ending time of the sampling (default is epsilon). N: A `int`. The total number of the spacing of the time steps. device: A torch device. Returns: A pytorch tensor of the time steps, with the shape (N + 1,). 
""" if skip_type == 'logSNR': lambda_T = self.noise_schedule.marginal_lambda(torch.tensor(t_T).to(device)) lambda_0 = self.noise_schedule.marginal_lambda(torch.tensor(t_0).to(device)) logSNR_steps = torch.linspace(lambda_T.cpu().item(), lambda_0.cpu().item(), N + 1).to(device) return self.noise_schedule.inverse_lambda(logSNR_steps) elif skip_type == 'time_uniform': return torch.linspace(t_T, t_0, N + 1).to(device) elif skip_type == 'time_quadratic': t_order = 2 return ( torch.linspace( t_T ** (1.0 / t_order), t_0 ** (1.0 / t_order), N + 1 ) .pow(t_order) .to(device) ) else: raise ValueError( f"Unsupported skip_type {skip_type}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'" ) def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device): """ Get the order of each step for sampling by the singlestep DPM-Solver. We combine both DPM-Solver-1,2,3 to use all the function evaluations, which is named as "DPM-Solver-fast". Given a fixed number of function evaluations by `steps`, the sampling procedure by DPM-Solver-fast is: - If order == 1: We take `steps` of DPM-Solver-1 (i.e. DDIM). - If order == 2: - Denote K = (steps // 2). We take K or (K + 1) intermediate time steps for sampling. - If steps % 2 == 0, we use K steps of DPM-Solver-2. - If steps % 2 == 1, we use K steps of DPM-Solver-2 and 1 step of DPM-Solver-1. - If order == 3: - Denote K = (steps // 3 + 1). We take K intermediate time steps for sampling. - If steps % 3 == 0, we use (K - 2) steps of DPM-Solver-3, and 1 step of DPM-Solver-2 and 1 step of DPM-Solver-1. - If steps % 3 == 1, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-1. - If steps % 3 == 2, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-2. ============================================ Args: order: A `int`. The max order for the solver (2 or 3). steps: A `int`. The total number of function evaluations (NFE). skip_type: A `str`. The type for the spacing of the time steps. 
def denoise_to_zero_fn(self, x, s, current=None, cache_dic=None):
    """
    Denoise at the final step, which is equivalent to solving the ODE from lambda_s to infty
    by first-order discretization.

    Args:
        x: A pytorch tensor. The sample at time `s`.
        s: A pytorch tensor. The time at which the data prediction is evaluated.
        current: Forwarded unchanged to `data_prediction_fn`. Defaults to None so that
            two-argument callers keep working.
        cache_dic: Forwarded unchanged to `data_prediction_fn`. Defaults to None.
    Returns:
        The data prediction returned by `data_prediction_fn` for `x` at time `s`.
    """
    # data_prediction_fn takes (x, t, current, cache_dic); calling it with only (x, s)
    # raised a TypeError, so both extra arguments are always passed through.
    return self.data_prediction_fn(x, s, current, cache_dic)
def singlestep_dpm_solver_second_update(self, x, s, t, current, cache_dic, r1=0.5, model_s=None,
                                        return_intermediate=False, solver_type='dpmsolver'):
    """
    Take one singlestep DPM-Solver-2 step from time `s` to time `t`.

    The model is evaluated at `s` (unless `model_s` is supplied) and at one intermediate
    time `s1`, chosen so that its half-logSNR lies a fraction `r1` of the way from `s` to `t`.

    Args:
        x: A pytorch tensor. The initial value at time `s`.
        s: A pytorch tensor. The starting time, with the shape (1,).
        t: A pytorch tensor. The ending time, with the shape (1,).
        current: Forwarded unchanged to `self.model_fn`.
        cache_dic: Forwarded unchanged to `self.model_fn`.
        r1: A `float`. The hyperparameter of the second-order solver; None means 0.5.
        model_s: A pytorch tensor. The model function evaluated at time `s`.
            If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
        return_intermediate: A `bool`. If true, also return the model value at time `s` and `s1`
            (the intermediate time).
        solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
            The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
    Returns:
        x_t: A pytorch tensor. The approximated solution at time `t`. When `return_intermediate`
            is true, a tuple of `x_t` and a dict with keys 'model_s' and 'model_s1'.
    Raises:
        ValueError: if `solver_type` is neither 'dpmsolver' nor 'taylor'.
    """
    if solver_type not in ['dpmsolver', 'taylor']:
        raise ValueError(
            f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}"
        )
    if r1 is None:
        r1 = 0.5

    schedule = self.noise_schedule
    lambda_s = schedule.marginal_lambda(s)
    lambda_t = schedule.marginal_lambda(t)
    h = lambda_t - lambda_s
    # Intermediate time: a fraction r1 of the half-logSNR step, mapped back to time.
    s1 = schedule.inverse_lambda(lambda_s + r1 * h)

    log_alpha_s = schedule.marginal_log_mean_coeff(s)
    log_alpha_s1 = schedule.marginal_log_mean_coeff(s1)
    log_alpha_t = schedule.marginal_log_mean_coeff(t)
    sigma_s = schedule.marginal_std(s)
    sigma_s1 = schedule.marginal_std(s1)
    sigma_t = schedule.marginal_std(t)

    # Both algorithm variants need the model value at `s` before anything else.
    if model_s is None:
        model_s = self.model_fn(x, s, current, cache_dic)

    use_taylor = solver_type == 'taylor'
    if self.algorithm_type == "dpmsolver++":
        alpha_s1 = torch.exp(log_alpha_s1)
        alpha_t = torch.exp(log_alpha_t)
        phi_11 = torch.expm1(-r1 * h)
        phi_1 = torch.expm1(-h)
        x_s1 = (
                (sigma_s1 / sigma_s) * x
                - (alpha_s1 * phi_11) * model_s
        )
        model_s1 = self.model_fn(x_s1, s1, current, cache_dic)
        if use_taylor:
            x_t = (
                    (sigma_t / sigma_s) * x
                    - (alpha_t * phi_1) * model_s
                    + (1. / r1) * (alpha_t * (phi_1 / h + 1.)) * (model_s1 - model_s)
            )
        else:
            x_t = (
                    (sigma_t / sigma_s) * x
                    - (alpha_t * phi_1) * model_s
                    - (0.5 / r1) * (alpha_t * phi_1) * (model_s1 - model_s)
            )
    else:
        phi_11 = torch.expm1(r1 * h)
        phi_1 = torch.expm1(h)
        x_s1 = (
                torch.exp(log_alpha_s1 - log_alpha_s) * x
                - (sigma_s1 * phi_11) * model_s
        )
        model_s1 = self.model_fn(x_s1, s1, current, cache_dic)
        if use_taylor:
            x_t = (
                    torch.exp(log_alpha_t - log_alpha_s) * x
                    - (sigma_t * phi_1) * model_s
                    - (1. / r1) * (sigma_t * (phi_1 / h - 1.)) * (model_s1 - model_s)
            )
        else:
            x_t = (
                    torch.exp(log_alpha_t - log_alpha_s) * x
                    - (sigma_t * phi_1) * model_s
                    - (0.5 / r1) * (sigma_t * phi_1) * (model_s1 - model_s)
            )

    if not return_intermediate:
        return x_t
    return x_t, {'model_s': model_s, 'model_s1': model_s1}
ns = self.noise_schedule lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t) h = lambda_t - lambda_s lambda_s1 = lambda_s + r1 * h lambda_s2 = lambda_s + r2 * h s1 = ns.inverse_lambda(lambda_s1) s2 = ns.inverse_lambda(lambda_s2) log_alpha_s, log_alpha_s1, log_alpha_s2, log_alpha_t = ns.marginal_log_mean_coeff( s), ns.marginal_log_mean_coeff(s1), ns.marginal_log_mean_coeff(s2), ns.marginal_log_mean_coeff(t) sigma_s, sigma_s1, sigma_s2, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std( s2), ns.marginal_std(t) alpha_s1, alpha_s2, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_s2), torch.exp(log_alpha_t) if self.algorithm_type == "dpmsolver++": phi_11 = torch.expm1(-r1 * h) phi_12 = torch.expm1(-r2 * h) phi_1 = torch.expm1(-h) phi_22 = torch.expm1(-r2 * h) / (r2 * h) + 1. phi_2 = phi_1 / h + 1. phi_3 = phi_2 / h - 0.5 if model_s is None: model_s = self.model_fn(x, s, current, cache_dic) if model_s1 is None: x_s1 = ( (sigma_s1 / sigma_s) * x - (alpha_s1 * phi_11) * model_s ) model_s1 = self.model_fn(x_s1, s1, current, cache_dic) x_s2 = ( (sigma_s2 / sigma_s) * x - (alpha_s2 * phi_12) * model_s + r2 / r1 * (alpha_s2 * phi_22) * (model_s1 - model_s) ) model_s2 = self.model_fn(x_s2, s2, current, cache_dic) if solver_type == 'dpmsolver': x_t = ( (sigma_t / sigma_s) * x - (alpha_t * phi_1) * model_s + (1. / r2) * (alpha_t * phi_2) * (model_s2 - model_s) ) elif solver_type == 'taylor': D1_0 = (1. / r1) * (model_s1 - model_s) D1_1 = (1. / r2) * (model_s2 - model_s) D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1) D2 = 2. * (D1_1 - D1_0) / (r2 - r1) x_t = ( (sigma_t / sigma_s) * x - (alpha_t * phi_1) * model_s + (alpha_t * phi_2) * D1 - (alpha_t * phi_3) * D2 ) else: phi_11 = torch.expm1(r1 * h) phi_12 = torch.expm1(r2 * h) phi_1 = torch.expm1(h) phi_22 = torch.expm1(r2 * h) / (r2 * h) - 1. phi_2 = phi_1 / h - 1. 
def multistep_dpm_solver_second_update(self, x, model_prev_list, t_prev_list, t, solver_type="dpmsolver"):
    """
    Take one multistep DPM-Solver-2 step from time `t_prev_list[-1]` to time `t`.

    Only the last two entries of `model_prev_list` and `t_prev_list` are read; no new
    model evaluation is performed here.

    Args:
        x: A pytorch tensor. The initial value at time `t_prev_list[-1]`.
        model_prev_list: A list of pytorch tensor. The previous computed model values.
        t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (1,)
        t: A pytorch tensor. The ending time, with the shape (1,).
        solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
            The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
    Returns:
        x_t: A pytorch tensor. The approximated solution at time `t`.
    Raises:
        ValueError: if `solver_type` is neither 'dpmsolver' nor 'taylor'.
    """
    if solver_type not in ['dpmsolver', 'taylor']:
        raise ValueError(
            f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}"
        )
    schedule = self.noise_schedule
    model_prev_1 = model_prev_list[-2]
    model_prev_0 = model_prev_list[-1]
    t_prev_1 = t_prev_list[-2]
    t_prev_0 = t_prev_list[-1]

    lambda_prev_1 = schedule.marginal_lambda(t_prev_1)
    lambda_prev_0 = schedule.marginal_lambda(t_prev_0)
    lambda_t = schedule.marginal_lambda(t)
    log_alpha_prev_0 = schedule.marginal_log_mean_coeff(t_prev_0)
    log_alpha_t = schedule.marginal_log_mean_coeff(t)
    sigma_prev_0 = schedule.marginal_std(t_prev_0)
    sigma_t = schedule.marginal_std(t)

    h_0 = lambda_prev_0 - lambda_prev_1
    h = lambda_t - lambda_prev_0
    r0 = h_0 / h
    # Finite-difference estimate built from the two stored model values.
    D1_0 = (1. / r0) * (model_prev_0 - model_prev_1)

    use_taylor = solver_type == 'taylor'
    if self.algorithm_type == "dpmsolver++":
        alpha_t = torch.exp(log_alpha_t)
        phi_1 = torch.expm1(-h)
        if use_taylor:
            return (
                    (sigma_t / sigma_prev_0) * x
                    - (alpha_t * phi_1) * model_prev_0
                    + (alpha_t * (phi_1 / h + 1.)) * D1_0
            )
        return (
                (sigma_t / sigma_prev_0) * x
                - (alpha_t * phi_1) * model_prev_0
                - 0.5 * (alpha_t * phi_1) * D1_0
        )

    phi_1 = torch.expm1(h)
    if use_taylor:
        return (
                (torch.exp(log_alpha_t - log_alpha_prev_0)) * x
                - (sigma_t * phi_1) * model_prev_0
                - (sigma_t * (phi_1 / h - 1.)) * D1_0
        )
    return (
            (torch.exp(log_alpha_t - log_alpha_prev_0)) * x
            - (sigma_t * phi_1) * model_prev_0
            - 0.5 * (sigma_t * phi_1) * D1_0
    )
The type slightly impacts the performance. We recommend to use 'dpmsolver' type. Returns: x_t: A pytorch tensor. The approximated solution at time `t`. """ ns = self.noise_schedule model_prev_2, model_prev_1, model_prev_0 = model_prev_list t_prev_2, t_prev_1, t_prev_0 = t_prev_list lambda_prev_2, lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_2), ns.marginal_lambda( t_prev_1), ns.marginal_lambda(t_prev_0), ns.marginal_lambda(t) log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t) sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t) alpha_t = torch.exp(log_alpha_t) h_1 = lambda_prev_1 - lambda_prev_2 h_0 = lambda_prev_0 - lambda_prev_1 h = lambda_t - lambda_prev_0 r0, r1 = h_0 / h, h_1 / h D1_0 = (1. / r0) * (model_prev_0 - model_prev_1) D1_1 = (1. / r1) * (model_prev_1 - model_prev_2) D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1) D2 = (1. / (r0 + r1)) * (D1_0 - D1_1) if self.algorithm_type == "dpmsolver++": phi_1 = torch.expm1(-h) phi_2 = phi_1 / h + 1. phi_3 = phi_2 / h - 0.5 return ( (sigma_t / sigma_prev_0) * x - (alpha_t * phi_1) * model_prev_0 + (alpha_t * phi_2) * D1 - (alpha_t * phi_3) * D2 ) else: phi_1 = torch.expm1(h) phi_2 = phi_1 / h - 1. phi_3 = phi_2 / h - 0.5 return ( (torch.exp(log_alpha_t - log_alpha_prev_0)) * x - (sigma_t * phi_1) * model_prev_0 - (sigma_t * phi_2) * D1 - (sigma_t * phi_3) * D2 ) def singlestep_dpm_solver_update(self, x, s, t, current, cache_dic, order, return_intermediate=False, solver_type='dpmsolver', r1=None, r2=None): """ Singlestep DPM-Solver with the order `order` from time `s` to time `t`. Args: x: A pytorch tensor. The initial value at time `s`. s: A pytorch tensor. The starting time, with the shape (1,). t: A pytorch tensor. The ending time, with the shape (1,). order: A `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3. return_intermediate: A `bool`. 
def multistep_dpm_solver_update(self, x, model_prev_list, t_prev_list, t, current, cache_dic, order,
                                solver_type='dpmsolver'):
    """
    Dispatch one multistep DPM-Solver step of the requested `order`,
    going from time `t_prev_list[-1]` to time `t`.

    Args:
        x: A pytorch tensor. The current value at time `t_prev_list[-1]`.
        model_prev_list: A list of pytorch tensor. The previous computed model values.
        t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (1,)
        t: A pytorch tensor. The ending time, with the shape (1,).
        current: Passed through to `dpm_solver_first_update`; only used when `order == 1`.
        cache_dic: Passed through to `dpm_solver_first_update`; only used when `order == 1`.
        order: A `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3.
        solver_type: either 'dpmsolver' or 'taylor'. Forwarded to the second/third order updates.
    Returns:
        x_t: A pytorch tensor. The approximated solution at time `t`.
    Raises:
        ValueError: if `order` is not 1, 2 or 3.
    """
    if order == 1:
        # First order reuses the most recent model output instead of re-evaluating the model.
        latest_time = t_prev_list[-1]
        latest_model = model_prev_list[-1]
        return self.dpm_solver_first_update(x, latest_time, t, current, cache_dic, model_s=latest_model)

    if order == 2:
        higher_order_step = self.multistep_dpm_solver_second_update
    elif order == 3:
        higher_order_step = self.multistep_dpm_solver_third_update
    else:
        raise ValueError(f"Solver order must be 1 or 2 or 3, got {order}")

    return higher_order_step(x, model_prev_list, t_prev_list, t, solver_type=solver_type)
""" ns = self.noise_schedule s = t_T * torch.ones((1,)).to(x) lambda_s = ns.marginal_lambda(s) lambda_0 = ns.marginal_lambda(t_0 * torch.ones_like(s).to(x)) h = h_init * torch.ones_like(s).to(x) x_prev = x nfe = 0 if order == 2: r1 = 0.5 lower_update = lambda x, s, t: self.dpm_solver_first_update(x, s, t, return_intermediate=True) higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, solver_type=solver_type, **kwargs) elif order == 3: r1, r2 = 1. / 3., 2. / 3. lower_update = lambda x, s, t: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, return_intermediate=True, solver_type=solver_type) higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2, solver_type=solver_type, **kwargs) else: raise ValueError( f"For adaptive step size solver, order must be 2 or 3, got {order}" ) while torch.abs((s - t_0)).mean() > t_err: t = ns.inverse_lambda(lambda_s + h) x_lower, lower_noise_kwargs = lower_update(x, s, t) x_higher = higher_update(x, s, t, **lower_noise_kwargs) delta = torch.max(torch.ones_like(x).to(x) * atol, rtol * torch.max(torch.abs(x_lower), torch.abs(x_prev))) norm_fn = lambda v: torch.sqrt(torch.square(v.reshape((v.shape[0], -1))).mean(dim=-1, keepdim=True)) E = norm_fn((x_higher - x_lower) / delta).max() if torch.all(E <= 1.): x = x_higher s = t x_prev = x_lower lambda_s = ns.marginal_lambda(s) h = torch.min(theta * h * torch.float_power(E, -1. / order).float(), lambda_0 - lambda_s) nfe += order print('adaptive solver nfe', nfe) return x def add_noise(self, x, t, noise=None): """ Compute the noised input xt = alpha_t * x + sigma_t * noise. Args: x: A `torch.Tensor` with shape `(batch_size, *shape)`. t: A `torch.Tensor` with shape `(t_size,)`. Returns: xt with shape `(t_size, batch_size, *shape)`. 
def sample(self, x, steps=20, t_start=None, t_end=None, order=2, skip_type='time_uniform',
           method='multistep', lower_order_final=True, denoise_to_zero=False, solver_type='dpmsolver',
           atol=0.0078, rtol=0.05, return_intermediate=False, model_kwargs=None, rank=None,
           ):
    """
    Compute the sample at time `t_end` by DPM-Solver, given the initial `x` at time `t_start`.

    =====================================================

    We support the following algorithms for both noise prediction model and data prediction model:
        - 'singlestep':
            Singlestep DPM-Solver (i.e. "DPM-Solver-fast" in the paper), which combines different orders of singlestep DPM-Solver.
            We combine all the singlestep solvers with order <= `order` to use up all the function evaluations (steps).
            The total number of function evaluations (NFE) == `steps`.
            Given a fixed NFE == `steps`, the sampling procedure is:
                - If `order` == 1:
                    - Denote K = steps. We use K steps of DPM-Solver-1 (i.e. DDIM).
                - If `order` == 2:
                    - Denote K = (steps // 2) + (steps % 2). We take K intermediate time steps for sampling.
                    - If steps % 2 == 0, we use K steps of singlestep DPM-Solver-2.
                    - If steps % 2 == 1, we use (K - 1) steps of singlestep DPM-Solver-2 and 1 step of DPM-Solver-1.
                - If `order` == 3:
                    - Denote K = (steps // 3 + 1). We take K intermediate time steps for sampling.
                    - If steps % 3 == 0, we use (K - 2) steps of singlestep DPM-Solver-3, and 1 step of singlestep DPM-Solver-2 and 1 step of DPM-Solver-1.
                    - If steps % 3 == 1, we use (K - 1) steps of singlestep DPM-Solver-3 and 1 step of DPM-Solver-1.
                    - If steps % 3 == 2, we use (K - 1) steps of singlestep DPM-Solver-3 and 1 step of singlestep DPM-Solver-2.
        - 'multistep':
            Multistep DPM-Solver with the order of `order`. The total number of function evaluations (NFE) == `steps`.
            We initialize the first `order` values by lower order multistep solvers.
            Given a fixed NFE == `steps`, the sampling procedure is:
                Denote K = steps.
                - If `order` == 1:
                    - We use K steps of DPM-Solver-1 (i.e. DDIM).
                - If `order` == 2:
                    - We firstly use 1 step of DPM-Solver-1, then use (K - 1) step of multistep DPM-Solver-2.
                - If `order` == 3:
                    - We firstly use 1 step of DPM-Solver-1, then 1 step of multistep DPM-Solver-2, then (K - 2) step of multistep DPM-Solver-3.
        - 'singlestep_fixed':
            Fixed order singlestep DPM-Solver (i.e. DPM-Solver-1 or singlestep DPM-Solver-2 or singlestep DPM-Solver-3).
            We use singlestep DPM-Solver-`order` for `order`=1 or 2 or 3, with total [`steps` // `order`] * `order` NFE.
        - 'adaptive':
            Adaptive step size DPM-Solver (i.e. "DPM-Solver-12" and "DPM-Solver-23" in the paper).
            We ignore `steps` and use adaptive step size DPM-Solver with a higher order of `order`.
            You can adjust the absolute tolerance `atol` and the relative tolerance `rtol` to balance the computation costs
            (NFE) and the sample quality.
                - If `order` == 2, we use DPM-Solver-12 which combines DPM-Solver-1 and singlestep DPM-Solver-2.
                - If `order` == 3, we use DPM-Solver-23 which combines singlestep DPM-Solver-2 and singlestep DPM-Solver-3.

    =====================================================

    Some advices for choosing the algorithm:
        - For **unconditional sampling** or **guided sampling with small guidance scale** by DPMs:
            Use singlestep DPM-Solver or DPM-Solver++ ("DPM-Solver-fast" in the paper) with `order = 3`.
            e.g., DPM-Solver:
                >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver")
                >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=3,
                        skip_type='time_uniform', method='singlestep')
            e.g., DPM-Solver++:
                >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
                >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=3,
                        skip_type='time_uniform', method='singlestep')
        - For **guided sampling with large guidance scale** by DPMs:
            Use multistep DPM-Solver with `algorithm_type="dpmsolver++"` and `order = 2`.
            e.g.
                >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
                >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=2,
                        skip_type='time_uniform', method='multistep')

    We support three types of `skip_type`:
        - 'logSNR': uniform logSNR for the time steps. **Recommended for low-resolutional images**
        - 'time_uniform': uniform time for the time steps. **Recommended for high-resolutional images**.
        - 'time_quadratic': quadratic time for the time steps.

    =====================================================
    Args:
        x: A pytorch tensor. The initial value at time `t_start`
            e.g. if `t_start` == T, then `x` is a sample from the standard normal distribution.
        steps: A `int`. The total number of function evaluations (NFE).
        t_start: A `float`. The starting time of the sampling.
            If `t_start` is None, we use self.noise_schedule.T (default is 1.0).
        t_end: A `float`. The ending time of the sampling.
            If `t_end` is None, we use 1. / self.noise_schedule.total_N.
            e.g. if total_N == 1000, we have `t_end` == 1e-3.
            For discrete-time DPMs:
                - We recommend `t_end` == 1. / self.noise_schedule.total_N.
            For continuous-time DPMs:
                - We recommend `t_end` == 1e-3 when `steps` <= 15; and `t_end` == 1e-4 when `steps` > 15.
        order: A `int`. The order of DPM-Solver.
        skip_type: A `str`. The type for the spacing of the time steps. 'time_uniform' or 'logSNR' or 'time_quadratic'.
        method: A `str`. The method for sampling. 'singlestep' or 'multistep' or 'singlestep_fixed' or 'adaptive'.
        denoise_to_zero: A `bool`. Whether to denoise to time 0 at the final step.
            Default is `False`. If `denoise_to_zero` is `True`, the total NFE is (`steps` + 1).

            This trick is firstly proposed by DDPM (https://arxiv.org/abs/2006.11239) and
            score_sde (https://arxiv.org/abs/2011.13456). Such trick can improve the FID
            for diffusion models sampling by diffusion SDEs for low-resolutional images
            (such as CIFAR-10). However, we observed that such trick does not matter for
            high-resolutional images. As it needs an additional NFE, we do not recommend
            it for high-resolutional images.
        lower_order_final: A `bool`. Whether to use lower order solvers at the final steps.
            Only applied for `method=multistep` and `steps < 10`. We empirically find that
            this trick is a key to stabilizing the sampling by DPM-Solver with very few steps
            (especially for steps <= 10). So we recommend to set it to be `True`.
        solver_type: A `str`. The taylor expansion type for the solver. `dpmsolver` or `taylor`. We recommend `dpmsolver`.
        atol: A `float`. The absolute tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'.
        rtol: A `float`. The relative tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'.
        return_intermediate: A `bool`. Whether to save the xt at each step.
            When set to `True`, method returns a tuple (x0, intermediates); when set to False, method returns only x0.
        model_kwargs: A `dict` or None. Forwarded to `cache_init` together with `steps`;
            None is treated as an empty dict.
        rank: Process rank or None. The tqdm progress bar of the multistep loop is shown
            only when `rank` is 0 or None.
    Returns:
        x_end: A pytorch tensor. The approximated solution at time `t_end`.
            If `return_intermediate` is True, the tuple `(x_end, intermediates)` instead.
    """
    # Build a fresh dict per call; a `{}` default would be one object shared by every call.
    if model_kwargs is None:
        model_kwargs = {}

    t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end
    t_T = self.noise_schedule.T if t_start is None else t_start
    assert t_0 > 0 and t_T > 0, "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"
    if return_intermediate:
        assert method in ['multistep', 'singlestep', 'singlestep_fixed'], "Cannot use adaptive solver when saving intermediate values"
    if self.correcting_xt_fn is not None:
        assert method in ['multistep', 'singlestep', 'singlestep_fixed'], "Cannot use adaptive solver when correcting_xt_fn is not None"
    device = x.device
    intermediates = []
    cache_dic, current = cache_init(model_kwargs=model_kwargs, num_steps=steps)
    with torch.no_grad():
        if method == 'adaptive':
            # NOTE(review): `current` / `cache_dic` are not handed to the adaptive solver here;
            # confirm it can drive the cache-aware update methods without them.
            x = self.dpm_solver_adaptive(x, order=order, t_T=t_T, t_0=t_0, atol=atol, rtol=rtol,
                                         solver_type=solver_type)
        elif method == 'multistep':
            assert steps >= order
            timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, device=device)
            assert timesteps.shape[0] - 1 == steps
            # Init the initial values.
            step = 0
            current['step'] = step
            t = timesteps[step]
            t_prev_list = [t]
            model_prev_list = [self.model_fn(x, t, current, cache_dic)]
            if self.correcting_xt_fn is not None:
                x = self.correcting_xt_fn(x, t, step)
            if return_intermediate:
                intermediates.append(x)
            # Init the first `order` values by lower order multistep DPM-Solver.
            for step in range(1, order):
                current['step'] = step
                t = timesteps[step]
                x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, t, current, cache_dic,
                                                     step, solver_type=solver_type)
                if self.correcting_xt_fn is not None:
                    x = self.correcting_xt_fn(x, t, step)
                if return_intermediate:
                    intermediates.append(x)
                t_prev_list.append(t)
                model_prev_list.append(self.model_fn(x, t, current, cache_dic))
            # Compute the remaining values by `order`-th order multistep DPM-Solver.
            remaining_steps = range(order, steps + 1)
            show_progress = rank is None or rank == 0
            pbar = tqdm(remaining_steps, leave=False) if show_progress else remaining_steps
            for step in pbar:
                current['step'] = step
                t = timesteps[step]
                # We only use lower order for steps < 10
                if lower_order_final and steps < 10:
                    step_order = min(order, steps + 1 - step)
                else:
                    step_order = order
                x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, t, current, cache_dic,
                                                     step_order, solver_type=solver_type)
                if self.correcting_xt_fn is not None:
                    x = self.correcting_xt_fn(x, t, step)
                if return_intermediate:
                    intermediates.append(x)
                # Slide the history window forward by one entry.
                for i in range(order - 1):
                    t_prev_list[i] = t_prev_list[i + 1]
                    model_prev_list[i] = model_prev_list[i + 1]
                t_prev_list[-1] = t
                # We do not need to evaluate the final model value.
                if step < steps:
                    model_prev_list[-1] = self.model_fn(x, t, current, cache_dic)
        elif method in ['singlestep', 'singlestep_fixed']:
            if method == 'singlestep':
                timesteps_outer, orders = self.get_orders_and_timesteps_for_singlestep_solver(
                    steps=steps, order=order, skip_type=skip_type, t_T=t_T, t_0=t_0, device=device)
            elif method == 'singlestep_fixed':
                K = steps // order
                orders = [order, ] * K
                timesteps_outer = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=K, device=device)
            # `step_order` keeps the per-step order apart from the `order` argument.
            for step, step_order in enumerate(orders):
                # Mirrors the multistep branch, which records the step index before every
                # model evaluation. NOTE(review): confirm the cache expects the outer step here.
                current['step'] = step
                s, t = timesteps_outer[step], timesteps_outer[step + 1]
                timesteps_inner = self.get_time_steps(skip_type=skip_type, t_T=s.item(), t_0=t.item(),
                                                      N=step_order, device=device)
                lambda_inner = self.noise_schedule.marginal_lambda(timesteps_inner)
                h = lambda_inner[-1] - lambda_inner[0]
                r1 = None if step_order <= 1 else (lambda_inner[1] - lambda_inner[0]) / h
                r2 = None if step_order <= 2 else (lambda_inner[2] - lambda_inner[0]) / h
                # `singlestep_dpm_solver_update` takes (x, s, t, current, cache_dic, order, ...);
                # omitting `current` / `cache_dic` shifts `order` into the wrong slot.
                x = self.singlestep_dpm_solver_update(x, s, t, current, cache_dic, step_order,
                                                      solver_type=solver_type, r1=r1, r2=r2)
                if self.correcting_xt_fn is not None:
                    x = self.correcting_xt_fn(x, t, step)
                if return_intermediate:
                    intermediates.append(x)
        else:
            raise ValueError(f"Got wrong method {method}")
        if denoise_to_zero:
            t = torch.ones((1,)).to(device) * t_0
            x = self.denoise_to_zero_fn(x, t)
            if self.correcting_xt_fn is not None:
                x = self.correcting_xt_fn(x, t, step + 1)
            if return_intermediate:
                intermediates.append(x)
    return (x, intermediates) if return_intermediate else x
Args: x: PyTorch tensor with shape [N, C], where N is the batch size, C is the number of channels (we use C = 1 for DPM-Solver). xp: PyTorch tensor with shape [C, K], where K is the number of keypoints. yp: PyTorch tensor with shape [C, K]. Returns: The function values f(x), with shape [N, C]. """ N, K = x.shape[0], xp.shape[1] all_x = torch.cat([x.unsqueeze(2), xp.unsqueeze(0).repeat((N, 1, 1))], dim=2) sorted_all_x, x_indices = torch.sort(all_x, dim=2) x_idx = torch.argmin(x_indices, dim=2) cand_start_idx = x_idx - 1 start_idx = torch.where( torch.eq(x_idx, 0), torch.tensor(1, device=x.device), torch.where( torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx, ), ) end_idx = torch.where(torch.eq(start_idx, cand_start_idx), start_idx + 2, start_idx + 1) start_x = torch.gather(sorted_all_x, dim=2, index=start_idx.unsqueeze(2)).squeeze(2) end_x = torch.gather(sorted_all_x, dim=2, index=end_idx.unsqueeze(2)).squeeze(2) start_idx2 = torch.where( torch.eq(x_idx, 0), torch.tensor(0, device=x.device), torch.where( torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx, ), ) y_positions_expanded = yp.unsqueeze(0).expand(N, -1, -1) start_y = torch.gather(y_positions_expanded, dim=2, index=start_idx2.unsqueeze(2)).squeeze(2) end_y = torch.gather(y_positions_expanded, dim=2, index=(start_idx2 + 1).unsqueeze(2)).squeeze(2) return start_y + (x - start_x) * (end_y - start_y) / (end_x - start_x) def expand_dims(v, dims): """ Expand the tensor `v` to the dim `dims`. Args: `v`: a PyTorch tensor with shape [N]. `dim`: a `int`. Returns: a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`. 
def edm_sampler(
        net, latents, class_labels=None, cfg_scale=None, randn_like=torch.randn_like,
        num_steps=18, sigma_min=0.002, sigma_max=80, rho=7,
        S_churn=0, S_min=0, S_max=float('inf'), S_noise=1, **kwargs
):
    """
    Proposed EDM sampler (Algorithm 2): an Euler step with an optional
    second-order correction at every noise level.

    Args:
        net: The denoiser. It must expose `sigma_min`, `sigma_max` and `round_sigma(...)`,
            and calling it must return a mapping whose 'x' entry is the denoised tensor.
        latents: A pytorch tensor of initial noise; it is scaled by the first noise level.
        class_labels, cfg_scale: Passed positionally to `net` on every call.
        randn_like: Callable used to draw the extra noise added at each step.
        num_steps: A `int`. Number of noise levels before the final level 0.
        sigma_min, sigma_max: Requested noise range, clamped to what `net` reports.
        rho: Exponent that shapes the spacing of the noise levels.
        S_churn, S_min, S_max, S_noise: Stochasticity settings; with `S_churn == 0`
            no extra noise is injected.
        **kwargs: Forwarded to every `net` call.
    Returns:
        A float64 pytorch tensor holding the sample at noise level 0.
    """
    # Adjust noise levels based on what's supported by the network.
    sigma_min = max(sigma_min, net.sigma_min)
    sigma_max = min(sigma_max, net.sigma_max)

    # Time step discretization.
    step_indices = torch.arange(num_steps, dtype=torch.float64, device=latents.device)
    # With a single step the interpolation fraction is 0 / 0, which would turn the whole
    # schedule (and the sample) into NaN; clamp the divisor so that case starts at sigma_max.
    interp_denominator = max(num_steps - 1, 1)
    t_steps = (sigma_max ** (1 / rho) + step_indices / interp_denominator * (
            sigma_min ** (1 / rho) - sigma_max ** (1 / rho))) ** rho
    t_steps = torch.cat([net.round_sigma(t_steps), torch.zeros_like(t_steps[:1])])  # t_N = 0

    # Main sampling loop.
    x_next = latents.to(torch.float64) * t_steps[0]
    for i, (t_cur, t_next) in tqdm(list(enumerate(zip(t_steps[:-1], t_steps[1:])))):  # 0, ..., N-1
        x_cur = x_next

        # Increase noise temporarily.
        gamma = min(S_churn / num_steps, np.sqrt(2) - 1) if S_min <= t_cur <= S_max else 0
        t_hat = net.round_sigma(t_cur + gamma * t_cur)
        x_hat = x_cur + (t_hat ** 2 - t_cur ** 2).sqrt() * S_noise * randn_like(x_cur)

        # Euler step.
        denoised = net(x_hat.float(), t_hat, class_labels, cfg_scale, **kwargs)['x'].to(torch.float64)
        d_cur = (x_hat - denoised) / t_hat
        x_next = x_hat + (t_next - t_hat) * d_cur

        # Apply 2nd order correction (skipped on the last step, where t_next == 0).
        if i < num_steps - 1:
            denoised = net(x_next.float(), t_next, class_labels, cfg_scale, **kwargs)['x'].to(torch.float64)
            d_prime = (x_next - denoised) / t_next
            x_next = x_hat + (t_next - t_hat) * (0.5 * d_cur + 0.5 * d_prime)

    return x_next
sigma_min = max(sigma_min, net.sigma_min) sigma_max = min(sigma_max, net.sigma_max) # Compute corresponding betas for VP. vp_beta_d = 2 * (np.log(sigma_min ** 2 + 1) / epsilon_s - np.log(sigma_max ** 2 + 1)) / (epsilon_s - 1) vp_beta_min = np.log(sigma_max ** 2 + 1) - 0.5 * vp_beta_d # Define time steps in terms of noise level. step_indices = torch.arange(num_steps, dtype=torch.float64, device=latents.device) if discretization == 'vp': orig_t_steps = 1 + step_indices / (num_steps - 1) * (epsilon_s - 1) sigma_steps = vp_sigma(vp_beta_d, vp_beta_min)(orig_t_steps) elif discretization == 've': orig_t_steps = (sigma_max ** 2) * ((sigma_min ** 2 / sigma_max ** 2) ** (step_indices / (num_steps - 1))) sigma_steps = ve_sigma(orig_t_steps) elif discretization == 'iddpm': u = torch.zeros(M + 1, dtype=torch.float64, device=latents.device) alpha_bar = lambda j: (0.5 * np.pi * j / M / (C_2 + 1)).sin() ** 2 for j in torch.arange(M, 0, -1, device=latents.device): # M, ..., 1 u[j - 1] = ((u[j] ** 2 + 1) / (alpha_bar(j - 1) / alpha_bar(j)).clip(min=C_1) - 1).sqrt() u_filtered = u[torch.logical_and(u >= sigma_min, u <= sigma_max)] sigma_steps = u_filtered[((len(u_filtered) - 1) / (num_steps - 1) * step_indices).round().to(torch.int64)] else: assert discretization == 'edm' sigma_steps = (sigma_max ** (1 / rho) + step_indices / (num_steps - 1) * ( sigma_min ** (1 / rho) - sigma_max ** (1 / rho))) ** rho # Define noise level schedule. if schedule == 'vp': sigma = vp_sigma(vp_beta_d, vp_beta_min) sigma_deriv = vp_sigma_deriv(vp_beta_d, vp_beta_min) sigma_inv = vp_sigma_inv(vp_beta_d, vp_beta_min) elif schedule == 've': sigma = ve_sigma sigma_deriv = ve_sigma_deriv sigma_inv = ve_sigma_inv else: assert schedule == 'linear' sigma = lambda t: t sigma_deriv = lambda t: 1 sigma_inv = lambda sigma: sigma # Define scaling schedule. 
if scaling == 'vp': s = lambda t: 1 / (1 + sigma(t) ** 2).sqrt() s_deriv = lambda t: -sigma(t) * sigma_deriv(t) * (s(t) ** 3) else: assert scaling == 'none' s = lambda t: 1 s_deriv = lambda t: 0 # Compute final time steps based on the corresponding noise levels. t_steps = sigma_inv(net.round_sigma(sigma_steps)) t_steps = torch.cat([t_steps, torch.zeros_like(t_steps[:1])]) # t_N = 0 # Main sampling loop. t_next = t_steps[0] x_next = latents.to(torch.float64) * (sigma(t_next) * s(t_next)) for i, (t_cur, t_next) in enumerate(zip(t_steps[:-1], t_steps[1:])): # 0, ..., N-1 x_cur = x_next # Increase noise temporarily. gamma = min(S_churn / num_steps, np.sqrt(2) - 1) if S_min <= sigma(t_cur) <= S_max else 0 t_hat = sigma_inv(net.round_sigma(sigma(t_cur) + gamma * sigma(t_cur))) x_hat = s(t_hat) / s(t_cur) * x_cur + (sigma(t_hat) ** 2 - sigma(t_cur) ** 2).clip(min=0).sqrt() * s( t_hat) * S_noise * randn_like(x_cur) # Euler step. h = t_next - t_hat denoised = net(x_hat.float() / s(t_hat), sigma(t_hat), class_labels, cfg_scale, feat=feat)['x'].to( torch.float64) d_cur = (sigma_deriv(t_hat) / sigma(t_hat) + s_deriv(t_hat) / s(t_hat)) * x_hat - sigma_deriv(t_hat) * s( t_hat) / sigma(t_hat) * denoised x_prime = x_hat + alpha * h * d_cur t_prime = t_hat + alpha * h # Apply 2nd order correction. 
def mean_flat(tensor):
    """
    Average `tensor` over every dimension except the first (batch) one.
    """
    rank = len(tensor.shape)
    non_batch_dims = list(range(1, rank))
    return tensor.mean(dim=non_batch_dims)
def get_beta_schedule(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):
    """
    This is the deprecated API for creating beta schedules.
    See get_named_beta_schedule() for the new library of schedules.

    :param beta_schedule: one of "quad", "linear", "warmup10", "warmup50",
                          "const" or "jsd".
    :param beta_start: first beta value (unused by "const" and "jsd").
    :param beta_end: last beta value (unused by "jsd").
    :param num_diffusion_timesteps: the number of betas to produce.
    :return: a 1-D float64 numpy array of length num_diffusion_timesteps.
    :raises NotImplementedError: for any other schedule name.
    """
    count = num_diffusion_timesteps
    # One lazy builder per schedule name, so only the requested schedule is computed.
    builders = {
        "quad": lambda: np.linspace(beta_start ** 0.5, beta_end ** 0.5, count, dtype=np.float64) ** 2,
        "linear": lambda: np.linspace(beta_start, beta_end, count, dtype=np.float64),
        "warmup10": lambda: _warmup_beta(beta_start, beta_end, count, 0.1),
        "warmup50": lambda: _warmup_beta(beta_start, beta_end, count, 0.5),
        "const": lambda: beta_end * np.ones(count, dtype=np.float64),
        # 1/T, 1/(T-1), 1/(T-2), ..., 1
        "jsd": lambda: 1.0 / np.linspace(count, 1, count, dtype=np.float64),
    }
    try:
        build = builders[beta_schedule]
    except (KeyError, TypeError):
        raise NotImplementedError(beta_schedule) from None

    betas = build()
    assert betas.shape == (count,)
    return betas
def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
    """
    Create a beta schedule that discretizes the given alpha_t_bar function,
    which defines the cumulative product of (1-beta) over time from t = [0,1].

    :param num_diffusion_timesteps: the number of betas to produce.
    :param alpha_bar: a callable that takes an argument t from 0 to 1 and
                      produces the cumulative product of (1-beta) up to that
                      part of the diffusion process.
    :param max_beta: the maximum beta to use; use values lower than 1 to
                     prevent singularities.
    :return: a numpy array with one beta per timestep.
    """
    total = num_diffusion_timesteps
    # beta_i = 1 - alpha_bar(t_{i+1}) / alpha_bar(t_i), capped at max_beta.
    return np.array([
        min(1 - alpha_bar((idx + 1) / total) / alpha_bar(idx / total), max_beta)
        for idx in range(total)
    ])
""" def __init__( self, *, betas, model_mean_type, model_var_type, loss_type, snr=False, return_startx=False, ): self.model_mean_type = model_mean_type self.model_var_type = model_var_type self.loss_type = loss_type self.snr = snr self.return_startx = return_startx # Use float64 for accuracy. betas = np.array(betas, dtype=np.float64) self.betas = betas assert len(betas.shape) == 1, "betas must be 1-D" assert (betas > 0).all() and (betas <= 1).all() self.num_timesteps = int(betas.shape[0]) alphas = 1.0 - betas self.alphas_cumprod = np.cumprod(alphas, axis=0) self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1]) self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0) assert self.alphas_cumprod_prev.shape == (self.num_timesteps,) # calculations for diffusion q(x_t | x_{t-1}) and others self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod) self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod) self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod) self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod) self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1) # calculations for posterior q(x_{t-1} | x_t, x_0) self.posterior_variance = ( betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod) ) # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain self.posterior_log_variance_clipped = np.log( np.append(self.posterior_variance[1], self.posterior_variance[1:]) ) if len(self.posterior_variance) > 1 else np.array([]) self.posterior_mean_coef1 = ( betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod) ) self.posterior_mean_coef2 = ( (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod) ) def q_mean_variance(self, x_start, t): """ Get the distribution q(x_t | x_0). :param x_start: the [N x C x ...] tensor of noiseless inputs. :param t: the number of diffusion steps (minus 1). 
Here, 0 means one step. :return: A tuple (mean, variance, log_variance), all of x_start's shape. """ mean = _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape) log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape) return mean, variance, log_variance def q_sample(self, x_start, t, noise=None): """ Diffuse the data for a given number of diffusion steps. In other words, sample from q(x_t | x_0). :param x_start: the initial data batch. :param t: the number of diffusion steps (minus 1). Here, 0 means one step. :param noise: if specified, the split-out normal noise. :return: A noisy version of x_start. """ if noise is None: noise = th.randn_like(x_start) assert noise.shape == x_start.shape return ( _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise ) def q_posterior_mean_variance(self, x_start, x_t, t): """ Compute the mean and variance of the diffusion posterior: q(x_{t-1} | x_t, x_0) """ assert x_start.shape == x_t.shape posterior_mean = ( _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t ) posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape) posterior_log_variance_clipped = _extract_into_tensor( self.posterior_log_variance_clipped, t, x_t.shape ) assert ( posterior_mean.shape[0] == posterior_variance.shape[0] == posterior_log_variance_clipped.shape[0] == x_start.shape[0] ) return posterior_mean, posterior_variance, posterior_log_variance_clipped def p_mean_variance(self, model, x, t, current, cache_dic, clip_denoised=True, denoised_fn=None, model_kwargs=None): """ Apply the model to get p(x_{t-1} | x_t), as well as a prediction of the initial x, x_0. 
:param model: the model, which takes a signal and a batch of timesteps as input. :param x: the [N x C x ...] tensor at time t. :param t: a 1-D Tensor of timesteps. :param clip_denoised: if True, clip the denoised signal into [-1, 1]. :param denoised_fn: if not None, a function which applies to the x_start prediction before it is used to sample. Applies before clip_denoised. :param model_kwargs: if not None, a dict of extra keyword arguments to pass to the model. This can be used for conditioning. :return: a dict with the following keys: - 'mean': the model mean output. - 'variance': the model variance output. - 'log_variance': the log of 'variance'. - 'pred_xstart': the prediction for x_0. """ if model_kwargs is None: model_kwargs = {} B, C = x.shape[:2] assert t.shape == (B,) model_output = model(x, t, current=current, cache_dic=cache_dic, **model_kwargs) if isinstance(model_output, tuple): model_output, extra = model_output else: extra = None if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]: assert model_output.shape == (B, C * 2, *x.shape[2:]) model_output, model_var_values = th.split(model_output, C, dim=1) min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape) max_log = _extract_into_tensor(np.log(self.betas), t, x.shape) # The model_var_values is [-1, 1] for [min_var, max_var]. frac = (model_var_values + 1) / 2 model_log_variance = frac * max_log + (1 - frac) * min_log model_variance = th.exp(model_log_variance) elif self.model_var_type in [ModelVarType.FIXED_LARGE, ModelVarType.FIXED_SMALL]: model_variance, model_log_variance = { # for fixedlarge, we set the initial (log-)variance like so # to get a better decoder log likelihood. 
ModelVarType.FIXED_LARGE: ( np.append(self.posterior_variance[1], self.betas[1:]), np.log(np.append(self.posterior_variance[1], self.betas[1:])), ), ModelVarType.FIXED_SMALL: ( self.posterior_variance, self.posterior_log_variance_clipped, ), }[self.model_var_type] model_variance = _extract_into_tensor(model_variance, t, x.shape) model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape) else: model_variance = th.zeros_like(model_output) model_log_variance = th.zeros_like(model_output) def process_xstart(x): if denoised_fn is not None: x = denoised_fn(x) return x.clamp(-1, 1) if clip_denoised else x if self.model_mean_type == ModelMeanType.START_X: pred_xstart = process_xstart(model_output) else: pred_xstart = process_xstart( self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output) ) model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t) assert model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape return { "mean": model_mean, "variance": model_variance, "log_variance": model_log_variance, "pred_xstart": pred_xstart, "extra": extra, } def _predict_xstart_from_eps(self, x_t, t, eps): assert x_t.shape == eps.shape return ( _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps ) def _predict_eps_from_xstart(self, x_t, t, pred_xstart): return ( _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None): """ Compute the mean for the previous step, given a function cond_fn that computes the gradient of a conditional log probability with respect to x. In particular, cond_fn computes grad(log(p(y|x))), and we want to condition on y. This uses the conditioning strategy from Sohl-Dickstein et al. (2015). 
""" gradient = cond_fn(x, t, **model_kwargs) return p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float() def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None): """ Compute what the p_mean_variance output would have been, should the model's score function be conditioned by cond_fn. See condition_mean() for details on cond_fn. Unlike condition_mean(), this instead uses the conditioning strategy from Song et al (2020). """ alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"]) eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, t, **model_kwargs) out = p_mean_var.copy() out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps) out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t) return out def p_sample( self, model, x, t, current=None, cache_dic=None, clip_denoised=True, denoised_fn=None, cond_fn=None, model_kwargs=None, ): """ Sample x_{t-1} from the model at the given timestep. :param model: the model to sample from. :param x: the current tensor at x_{t-1}. :param t: the value of t, starting at 0 for the first diffusion step. :param clip_denoised: if True, clip the x_start prediction to [-1, 1]. :param denoised_fn: if not None, a function which applies to the x_start prediction before it is used to sample. :param cond_fn: if not None, this is a gradient function that acts similarly to the model. :param model_kwargs: if not None, a dict of extra keyword arguments to pass to the model. This can be used for conditioning. :return: a dict containing the following keys: - 'sample': a random sample from the model. - 'pred_xstart': a prediction of x_0. 
""" out = self.p_mean_variance( model, x, t, current=current, cache_dic=cache_dic, clip_denoised=clip_denoised, denoised_fn=denoised_fn, model_kwargs=model_kwargs, ) noise = th.randn_like(x) nonzero_mask = ( (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) ) # no noise when t == 0 if cond_fn is not None: out["mean"] = self.condition_mean(cond_fn, out, x, t, model_kwargs=model_kwargs) sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise return {"sample": sample, "pred_xstart": out["pred_xstart"]} def p_sample_loop( self, model, shape, noise=None, clip_denoised=True, denoised_fn=None, cond_fn=None, model_kwargs=None, device=None, progress=False ): """ Generate samples from the model. :param model: the model module. :param shape: the shape of the samples, (N, C, H, W). :param noise: if specified, the noise from the encoder to sample. Should be of the same shape as `shape`. :param clip_denoised: if True, clip x_start predictions to [-1, 1]. :param denoised_fn: if not None, a function which applies to the x_start prediction before it is used to sample. :param cond_fn: if not None, this is a gradient function that acts similarly to the model. :param model_kwargs: if not None, a dict of extra keyword arguments to pass to the model. This can be used for conditioning. :param device: if specified, the device to create the samples on. If not specified, use a model parameter's device. :param progress: if True, show a tqdm progress bar. :return: a non-differentiable batch of samples. 
""" final = None for sample in self.p_sample_loop_progressive( model, shape, noise=noise, clip_denoised=clip_denoised, denoised_fn=denoised_fn, cond_fn=cond_fn, model_kwargs=model_kwargs, device=device, progress=progress ): final = sample return final["sample"] def p_sample_loop_progressive( self, model, shape, noise=None, clip_denoised=True, denoised_fn=None, cond_fn=None, model_kwargs=None, device=None, progress=False ): """ Generate samples from the model and yield intermediate samples from each timestep of diffusion. Arguments are the same as p_sample_loop(). Returns a generator over dicts, where each dict is the return value of p_sample(). """ if device is None: device = next(model.parameters()).device assert isinstance(shape, (tuple, list)) img = noise if noise is not None else th.randn(*shape, device=device) indices = list(range(self.num_timesteps))[::-1] if progress: # Lazy import so that we don't depend on tqdm. from tqdm.auto import tqdm indices = tqdm(indices) cache_dic, current = cache_init(model_kwargs=model_kwargs, num_steps=self.num_timesteps) for i in indices: t = th.tensor([i] * shape[0], device=device) with th.no_grad(): current['step'] = i out = self.p_sample( model, img, t, current=current, cache_dic=cache_dic, clip_denoised=clip_denoised, denoised_fn=denoised_fn, cond_fn=cond_fn, model_kwargs=model_kwargs, ) yield out img = out["sample"] def ddim_sample( self, model, x, t, clip_denoised=True, denoised_fn=None, cond_fn=None, model_kwargs=None, eta=0.0, ): """ Sample x_{t-1} from the model using DDIM. Same usage as p_sample(). """ out = self.p_mean_variance( model, x, t, clip_denoised=clip_denoised, denoised_fn=denoised_fn, model_kwargs=model_kwargs, ) if cond_fn is not None: out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs) # Usually our model outputs epsilon, but we re-derive it # in case we used x_start or x_prev prediction. 
eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"]) alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape) sigma = ( eta * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar)) * th.sqrt(1 - alpha_bar / alpha_bar_prev) ) # Equation 12. noise = th.randn_like(x) mean_pred = ( out["pred_xstart"] * th.sqrt(alpha_bar_prev) + th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps ) nonzero_mask = ( (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) ) # no noise when t == 0 sample = mean_pred + nonzero_mask * sigma * noise return {"sample": sample, "pred_xstart": out["pred_xstart"]} def ddim_reverse_sample( self, model, x, t, clip_denoised=True, denoised_fn=None, cond_fn=None, model_kwargs=None, eta=0.0, ): """ Sample x_{t+1} from the model using DDIM reverse ODE. """ assert eta == 0.0, "Reverse ODE only for deterministic path" out = self.p_mean_variance( model, x, t, clip_denoised=clip_denoised, denoised_fn=denoised_fn, model_kwargs=model_kwargs, ) if cond_fn is not None: out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs) # Usually our model outputs epsilon, but we re-derive it # in case we used x_start or x_prev prediction. eps = ( _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x - out["pred_xstart"] ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape) alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape) # Equation 12. reversed mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_next) + th.sqrt(1 - alpha_bar_next) * eps return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]} def ddim_sample_loop( self, model, shape, noise=None, clip_denoised=True, denoised_fn=None, cond_fn=None, model_kwargs=None, device=None, progress=False, eta=0.0, ): """ Generate samples from the model using DDIM. Same usage as p_sample_loop(). 
""" final = None for sample in self.ddim_sample_loop_progressive( model, shape, noise=noise, clip_denoised=clip_denoised, denoised_fn=denoised_fn, cond_fn=cond_fn, model_kwargs=model_kwargs, device=device, progress=progress, eta=eta, ): final = sample return final["sample"] def ddim_sample_loop_progressive( self, model, shape, noise=None, clip_denoised=True, denoised_fn=None, cond_fn=None, model_kwargs=None, device=None, progress=False, eta=0.0, ): """ Use DDIM to sample from the model and yield intermediate samples from each timestep of DDIM. Same usage as p_sample_loop_progressive(). """ if device is None: device = next(model.parameters()).device assert isinstance(shape, (tuple, list)) img = noise if noise is not None else th.randn(*shape, device=device) indices = list(range(self.num_timesteps))[::-1] if progress: # Lazy import so that we don't depend on tqdm. from tqdm.auto import tqdm indices = tqdm(indices) for i in indices: t = th.tensor([i] * shape[0], device=device) with th.no_grad(): out = self.ddim_sample( model, img, t, clip_denoised=clip_denoised, denoised_fn=denoised_fn, cond_fn=cond_fn, model_kwargs=model_kwargs, eta=eta, ) yield out img = out["sample"] def _vb_terms_bpd( self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None ): """ Get a term for the variational lower-bound. The resulting units are bits (rather than nats, as one might expect). This allows for comparison to other papers. :return: a dict with the following keys: - 'output': a shape [N] tensor of NLLs or KLs. - 'pred_xstart': the x_0 predictions. 
    def training_losses(self, model, x_start, timestep, model_kwargs=None, noise=None, skip_noise=False):
        """
        Compute training losses for a single timestep.

        :param model: the model to evaluate loss on.
        :param x_start: the [N x C x ...] tensor of inputs.
        :param timestep: a batch of timestep indices.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning. The keys
            'mask_ratio' and 'mask_loss_coef' are also read here.
        :param noise: if specified, the specific Gaussian noise to try to remove.
        :param skip_noise: if True, x_start is used directly as x_t and no
            noise is sampled.
        :return: a dict with the key "loss" containing a tensor of shape [N].
                 Some mean or variance settings may also have other keys.
                 When self.return_startx is set and the mean type is EPSILON,
                 the result of _extracted_from_training_losses_diffusers() is
                 returned instead of this dict.
        :raises NotImplementedError: if self.loss_type is not one of the
                 KL / MSE variants handled below.
        """
        t = timestep
        if model_kwargs is None:
            model_kwargs = {}
        if skip_noise:
            # NOTE(review): `noise` keeps whatever the caller passed (possibly
            # None) on this path, and is later used as the EPSILON target.
            x_t = x_start
        else:
            if noise is None:
                noise = th.randn_like(x_start)
            x_t = self.q_sample(x_start, t, noise=noise)

        terms = {}

        if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
            # Pure variational-bound loss.
            terms["loss"] = self._vb_terms_bpd(
                model=model,
                x_start=x_start,
                x_t=x_t,
                t=t,
                clip_denoised=False,
                model_kwargs=model_kwargs,
            )["output"]
            if self.loss_type == LossType.RESCALED_KL:
                terms["loss"] *= self.num_timesteps
        elif self.loss_type in [LossType.MSE, LossType.RESCALED_MSE]:
            model_output = model(x_t, t, **model_kwargs)
            # The model may return a dict; its 'x' entry is then the prediction.
            if isinstance(model_output, dict) and model_output.get('x', None) is not None:
                output = model_output['x']
            else:
                output = model_output

            if self.return_startx and self.model_mean_type == ModelMeanType.EPSILON:
                # Early exit: the helper's result is returned, not `terms`.
                return self._extracted_from_training_losses_diffusers(x_t, output, t)
            # self.model_var_type = ModelVarType.LEARNED_RANGE:4
            if self.model_var_type in [
                ModelVarType.LEARNED,
                ModelVarType.LEARNED_RANGE,
            ]:
                B, C = x_t.shape[:2]
                assert output.shape == (B, C * 2, *x_t.shape[2:])
                output, model_var_values = th.split(output, C, dim=1)
                # Learn the variance using the variational bound, but don't let it affect our mean prediction.
                frozen_out = th.cat([output.detach(), model_var_values], dim=1)
                # vb variational bound
                # The lambda ignores its arguments and returns frozen_out.
                terms["vb"] = self._vb_terms_bpd(
                    model=lambda *args, r=frozen_out, **kwargs: r,
                    x_start=x_start,
                    x_t=x_t,
                    t=t,
                    clip_denoised=False,
                )["output"]
                if self.loss_type == LossType.RESCALED_MSE:
                    # Divide by 1000 for equivalence with initial implementation.
                    # Without a factor of 1/1000, the VB term hurts the MSE term.
                    terms["vb"] *= self.num_timesteps / 1000.0

            # All three candidate targets are built; the mean type picks one.
            target = {
                ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(
                    x_start=x_start, x_t=x_t, t=t
                )[0],
                ModelMeanType.START_X: x_start,
                ModelMeanType.EPSILON: noise,
            }[self.model_mean_type]
            assert output.shape == target.shape == x_start.shape
            if self.snr:
                # NOTE(review): pred_noise / pred_startx are only assigned for
                # the START_X and EPSILON mean types.
                if self.model_mean_type == ModelMeanType.START_X:
                    pred_noise = self._predict_eps_from_xstart(x_t=x_t, t=t, pred_xstart=output)
                    pred_startx = output
                elif self.model_mean_type == ModelMeanType.EPSILON:
                    pred_noise = output
                    pred_startx = self._predict_xstart_from_eps(x_t=x_t, t=t, eps=output)
                # terms["mse_eps"] = mean_flat((noise - pred_noise) ** 2)
                # terms["mse_x0"] = mean_flat((x_start - pred_startx) ** 2)

                # `t` is rebound here to a tensor expanded to pred_startx's shape.
                t = t[:, None, None, None].expand(pred_startx.shape)  # [128, 4, 32, 32]
                # best
                # Elements with t > 249 compare noise; all others compare x_0.
                target = th.where(t > 249, noise, x_start)
                output = th.where(t > 249, pred_noise, pred_startx)
            loss = (target - output) ** 2
            if model_kwargs.get('mask_ratio', False) and model_kwargs['mask_ratio'] > 0:
                assert 'mask' in model_output
                # NOTE(review): assumes `model.model.module` exposes patch_size
                # and that the pooled, flattened loss lines up with
                # model_output['mask'] -- confirm against the model wrapper.
                loss = F.avg_pool2d(loss.mean(dim=1), model.model.module.patch_size).flatten(1)
                mask = model_output['mask']
                unmask = 1 - mask
                terms['mse'] = mean_flat(loss * unmask) * unmask.shape[1]/unmask.sum(1)
                if model_kwargs['mask_loss_coef'] > 0:
                    terms['mae'] = model_kwargs['mask_loss_coef'] * mean_flat(loss * mask) * mask.shape[1]/mask.sum(1)
            else:
                terms["mse"] = mean_flat(loss)
            # Total loss = mse (+ vb when variance is learned) (+ mae when set).
            terms["loss"] = terms["mse"] + terms["vb"] if "vb" in terms else terms["mse"]
            if "mae" in terms:
                terms["loss"] = terms["loss"] + terms["mae"]
        else:
            raise NotImplementedError(self.loss_type)

        return terms
This can be used for conditioning. :param noise: if specified, the specific Gaussian noise to try to remove. :return: a dict with the key "loss" containing a tensor of shape [N]. Some mean or variance settings may also have other keys. """ t = timestep if model_kwargs is None: model_kwargs = {} if skip_noise: x_t = x_start else: if noise is None: noise = th.randn_like(x_start) x_t = self.q_sample(x_start, t, noise=noise) terms = {} if self.loss_type in [LossType.KL, LossType.RESCALED_KL]: terms["loss"] = self._vb_terms_bpd( model=model, x_start=x_start, x_t=x_t, t=t, clip_denoised=False, model_kwargs=model_kwargs, )["output"] if self.loss_type == LossType.RESCALED_KL: terms["loss"] *= self.num_timesteps elif self.loss_type in [LossType.MSE, LossType.RESCALED_MSE]: output = model(x_t, timestep=t, **model_kwargs, return_dict=False)[0] if self.return_startx and self.model_mean_type == ModelMeanType.EPSILON: return self._extracted_from_training_losses_diffusers(x_t, output, t) if self.model_var_type in [ ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE, ]: B, C = x_t.shape[:2] assert output.shape == (B, C * 2, *x_t.shape[2:]) output, model_var_values = th.split(output, C, dim=1) # Learn the variance using the variational bound, but don't let it affect our mean prediction. frozen_out = th.cat([output.detach(), model_var_values], dim=1) terms["vb"] = self._vb_terms_bpd( model=lambda *args, r=frozen_out, **kwargs: r, x_start=x_start, x_t=x_t, t=t, clip_denoised=False, )["output"] if self.loss_type == LossType.RESCALED_MSE: # Divide by 1000 for equivalence with initial implementation. # Without a factor of 1/1000, the VB term hurts the MSE term. 
terms["vb"] *= self.num_timesteps / 1000.0 target = { ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance( x_start=x_start, x_t=x_t, t=t )[0], ModelMeanType.START_X: x_start, ModelMeanType.EPSILON: noise, }[self.model_mean_type] assert output.shape == target.shape == x_start.shape if self.snr: if self.model_mean_type == ModelMeanType.START_X: pred_noise = self._predict_eps_from_xstart(x_t=x_t, t=t, pred_xstart=output) pred_startx = output elif self.model_mean_type == ModelMeanType.EPSILON: pred_noise = output pred_startx = self._predict_xstart_from_eps(x_t=x_t, t=t, eps=output) # terms["mse_eps"] = mean_flat((noise - pred_noise) ** 2) # terms["mse_x0"] = mean_flat((x_start - pred_startx) ** 2) t = t[:, None, None, None].expand(pred_startx.shape) # [128, 4, 32, 32] # best target = th.where(t > 249, noise, x_start) output = th.where(t > 249, pred_noise, pred_startx) loss = (target - output) ** 2 terms["mse"] = mean_flat(loss) terms["loss"] = terms["mse"] + terms["vb"] if "vb" in terms else terms["mse"] if "mae" in terms: terms["loss"] = terms["loss"] + terms["mae"] else: raise NotImplementedError(self.loss_type) return terms def _extracted_from_training_losses_diffusers(self, x_t, output, t): B, C = x_t.shape[:2] assert output.shape == (B, C * 2, *x_t.shape[2:]) output = th.split(output, C, dim=1)[0] return output, self._predict_xstart_from_eps(x_t=x_t, t=t, eps=output), x_t def _prior_bpd(self, x_start): """ Get the prior KL term for the variational lower-bound, measured in bits-per-dim. This term can't be optimized, as it only depends on the encoder. :param x_start: the [N x C x ...] tensor of inputs. :return: a batch of [N] KL values (in bits), one per batch element. 
""" batch_size = x_start.shape[0] t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device) qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t) kl_prior = normal_kl( mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0 ) return mean_flat(kl_prior) / np.log(2.0) def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None): """ Compute the entire variational lower-bound, measured in bits-per-dim, as well as other related quantities. :param model: the model to evaluate loss on. :param x_start: the [N x C x ...] tensor of inputs. :param clip_denoised: if True, clip denoised samples. :param model_kwargs: if not None, a dict of extra keyword arguments to pass to the model. This can be used for conditioning. :return: a dict containing the following keys: - total_bpd: the total variational lower-bound, per batch element. - prior_bpd: the prior term in the lower-bound. - vb: an [N x T] tensor of terms in the lower-bound. - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep. - mse: an [N x T] tensor of epsilon MSEs for each timestep. 
def _extract_into_tensor(arr, timesteps, broadcast_shape):
    """
    Look up per-timestep values in a 1-D numpy array and broadcast them.

    :param arr: the 1-D numpy array.
    :param timesteps: a tensor of indices into the array to extract.
    :param broadcast_shape: a larger shape of K dimensions with the batch
                            dimension equal to the length of timesteps.
    :return: a float tensor of shape broadcast_shape on timesteps' device.
    """
    device = timesteps.device
    gathered = th.from_numpy(arr).to(device=device)[timesteps].float()
    # Append trailing singleton axes until the rank matches broadcast_shape.
    missing_dims = len(broadcast_shape) - len(gathered.shape)
    if missing_dims > 0:
        gathered = gathered[(...,) + (None,) * missing_dims]
    return gathered + th.zeros(broadcast_shape, device=device)
class DoubleConvBlock(nn.Module):
    """
    A stack of 3x3 convolutions, each followed by ReLU, plus a 1x1 projection
    of the final feature map to a single channel.
    """

    def __init__(self, input_channel, output_channel, layer_number):
        """
        :param input_channel: channels of the incoming tensor.
        :param output_channel: channels produced by every 3x3 convolution.
        :param layer_number: total number of 3x3 convolutions; at least one
                             is always created.
        """
        super().__init__()
        conv_args = dict(kernel_size=(3, 3), stride=(1, 1), padding=1)
        # The first conv maps input -> output channels, the rest keep them.
        in_channels = [input_channel] + [output_channel] * (layer_number - 1)
        self.convs = torch.nn.Sequential(
            *[
                torch.nn.Conv2d(in_channels=channels, out_channels=output_channel, **conv_args)
                for channels in in_channels
            ]
        )
        self.projection = torch.nn.Conv2d(
            in_channels=output_channel,
            out_channels=1,
            kernel_size=(1, 1),
            stride=(1, 1),
            padding=0,
        )

    def forward(self, x, down_sampling=False):
        """
        :param x: input feature map.
        :param down_sampling: if True, apply 2x2 max pooling with stride 2
                              before the convolutions.
        :return: a tuple (features, projection) where projection is the 1x1
                 convolution of features.
        """
        if down_sampling:
            features = torch.nn.functional.max_pool2d(x, kernel_size=(2, 2), stride=(2, 2))
        else:
            features = x
        for layer in self.convs:
            features = torch.nn.functional.relu(layer(features))
        return features, self.projection(features)
class InternData(Dataset):
    """
    Dataset over the entries listed in
    'data/InternData/partition/data_info.json'; each item is a transformed
    image tensor together with its relative path.
    """

    def __init__(self):
        with open('data/InternData/partition/data_info.json', 'r') as f:
            self.j = json.load(f)
        # Convert to RGB, resize and center-crop to image_resize, to tensor.
        to_rgb = T.Lambda(lambda img: img.convert('RGB'))
        self.transform = T.Compose([
            to_rgb,
            T.Resize(image_resize),  # Image.BICUBIC
            T.CenterCrop(image_resize),
            T.ToTensor(),
        ])

    def __len__(self):
        return len(self.j)

    def getdata(self, idx):
        """Load and transform entry idx; returns (image, path)."""
        path = self.j[idx]['path']
        image = self.transform(Image.open("data/InternImgs/" + path))
        return image, path

    def __getitem__(self, idx):
        """
        Return getdata(idx); on any exception print it and retry with a
        random index, giving up after 20 attempts.

        :raises RuntimeError: if all 20 attempts fail.
        """
        for _attempt in range(20):
            try:
                return self.getdata(idx)
            except Exception as e:
                print(f"Error details: {str(e)}")
                idx = np.random.randint(len(self))
        raise RuntimeError('Too many bad data.')
def main():
    """
    Run the HED detector over every batch of InternData and store each
    per-image result as a compressed .npz file under
    'data/InternalData/hed_feature_<image_resize>/'. Results whose target
    file already exists are not written again.
    """
    dataset = InternData()
    loader = DataLoader(dataset, batch_size=10, shuffle=False, num_workers=8, pin_memory=True)
    detector = HEDdetector()

    accelerator = Accelerator()
    detector, loader = accelerator.prepare(detector, loader)

    for images, rel_paths in tqdm(loader):
        features = detector(images.cuda())
        for rel_path, feature in zip(rel_paths, features):
            target = f'data/InternalData/hed_feature_{image_resize}/' + rel_path.replace('.png', '.npz')
            if not os.path.exists(target):
                os.makedirs(os.path.dirname(target), exist_ok=True)
                np.savez_compressed(target, feature)
# Special tokens registered with the tokenizer for image inputs.
# They must be distinct, non-empty strings: initialize_vision_tokenizer()
# passes them to tokenizer.add_tokens() and looks their ids up again with
# tokenizer.convert_tokens_to_ids(). The values below are the ones used by the
# upstream LLaVA implementation this file derives from.
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
    def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, images=None):
        """Embed ``input_ids``, splice projected image features in, run the MPT stack.

        Token embeddings come from ``self.wte``. The image path is taken only
        when a vision tower is attached, ``images`` is given, and either the
        sequence is longer than one token or the module is in training mode.
        In that case the vision tower runs under ``torch.no_grad()``, its
        selected hidden state is projected with ``self.mm_projector``, and the
        result replaces the image placeholder positions in the embeddings.
        The final embeddings are passed to the parent ``forward`` as
        ``tok_emb`` with ``input_ids=None``.

        Args:
            input_ids: token ids; iterated per sample along dim 0.
            images: either a list of per-sample image tensors (each is
                ``unsqueeze(0)``-ed before the vision tower) or a single
                batched tensor.
            Remaining arguments are forwarded unchanged to the parent
            ``forward``.

        Raises:
            ValueError: if start/end image token counts differ, if the end
                token is not at ``start + num_patches + 1``, if the number of
                patch tokens differs from the number of patches, or if the
                patch tokens are not consecutive.
        """
        # HACK: replace back original embeddings for LLaVA pretraining
        orig_embeds_params = getattr(self, 'orig_embeds_params', None)
        # if orig_embeds_params is not None:
        #     orig_embeds_params = orig_embeds_params[0]
        #     with torch.no_grad():
        #         self.get_input_embeddings().weight.data[:-2] = orig_embeds_params[:-2].data

        inputs_embeds = self.wte(input_ids)

        vision_tower = getattr(self, 'vision_tower', None)
        if vision_tower is not None and (input_ids.shape[1] != 1 or self.training) and images is not None:
            # TODO: this is a modified multimodal LLM -- Haotian Liu
            vision_tower = vision_tower[0]  # HACK: for FSDP
            with torch.no_grad():
                if type(images) is list:
                    # variable length images
                    image_features = []
                    for image in images:
                        image_forward_out = vision_tower(image.unsqueeze(0), output_hidden_states=True)
                        select_hidden_state_layer = getattr(self.config, "mm_vision_select_layer", -1)
                        select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer]
                        # Drops index 0 along dim 1 (presumably the CLS token -- confirm).
                        image_feature = select_hidden_state[:, 1:]
                        image_features.append(image_feature)
                else:
                    image_forward_outs = vision_tower(images, output_hidden_states=True)
                    select_hidden_state_layer = getattr(self.config, "mm_vision_select_layer", -1)
                    select_hidden_state = image_forward_outs.hidden_states[select_hidden_state_layer]
                    image_features = select_hidden_state[:, 1:]
            if type(images) is list:
                # Each list entry carries a leading batch dim of 1; [0] removes it.
                image_features = [self.mm_projector(image_feature)[0] for image_feature in image_features]
            else:
                image_features = self.mm_projector(image_features)
            # NOTE(review): the (256, 1024) shape is hard-coded; it has to match
            # mm_projector's input width -- confirm for other vision towers.
            dummy_image_features = torch.zeros(256, 1024, device=inputs_embeds.device, dtype=inputs_embeds.dtype)
            dummy_image_features = self.mm_projector(dummy_image_features)

            new_input_embeds = []
            cur_image_idx = 0
            for cur_input_ids, cur_input_embeds in zip(input_ids, inputs_embeds):
                if (cur_input_ids == vision_tower.config.im_patch_token).sum() == 0:
                    # multimodal LLM, but the current sample is not multimodal
                    # Adds an all-zero term that still depends on mm_projector
                    # (presumably to keep the projector in the autograd graph).
                    cur_input_embeds = cur_input_embeds + (0. * dummy_image_features).sum()
                    new_input_embeds.append(cur_input_embeds)
                    continue
                cur_image_features = image_features[cur_image_idx]
                num_patches = cur_image_features.shape[0]
                if vision_tower.config.use_im_start_end:
                    if (cur_input_ids == vision_tower.config.im_start_token).sum() != (cur_input_ids == vision_tower.config.im_end_token).sum():
                        raise ValueError("The number of image start tokens and image end tokens should be the same.")
                    image_start_tokens = torch.where(cur_input_ids == vision_tower.config.im_start_token)[0]
                    # NOTE(review): every iteration rebuilds cur_new_input_embeds
                    # from the untouched cur_input_embeds, so with several start
                    # tokens only the last splice is kept -- confirm intent.
                    for image_start_token_pos in image_start_tokens:
                        cur_image_features = image_features[cur_image_idx].to(device=cur_input_embeds.device)
                        num_patches = cur_image_features.shape[0]
                        if cur_input_ids[image_start_token_pos + num_patches + 1] != vision_tower.config.im_end_token:
                            raise ValueError("The image end token should follow the image start token.")
                        if orig_embeds_params is not None:
                            # Text embeddings outside the start/end tokens are detached;
                            # the start and end token embeddings themselves are not.
                            cur_new_input_embeds = torch.cat((cur_input_embeds[:image_start_token_pos].detach(), cur_input_embeds[image_start_token_pos:image_start_token_pos+1], cur_image_features, cur_input_embeds[image_start_token_pos + num_patches + 1:image_start_token_pos + num_patches + 2], cur_input_embeds[image_start_token_pos + num_patches + 2:].detach()), dim=0)
                        else:
                            cur_new_input_embeds = torch.cat((cur_input_embeds[:image_start_token_pos+1], cur_image_features, cur_input_embeds[image_start_token_pos + num_patches + 1:]), dim=0)
                        cur_image_idx += 1
                else:
                    if (cur_input_ids == vision_tower.config.im_patch_token).sum() != num_patches:
                        raise ValueError("The number of image patch tokens should be the same as the number of image patches.")
                    masked_indices = torch.where(cur_input_ids == vision_tower.config.im_patch_token)[0]
                    mask_index_start = masked_indices[0]
                    # The patch positions must form one contiguous run starting at mask_index_start.
                    if (masked_indices != torch.arange(mask_index_start, mask_index_start+num_patches, device=masked_indices.device, dtype=masked_indices.dtype)).any():
                        raise ValueError("The image patch tokens should be consecutive.")
                    if orig_embeds_params is not None:
                        cur_new_input_embeds = torch.cat((cur_input_embeds[:mask_index_start].detach(), cur_image_features, cur_input_embeds[mask_index_start+num_patches:].detach()), dim=0)
                    else:
                        cur_new_input_embeds = torch.cat((cur_input_embeds[:mask_index_start], cur_image_features, cur_input_embeds[mask_index_start+num_patches:]), dim=0)
                new_input_embeds.append(cur_new_input_embeds)
            inputs_embeds = torch.stack(new_input_embeds, dim=0)

        return super(LlavaMPTModel, self).forward(input_ids=None, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache, tok_emb=inputs_embeds)
math.sqrt(config.d_model) else: raise ValueError(f"logit_scale={logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.") self.logit_scale = logit_scale def get_model(self): return self.transformer def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, LlavaMPTModel): module.gradient_checkpointing = value def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, labels: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, images=None): return_dict = return_dict if return_dict is not None else self.config.return_dict use_cache = use_cache if use_cache is not None else self.config.use_cache outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache, images=images) logits = F.linear(outputs.last_hidden_state, self.transformer.wte.weight) if self.logit_scale is not None: if self.logit_scale == 0: warnings.warn(f'Multiplying logits by self.logit_scale={self.logit_scale!r}. 
This will produce uniform (uninformative) outputs.') logits *= self.logit_scale loss = None if labels is not None: labels = torch.roll(labels, shifts=-1) labels[:, -1] = -100 loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1)) return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states) def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): if inputs_embeds is not None: raise NotImplementedError('inputs_embeds is not implemented for MPT yet') attention_mask = kwargs['attention_mask'].bool() if attention_mask[:, -1].sum() != attention_mask.shape[0]: raise NotImplementedError('MPT does not support generation with right padding.') if self.transformer.attn_uses_sequence_id and self.training: sequence_id = torch.zeros_like(input_ids[:1]) else: sequence_id = None if past_key_values is not None: input_ids = input_ids[:, -1].unsqueeze(-1) if self.transformer.prefix_lm: prefix_mask = torch.ones_like(attention_mask) if kwargs.get('use_cache') == False: raise NotImplementedError('MPT with prefix_lm=True does not support use_cache=False.') else: prefix_mask = None return {'input_ids': input_ids, 'attention_mask': attention_mask, 'prefix_mask': prefix_mask, 'sequence_id': sequence_id, 'past_key_values': past_key_values, 'use_cache': kwargs.get('use_cache', True), "images": kwargs.get("images", None)} def initialize_vision_tokenizer(self, mm_use_im_start_end, tokenizer, device, tune_mm_mlp_adapter=False, pretrain_mm_mlp_adapter=None): vision_config = self.get_model().vision_tower[0].config vision_config.use_im_start_end = mm_use_im_start_end tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True) self.resize_token_embeddings(len(tokenizer)) if mm_use_im_start_end: num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True) 
self.resize_token_embeddings(len(tokenizer)) vision_config.im_start_token, vision_config.im_end_token = tokenizer.convert_tokens_to_ids([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN]) if num_new_tokens > 0: input_embeddings = ( self._extracted_from_initialize_vision_tokenizer_14( num_new_tokens ) ) if tune_mm_mlp_adapter: self.get_model().orig_embeds_params = [self.get_input_embeddings().weight.data.clone().to(device=device)] for p in self.get_input_embeddings().parameters(): p.requires_grad = True for p in self.get_output_embeddings().parameters(): p.requires_grad = False if pretrain_mm_mlp_adapter: mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu') embed_tokens_weight = mm_projector_weights['transformer.wte.weight'] assert num_new_tokens == 2 if input_embeddings.shape == embed_tokens_weight.shape: input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:] elif embed_tokens_weight.shape[0] == num_new_tokens: input_embeddings[-num_new_tokens:] = embed_tokens_weight else: raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. 
Numer of new tokens: {num_new_tokens}.") vision_config.im_patch_token = tokenizer.convert_tokens_to_ids([DEFAULT_IMAGE_PATCH_TOKEN])[0] # TODO Rename this here and in `initialize_vision_tokenizer` def _extracted_from_initialize_vision_tokenizer_14(self, num_new_tokens): result = self.get_input_embeddings().weight.data output_embeddings = self.get_output_embeddings().weight.data input_embeddings_avg = result[:-num_new_tokens].mean(dim=0, keepdim=True) output_embeddings_avg = output_embeddings[:-num_new_tokens].mean( dim=0, keepdim=True) result[-num_new_tokens:] = input_embeddings_avg output_embeddings[-num_new_tokens:] = output_embeddings_avg return result AutoConfig.register("llava_mpt", LlavaMPTConfig) AutoModelForCausalLM.register(LlavaMPTConfig, LlavaMPTForCausalLM) ================================================ FILE: PixArt-alpha-ToCa/diffusion/model/llava/mpt/attention.py ================================================ """Attention layers.""" import math import warnings from typing import Optional import torch import torch.nn as nn from einops import rearrange from torch import nn from .norm import LPLayerNorm def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool): if original_is_causal and num_query_tokens != num_key_tokens: if num_query_tokens != 1: raise NotImplementedError('MPT does not support query and key with different number of tokens, unless number of query tokens is 1.') else: return False return original_is_causal def scaled_multihead_dot_product_attention(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False): q = rearrange(query, 'b s (h d) -> b h s d', h=n_heads) k = rearrange(key, 'b s (h d) -> b h d s', h=1 if multiquery else n_heads) v = rearrange(value, 'b s (h d) -> b h s d', h=1 if multiquery else n_heads) min_val = torch.finfo(q.dtype).min (b, _, s_q, d) = q.shape s_k = k.size(-1) if 
def check_valid_inputs(*tensors, valid_dtypes=None):
    """Validate that every tensor has an accepted dtype and is a CUDA tensor.

    Args:
        *tensors: objects exposing ``dtype`` and ``is_cuda``.
        valid_dtypes: accepted dtypes; defaults to
            ``[torch.float16, torch.bfloat16]`` when ``None``.

    Raises:
        TypeError: on the first tensor whose dtype is not accepted, or that
            is not a CUDA tensor (dtype is checked first).
    """
    accepted = [torch.float16, torch.bfloat16] if valid_dtypes is None else valid_dtypes
    for candidate in tensors:
        if candidate.dtype not in accepted:
            raise TypeError(f'tensor.dtype={candidate.dtype!r} must be in valid_dtypes={accepted!r}.')
        if candidate.is_cuda:
            continue
        raise TypeError(f'Inputs must be cuda tensors (tensor.is_cuda={candidate.is_cuda!r}).')
def triton_flash_attn_fn(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
    """Compute attention with the Triton flash-attention kernel.

    ``query``/``key``/``value`` are reshaped from ``b s (h d)`` to
    ``b s h d``; with ``multiquery=True`` key and value carry a single head
    that is expanded to ``n_heads``. A ``key_padding_mask`` is folded into
    ``attn_bias`` by filling masked key positions with the dtype's minimum
    value.

    Args:
        training: accepted for signature parity with the other attention
            functions; not used here.

    Returns:
        ``(output, None)`` with ``output`` flattened back to ``b s (h d)``.

    Raises:
        RuntimeError: if ``flash_attn.flash_attn_triton`` cannot be imported.
        NotImplementedError: if ``dropout_p`` is non-zero or
            ``needs_weights`` is requested.
    """
    try:
        from flash_attn import flash_attn_triton
    except Exception as err:
        # Previously a bare ``except:``, which also intercepted
        # KeyboardInterrupt/SystemExit and hid the underlying import failure.
        raise RuntimeError('Please install flash-attn==1.0.3.post0 and triton==2.0.0.dev20221202') from err
    check_valid_inputs(query, key, value)
    if dropout_p:
        raise NotImplementedError('Dropout not implemented for attn_impl: triton.')
    if needs_weights:
        raise NotImplementedError('attn_impl: triton cannot return attn weights.')
    if key_padding_mask is not None:
        warnings.warn('Propagating key_padding_mask to the attention module ' + 'and applying it within the attention module can cause ' + 'unnecessary computation/memory usage. Consider integrating ' + 'into attn_bias once and passing that to each attention ' + 'module instead.')
        (b_size, s_k) = key_padding_mask.shape[:2]
        if attn_bias is None:
            attn_bias = query.new_zeros(b_size, 1, 1, s_k)
        # Masked (False) key positions receive the most negative representable value.
        attn_bias = attn_bias.masked_fill(~key_padding_mask.view((b_size, 1, 1, s_k)), torch.finfo(query.dtype).min)
    query = rearrange(query, 'b s (h d) -> b s h d', h=n_heads)
    key = rearrange(key, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads)
    value = rearrange(value, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads)
    if multiquery:
        # Broadcast the single shared key/value head across all query heads.
        key = key.expand(*key.shape[:2], n_heads, key.size(-1))
        value = value.expand(*value.shape[:2], n_heads, value.size(-1))
    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
    attn_output = flash_attn_triton.flash_attn_func(query, key, value, attn_bias, reset_is_causal, softmax_scale)
    output = attn_output.view(*attn_output.shape[:2], -1)
    return (output, None)
""" def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, low_precision_layernorm: bool=False, device: Optional[str]=None): super().__init__() self.attn_impl = attn_impl self.clip_qkv = clip_qkv self.qk_ln = qk_ln self.d_model = d_model self.n_heads = n_heads self.softmax_scale = softmax_scale if self.softmax_scale is None: self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads) self.attn_dropout_p = attn_pdrop self.Wqkv = nn.Linear(self.d_model, 3 * self.d_model, device=device) fuse_splits = (d_model, 2 * d_model) self.Wqkv._fused = (0, fuse_splits) if self.qk_ln: layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm self.q_ln = layernorm_class(self.d_model, device=device) self.k_ln = layernorm_class(self.d_model, device=device) if self.attn_impl == 'flash': self.attn_fn = flash_attn_fn elif self.attn_impl == 'triton': self.attn_fn = triton_flash_attn_fn warnings.warn('While `attn_impl: triton` can be faster than `attn_impl: flash` ' + 'it uses more memory. When training larger models this can trigger ' + 'alloc retries which hurts performance. If encountered, we recommend ' + 'using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`.') elif self.attn_impl == 'torch': self.attn_fn = scaled_multihead_dot_product_attention if torch.cuda.is_available(): warnings.warn('Using `attn_impl: torch`. 
If your model does not use `alibi` or ' + '`prefix_lm` we recommend using `attn_impl: flash` otherwise ' + 'we recommend using `attn_impl: triton`.') else: raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.') self.out_proj = nn.Linear(self.d_model, self.d_model, device=device) self.out_proj._is_residual = True def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False): qkv = self.Wqkv(x) if self.clip_qkv: qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv) (query, key, value) = qkv.chunk(3, dim=2) key_padding_mask = attention_mask if self.qk_ln: dtype = query.dtype query = self.q_ln(query).to(dtype) key = self.k_ln(key).to(dtype) if past_key_value is not None: if len(past_key_value) != 0: key = torch.cat([past_key_value[0], key], dim=1) value = torch.cat([past_key_value[1], value], dim=1) past_key_value = (key, value) if attn_bias is not None: attn_bias = attn_bias[:, :, -query.size(1):, -key.size(1):] (context, attn_weights) = self.attn_fn(query, key, value, self.n_heads, softmax_scale=self.softmax_scale, attn_bias=attn_bias, key_padding_mask=key_padding_mask, is_causal=is_causal, dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights) return (self.out_proj(context), attn_weights, past_key_value) class MultiQueryAttention(nn.Module): """Multi-Query self attention. Using torch or triton attention implemetation enables user to also use additive bias. 
""" def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, low_precision_layernorm: bool=False, device: Optional[str]=None): super().__init__() self.attn_impl = attn_impl self.clip_qkv = clip_qkv self.qk_ln = qk_ln self.d_model = d_model self.n_heads = n_heads self.head_dim = d_model // n_heads self.softmax_scale = softmax_scale if self.softmax_scale is None: self.softmax_scale = 1 / math.sqrt(self.head_dim) self.attn_dropout_p = attn_pdrop self.Wqkv = nn.Linear(d_model, d_model + 2 * self.head_dim, device=device) fuse_splits = (d_model, d_model + self.head_dim) self.Wqkv._fused = (0, fuse_splits) if self.qk_ln: layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm self.q_ln = layernorm_class(d_model, device=device) self.k_ln = layernorm_class(self.head_dim, device=device) if self.attn_impl == 'flash': self.attn_fn = flash_attn_fn elif self.attn_impl == 'triton': self.attn_fn = triton_flash_attn_fn warnings.warn('While `attn_impl: triton` can be faster than `attn_impl: flash` ' + 'it uses more memory. When training larger models this can trigger ' + 'alloc retries which hurts performance. If encountered, we recommend ' + 'using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`.') elif self.attn_impl == 'torch': self.attn_fn = scaled_multihead_dot_product_attention if torch.cuda.is_available(): warnings.warn('Using `attn_impl: torch`. 
def attn_bias_shape(attn_impl, n_heads, seq_len, alibi, prefix_lm, causal, use_sequence_id):
    """Return the shape of the attention bias buffer, or ``None`` if unused.

    ``'flash'`` never uses a bias. For ``'torch'`` and ``'triton'``:
    with ALiBi the bias is per-head, with a full ``seq_len`` row dimension
    when ``prefix_lm``, ``use_sequence_id`` or non-causal attention is in
    effect and a single row otherwise; without ALiBi a head-shared
    ``(1, 1, seq_len, seq_len)`` bias is needed only for ``prefix_lm`` or
    ``use_sequence_id``.

    Raises:
        ValueError: for any other ``attn_impl``.
    """
    if attn_impl == 'flash':
        return None
    if attn_impl not in ['torch', 'triton']:
        raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')

    wants_square_mask = prefix_lm or use_sequence_id
    if alibi:
        rows = seq_len if (wants_square_mask or not causal) else 1
        return (1, n_heads, rows, seq_len)
    if wants_square_mask:
        return (1, 1, seq_len, seq_len)
    return None
def gen_slopes(n_heads, alibi_bias_max=8, device=None):
    """Generate per-head ALiBi slopes shaped ``(1, n_heads, 1, 1)``.

    Slopes are ``1 / 2**m`` for ``m`` evenly spaced up to ``alibi_bias_max``
    over the next power of two >= ``n_heads``. When ``n_heads`` is not a
    power of two, the odd-indexed slopes are taken first, then the
    even-indexed ones, and the result is truncated to ``n_heads`` entries.
    """
    padded_heads = 2 ** math.ceil(math.log2(n_heads))
    exponents = torch.arange(1, padded_heads + 1, dtype=torch.float32, device=device)
    exponents = exponents.mul(alibi_bias_max / padded_heads)
    slopes = 1.0 / torch.pow(2, exponents)
    if padded_heads != n_heads:
        interleaved = torch.concat([slopes[1::2], slopes[::2]])
        slopes = interleaved[:n_heads]
    return slopes.view(1, n_heads, 1, 1)
class MPTBlock(nn.Module):
    """Pre-norm transformer block: attention and MLP, each with a residual add.

    The normalisation and attention classes are looked up in
    ``NORM_CLASS_REGISTRY`` (by lower-cased ``norm_type``) and
    ``ATTN_CLASS_REGISTRY`` (by ``attn_config['attn_type']``).
    """

    def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, attn_config: Dict = None, resid_pdrop: float=0.0, norm_type: str='low_precision_layernorm', device: Optional[str]=None, **kwargs):
        """Build the block's sub-modules.

        Args:
            attn_config: attention settings; a default multi-head, triton
                configuration is used when ``None``.
            resid_pdrop: dropout probability applied to both residual branches.
            **kwargs: accepted and discarded.
        """
        if attn_config is None:
            attn_config = dict(
                attn_type='multihead_attention',
                attn_pdrop=0.0,
                attn_impl='triton',
                qk_ln=False,
                clip_qkv=None,
                softmax_scale=None,
                prefix_lm=False,
                attn_uses_sequence_id=False,
                alibi=False,
                alibi_bias_max=8,
            )
        del kwargs
        super().__init__()

        make_norm = NORM_CLASS_REGISTRY[norm_type.lower()]
        make_attn = ATTN_CLASS_REGISTRY[attn_config['attn_type']]

        # Sub-modules are created in the same order as before so parameter
        # registration order is unchanged.
        self.norm_1 = make_norm(d_model, device=device)
        self.attn = make_attn(
            d_model=d_model,
            n_heads=n_heads,
            attn_impl=attn_config['attn_impl'],
            clip_qkv=attn_config['clip_qkv'],
            qk_ln=attn_config['qk_ln'],
            softmax_scale=attn_config['softmax_scale'],
            attn_pdrop=attn_config['attn_pdrop'],
            device=device,
        )
        self.norm_2 = make_norm(d_model, device=device)
        self.ffn = MPTMLP(d_model=d_model, expansion_ratio=expansion_ratio, device=device)
        self.resid_attn_dropout = nn.Dropout(resid_pdrop)
        self.resid_ffn_dropout = nn.Dropout(resid_pdrop)

    def forward(self, x: torch.Tensor, past_key_value: Optional[Tuple[torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, attention_mask: Optional[torch.ByteTensor]=None, is_causal: bool=True) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]:
        """Apply attention then the MLP, each on a normalised input with a residual add.

        Returns:
            ``(hidden_states, past_key_value)`` where ``past_key_value`` is
            whatever the attention module returned as its third output.
        """
        attn_out, _, past_key_value = self.attn(
            self.norm_1(x),
            past_key_value=past_key_value,
            attn_bias=attn_bias,
            attention_mask=attention_mask,
            is_causal=is_causal,
        )
        hidden = x + self.resid_attn_dropout(attn_out)

        ffn_out = self.ffn(self.norm_2(hidden))
        hidden = hidden + self.resid_ffn_dropout(ffn_out)
        return (hidden, past_key_value)
# Defaults merged into user-supplied ``attn_config`` / ``init_config`` dicts.
# They are treated as read-only templates: MPTConfig copies them before use.
attn_config_defaults: Dict = {
    'attn_type': 'multihead_attention',
    'attn_pdrop': 0.0,
    'attn_impl': 'triton',
    'qk_ln': False,
    'clip_qkv': None,
    'softmax_scale': None,
    'prefix_lm': False,
    'attn_uses_sequence_id': False,
    'alibi': False,
    'alibi_bias_max': 8,
}
init_config_defaults: Dict = {
    'name': 'kaiming_normal_',
    'fan_mode': 'fan_in',
    'init_nonlinearity': 'relu',
}


class MPTConfig(PretrainedConfig):
    """HuggingFace-style configuration object for MPT models."""

    model_type = 'mpt'

    def __init__(self, d_model: int=2048, n_heads: int=16, n_layers: int=24, expansion_ratio: int=4, max_seq_len: int=2048, vocab_size: int=50368, resid_pdrop: float=0.0, emb_pdrop: float=0.0, learned_pos_emb: bool=True, attn_config: Dict=attn_config_defaults, init_device: str='cpu', logit_scale: Optional[Union[float, str]]=None, no_bias: bool=False, verbose: int=0, embedding_fraction: float=1.0, norm_type: str='low_precision_layernorm', use_cache: bool=False, init_config: Dict=init_config_defaults, **kwargs):
        """The MPT configuration class.

        Args:
            d_model (int): The size of the embedding dimension of the model.
            n_heads (int): The number of attention heads.
            n_layers (int): The number of layers in the model.
            expansion_ratio (int): The ratio of the up/down scale in the MLP.
            max_seq_len (int): The maximum sequence length of the model.
            vocab_size (int): The size of the vocabulary.
            resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
            emb_pdrop (float): The dropout probability for the embedding layer.
            learned_pos_emb (bool): Whether to use learned positional embeddings
            attn_config (Dict): A dictionary used to configure the model's attention module.
                The dictionary is copied, so neither the caller's object nor the
                module-level defaults are modified. ``None`` selects the defaults.
                attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention
                attn_pdrop (float): The dropout probability for the attention layers.
                attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
                qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
                clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
                    this value.
                softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
                    use the default scale of ``1/sqrt(d_keys)``.
                prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
                    extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
                    can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
                attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
                    When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
                    which sub-sequence each token belongs to.
                    Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
                alibi (bool): Whether to use the alibi bias instead of position embeddings.
                alibi_bias_max (int): The maximum value of the alibi bias.
            init_device (str): The device to use for parameter initialization.
            logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
            no_bias (bool): If True, bias parameters are removed from all layers.
            verbose (int): The verbosity level. 0 is silent.
            embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
            norm_type (str): choose type of norm to use
            use_cache (bool): Whether or not the model should return the last key/values attentions
            init_config (Dict): A dictionary used to configure the model initialization.
                Copied in the same way as ``attn_config``; ``None`` selects the defaults.
                init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
                    'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or
                    'xavier_normal_'. These mimic the parameter initialization methods in PyTorch.
                init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
                emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
                emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution
                    used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
                init_std (float): The standard deviation of the normal distribution used to initialize the model,
                    if using the baseline_ parameter initialization scheme.
                init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
                fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
                init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
                ---
                See llmfoundry.models.utils.param_init_fns.py for info on other param init config options

        Raises:
            ValueError: If a validated option is out of range or unrecognized.
            NotImplementedError: If an attention feature is requested with an
                attention implementation that does not support it.
        """
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.expansion_ratio = expansion_ratio
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.learned_pos_emb = learned_pos_emb
        # Copy the dicts: the signature defaults are shared module-level
        # objects, and both `_set_config_defaults` and model construction
        # write keys into these dictionaries. Without a copy, one instance's
        # settings (e.g. init_config['verbose']) leak into every later one.
        self.attn_config = dict(attn_config_defaults if attn_config is None else attn_config)
        self.init_device = init_device
        self.logit_scale = logit_scale
        self.no_bias = no_bias
        self.verbose = verbose
        self.embedding_fraction = embedding_fraction
        self.norm_type = norm_type
        self.use_cache = use_cache
        self.init_config = dict(init_config_defaults if init_config is None else init_config)
        # These two keys are dropped before the remaining kwargs reach the
        # parent constructor.
        kwargs.pop('name', None)
        kwargs.pop('loss_fn', None)
        super().__init__(**kwargs)
        self._validate_config()

    def _set_config_defaults(self, config, config_defaults):
        """Fill keys missing from ``config`` with ``config_defaults`` (in place) and return it."""
        for (k, v) in config_defaults.items():
            if k not in config:
                config[k] = v
        return config

    def _validate_config(self):
        """Apply sub-config defaults and reject invalid option combinations."""
        self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults)
        self.init_config = self._set_config_defaults(self.init_config, init_config_defaults)
        if self.d_model % self.n_heads != 0:
            raise ValueError('d_model must be divisible by n_heads')
        if any((prob < 0 or prob > 1 for prob in [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop])):
            raise ValueError("self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1")
        if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
            raise ValueError(f"Unknown attn_impl={self.attn_config['attn_impl']}")
        if self.attn_config['prefix_lm'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError('prefix_lm only implemented with torch and triton attention.')
        if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError('alibi only implemented with torch and triton attention.')
        if self.attn_config['attn_uses_sequence_id'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError('attn_uses_sequence_id only implemented with torch and triton attention.')
        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
            raise ValueError('model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!')
        if isinstance(self.logit_scale, str) and self.logit_scale != 'inv_sqrt_d_model':
            raise ValueError(f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
        if self.init_config.get('name', None) is None:
            raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.")
        if not self.learned_pos_emb and (not self.attn_config['alibi']):
            raise ValueError(
                'Positional information must be provided to the model using either learned_pos_emb or alibi.'
            )
@torch.no_grad()
def _attn_bias(self, device, dtype, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None):
    """Return the ``(attn_bias, attention_mask)`` pair to hand to the blocks.

    The bias tensor is built once, on first use, and cached on the module.
    For the 'flash' implementation the cached bias and the untouched
    ``attention_mask`` are returned as-is. For every other implementation the
    padding mask is folded into the bias and ``None`` is returned in its place.

    Raises:
        ValueError: If ``attention_mask`` and ``prefix_mask`` differ in shape.
    """
    # Lazy one-time construction of the cached bias.
    if not self._attn_bias_initialized:
        if self.attn_bias_shape:
            self.attn_bias = torch.zeros(self.attn_bias_shape, device=device, dtype=dtype)
            self.attn_bias = build_attn_bias(
                self.attn_impl,
                self.attn_bias,
                self.config.n_heads,
                self.config.max_seq_len,
                causal=self.is_causal,
                alibi=self.alibi,
                alibi_bias_max=self.alibi_bias_max,
            )
        self._attn_bias_initialized = True

    if self.attn_impl == 'flash':
        return (self.attn_bias, attention_mask)

    # Keep the cached tensor on the dtype/device of the current call.
    if self.attn_bias is not None:
        self.attn_bias = self.attn_bias.to(dtype=dtype, device=device)
    bias = self.attn_bias

    if self.prefix_lm:
        assert isinstance(bias, torch.Tensor)
        assert isinstance(prefix_mask, torch.Tensor)
        bias = self._apply_prefix_mask(bias, prefix_mask)

    if self.attn_uses_sequence_id and sequence_id is not None:
        assert isinstance(bias, torch.Tensor)
        bias = self._apply_sequence_id(bias, sequence_id)

    if attention_mask is None:
        return (bias, None)

    key_len = attention_mask.shape[-1]
    if bias is None:
        bias = torch.zeros((1, 1, 1, key_len), device=device, dtype=dtype)
    else:
        # Keep only the trailing key positions covered by the mask.
        bias = bias[:, :, :, -key_len:]
    if prefix_mask is not None and attention_mask.shape != prefix_mask.shape:
        raise ValueError(f'attention_mask shape={attention_mask.shape} ' + f'and prefix_mask shape={prefix_mask.shape} are not equal.')

    # Masked-out keys receive the most negative representable value.
    lowest = torch.finfo(bias.dtype).min
    masked_out = ~attention_mask.view(-1, 1, 1, key_len)
    return (bias.masked_fill(masked_out, lowest), None)
def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, tok_emb: Optional[torch.FloatTensor]=None):
    """Run the transformer stack and return a ``BaseModelOutputWithPast``.

    Either ``input_ids`` or ``tok_emb`` must be provided. When ``input_ids``
    is not None it is embedded with ``self.wte`` and any ``tok_emb`` passed
    in is ignored; otherwise ``tok_emb`` is used directly as the token
    embeddings.

    Args:
        input_ids: Token ids, or None when ``tok_emb`` is supplied.
        past_key_values: Per-layer cache; must have ``n_layers`` entries.
        attention_mask: Padding mask, converted to bool.
        prefix_mask: Required when the model is configured with prefix_lm.
        sequence_id: Required in train mode when attn_uses_sequence_id is set.
        return_dict: Must resolve to True.
        output_attentions: Must be falsy.
        output_hidden_states: If truthy, collect the input of every block.
        use_cache: Whether to return updated ``past_key_values``.
        tok_emb: Pre-computed token embeddings used when ``input_ids`` is None.

    Raises:
        NotImplementedError: For ``return_dict=False``, ``output_attentions``
            or left-padded training batches.
        ValueError: For missing ``prefix_mask``/``sequence_id``, a cache of the
            wrong length, or a total sequence longer than ``max_seq_len``.
    """
    return_dict = return_dict if return_dict is not None else self.config.return_dict
    use_cache = use_cache if use_cache is not None else self.config.use_cache
    if self.gradient_checkpointing and self.training and use_cache:
        logger.warning_once(
            "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
        )
        use_cache = False
    if attention_mask is not None:
        attention_mask = attention_mask.bool()
    if prefix_mask is not None:
        prefix_mask = prefix_mask.bool()

    # Argument validation.
    if not return_dict:
        raise NotImplementedError('return_dict False is not implemented yet for MPT')
    if output_attentions:
        raise NotImplementedError('output_attentions is not implemented yet for MPT')
    if attention_mask is not None and attention_mask[:, 0].sum() != attention_mask.shape[0] and self.training:
        raise NotImplementedError('MPT does not support training with left padding.')
    if self.prefix_lm and prefix_mask is None:
        raise ValueError('prefix_mask is a required argument when MPT is configured with prefix_lm=True.')
    if self.training:
        if self.attn_uses_sequence_id and sequence_id is None:
            raise ValueError('sequence_id is a required argument when MPT is configured with attn_uses_sequence_id=True ' + 'and the model is in train mode.')
        elif self.attn_uses_sequence_id is False and sequence_id is not None:
            warnings.warn('MPT received non-None input for `sequence_id` but is configured with attn_uses_sequence_id=False. ' + 'This input will be ignored. If you want the model to use `sequence_id`, set attn_uses_sequence_id to True.')

    # Token embeddings: from ids when given, otherwise caller-supplied.
    if input_ids is not None:
        S = input_ids.size(1)
        assert S <= self.config.max_seq_len, f'Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}'
        tok_emb = self.wte(input_ids)
    else:
        assert tok_emb is not None
        S = tok_emb.size(1)

    if self.alibi:
        x = tok_emb
    else:
        past_position = 0
        if past_key_values is not None:
            if len(past_key_values) != self.config.n_layers:
                raise ValueError(
                    f'past_key_values must provide a past_key_value for each attention layer in the network (len(past_key_values)={len(past_key_values)!r}; self.config.n_layers={self.config.n_layers!r}).'
                )
            # NOTE(review): assumes dim 1 of the cached tensor is the
            # sequence axis -- confirm against the attention implementation.
            past_position = past_key_values[0][0].size(1)
        if S + past_position > self.config.max_seq_len:
            # Fixed: the message used to report ``S + 1`` as the current length.
            raise ValueError(f'Cannot forward input with past sequence length {past_position} and current sequence length {S}, this model only supports total sequence length <= {self.config.max_seq_len}.')
        # Fixed: take the device from ``tok_emb``. ``input_ids`` is None on
        # the ``tok_emb`` path, so ``input_ids.device`` raised AttributeError.
        pos = torch.arange(past_position, S + past_position, dtype=torch.long, device=tok_emb.device).unsqueeze(0)
        if attention_mask is not None:
            # Shift positions left by the running count of masked tokens,
            # never going below 0.
            pos = torch.clamp(pos - torch.cumsum((~attention_mask).to(torch.int32), dim=1)[:, past_position:], min=0)
        pos_emb = self.wpe(pos)
        x = tok_emb + pos_emb

    if self.embedding_fraction == 1:
        x = self.emb_drop(x)
    else:
        # Same forward value as x, but only `embedding_fraction` of the
        # gradient flows back into the embeddings.
        x_shrunk = x * self.embedding_fraction + x.detach() * (1 - self.embedding_fraction)
        assert isinstance(self.emb_drop, nn.Module)
        x = self.emb_drop(x_shrunk)

    (attn_bias, attention_mask) = self._attn_bias(device=x.device, dtype=x.dtype, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id)
    if use_cache and past_key_values is None:
        past_key_values = [() for _ in range(self.config.n_layers)]

    all_hidden_states = () if output_hidden_states else None
    for (b_idx, block) in enumerate(self.blocks):
        if output_hidden_states:
            assert all_hidden_states is not None
            all_hidden_states = all_hidden_states + (x,)
        past_key_value = past_key_values[b_idx] if past_key_values is not None else None
        if self.gradient_checkpointing and self.training:
            (x, past_key_value) = torch.utils.checkpoint.checkpoint(
                block, x, past_key_value, attn_bias, attention_mask, self.is_causal
            )
        else:
            (x, past_key_value) = block(x, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=self.is_causal)
        if past_key_values is not None:
            past_key_values[b_idx] = past_key_value

    x = self.norm_f(x)
    return BaseModelOutputWithPast(last_hidden_state=x, past_key_values=past_key_values, hidden_states=all_hidden_states)
def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, labels: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None):
    """Compute logits with the tied embedding matrix and, if ``labels`` is given, the loss.

    The hidden states from ``self.transformer`` are projected with the input
    embedding weights, optionally scaled by ``self.logit_scale``. Labels are
    shifted one step to the left and the final position is set to -100 before
    the cross-entropy is taken.
    """
    if return_dict is None:
        return_dict = self.config.return_dict
    if use_cache is None:
        use_cache = self.config.use_cache

    outputs = self.transformer(
        input_ids=input_ids,
        past_key_values=past_key_values,
        attention_mask=attention_mask,
        prefix_mask=prefix_mask,
        sequence_id=sequence_id,
        return_dict=return_dict,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        use_cache=use_cache,
    )

    # Output projection shares weights with the token embedding.
    tied_weight = self.transformer.wte.weight
    logits = F.linear(outputs.last_hidden_state, tied_weight)

    scale = self.logit_scale
    if scale is not None:
        if scale == 0:
            warnings.warn(f'Multiplying logits by self.logit_scale={self.logit_scale!r}. This will produce uniform (uninformative) outputs.')
        logits *= scale

    loss = None
    if labels is not None:
        # Position i is trained to predict token i + 1; the last column has
        # no target and is set to -100.
        labels = torch.roll(labels, shifts=-1)
        labels[:, -1] = -100
        flat_logits = logits.view(-1, logits.size(-1))
        flat_labels = labels.to(logits.device).view(-1)
        loss = F.cross_entropy(flat_logits, flat_labels)

    return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states)
def rms_norm(x, weight=None, eps=1e-05):
    """Apply root-mean-square normalization over the last dimension of ``x``.

    Computes ``x / sqrt(mean(x**2, dim=-1) + eps)`` and, when ``weight`` is
    given, multiplies the result element-wise by it.

    Args:
        x: Input tensor; statistics are taken over its last dimension.
        weight: Optional scale broadcast against the normalized output.
        eps: Constant added to the mean square before the square root.

    Returns:
        The normalized (and optionally scaled) tensor.
    """
    # Fixed: the previous code divided by rsqrt(...), which multiplies by the
    # RMS instead of dividing by it. Multiplying by the reciprocal square
    # root is the actual normalization.
    output = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
    return output * weight if weight is not None else output
def torch_default_param_init_fn_(module: nn.Module, verbose: int=0, **kwargs):
    """Re-initialize ``module`` through its own ``reset_parameters`` method.

    Modules that do not define ``reset_parameters`` are left untouched.
    Extra keyword arguments are accepted and ignored.
    """
    del kwargs
    noisy = verbose > 1
    if noisy:
        warnings.warn("Initializing network using module's reset_parameters attribute")
    if not hasattr(module, 'reset_parameters'):
        return
    module.reset_parameters()
def generic_param_init_fn_(module: nn.Module, init_fn_, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
    """Initialize the parameters held directly by ``module`` with ``init_fn_``.

    Handles ``nn.Linear``, ``nn.Embedding``, the classes listed in
    ``NORM_CLASS_REGISTRY`` and ``nn.MultiheadAttention``. Biases are zeroed,
    norm weights are set to one, and weights of layers flagged with
    ``_is_residual`` are divided by a depth-dependent factor.

    Args:
        module: The module to initialize (typically visited via ``apply``).
        init_fn_: In-place initializer called on each weight tensor.
        n_layers: Number of layers; used for the default residual divisor
            ``sqrt(2 * n_layers)``.
        d_model: Needed to split the fused ``in_proj_weight`` of
            ``nn.MultiheadAttention``.
        init_div_is_residual: ``False`` disables the division, ``True`` uses
            the default divisor, a number (or a string for which
            ``str.isnumeric()`` is true) is used as the divisor itself.
        emb_init_std: If set, embeddings use a normal init with this std.
        emb_init_uniform_lim: If set (and ``emb_init_std`` is not), embeddings
            use a uniform init; either ``(low, high)`` or a single ``lim``
            meaning ``(-lim, lim)``.
        verbose: Values above 1 emit informational warnings.
        **kwargs: Ignored.

    Raises:
        ValueError: If ``init_div_is_residual`` is not boolean or numeric, or
            if ``emb_init_uniform_lim`` is a sequence whose length is not 2.
        NotImplementedError: If ``module`` is of an unhandled type and owns
            parameters directly.
    """
    del kwargs
    if verbose > 1:
        warnings.warn('If model has bias parameters they are initialized to 0.')

    # Resolve the divisor applied to `_is_residual` layers.
    if init_div_is_residual is False:
        div_is_residual = 1.0
    elif init_div_is_residual is True:
        div_is_residual = math.sqrt(2 * n_layers)
    elif isinstance(init_div_is_residual, (float, int)):
        div_is_residual = init_div_is_residual
    elif isinstance(init_div_is_residual, str) and init_div_is_residual.isnumeric():
        div_is_residual = float(init_div_is_residual)
    else:
        raise ValueError(f'Expected init_div_is_residual to be boolean or numeric, got {init_div_is_residual}')
    if init_div_is_residual is not False and verbose > 1:
        warnings.warn(
            f'Initializing _is_residual layers then dividing them by {div_is_residual:.3f}. Set `init_div_is_residual: false` in init config to disable this.'
        )

    if isinstance(module, nn.Linear):
        if hasattr(module, '_fused'):
            fused_init_helper_(module, init_fn_)
        else:
            init_fn_(module.weight)
        if module.bias is not None:
            torch.nn.init.zeros_(module.bias)
        if init_div_is_residual is not False and getattr(module, '_is_residual', False):
            with torch.no_grad():
                module.weight.div_(div_is_residual)
    elif isinstance(module, nn.Embedding):
        if emb_init_std is not None:
            std = emb_init_std
            if std == 0:
                warnings.warn('Embedding layer initialized to 0.')
            emb_init_fn_ = partial(torch.nn.init.normal_, mean=0.0, std=std)
            if verbose > 1:
                warnings.warn(f'Embedding layer initialized using normal distribution with mean=0 and std={std!r}.')
        elif emb_init_uniform_lim is not None:
            lim = emb_init_uniform_lim
            if isinstance(lim, Sequence):
                # Fixed: require exactly two limits. The old `> 2` check let a
                # one-element sequence through to an IndexError on lim[1].
                if len(lim) != 2:
                    raise ValueError(f'Uniform init requires a min and a max limit. User input: {lim}.')
                if lim[0] == lim[1]:
                    warnings.warn(f'Embedding layer initialized to {lim[0]}.')
            else:
                if lim == 0:
                    warnings.warn('Embedding layer initialized to 0.')
                lim = [-lim, lim]
            (a, b) = lim
            emb_init_fn_ = partial(torch.nn.init.uniform_, a=a, b=b)
            if verbose > 1:
                warnings.warn(f'Embedding layer initialized using uniform distribution in range {lim}.')
        else:
            emb_init_fn_ = init_fn_
        emb_init_fn_(module.weight)
    elif isinstance(module, tuple(set(NORM_CLASS_REGISTRY.values()))):
        if verbose > 1:
            warnings.warn(
                'Norm weights are set to 1. If norm layer has a bias it is initialized to 0.'
            )
        if hasattr(module, 'weight') and module.weight is not None:
            torch.nn.init.ones_(module.weight)
        if hasattr(module, 'bias') and module.bias is not None:
            torch.nn.init.zeros_(module.bias)
    elif isinstance(module, nn.MultiheadAttention):
        if module._qkv_same_embed_dim:
            # Fused q/k/v projection: each third is initialized separately.
            _extracted_from_generic_param_init_fn__69(module, d_model, init_fn_)
        else:
            assert module.q_proj_weight is not None and module.k_proj_weight is not None and (module.v_proj_weight is not None)
            assert module.in_proj_weight is None
            init_fn_(module.q_proj_weight)
            init_fn_(module.k_proj_weight)
            init_fn_(module.v_proj_weight)
        if module.in_proj_bias is not None:
            torch.nn.init.zeros_(module.in_proj_bias)
        if module.bias_k is not None:
            torch.nn.init.zeros_(module.bias_k)
        if module.bias_v is not None:
            torch.nn.init.zeros_(module.bias_v)
        init_fn_(module.out_proj.weight)
        if init_div_is_residual is not False and getattr(module.out_proj, '_is_residual', False):
            with torch.no_grad():
                module.out_proj.weight.div_(div_is_residual)
        if module.out_proj.bias is not None:
            torch.nn.init.zeros_(module.out_proj.bias)
    else:
        # Any module that directly owns a parameter but was not handled
        # above is an error; parameter-free containers pass silently.
        for _ in module.parameters(recurse=False):
            raise NotImplementedError(f'{module.__class__.__name__} parameters are not initialized by param_init_fn.')
def baseline_param_init_fn_(module: nn.Module, init_std: float, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
    """Normal-distribution initialization with a caller-chosen standard deviation.

    Thin wrapper that checks ``init_std`` is set and then delegates to
    ``_normal_param_init_fn_`` with ``std=init_std``. Extra keyword arguments
    are ignored.

    Raises:
        ValueError: If ``init_std`` is ``None``.
    """
    del kwargs
    if init_std is None:
        raise ValueError("You must set model.init_config['init_std'] to a float value to use the default initialization scheme.")
    forwarded = {
        'd_model': d_model,
        'n_layers': n_layers,
        'init_div_is_residual': init_div_is_residual,
        'emb_init_std': emb_init_std,
        'emb_init_uniform_lim': emb_init_uniform_lim,
        'verbose': verbose,
    }
    _normal_param_init_fn_(module=module, std=init_std, **forwarded)
def kaiming_uniform_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, fan_mode: str='fan_in', init_nonlinearity: str='leaky_relu', verbose: int=0, **kwargs):
    """Initialize ``module`` with ``nn.init.kaiming_uniform_``.

    ``init_gain`` is passed as the ``a`` argument, ``fan_mode`` as ``mode``
    and ``init_nonlinearity`` as ``nonlinearity``. Everything else is
    forwarded to ``generic_param_init_fn_``. Extra keyword arguments are
    ignored.
    """
    del kwargs
    weight_init = partial(
        nn.init.kaiming_uniform_,
        a=init_gain,
        mode=fan_mode,
        nonlinearity=init_nonlinearity,
    )
    if verbose > 1:
        warnings.warn(
            f'Using nn.init.kaiming_uniform_ init fn with parameters: a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}'
        )
    generic_param_init_fn_(
        module=module,
        init_fn_=weight_init,
        d_model=d_model,
        n_layers=n_layers,
        init_div_is_residual=init_div_is_residual,
        emb_init_std=emb_init_std,
        emb_init_uniform_lim=emb_init_uniform_lim,
        verbose=verbose,
    )
partial(torch.nn.init.kaiming_normal_, a=init_gain, mode=fan_mode, nonlinearity=init_nonlinearity) generic_param_init_fn_(module=module, init_fn_=kaiming_normal_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose) def xavier_uniform_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, verbose: int=0, **kwargs): del kwargs xavier_uniform_ = partial(torch.nn.init.xavier_uniform_, gain=init_gain) if verbose > 1: warnings.warn( f'Using torch.nn.init.xavier_uniform_ init fn with parameters: gain={init_gain}' ) generic_param_init_fn_(module=module, init_fn_=xavier_uniform_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose) def xavier_normal_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, verbose: int=0, **kwargs): xavier_normal_ = partial(torch.nn.init.xavier_normal_, gain=init_gain) if verbose > 1: warnings.warn( f'Using torch.nn.init.xavier_normal_ init fn with parameters: gain={init_gain}' ) generic_param_init_fn_(module=module, init_fn_=xavier_normal_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose) MODEL_INIT_REGISTRY = {'default_': torch_default_param_init_fn_, 'baseline_': baseline_param_init_fn_, 'kaiming_uniform_': kaiming_uniform_param_init_fn_, 'kaiming_normal_': kaiming_normal_param_init_fn_, 
class PixArtBlock(nn.Module):
    """
    A PixArt block with adaptive layer norm (adaLN-single) conditioning.

    The block applies self-attention, cross-attention and an MLP as three
    residual updates. Each sub-module output is stored in
    ``cache_dic['cache'][-1][layer][module]`` so that later calls can reuse
    it instead of recomputing every token.
    """

    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, drop_path=0., window_size=0, input_size=None, use_rel_pos=False, **block_kwargs):
        """Build the norm / attention / cross-attention / MLP sub-modules.

        ``block_kwargs`` is forwarded to both ``WindowAttention`` and
        ``MultiHeadCrossAttention``.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        # With window_size == 0 the attention gets the full input_size,
        # otherwise a (window_size, window_size) grid.
        self.attn = WindowAttention(hidden_size, num_heads=num_heads, qkv_bias=True,
                                    input_size=input_size if window_size == 0 else (window_size, window_size),
                                    use_rel_pos=use_rel_pos, **block_kwargs)
        self.cross_attn = MultiHeadCrossAttention(hidden_size, num_heads, **block_kwargs)
        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        # to be compatible with lower version pytorch
        approx_gelu = lambda: nn.GELU(approximate="tanh")
        self.mlp = Mlp(in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, drop=0)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.window_size = window_size
        # Learned per-block offsets added to the 6 timestep modulation vectors.
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size ** 0.5)

    def forward(self, x, y, t, current, cache_dic, mask=None, **kwargs):
        """Run the block, either fully or from cache with partial refresh.

        Args:
            x: token tensor unpacked as (B, N, C).
            y: conditioning passed to cross-attention.
            t: timestep modulation, reshaped here to (B, 6, -1).
            current: mutable dict; this method reads ``current['layer']`` and
                writes ``current['module']`` and ``current['is_force_fresh']``.
            cache_dic: mutable cache; ``'cache'``, ``'attn_map'`` and
                ``'cross_attn_map'`` are indexed as ``[-1][layer]``.
            mask: forwarded unchanged to ``self.cross_attn``.
            **kwargs: accepted and ignored.

        Returns:
            The updated token tensor.
        """
        B, N, C = x.shape

        # Six modulation tensors: shift/scale/gate for attention and for the MLP.
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None] + t.reshape(B, 6, -1)).chunk(6, dim=1)

        # NOTE(review): global_force_fresh presumably decides whether this step
        # recomputes everything - confirm against cache_functions.
        is_force_fresh = global_force_fresh(cache_dic, current)
        current['is_force_fresh'] = is_force_fresh
        if is_force_fresh:
            # Compute all tokens, and save them to cache
            # current['module'] must be set before each cache write / force_init
            # call because it is part of the cache key.
            current['module'] = 'attn'
            cache_dic['cache'][-1][current['layer']][current['module']], cache_dic['attn_map'][-1][current['layer']] = self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa))#.reshape(B, N, C)
            force_init(cache_dic, current, x)
            x = x + self.drop_path(gate_msa * cache_dic['cache'][-1][current['layer']][current['module']])

            current['module'] = 'cross-attn'
            cache_dic['cache'][-1][current['layer']][current['module']], cache_dic['cross_attn_map'][-1][current['layer']] = self.cross_attn(x, y, mask)
            force_init(cache_dic, current, x)
            x = x + cache_dic['cache'][-1][current['layer']][current['module']]

            current['module'] = 'mlp'
            cache_dic['cache'][-1][current['layer']][current['module']] = self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp))
            force_init(cache_dic, current, x)
            x = x + self.drop_path(gate_mlp * cache_dic['cache'][-1][current['layer']][current['module']])

        else:
            # Self-attention is never recomputed on this path: the cached
            # output is reused as-is.
            current['module'] = 'attn'
            # no partial computation for attn. if you want to have an exploration, below may help.
            #fresh_indices, fresh_tokens = cache_cutfresh(cache_dic, x, current)
            #fresh_tokens, fresh_attn_map = self.attn(t2i_modulate(self.norm1(fresh_tokens), shift_msa, scale_msa))#.reshape(B, N, C)
            #update_cache(fresh_indices, fresh_tokens=fresh_tokens, cache_dic=cache_dic, current=current, fresh_attn_map=fresh_attn_map)
            #cache_dic['cache'][-1][current['layer']][current['module']], cache_dic['attn_map'][-1][current['layer']] = self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa))#.reshape(B, N, C)
            x = x + self.drop_path(gate_msa * cache_dic['cache'][-1][current['layer']][current['module']])

            # Cross-attention: only the tokens returned by cache_cutfresh are
            # recomputed; update_cache receives the result and the residual
            # then reads the cache entry.
            current['module'] = 'cross-attn'
            fresh_indices, fresh_tokens = cache_cutfresh(cache_dic, x, current)
            fresh_tokens, fresh_cross_attn_map = self.cross_attn(fresh_tokens, y, mask)
            update_cache(fresh_indices, fresh_tokens=fresh_tokens, cache_dic=cache_dic, current=current, fresh_attn_map=fresh_cross_attn_map)
            x = x + cache_dic['cache'][-1][current['layer']][current['module']]

            # MLP: same select / recompute / update pattern as cross-attention.
            current['module'] = 'mlp'
            fresh_indices, fresh_tokens = cache_cutfresh(cache_dic, x, current)
            fresh_tokens = self.mlp(t2i_modulate(self.norm2(fresh_tokens), shift_mlp, scale_mlp))
            update_cache(fresh_indices, fresh_tokens=fresh_tokens, cache_dic=cache_dic, current=current)
            x = x + self.drop_path(gate_mlp * cache_dic['cache'][-1][current['layer']][current['module']])

        return x
@MODELS.register_module()
class PixArt(nn.Module):
    """
    Diffusion model with a Transformer backbone.

    This variant threads ``current`` / ``cache_dic`` through every block so
    the blocks can reuse cached sub-module outputs.
    """

    def __init__(self, input_size=32, patch_size=2, in_channels=4, hidden_size=1152, depth=28, num_heads=16, mlp_ratio=4.0, class_dropout_prob=0.1, pred_sigma=True, drop_path: float = 0., window_size=0, window_block_indexes=None, use_rel_pos=False, caption_channels=4096, lewei_scale=1.0, config=None, model_max_length=120, **kwargs):
        """Build embedders, ``depth`` PixArt blocks and the final layer.

        ``config`` is only used to locate the log file (``config.work_dir``);
        when it is falsy the scale/base-size notice is printed instead.
        ``kwargs`` is accepted and ignored.
        """
        if window_block_indexes is None:
            window_block_indexes = []
        super().__init__()
        self.pred_sigma = pred_sigma
        self.in_channels = in_channels
        self.out_channels = in_channels * 2 if pred_sigma else in_channels
        self.patch_size = patch_size
        self.num_heads = num_heads
        # Store the scalar itself. A trailing comma here used to turn this into
        # a 1-tuple that only worked through NumPy broadcasting.
        self.lewei_scale = lewei_scale

        self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True)
        self.t_embedder = TimestepEmbedder(hidden_size)
        num_patches = self.x_embedder.num_patches
        self.base_size = input_size // self.patch_size
        # Will use fixed sin-cos embedding:
        self.register_buffer("pos_embed", torch.zeros(1, num_patches, hidden_size))

        approx_gelu = lambda: nn.GELU(approximate="tanh")
        self.t_block = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
        )
        self.y_embedder = CaptionEmbedder(in_channels=caption_channels, hidden_size=hidden_size, uncond_prob=class_dropout_prob, act_layer=approx_gelu, token_num=model_max_length)
        drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)]  # stochastic depth decay rule
        self.blocks = nn.ModuleList([
            PixArtBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, drop_path=drop_path[i],
                        input_size=(input_size // patch_size, input_size // patch_size),
                        window_size=window_size if i in window_block_indexes else 0,
                        use_rel_pos=use_rel_pos if i in window_block_indexes else False)
            for i in range(depth)
        ])
        self.final_layer = T2IFinalLayer(hidden_size, patch_size, self.out_channels)

        self.initialize_weights()

        if config:
            logger = get_root_logger(os.path.join(config.work_dir, 'train_log.log'))
            logger.warning(f"lewei scale: {self.lewei_scale}, base size: {self.base_size}")
        else:
            print(f'Warning: lewei scale: {self.lewei_scale}, base size: {self.base_size}')

    def forward(self, x, timestep, current, cache_dic, y, mask=None, data_info=None, **kwargs):
        """
        Forward pass of PixArt.
        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
        timestep: (N,) tensor of diffusion timesteps
        current: mutable dict; ``current['layer']`` is set to the block index before each block
        cache_dic: cache handed unchanged to every block
        y: (N, 1, 120, C) tensor of caption embeddings
        mask: optional caption mask; when given, masked-out caption tokens are dropped
        data_info / kwargs: accepted and ignored
        """
        x = x.to(self.dtype)
        timestep = timestep.to(self.dtype)
        y = y.to(self.dtype)
        pos_embed = self.pos_embed.to(self.dtype)
        self.h, self.w = x.shape[-2]//self.patch_size, x.shape[-1]//self.patch_size
        x = self.x_embedder(x) + pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
        t = self.t_embedder(timestep.to(x.dtype))  # (N, D)
        t0 = self.t_block(t)
        y = self.y_embedder(y, self.training)  # (N, 1, L, D)
        if mask is not None:
            if mask.shape[0] != y.shape[0]:
                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
            mask = mask.squeeze(1).squeeze(1)
            # Pack the kept caption tokens of every sample into one sequence;
            # y_lens records how many tokens each sample contributed.
            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
            y_lens = mask.sum(dim=1).tolist()
        else:
            y_lens = [y.shape[2]] * y.shape[0]
            y = y.squeeze(1).view(1, -1, x.shape[-1])
        for i, block in enumerate(self.blocks):
            current['layer'] = i
            # y_lens is received by the block as its ``mask`` argument.
            x = auto_grad_checkpoint(block, x, y, t0, current, cache_dic, y_lens)  # (N, T, D) #support grad checkpoint
        x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
        x = self.unpatchify(x)  # (N, out_channels, H, W)
        return x

    def forward_with_dpmsolver(self, x, timestep, current, cache_dic, y, mask=None, **kwargs):
        """
        The DPM solver does not need the variance prediction: return only the
        first half of the output channels.
        """
        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
        model_out = self.forward(x, timestep, current, cache_dic, y, mask)
        return model_out.chunk(2, dim=1)[0]

    def forward_with_cfg(self, x, timestep, current, cache_dic, y, cfg_scale, mask=None, **kwargs):
        """
        Forward pass of PixArt, but also batches the unconditional forward pass for classifier-free guidance.
        """
        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
        half = x[: len(x) // 2]
        combined = torch.cat([half, half], dim=0)
        # Expand kwargs as keywords; passing the dict positionally bound it to
        # ``data_info``.
        model_out = self.forward(combined, timestep, current, cache_dic, y, mask, **kwargs)
        model_out = model_out['x'] if isinstance(model_out, dict) else model_out
        # NOTE(review): guidance touches only the first 3 channels, as in the
        # GLIDE/DiT reference code - confirm this is intended for 4-channel latents.
        eps, rest = model_out[:, :3], model_out[:, 3:]
        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
        half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
        eps = torch.cat([half_eps, half_eps], dim=0)
        return torch.cat([eps, rest], dim=1)

    def unpatchify(self, x):
        """
        x: (N, T, patch_size**2 * C), with T a perfect square
        imgs: (N, C, H, W) with H = W = sqrt(T) * patch_size
        """
        c = self.out_channels
        p = self.x_embedder.patch_size[0]
        h = w = int(x.shape[1] ** 0.5)
        assert h * w == x.shape[1]

        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
        x = torch.einsum('nhwpqc->nchpwq', x)
        return x.reshape(shape=(x.shape[0], c, h * p, w * p))

    def initialize_weights(self):
        """Apply the default init, fill ``pos_embed`` and zero selected layers."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize (and freeze) pos_embed by sin-cos embedding:
        pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.x_embedder.num_patches ** 0.5), lewei_scale=self.lewei_scale, base_size=self.base_size)
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))

        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.t_block[1].weight, std=0.02)

        # Initialize caption embedding MLP:
        nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02)
        nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02)

        # Zero-out the cross-attention output projection in every block:
        for block in self.blocks:
            nn.init.constant_(block.cross_attn.proj.weight, 0)
            nn.init.constant_(block.cross_attn.proj.bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)

    @property
    def dtype(self):
        """dtype of the first parameter of the model."""
        return next(self.parameters()).dtype


def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, lewei_scale=1.0, base_size=16):
    """
    grid_size: int of the grid height and width, or an (h, w) pair
    lewei_scale, base_size: grid coordinates are rescaled by
        ``base_size / grid_size`` and then divided by ``lewei_scale``
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or [extra_tokens+grid_size*grid_size, embed_dim]
        (the zero rows are prepended only when cls_token is set and extra_tokens > 0)
    """
    if isinstance(grid_size, int):
        grid_size = to_2tuple(grid_size)
    grid_h = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0]/base_size) / lewei_scale
    grid_w = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1]/base_size) / lewei_scale
    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
    grid = np.stack(grid, axis=0)
    grid = grid.reshape([2, 1, grid_size[1], grid_size[0]])

    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token and extra_tokens > 0:
        pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
    return pos_embed


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """Encode ``grid[0]`` and ``grid[1]`` with ``embed_dim // 2`` channels each and concatenate."""
    assert embed_dim % 2 == 0

    # use half of dimensions to encode grid_h
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    return np.concatenate([emb_h, emb_w], axis=1)
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    Build fixed sine/cosine embeddings for a set of scalar positions.

    embed_dim: output dimension for each position (must be even)
    pos: positions to encode; any array-like, flattened to size (M,)
    out: (M, D) float64 array, sines in the first D/2 columns and cosines
         in the last D/2

    Raises:
        ValueError: if ``embed_dim`` is odd.
    """
    # Explicit check instead of ``assert`` so it survives ``python -O``.
    if embed_dim % 2 != 0:
        raise ValueError(f"embed_dim must be even, got {embed_dim}")
    omega = np.arange(embed_dim // 2, dtype=np.float64)
    omega /= embed_dim / 2.
    omega = 1. / 10000 ** omega  # (D/2,)

    # asarray lets callers pass lists or nested lists as well as ndarrays.
    pos = np.asarray(pos, dtype=np.float64).reshape(-1)  # (M,)
    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    return np.concatenate([emb_sin, emb_cos], axis=1)
class PatchEmbed(nn.Module):
    """ 2D Image to Patch Embedding

    A strided convolution turns each ``patch_size`` patch into one
    ``embed_dim`` vector; no fixed image size is required.
    """
    def __init__(
            self,
            patch_size=16,
            in_chans=3,
            embed_dim=768,
            norm_layer=None,
            flatten=True,
            bias=True,
    ):
        super().__init__()
        patch_size = to_2tuple(patch_size)
        self.patch_size = patch_size
        self.flatten = flatten
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x):
        """Project ``x`` to patches; with ``flatten`` the result is (B, N, C)."""
        x = self.proj(x)
        if self.flatten:
            x = x.flatten(2).transpose(1, 2)  # BCHW -> BNC
        x = self.norm(x)
        return x


def _attn_output(result):
    """Return the token tensor from an attention call that may also return an attention map."""
    return result[0] if isinstance(result, tuple) else result


class PixArtMSBlock(nn.Module):
    """
    A PixArt block with adaptive layer norm zero (adaLN-Zero) conditioning.
    """

    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, drop_path=0., window_size=0, input_size=None, use_rel_pos=False, **block_kwargs):
        super().__init__()
        self.hidden_size = hidden_size
        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.attn = WindowAttention(hidden_size, num_heads=num_heads, qkv_bias=True,
                                    input_size=input_size if window_size == 0 else (window_size, window_size),
                                    use_rel_pos=use_rel_pos, **block_kwargs)
        self.cross_attn = MultiHeadCrossAttention(hidden_size, num_heads, **block_kwargs)
        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        # to be compatible with lower version pytorch
        approx_gelu = lambda: nn.GELU(approximate="tanh")
        self.mlp = Mlp(in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, drop=0)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.window_size = window_size
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size ** 0.5)

    def forward(self, x, y, t, mask=None, **kwargs):
        """Self-attention, cross-attention and MLP residual updates (no caching).

        ``kwargs`` is accepted and ignored.
        """
        B, N, C = x.shape

        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None] + t.reshape(B, 6, -1)).chunk(6, dim=1)
        # The attention modules in PixArt_blocks return ``(tokens, attn_map)``;
        # only the tokens take part in the residual update.
        attn_out = _attn_output(self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa)))
        x = x + self.drop_path(gate_msa * attn_out)
        x = x + _attn_output(self.cross_attn(x, y, mask))
        x = x + self.drop_path(gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)))

        return x


#############################################################################
#                                 Core PixArt Model                                #
#################################################################################
@MODELS.register_module()
class PixArtMS(PixArt):
    """
    Diffusion model with a Transformer backbone.

    Multi-scale variant: the positional embedding is rebuilt for every input
    resolution and the timestep embedding is augmented with image-size and
    aspect-ratio embeddings.
    """

    def __init__(self, input_size=32, patch_size=2, in_channels=4, hidden_size=1152, depth=28, num_heads=16, mlp_ratio=4.0, class_dropout_prob=0.1, learn_sigma=True, pred_sigma=True, drop_path: float = 0., window_size=0, window_block_indexes=None, use_rel_pos=False, caption_channels=4096, lewei_scale=1., config=None, model_max_length=120, **kwargs):
        """Build the base model, then replace its embedders and blocks with the multi-scale versions."""
        if window_block_indexes is None:
            window_block_indexes = []
        super().__init__(
            input_size=input_size,
            patch_size=patch_size,
            in_channels=in_channels,
            hidden_size=hidden_size,
            depth=depth,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            class_dropout_prob=class_dropout_prob,
            learn_sigma=learn_sigma,
            pred_sigma=pred_sigma,
            drop_path=drop_path,
            window_size=window_size,
            window_block_indexes=window_block_indexes,
            use_rel_pos=use_rel_pos,
            lewei_scale=lewei_scale,
            config=config,
            model_max_length=model_max_length,
            **kwargs,
        )
        self.h = self.w = 0
        approx_gelu = lambda: nn.GELU(approximate="tanh")
        self.t_block = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
        )
        self.x_embedder = PatchEmbed(patch_size, in_channels, hidden_size, bias=True)
        self.y_embedder = CaptionEmbedder(in_channels=caption_channels, hidden_size=hidden_size, uncond_prob=class_dropout_prob, act_layer=approx_gelu, token_num=model_max_length)
        self.csize_embedder = SizeEmbedder(hidden_size//3)  # c_size embed
        self.ar_embedder = SizeEmbedder(hidden_size//3)     # aspect ratio embed
        drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)]  # stochastic depth decay rule
        self.blocks = nn.ModuleList([
            PixArtMSBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, drop_path=drop_path[i],
                          input_size=(input_size // patch_size, input_size // patch_size),
                          window_size=window_size if i in window_block_indexes else 0,
                          use_rel_pos=use_rel_pos if i in window_block_indexes else False)
            for i in range(depth)
        ])
        self.final_layer = T2IFinalLayer(hidden_size, patch_size, self.out_channels)

        self.initialize()

    def forward(self, x, timestep, y, mask=None, data_info=None, **kwargs):
        """
        Forward pass of PixArt.
        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
        timestep: (N,) tensor of diffusion timesteps
        y: (N, 1, 120, C) tensor of caption embeddings
        mask: optional caption mask; when given, masked-out caption tokens are dropped
        data_info: mapping that must provide 'img_hw' and 'aspect_ratio' tensors
        kwargs: forwarded to every block
        """
        bs = x.shape[0]
        x = x.to(self.dtype)
        timestep = timestep.to(self.dtype)
        y = y.to(self.dtype)
        c_size, ar = data_info['img_hw'].to(self.dtype), data_info['aspect_ratio'].to(self.dtype)
        self.h, self.w = x.shape[-2]//self.patch_size, x.shape[-1]//self.patch_size
        # The embedding depends on the current (h, w), so it is recomputed per call.
        pos_embed = torch.from_numpy(get_2d_sincos_pos_embed(self.pos_embed.shape[-1], (self.h, self.w), lewei_scale=self.lewei_scale, base_size=self.base_size)).unsqueeze(0).to(x.device).to(self.dtype)
        x = self.x_embedder(x) + pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
        t = self.t_embedder(timestep)  # (N, D)
        csize = self.csize_embedder(c_size, bs)  # (N, D)
        ar = self.ar_embedder(ar, bs)  # (N, D)
        t = t + torch.cat([csize, ar], dim=1)
        t0 = self.t_block(t)
        y = self.y_embedder(y, self.training)  # (N, D)
        if mask is not None:
            if mask.shape[0] != y.shape[0]:
                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
            mask = mask.squeeze(1).squeeze(1)
            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
            y_lens = mask.sum(dim=1).tolist()
        else:
            y_lens = [y.shape[2]] * y.shape[0]
            y = y.squeeze(1).view(1, -1, x.shape[-1])
        for block in self.blocks:
            x = auto_grad_checkpoint(block, x, y, t0, y_lens, **kwargs)  # (N, T, D) #support grad checkpoint
        x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
        x = self.unpatchify(x)  # (N, out_channels, H, W)
        return x

    def forward_with_dpmsolver(self, x, timestep, y, data_info, **kwargs):
        """
        The DPM solver does not need the variance prediction: return only the
        first half of the output channels.
        """
        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
        model_out = self.forward(x, timestep, y, data_info=data_info, **kwargs)
        return model_out.chunk(2, dim=1)[0]

    def forward_with_cfg(self, x, timestep, y, cfg_scale, data_info, **kwargs):
        """
        Forward pass of PixArt, but also batches the unconditional forward pass for classifier-free guidance.
        """
        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
        half = x[: len(x) // 2]
        combined = torch.cat([half, half], dim=0)
        # Forward kwargs (e.g. ``mask``) like forward_with_dpmsolver does;
        # they were previously dropped.
        model_out = self.forward(combined, timestep, y, data_info=data_info, **kwargs)
        # NOTE(review): guidance touches only the first 3 channels, as in the
        # GLIDE/DiT reference code - confirm this is intended for 4-channel latents.
        eps, rest = model_out[:, :3], model_out[:, 3:]
        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
        half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
        eps = torch.cat([half_eps, half_eps], dim=0)
        return torch.cat([eps, rest], dim=1)

    def unpatchify(self, x):
        """
        x: (N, T, patch_size**2 * C) with T == self.h * self.w
        imgs: (N, C, self.h * patch_size, self.w * patch_size)
        """
        c = self.out_channels
        p = self.x_embedder.patch_size[0]
        assert self.h * self.w == x.shape[1]

        x = x.reshape(shape=(x.shape[0], self.h, self.w, p, p, c))
        x = torch.einsum('nhwpqc->nchpwq', x)
        return x.reshape(shape=(x.shape[0], c, self.h * p, self.w * p))

    def initialize(self):
        """Re-initialize all weights after the multi-scale modules were swapped in."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.t_block[1].weight, std=0.02)
        nn.init.normal_(self.csize_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.csize_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.ar_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.ar_embedder.mlp[2].weight, std=0.02)

        # Initialize caption embedding MLP:
        nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02)
        nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02)

        # Zero-out the cross-attention output projection in every block:
        for block in self.blocks:
            nn.init.constant_(block.cross_attn.proj.weight, 0)
            nn.init.constant_(block.cross_attn.proj.bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)


#################################################################################
#                                   PixArt Configs                                  #
#################################################################################
@MODELS.register_module()
def PixArtMS_XL_2(**kwargs):
    """Factory for the XL/2 configuration (depth 28, width 1152, patch 2, 16 heads)."""
    return PixArtMS(depth=28, hidden_size=1152, patch_size=2, num_heads=16, **kwargs)
def modulate(x, shift, scale):
    """Scale-and-shift ``x`` with per-sample vectors.

    ``shift`` and ``scale`` get a new axis at position 1 so they broadcast
    over the token dimension of ``x``.
    """
    token_scale = scale.unsqueeze(1)
    token_shift = shift.unsqueeze(1)
    return x * (1 + token_scale) + token_shift


def t2i_modulate(x, shift, scale):
    """Scale-and-shift ``x``; ``shift`` and ``scale`` must already broadcast against it."""
    gain = 1 + scale
    return x * gain + shift
class MultiHeadCrossAttention(nn.Module):
    """Cross-attention from image tokens (queries) to condition tokens (keys/values).

    ``forward`` returns both the projected output and the attention map.
    """

    def __init__(self, d_model, num_heads, attn_drop=0., proj_drop=0., **block_kwargs):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.kv_linear = nn.Linear(d_model, d_model*2)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(d_model, d_model)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, cond, mask=None):
        """
        x: (B, N, C) image tokens (queries)
        cond: condition tokens (keys/values), packed into one sequence
        mask: per-sample key/value lengths used to build a block-diagonal mask
        returns: (output of shape (B, N, C), attention map reshaped to (B, -1, last_dim))
        """
        # query: img tokens; key/value: condition; mask: if padding tokens
        B, N, C = x.shape

        # All samples are flattened into a single sequence; the block-diagonal
        # mask below keeps each sample attending to its own condition tokens.
        q = self.q_linear(x).view(1, -1, self.num_heads, self.head_dim)
        kv = self.kv_linear(cond).view(1, -1, 2, self.num_heads, self.head_dim)
        k, v = kv.unbind(2)
        attn_bias = None
        if mask is not None:
            attn_bias = xformers.ops.fmha.BlockDiagonalMask.from_seqlens([N] * B, mask)
        # xformers.ops.memory_efficient_attention does not return the attention
        # map, so the project's own attention function is used instead.
        x, attn_map = cached_attention_forward(q, k, v, p=self.attn_drop.p, attn_bias=attn_bias)
        x = x.view(B, -1, C)
        attn_map = attn_map.view(B, -1, attn_map.shape[-1])
        x = self.proj(x)
        x = self.proj_drop(x)

        return x, attn_map


class WindowAttention(Attention_):
    """Multi-head Attention block with relative position embeddings."""

    def __init__(
            self,
            dim,
            num_heads=8,
            qkv_bias=True,
            use_rel_pos=False,
            rel_pos_zero_init=True,
            input_size=None,
            **block_kwargs,
    ):
        """
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            use_rel_pos (bool): If True, create relative positional parameters.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            input_size (tuple or None): (h, w) input resolution used to size the
                relative positional parameters; required when use_rel_pos is True.

        Raises:
            ValueError: if use_rel_pos is True and input_size is None.
        """
        super().__init__(dim, num_heads=num_heads, qkv_bias=qkv_bias, **block_kwargs)

        self.use_rel_pos = use_rel_pos
        if self.use_rel_pos:
            if input_size is None:
                raise ValueError("input_size must be provided when use_rel_pos is True")
            # initialize relative positional embeddings
            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, self.head_dim))
            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, self.head_dim))

            if not rel_pos_zero_init:
                nn.init.trunc_normal_(self.rel_pos_h, std=0.02)
                nn.init.trunc_normal_(self.rel_pos_w, std=0.02)

    def forward(self, x, mask=None):
        """
        x: (B, N, C) tokens
        mask: optional mask; positions equal to 0 receive a -inf attention bias
        returns: (output of shape (B, N, C), self-attention map)
        """
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
        q, k, v = qkv.unbind(2)
        # ``fp32_attention`` is an optional attribute set from outside the class.
        if getattr(self, 'fp32_attention', False):
            q, k, v = q.float(), k.float(), v.float()

        attn_bias = None
        if mask is not None:
            attn_bias = torch.zeros([B * self.num_heads, q.shape[1], k.shape[1]], dtype=q.dtype, device=q.device)
            attn_bias.masked_fill_(mask.squeeze(1).repeat(self.num_heads, 1, 1) == 0, float('-inf'))
        # The self-attention map is saved as well, so the project's own
        # attention function is used here. Only the cross-attention map is
        # used in the final version, so xformers.ops.memory_efficient_attention
        # can be substituted for speed if the self-attention score is not needed.
        x, attn_map = cached_attention_forward(q, k, v, p=self.attn_drop.p, attn_bias=attn_bias)

        x = x.view(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)

        return x, attn_map


#################################################################################
#   AMP attention with fp32 softmax to fix loss NaN problem during training     #
#################################################################################
class Attention(Attention_):
    """Plain multi-head self-attention with an optional fp32 score/softmax path."""

    def forward(self, x):
        """x: (B, N, C) tokens; returns a tensor of the same shape."""
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)  # make torchscript happy (cannot use tensor as tuple)
        use_fp32_attention = getattr(self, 'fp32_attention', False)
        if use_fp32_attention:
            q, k = q.float(), k.float()
        # Autocast is disabled for the score computation when fp32 is requested.
        with torch.cuda.amp.autocast(enabled=not use_fp32_attention):
            attn = (q @ k.transpose(-2, -1)) * self.scale
            attn = attn.softmax(dim=-1)

        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class FinalLayer(nn.Module):
    """
    The final layer of PixArt: adaLN modulation from a conditioning vector,
    then a linear projection to patch pixels.
    """

    def __init__(self, hidden_size, patch_size, out_channels):
        super().__init__()
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 2 * hidden_size, bias=True)
        )

    def forward(self, x, c):
        """Modulate the normalized tokens ``x`` with shift/scale derived from ``c`` and project."""
        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
        x = modulate(self.norm_final(x), shift, scale)
        x = self.linear(x)
        return x


class T2IFinalLayer(nn.Module):
    """
    The final layer of PixArt: shift/scale come from a learned table plus the
    timestep embedding instead of an adaLN MLP.
    """

    def __init__(self, hidden_size, patch_size, out_channels):
        super().__init__()
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
        self.scale_shift_table = nn.Parameter(torch.randn(2, hidden_size) / hidden_size ** 0.5)
        self.out_channels = out_channels

    def forward(self, x, t):
        """Modulate the normalized tokens ``x`` with table + ``t`` and project."""
        shift, scale = (self.scale_shift_table[None] + t[:, None]).chunk(2, dim=1)
        x = t2i_modulate(self.norm_final(x), shift, scale)
        x = self.linear(x)
        return x


class MaskFinalLayer(nn.Module):
    """
    Final layer whose conditioning width (``c_emb_size``) may differ from the
    token width (``final_hidden_size``).
    """

    def __init__(self, final_hidden_size, c_emb_size, patch_size, out_channels):
        super().__init__()
        self.norm_final = nn.LayerNorm(final_hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(final_hidden_size, patch_size * patch_size * out_channels, bias=True)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(c_emb_size, 2 * final_hidden_size, bias=True)
        )

    def forward(self, x, t):
        """Modulate the normalized tokens ``x`` with shift/scale derived from ``t`` and project."""
        shift, scale = self.adaLN_modulation(t).chunk(2, dim=1)
        x = modulate(self.norm_final(x), shift, scale)
        x = self.linear(x)
        return x


class DecoderLayer(nn.Module):
    """
    adaLN-modulated projection from ``hidden_size`` to ``decoder_hidden_size``.
    """

    def __init__(self, hidden_size, decoder_hidden_size):
        super().__init__()
        self.norm_decoder = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, decoder_hidden_size, bias=True)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 2 * hidden_size, bias=True)
        )

    def forward(self, x, t):
        """Modulate the normalized tokens ``x`` with shift/scale derived from ``t`` and project."""
        shift, scale = self.adaLN_modulation(t).chunk(2, dim=1)
        x = modulate(self.norm_decoder(x), shift, scale)
        x = self.linear(x)
        return x
""" # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py half = dim // 2 freqs = torch.exp( -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half) args = t[:, None].float() * freqs[None] embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) if dim % 2: embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) return embedding def forward(self, t): t_freq = self.timestep_embedding(t, self.frequency_embedding_size).to(self.dtype) return self.mlp(t_freq) @property def dtype(self): # 返回模型参数的数据类型 return next(self.parameters()).dtype class SizeEmbedder(TimestepEmbedder): """ Embeds scalar timesteps into vector representations. """ def __init__(self, hidden_size, frequency_embedding_size=256): super().__init__(hidden_size=hidden_size, frequency_embedding_size=frequency_embedding_size) self.mlp = nn.Sequential( nn.Linear(frequency_embedding_size, hidden_size, bias=True), nn.SiLU(), nn.Linear(hidden_size, hidden_size, bias=True), ) self.frequency_embedding_size = frequency_embedding_size self.outdim = hidden_size def forward(self, s, bs): if s.ndim == 1: s = s[:, None] assert s.ndim == 2 if s.shape[0] != bs: s = s.repeat(bs//s.shape[0], 1) assert s.shape[0] == bs b, dims = s.shape[0], s.shape[1] s = rearrange(s, "b d -> (b d)") s_freq = self.timestep_embedding(s, self.frequency_embedding_size).to(self.dtype) s_emb = self.mlp(s_freq) s_emb = rearrange(s_emb, "(b d) d2 -> b (d d2)", b=b, d=dims, d2=self.outdim) return s_emb @property def dtype(self): # 返回模型参数的数据类型 return next(self.parameters()).dtype class LabelEmbedder(nn.Module): """ Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance. 
""" def __init__(self, num_classes, hidden_size, dropout_prob): super().__init__() use_cfg_embedding = dropout_prob > 0 self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size) self.num_classes = num_classes self.dropout_prob = dropout_prob def token_drop(self, labels, force_drop_ids=None): """ Drops labels to enable classifier-free guidance. """ if force_drop_ids is None: drop_ids = torch.rand(labels.shape[0]).cuda() < self.dropout_prob else: drop_ids = force_drop_ids == 1 labels = torch.where(drop_ids, self.num_classes, labels) return labels def forward(self, labels, train, force_drop_ids=None): use_dropout = self.dropout_prob > 0 if (train and use_dropout) or (force_drop_ids is not None): labels = self.token_drop(labels, force_drop_ids) return self.embedding_table(labels) class CaptionEmbedder(nn.Module): """ Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance. """ def __init__(self, in_channels, hidden_size, uncond_prob, act_layer=nn.GELU(approximate='tanh'), token_num=120): super().__init__() self.y_proj = Mlp(in_features=in_channels, hidden_features=hidden_size, out_features=hidden_size, act_layer=act_layer, drop=0) self.register_buffer("y_embedding", nn.Parameter(torch.randn(token_num, in_channels) / in_channels ** 0.5)) self.uncond_prob = uncond_prob def token_drop(self, caption, force_drop_ids=None): """ Drops labels to enable classifier-free guidance. 
""" if force_drop_ids is None: drop_ids = torch.rand(caption.shape[0]).cuda() < self.uncond_prob else: drop_ids = force_drop_ids == 1 caption = torch.where(drop_ids[:, None, None, None], self.y_embedding, caption) return caption def forward(self, caption, train, force_drop_ids=None): if train: assert caption.shape[2:] == self.y_embedding.shape use_dropout = self.uncond_prob > 0 if (train and use_dropout) or (force_drop_ids is not None): caption = self.token_drop(caption, force_drop_ids) caption = self.y_proj(caption) return caption class CaptionEmbedderDoubleBr(nn.Module): """ Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance. """ def __init__(self, in_channels, hidden_size, uncond_prob, act_layer=nn.GELU(approximate='tanh'), token_num=120): super().__init__() self.proj = Mlp(in_features=in_channels, hidden_features=hidden_size, out_features=hidden_size, act_layer=act_layer, drop=0) self.embedding = nn.Parameter(torch.randn(1, in_channels) / 10 ** 0.5) self.y_embedding = nn.Parameter(torch.randn(token_num, in_channels) / 10 ** 0.5) self.uncond_prob = uncond_prob def token_drop(self, global_caption, caption, force_drop_ids=None): """ Drops labels to enable classifier-free guidance. 
""" if force_drop_ids is None: drop_ids = torch.rand(global_caption.shape[0]).cuda() < self.uncond_prob else: drop_ids = force_drop_ids == 1 global_caption = torch.where(drop_ids[:, None], self.embedding, global_caption) caption = torch.where(drop_ids[:, None, None, None], self.y_embedding, caption) return global_caption, caption def forward(self, caption, train, force_drop_ids=None): assert caption.shape[2: ] == self.y_embedding.shape global_caption = caption.mean(dim=2).squeeze() use_dropout = self.uncond_prob > 0 if (train and use_dropout) or (force_drop_ids is not None): global_caption, caption = self.token_drop(global_caption, caption, force_drop_ids) y_embed = self.proj(global_caption) return y_embed, caption ================================================ FILE: PixArt-alpha-ToCa/diffusion/model/nets/__init__.py ================================================ from .PixArt import PixArt, PixArt_XL_2 from .PixArtMS import PixArtMS, PixArtMS_XL_2, PixArtMSBlock from .pixart_controlnet import ControlPixArtHalf, ControlPixArtMSHalf ================================================ FILE: PixArt-alpha-ToCa/diffusion/model/nets/pixart_controlnet.py ================================================ import re import torch import torch.nn as nn from copy import deepcopy from torch import Tensor from torch.nn import Module, Linear, init from typing import Any, Mapping from diffusion.model.nets import PixArtMSBlock, PixArtMS, PixArt from diffusion.model.nets.PixArt import get_2d_sincos_pos_embed from diffusion.model.utils import auto_grad_checkpoint # The implementation of ControlNet-Half architrecture # https://github.com/lllyasviel/ControlNet/discussions/188 class ControlT2IDitBlockHalf(Module): def __init__(self, base_block: PixArtMSBlock, block_index: 0) -> None: super().__init__() self.copied_block = deepcopy(base_block) self.block_index = block_index for p in self.copied_block.parameters(): p.requires_grad_(True) 
self.copied_block.load_state_dict(base_block.state_dict()) self.copied_block.train() self.hidden_size = hidden_size = base_block.hidden_size if self.block_index == 0: self.before_proj = Linear(hidden_size, hidden_size) init.zeros_(self.before_proj.weight) init.zeros_(self.before_proj.bias) self.after_proj = Linear(hidden_size, hidden_size) init.zeros_(self.after_proj.weight) init.zeros_(self.after_proj.bias) def forward(self, x, y, t, mask=None, c=None): if self.block_index == 0: # the first block c = self.before_proj(c) c = self.copied_block(x + c, y, t, mask) c_skip = self.after_proj(c) else: # load from previous c and produce the c for skip connection c = self.copied_block(c, y, t, mask) c_skip = self.after_proj(c) return c, c_skip # The implementation of ControlPixArtHalf net class ControlPixArtHalf(Module): # only support single res model def __init__(self, base_model: PixArt, copy_blocks_num: int = 13) -> None: super().__init__() self.base_model = base_model.eval() self.controlnet = [] self.copy_blocks_num = copy_blocks_num self.total_blocks_num = len(base_model.blocks) for p in self.base_model.parameters(): p.requires_grad_(False) # Copy first copy_blocks_num block for i in range(copy_blocks_num): self.controlnet.append(ControlT2IDitBlockHalf(base_model.blocks[i], i)) self.controlnet = nn.ModuleList(self.controlnet) def __getattr__(self, name: str) -> Tensor or Module: if name in ['forward', 'forward_with_dpmsolver', 'forward_with_cfg', 'forward_c', 'load_state_dict']: return self.__dict__[name] elif name in ['base_model', 'controlnet']: return super().__getattr__(name) else: return getattr(self.base_model, name) def forward_c(self, c): self.h, self.w = c.shape[-2]//self.patch_size, c.shape[-1]//self.patch_size pos_embed = torch.from_numpy(get_2d_sincos_pos_embed(self.pos_embed.shape[-1], (self.h, self.w), lewei_scale=self.lewei_scale, base_size=self.base_size)).unsqueeze(0).to(c.device).to(self.dtype) return self.x_embedder(c) + pos_embed if c is not None 
else c # def forward(self, x, t, c, **kwargs): # return self.base_model(x, t, c=self.forward_c(c), **kwargs) def forward(self, x, timestep, y, mask=None, data_info=None, c=None, **kwargs): # modify the original PixArtMS forward function if c is not None: c = c.to(self.dtype) c = self.forward_c(c) """ Forward pass of PixArt. x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images) t: (N,) tensor of diffusion timesteps y: (N, 1, 120, C) tensor of class labels """ x = x.to(self.dtype) timestep = timestep.to(self.dtype) y = y.to(self.dtype) pos_embed = self.pos_embed.to(self.dtype) self.h, self.w = x.shape[-2]//self.patch_size, x.shape[-1]//self.patch_size x = self.x_embedder(x) + pos_embed # (N, T, D), where T = H * W / patch_size ** 2 t = self.t_embedder(timestep.to(x.dtype)) # (N, D) t0 = self.t_block(t) y = self.y_embedder(y, self.training) # (N, 1, L, D) if mask is not None: if mask.shape[0] != y.shape[0]: mask = mask.repeat(y.shape[0] // mask.shape[0], 1) mask = mask.squeeze(1).squeeze(1) y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1]) y_lens = mask.sum(dim=1).tolist() else: y_lens = [y.shape[2]] * y.shape[0] y = y.squeeze(1).view(1, -1, x.shape[-1]) # define the first layer x = auto_grad_checkpoint(self.base_model.blocks[0], x, y, t0, y_lens, **kwargs) # (N, T, D) #support grad checkpoint if c is not None: # update c for index in range(1, self.copy_blocks_num + 1): c, c_skip = auto_grad_checkpoint(self.controlnet[index - 1], x, y, t0, y_lens, c, **kwargs) x = auto_grad_checkpoint(self.base_model.blocks[index], x + c_skip, y, t0, y_lens, **kwargs) # update x for index in range(self.copy_blocks_num + 1, self.total_blocks_num): x = auto_grad_checkpoint(self.base_model.blocks[index], x, y, t0, y_lens, **kwargs) else: for index in range(1, self.total_blocks_num): x = auto_grad_checkpoint(self.base_model.blocks[index], x, y, t0, y_lens, **kwargs) x = self.final_layer(x, t) # (N, T, patch_size ** 2 * 
out_channels) x = self.unpatchify(x) # (N, out_channels, H, W) return x def forward_with_dpmsolver(self, x, t, y, data_info, c, **kwargs): model_out = self.forward(x, t, y, data_info=data_info, c=c, **kwargs) return model_out.chunk(2, dim=1)[0] # def forward_with_dpmsolver(self, x, t, y, data_info, c, **kwargs): # return self.base_model.forward_with_dpmsolver(x, t, y, data_info=data_info, c=self.forward_c(c), **kwargs) def forward_with_cfg(self, x, t, y, cfg_scale, data_info, c, **kwargs): return self.base_model.forward_with_cfg(x, t, y, cfg_scale, data_info, c=self.forward_c(c), **kwargs) def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True): if all((k.startswith('base_model') or k.startswith('controlnet')) for k in state_dict.keys()): return super().load_state_dict(state_dict, strict) else: new_key = {} for k in state_dict.keys(): new_key[k] = re.sub(r"(blocks\.\d+)(.*)", r"\1.base_block\2", k) for k, v in new_key.items(): if k != v: print(f"replace {k} to {v}") state_dict[v] = state_dict.pop(k) return self.base_model.load_state_dict(state_dict, strict) def unpatchify(self, x): """ x: (N, T, patch_size**2 * C) imgs: (N, H, W, C) """ c = self.out_channels p = self.x_embedder.patch_size[0] assert self.h * self.w == x.shape[1] x = x.reshape(shape=(x.shape[0], self.h, self.w, p, p, c)) x = torch.einsum('nhwpqc->nchpwq', x) imgs = x.reshape(shape=(x.shape[0], c, self.h * p, self.w * p)) return imgs @property def dtype(self): # 返回模型参数的数据类型 return next(self.parameters()).dtype # The implementation for PixArtMS_Half + 1024 resolution class ControlPixArtMSHalf(ControlPixArtHalf): # support multi-scale res model (multi-scale model can also be applied to single reso training & inference) def __init__(self, base_model: PixArtMS, copy_blocks_num: int = 13) -> None: super().__init__(base_model=base_model, copy_blocks_num=copy_blocks_num) def forward(self, x, timestep, y, mask=None, data_info=None, c=None, **kwargs): # modify the original PixArtMS 
forward function """ Forward pass of PixArt. x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images) t: (N,) tensor of diffusion timesteps y: (N, 1, 120, C) tensor of class labels """ if c is not None: c = c.to(self.dtype) c = self.forward_c(c) bs = x.shape[0] x = x.to(self.dtype) timestep = timestep.to(self.dtype) y = y.to(self.dtype) c_size, ar = data_info['img_hw'].to(self.dtype), data_info['aspect_ratio'].to(self.dtype) self.h, self.w = x.shape[-2]//self.patch_size, x.shape[-1]//self.patch_size pos_embed = torch.from_numpy(get_2d_sincos_pos_embed(self.pos_embed.shape[-1], (self.h, self.w), lewei_scale=self.lewei_scale, base_size=self.base_size)).unsqueeze(0).to(x.device).to(self.dtype) x = self.x_embedder(x) + pos_embed # (N, T, D), where T = H * W / patch_size ** 2 t = self.t_embedder(timestep) # (N, D) csize = self.csize_embedder(c_size, bs) # (N, D) ar = self.ar_embedder(ar, bs) # (N, D) t = t + torch.cat([csize, ar], dim=1) t0 = self.t_block(t) y = self.y_embedder(y, self.training) # (N, D) if mask is not None: if mask.shape[0] != y.shape[0]: mask = mask.repeat(y.shape[0] // mask.shape[0], 1) mask = mask.squeeze(1).squeeze(1) y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1]) y_lens = mask.sum(dim=1).tolist() else: y_lens = [y.shape[2]] * y.shape[0] y = y.squeeze(1).view(1, -1, x.shape[-1]) # define the first layer x = auto_grad_checkpoint(self.base_model.blocks[0], x, y, t0, y_lens, **kwargs) # (N, T, D) #support grad checkpoint if c is not None: # update c for index in range(1, self.copy_blocks_num + 1): c, c_skip = auto_grad_checkpoint(self.controlnet[index - 1], x, y, t0, y_lens, c, **kwargs) x = auto_grad_checkpoint(self.base_model.blocks[index], x + c_skip, y, t0, y_lens, **kwargs) # update x for index in range(self.copy_blocks_num + 1, self.total_blocks_num): x = auto_grad_checkpoint(self.base_model.blocks[index], x, y, t0, y_lens, **kwargs) else: for index in range(1, 
def space_timesteps(num_timesteps, section_counts):
    """
    Create a list of timesteps to use from an original diffusion process,
    given the number of timesteps we want to take from equally-sized portions
    of the original process.

    For example, if there's 300 timesteps and the section counts are [10,15,20]
    then the first 100 timesteps are strided to be 10 timesteps, the second 100
    are strided to be 15 timesteps, and the final 100 are strided to be 20.

    If the stride is a string starting with "ddim", then the fixed striding
    from the DDIM paper is used, and only one section is allowed.

    :param num_timesteps: the number of diffusion steps in the original
                          process to divide up.
    :param section_counts: either a list of numbers, or a string containing
                           comma-separated numbers, indicating the step count
                           per section. As a special case, use "ddimN" where N
                           is a number of steps to use the striding from the
                           DDIM paper.
    :return: a set of diffusion steps from the original process to use.
    :raises ValueError: if no integer stride yields exactly N steps for
                        "ddimN", if ``section_counts`` is empty, or if a
                        section is asked for more steps than it contains.
    """
    if isinstance(section_counts, str):
        if section_counts.startswith("ddim"):
            desired_count = int(section_counts[len("ddim"):])
            # Look for the integer stride that yields exactly desired_count steps.
            for stride in range(1, num_timesteps):
                if len(range(0, num_timesteps, stride)) == desired_count:
                    return set(range(0, num_timesteps, stride))
            # Report the count that was requested, not the size of the process.
            raise ValueError(
                f"cannot create exactly {desired_count} steps with an integer stride"
            )
        section_counts = [int(x) for x in section_counts.split(",")]
    if len(section_counts) == 0:
        # Previously this surfaced as a bare ZeroDivisionError below.
        raise ValueError("section_counts must contain at least one section")
    size_per = num_timesteps // len(section_counts)
    extra = num_timesteps % len(section_counts)
    start_idx = 0
    all_steps = []
    for i, section_count in enumerate(section_counts):
        # The first `extra` sections absorb the remainder, one step each.
        size = size_per + (1 if i < extra else 0)
        if size < section_count:
            raise ValueError(
                f"cannot divide section of {size} steps into {section_count}"
            )
        # Fractional stride spreading the picks from the first to the last
        # step of the section (both included).
        frac_stride = 1 if section_count <= 1 else (size - 1) / (section_count - 1)
        cur_idx = 0.0
        taken_steps = []
        for _ in range(section_count):
            taken_steps.append(start_idx + round(cur_idx))
            cur_idx += frac_stride
        all_steps += taken_steps
        start_idx += size
    return set(all_steps)
""" def __init__(self, use_timesteps, **kwargs): self.use_timesteps = set(use_timesteps) self.timestep_map = [] self.original_num_steps = len(kwargs["betas"]) base_diffusion = GaussianDiffusion(**kwargs) # pylint: disable=missing-kwoa last_alpha_cumprod = 1.0 new_betas = [] for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod): if i in self.use_timesteps: new_betas.append(1 - alpha_cumprod / last_alpha_cumprod) last_alpha_cumprod = alpha_cumprod self.timestep_map.append(i) kwargs["betas"] = np.array(new_betas) super().__init__(**kwargs) def p_mean_variance( self, model, *args, **kwargs ): # pylint: disable=signature-differs return super().p_mean_variance(self._wrap_model(model), *args, **kwargs) def training_losses( self, model, *args, **kwargs ): # pylint: disable=signature-differs return super().training_losses(self._wrap_model(model), *args, **kwargs) def training_losses_diffusers( self, model, *args, **kwargs ): # pylint: disable=signature-differs return super().training_losses_diffusers(self._wrap_model(model), *args, **kwargs) def condition_mean(self, cond_fn, *args, **kwargs): return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs) def condition_score(self, cond_fn, *args, **kwargs): return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs) def _wrap_model(self, model): if isinstance(model, _WrappedModel): return model return _WrappedModel( model, self.timestep_map, self.original_num_steps ) def _scale_timesteps(self, t): # Scaling is done by the wrapped model. 
class _WrappedModel:
    """
    Callable adapter around a model that translates timestep indices.

    Incoming ``timestep`` values are used as indices into ``timestep_map``;
    the looked-up values are what the wrapped model receives.
    """

    def __init__(self, model, timestep_map, original_num_steps):
        self.model, self.timestep_map = model, timestep_map
        # Kept for reference; not read by __call__ (rescaling is disabled).
        self.original_num_steps = original_num_steps

    def __call__(self, x, timestep, **kwargs):
        # Build the lookup table on the same device and dtype as the indices.
        lookup = th.tensor(
            self.timestep_map,
            device=timestep.device,
            dtype=timestep.dtype,
        )
        return self.model(x, timestep=lookup[timestep], **kwargs)
For t in [0, T], we have: log_alpha_t = self.marginal_log_mean_coeff(t) sigma_t = self.marginal_std(t) lambda_t = self.marginal_lambda(t) Moreover, as lambda(t) is an invertible function, we also support its inverse function: t = self.inverse_lambda(lambda_t) =============================================================== We support both discrete-time DPMs (trained on n = 0, 1, ..., N-1) and continuous-time DPMs (trained on t in [t_0, T]). 1. For discrete-time DPMs: For discrete-time DPMs trained on n = 0, 1, ..., N-1, we convert the discrete steps to continuous time steps by: t_i = (i + 1) / N e.g. for N = 1000, we have t_0 = 1e-3 and T = t_{N-1} = 1. We solve the corresponding diffusion ODE from time T = 1 to time t_0 = 1e-3. Args: betas: A `torch.Tensor`. The beta array for the discrete-time DPM. (See the original DDPM paper for details) alphas_cumprod: A `torch.Tensor`. The cumprod alphas for the discrete-time DPM. (See the original DDPM paper for details) Note that we always have alphas_cumprod = cumprod(1 - betas). Therefore, we only need to set one of `betas` and `alphas_cumprod`. **Important**: Please pay special attention for the args for `alphas_cumprod`: The `alphas_cumprod` is the \hat{alpha_n} arrays in the notations of DDPM. Specifically, DDPMs assume that q_{t_n | 0}(x_{t_n} | x_0) = N ( \sqrt{\hat{alpha_n}} * x_0, (1 - \hat{alpha_n}) * I ). Therefore, the notation \hat{alpha_n} is different from the notation alpha_t in DPM-Solver. In fact, we have alpha_{t_n} = \sqrt{\hat{alpha_n}}, and log(alpha_{t_n}) = 0.5 * log(\hat{alpha_n}). 2. For continuous-time DPMs: We support two types of VPSDEs: linear (DDPM) and cosine (improved-DDPM). The hyperparameters for the noise schedule are the default settings in DDPM and improved-DDPM: Args: beta_min: A `float` number. The smallest beta for the linear schedule. beta_max: A `float` number. The largest beta for the linear schedule. cosine_s: A `float` number. The hyperparameter in the cosine schedule. 
cosine_beta_max: A `float` number. The hyperparameter in the cosine schedule. T: A `float` number. The ending time of the forward process. =============================================================== Args: schedule: A `str`. The noise schedule of the forward SDE. 'discrete' for discrete-time DPMs, 'linear' or 'cosine' for continuous-time DPMs. Returns: A wrapper object of the forward SDE (VP type). =============================================================== Example: # For discrete-time DPMs, given betas (the beta array for n = 0, 1, ..., N - 1): >>> ns = NoiseScheduleVP('discrete', betas=betas) # For discrete-time DPMs, given alphas_cumprod (the \hat{alpha_n} array for n = 0, 1, ..., N - 1): >>> ns = NoiseScheduleVP('discrete', alphas_cumprod=alphas_cumprod) # For continuous-time DPMs (VPSDE), linear schedule: >>> ns = NoiseScheduleVP('linear', continuous_beta_0=0.1, continuous_beta_1=20.) """ if schedule not in ['discrete', 'linear', 'cosine']: raise ValueError( f"Unsupported noise schedule {schedule}. The schedule needs to be 'discrete' or 'linear' or 'cosine'" ) self.schedule = schedule if schedule == 'discrete': if betas is not None: log_alphas = 0.5 * torch.log(1 - betas).cumsum(dim=0) else: assert alphas_cumprod is not None log_alphas = 0.5 * torch.log(alphas_cumprod) self.total_N = len(log_alphas) self.T = 1. self.t_array = torch.linspace(0., 1., self.total_N + 1)[1:].reshape((1, -1)).to(dtype=dtype) self.log_alpha_array = log_alphas.reshape((1, -1,)).to(dtype=dtype) else: self.total_N = 1000 self.beta_0 = continuous_beta_0 self.beta_1 = continuous_beta_1 self.cosine_s = 0.008 self.cosine_beta_max = 999. self.cosine_t_max = math.atan(self.cosine_beta_max * (1. + self.cosine_s) / math.pi) * 2. * ( 1. + self.cosine_s) / math.pi - self.cosine_s self.cosine_log_alpha_0 = math.log(math.cos(self.cosine_s / (1. + self.cosine_s) * math.pi / 2.)) self.schedule = schedule self.T = 0.9946 if schedule == 'cosine' else 1. 
def marginal_log_mean_coeff(self, t):
    """
    Return log(alpha_t) for the continuous-time label(s) `t` in [0, T].

    The formula depends on `self.schedule`:
      * 'discrete': looked up with `interpolate_fn` (defined elsewhere in this
        file) on the stored `t_array` / `log_alpha_array` grid.
      * 'linear':   -0.25 * t^2 * (beta_1 - beta_0) - 0.5 * t * beta_0
      * 'cosine':   log(cos((t + s) / (1 + s) * pi / 2)) - cosine_log_alpha_0
    """
    kind = self.schedule
    if kind == 'discrete':
        grid_t = self.t_array.to(t.device)
        grid_log_alpha = self.log_alpha_array.to(t.device)
        return interpolate_fn(t.reshape((-1, 1)), grid_t, grid_log_alpha).reshape((-1))
    if kind == 'linear':
        beta_gap = self.beta_1 - self.beta_0
        return -0.25 * t ** 2 * beta_gap - 0.5 * t * self.beta_0
    if kind == 'cosine':
        phase = (t + self.cosine_s) / (1. + self.cosine_s) * math.pi / 2.
        return torch.log(torch.cos(phase)) - self.cosine_log_alpha_0


def marginal_alpha(self, t):
    """
    Return alpha_t = exp(log(alpha_t)) for the continuous-time label(s) `t`.
    """
    log_alpha = self.marginal_log_mean_coeff(t)
    return torch.exp(log_alpha)


def marginal_std(self, t):
    """
    Return sigma_t = sqrt(1 - alpha_t^2) for the continuous-time label(s) `t`.
    """
    log_alpha = self.marginal_log_mean_coeff(t)
    return torch.sqrt(1. - torch.exp(2. * log_alpha))


def marginal_lambda(self, t):
    """
    Return the half-logSNR lambda_t = log(alpha_t) - log(sigma_t) for `t`.
    """
    log_alpha = self.marginal_log_mean_coeff(t)
    log_sigma = 0.5 * torch.log(1. - torch.exp(2. * log_alpha))
    return log_alpha - log_sigma


def inverse_lambda(self, lamb):
    """
    Return the continuous-time label t whose half-logSNR equals `lamb`.

    All branches first recover log(alpha) through the identity
    log(alpha) = -0.5 * log(1 + exp(-2 * lambda)) and then invert the
    schedule-specific log(alpha_t) formula.
    """
    if self.schedule == 'discrete':
        zero = torch.zeros((1,)).to(lamb.device)
        log_alpha = -0.5 * torch.logaddexp(zero, -2. * lamb)
        # The grids are flipped before being handed to interpolate_fn
        # (defined elsewhere in this file), with log_alpha as the x-axis.
        flipped_log_alpha = torch.flip(self.log_alpha_array.to(lamb.device), [1])
        flipped_t = torch.flip(self.t_array.to(lamb.device), [1])
        t = interpolate_fn(log_alpha.reshape((-1, 1)), flipped_log_alpha, flipped_t)
        return t.reshape((-1,))

    log_one_plus = torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
    if self.schedule == 'linear':
        beta_gap = self.beta_1 - self.beta_0
        tmp = 2. * beta_gap * log_one_plus
        discriminant = self.beta_0 ** 2 + tmp
        # Root of the quadratic in t, written in the cancellation-free form.
        return tmp / (torch.sqrt(discriminant) + self.beta_0) / beta_gap

    # Remaining case: cosine schedule.
    log_alpha = -0.5 * log_one_plus
    cos_value = torch.exp(log_alpha + self.cosine_log_alpha_0)
    return torch.arccos(cos_value) * 2. * (1. + self.cosine_s) / math.pi - self.cosine_s


def edm_sigma(self, t):
    """
    Return the EDM-style noise level sigma_t / alpha_t at time `t`.
    """
    std = self.marginal_std(t)
    return std / self.marginal_alpha(t)


def edm_inverse_sigma(self, edmsigma):
    """
    Return the time label t whose EDM-style noise level equals `edmsigma`.

    The ratio is converted to (alpha, sigma) with alpha^2 + sigma^2 = 1, then
    to a half-logSNR that is inverted with `inverse_lambda`.
    """
    alpha = 1 / (edmsigma ** 2 + 1).sqrt()
    sigma = alpha * edmsigma
    half_log_snr = torch.log(alpha / sigma)
    return self.inverse_lambda(half_log_snr)
def model_wrapper(
        model,
        noise_schedule,
        model_type="noise",
        model_kwargs=None,
        guidance_type="uncond",
        condition=None,
        unconditional_condition=None,
        guidance_scale=1.,
        classifier_fn=None,
        classifier_kwargs=None,
):
    """Create a wrapper function for the noise prediction model.

    Thanks to DPM-Solver for their code base.

    SA-Solver needs to solve the continuous-time diffusion SDEs. For DPMs trained on discrete-time labels, we need to
    firstly wrap the model function to a noise prediction model that accepts the continuous time as the input.

    We support four types of the diffusion model by setting `model_type`:
        1. "noise": noise prediction model. (Trained by predicting noise).
        2. "x_start": data prediction model. (Trained by predicting the data x_0 at time 0).
        3. "v": velocity prediction model. (Trained by predicting the velocity).
            The "v" prediction is derivation detailed in Appendix D of [1], and is used in Imagen-Video [2].
            [1] Salimans, Tim, and Jonathan Ho. "Progressive distillation for fast sampling of diffusion models."
                arXiv preprint arXiv:2202.00512 (2022).
            [2] Ho, Jonathan, et al. "Imagen Video: High Definition Video Generation with Diffusion Models."
                arXiv preprint arXiv:2210.02303 (2022).
        4. "score": marginal score function. (Trained by denoising score matching).
            Note that the score function and the noise prediction model follows a simple relationship:
            ```
                noise(x_t, t) = -sigma_t * score(x_t, t)
            ```

    We support three types of guided sampling by DPMs by setting `guidance_type`:
        1. "uncond": unconditional sampling by DPMs.
            The input `model` has the following format:
            ``
                model(x, t_input, **model_kwargs) -> noise | x_start | v | score
            ``
        2. "classifier": classifier guidance sampling [3] by DPMs and another classifier.
            The input `model` has the following format:
            ``
                model(x, t_input, **model_kwargs) -> noise | x_start | v | score
            ``
            The input `classifier_fn` has the following format:
            ``
                classifier_fn(x, t_input, cond, **classifier_kwargs) -> logits(x, t_input, cond)
            ``
            [3] P. Dhariwal and A. Q. Nichol, "Diffusion models beat GANs on image synthesis,"
                in Advances in Neural Information Processing Systems, vol. 34, 2021, pp. 8780-8794.
        3. "classifier-free": classifier-free guidance sampling by conditional DPMs.
            The input `model` has the following format:
            ``
                model(x, t_input, cond, **model_kwargs) -> noise | x_start | v | score
            ``
            And if cond == `unconditional_condition`, the model output is the unconditional DPM output.
            [4] Ho, Jonathan, and Tim Salimans. "Classifier-free diffusion guidance."
                arXiv preprint arXiv:2207.12598 (2022).

    The `t_input` is the time label of the model, which may be discrete-time labels (i.e. 0 to 999)
    or continuous-time labels (i.e. epsilon to T).

    We wrap the model function to accept only `x` and `t_continuous` as inputs, and outputs the predicted noise:
    ``
        def model_fn(x, t_continuous) -> noise:
            t_input = get_model_input_time(t_continuous)
            return noise_pred(model, x, t_input, **model_kwargs)
    ``
    where `t_continuous` is the continuous time labels (i.e. epsilon to T). And we use `model_fn` for SA-Solver.

    ===============================================================

    Args:
        model: A diffusion model with the corresponding format described above.
        noise_schedule: A noise schedule object, such as NoiseScheduleVP.
        model_type: A `str`. The parameterization type of the diffusion model.
                    "noise" or "x_start" or "v" or "score".
        model_kwargs: A `dict` (or None for no extra inputs). Other inputs of the model function.
        guidance_type: A `str`. The type of the guidance for sampling.
                    "uncond" or "classifier" or "classifier-free".
        condition: A pytorch tensor. The condition for the guided sampling.
                    Only used for "classifier" or "classifier-free" guidance type.
        unconditional_condition: A pytorch tensor. The condition for the unconditional sampling.
                    Only used for "classifier-free" guidance type.
        guidance_scale: A `float`. The scale for the guided sampling.
        classifier_fn: A classifier function. Only used for the classifier guidance.
        classifier_kwargs: A `dict` (or None for no extra inputs). Other inputs of the classifier function.
    Returns:
        A noise prediction model that accepts the noised data and the continuous time as the inputs.
    Raises:
        AssertionError: if `model_type` or `guidance_type` is not one of the supported values.
    """
    # Fail fast on unsupported options, before any closure is built.
    assert model_type in ["noise", "x_start", "v", "score"]
    assert guidance_type in ["uncond", "classifier", "classifier-free"]

    # None stands for "no extra keyword arguments"; this avoids sharing one
    # mutable default dict between every call of model_wrapper.
    model_kwargs = {} if model_kwargs is None else model_kwargs
    classifier_kwargs = {} if classifier_kwargs is None else classifier_kwargs

    def get_model_input_time(t_continuous):
        """
        Convert the continuous-time `t_continuous` (in [epsilon, T]) to the model input time.
        For discrete-time DPMs, we convert `t_continuous` in [1 / N, 1] to `t_input` in [0, 1000 * (N - 1) / N].
        For continuous-time DPMs, we just use `t_continuous`.
        """
        if noise_schedule.schedule == 'discrete':
            return (t_continuous - 1. / noise_schedule.total_N) * 1000.
        return t_continuous

    def noise_pred_fn(x, t_continuous, cond=None):
        """
        Evaluate `model` and convert its output to a noise prediction.

        Only the first entry of alpha_t / sigma_t is used for the conversion,
        i.e. every sample of the batch is assumed to share the same time.
        """
        t_input = get_model_input_time(t_continuous)
        if cond is None:
            output = model(x, t_input, **model_kwargs)
        else:
            output = model(x, t_input, cond, **model_kwargs)
        if model_type == "noise":
            return output
        elif model_type == "x_start":
            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
            return (x - alpha_t[0] * output) / sigma_t[0]
        elif model_type == "v":
            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
            return alpha_t[0] * output + sigma_t[0] * x
        elif model_type == "score":
            sigma_t = noise_schedule.marginal_std(t_continuous)
            return -sigma_t[0] * output

    def cond_grad_fn(x, t_input):
        """
        Compute the gradient of the classifier, i.e. nabla_{x} log p_t(cond | x_t).
        """
        with torch.enable_grad():
            x_in = x.detach().requires_grad_(True)
            log_prob = classifier_fn(x_in, t_input, condition, **classifier_kwargs)
            return torch.autograd.grad(log_prob.sum(), x_in)[0]

    def model_fn(x, t_continuous):
        """
        The noise prediction model function that is used for the solver.
        """
        if guidance_type == "uncond":
            return noise_pred_fn(x, t_continuous)
        elif guidance_type == "classifier":
            assert classifier_fn is not None
            t_input = get_model_input_time(t_continuous)
            cond_grad = cond_grad_fn(x, t_input)
            sigma_t = noise_schedule.marginal_std(t_continuous)
            # sigma_t carries one value per sample; give it trailing singleton
            # axes so it scales each sample of cond_grad instead of being
            # broadcast against the last data axis.
            sigma_t = sigma_t.reshape(-1, *([1] * (x.dim() - 1)))
            noise = noise_pred_fn(x, t_continuous)
            return noise - guidance_scale * sigma_t * cond_grad
        elif guidance_type == "classifier-free":
            if guidance_scale == 1. or unconditional_condition is None:
                return noise_pred_fn(x, t_continuous, cond=condition)
            # One batched forward pass: first half unconditional, second half conditional.
            x_in = torch.cat([x] * 2)
            t_in = torch.cat([t_continuous] * 2)
            c_in = torch.cat([unconditional_condition, condition])
            noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in).chunk(2)
            return noise_uncond + guidance_scale * (noise - noise_uncond)

    return model_fn
For details, please see Appendix A.2.4 in SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf """ self.model = lambda x, t: model_fn(x, t.expand((x.shape[0]))) self.noise_schedule = noise_schedule assert algorithm_type in ["data_prediction", "noise_prediction"] if correcting_x0_fn == "dynamic_thresholding": self.correcting_x0_fn = self.dynamic_thresholding_fn else: self.correcting_x0_fn = correcting_x0_fn self.correcting_xt_fn = correcting_xt_fn self.dynamic_thresholding_ratio = dynamic_thresholding_ratio self.thresholding_max_val = thresholding_max_val self.predict_x0 = algorithm_type == "data_prediction" self.sigma_min = float(self.noise_schedule.edm_sigma(torch.tensor([1e-3]))) self.sigma_max = float(self.noise_schedule.edm_sigma(torch.tensor([1]))) def dynamic_thresholding_fn(self, x0, t=None): """ The dynamic thresholding method. """ dims = x0.dim() p = self.dynamic_thresholding_ratio s = torch.quantile(torch.abs(x0).reshape((x0.shape[0], -1)), p, dim=1) s = expand_dims(torch.maximum(s, self.thresholding_max_val * torch.ones_like(s).to(s.device)), dims) x0 = torch.clamp(x0, -s, s) / s return x0 def noise_prediction_fn(self, x, t): """ Return the noise prediction model. """ return self.model(x, t) def data_prediction_fn(self, x, t): """ Return the data prediction model (with corrector). """ noise = self.noise_prediction_fn(x, t) alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t) x0 = (x - sigma_t * noise) / alpha_t if self.correcting_x0_fn is not None: x0 = self.correcting_x0_fn(x0) return x0 def model_fn(self, x, t): """ Convert the model to the noise prediction model or the data prediction model. """ if self.predict_x0: return self.data_prediction_fn(x, t) else: return self.noise_prediction_fn(x, t) def get_time_steps(self, skip_type, t_T, t_0, N, order, device): """Compute the intermediate time steps for sampling. 
""" if skip_type == 'logSNR': lambda_T = self.noise_schedule.marginal_lambda(torch.tensor(t_T).to(device)) lambda_0 = self.noise_schedule.marginal_lambda(torch.tensor(t_0).to(device)) logSNR_steps = lambda_T + torch.linspace(torch.tensor(0.).cpu().item(), (lambda_0 - lambda_T).cpu().item() ** (1. / order), N + 1).pow( order).to(device) return self.noise_schedule.inverse_lambda(logSNR_steps) elif skip_type == 'time': t = torch.linspace(t_T ** (1. / order), t_0 ** (1. / order), N + 1).pow(order).to(device) return t elif skip_type == 'karras': sigma_min = max(0.002, self.sigma_min) sigma_max = min(80, self.sigma_max) sigma_steps = torch.linspace(sigma_max ** (1. / 7), sigma_min ** (1. / 7), N + 1).pow(7).to(device) return self.noise_schedule.edm_inverse_sigma(sigma_steps) else: raise ValueError( f"Unsupported skip_type {skip_type}, need to be 'logSNR' or 'time' or 'karras'" ) def denoise_to_zero_fn(self, x, s): """ Denoise at the final step, which is equivalent to solve the ODE from lambda_s to infty by first-order discretization. """ return self.data_prediction_fn(x, s) def get_coefficients_exponential_negative(self, order, interval_start, interval_end): """ Calculate the integral of exp(-x) * x^order dx from interval_start to interval_end For calculating the coefficient of gradient terms after the lagrange interpolation, see Eq.(15) and Eq.(18) in SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf For noise_prediction formula. 
""" assert order in [0, 1, 2, 3], "order is only supported for 0, 1, 2 and 3" if order == 0: return torch.exp(-interval_end) * (torch.exp(interval_end - interval_start) - 1) elif order == 1: return torch.exp(-interval_end) * ( (interval_start + 1) * torch.exp(interval_end - interval_start) - (interval_end + 1)) elif order == 2: return torch.exp(-interval_end) * ( (interval_start ** 2 + 2 * interval_start + 2) * torch.exp(interval_end - interval_start) - ( interval_end ** 2 + 2 * interval_end + 2)) elif order == 3: return torch.exp(-interval_end) * ( (interval_start ** 3 + 3 * interval_start ** 2 + 6 * interval_start + 6) * torch.exp( interval_end - interval_start) - (interval_end ** 3 + 3 * interval_end ** 2 + 6 * interval_end + 6)) def get_coefficients_exponential_positive(self, order, interval_start, interval_end, tau): """ Calculate the integral of exp(x(1+tau^2)) * x^order dx from interval_start to interval_end For calculating the coefficient of gradient terms after the lagrange interpolation, see Eq.(15) and Eq.(18) in SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf For data_prediction formula. 
""" assert order in [0, 1, 2, 3], "order is only supported for 0, 1, 2 and 3" # after change of variable(cov) interval_end_cov = (1 + tau ** 2) * interval_end interval_start_cov = (1 + tau ** 2) * interval_start if order == 0: return torch.exp(interval_end_cov) * (1 - torch.exp(-(interval_end_cov - interval_start_cov))) / ( (1 + tau ** 2)) elif order == 1: return torch.exp(interval_end_cov) * ((interval_end_cov - 1) - (interval_start_cov - 1) * torch.exp( -(interval_end_cov - interval_start_cov))) / ((1 + tau ** 2) ** 2) elif order == 2: return torch.exp(interval_end_cov) * ((interval_end_cov ** 2 - 2 * interval_end_cov + 2) - ( interval_start_cov ** 2 - 2 * interval_start_cov + 2) * torch.exp( -(interval_end_cov - interval_start_cov))) / ((1 + tau ** 2) ** 3) elif order == 3: return torch.exp(interval_end_cov) * ( (interval_end_cov ** 3 - 3 * interval_end_cov ** 2 + 6 * interval_end_cov - 6) - ( interval_start_cov ** 3 - 3 * interval_start_cov ** 2 + 6 * interval_start_cov - 6) * torch.exp( -(interval_end_cov - interval_start_cov))) / ((1 + tau ** 2) ** 4) def lagrange_polynomial_coefficient(self, order, lambda_list): """ Calculate the coefficient of lagrange polynomial For lagrange interpolation """ assert order in [0, 1, 2, 3] assert order == len(lambda_list) - 1 if order == 0: return [[1]] elif order == 1: return [[1 / (lambda_list[0] - lambda_list[1]), -lambda_list[1] / (lambda_list[0] - lambda_list[1])], [1 / (lambda_list[1] - lambda_list[0]), -lambda_list[0] / (lambda_list[1] - lambda_list[0])]] elif order == 2: denominator1 = (lambda_list[0] - lambda_list[1]) * (lambda_list[0] - lambda_list[2]) denominator2 = (lambda_list[1] - lambda_list[0]) * (lambda_list[1] - lambda_list[2]) denominator3 = (lambda_list[2] - lambda_list[0]) * (lambda_list[2] - lambda_list[1]) return [[1 / denominator1, (-lambda_list[1] - lambda_list[2]) / denominator1, lambda_list[1] * lambda_list[2] / denominator1], [1 / denominator2, (-lambda_list[0] - lambda_list[2]) / denominator2, 
def lagrange_polynomial_coefficient(self, order, lambda_list):
    """
    Calculate the coefficients of the lagrange basis polynomials.

    Row i of the result holds the coefficients of the i-th basis polynomial
    over the nodes in `lambda_list`, ordered from the highest power of the
    variable down to the constant term.
    """
    assert order in [0, 1, 2, 3]
    assert order == len(lambda_list) - 1
    if order == 0:
        return [[1]]

    rows = []
    for idx, node in enumerate(lambda_list):
        # Every node except the current one, in their original order.
        others = [lam for pos, lam in enumerate(lambda_list) if pos != idx]
        if order == 1:
            (a,) = others
            rows.append([1 / (node - a), -a / (node - a)])
        elif order == 2:
            a, b = others
            denominator = (node - a) * (node - b)
            rows.append([1 / denominator,
                         (-a - b) / denominator,
                         a * b / denominator])
        else:
            a, b, c = others
            denominator = (node - a) * (node - b) * (node - c)
            rows.append([1 / denominator,
                         (-a - b - c) / denominator,
                         (a * b + a * c + b * c) / denominator,
                         (-a * b * c) / denominator])
    return rows


def get_coefficients_fn(self, order, interval_start, interval_end, lambda_list, tau):
    """
    Calculate the coefficients of the gradient terms.

    Each lagrange basis polynomial over `lambda_list` is integrated on
    [interval_start, interval_end] against the exponential weight of the
    active formula: the data-prediction weight (which depends on `tau`) when
    `self.predict_x0` is set, the noise-prediction weight otherwise.

    Returns a list with one coefficient per entry of `lambda_list`.
    """
    assert order in [1, 2, 3, 4]
    assert order == len(lambda_list), 'the length of lambda list must be equal to the order'

    basis_rows = self.lagrange_polynomial_coefficient(order - 1, lambda_list)

    def power_integral(power):
        # Integral of x^power times the exponential weight over the interval.
        if self.predict_x0:
            return self.get_coefficients_exponential_positive(power, interval_start, interval_end, tau)
        return self.get_coefficients_exponential_negative(power, interval_start, interval_end)

    # Column j of a basis row multiplies x^(order - 1 - j).
    integrals = [power_integral(order - 1 - j) for j in range(order)]
    coefficients = [
        sum(basis_rows[i][j] * integrals[j] for j in range(order))
        for i in range(order)
    ]
    assert len(coefficients) == order, 'the length of coefficients does not match the order'
    return coefficients
def adams_bashforth_update(self, order, x, tau, model_prev_list, t_prev_list, noise, t):
    """
    SA-Predictor, without the "rescaling" trick in Appendix D in SA-Solver paper
    https://arxiv.org/pdf/2309.05019.pdf

    Advances `x` from time `t_prev_list[-1]` to time `t` using the last
    `order` entries of `model_prev_list` / `t_prev_list`, the stochasticity
    value `tau` and the Gaussian sample `noise`.
    """
    assert order in [1, 2, 3, 4], "order of stochastic adams bashforth method is only supported for 1, 2, 3 and 4"

    # get noise schedule
    schedule = self.noise_schedule
    t_last = t_prev_list[-1]
    alpha_t = schedule.marginal_alpha(t)
    sigma_t = schedule.marginal_std(t)
    lambda_t = schedule.marginal_lambda(t)
    alpha_prev = schedule.marginal_alpha(t_last)
    sigma_prev = schedule.marginal_std(t_last)
    h = lambda_t - schedule.marginal_lambda(t_last)

    # Interpolation nodes: the `order` most recent times, newest first.
    lambda_list = [schedule.marginal_lambda(t_prev_list[-(k + 1)]) for k in range(order)]
    gradient_coefficients = self.get_coefficients_fn(order, schedule.marginal_lambda(t_last), lambda_t,
                                                     lambda_list, tau)

    if self.predict_x0:
        scale = (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t)
        carried = torch.exp(-tau ** 2 * h) * (sigma_t / sigma_prev) * x
        noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * noise
    else:
        scale = -(1 + tau ** 2) * alpha_t
        carried = (alpha_t / alpha_prev) * x
        noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise

    gradient_part = torch.zeros_like(x)
    for k in range(order):
        gradient_part += scale * gradient_coefficients[k] * model_prev_list[-(k + 1)]

    return carried + gradient_part + noise_part


def adams_moulton_update(self, order, x, tau, model_prev_list, t_prev_list, noise, t):
    """
    SA-Corrector, without the "rescaling" trick in Appendix D in SA-Solver paper
    https://arxiv.org/pdf/2309.05019.pdf

    Same update as the predictor, except that the interpolation nodes also
    include the target time `t`; the newest entry of `model_prev_list` is
    therefore paired with `t` itself.
    """
    assert order in [1, 2, 3, 4], "order of stochastic adams bashforth method is only supported for 1, 2, 3 and 4"

    # get noise schedule
    schedule = self.noise_schedule
    t_last = t_prev_list[-1]
    alpha_t = schedule.marginal_alpha(t)
    sigma_t = schedule.marginal_std(t)
    lambda_t = schedule.marginal_lambda(t)
    alpha_prev = schedule.marginal_alpha(t_last)
    sigma_prev = schedule.marginal_std(t_last)
    h = lambda_t - schedule.marginal_lambda(t_last)

    # Interpolation nodes: target time first, then the most recent past times.
    t_list = t_prev_list + [t]
    lambda_list = [schedule.marginal_lambda(t_list[-(k + 1)]) for k in range(order)]
    gradient_coefficients = self.get_coefficients_fn(order, schedule.marginal_lambda(t_last), lambda_t,
                                                     lambda_list, tau)

    if self.predict_x0:
        scale = (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t)
        carried = torch.exp(-tau ** 2 * h) * (sigma_t / sigma_prev) * x
        noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * noise
    else:
        scale = -(1 + tau ** 2) * alpha_t
        carried = (alpha_t / alpha_prev) * x
        noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise

    gradient_part = torch.zeros_like(x)
    for k in range(order):
        gradient_part += scale * gradient_coefficients[k] * model_prev_list[-(k + 1)]

    return carried + gradient_part + noise_part
def adams_bashforth_update_few_steps(self, order, x, tau, model_prev_list, t_prev_list, noise, t):
    """
    SA-Predictor, with the "rescaling" trick in Appendix D in SA-Solver paper
    https://arxiv.org/pdf/2309.05019.pdf

    Identical to the plain predictor update except that, in data-prediction
    mode with order 2, the two gradient coefficients are shifted by an extra
    term before being applied.
    """
    assert order in [1, 2, 3, 4], "order of stochastic adams bashforth method is only supported for 1, 2, 3 and 4"

    # get noise schedule
    schedule = self.noise_schedule
    t_last = t_prev_list[-1]
    alpha_t = schedule.marginal_alpha(t)
    sigma_t = schedule.marginal_std(t)
    lambda_t = schedule.marginal_lambda(t)
    alpha_prev = schedule.marginal_alpha(t_last)
    sigma_prev = schedule.marginal_std(t_last)
    h = lambda_t - schedule.marginal_lambda(t_last)

    # Interpolation nodes: the `order` most recent times, newest first.
    lambda_list = [schedule.marginal_lambda(t_prev_list[-(k + 1)]) for k in range(order)]
    gradient_coefficients = self.get_coefficients_fn(order, schedule.marginal_lambda(t_last), lambda_t,
                                                     lambda_list, tau)

    if self.predict_x0 and order == 2:
        ## if order = 2 we do a modification that does not influence the convergence order similar to unipc.
        ## Note: This is used only for few steps sampling.
        # The added term is O(h^3). Empirically we find it will slightly improve the image quality.
        # With tau = 0 (ODE case) it reduces to
        #   exp(lambda_t) * (h ** 2 / 2 - (h - 1 + exp(-h))) / (lambda_prev1 - lambda_prev2)
        growth = 1 + tau ** 2
        lambda_gap = schedule.marginal_lambda(t_prev_list[-1]) - schedule.marginal_lambda(t_prev_list[-2])
        extra = 1.0 * torch.exp(growth * lambda_t) * (
                h ** 2 / 2 - (h * growth - 1 + torch.exp(growth * (-h))) / (growth ** 2)) / lambda_gap
        gradient_coefficients[0] += extra
        gradient_coefficients[1] -= extra

    if self.predict_x0:
        scale = (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t)
        carried = torch.exp(-tau ** 2 * h) * (sigma_t / sigma_prev) * x
        noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * noise
    else:
        scale = -(1 + tau ** 2) * alpha_t
        carried = (alpha_t / alpha_prev) * x
        noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise

    gradient_part = torch.zeros_like(x)
    for k in range(order):
        gradient_part += scale * gradient_coefficients[k] * model_prev_list[-(k + 1)]

    return carried + gradient_part + noise_part
def adams_moulton_update_few_steps(self, order, x, tau, model_prev_list, t_prev_list, noise, t):
    """
    SA-Corrector variant used by the few-steps sampler, see Appendix D in SA-Solver paper
    https://arxiv.org/pdf/2309.05019.pdf

    Identical to the plain corrector update except that, in data-prediction
    mode with order 2, the two gradient coefficients are shifted by an extra
    term before being applied.
    """
    assert order in [1, 2, 3, 4], "order of stochastic adams bashforth method is only supported for 1, 2, 3 and 4"

    # get noise schedule
    schedule = self.noise_schedule
    t_last = t_prev_list[-1]
    alpha_t = schedule.marginal_alpha(t)
    sigma_t = schedule.marginal_std(t)
    lambda_t = schedule.marginal_lambda(t)
    alpha_prev = schedule.marginal_alpha(t_last)
    sigma_prev = schedule.marginal_std(t_last)
    h = lambda_t - schedule.marginal_lambda(t_last)

    # Interpolation nodes: target time first, then the most recent past times.
    t_list = t_prev_list + [t]
    lambda_list = [schedule.marginal_lambda(t_list[-(k + 1)]) for k in range(order)]
    gradient_coefficients = self.get_coefficients_fn(order, schedule.marginal_lambda(t_last), lambda_t,
                                                     lambda_list, tau)

    if self.predict_x0 and order == 2:
        ## if order = 2 we do a modification that does not influence the convergence order similar to UniPC.
        ## Note: This is used only for few steps sampling.
        # The added term is O(h^3). Empirically we find it will slightly improve the image quality.
        # With tau = 0 (ODE case) it reduces to
        #   exp(lambda_t) * (h / 2 - (h - 1 + exp(-h)) / h)
        growth = 1 + tau ** 2
        extra = 1.0 * torch.exp(growth * lambda_t) * (
                h / 2 - (h * growth - 1 + torch.exp(growth * (-h))) / (growth ** 2 * h))
        gradient_coefficients[0] += extra
        gradient_coefficients[1] -= extra

    if self.predict_x0:
        scale = (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t)
        carried = torch.exp(-tau ** 2 * h) * (sigma_t / sigma_prev) * x
        noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * noise
    else:
        scale = -(1 + tau ** 2) * alpha_t
        carried = (alpha_t / alpha_prev) * x
        noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise

    gradient_part = torch.zeros_like(x)
    for k in range(order):
        gradient_part += scale * gradient_coefficients[k] * model_prev_list[-(k + 1)]

    return carried + gradient_part + noise_part
def sample_few_steps(self, x, tau, steps=5, t_start=None, t_end=None, skip_type='time', skip_order=1,
                     predictor_order=3, corrector_order=4, pc_mode='PEC', return_intermediate=False
                     ):
    """
    Run the predictor-corrector sampler using the `*_few_steps` update rules.

    For the PC-mode, please refer to the wiki page
    https://en.wikipedia.org/wiki/Predictor%E2%80%93corrector_method#PEC_mode_and_PECE_mode
    'PEC' needs one model evaluation per step while 'PECE' needs two model evaluations
    We recommend use pc_mode='PEC' for NFEs is limited. 'PECE' mode is only for test with sufficient NFEs.

    With the flags hard-coded below, the last step is a pure predictor step
    with tau = 0 (no model evaluation, no corrector), and no extra
    denoise-to-zero pass is applied afterwards.

    Args:
        x: Initial sample; its device is used for the time grid and new noise has its shape.
        tau: Callable evaluated as `tau(t)` at every step to obtain the stochasticity value.
        steps: Number of solver steps (the time grid has `steps + 1` points).
        t_start: Start time; defaults to `self.noise_schedule.T`.
        t_end: End time; defaults to `1. / self.noise_schedule.total_N`.
        skip_type: Time-grid spacing passed to `get_time_steps`.
        skip_order: `order` argument passed to `get_time_steps`.
        predictor_order: Maximum order of the predictor.
        corrector_order: Maximum order of the corrector; 0 disables the corrector.
        pc_mode: 'PEC' or 'PECE'.
        return_intermediate: If True, also return the list of intermediate samples.
    Returns:
        `x`, or `(x, intermediates)` when `return_intermediate` is True.
    """
    # Fixed behaviour switches for this sampler variant.
    skip_first_step = False
    skip_final_step = True
    lower_order_final = True
    denoise_to_zero = False

    assert pc_mode in ['PEC', 'PECE'], 'Predictor-corrector mode only supports PEC and PECE'
    t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end
    t_T = self.noise_schedule.T if t_start is None else t_start
    assert t_0 > 0 and t_T > 0, "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"

    device = x.device
    intermediates = []
    with torch.no_grad():
        # There must be enough steps to build up the multistep history.
        assert steps >= max(predictor_order, corrector_order - 1)
        timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, order=skip_order,
                                        device=device)
        assert timesteps.shape[0] - 1 == steps
        # Init the initial values.
        step = 0
        t = timesteps[step]
        # This draw is not used below, but it advances the random generator.
        noise = torch.randn_like(x)
        t_prev_list = [t]
        # do not evaluate if skip_first_step
        if skip_first_step:
            if self.predict_x0:
                alpha_t = self.noise_schedule.marginal_alpha(t)
                sigma_t = self.noise_schedule.marginal_std(t)
                model_prev_list = [(1 - sigma_t) / alpha_t * x]
            else:
                model_prev_list = [x]
        else:
            model_prev_list = [self.model_fn(x, t)]

        if self.correcting_xt_fn is not None:
            x = self.correcting_xt_fn(x, t, step)
        if return_intermediate:
            intermediates.append(x)

        # determine the first several values
        # (warm-up: orders grow with the number of stored model outputs)
        for step in tqdm(range(1, max(predictor_order, corrector_order - 1))):

            t = timesteps[step]
            predictor_order_used = min(predictor_order, step)
            corrector_order_used = min(corrector_order, step + 1)
            # The same noise sample is shared by the predictor and the corrector of this step.
            noise = torch.randn_like(x)
            # predictor step
            x_p = self.adams_bashforth_update_few_steps(order=predictor_order_used, x=x, tau=tau(t),
                                                        model_prev_list=model_prev_list,
                                                        t_prev_list=t_prev_list, noise=noise, t=t)
            # evaluation step
            model_x = self.model_fn(x_p, t)

            # update model_list
            model_prev_list.append(model_x)
            # corrector step
            if corrector_order > 0:
                x = self.adams_moulton_update_few_steps(order=corrector_order_used, x=x, tau=tau(t),
                                                        model_prev_list=model_prev_list,
                                                        t_prev_list=t_prev_list, noise=noise, t=t)
            else:
                x = x_p

            # evaluation step if correction and mode = pece
            # (the stored predictor-point evaluation is replaced by one at the corrected x)
            if corrector_order > 0 and pc_mode == 'PECE':
                model_x = self.model_fn(x, t)
                del model_prev_list[-1]
                model_prev_list.append(model_x)

            if self.correcting_xt_fn is not None:
                x = self.correcting_xt_fn(x, t, step)
            if return_intermediate:
                intermediates.append(x)

            t_prev_list.append(t)

        # Main loop: full history available; orders shrink again near the end
        # when lower_order_final is set.
        for step in tqdm(range(max(predictor_order, corrector_order - 1), steps + 1)):
            if lower_order_final:
                predictor_order_used = min(predictor_order, steps - step + 1)
                corrector_order_used = min(corrector_order, steps - step + 2)

            else:
                predictor_order_used = predictor_order
                corrector_order_used = corrector_order
            t = timesteps[step]
            noise = torch.randn_like(x)

            # predictor step
            # (the very last step is taken with tau = 0 when it is not followed by a denoise-to-zero pass)
            if skip_final_step and step == steps and not denoise_to_zero:
                x_p = self.adams_bashforth_update_few_steps(order=predictor_order_used, x=x, tau=0,
                                                            model_prev_list=model_prev_list,
                                                            t_prev_list=t_prev_list, noise=noise, t=t)
            else:
                x_p = self.adams_bashforth_update_few_steps(order=predictor_order_used, x=x, tau=tau(t),
                                                            model_prev_list=model_prev_list,
                                                            t_prev_list=t_prev_list, noise=noise, t=t)

            # evaluation step
            # do not evaluate if skip_final_step and step = steps
            if not skip_final_step or step < steps:
                model_x = self.model_fn(x_p, t)

            # update model_list
            # do not update if skip_final_step and step = steps
            if not skip_final_step or step < steps:
                model_prev_list.append(model_x)

            # corrector step
            # do not correct if skip_final_step and step = steps
            if corrector_order > 0 and (not skip_final_step or step < steps):
                x = self.adams_moulton_update_few_steps(order=corrector_order_used, x=x, tau=tau(t),
                                                        model_prev_list=model_prev_list,
                                                        t_prev_list=t_prev_list, noise=noise, t=t)

            else:
                x = x_p

            # evaluation step if mode = pece and step != steps
            if corrector_order > 0 and (pc_mode == 'PECE' and step < steps):
                model_x = self.model_fn(x, t)
                del model_prev_list[-1]
                model_prev_list.append(model_x)

            if self.correcting_xt_fn is not None:
                x = self.correcting_xt_fn(x, t, step)
            if return_intermediate:
                intermediates.append(x)

            t_prev_list.append(t)
            # Drop the oldest stored model output.
            del model_prev_list[0]

        if denoise_to_zero:
            t = torch.ones((1,)).to(device) * t_0
            x = self.denoise_to_zero_fn(x, t)
            if self.correcting_xt_fn is not None:
                x = self.correcting_xt_fn(x, t, step + 1)
            if return_intermediate:
                intermediates.append(x)
    return (x, intermediates) if return_intermediate else x
def sample_more_steps(self, x, tau, steps=20, t_start=None, t_end=None, skip_type='time', skip_order=1,
                      predictor_order=3, corrector_order=4, pc_mode='PEC', return_intermediate=False
                      ):
    """
    Run the multistep predictor-corrector sampler in its 'more_steps' configuration.

    For the PC-mode, please refer to the wiki page
    https://en.wikipedia.org/wiki/Predictor%E2%80%93corrector_method#PEC_mode_and_PECE_mode
    'PEC' needs one model evaluation per step while 'PECE' needs two model evaluations.
    We recommend pc_mode='PEC' when the number of function evaluations (NFEs) is limited;
    'PECE' mode is only intended for tests with sufficient NFEs.

    Four behaviour flags are hard-coded at the top of the body (no first/last step skipping,
    lower order near the end, and a final denoise-to-zero call). With these values
    `self.model_fn` is called `steps + 1` times in 'PEC' mode, followed by one call to
    `self.denoise_to_zero_fn`.

    Args:
        x: starting tensor; its device is used for the time grid and `torch.randn_like(x)`
            supplies fresh noise at every step.
        tau: callable evaluated as `tau(t)` at each step and forwarded to the update rules.
        steps: number of solver steps; must be >= max(predictor_order, corrector_order - 1).
        t_start: start time; defaults to `self.noise_schedule.T`.
        t_end: end time; defaults to `1 / self.noise_schedule.total_N`.
        skip_type, skip_order: forwarded to `self.get_time_steps`.
        predictor_order: maximum order passed to `self.adams_bashforth_update`.
        corrector_order: maximum order passed to `self.adams_moulton_update`;
            a value of 0 disables the corrector.
        pc_mode: 'PEC' or 'PECE'.
        return_intermediate: when True, also return the list of intermediate `x` values.

    Returns:
        `x`, or `(x, intermediates)` when `return_intermediate` is True.
    """
    # Fixed configuration of this variant; the branches below that depend on other
    # values of these flags are currently unreachable.
    skip_first_step = False
    skip_final_step = False
    lower_order_final = True
    denoise_to_zero = True

    assert pc_mode in ['PEC', 'PECE'], 'Predictor-corrector mode only supports PEC and PECE'
    t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end
    t_T = self.noise_schedule.T if t_start is None else t_start
    assert t_0 > 0 and t_T > 0, "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"

    device = x.device
    intermediates = []
    with torch.no_grad():
        assert steps >= max(predictor_order, corrector_order - 1)
        timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, order=skip_order, device=device)
        # The grid holds steps + 1 time points (both ends included).
        assert timesteps.shape[0] - 1 == steps
        # Init the initial values.
        step = 0
        t = timesteps[step]
        noise = torch.randn_like(x)
        t_prev_list = [t]
        # do not evaluate if skip_first_step
        if skip_first_step:
            if self.predict_x0:
                alpha_t = self.noise_schedule.marginal_alpha(t)
                sigma_t = self.noise_schedule.marginal_std(t)
                model_prev_list = [(1 - sigma_t) / alpha_t * x]
            else:
                model_prev_list = [x]
        else:
            model_prev_list = [self.model_fn(x, t)]

        if self.correcting_xt_fn is not None:
            x = self.correcting_xt_fn(x, t, step)
        if return_intermediate:
            intermediates.append(x)

        # determine the first several values
        # Warm-up: the usable order grows with the number of stored model outputs, and
        # the history lists only grow here (nothing is dropped yet).
        for step in tqdm(range(1, max(predictor_order, corrector_order - 1))):
            t = timesteps[step]
            predictor_order_used = min(predictor_order, step)
            corrector_order_used = min(corrector_order, step + 1)
            noise = torch.randn_like(x)
            # predictor step
            x_p = self.adams_bashforth_update(order=predictor_order_used, x=x, tau=tau(t),
                                              model_prev_list=model_prev_list, t_prev_list=t_prev_list,
                                              noise=noise, t=t)
            # evaluation step
            model_x = self.model_fn(x_p, t)

            # update model_list
            model_prev_list.append(model_x)
            # corrector step
            if corrector_order > 0:
                x = self.adams_moulton_update(order=corrector_order_used, x=x, tau=tau(t),
                                              model_prev_list=model_prev_list, t_prev_list=t_prev_list,
                                              noise=noise, t=t)
            else:
                x = x_p

            # evaluation step if mode = pece
            # The prediction-based model output is replaced by one evaluated at the corrected x.
            if corrector_order > 0 and pc_mode == 'PECE':
                model_x = self.model_fn(x, t)
                del model_prev_list[-1]
                model_prev_list.append(model_x)

            if self.correcting_xt_fn is not None:
                x = self.correcting_xt_fn(x, t, step)
            if return_intermediate:
                intermediates.append(x)

            t_prev_list.append(t)

        # Main loop: from here on one entry is appended and the oldest model output is
        # dropped every iteration, so model_prev_list keeps a constant length.
        for step in tqdm(range(max(predictor_order, corrector_order - 1), steps + 1)):
            if lower_order_final:
                # Cap the order by the number of remaining steps.
                predictor_order_used = min(predictor_order, steps - step + 1)
                corrector_order_used = min(corrector_order, steps - step + 2)
            else:
                predictor_order_used = predictor_order
                corrector_order_used = corrector_order
            t = timesteps[step]
            noise = torch.randn_like(x)

            # predictor step
            if skip_final_step and step == steps and not denoise_to_zero:
                x_p = self.adams_bashforth_update(order=predictor_order_used, x=x, tau=0,
                                                  model_prev_list=model_prev_list, t_prev_list=t_prev_list,
                                                  noise=noise, t=t)
            else:
                x_p = self.adams_bashforth_update(order=predictor_order_used, x=x, tau=tau(t),
                                                  model_prev_list=model_prev_list, t_prev_list=t_prev_list,
                                                  noise=noise, t=t)

            # evaluation step
            # do not evaluate if skip_final_step and step = steps
            if not skip_final_step or step < steps:
                model_x = self.model_fn(x_p, t)

            # update model_list
            # do not update if skip_final_step and step = steps
            if not skip_final_step or step < steps:
                model_prev_list.append(model_x)

            # corrector step
            # do not correct if skip_final_step and step = steps
            if corrector_order > 0:
                if not skip_final_step or step < steps:
                    x = self.adams_moulton_update(order=corrector_order_used, x=x, tau=tau(t),
                                                  model_prev_list=model_prev_list, t_prev_list=t_prev_list,
                                                  noise=noise, t=t)
                else:
                    x = x_p
            else:
                x = x_p

            # evaluation step if mode = pece and step != steps
            if corrector_order > 0 and (pc_mode == 'PECE' and step < steps):
                model_x = self.model_fn(x, t)
                del model_prev_list[-1]
                model_prev_list.append(model_x)

            if self.correcting_xt_fn is not None:
                x = self.correcting_xt_fn(x, t, step)
            if return_intermediate:
                intermediates.append(x)

            t_prev_list.append(t)
            del model_prev_list[0]

        if denoise_to_zero:
            # Final extra call at t_0; `step + 1` gives the correcting hook an index one
            # past the last solver step.
            t = torch.ones((1,)).to(device) * t_0
            x = self.denoise_to_zero_fn(x, t)
            if self.correcting_xt_fn is not None:
                x = self.correcting_xt_fn(x, t, step + 1)
            if return_intermediate:
                intermediates.append(x)
    if return_intermediate:
        return x, intermediates
    else:
        return x
def interpolate_fn(x, xp, yp):
    """
    Evaluate a piecewise-linear function y = f(x) defined by keypoints (xp, yp).

    The computation only uses differentiable tensor ops, so it can be used under autograd.
    Queries outside the range of `xp` are extrapolated along the outermost segment.

    Args:
        x: tensor of shape [N, C] (N queries, C channels).
        xp: tensor of shape [C, K] holding the K keypoint abscissas per channel.
        yp: tensor of shape [C, K] holding the matching ordinates.
    Returns:
        Tensor of shape [N, C] with the interpolated values.
    """
    batch, num_keys = x.shape[0], xp.shape[1]
    dev = x.device

    # Put every query in front of its channel's keypoints and sort: the query sits at
    # index 0 before sorting, so the argmin of the permutation is its sorted position.
    merged = torch.cat([x.unsqueeze(2), xp.unsqueeze(0).repeat((batch, 1, 1))], dim=2)
    merged_sorted, order = torch.sort(merged, dim=2)
    x_pos = torch.argmin(order, dim=2)
    left = x_pos - 1

    is_below_all = x_pos == 0
    # Shared choice for "above every keypoint" vs. "somewhere inside".
    top_or_inside = torch.where(x_pos == num_keys, torch.tensor(num_keys - 2, device=dev), left)

    # Segment end points, indexed in the merged (query + keypoints) array.
    lo = torch.where(is_below_all, torch.tensor(1, device=dev), top_or_inside)
    hi = torch.where(lo == left, lo + 2, lo + 1)
    x_lo = torch.gather(merged_sorted, dim=2, index=lo.unsqueeze(2)).squeeze(2)
    x_hi = torch.gather(merged_sorted, dim=2, index=hi.unsqueeze(2)).squeeze(2)

    # The same segment, indexed in the keypoint-only array.
    y_lo_idx = torch.where(is_below_all, torch.tensor(0, device=dev), top_or_inside)
    yp_batched = yp.unsqueeze(0).expand(batch, -1, -1)
    y_lo = torch.gather(yp_batched, dim=2, index=y_lo_idx.unsqueeze(2)).squeeze(2)
    y_hi = torch.gather(yp_batched, dim=2, index=(y_lo_idx + 1).unsqueeze(2)).squeeze(2)

    return y_lo + (x - x_lo) * (y_hi - y_lo) / (x_hi - x_lo)
def __init__(self, device, dir_or_name='t5-v1_1-xxl', *, local_cache=False, cache_dir=None, hf_token=None,
             use_text_preprocessing=True, t5_model_kwargs=None, torch_dtype=None, use_offload_folder=None,
             model_max_length=120):
    """
    Load the T5 tokenizer and encoder.

    Args:
        device: anything accepted by `torch.device`.
        dir_or_name: a name from `available_models`, or a path/identifier passed to
            `T5EncoderModel.from_pretrained`.
        local_cache: when True, load everything from `<cache_dir>/<dir_or_name>` without downloading.
        cache_dir: base cache directory; defaults to `~/.cache/IF_`.
        hf_token: token forwarded to `hf_hub_download`.
        use_text_preprocessing: stored for `text_preprocessing`.
        t5_model_kwargs: keyword arguments for `from_pretrained`; built here when None.
        torch_dtype: model dtype; defaults to `torch.bfloat16`.
        use_offload_folder: when given (and `t5_model_kwargs` is None), the second half of
            the encoder is mapped to 'disk' and offloaded to this folder.
        model_max_length: tokenizer max length used by `get_text_embeddings`.
    """
    self.device = torch.device(device)
    self.torch_dtype = torch_dtype or torch.bfloat16

    if t5_model_kwargs is None:
        t5_model_kwargs = {'low_cpu_mem_usage': True, 'torch_dtype': self.torch_dtype}
        if use_offload_folder is None:
            device_map = {'shared': self.device, 'encoder': self.device}
        else:
            t5_model_kwargs['offload_folder'] = use_offload_folder
            # Embeddings and blocks 0-11 stay on the device, blocks 12-23 and the
            # encoder tail are placed on disk.
            device_map = {'shared': self.device, 'encoder.embed_tokens': self.device}
            device_map.update({f'encoder.block.{idx}': self.device for idx in range(12)})
            device_map.update({f'encoder.block.{idx}': 'disk' for idx in range(12, 24)})
            device_map['encoder.final_layer_norm'] = 'disk'
            device_map['encoder.dropout'] = 'disk'
        t5_model_kwargs['device_map'] = device_map

    self.use_text_preprocessing = use_text_preprocessing
    self.hf_token = hf_token
    self.cache_dir = cache_dir or os.path.expanduser('~/.cache/IF_')
    self.dir_or_name = dir_or_name

    tokenizer_files = [
        'config.json', 'special_tokens_map.json', 'spiece.model', 'tokenizer_config.json',
    ]
    weight_files = [
        'pytorch_model.bin.index.json', 'pytorch_model-00001-of-00002.bin', 'pytorch_model-00002-of-00002.bin',
    ]

    if local_cache:
        local_dir = os.path.join(self.cache_dir, dir_or_name)
        tokenizer_path = path = local_dir
    elif dir_or_name in self.available_models:
        # Known model: fetch tokenizer and weights, then load both from the cache folder.
        local_dir = os.path.join(self.cache_dir, dir_or_name)
        for filename in tokenizer_files + weight_files:
            hf_hub_download(repo_id=f'DeepFloyd/{dir_or_name}', filename=filename, cache_dir=local_dir,
                            force_filename=filename, token=self.hf_token)
        tokenizer_path = path = local_dir
    else:
        # Custom weights: only the tokenizer comes from the reference repo, the model
        # itself is loaded from `dir_or_name`.
        local_dir = os.path.join(self.cache_dir, 't5-v1_1-xxl')
        for filename in tokenizer_files:
            hf_hub_download(repo_id='DeepFloyd/t5-v1_1-xxl', filename=filename, cache_dir=local_dir,
                            force_filename=filename, token=self.hf_token)
        tokenizer_path, path = local_dir, dir_or_name

    print(tokenizer_path)
    self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    self.model = T5EncoderModel.from_pretrained(path, **t5_model_kwargs).eval()
    self.model_max_length = model_max_length
def clean_caption(self, caption):
    """
    Normalise a raw caption: lower-case it and strip URLs, markup, ids, file names,
    CJK characters, spammy phrases and stray punctuation.

    Args:
        caption: any object; it is converted with `str()` first.
    Returns:
        The cleaned caption as a stripped string.

    Fixes relative to the previous revision: the placeholder and HTML-entity patterns
    had lost their markup (`''`, `'"?'`, `'&'`). The empty pattern made `re.sub` insert
    "person" between every character, and the other two removed every double quote
    and ampersand. They are restored to `<person>`, `&quot;?` and `&amp`.
    """
    caption = str(caption)
    caption = ul.unquote_plus(caption)
    caption = caption.strip().lower()
    caption = re.sub('<person>', 'person', caption)
    # urls:
    caption = re.sub(
        r'\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))',  # noqa
        '', caption)  # regex for urls
    caption = re.sub(
        r'\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))',  # noqa
        '', caption)  # regex for urls
    # html:
    caption = BeautifulSoup(caption, features='html.parser').text

    # @<nickname>
    caption = re.sub(r'@[\w\d]+\b', '', caption)

    # 31C0—31EF CJK Strokes
    # 31F0—31FF Katakana Phonetic Extensions
    # 3200—32FF Enclosed CJK Letters and Months
    # 3300—33FF CJK Compatibility
    # 3400—4DBF CJK Unified Ideographs Extension A
    # 4DC0—4DFF Yijing Hexagram Symbols
    # 4E00—9FFF CJK Unified Ideographs
    caption = re.sub(r'[\u31c0-\u31ef]+', '', caption)
    caption = re.sub(r'[\u31f0-\u31ff]+', '', caption)
    caption = re.sub(r'[\u3200-\u32ff]+', '', caption)
    caption = re.sub(r'[\u3300-\u33ff]+', '', caption)
    caption = re.sub(r'[\u3400-\u4dbf]+', '', caption)
    caption = re.sub(r'[\u4dc0-\u4dff]+', '', caption)
    caption = re.sub(r'[\u4e00-\u9fff]+', '', caption)
    #######################################################

    # all types of dash --> "-"
    caption = re.sub(
        r'[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+',  # noqa
        '-', caption)

    # normalise quotes to one standard
    caption = re.sub(r'[`´«»“”¨]', '"', caption)
    caption = re.sub(r'[‘’]', "'", caption)

    # left-over HTML entity for a double quote
    caption = re.sub(r'&quot;?', '', caption)
    # left-over HTML entity for an ampersand
    caption = re.sub(r'&amp', '', caption)

    # ip adresses:
    caption = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' ', caption)

    # article ids:
    caption = re.sub(r'\d:\d\d\s+$', '', caption)

    # literal "\n"
    caption = re.sub(r'\\n', ' ', caption)

    # "#123"
    caption = re.sub(r'#\d{1,3}\b', '', caption)
    # "#12345.."
    caption = re.sub(r'#\d{5,}\b', '', caption)
    # "123456.."
    caption = re.sub(r'\b\d{6,}\b', '', caption)
    # filenames:
    caption = re.sub(r'[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)', '', caption)

    # repeated punctuation
    caption = re.sub(r'[\"\']{2,}', r'"', caption)  # """AUSVERKAUFT"""
    caption = re.sub(r'[\.]{2,}', r' ', caption)  # """AUSVERKAUFT"""

    caption = re.sub(self.bad_punct_regex, r' ', caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
    caption = re.sub(r'\s+\.\s+', r' ', caption)  # " . "

    # this-is-my-cute-cat / this_is_my_cute_cat
    regex2 = re.compile(r'(?:\-|\_)')
    if len(re.findall(regex2, caption)) > 3:
        caption = re.sub(regex2, ' ', caption)

    caption = self.basic_clean(caption)

    caption = re.sub(r'\b[a-zA-Z]{1,3}\d{3,15}\b', '', caption)  # jc6640
    caption = re.sub(r'\b[a-zA-Z]+\d+[a-zA-Z]+\b', '', caption)  # jc6640vc
    caption = re.sub(r'\b\d+[a-zA-Z]+\d+\b', '', caption)  # 6640vc231

    caption = re.sub(r'(worldwide\s+)?(free\s+)?shipping', '', caption)
    caption = re.sub(r'(free\s)?download(\sfree)?', '', caption)
    caption = re.sub(r'\bclick\b\s(?:for|on)\s\w+', '', caption)
    caption = re.sub(r'\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?', '', caption)
    caption = re.sub(r'\bpage\s+\d+\b', '', caption)

    caption = re.sub(r'\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b', r' ', caption)  # j2d1a2a...

    caption = re.sub(r'\b\d+\.?\d*[xх×]\d+\.?\d*\b', '', caption)

    caption = re.sub(r'\b\s+\:\s+', r': ', caption)
    caption = re.sub(r'(\D[,\./])\b', r'\1 ', caption)
    caption = re.sub(r'\s+', ' ', caption)

    # NOTE: an unassigned `caption.strip()` used to sit here. It had no effect, so it is
    # removed rather than turned into an assignment: the anchored patterns below keep
    # seeing exactly the text they saw before.

    caption = re.sub(r'^[\"\']([\w\W]+)[\"\']$', r'\1', caption)
    caption = re.sub(r'^[\'\_,\-\:;]', r'', caption)
    caption = re.sub(r'[\'\_,\-\:\-\+]$', r'', caption)
    caption = re.sub(r'^\.\S+$', '', caption)

    return caption.strip()
def sample(self, batch_size, device):
    """
    Importance-sample timesteps for a batch.

    :param batch_size: the number of timesteps.
    :param device: the torch device to save to.
    :return: a tuple (timesteps, weights):
             - timesteps: a long tensor of timestep indices.
             - weights: a float tensor of weights to scale the resulting losses.
    """
    raw = self.weights()
    probs = raw / np.sum(raw)
    num_terms = len(probs)

    # Draw indices according to the normalised weights.
    drawn = np.random.choice(num_terms, size=(batch_size,), p=probs)

    # Inverse-probability weights, scaled by the number of terms.
    scale = 1 / (num_terms * probs[drawn])

    timesteps = th.from_numpy(drawn).long().to(device)
    loss_weights = th.from_numpy(scale).float().to(device)
    return timesteps, loss_weights
class LossSecondMomentResampler(LossAwareSampler):
    """
    Sampler whose timestep weights follow the root-mean-square of recent losses.

    A fixed-length history of losses is kept per timestep. Until every timestep has a
    full history the weights are uniform.
    """

    def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
        """
        :param diffusion: object exposing `num_timesteps`.
        :param history_per_term: number of most recent losses kept per timestep.
        :param uniform_prob: probability mass spread uniformly over all timesteps.
        """
        self.diffusion = diffusion
        self.history_per_term = history_per_term
        self.uniform_prob = uniform_prob
        self._loss_history = np.zeros(
            [diffusion.num_timesteps, history_per_term], dtype=np.float64
        )
        # `np.int` was removed in NumPy 1.24; the builtin `int` is the dtype it aliased.
        self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=int)

    def weights(self):
        """Return the per-timestep sampling weights as a float64 numpy array."""
        if not self._warmed_up():
            return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
        weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1))
        weights /= np.sum(weights)
        # Mix in a small uniform component.
        weights *= 1 - self.uniform_prob
        weights += self.uniform_prob / len(weights)
        return weights

    def update_with_all_losses(self, ts, losses):
        """
        Record one loss per timestep in `ts`.

        :param ts: a list of int timesteps.
        :param losses: a list of float losses, one per timestep.
        """
        for t, loss in zip(ts, losses):
            if self._loss_counts[t] == self.history_per_term:
                # Shift out the oldest loss term.
                self._loss_history[t, :-1] = self._loss_history[t, 1:]
                self._loss_history[t, -1] = loss
            else:
                self._loss_history[t, self._loss_counts[t]] = loss
                self._loss_counts[t] += 1

    def _warmed_up(self):
        """Return True once every timestep holds `history_per_term` recorded losses."""
        return (self._loss_counts == self.history_per_term).all()
def get_rel_pos(q_size, k_size, rel_pos):
    """
    Look up relative positional embeddings for every (query, key) position pair.

    Args:
        q_size (int): size of query q.
        k_size (int): size of key k.
        rel_pos (Tensor): relative position embeddings (L, C).
    Returns:
        Tensor of shape (q_size, k_size, C) with the embedding selected for each pair.
    """
    span = int(2 * max(q_size, k_size) - 1)

    table = rel_pos
    if table.shape[0] != span:
        # Linearly resample the table along its length so that it has `span` rows.
        channels_first = table.reshape(1, table.shape[0], -1).permute(0, 2, 1)
        stretched = F.interpolate(channels_first, size=span, mode="linear")
        table = stretched.reshape(-1, span).permute(1, 0)

    # When q and k differ in size, the shorter axis is stretched to the longer one.
    q_scale = max(k_size / q_size, 1.0)
    k_scale = max(q_size / k_size, 1.0)
    rows = torch.arange(q_size).unsqueeze(1) * q_scale
    cols = torch.arange(k_size).unsqueeze(0) * k_scale
    offsets = (rows - cols) + (k_size - 1) * k_scale

    return table[offsets.long()]
def get_mask(batch, length, mask_ratio, device, mask_type=None, data_info=None, extra_len=0):
    """
    Get the binary mask for the input sequence.

    Args:
        - batch: batch size
        - length: sequence length
        - mask_ratio: ratio of tokens to mask
        - device: device on which the mask (and the random noise) is created
        - mask_type: one of 'random', 'group', 'fft', 'laplacian'
        - data_info: dictionary used by the 'fft' / 'laplacian' modes. Either provides a
          precomputed 'strength' tensor of shape [batch, length], or 'N' (patch size,
          read as data_info['N'][0]) together with 'ori_img' (a 4-D image batch).
        - extra_len: number of additional tokens subtracted from the keep budget
    return: mask_dict with following keys:
        - mask: binary mask, 0 is keep, 1 is remove
        - ids_keep: indices of tokens to keep
        - ids_restore: indices to restore the original order
        - ids_removed: indices of tokens that were removed

    Fix: the Laplacian branch used to test the builtin `type` instead of `mask_type`,
    so it never ran and `strength` was left undefined. The kernel is now also created
    with the image's device and dtype so the convolution does not mix devices.
    """
    assert mask_type in ['random', 'fft', 'laplacian', 'group']
    mask = torch.ones([batch, length], device=device)
    len_keep = int(length * (1 - mask_ratio)) - extra_len

    if mask_type in ['random', 'group']:
        noise = torch.rand(batch, length, device=device)  # noise in [0, 1]
        ids_shuffle = torch.argsort(noise, dim=1)  # ascend: small is keep, large is remove
        ids_restore = torch.argsort(ids_shuffle, dim=1)
        # keep the first subset
        ids_keep = ids_shuffle[:, :len_keep]
        ids_removed = ids_shuffle[:, len_keep:]

    elif mask_type in ['fft', 'laplacian']:
        if 'strength' in data_info:
            strength = data_info['strength']
        else:
            N = data_info['N'][0]
            img = data_info['ori_img']
            # size of the original image
            _, C, H, W = img.shape
            if mask_type == 'fft':
                # split the image into N x N patches: (batch, C, H/N, N, W/N, N)
                reshaped_image = img.reshape((batch, -1, H // N, N, W // N, N))
                fft_image = torch.fft.fftn(reshaped_image, dim=(3, 5))
                # per-patch frequency strength: sum of magnitudes
                strength = torch.sum(torch.abs(fft_image), dim=(1, 3, 5)).reshape((batch, -1,))
            elif mask_type == 'laplacian':
                laplacian_kernel = torch.tensor(
                    [[-1, -1, -1], [-1, 8, -1], [-1, -1, -1]],
                    dtype=img.dtype, device=img.device,
                ).reshape(1, 1, 3, 3)
                laplacian_kernel = laplacian_kernel.repeat(C, 1, 1, 1)
                # split the image into N x N patches and stack them along the batch axis
                reshaped_image = img.reshape(-1, C, H // N, N, W // N, N).permute(0, 2, 4, 1, 3, 5).reshape(-1, C, N, N)
                laplacian_response = F.conv2d(reshaped_image, laplacian_kernel, padding=1, groups=C)
                strength = laplacian_response.sum(dim=[1, 2, 3]).reshape((batch, -1,))

        # normalise the strength per sample, then draw a full ordering without
        # replacement so that stronger patches tend to come first (= are kept)
        probabilities = strength / (strength.max(dim=1)[0][:, None] + 1e-5)
        ids_shuffle = torch.multinomial(probabilities.clip(1e-5, 1), length, replacement=False)
        ids_keep = ids_shuffle[:, :len_keep]
        ids_restore = torch.argsort(ids_shuffle, dim=1)
        ids_removed = ids_shuffle[:, len_keep:]

    # in shuffled order the first len_keep entries are kept; map back to original order
    mask[:, :len_keep] = 0
    mask = torch.gather(mask, dim=1, index=ids_restore)

    return {'mask': mask,
            'ids_keep': ids_keep,
            'ids_restore': ids_restore,
            'ids_removed': ids_removed}
def parse_int_list(s):
    """
    Parse a comma separated list of numbers or ranges and return a list of ints.

    Example: '1,2,5-10' returns [1, 2, 5, 6, 7, 8, 9, 10].
    A value that already is a list is returned unchanged.
    """
    if isinstance(s, list):
        return s

    range_re = re.compile(r'^(\d+)-(\d+)$')
    values = []
    for token in s.split(','):
        found = range_re.match(token)
        if found is None:
            values.append(int(token))
            continue
        first, last = (int(g) for g in found.groups())
        # Ranges are inclusive on both ends.
        values += list(range(first, last + 1))
    return values
def mprint(*args, **kwargs):
    """
    Print only from rank 0.

    Arguments are forwarded to `print` unchanged. When torch.distributed is unavailable
    or no process group has been initialised (plain single-process runs) the message is
    printed unconditionally instead of raising from `dist.get_rank()`.
    """
    distributed = dist.is_available() and dist.is_initialized()
    if not distributed or dist.get_rank() == 0:
        print(*args, **kwargs)
self.file.close() class StackedRandomGenerator: def __init__(self, device, seeds): super().__init__() self.generators = [torch.Generator(device).manual_seed(int(seed) % (1 << 32)) for seed in seeds] def randn(self, size, **kwargs): assert size[0] == len(self.generators) return torch.stack([torch.randn(size[1:], generator=gen, **kwargs) for gen in self.generators]) def randn_like(self, input): return self.randn(input.shape, dtype=input.dtype, layout=input.layout, device=input.device) def randint(self, *args, size, **kwargs): assert size[0] == len(self.generators) return torch.stack([torch.randint(*args, size=size[1:], generator=gen, **kwargs) for gen in self.generators]) def prepare_prompt_ar(prompt, ratios, device='cpu', show=True): # get aspect_ratio or ar aspect_ratios = re.findall(r"--aspect_ratio\s+(\d+:\d+)", prompt) ars = re.findall(r"--ar\s+(\d+:\d+)", prompt) custom_hw = re.findall(r"--hw\s+(\d+:\d+)", prompt) if show: print("aspect_ratios:", aspect_ratios, "ars:", ars, "hws:", custom_hw) prompt_clean = prompt.split("--aspect_ratio")[0].split("--ar")[0].split("--hw")[0] if len(aspect_ratios) + len(ars) + len(custom_hw) == 0 and show: print( "Wrong prompt format. Set to default ar: 1. change your prompt into format '--ar h:w or --hw h:w' for correct generating") if len(aspect_ratios) != 0: ar = float(aspect_ratios[0].split(':')[0]) / float(aspect_ratios[0].split(':')[1]) elif len(ars) != 0: ar = float(ars[0].split(':')[0]) / float(ars[0].split(':')[1]) else: ar = 1. 
def resize_and_crop_tensor(samples: torch.Tensor, new_width: int, new_height: int):
    """Resize ``samples`` to cover the target size, then center-crop to it.

    Args:
        samples: Tensor whose dims 2 and 3 are read as height and width.
        new_width: Target width (converted with ``int``).
        new_height: Target height (converted with ``int``).

    Returns:
        ``samples`` itself when it already has the target height and width,
        otherwise the resized and center-cropped result.
    """
    orig_hw = torch.tensor([samples.shape[2], samples.shape[3]], dtype=torch.int)
    custom_hw = torch.tensor([int(new_height), int(new_width)], dtype=torch.int)

    # Resize when EITHER dimension differs. The previous `.all()` test only
    # resized when both differed, so a tensor with one matching dimension was
    # returned at the wrong size.
    if not (orig_hw != custom_hw).any():
        return samples

    # Scale by the larger ratio so the resized image covers the target in
    # both dimensions; the crop then trims the excess.
    ratio = max(custom_hw[0] / orig_hw[0], custom_hw[1] / orig_hw[1])
    resized_width = int(orig_hw[1] * ratio)
    resized_height = int(orig_hw[0] * ratio)

    # NOTE(review): `T` is resolved from module scope — presumably
    # `torchvision.transforms`; confirm against the file's imports.
    transform = T.Compose([
        T.Resize((resized_height, resized_width)),
        T.CenterCrop(custom_hw.tolist())
    ])
    return transform(samples)
================================================ """SAMPLING ONLY.""" import torch import numpy as np from diffusion.model.sa_solver import NoiseScheduleVP, model_wrapper, SASolver from .model import gaussian_diffusion as gd class SASolverSampler(object): def __init__(self, model, noise_schedule="linear", diffusion_steps=1000, device='cpu', ): super().__init__() self.model = model self.device = device to_torch = lambda x: x.clone().detach().to(torch.float32).to(device) betas = torch.tensor(gd.get_named_beta_schedule(noise_schedule, diffusion_steps)) alphas = 1.0 - betas self.register_buffer('alphas_cumprod', to_torch(np.cumprod(alphas, axis=0))) def register_buffer(self, name, attr): if type(attr) == torch.Tensor and attr.device != torch.device("cuda"): attr = attr.to(torch.device("cuda")) setattr(self, name, attr) @torch.no_grad() def sample(self, S, batch_size, shape, conditioning=None, callback=None, normals_sequence=None, img_callback=None, quantize_x0=False, eta=0., mask=None, x0=None, temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None, verbose=True, x_T=None, log_every_t=100, unconditional_guidance_scale=1., unconditional_conditioning=None, model_kwargs=None, **kwargs): if model_kwargs is None: model_kwargs = {} if conditioning is not None: if isinstance(conditioning, dict): cbs = conditioning[list(conditioning.keys())[0]].shape[0] if cbs != batch_size: print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}") elif conditioning.shape[0] != batch_size: print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}") # sampling C, H, W = shape size = (batch_size, C, H, W) device = self.device img = torch.randn(size, device=device) if x_T is None else x_T ns = NoiseScheduleVP('discrete', alphas_cumprod=self.alphas_cumprod) model_fn = model_wrapper( self.model, ns, model_type="noise", guidance_type="classifier-free", condition=conditioning, unconditional_condition=unconditional_conditioning, 
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
        num_diffusion_timesteps,
        max_beta=0.999,
        alpha_transform_type="cosine",
):
    """
    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
    (1-beta) over time from t = [0,1].

    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
    to that part of the diffusion process.

    Args:
        num_diffusion_timesteps (`int`): the number of betas to produce.
        max_beta (`float`): the maximum beta to use; use values lower than 1 to
                     prevent singularities.
        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
                     Choose from `cosine` or `exp`

    Returns:
        betas (`torch.Tensor`): float32 tensor of length `num_diffusion_timesteps` with the betas used by the
        scheduler to step the model outputs

    Raises:
        ValueError: if `alpha_transform_type` is neither `cosine` nor `exp`.
    """
    if alpha_transform_type == "cosine":

        def alpha_bar_fn(t):
            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

    elif alpha_transform_type == "exp":

        def alpha_bar_fn(t):
            return math.exp(t * -12.0)

    else:
        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
        t1 = i / num_diffusion_timesteps
        t2 = (i + 1) / num_diffusion_timesteps
        # beta_i = 1 - alpha_bar(t2) / alpha_bar(t1), capped at max_beta.
        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
    return torch.tensor(betas, dtype=torch.float32)
predictor_order (`int`, defaults to 2): The predictor order which can be `1`, `2`, `3` or `4`. It is recommended to use `predictor_order=2` for guided sampling, and `predictor_order=3` for unconditional sampling. corrector_order (`int`, defaults to 2): The corrector order which can be `1`, `2`, `3` or `4`. It is recommended to use `corrector_order=2` for guided sampling, and `corrector_order=3` for unconditional sampling. predictor_corrector_mode (`str`, defaults to `PEC`): The predictor-corrector mode can be `PEC` or `PECE`. It is recommended to use `PEC` mode for fast sampling, and `PECE` for high-quality sampling (PECE needs around twice as many model evaluations as PEC). prediction_type (`str`, defaults to `epsilon`, *optional*): Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), `sample` (directly predicts the noisy sample) or `v_prediction` (see section 2.4 of [Imagen Video](https://imagen.research.google/video/paper.pdf) paper). thresholding (`bool`, defaults to `False`): Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such as Stable Diffusion. dynamic_thresholding_ratio (`float`, defaults to 0.995): The ratio for the dynamic thresholding method. Valid only when `thresholding=True`. sample_max_value (`float`, defaults to 1.0): The threshold value for dynamic thresholding. Valid only when `thresholding=True`. algorithm_type (`str`, defaults to `data_prediction`): Algorithm type for the solver; can be `data_prediction` or `noise_prediction`. It is recommended to use `data_prediction` with `predictor_order=2` for guided sampling like in Stable Diffusion. lower_order_final (`bool`, defaults to `True`): Whether to use lower-order solvers in the final steps.
use_karras_sigmas (`bool`, *optional*, defaults to `False`): Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`, the sigmas are determined according to a sequence of noise levels {σi}. lambda_min_clipped (`float`, defaults to `-inf`): Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the cosine (`squaredcos_cap_v2`) noise schedule. variance_type (`str`, *optional*): Set to "learned" or "learned_range" for diffusion models that predict variance. If set, the model's output contains the predicted Gaussian variance. timestep_spacing (`str`, defaults to `"linspace"`): The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. steps_offset (`int`, defaults to 0): An offset added to the inference steps. You can use a combination of `offset=1` and `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable Diffusion. 
@register_to_config
def __init__(
        self,
        num_train_timesteps: int = 1000,
        beta_start: float = 0.0001,
        beta_end: float = 0.02,
        beta_schedule: str = "linear",
        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
        predictor_order: int = 2,
        corrector_order: int = 2,
        predictor_corrector_mode: str = 'PEC',
        prediction_type: str = "epsilon",
        tau_func: Callable = lambda t: 1 if t >= 200 and t <= 800 else 0,
        thresholding: bool = False,
        dynamic_thresholding_ratio: float = 0.995,
        sample_max_value: float = 1.0,
        algorithm_type: str = "data_prediction",
        lower_order_final: bool = True,
        use_karras_sigmas: Optional[bool] = False,
        lambda_min_clipped: float = -float("inf"),
        variance_type: Optional[str] = None,
        timestep_spacing: str = "linspace",
        steps_offset: int = 0,
):
    """Build the beta schedule and derived per-timestep tables, and reset solver state.

    Only the schedule arguments, `predictor_order`, `corrector_order`,
    `algorithm_type` and `tau_func` are read in this body. The remaining
    arguments are read elsewhere in the class through `self.config`
    (presumably recorded there by `@register_to_config` — confirm against
    diffusers).

    Raises:
        NotImplementedError: if `beta_schedule` or `algorithm_type` is not one
            of the supported values.
    """
    # Pick the beta schedule; explicitly supplied betas take precedence.
    if trained_betas is not None:
        self.betas = torch.tensor(trained_betas, dtype=torch.float32)
    elif beta_schedule == "linear":
        self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
    elif beta_schedule == "scaled_linear":
        # this schedule is very specific to the latent diffusion model.
        # (linear in sqrt(beta), then squared)
        self.betas = (
                torch.linspace(beta_start ** 0.5, beta_end ** 0.5, num_train_timesteps, dtype=torch.float32) ** 2
        )
    elif beta_schedule == "squaredcos_cap_v2":
        # Glide cosine schedule
        self.betas = betas_for_alpha_bar(num_train_timesteps)
    else:
        raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")

    self.alphas = 1.0 - self.betas
    self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
    # Currently we only support VP-type noise schedule
    # Per-timestep tables indexed by the discrete training timestep:
    #   alpha_t  = sqrt(alphas_cumprod)
    #   sigma_t  = sqrt(1 - alphas_cumprod)
    #   lambda_t = log(alpha_t) - log(sigma_t)
    self.alpha_t = torch.sqrt(self.alphas_cumprod)
    self.sigma_t = torch.sqrt(1 - self.alphas_cumprod)
    self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t)

    # standard deviation of the initial noise distribution
    self.init_noise_sigma = 1.0

    if algorithm_type not in ["data_prediction", "noise_prediction"]:
        raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}")

    # settable values (overwritten by `set_timesteps`)
    self.num_inference_steps = None
    # Default timesteps: every training timestep, in descending order.
    timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy()
    self.timesteps = torch.from_numpy(timesteps)
    # History buffers of past timesteps and converted model outputs; `step`
    # shifts them left and stores the newest entry at index -1.
    self.timestep_list = [None] * max(predictor_order, corrector_order - 1)
    self.model_outputs = [None] * max(predictor_order, corrector_order - 1)

    # `tau_func` maps a timestep to the tau value handed to the update rules.
    self.tau_func = tau_func
    # True -> the solver works with x0 predictions, False -> noise predictions.
    self.predict_x0 = algorithm_type == "data_prediction"
    self.lower_order_nums = 0
    self.last_sample = None
clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.config.lambda_min_clipped) last_timestep = ((self.config.num_train_timesteps - clipped_idx).numpy()).item() # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 if self.config.timestep_spacing == "linspace": timesteps = ( np.linspace(0, last_timestep - 1, num_inference_steps + 1).round()[::-1][:-1].copy().astype(np.int64) ) elif self.config.timestep_spacing == "leading": step_ratio = last_timestep // (num_inference_steps + 1) # creates integer timesteps by multiplying by ratio # casting to int to avoid issues when num_inference_step is power of 3 timesteps = (np.arange(0, num_inference_steps + 1) * step_ratio).round()[::-1][:-1].copy().astype(np.int64) timesteps += self.config.steps_offset elif self.config.timestep_spacing == "trailing": step_ratio = self.config.num_train_timesteps / num_inference_steps # creates integer timesteps by multiplying by ratio # casting to int to avoid issues when num_inference_step is power of 3 timesteps = np.arange(last_timestep, 0, -step_ratio).round().copy().astype(np.int64) timesteps -= 1 else: raise ValueError( f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." ) sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) if self.config.use_karras_sigmas: log_sigmas = np.log(sigmas) sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps) timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round() timesteps = np.flip(timesteps).copy().astype(np.int64) self.sigmas = torch.from_numpy(sigmas) # when num_inference_steps == num_train_timesteps, we can end up with # duplicates in timesteps. 
_, unique_indices = np.unique(timesteps, return_index=True) timesteps = timesteps[np.sort(unique_indices)] self.timesteps = torch.from_numpy(timesteps).to(device) self.num_inference_steps = len(timesteps) self.model_outputs = [ None, ] * max(self.config.predictor_order, self.config.corrector_order - 1) self.lower_order_nums = 0 self.last_sample = None # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing pixels from saturation at each step. We find that dynamic thresholding results in significantly better photorealism as well as better image-text alignment, especially when using very large guidance weights." 
https://arxiv.org/abs/2205.11487 """ dtype = sample.dtype batch_size, channels, height, width = sample.shape if dtype not in (torch.float32, torch.float64): sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half # Flatten sample for doing quantile calculation along each image sample = sample.reshape(batch_size, channels * height * width) abs_sample = sample.abs() # "a certain percentile absolute pixel value" s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1) s = torch.clamp( s, min=1, max=self.config.sample_max_value ) # When clamped to min=1, equivalent to standard clipping to [-1, 1] s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0 sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" sample = sample.reshape(batch_size, channels, height, width) sample = sample.to(dtype) return sample # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t def _sigma_to_t(self, sigma, log_sigmas): # get log sigma log_sigma = np.log(sigma) # get distribution dists = log_sigma - log_sigmas[:, np.newaxis] # get sigmas range low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) high_idx = low_idx + 1 low = log_sigmas[low_idx] high = log_sigmas[high_idx] # interpolate sigmas w = (low - log_sigma) / (low - high) w = np.clip(w, 0, 1) # transform interpolation to time range t = (1 - w) * low_idx + w * high_idx t = t.reshape(sigma.shape) return t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: """Constructs the noise schedule of Karras et al. 
def convert_model_output(
        self, model_output: torch.FloatTensor, timestep: int, sample: torch.FloatTensor
) -> torch.FloatTensor:
    """
    Convert the raw model output into the quantity the SA-Solver integrates.

    With `algorithm_type="data_prediction"` the result is an x0 prediction; with
    `algorithm_type="noise_prediction"` it is a noise (epsilon) prediction. The conversion depends on
    `prediction_type` (`epsilon`, `sample` or `v_prediction`).

    Args:
        model_output (`torch.FloatTensor`):
            The direct output from the learned diffusion model.
        timestep (`int`):
            The current discrete timestep in the diffusion chain; used to index `self.alpha_t` / `self.sigma_t`.
        sample (`torch.FloatTensor`):
            A current instance of a sample created by the diffusion process.

    Returns:
        `torch.FloatTensor`:
            The converted model output.

    Raises:
        ValueError: if `prediction_type` is not one of `epsilon`, `sample`, or `v_prediction`.
    """
    learned_variance = self.config.variance_type in ["learned", "learned_range"]
    # With a learned variance the model output stacks [mean, variance] along
    # dim 1. Keep as many channels as the sample has rather than a hard-coded
    # 3, so samples that are not 3-channel work too (identical for 3 channels).
    mean_channels = sample.shape[1]

    # SA-Solver_data_prediction needs to solve an integral of the data prediction model.
    if self.config.algorithm_type in ["data_prediction"]:
        if self.config.prediction_type == "epsilon":
            # SA-Solver only needs the "mean" output.
            if learned_variance:
                model_output = model_output[:, :mean_channels]
            alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
            x0_pred = (sample - sigma_t * model_output) / alpha_t
        elif self.config.prediction_type == "sample":
            x0_pred = model_output
        elif self.config.prediction_type == "v_prediction":
            alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
            x0_pred = alpha_t * sample - sigma_t * model_output
        else:
            raise ValueError(
                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
                " `v_prediction` for the SASolverScheduler."
            )

        if self.config.thresholding:
            x0_pred = self._threshold_sample(x0_pred)

        return x0_pred

    # SA-Solver_noise_prediction needs to solve an integral of the noise prediction model.
    elif self.config.algorithm_type in ["noise_prediction"]:
        if self.config.prediction_type == "epsilon":
            # SA-Solver only needs the "mean" output.
            if learned_variance:
                epsilon = model_output[:, :mean_channels]
            else:
                epsilon = model_output
        elif self.config.prediction_type == "sample":
            alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
            epsilon = (sample - alpha_t * model_output) / sigma_t
        elif self.config.prediction_type == "v_prediction":
            alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
            epsilon = alpha_t * model_output + sigma_t * sample
        else:
            raise ValueError(
                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
                " `v_prediction` for the SASolverScheduler."
            )

        if self.config.thresholding:
            # Threshold in x0 space, then map the clipped x0 back to epsilon.
            alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
            x0_pred = (sample - sigma_t * epsilon) / alpha_t
            x0_pred = self._threshold_sample(x0_pred)
            epsilon = (sample - alpha_t * x0_pred) / sigma_t

        return epsilon
def lagrange_polynomial_coefficient(self, order, lambda_list):
    """
    Return the monomial coefficients of the Lagrange basis polynomials.

    Entry ``[i][j]`` of the result is the coefficient of ``x ** (order - j)``
    in the basis polynomial that belongs to node ``lambda_list[i]``.
    Supports ``order`` 0 to 3; ``lambda_list`` must hold ``order + 1`` nodes.
    """
    assert order in [0, 1, 2, 3]
    assert order == len(lambda_list) - 1

    if order == 0:
        return [[1]]

    def basis_row(node, others):
        # `others` keeps the remaining nodes in their original order so the
        # arithmetic matches the closed-form expressions term by term.
        if order == 1:
            (p,) = others
            return [1 / (node - p), -p / (node - p)]
        if order == 2:
            p, q = others
            denominator = (node - p) * (node - q)
            return [1 / denominator,
                    (-p - q) / denominator,
                    p * q / denominator]
        p, q, r = others
        denominator = (node - p) * (node - q) * (node - r)
        return [1 / denominator,
                (-p - q - r) / denominator,
                (p * q + p * r + q * r) / denominator,
                (-p * q * r) / denominator]

    nodes = [lambda_list[k] for k in range(order + 1)]
    return [basis_row(nodes[i], nodes[:i] + nodes[i + 1:]) for i in range(order + 1)]
int, sample: torch.FloatTensor, noise: torch.FloatTensor, order: int, tau: torch.FloatTensor, ) -> torch.FloatTensor: """ One step for the SA-Predictor. Args: model_output (`torch.FloatTensor`): The direct output from the learned diffusion model at the current timestep. prev_timestep (`int`): The previous discrete timestep in the diffusion chain. sample (`torch.FloatTensor`): A current instance of a sample created by the diffusion process. order (`int`): The order of SA-Predictor at this timestep. Returns: `torch.FloatTensor`: The sample tensor at the previous timestep. """ assert noise is not None timestep_list = self.timestep_list model_output_list = self.model_outputs s0, t = self.timestep_list[-1], prev_timestep lambda_t, lambda_s0 = self.lambda_t[t], self.lambda_t[s0] alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0] sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0] gradient_part = torch.zeros_like(sample) h = lambda_t - lambda_s0 lambda_list = [self.lambda_t[timestep_list[-(i + 1)]] for i in range(order)] gradient_coefficients = self.get_coefficients_fn(order, lambda_s0, lambda_t, lambda_list, tau) x = sample if self.predict_x0 and order == 2: gradient_coefficients[0] += 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * ( h ** 2 / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / ( (1 + tau ** 2) ** 2)) / (self.lambda_t[timestep_list[-1]] - self.lambda_t[ timestep_list[-2]]) gradient_coefficients[1] -= 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * ( h ** 2 / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / ( (1 + tau ** 2) ** 2)) / (self.lambda_t[timestep_list[-1]] - self.lambda_t[ timestep_list[-2]]) for i in range(order): if self.predict_x0: gradient_part += (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t) * gradient_coefficients[ i] * model_output_list[-(i + 1)] else: gradient_part += -(1 + tau ** 2) * alpha_t * gradient_coefficients[i] * model_output_list[-(i + 1)] if self.predict_x0: noise_part = sigma_t * 
def stochastic_adams_moulton_update(
        self,
        this_model_output: torch.FloatTensor,
        this_timestep: int,
        last_sample: torch.FloatTensor,
        last_noise: torch.FloatTensor,
        this_sample: torch.FloatTensor,
        order: int,
        tau: torch.FloatTensor,
) -> torch.FloatTensor:
    """
    One step for the SA-Corrector.

    Recomputes the sample at `this_timestep` starting from `last_sample`, using the model output at the
    current timestep together with the stored history in `self.model_outputs`.

    Args:
        this_model_output (`torch.FloatTensor`):
            The model outputs at `x_t`.
        this_timestep (`int`):
            The current timestep `t`.
        last_sample (`torch.FloatTensor`):
            The generated sample before the last predictor `x_{t-1}`.
        last_noise (`torch.FloatTensor`):
            The noise tensor scaled into the stochastic term; must not be `None`.
        this_sample (`torch.FloatTensor`):
            The generated sample after the last predictor `x_{t}`. Only used here as the shape/dtype template
            for the accumulator.
        order (`int`):
            The order of SA-Corrector at this step.
        tau:
            Stochasticity value; `step` passes `self.tau_func(self.timestep_list[-1])`.
            NOTE(review): annotated as a tensor, but the default `tau_func` returns a plain int — confirm.

    Returns:
        `torch.FloatTensor`:
            The corrected sample tensor at the current timestep.
    """

    assert last_noise is not None
    timestep_list = self.timestep_list
    model_output_list = self.model_outputs
    # s0: most recent stored timestep (start of the interval); t: timestep being corrected.
    s0, t = self.timestep_list[-1], this_timestep
    lambda_t, lambda_s0 = self.lambda_t[t], self.lambda_t[s0]
    alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
    sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
    gradient_part = torch.zeros_like(this_sample)
    # Step size in lambda space.
    h = lambda_t - lambda_s0
    # The corrector also uses the current timestep, so append it to the
    # history; lambda_list[0] is then the lambda of `this_timestep`.
    t_list = timestep_list + [this_timestep]
    lambda_list = [self.lambda_t[t_list[-(i + 1)]] for i in range(order)]

    # Same for the model outputs: the newest entry is the current output.
    model_prev_list = model_output_list + [this_model_output]

    gradient_coefficients = self.get_coefficients_fn(order, lambda_s0, lambda_t, lambda_list, tau)

    x = last_sample

    # Extra term applied only for second-order data prediction: the same
    # quantity is added to coefficient 0 and subtracted from coefficient 1.
    if self.predict_x0 and order == 2:
        gradient_coefficients[0] += 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
                h / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
                (1 + tau ** 2) ** 2 * h))
        gradient_coefficients[1] -= 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
                h / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
                (1 + tau ** 2) ** 2 * h))

    # Weighted sum of model outputs; coefficient i pairs with the i-th most
    # recent output (i == 0 is `this_model_output`).
    for i in range(order):
        if self.predict_x0:
            gradient_part += (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t) * gradient_coefficients[
                i] * model_prev_list[-(i + 1)]
        else:
            gradient_part += -(1 + tau ** 2) * alpha_t * gradient_coefficients[i] * model_prev_list[-(i + 1)]

    # Stochastic term built from `last_noise`.
    if self.predict_x0:
        noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * last_noise
    else:
        noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * last_noise

    # Combine the scaled previous sample with the model and noise terms.
    if self.predict_x0:
        x_t = torch.exp(-tau ** 2 * h) * (sigma_t / sigma_s0) * x + gradient_part + noise_part
    else:
        x_t = (alpha_t / alpha_s0) * x + gradient_part + noise_part

    # Return in the dtype of `last_sample`.
    x_t = x_t.to(x.dtype)
    return x_t
This function propagates the sample with the SA-Solver. Args: model_output (`torch.FloatTensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. sample (`torch.FloatTensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`. Returns: [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`: If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a tuple is returned where the first element is the sample tensor. """ if self.num_inference_steps is None: raise ValueError( "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" ) if isinstance(timestep, torch.Tensor): timestep = timestep.to(self.timesteps.device) step_index = (self.timesteps == timestep).nonzero() if len(step_index) == 0: step_index = len(self.timesteps) - 1 else: step_index = step_index.item() use_corrector = ( step_index > 0 and self.last_sample is not None ) model_output_convert = self.convert_model_output(model_output, timestep, sample) if use_corrector: current_tau = self.tau_func(self.timestep_list[-1]) sample = self.stochastic_adams_moulton_update( this_model_output=model_output_convert, this_timestep=timestep, last_sample=self.last_sample, last_noise=self.last_noise, this_sample=sample, order=self.this_corrector_order, tau=current_tau, ) prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1] for i in range(max(self.config.predictor_order, self.config.corrector_order - 1) - 1): self.model_outputs[i] = self.model_outputs[i + 1] self.timestep_list[i] = self.timestep_list[i + 1] self.model_outputs[-1] = model_output_convert self.timestep_list[-1] = timestep noise = randn_tensor( model_output.shape, 
def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor:
    """
    Return `sample` untouched.

    This scheduler applies no timestep-dependent input scaling; the method exists so the scheduler can be
    swapped with schedulers that do need to scale the denoising model input. Any extra positional or keyword
    arguments are accepted and ignored.

    Args:
        sample (`torch.FloatTensor`):
            The input sample.

    Returns:
        `torch.FloatTensor`:
            The very same object that was passed in.
    """
    return sample
def load_checkpoint(checkpoint,
                    model,
                    model_ema=None,
                    optimizer=None,
                    lr_scheduler=None,
                    load_ema=False,
                    resume_optimizer=True,
                    resume_lr_scheduler=True
                    ):
    """Load a checkpoint file into ``model`` and, optionally, the EMA model, optimizer and lr scheduler.

    Args:
        checkpoint (str): Path of the checkpoint file (loaded with ``map_location="cpu"``).
        model: Module receiving the weights (non-strict load).
        model_ema: Optional module receiving ``checkpoint['state_dict_ema']`` (non-strict load).
        optimizer: Optional optimizer; restored from ``checkpoint['optimizer']`` when
            ``resume_optimizer`` is true. Passing an optimizer also switches the return value to the
            3-tuple form, regardless of ``resume_optimizer``.
        lr_scheduler: Optional scheduler; restored from ``checkpoint['scheduler']`` when
            ``resume_lr_scheduler`` is true.
        load_ema (bool): Load ``model`` from the EMA weights instead of the regular weights.
        resume_optimizer (bool): Whether to restore the optimizer state.
        resume_lr_scheduler (bool): Whether to restore the scheduler state.

    Returns:
        ``(epoch, missing, unexpect)`` when ``optimizer`` is given, otherwise ``(missing, unexpect)``.
        ``missing`` / ``unexpect`` are the key lists reported by ``model.load_state_dict``.

    Raises:
        ValueError: If an optimizer is given, the checkpoint stores no ``'epoch'`` entry and the epoch
            cannot be parsed from the file name.
    """
    assert isinstance(checkpoint, str)
    ckpt_file = checkpoint
    checkpoint = torch.load(ckpt_file, map_location="cpu")

    # A checkpoint is either a dict wrapping the weights under 'state_dict' or the bare state dict
    # itself; resolve that once so the pos_embed clean-up below works for both layouts.
    model_state = checkpoint.get('state_dict', checkpoint)

    # Drop the first positional-embedding entry that is present (from the regular and the EMA weights).
    for key in ('pos_embed', 'base_model.pos_embed', 'model.pos_embed'):
        if key in model_state:
            del model_state[key]
            if 'state_dict_ema' in checkpoint and key in checkpoint['state_dict_ema']:
                del checkpoint['state_dict_ema'][key]
            break

    state_dict = checkpoint['state_dict_ema'] if load_ema else model_state
    missing, unexpect = model.load_state_dict(state_dict, strict=False)

    if model_ema is not None:
        model_ema.load_state_dict(checkpoint['state_dict_ema'], strict=False)
    if optimizer is not None and resume_optimizer:
        optimizer.load_state_dict(checkpoint['optimizer'])
    if lr_scheduler is not None and resume_lr_scheduler:
        lr_scheduler.load_state_dict(checkpoint['scheduler'])

    logger = get_root_logger()
    if optimizer is not None:
        if 'epoch' in checkpoint:
            epoch = checkpoint['epoch']
        else:
            # Fall back to the file name only when needed, and return the captured number as an int
            # (the previous code returned the first character of the whole path).
            matched = re.search(r'epoch_(\d+)', os.path.basename(ckpt_file))
            if matched is None:
                raise ValueError(
                    f"Cannot determine the epoch of {ckpt_file}: the checkpoint has no 'epoch' entry "
                    f"and the file name does not contain 'epoch_<number>'.")
            epoch = int(matched.group(1))
        logger.info(f'Resume checkpoint of epoch {epoch} from {ckpt_file}. Load ema: {load_ema}, '
                    f'resume optimizer: {resume_optimizer}, resume lr scheduler: {resume_lr_scheduler}.')
        return epoch, missing, unexpect
    logger.info(f'Load checkpoint from {ckpt_file}. Load ema: {load_ema}.')
    return missing, unexpect
def __iter__(self) -> Sequence[int]:
    """Yield batches (lists of dataset indices) whose samples share the closest aspect ratio.

    Each index from ``self.sampler`` is assigned to the predefined aspect ratio closest to its
    ``height / width``; indices whose ratio is not in ``self.current_available_bucket_keys`` are
    skipped. A batch is emitted as soon as a bucket holds ``batch_size`` indices. After the sampler
    is exhausted, the partially filled buckets are emitted (unless ``drop_last``) and every bucket
    is emptied, so no index is carried over into the next iteration.
    """
    for idx in self.sampler:
        data_info = self.dataset.get_data_info(idx)
        height, width = data_info['height'], data_info['width']
        ratio = height / width
        # find the closest aspect ratio
        closest_ratio = min(self.aspect_ratios.keys(), key=lambda r: abs(float(r) - ratio))
        if closest_ratio not in self.current_available_bucket_keys:
            continue
        bucket = self._aspect_ratio_buckets[closest_ratio]
        bucket.append(idx)
        # yield a batch of indices in the same aspect ratio group
        if len(bucket) == self.batch_size:
            yield bucket[:]
            del bucket[:]

    # yield the rest data and reset the buckets
    for bucket in self._aspect_ratio_buckets.values():
        while bucket:
            if len(bucket) <= self.batch_size:
                if not self.drop_last:
                    yield bucket[:]
                # Clear the stored list in place. Rebinding the loop variable (as the previous code
                # did) left the indices in self._aspect_ratio_buckets for the next epoch.
                del bucket[:]
            else:
                yield bucket[:self.batch_size]
                del bucket[:self.batch_size]
self.dataset.get_data_info(idx) height, width = data_info['height'], data_info['width'] ratio = height / width closest_ratio = float(min(self.aspect_ratios.keys(), key=lambda r: abs(float(r) - ratio))) if closest_ratio not in self.all_available_keys: continue if self._aspect_ratio_count[closest_ratio] < self.ratio_nums_gt[closest_ratio]: self._aspect_ratio_count[closest_ratio] += 1 self._aspect_ratio_buckets[closest_ratio].append(idx) self.original_buckets[closest_ratio].append(idx) # Save the original samples for each bucket if not self.current_available_bucket_keys: self.current_available_bucket_keys, self.exhausted_bucket_keys = self.exhausted_bucket_keys, [] if closest_ratio not in self.current_available_bucket_keys: continue key = closest_ratio bucket = self._aspect_ratio_buckets[key] if len(bucket) == self.batch_size: yield bucket[:self.batch_size] del bucket[:self.batch_size] i += 1 self.exhausted_bucket_keys.append(key) self.current_available_bucket_keys.remove(key) for _ in range(self.total_batches - i): key = choice(self.all_available_keys) bucket = self._aspect_ratio_buckets[key] if len(bucket) >= self.batch_size: yield bucket[:self.batch_size] del bucket[:self.batch_size] # If a bucket is exhausted if not bucket: self._aspect_ratio_buckets[key] = deepcopy(self.original_buckets[key][:]) shuffle(self._aspect_ratio_buckets[key]) else: self._aspect_ratio_buckets[key] = deepcopy(self.original_buckets[key][:]) shuffle(self._aspect_ratio_buckets[key]) ================================================ FILE: PixArt-alpha-ToCa/diffusion/utils/dist_utils.py ================================================ """ This file contains primitives for multi-gpu communication. This is useful when doing distributed training. 
""" import os import pickle import shutil import gc import mmcv import torch import torch.distributed as dist from mmcv.runner import get_dist_info def is_distributed(): return get_world_size() > 1 def get_world_size(): if not dist.is_available(): return 1 return dist.get_world_size() if dist.is_initialized() else 1 def get_rank(): if not dist.is_available(): return 0 return dist.get_rank() if dist.is_initialized() else 0 def get_local_rank(): if not dist.is_available(): return 0 return int(os.getenv('LOCAL_RANK', 0)) if dist.is_initialized() else 0 def is_master(): return get_rank() == 0 def is_local_master(): return get_local_rank() == 0 def get_local_proc_group(group_size=8): world_size = get_world_size() if world_size <= group_size or group_size == 1: return None assert world_size % group_size == 0, f'world size ({world_size}) should be evenly divided by group size ({group_size}).' process_groups = getattr(get_local_proc_group, 'process_groups', {}) if group_size not in process_groups: num_groups = dist.get_world_size() // group_size groups = [list(range(i * group_size, (i + 1) * group_size)) for i in range(num_groups)] process_groups.update({group_size: [torch.distributed.new_group(group) for group in groups]}) get_local_proc_group.process_groups = process_groups group_idx = get_rank() // group_size return get_local_proc_group.process_groups.get(group_size)[group_idx] def synchronize(): """ Helper function to synchronize (barrier) among all processes when using distributed training """ if not dist.is_available(): return if not dist.is_initialized(): return world_size = dist.get_world_size() if world_size == 1: return dist.barrier() def all_gather(data): """ Run all_gather on arbitrary picklable data (not necessarily tensors) Args: data: any picklable object Returns: list[data]: list of data gathered from each rank """ to_device = torch.device("cuda") # to_device = torch.device("cpu") world_size = get_world_size() if world_size == 1: return [data] # serialized 
def reduce_dict(input_dict, average=True):
    """
    Reduce the values of ``input_dict`` across all processes onto rank 0.

    Args:
        input_dict (dict): all the values will be reduced
        average (bool): whether to do average or sum

    Returns:
        A dict with the same fields as ``input_dict``. With fewer than two processes the input dict
        itself is returned unchanged. Otherwise the reduction is delegated to
        ``_extracted_from_reduce_dict_14`` (run without gradient tracking), which reduces towards
        rank 0 only, so only the process with rank 0 holds the summed/averaged results.
    """
    num_procs = get_world_size()
    if num_procs >= 2:
        with torch.no_grad():
            return _extracted_from_reduce_dict_14(input_dict, average, num_procs)
    # Single process: nothing to reduce.
    return input_dict
class GatherLayer(torch.autograd.Function):
    '''Autograd-aware gather of per-process tensors whose first dimension may differ between processes.

    ``forward`` returns a tuple with one tensor per process; ``backward`` hands the local process the
    slice of the (summed) incoming gradients that corresponds to its own rows.
    '''

    @staticmethod
    def forward(ctx, input):
        ctx.save_for_backward(input)
        # Share every process' row count first so the padded gather can be trimmed per process.
        local_count = torch.tensor(input.size(0), dtype=torch.long, device=input.device)
        ctx.num_samples_list = all_gather_tensor(local_count)
        return tuple(gather_difflen_tensor(input, ctx.num_samples_list, concat=False))

    @staticmethod
    def backward(ctx, *grads):
        # `grads` holds one gradient per element of the tuple returned by forward.
        (local_input,) = ctx.saved_tensors
        counts = ctx.num_samples_list
        rank = get_rank()
        # Row range owned by this process inside the concatenated gradient.
        start = sum(counts[:rank])
        end = sum(counts[:rank + 1])
        stacked = torch.cat(grads)
        if is_distributed():
            dist.all_reduce(stacked)
        grad_out = torch.zeros_like(local_input)
        grad_out[:] = stacked[start:end]
        return grad_out, None, None
def gather_layer_with_group(data, group=None, group_size=None):
    """Gather ``data`` from the processes of ``group`` with gradient support.

    Args:
        data: Tensor contributed by the local process.
        group: Process group forwarded to the gather; ``None`` is passed through as-is.
        group_size (int, optional): Number of processes in ``group``. Defaults to the world size.

    Returns:
        The tuple produced by ``GatherLayerWithGroup``: one tensor per process of the group.
    """
    if group_size is None:
        group_size = get_world_size()
    # GatherLayerWithGroup.forward takes (input, group, group_size). The previously used GatherLayer
    # only accepts `input`, so passing three arguments to its `apply` could not work.
    return GatherLayerWithGroup.apply(data, group, group_size)
def get_root_logger(log_file=None, log_level=logging.INFO, name='PixArt'):
    """Get root logger.

    Thin wrapper around ``get_logger`` that supplies the project's default logger name. When no
    ``log_file`` is given, the platform's null device is passed instead, so a file target is always
    handed to ``get_logger`` but nothing is persisted.

    Args:
        log_file (str, optional): File path of log.
            Defaults to None.
        log_level (int, optional): The level of logger.
            Defaults to logging.INFO.
        name (str): logger name
    Returns:
        :obj:`logging.Logger`: The obtained logger
    """
    if log_file is None:
        # os.devnull is '/dev/null' on POSIX and 'nul' on Windows; the hard-coded POSIX path
        # cannot be opened on Windows.
        log_file = os.devnull
    return get_logger(name=name, log_file=log_file, log_level=log_level)
def rename_file_with_creation_time(file_path):
    """Rename ``file_path`` in place by appending a timestamp to its base name.

    The timestamp comes from ``os.path.getctime`` and is formatted as ``YYYY-mm-dd_HH-MM-SS``; it is
    inserted between the file's stem and its extension, and the file stays in the same directory.
    The new path is printed. Nothing is returned.

    NOTE(review): on Unix ``getctime`` reports the last metadata change rather than the creation
    time - confirm this is acceptable for the intended use.
    """
    # Timestamp taken from the file's ctime.
    stamp = datetime.fromtimestamp(os.path.getctime(file_path)).strftime('%Y-%m-%d_%H-%M-%S')

    # Build the new file name: <stem>_<timestamp><ext>, next to the original file.
    parent, base = os.path.split(file_path)
    stem, suffix = os.path.splitext(base)
    new_file_path = os.path.join(parent, f"{stem}_{stamp}{suffix}")

    # Rename the file.
    os.rename(file_path, new_file_path)
    print(f"File renamed to: {new_file_path}")
def get_cosine_decay_to_constant_with_warmup(optimizer: Optimizer,
                                             num_warmup_steps: int,
                                             num_training_steps: int,
                                             final_lr: float = 0.0,
                                             num_decay: float = 0.667,
                                             num_cycles: float = 0.5,
                                             last_epoch: int = -1
                                             ):
    """
    Build a schedule with linear warmup, cosine decay, then a constant tail.

    The returned multiplier rises linearly from 0 to 1 over the warmup steps, follows a cosine curve
    from 1 down to `final_lr` until step `int(num_training_steps * num_decay)`, and stays at
    `final_lr` afterwards.

    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        num_training_steps (`int`):
            The number of total training steps.
        final_lr (`float`):
            The constant multiplier used once the cosine decay has finished.
        num_decay (`float`):
            Fraction of `num_training_steps` at which the cosine decay ends.
        num_cycles (`float`):
            Number of cosine waves covered during the decay phase; the default 0.5 is a half wave.
        last_epoch (`int`, *optional*, defaults to -1):
            The index of the last epoch when resuming training.

    Return:
        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """
    # None of these depend on the current step, so compute them once.
    decay_end = int(num_training_steps * num_decay)
    warmup_span = float(max(1, num_warmup_steps))
    decay_span = float(max(1, decay_end - num_warmup_steps))

    def lr_lambda(current_step):
        # Linear warmup.
        if current_step < num_warmup_steps:
            return float(current_step) / warmup_span
        # Constant tail after the decay window.
        if current_step > decay_end:
            return final_lr
        # Cosine decay, rescaled so it ends at `final_lr` instead of 0.
        progress = float(current_step - num_warmup_steps) / decay_span
        cosine = 0.5 * (1.0 + math.cos(math.pi * num_cycles * 2.0 * progress))
        return max(0.0, cosine) * (1 - final_lr) + final_lr

    return LambdaLR(optimizer, lr_lambda, last_epoch)
def set_random_seed(seed, deterministic=False):
    """Seed every random number generator used by the project.

    Seeds, in this order, Python's `random`, NumPy, torch (CPU) and all torch CUDA devices.

    Args:
        seed (int): Seed to be used.
        deterministic (bool): Whether to set the deterministic option for
            CUDNN backend, i.e., set `torch.backends.cudnn.deterministic`
            to True and `torch.backends.cudnn.benchmark` to False.
            Default: False.
    """
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for seeder in seeders:
        seeder(seed)

    if not deterministic:
        return
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
Specific batch absolute min/max tracing without detection

    Mode 1: Underflow/overflow detection

    To activate the underflow/overflow detection, initialize the object with the model :

    ```python
    debug_overflow = DebugUnderflowOverflow(model)
    ```

    then run the training as normal and if `nan` or `inf` gets detected in at least one of the weight, input or
    output elements this module will throw an exception and will print `max_frames_to_save` frames that lead to this
    event, each frame reporting

    1. the fully qualified module name plus the class name whose `forward` was run
    2. the absolute min and max value of all elements for each module weights, and the inputs and output

    For example, here is the header and the last few frames in detection report for `google/mt5-small` run in fp16
    mixed precision :

    ```
    Detected inf/nan during batch_number=0
    Last 21 forward frames:
    abs min  abs max  metadata
    [...]
                      encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
    2.17e-07 4.50e+00 weight
    1.79e-06 4.65e+00 input[0]
    2.68e-06 3.70e+01 output
                      encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
    8.08e-07 2.66e+01 weight
    1.79e-06 4.65e+00 input[0]
    1.27e-04 2.37e+02 output
                      encoder.block.2.layer.1.DenseReluDense.wo Linear
    1.01e-06 6.44e+00 weight
    0.00e+00 9.74e+03 input[0]
    3.18e-04 6.27e+04 output
                      encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
    1.79e-06 4.65e+00 input[0]
    3.18e-04 6.27e+04 output
                      encoder.block.2.layer.1.dropout Dropout
    3.18e-04 6.27e+04 input[0]
    0.00e+00      inf output
    ```

    You can see here, that `T5DenseGatedGeluDense.forward` resulted in output activations, whose absolute max value
    was around 62.7K, which is very close to fp16's top limit of 64K. In the next frame we have `Dropout` which
    renormalizes the weights, after it zeroed some of the elements, which pushes the absolute max value to more than
    64K, and we get an overflow.

    As you can see it's the previous frames that we need to look into when the numbers start going into very large for
    fp16 numbers.

    The tracking is done in a forward hook, which gets invoked immediately after `forward` has completed.

    By default the last 21 frames are printed. You can change the default to adjust for your needs. For example :

    ```python
    debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100)
    ```

    To validate that you have set up this debugging feature correctly, and you intend to use it in a training that
    may take hours to complete, first run it with normal tracing enabled for one of a few batches as explained in
    the next section.

    Mode 2. Specific batch absolute min/max tracing without detection

    The second work mode is per-batch tracing with the underflow/overflow detection feature turned off.

    Let's say you want to watch the absolute min and max values for all the ingredients of each `forward` call of a
    given batch, and only do that for batches 1 and 3. Then you instantiate this class as :

    ```python
    debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3])
    ```

    And now full batches 1 and 3 will be traced using the same format as explained above. Batches are 0-indexed.

    This is helpful if you know that the program starts misbehaving after a certain batch number, so you can
    fast-forward right to that area.

    Early stopping:

    You can also specify the batch number after which to stop the training, with :

    ```python
    debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3], abort_after_batch_num=3)
    ```

    This feature is mainly useful in the tracing mode, but you can use it for any mode.

    **Performance**:

    As this module measures absolute `min`/`max` of each weight of the model on every forward it'll slow the training
    down. Therefore remember to turn it off once the debugging needs have been met.

    Args:
        model (`nn.Module`):
            The model to debug.
max_frames_to_save (`int`, *optional*, defaults to 21): How many frames back to record trace_batch_nums(`List[int]`, *optional*, defaults to `[]`): Which batch numbers to trace (turns detection off) abort_after_batch_num (`int``, *optional*): Whether to abort after a certain batch number has finished """ def __init__(self, model, max_frames_to_save=21, trace_batch_nums=None, abort_after_batch_num=None): if trace_batch_nums is None: trace_batch_nums = [] self.model = model self.trace_batch_nums = trace_batch_nums self.abort_after_batch_num = abort_after_batch_num # keep a LIFO buffer of frames to dump as soon as inf/nan is encountered to give context to the problem emergence self.frames = collections.deque([], max_frames_to_save) self.frame = [] self.batch_number = 0 self.total_calls = 0 self.detected_overflow = False self.prefix = " " self.analyse_model() self.register_forward_hook() def save_frame(self, frame=None): if frame is not None: self.expand_frame(frame) self.frames.append("\n".join(self.frame)) self.frame = [] # start a new frame def expand_frame(self, line): self.frame.append(line) def trace_frames(self): print("\n".join(self.frames)) self.frames = [] def reset_saved_frames(self): self.frames = [] def dump_saved_frames(self): print(f"\nDetected inf/nan during batch_number={self.batch_number} " f"Last {len(self.frames)} forward frames:" f"{'abs min':8} {'abs max':8} metadata" f"'\n'.join(self.frames)" f"\n\n") self.frames = [] def analyse_model(self): # extract the fully qualified module names, to be able to report at run time. 
# e.g.:
        # encoder.block.2.layer.0.SelfAttention.o
        #
        # for shared weights only the first shared module name will be registered
        self.module_names = {m: name for name, m in self.model.named_modules()}
        # self.longest_module_name = max(len(v) for v in self.module_names.values())

    def analyse_variable(self, var, ctx):
        """Add one report line for `var` to the current frame and flag inf/nan detection.

        Args:
            var: A tensor, `None`, or any other object (reported as "not a tensor").
            ctx: Label printed next to the values, e.g. a parameter name or `input[0]`.
        """
        if torch.is_tensor(var):
            self.expand_frame(self.get_abs_min_max(var, ctx))
            if self.detect_overflow(var, ctx):
                self.detected_overflow = True
        elif var is None:
            self.expand_frame(f"{'None':>17} {ctx}")
        else:
            self.expand_frame(f"{'not a tensor':>17} {ctx}")

    def batch_start_frame(self):
        """Add the "Starting batch" banner and the column header to the current frame."""
        self.expand_frame(f"\n\n{self.prefix} *** Starting batch number={self.batch_number} ***")
        self.expand_frame(f"{'abs min':8} {'abs max':8} metadata")

    def batch_end_frame(self):
        """Add the "Finished batch" banner; reports `batch_number - 1`, i.e. it expects the counter to be bumped already."""
        self.expand_frame(f"{self.prefix} *** Finished batch number={self.batch_number - 1} ***\n\n")

    def create_frame(self, module, input, output):
        """Build and save the frame for one `forward` call: module header, own parameters, inputs, outputs."""
        self.expand_frame(f"{self.prefix} {self.module_names[module]} {module.__class__.__name__}")

        # params
        # (recurse=False: only this module's own parameters; children report in their own frames)
        for name, p in module.named_parameters(recurse=False):
            self.analyse_variable(p, name)

        # inputs
        if isinstance(input, tuple):
            for i, x in enumerate(input):
                self.analyse_variable(x, f"input[{i}]")
        else:
            self.analyse_variable(input, "input")

        # outputs
        if isinstance(output, tuple):
            for i, x in enumerate(output):
                # possibly a tuple of tuples
                if isinstance(x, tuple):
                    for j, y in enumerate(x):
                        self.analyse_variable(y, f"output[{i}][{j}]")
                else:
                    self.analyse_variable(x, f"output[{i}]")
        else:
            self.analyse_variable(output, "output")

        self.save_frame()

    def register_forward_hook(self):
        """Install `forward_hook` on the model and on every sub-module."""
        self.model.apply(self._register_forward_hook)

    def _register_forward_hook(self, module):
        """Install `forward_hook` on a single module (callback for `Module.apply`)."""
        module.register_forward_hook(self.forward_hook)

    def forward_hook(self, module, input, output):
        """Record a frame for this `forward` call, then trace, abort on inf/nan, or stop after the requested batch.

        Raises:
            ValueError: when inf/nan was detected outside trace mode, or when
                `abort_after_batch_num` has been exceeded.
        """
        # - input is a tuple of packed inputs (could be non-Tensors)
        # - output could be a Tensor or a tuple of Tensors and non-Tensors

        last_frame_of_batch = False

        trace_mode = self.batch_number in self.trace_batch_nums
        if trace_mode:
            self.reset_saved_frames()

        if self.total_calls == 0:
            self.batch_start_frame()
        self.total_calls += 1

        # count batch numbers - the very first forward hook of the batch will be called when the
        # batch completes - i.e. it gets called very last - we know this batch has finished
        if module == self.model:
            self.batch_number += 1
            last_frame_of_batch = True

        self.create_frame(module, input, output)

        # if last_frame_of_batch:
        #     self.batch_end_frame()

        if trace_mode:
            self.trace_frames()

        if last_frame_of_batch:
            self.batch_start_frame()

        # Detection only aborts outside trace mode (`trace_mode` was computed before the counter bump above).
        if self.detected_overflow and not trace_mode:
            self.dump_saved_frames()

            # now we can abort, as it's pointless to continue running
            raise ValueError(
                "DebugUnderflowOverflow: inf/nan detected, aborting as there is no point running further. "
                "Please scroll up above this traceback to see the activation values prior to this event."
            )

        # abort after certain batch if requested to do so
        if self.abort_after_batch_num is not None and self.batch_number > self.abort_after_batch_num:
            raise ValueError(
                f"DebugUnderflowOverflow: aborting after {self.batch_number} batches due to `abort_after_batch_num={self.abort_after_batch_num}` arg"
            )

    @staticmethod
    def get_abs_min_max(var, ctx):
        """Return one report line with the absolute min and max of `var`, followed by `ctx`."""
        abs_var = var.abs()
        return f"{abs_var.min():8.2e} {abs_var.max():8.2e} {ctx}"

    @staticmethod
    def detect_overflow(var, ctx):
        """
        Report whether the tensor contains any `nan` or `inf` entries.

        This is useful for detecting overflows/underflows and best to call right after the function that did some math that
        modified the tensor in question.

        This function contains a few other helper features that you can enable and tweak directly if you want to track
        various other things.
Args:
            var: the tensor variable to check
            ctx: the message to print as a context

        Return:
            `True` if `inf` or `nan` was detected, `False` otherwise
        """
        detected = False
        if torch.isnan(var).any().item():
            detected = True
            print(f"{ctx} has nans")
        if torch.isinf(var).any().item():
            detected = True
            print(f"{ctx} has infs")
        # NOTE(review): beyond nan/inf, float32 tensors are also reported once any magnitude
        # reaches 65535 - presumably a guard for values that would not fit in fp16; confirm intent.
        if var.dtype == torch.float32 and torch.ge(var.abs(), 65535).any().item():
            detected = True
            print(f"{ctx} has overflow values {var.abs().max().item()}.")
        return detected
================================================
FILE: PixArt-alpha-ToCa/diffusion/utils/optimizer.py
================================================
import math

from mmcv import Config
from mmcv.runner import build_optimizer as mm_build_optimizer, OPTIMIZER_BUILDERS, DefaultOptimizerConstructor, \
    OPTIMIZERS
from mmcv.utils import _BatchNorm, _InstanceNorm
from torch.nn import GroupNorm, LayerNorm

from .logger import get_root_logger

from typing import Tuple, Optional, Callable

import torch
from torch.optim.optimizer import Optimizer


def auto_scale_lr(effective_bs, optimizer_cfg, rule='linear', base_batch_size=256):
    """Scale `optimizer_cfg['lr']` in place according to the effective batch size.

    Args:
        effective_bs: Effective batch size of the run.
        optimizer_cfg: Mapping with an `lr` entry; the entry is multiplied by the ratio.
        rule: `'linear'` (ratio = effective_bs / base_batch_size) or `'sqrt'`
            (square root of that quotient).
        base_batch_size: Batch size the configured learning rate refers to.

    Returns:
        The ratio the learning rate was multiplied by.
    """
    assert rule in ['linear', 'sqrt']
    logger = get_root_logger()
    # scale by the effective batch size relative to the base batch size
    if rule == 'sqrt':
        scale_ratio = math.sqrt(effective_bs / base_batch_size)
    elif rule == 'linear':
        scale_ratio = effective_bs / base_batch_size
    optimizer_cfg['lr'] *= scale_ratio
    logger.info(f'Automatically adapt lr to {optimizer_cfg["lr"]:.7f} (using {rule} scaling rule).')
    return scale_ratio


@OPTIMIZER_BUILDERS.register_module()
class MyOptimizerConstructor(DefaultOptimizerConstructor):

    def add_params(self, params, module, prefix='', is_dcn_module=None):
        """Add all parameters of module to the params list.

        The parameters of the given module will be added to the list of param
        groups, with specific rules defined by paramwise_cfg.

        Args:
            params (list[dict]): A list of param groups, it will be modified
                in place.
            module (nn.Module): The module to be added.
prefix (str): The prefix of the module """ # get param-wise options custom_keys = self.paramwise_cfg.get('custom_keys', {}) # first sort with alphabet order and then sort with reversed len of str # sorted_keys = sorted(sorted(custom_keys.keys()), key=len, reverse=True) bias_lr_mult = self.paramwise_cfg.get('bias_lr_mult', 1.) bias_decay_mult = self.paramwise_cfg.get('bias_decay_mult', 1.) norm_decay_mult = self.paramwise_cfg.get('norm_decay_mult', 1.) bypass_duplicate = self.paramwise_cfg.get('bypass_duplicate', False) # special rules for norm layers and depth-wise conv layers is_norm = isinstance(module, (_BatchNorm, _InstanceNorm, GroupNorm, LayerNorm)) for name, param in module.named_parameters(recurse=False): base_lr = self.base_lr if name == 'bias' and not is_norm and not is_dcn_module: base_lr *= bias_lr_mult # apply weight decay policies base_wd = self.base_wd # norm decay if is_norm: if self.base_wd is not None: base_wd *= norm_decay_mult elif name == 'bias' and not is_dcn_module: if self.base_wd is not None: # TODO: current bias_decay_mult will have affect on DCN base_wd *= bias_decay_mult param_group = {'params': [param]} if not param.requires_grad: param_group['requires_grad'] = False params.append(param_group) continue if bypass_duplicate and self._is_in(param_group, params): logger = get_root_logger() logger.warn(f'{prefix} is duplicate. 
It is skipped since ' f'bypass_duplicate={bypass_duplicate}') continue # if the parameter match one of the custom keys, ignore other rules is_custom = False for key in custom_keys: scope, key_name = key if isinstance(key, tuple) else (None, key) if scope is not None and scope not in f'{prefix}': continue if key_name in f'{prefix}.{name}': is_custom = True if 'lr_mult' in custom_keys[key]: # if 'base_classes' in f'{prefix}.{name}' or 'attn_base' in f'{prefix}.{name}': # param_group['lr'] = self.base_lr # else: param_group['lr'] = self.base_lr * custom_keys[key]['lr_mult'] elif 'lr' not in param_group: param_group['lr'] = base_lr if self.base_wd is not None: if 'decay_mult' in custom_keys[key]: param_group['weight_decay'] = self.base_wd * custom_keys[key]['decay_mult'] elif 'weight_decay' not in param_group: param_group['weight_decay'] = base_wd if not is_custom: # bias_lr_mult affects all bias parameters # except for norm.bias dcn.conv_offset.bias if base_lr != self.base_lr: param_group['lr'] = base_lr if base_wd != self.base_wd: param_group['weight_decay'] = base_wd params.append(param_group) for child_name, child_mod in module.named_children(): child_prefix = f'{prefix}.{child_name}' if prefix else child_name self.add_params( params, child_mod, prefix=child_prefix, is_dcn_module=is_dcn_module) def build_optimizer(model, optimizer_cfg): # default parameter-wise config logger = get_root_logger() if hasattr(model, 'module'): model = model.module # set optimizer constructor optimizer_cfg.setdefault('constructor', 'MyOptimizerConstructor') # parameter-wise setting: cancel weight decay for some specific modules custom_keys = dict() for name, module in model.named_modules(): if hasattr(module, 'zero_weight_decay'): custom_keys |= { (name, key): dict(decay_mult=0) for key in module.zero_weight_decay } paramwise_cfg = Config(dict(cfg=dict(custom_keys=custom_keys))) if given_cfg := optimizer_cfg.get('paramwise_cfg'): paramwise_cfg.merge_from_dict(dict(cfg=given_cfg)) 
optimizer_cfg['paramwise_cfg'] = paramwise_cfg.cfg # build optimizer optimizer = mm_build_optimizer(model, optimizer_cfg) weight_decay_groups = dict() lr_groups = dict() for group in optimizer.param_groups: if not group.get('requires_grad', True): continue lr_groups.setdefault(group['lr'], []).append(group) weight_decay_groups.setdefault(group['weight_decay'], []).append(group) learnable_count, fix_count = 0, 0 for p in model.parameters(): if p.requires_grad: learnable_count += 1 else: fix_count += 1 fix_info = f"{learnable_count} are learnable, {fix_count} are fix" lr_info = "Lr group: " + ", ".join([f'{len(group)} params with lr {lr:.5f}' for lr, group in lr_groups.items()]) wd_info = "Weight decay group: " + ", ".join( [f'{len(group)} params with weight decay {wd}' for wd, group in weight_decay_groups.items()]) opt_info = f"Optimizer: total {len(optimizer.param_groups)} param groups, {fix_info}. {lr_info}; {wd_info}." logger.info(opt_info) return optimizer @OPTIMIZERS.register_module() class Lion(Optimizer): def __init__( self, params, lr: float = 1e-4, betas: Tuple[float, float] = (0.9, 0.99), weight_decay: float = 0.0, ): assert lr > 0. assert all(0. <= beta <= 1. 
for beta in betas) defaults = dict(lr=lr, betas=betas, weight_decay=weight_decay) super().__init__(params, defaults) @staticmethod def update_fn(p, grad, exp_avg, lr, wd, beta1, beta2): # stepweight decay p.data.mul_(1 - lr * wd) # weight update update = exp_avg.clone().lerp_(grad, 1 - beta1).sign_() p.add_(update, alpha=-lr) # decay the momentum running average coefficient exp_avg.lerp_(grad, 1 - beta2) @staticmethod def exists(val): return val is not None @torch.no_grad() def step( self, closure: Optional[Callable] = None ): loss = None if self.exists(closure): with torch.enable_grad(): loss = closure() for group in self.param_groups: for p in filter(lambda p: self.exists(p.grad), group['params']): grad, lr, wd, beta1, beta2, state = p.grad, group['lr'], group['weight_decay'], *group['betas'], \ self.state[p] # init state - exponential moving average of gradient values if len(state) == 0: state['exp_avg'] = torch.zeros_like(p) exp_avg = state['exp_avg'] self.update_fn( p, grad, exp_avg, lr, wd, beta1, beta2 ) return loss ================================================ FILE: PixArt-alpha-ToCa/docker-compose.yml ================================================ version: "3.8" services: pixart: container_name: pixart image: pixart:latest build: context: . 
ports: - 12345:12345 environment: - APP_CONTEXT=1024 #1024, 512, LCM tmpfs: - /tmp volumes: - ./docker/cache/gradio:/workspace/gradio_cached_examples/30:rw - ./docker/cache/huggingface:/root/.cache/huggingface:rw deploy: resources: reservations: devices: - driver: nvidia device_ids: ['0'] capabilities: [gpu]
================================================
FILE: PixArt-alpha-ToCa/docker-entrypoint.sh
================================================
#!/usr/bin/env bash
set -Eeuo pipefail

# `set -u` aborts the script on any unset variable, so give APP_CONTEXT an
# empty fallback first; otherwise the "defaulting to 1024" branch below could
# never run when the variable is missing from the environment.
APP_CONTEXT="${APP_CONTEXT:-}"

# Check if APP_CONTEXT matches one of the specific values
if [ "$APP_CONTEXT" = "1024" ]; then
    echo "APP_CONTEXT is 1024"
    /usr/bin/python /workspace/app/app.py "$@"
elif [ "$APP_CONTEXT" = "512" ]; then
    echo "APP_CONTEXT is 512"
    /usr/bin/python /workspace/app/app_512.py "$@"
elif [ "$APP_CONTEXT" = "LCM" ]; then
    echo "APP_CONTEXT is LCM"
    /usr/bin/python /workspace/app/app_lcm.py "$@"
else
    # Unset, empty, or unrecognised value: fall back to the 1024 app.
    echo "APP_CONTEXT is not set to 1024, 512, or LCM, defaulting to 1024"
    /usr/bin/python /workspace/app/app.py "$@"
fi
================================================
FILE: PixArt-alpha-ToCa/docker-readme.md
================================================
================================================
FILE: PixArt-alpha-ToCa/environment-pixart.yml
================================================
name: pixart channels: - defaults dependencies: - _libgcc_mutex=0.1=main - _openmp_mutex=5.1=1_gnu - ca-certificates=2024.7.2=h06a4308_0 - ld_impl_linux-64=2.38=h1181459_1 - libffi=3.3=he6710b0_2 - libgcc-ng=11.2.0=h1234567_1 - libgomp=11.2.0=h1234567_1 - libstdcxx-ng=11.2.0=h1234567_1 - ncurses=6.4=h6a678d5_0 - openssl=1.1.1w=h7f8727e_0 - pip=24.2=py39h06a4308_0 - python=3.9.0=hdb3f193_2 - readline=8.2=h5eee18b_0 - setuptools=72.1.0=py39h06a4308_0 - sqlite=3.45.3=h5eee18b_0 - tk=8.6.14=h39e8969_0 - wheel=0.43.0=py39h06a4308_0 - xz=5.4.6=h5eee18b_1 - zlib=1.2.13=h5eee18b_1 - pip: - absl-py==2.1.0 - accelerate==0.34.0 - addict==2.4.0 - aiofiles==23.2.1 - aiohappyeyeballs==2.4.0 - aiohttp==3.10.5 -
aiosignal==1.3.1 - altair==5.4.1 - annotated-types==0.7.0 - anyio==4.4.0 - async-timeout==4.0.3 - attrs==24.2.0 - beautifulsoup4==4.12.3 - bs4==0.0.2 - certifi==2024.8.30 - charset-normalizer==3.3.2 - click==8.1.7 - coloredlogs==15.0.1 - contourpy==1.3.0 - cycler==0.12.1 - datasets==2.21.0 - diffusers==0.31.0.dev0 - dill==0.3.8 - einops==0.8.0 - exceptiongroup==1.2.2 - fastapi==0.112.2 - ffmpy==0.4.0 - filelock==3.15.4 - fonttools==4.53.1 - frozenlist==1.4.1 - fsspec==2024.6.1 - ftfy==6.2.3 - gradio==4.1.1 - gradio-client==0.7.0 - grpcio==1.66.1 - h11==0.14.0 - httpcore==1.0.5 - httpx==0.27.2 - huggingface-hub==0.24.6 - humanfriendly==10.0 - idna==3.8 - importlib-metadata==8.4.0 - importlib-resources==6.4.4 - jinja2==3.1.4 - jsonschema==4.23.0 - jsonschema-specifications==2023.12.1 - kiwisolver==1.4.5 - markdown==3.7 - markdown-it-py==3.0.0 - markupsafe==2.1.5 - matplotlib==3.9.2 - mdurl==0.1.2 - mmcv==1.7.0 - mpmath==1.3.0 - multidict==6.0.5 - multiprocess==0.70.16 - narwhals==1.6.1 - networkx==3.2.1 - numpy==1.26.4 - nvidia-cublas-cu12==12.1.3.1 - nvidia-cuda-cupti-cu12==12.1.105 - nvidia-cuda-nvrtc-cu12==12.1.105 - nvidia-cuda-runtime-cu12==12.1.105 - nvidia-cudnn-cu12==9.1.0.70 - nvidia-cufft-cu12==11.0.2.54 - nvidia-curand-cu12==10.3.2.106 - nvidia-cusolver-cu12==11.4.5.107 - nvidia-cusparse-cu12==12.1.0.106 - nvidia-nccl-cu12==2.20.5 - nvidia-nvjitlink-cu12==12.6.68 - nvidia-nvtx-cu12==12.1.105 - opencv-python==4.10.0.84 - optimum==1.21.4 - orjson==3.10.7 - packaging==24.1 - pandas==2.2.2 - peft==0.6.2 - pillow==10.4.0 - platformdirs==4.2.2 - protobuf==3.20.2 - psutil==6.0.0 - pyarrow==17.0.0 - pydantic==2.8.2 - pydantic-core==2.20.1 - pydub==0.25.1 - pygments==2.18.0 - pyparsing==3.1.4 - python-dateutil==2.9.0.post0 - python-multipart==0.0.9 - pytorch-fid==0.3.0 - pytz==2024.1 - pyyaml==6.0.2 - referencing==0.35.1 - regex==2024.7.24 - requests==2.32.3 - rich==13.8.0 - rpds-py==0.20.0 - safetensors==0.4.4 - scipy==1.13.1 - semantic-version==2.10.0 - 
sentencepiece==0.1.99 - shellingham==1.5.4 - six==1.16.0 - sniffio==1.3.1 - soupsieve==2.6 - starlette==0.38.4 - sympy==1.13.2 - tensorboard==2.17.1 - tensorboard-data-server==0.7.2 - tensorboardx==2.6.2.2 - timm==0.6.12 - tokenizers==0.19.1 - tomli==2.0.1 - tomlkit==0.12.0 - torch==2.4.0 - torchaudio==2.1.1+cu118 - torchvision==0.16.1+cu118 - tqdm==4.66.5 - transformers==4.43.4 - triton==3.0.0 - typer==0.12.5 - typing-extensions==4.12.2 - tzdata==2024.1 - urllib3==2.2.2 - uvicorn==0.30.6 - wcwidth==0.2.13 - websockets==11.0.3 - werkzeug==3.0.4 - xformers==0.0.27.post2 - xxhash==3.5.0 - yapf==0.40.1 - yarl==1.9.7 - zipp==3.20.1 prefix: /root/miniconda3/envs/pixart ================================================ FILE: PixArt-alpha-ToCa/environment.yml ================================================ name: PixArt channels: - pytorch - nvidia dependencies: - python >= 3.8 - pytorch >= 1.13 - torchvision - pytorch-cuda=11.7 - pip: - timm==0.6.12 - diffusers - accelerate - mmcv==1.7.0 - diffusers - accelerate==0.15.0 - tensorboard - transformers==4.26.1 - sentencepiece~=0.1.97 - ftfy~=6.1.1 - beautifulsoup4~=4.11.1 - opencv-python - bs4 - einops - xformers ================================================ FILE: PixArt-alpha-ToCa/notebooks/PixArt_xl2_img512_internal_for_pokemon_sample_training.py ================================================ _base_ = ['/workspace/PixArt-alpha/configs/PixArt_xl2_internal.py'] data_root = '/workspace' image_list_json = ['data_info.json',] data = dict(type='InternalData', root='/workspace/pixart-pokemon', image_list_json=image_list_json, transform='default_train', load_vae_feat=True) image_size = 512 # model setting window_block_indexes = [] window_size=0 use_rel_pos=False model = 'PixArt_XL_2' fp32_attention = True load_from = "/workspace/PixArt-alpha/output/pretrained_models/PixArt-XL-2-512x512.pth" vae_pretrained = "output/pretrained_models/sd-vae-ft-ema" lewei_scale = 1.0 # training setting use_fsdp=False # if use FSDP mode 
num_workers=10 train_batch_size = 38 # 32 num_epochs = 200 # 3 gradient_accumulation_steps = 1 grad_checkpointing = True gradient_clip = 0.01 optimizer = dict(type='AdamW', lr=2e-5, weight_decay=3e-2, eps=1e-10) lr_schedule_args = dict(num_warmup_steps=1000) eval_sampling_steps = 200 log_interval = 20 save_model_steps=100 work_dir = 'output/debug' ================================================ FILE: PixArt-alpha-ToCa/notebooks/convert-checkpoint-to-diffusers.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": 2, "id": "2878bb5d-33a3-4a5b-b15c-c832c700129b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/workspace/PixArt-alpha\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.10/dist-packages/IPython/core/magics/osm.py:417: UserWarning: using dhist requires you to install the `pickleshare` library.\n", " self.shell.db['dhist'] = compress_dhist(dhist)[-100:]\n" ] } ], "source": [ "%cd PixArt-alpha" ] }, { "cell_type": "code", "execution_count": 14, "id": "7dd2d98c-3f8f-40f1-a9e1-bc916774afb3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total number of transformer parameters: 610856096\n" ] } ], "source": [ "!python tools/convert_pixart_alpha_to_diffusers.py \\\n", " --orig_ckpt_path \"/workspace/PixArt-alpha/output/trained_model/checkpoints/epoch_5_step_110.pth\" \\\n", " --dump_path \"/workspace/PixArt-alpha/output/diffusers_trained\" \\\n", " --only_transformer=True \\\n", " --image_size 512 \\\n", " --multi_scale_train=False\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 
4, "nbformat_minor": 5 } ================================================ FILE: PixArt-alpha-ToCa/notebooks/infer.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": 1, "id": "8b2458c4-c461-4ddc-af94-fcd837357da4", "metadata": {}, "outputs": [], "source": [ "from diffusers import PixArtAlphaPipeline\n", "import torch\n", "from diffusers import Transformer2DModel" ] }, { "cell_type": "code", "execution_count": null, "id": "81a5bc0f-682b-4ff9-92e9-43b68b3df8fc", "metadata": {}, "outputs": [], "source": [ "# for comparison\n", "\n", "orig_pipe = pipe = PixArtAlphaPipeline.from_pretrained(\"PixArt-alpha/PixArt-XL-2-512x512\", torch_dtype=torch.float16)\n", "orig_pipe = orig_pipe.to(\"cuda\")" ] }, { "cell_type": "code", "execution_count": null, "id": "efc07821-5479-4ca3-a2c6-114ac484fd1e", "metadata": {}, "outputs": [], "source": [ "transformer = Transformer2DModel.from_pretrained(\"/workspace/PixArt-alpha/output/diffusers_trained/transformer\", torch_dtype=torch.float16)\n", "pipe = PixArtAlphaPipeline.from_pretrained(\"PixArt-alpha/PixArt-XL-2-512x512\", torch_dtype=torch.float16, transformer=transformer)\n", "pipe = pipe.to(\"cuda\")" ] }, { "cell_type": "code", "execution_count": 22, "id": "57da873b-2c13-463b-b558-ee69522ccefc", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d69c7683773c4c25914764800ec1ef4f", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/20 [00:00" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "prompt = \"A green pokemon on white background\"\n", "image = pipe(prompt=prompt).images[0]\n", "image" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", 
"nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: PixArt-alpha-ToCa/notebooks/train.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "id": "c423d2a1-475e-482e-b759-f16456fd6707", "metadata": {}, "source": [ "# Install" ] }, { "cell_type": "code", "execution_count": null, "id": "0440d6a7-78b9-49e9-98a2-9a5ed75e1a2f", "metadata": {}, "outputs": [], "source": [ "!git clone https://github.com/kopyl/PixArt-alpha.git" ] }, { "cell_type": "code", "execution_count": null, "id": "0abadf51-a7e3-4091-bb02-0bdd8d28fb73", "metadata": {}, "outputs": [], "source": [ "%cd PixArt-alpha" ] }, { "cell_type": "code", "execution_count": null, "id": "4df1af24-f439-485d-a946-966dbf16c49b", "metadata": { "scrolled": true }, "outputs": [], "source": [ "!pip install torch==2.0.0+cu117 torchvision==0.15.1+cu117 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu117\n", "!pip install -r requirements.txt\n", "!pip install wandb" ] }, { "cell_type": "markdown", "id": "d44474fd-0b92-48fc-b4cf-142b59d3917c", "metadata": {}, "source": [ "## Download model" ] }, { "cell_type": "code", "execution_count": null, "id": "06b1c1c9-f8b1-4719-8564-2383eac9ff28", "metadata": {}, "outputs": [], "source": [ "!python tools/download.py --model_names \"PixArt-XL-2-512x512.pth\"" ] }, { "cell_type": "markdown", "id": "f298a89c-d2a5-4da7-8304-c1390da0ba58", "metadata": {}, "source": [ "## Make dataset out of Hugginggface dataset" ] }, { "cell_type": "code", "execution_count": null, "id": "e17b8883-0a5c-4fa3-a7d0-e8ee95e42027", "metadata": {}, "outputs": [], "source": [ "import os\n", "from tqdm.notebook import tqdm\n", "from datasets import load_dataset\n", "import json" ] }, { "cell_type": "code", "execution_count": null, "id": "92957b2c-6765-48ee-9296-d6739066d74d", "metadata": {}, "outputs": [], "source": [ "dataset = 
load_dataset(\"lambdalabs/pokemon-blip-captions\")" ] }, { "cell_type": "code", "execution_count": null, "id": "0095cdda-c31a-48ee-a115-076a5fc393c3", "metadata": {}, "outputs": [], "source": [ "root_dir = \"/workspace/pixart-pokemon\"\n", "images_dir = \"images\"\n", "captions_dir = \"captions\"\n", "\n", "images_dir_absolute = os.path.join(root_dir, images_dir)\n", "captions_dir_absolute = os.path.join(root_dir, captions_dir)\n", "\n", "if not os.path.exists(root_dir):\n", " os.makedirs(os.path.join(root_dir, images_dir))\n", "\n", "if not os.path.exists(os.path.join(root_dir, images_dir)):\n", " os.makedirs(os.path.join(root_dir, images_dir))\n", "if not os.path.exists(os.path.join(root_dir, captions_dir)):\n", " os.makedirs(os.path.join(root_dir, captions_dir))\n", "\n", "image_format = \"png\"\n", "json_name = \"partition/data_info.json\"\n", "if not os.path.exists(os.path.join(root_dir, \"partition\")):\n", " os.makedirs(os.path.join(root_dir, \"partition\"))\n", "\n", "absolute_json_name = os.path.join(root_dir, json_name)\n", "data_info = []\n", "\n", "order = 0\n", "for item in tqdm(dataset[\"train\"]): \n", " image = item[\"image\"]\n", " image.save(f\"{images_dir_absolute}/{order}.{image_format}\")\n", " with open(f\"{captions_dir_absolute}/{order}.txt\", \"w\") as text_file:\n", " text_file.write(item[\"text\"])\n", " \n", " width, height = 512, 512\n", " ratio = 1\n", " data_info.append({\n", " \"height\": height,\n", " \"width\": width,\n", " \"ratio\": ratio,\n", " \"path\": f\"images/{order}.{image_format}\",\n", " \"prompt\": item[\"text\"],\n", " })\n", " \n", " order += 1\n", "\n", "with open(absolute_json_name, \"w\") as json_file:\n", " json.dump(data_info, json_file)" ] }, { "cell_type": "markdown", "id": "25be1c03", "metadata": {}, "source": [ "## Extract features" ] }, { "cell_type": "code", "execution_count": null, "id": "9f07a4f5-1873-48bf-86d0-9304942de5d3", "metadata": {}, "outputs": [], "source": [ "!python 
/workspace/PixArt-alpha/tools/extract_features.py \\\n", " --img_size 512 \\\n", " --json_path \"/workspace/pixart-pokemon/partition/data_info.json\" \\\n", " --t5_save_root \"/workspace/pixart-pokemon/caption_feature_wmask\" \\\n", " --vae_save_root \"/workspace/pixart-pokemon/img_vae_features\" \\\n", " --pretrained_models_dir \"/workspace/PixArt-alpha/output/pretrained_models\" \\\n", " --dataset_root \"/workspace/pixart-pokemon\"" ] }, { "cell_type": "code", "execution_count": null, "id": "9fc653d0", "metadata": {}, "outputs": [], "source": [ "!wandb login REPLACE_THIS_WITH_YOUR_AUTH_TOKEN_OF_WANDB" ] }, { "cell_type": "markdown", "id": "2cf1fd1a", "metadata": {}, "source": [ "## Train model" ] }, { "cell_type": "code", "execution_count": null, "id": "ea0e9dab-17bc-45ed-9c81-b670bbb8de47", "metadata": {}, "outputs": [], "source": [ "!python -m torch.distributed.launch \\\n", " train_scripts/train.py \\\n", " /workspace/PixArt-alpha/notebooks/PixArt_xl2_img512_internal_for_pokemon_sample_training.py \\\n", " --work-dir output/trained_model \\\n", " --report_to=\"wandb\" \\\n", " --loss_report_name=\"train_loss\"" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.13" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: PixArt-alpha-ToCa/requirements.txt ================================================ torch==2.1.1 torchaudio==2.1.1 torchvision==0.16.1 mmcv==1.7.0 git+https://github.com/huggingface/diffusers timm==0.6.12 accelerate tensorboard tensorboardX transformers sentencepiece~=0.1.99 ftfy beautifulsoup4 protobuf==3.20.2 gradio==4.1.1 yapf==0.40.1 opencv-python bs4 einops xformers optimum peft==0.6.2 
================================================ FILE: PixArt-alpha-ToCa/scripts/infer_pixart_8_bits.py ================================================
# Low-memory PixArt-alpha inference: encode the prompt with an 8-bit T5 text
# encoder, free it, then run the denoising transformer and the VAE decode as
# separate stages so they do not all have to be resident at the same time.
#
# pip install -U accelerate transformers bitsandbytes
# pip install -U git+https://github.com/huggingface/diffusers

from transformers import T5EncoderModel
from diffusers import PixArtAlphaPipeline
import torch
import gc


def flush():
    """Run the Python garbage collector, then empty the CUDA caching allocator."""
    gc.collect()
    torch.cuda.empty_cache()


def bytes_to_giga_bytes(bytes):
    """Convert a byte count to GiB (three successive divisions by 1024)."""
    return bytes / 1024 / 1024 / 1024


# Stage 1: text encoding only.
# Loading in 8 bits needs `bitsandbytes`.
text_encoder = T5EncoderModel.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS",
    subfolder="text_encoder",
    load_in_8bit=True,
    device_map="auto",
)
# transformer=None: this pipeline instance is built without the denoiser and
# is only used for `encode_prompt`.
pipe = PixArtAlphaPipeline.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS",
    text_encoder=text_encoder,
    transformer=None,
    device_map="auto"
)

with torch.no_grad():
    prompt = "cute cat"
    prompt_embeds, prompt_attention_mask, negative_embeds, negative_prompt_attention_mask = pipe.encode_prompt(prompt)

# Drop the text encoder and the first pipeline before loading the denoiser.
del text_encoder
del pipe
flush()

# Stage 2: denoising. text_encoder=None because the embeddings computed above
# are passed in directly.
pipe = PixArtAlphaPipeline.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS",
    text_encoder=None,
    torch_dtype=torch.float16,
).to("cuda")

# output_type="latent": the pipeline output is kept as latents and decoded
# manually below, after the transformer has been released.
latents = pipe(
    negative_prompt=None,
    prompt_embeds=prompt_embeds,
    negative_prompt_embeds=negative_embeds,
    prompt_attention_mask=prompt_attention_mask,
    negative_prompt_attention_mask=negative_prompt_attention_mask,
    num_images_per_prompt=1,
    output_type="latent",
).images

del pipe.transformer
flush()

# Stage 3: VAE decode and save.
with torch.no_grad():
    image = pipe.vae.decode(latents / pipe.vae.config.scaling_factor, return_dict=False)[0]
image = pipe.image_processor.postprocess(image, output_type="pil")
image[0].save("cat.png")

print(f"Max memory allocated: {bytes_to_giga_bytes(torch.cuda.max_memory_allocated())} GB")

================================================ FILE: PixArt-alpha-ToCa/scripts/inference.py ================================================ import os import sys from pathlib import Path current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent)) import warnings warnings.filterwarnings("ignore") # ignore warning import re import argparse from datetime import datetime from tqdm import tqdm import torch from torchvision.utils import save_image from diffusers.models import AutoencoderKL from diffusion.model.utils import prepare_prompt_ar from diffusion import IDDPM, DPMS, SASolverSampler from tools.download import find_model from diffusion.model.nets import PixArtMS_XL_2, PixArt_XL_2 from diffusion.model.t5 import T5Embedder #from diffusion.data.datasets import get_chunks, ASPECT_RATIO_512_TEST, ASPECT_RATIO_1024_TEST from diffusion.data.datasets import get_chunks, ASPECT_RATIO_256_TEST, ASPECT_RATIO_512_TEST, ASPECT_RATIO_1024_TEST def get_args(): parser = argparse.ArgumentParser() parser.add_argument('--image_size', default=256, type=int) parser.add_argument('--t5_path', default='../autodl-tmp/pretrained_models/t5_ckpts', type=str) # change to your own path parser.add_argument('--tokenizer_path', default='../autodl-tmp/pretrained_models/sd-vae-ft-ema', type=str) # change to your own path parser.add_argument('--txt_file', default='asset/samples.txt', type=str) # change to your own path parser.add_argument('--model_path', default='../autodl-tmp/pretrained_models/PixArt-XL-2-1024x1024.pth', type=str) # change to your own path parser.add_argument('--bs', default=1, type=int) parser.add_argument('--cfg_scale', default=4.5, type=float) parser.add_argument('--sampling_algo', default='dpm-solver', type=str, choices=['iddpm', 'dpm-solver', 'sa-solver']) parser.add_argument('--seed', default=0, type=int) parser.add_argument('--dataset', default='custom', type=str) parser.add_argument('--step', default=-1, type=int) parser.add_argument('--save_name', default='test_sample', type=str) parser.add_argument("--fresh_ratio", type=float, default=0.30) parser.add_argument("--cache_type", type=str, choices=['random', 'attention','similarity','norm', 'compress'], 
default='attention') parser.add_argument("--ratio_scheduler", type=str, default='ToCa', choices=['linear', 'cosine', 'exp', 'constant','linear-mode','layerwise','ToCa']) parser.add_argument("--force_fresh", type=str, choices=['global', 'local'], default='global', help="Force fresh strategy. global: fresh all tokens. local: fresh tokens acheiving fresh step threshold.") parser.add_argument("--fresh_threshold", type=int, default=3) parser.add_argument("--soft_fresh_weight", type=float, default=0.25, help="soft weight for updating the stale tokens by adding extra scores.") return parser.parse_args() def set_env(seed=0): torch.manual_seed(seed) torch.set_grad_enabled(False) for _ in range(30): torch.randn(1, 4, args.image_size, args.image_size) @torch.inference_mode() def visualize(items, bs, sample_steps, cfg_scale): for chunk in tqdm(list(get_chunks(items, bs)), unit='batch'): prompts = [] if bs == 1: prompt_clean, _, hw, ar, custom_hw = prepare_prompt_ar(chunk[0], base_ratios, device=device, show=False) # ar for aspect ratio if args.image_size == 1024: latent_size_h, latent_size_w = int(hw[0, 0] // 8), int(hw[0, 1] // 8) else: hw = torch.tensor([[args.image_size, args.image_size]], dtype=torch.float, device=device).repeat(bs, 1) ar = torch.tensor([[1.]], device=device).repeat(bs, 1) latent_size_h, latent_size_w = latent_size, latent_size prompts.append(prompt_clean.strip()) else: hw = torch.tensor([[args.image_size, args.image_size]], dtype=torch.float, device=device).repeat(bs, 1) ar = torch.tensor([[1.]], device=device).repeat(bs, 1) for prompt in chunk: prompts.append(prepare_prompt_ar(prompt, base_ratios, device=device, show=False)[0].strip()) latent_size_h, latent_size_w = latent_size, latent_size null_y = model.y_embedder.y_embedding[None].repeat(len(prompts), 1, 1)[:, None] with torch.no_grad(): caption_embs, emb_masks = t5.get_text_embeddings(prompts) caption_embs = caption_embs.float()[:, None] print('finish embedding') if args.sampling_algo == 'iddpm': # 
Create sampling noise: n = len(prompts) z = torch.randn(n, 4, latent_size_h, latent_size_w, device=device).repeat(2, 1, 1, 1) model_kwargs = dict(y=torch.cat([caption_embs, null_y]), cfg_scale=cfg_scale, data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks, cache_type = args.cache_type, fresh_ratio = args.fresh_ratio, fresh_threshold = args.fresh_threshold, force_fresh = args.force_fresh, ratio_scheduler = args.ratio_scheduler, soft_fresh_weight = args.soft_fresh_weight) diffusion = IDDPM(str(sample_steps)) # Sample images: samples = diffusion.p_sample_loop( model.forward_with_cfg, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=True, device=device ) samples, _ = samples.chunk(2, dim=0) # Remove null class samples elif args.sampling_algo == 'dpm-solver': # Create sampling noise: n = len(prompts) z = torch.randn(n, 4, latent_size_h, latent_size_w, device=device) model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks, cache_type = args.cache_type, fresh_ratio = args.fresh_ratio, fresh_threshold = args.fresh_threshold, force_fresh = args.force_fresh, ratio_scheduler = args.ratio_scheduler, soft_fresh_weight = args.soft_fresh_weight) dpm_solver = DPMS(model.forward_with_dpmsolver, condition=caption_embs, uncondition=null_y, cfg_scale=cfg_scale, model_kwargs=model_kwargs) samples = dpm_solver.sample( z, steps=sample_steps, order=2, skip_type="time_uniform", method="multistep", model_kwargs = model_kwargs, ) elif args.sampling_algo == 'sa-solver': # Create sampling noise: n = len(prompts) model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks, cache_type = args.cache_type, fresh_ratio = args.fresh_ratio, fresh_threshold = args.fresh_threshold, force_fresh = args.force_fresh, ratio_scheduler = args.ratio_scheduler, soft_fresh_weight = args.soft_fresh_weight) sa_solver = SASolverSampler(model.forward_with_dpmsolver, device=device) samples = sa_solver.sample( S=25, batch_size=n, shape=(4, 
latent_size_h, latent_size_w), eta=1, conditioning=caption_embs, unconditional_conditioning=null_y, unconditional_guidance_scale=cfg_scale, model_kwargs=model_kwargs, )[0] samples = vae.decode(samples / 0.18215).sample torch.cuda.empty_cache() # Save images: os.umask(0o000) # file permission: 666; dir permission: 777 for i, sample in enumerate(samples): save_path = os.path.join(save_root, f"{prompts[i][:100]}.jpg") print("Saving path: ", save_path) save_image(sample, save_path, nrow=1, normalize=True, value_range=(-1, 1)) if __name__ == '__main__': args = get_args() # Setup PyTorch: seed = args.seed set_env(seed) device = "cuda" if torch.cuda.is_available() else "cpu" assert args.sampling_algo in ['iddpm', 'dpm-solver', 'sa-solver'] # only support fixed latent size currently latent_size = args.image_size // 8 lewei_scale = {256: 1, 512: 1, 1024: 2} # trick for positional embedding interpolation #lewei_scale = {512: 1, 1024: 2} # trick for positional embedding interpolation sample_steps_dict = {'iddpm': 100, 'dpm-solver': 20, 'sa-solver': 25} sample_steps = args.step if args.step != -1 else sample_steps_dict[args.sampling_algo] weight_dtype = torch.float16 print(f"Inference with {weight_dtype}") # model setting if args.image_size in [256, 512]: model = PixArt_XL_2(input_size=latent_size, lewei_scale=lewei_scale[args.image_size]).to(device) else: model = PixArtMS_XL_2(input_size=latent_size, lewei_scale=lewei_scale[args.image_size]).to(device) print(f"Generating sample from ckpt: {args.model_path}") state_dict = find_model(args.model_path) del state_dict['state_dict']['pos_embed'] missing, unexpected = model.load_state_dict(state_dict['state_dict'], strict=False) print('Missing keys: ', missing) print('Unexpected keys', unexpected) model.eval() model.to(weight_dtype) base_ratios = eval(f'ASPECT_RATIO_{args.image_size}_TEST') vae = AutoencoderKL.from_pretrained(args.tokenizer_path).to(device) t5 = T5Embedder(device="cuda", local_cache=True, cache_dir=args.t5_path, 
torch_dtype=torch.float) work_dir = os.path.join(*args.model_path.split('/')[:-2]) work_dir = f'/{work_dir}' if args.model_path[0] == '/' else work_dir # data setting with open(args.txt_file, 'r') as f: items = [item.strip() for item in f.readlines()] # img save setting try: epoch_name = re.search(r'.*epoch_(\d+).*.pth', args.model_path).group(1) step_name = re.search(r'.*step_(\d+).*.pth', args.model_path).group(1) except Exception: epoch_name = 'unknown' step_name = 'unknown' img_save_dir = os.path.join(work_dir, 'vis') os.umask(0o000) # file permission: 666; dir permission: 777 os.makedirs(img_save_dir, exist_ok=True) save_root = os.path.join(img_save_dir, f"{datetime.now().date()}_{args.dataset}_epoch{epoch_name}_step{step_name}_scale{args.cfg_scale}_step{sample_steps}_size{args.image_size}_bs{args.bs}_samp{args.sampling_algo}_seed{seed}") os.makedirs(save_root, exist_ok=True) visualize(items, args.bs, sample_steps, args.cfg_scale) ================================================ FILE: PixArt-alpha-ToCa/scripts/inference_ddp.py ================================================ import os import sys from pathlib import Path current_file_path = Path(__file__).resolve() sys.path.insert(0, str(current_file_path.parent.parent)) import warnings warnings.filterwarnings("ignore") # ignore warning import re import argparse from datetime import datetime from tqdm import tqdm import torch from torchvision.utils import save_image from diffusers.models import AutoencoderKL import torch.distributed as dist from torch.utils.data import DataLoader, DistributedSampler from diffusion.model.utils import prepare_prompt_ar from diffusion import IDDPM, DPMS, SASolverSampler from tools.download import find_model from diffusion.model.nets import PixArtMS_XL_2, PixArt_XL_2 from diffusion.model.t5 import T5Embedder from diffusion.data.datasets import get_chunks, ASPECT_RATIO_256_TEST, ASPECT_RATIO_512_TEST, ASPECT_RATIO_1024_TEST def get_args(): parser = argparse.ArgumentParser() 
parser.add_argument('--image_size', default=256, type=int) parser.add_argument('--t5_path', default='../autodl-tmp/pretrained_models/t5_ckpts', type=str) # change to your t5 path parser.add_argument('--tokenizer_path', default='../autodl-tmp/pretrained_models/sd-vae-ft-ema', type=str) # change to your tokenizer path parser.add_argument('--txt_file', default='asset/samples.txt', type=str) # change to your txt prompt file parser.add_argument('--model_path', default='../autodl-tmp/pretrained_models/PixArt-XL-2-1024x1024.pth', type=str) parser.add_argument('--bs', default=1, type=int) parser.add_argument('--cfg_scale', default=4.5, type=float) parser.add_argument('--sampling_algo', default='dpm-solver', type=str, choices=['iddpm', 'dpm-solver', 'sa-solver']) parser.add_argument('--seed', default=0, type=int) parser.add_argument('--dataset', default='custom', type=str) parser.add_argument('--step', default=-1, type=int) parser.add_argument('--save_name', default='test_sample', type=str) parser.add_argument("--fresh_ratio", type=float, default=0.30) parser.add_argument("--cache_type", type=str, choices=['random', 'attention', 'similarity', 'norm', 'compress'], default='attention') parser.add_argument("--ratio_scheduler", type=str, default='ToCa', choices=['linear', 'cosine', 'exp', 'constant', 'linear-mode', 'layerwise', 'ToCa']) parser.add_argument("--force_fresh", type=str, choices=['global', 'local'], default='global') parser.add_argument("--fresh_threshold", type=int, default=3) parser.add_argument("--soft_fresh_weight", type=float, default=0.25) return parser.parse_args() def setup_ddp(): dist.init_process_group(backend='nccl') local_rank = dist.get_rank() torch.cuda.set_device(local_rank) return local_rank def cleanup_ddp(): dist.destroy_process_group() def set_env(seed=0, local_rank=None): global_seed = seed + local_rank torch.manual_seed(global_seed) torch.cuda.manual_seed(global_seed) #torch.cuda.manual_seed_all(global_seed) torch.set_grad_enabled(False) return 
torch.device(f'cuda:{local_rank}') @torch.inference_mode() def visualize(items, bs, sample_steps, cfg_scale, device): sampler = DistributedSampler(items, shuffle=False, num_replicas=dist.get_world_size(), rank=dist.get_rank()) data_loader = DataLoader(items, batch_size=bs, sampler=sampler, drop_last=False) pbar = tqdm(data_loader, unit='batch') if dist.get_rank() == 0 else data_loader for chunk in pbar: prompts = [] if bs == 1: prompt_clean, _, hw, ar, custom_hw = prepare_prompt_ar(chunk[0], base_ratios, device=device, show=False) # ar for aspect ratio if args.image_size == 1024: latent_size_h, latent_size_w = int(hw[0, 0] // 8), int(hw[0, 1] // 8) else: hw = torch.tensor([[args.image_size, args.image_size]], dtype=torch.float, device=device).repeat(bs, 1) ar = torch.tensor([[1.]], device=device).repeat(bs, 1) latent_size_h, latent_size_w = latent_size, latent_size prompts.append(prompt_clean.strip()) else: hw = torch.tensor([[args.image_size, args.image_size]], dtype=torch.float, device=device).repeat(bs, 1) ar = torch.tensor([[1.]], device=device).repeat(bs, 1) for prompt in chunk: prompts.append(prepare_prompt_ar(prompt, base_ratios, device=device, show=False)[0].strip()) latent_size_h, latent_size_w = latent_size, latent_size null_y = model.module.y_embedder.y_embedding[None].repeat(len(prompts), 1, 1)[:, None] with torch.no_grad(): caption_embs, emb_masks = t5.get_text_embeddings(prompts) caption_embs = caption_embs.float()[:, None] #print('finish embedding') if args.sampling_algo == 'iddpm': # we have not tested this part, there may bugsss. 
n = len(prompts) z = torch.randn(n, 4, latent_size_h, latent_size_w, device=device).repeat(2, 1, 1, 1) model_kwargs = dict(y=torch.cat([caption_embs, null_y]), cfg_scale=cfg_scale, data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks, cache_type=args.cache_type, fresh_ratio=args.fresh_ratio, fresh_threshold=args.fresh_threshold, force_fresh=args.force_fresh, ratio_scheduler=args.ratio_scheduler, soft_fresh_weight=args.soft_fresh_weight) diffusion = IDDPM(str(sample_steps)) samples = diffusion.p_sample_loop( model.module.forward_with_cfg, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=True, device=device ) samples, _ = samples.chunk(2, dim=0) elif args.sampling_algo == 'dpm-solver': # Main srategy, we have tested and make sure it works. n = len(prompts) z = torch.randn(n, 4, latent_size_h, latent_size_w, device=device) model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks, cache_type=args.cache_type, fresh_ratio=args.fresh_ratio, fresh_threshold=args.fresh_threshold, force_fresh=args.force_fresh, ratio_scheduler=args.ratio_scheduler, soft_fresh_weight=args.soft_fresh_weight) dpm_solver = DPMS(model.module.forward_with_dpmsolver, condition=caption_embs, uncondition=null_y, cfg_scale=cfg_scale, model_kwargs=model_kwargs) samples = dpm_solver.sample( z, steps=sample_steps, order=2, skip_type="time_uniform", method="multistep", model_kwargs=model_kwargs, rank = dist.get_rank() ) # not supported now elif args.sampling_algo == 'sa-solver': n = len(prompts) model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks, cache_type=args.cache_type, fresh_ratio=args.fresh_ratio, fresh_threshold=args.fresh_threshold, force_fresh=args.force_fresh, ratio_scheduler=args.ratio_scheduler, soft_fresh_weight=args.soft_fresh_weight) sa_solver = SASolverSampler(model.module.forward_with_dpmsolver, device=device) samples = sa_solver.sample( S=25, batch_size=n, shape=(4, latent_size_h, latent_size_w), eta=1, 
conditioning=caption_embs, unconditional_conditioning=null_y, unconditional_guidance_scale=cfg_scale, model_kwargs=model_kwargs, )[0] samples = vae.decode(samples / 0.18215).sample torch.cuda.empty_cache() dist.barrier() #if dist.get_rank() == 0: os.umask(0o000) for i, sample in enumerate(samples): save_path = os.path.join(save_root, f"{prompts[i][:100]}.jpg") #print("Saving path: ", save_path) save_image(sample, save_path, nrow=1, normalize=True, value_range=(-1, 1)) if __name__ == '__main__': args = get_args() # Setup DDP local_rank = setup_ddp() # Setup environment device = set_env(args.seed, local_rank) # only support fixed latent size currently latent_size = args.image_size // 8 lewei_scale = {256: 1, 512: 1, 1024: 2} sample_steps_dict = {'iddpm': 100, 'dpm-solver': 20, 'sa-solver': 25} sample_steps = args.step if args.step != -1 else sample_steps_dict[args.sampling_algo] weight_dtype = torch.float16 print(f"Inference with {weight_dtype}") # model setting if args.image_size in [256, 512]: model = PixArt_XL_2(input_size=latent_size, lewei_scale=lewei_scale[args.image_size]).to(device) else: model = PixArtMS_XL_2(input_size=latent_size, lewei_scale=lewei_scale[args.image_size]).to(device) print(f"Generating sample from ckpt: {args.model_path}") state_dict = find_model(args.model_path) del state_dict['state_dict']['pos_embed'] missing, unexpected = model.load_state_dict(state_dict['state_dict'], strict=False) print('Missing keys: ', missing) print('Unexpected keys', unexpected) model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank]) model.module.eval() model.module.to(weight_dtype) base_ratios = eval(f'ASPECT_RATIO_{args.image_size}_TEST') vae = AutoencoderKL.from_pretrained(args.tokenizer_path).to(device) t5 = T5Embedder(device="cuda", local_cache=True, cache_dir=args.t5_path, torch_dtype=torch.float) work_dir = os.path.join(*args.model_path.split('/')[:-2]) work_dir = f'/{work_dir}' if args.model_path[0] == '/' else work_dir with 
open(args.txt_file, 'r') as f: items = [item.strip() for item in f.readlines()] epoch_name = re.search(r'.*epoch_(\d+).*.pth', args.model_path).group(1) if re.search(r'.*epoch_(\d+).*.pth', args.model_path) else 'unknown' step_name = re.search(r'.*step_(\d+).*.pth', args.model_path).group(1) if re.search(r'.*step_(\d+).*.pth', args.model_path) else 'unknown' img_save_dir = os.path.join(work_dir, 'vis') os.umask(0o000) os.makedirs(img_save_dir, exist_ok=True) save_root = os.path.join(img_save_dir, f"{datetime.now().date()}_{args.dataset}_epoch{epoch_name}_step{step_name}_scale{args.cfg_scale}_step{sample_steps}_size{args.image_size}_bs{args.bs}_samp{args.sampling_algo}_seed{args.seed}") os.makedirs(save_root, exist_ok=True) visualize(items, args.bs, sample_steps, args.cfg_scale, device) cleanup_ddp() ================================================ FILE: PixArt-alpha-ToCa/scripts/inference_lcm.py ================================================ import os import sys from pathlib import Path current_file_path = Path(__file__).resolve() sys.path.insert(0, str(current_file_path.parent.parent)) import warnings warnings.filterwarnings("ignore") # ignore warning import re import argparse from datetime import datetime from tqdm import tqdm import torch from torchvision.utils import save_image from diffusers.models import AutoencoderKL from diffusion.model.utils import prepare_prompt_ar from tools.download import find_model from diffusion.model.nets import PixArtMS_XL_2, PixArt_XL_2 from diffusion.model.t5 import T5Embedder from diffusion.data.datasets import get_chunks from diffusion.lcm_scheduler import LCMScheduler from diffusion.data.datasets import ASPECT_RATIO_512_TEST, ASPECT_RATIO_1024_TEST def get_args(): parser = argparse.ArgumentParser() parser.add_argument('--image_size', default=1024, type=int) parser.add_argument('--t5_path', default='output/pretrained_models/t5_ckpts', type=str) parser.add_argument('--tokenizer_path', 
default='output/pretrained_models/sd-vae-ft-ema', type=str) parser.add_argument('--txt_file', default='asset/samples.txt', type=str) parser.add_argument('--model_path', default='output/pretrained_models/PixArt-XL-2-1024x1024.pth', type=str) parser.add_argument('--bs', default=1, type=int) parser.add_argument('--cfg_scale', default=4.5, type=float) parser.add_argument('--sample_steps', default=4, type=int) parser.add_argument('--seed', default=0, type=int) parser.add_argument('--dataset', default='custom', type=str) parser.add_argument('--step', default=-1, type=int) parser.add_argument('--save_name', default='test_sample', type=str) return parser.parse_args() def set_env(seed=0): torch.manual_seed(seed) torch.set_grad_enabled(False) for _ in range(30): torch.randn(1, 4, args.image_size, args.image_size) @torch.inference_mode() def visualize(items, bs, sample_steps, cfg_scale): # 4. Prepare timesteps scheduler.set_timesteps(sample_steps, 50) timesteps = scheduler.timesteps for chunk in tqdm(list(get_chunks(items, bs)), unit='batch'): prompts = [] if bs == 1: prompt_clean, _, hw, ar, custom_hw = prepare_prompt_ar(chunk[0], base_ratios, device=device, show=False) # ar for aspect ratio if args.image_size == 1024: latent_size_h, latent_size_w = int(hw[0, 0] // 8), int(hw[0, 1] // 8) else: hw = torch.tensor([[args.image_size, args.image_size]], dtype=torch.float, device=device).repeat(bs, 1) ar = torch.tensor([[1.]], device=device).repeat(bs, 1) latent_size_h, latent_size_w = latent_size, latent_size prompts.append(prompt_clean.strip()) else: hw = torch.tensor([[args.image_size, args.image_size]], dtype=torch.float, device=device).repeat(bs, 1) ar = torch.tensor([[1.]], device=device).repeat(bs, 1) prompts.append(prepare_prompt_ar(prompt, base_ratios, device=device, show=False)[0].strip()) latent_size_h, latent_size_w = latent_size, latent_size with torch.no_grad(): caption_embs, emb_masks = t5.get_text_embeddings(prompts) caption_embs = caption_embs.float()[:, None] 
print('finish embedding') # Create sampling noise: n = len(prompts) latents = torch.randn(n, 4, latent_size_h, latent_size_w, device=device) model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks) # 7. LCM MultiStep Sampling Loop: for i, t in tqdm(list(enumerate(timesteps))): ts = torch.full((bs,), t, device=device, dtype=torch.long) # model prediction (v-prediction, eps, x) model_pred = model(latents, ts, caption_embs, **model_kwargs)[:, :4] # compute the previous noisy sample x_t -> x_t-1 latents, denoised = scheduler.step(model_pred, i, t, latents, return_dict=False) samples = vae.decode(denoised / 0.18215).sample torch.cuda.empty_cache() # Save images: os.umask(0o000) # file permission: 666; dir permission: 777 for i, sample in enumerate(samples): save_path = os.path.join(save_root, f"{prompts[i][:100]}.jpg") print("Saving path: ", save_path) save_image(sample, save_path, nrow=1, normalize=True, value_range=(-1, 1)) if __name__ == '__main__': args = get_args() # Setup PyTorch: seed = args.seed set_env(seed) device = "cuda" if torch.cuda.is_available() else "cpu" # only support fixed latent size currently latent_size = args.image_size // 8 lewei_scale = {512: 1, 1024: 2} # trick for positional embedding interpolation sample_steps = args.sample_steps # Initalize Scheduler: scheduler = LCMScheduler(beta_start=0.0001, beta_end=0.02, beta_schedule="linear", prediction_type="epsilon") # model setting if args.image_size == 512: model = PixArt_XL_2(input_size=latent_size, lewei_scale=lewei_scale[args.image_size]).to(device) else: model = PixArtMS_XL_2(input_size=latent_size, lewei_scale=lewei_scale[args.image_size]).to(device) print(f"Generating sample from ckpt: {args.model_path}") state_dict = find_model(args.model_path) del state_dict['state_dict']['pos_embed'] missing, unexpected = model.load_state_dict(state_dict['state_dict'], strict=False) print('Missing keys: ', missing) print('Unexpected keys', unexpected) model.eval() base_ratios = 
eval(f'ASPECT_RATIO_{args.image_size}_TEST') vae = AutoencoderKL.from_pretrained(args.tokenizer_path).to(device) t5 = T5Embedder(device="cuda", local_cache=True, cache_dir=args.t5_path, torch_dtype=torch.float) work_dir = os.path.join(*args.model_path.split('/')[:-2]) work_dir = f'/{work_dir}' if args.model_path[0] == '/' else work_dir # data setting with open(args.txt_file, 'r') as f: items = [item.strip() for item in f.readlines()] # img save setting try: epoch_name = re.search(r'.*epoch_(\d+).*.pth', args.model_path).group(1) step_name = re.search(r'.*step_(\d+).*.pth', args.model_path).group(1) except Exception: epoch_name = 'unknown' step_name = 'unknown' img_save_dir = os.path.join(work_dir, 'vis') os.umask(0o000) # file permission: 666; dir permission: 777 os.makedirs(img_save_dir, exist_ok=True) save_root = os.path.join(img_save_dir, f"{datetime.now().date()}_{args.dataset}_epoch{epoch_name}_step{step_name}_scale{args.cfg_scale}_step{sample_steps}_size{args.image_size}_bs{args.bs}_sampLCM_seed{seed}") os.makedirs(save_root, exist_ok=True) visualize(items, args.bs, sample_steps, args.cfg_scale) ================================================ FILE: PixArt-alpha-ToCa/scripts/interface.py ================================================ import argparse import sys from pathlib import Path current_file_path = Path(__file__).resolve() sys.path.insert(0, str(current_file_path.parent.parent)) import os import random import torch from torchvision.utils import save_image from diffusion import IDDPM, DPMS, SASolverSampler from diffusers.models import AutoencoderKL from tools.download import find_model from datetime import datetime from typing import List, Union import gradio as gr import numpy as np from gradio.components import Textbox, Image from diffusion.model.utils import prepare_prompt_ar, resize_and_crop_tensor from diffusion.model.nets import PixArtMS_XL_2, PixArt_XL_2 from diffusion.model.t5 import T5Embedder from torchvision.utils import _log_api_usage_once, 
make_grid from diffusion.data.datasets import ASPECT_RATIO_512_TEST, ASPECT_RATIO_1024_TEST from asset.examples import examples MAX_SEED = np.iinfo(np.int32).max def get_args(): parser = argparse.ArgumentParser() parser.add_argument('--image_size', default=1024, type=int) parser.add_argument('--model_path', default='output/pretrained_models/PixArt-XL-2-1024-MS.pth', type=str) parser.add_argument('--t5_path', default='output/pretrained_models', type=str) parser.add_argument('--tokenizer_path', default='output/pretrained_models/sd-vae-ft-ema', type=str) parser.add_argument('--llm_model', default='t5', type=str) parser.add_argument('--port', default=7788, type=int) return parser.parse_args() @torch.no_grad() def ndarr_image(tensor: Union[torch.Tensor, List[torch.Tensor]], **kwargs,) -> None: if not torch.jit.is_scripting() and not torch.jit.is_tracing(): _log_api_usage_once(save_image) grid = make_grid(tensor, **kwargs) # Add 0.5 after unnormalizing to [0, 255] to round to the nearest integer return grid.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy() def set_env(seed=0): torch.manual_seed(seed) torch.set_grad_enabled(False) for _ in range(30): torch.randn(1, 4, args.image_size, args.image_size) def randomize_seed_fn(seed: int, randomize_seed: bool) -> int: if randomize_seed: seed = random.randint(0, MAX_SEED) return seed @torch.inference_mode() def generate_img(prompt, sampler, sample_steps, scale, seed=0, randomize_seed=False): seed = int(randomize_seed_fn(seed, randomize_seed)) set_env(seed) os.makedirs(f'output/demo/online_demo_prompts/', exist_ok=True) save_promt_path = f'output/demo/online_demo_prompts/tested_prompts{datetime.now().date()}.txt' with open(save_promt_path, 'a') as f: f.write(prompt + '\n') print(prompt) prompt_clean, prompt_show, hw, ar, custom_hw = prepare_prompt_ar(prompt, base_ratios, device=device) # ar for aspect ratio prompt_clean = prompt_clean.strip() if isinstance(prompt_clean, str): prompts = 
[prompt_clean] caption_embs, emb_masks = llm_embed_model.get_text_embeddings(prompts) caption_embs = caption_embs[:, None] null_y = model.y_embedder.y_embedding[None].repeat(len(prompts), 1, 1)[:, None] latent_size_h, latent_size_w = int(hw[0, 0]//8), int(hw[0, 1]//8) # Sample images: if sampler == 'iddpm': # Create sampling noise: n = len(prompts) z = torch.randn(n, 4, latent_size_h, latent_size_w, device=device).repeat(2, 1, 1, 1) model_kwargs = dict(y=torch.cat([caption_embs, null_y]), cfg_scale=scale, data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks) diffusion = IDDPM(str(sample_steps)) samples = diffusion.p_sample_loop( model.forward_with_cfg, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=True, device=device ) samples, _ = samples.chunk(2, dim=0) # Remove null class samples elif sampler == 'dpm-solver': # Create sampling noise: n = len(prompts) z = torch.randn(n, 4, latent_size_h, latent_size_w, device=device) model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks) dpm_solver = DPMS(model.forward_with_dpmsolver, condition=caption_embs, uncondition=null_y, cfg_scale=scale, model_kwargs=model_kwargs) samples = dpm_solver.sample( z, steps=sample_steps, order=2, skip_type="time_uniform", method="multistep", ) elif sampler == 'sa-solver': # Create sampling noise: n = len(prompts) model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks) sa_solver = SASolverSampler(model.forward_with_dpmsolver, device=device) samples = sa_solver.sample( S=sample_steps, batch_size=n, shape=(4, latent_size_h, latent_size_w), eta=1, conditioning=caption_embs, unconditional_conditioning=null_y, unconditional_guidance_scale=scale, model_kwargs=model_kwargs, )[0] samples = vae.decode(samples / 0.18215).sample torch.cuda.empty_cache() samples = resize_and_crop_tensor(samples, custom_hw[0,1], custom_hw[0,0]) display_model_info = f'Model path: {args.model_path},\nBase image size: {args.image_size}, 
if __name__ == '__main__':
    # Entry point of the text-to-image Gradio demo.
    # The names assigned here (args, device, model, vae, llm_embed_model,
    # base_ratios) stay module-level on purpose: generate_img reads them as
    # globals, so they must not be moved into a function.
    from diffusion.utils.logger import get_root_logger

    args = get_args()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    logger = get_root_logger()

    # Explicit check instead of `assert` (asserts vanish under `python -O`);
    # the message now lists only the sizes that are actually accepted.
    if args.image_size not in (512, 1024):
        raise ValueError("We only provide pre-trained models for 512x512 and 1024x1024 resolutions.")
    lewei_scale = {512: 1, 1024: 2}
    latent_size = args.image_size // 8
    t5_device = {512: 'cuda', 1024: 'cuda'}
    if args.image_size == 512:
        model = PixArt_XL_2(input_size=latent_size, lewei_scale=lewei_scale[args.image_size]).to(device)
    else:
        model = PixArtMS_XL_2(input_size=latent_size, lewei_scale=lewei_scale[args.image_size]).to(device)

    state_dict = find_model(args.model_path)
    # pos_embed is dropped before loading; tolerate checkpoints that do not
    # carry the key instead of dying with KeyError.
    state_dict['state_dict'].pop('pos_embed', None)
    missing, unexpected = model.load_state_dict(state_dict['state_dict'], strict=False)
    logger.warning(f'Missing keys: {missing}')
    logger.warning(f'Unexpected keys: {unexpected}')
    model.eval()
    # Plain namespace lookup of the aspect-ratio table; no need for eval().
    base_ratios = globals()[f'ASPECT_RATIO_{args.image_size}_TEST']

    vae = AutoencoderKL.from_pretrained(args.tokenizer_path).to(device)

    if args.llm_model == 't5':
        llm_embed_model = T5Embedder(device=t5_device[args.image_size], local_cache=True,
                                     cache_dir=args.t5_path, torch_dtype=torch.float)
    else:
        print('We support t5 only, please initialize the llm again')
        # Unsupported encoder is an error: exit with a non-zero status.
        sys.exit(1)

    title = f"""
        '' Unleashing your Creativity \n ''
        {args.image_size}px
    """
    DESCRIPTION = """# PixArt-Alpha 1024px
    ## If PixArt-Alpha is helpful, please help to ⭐ the [Github Repo](https://github.com/PixArt-alpha/PixArt) and recommend it to your friends 😊'
    #### [PixArt-Alpha 1024px](https://github.com/PixArt-alpha/PixArt-alpha) is a transformer-based text-to-image diffusion system trained on text embeddings from T5. This demo uses the [PixArt-alpha/PixArt-XL-2-1024-MS](https://huggingface.co/PixArt-alpha/PixArt-XL-2-1024-MS) checkpoint.
    #### English prompts ONLY; 提示词仅限英文
    Don't want to queue? Try [OpenXLab](https://openxlab.org.cn/apps/detail/PixArt-alpha/PixArt-alpha) or [Google Colab Demo](https://colab.research.google.com/drive/1jZ5UZXk7tcpTfVwnX33dDuefNMcnW9ME?usp=sharing).
    """
    if not torch.cuda.is_available():
        # Single-line literal: a double-quoted string cannot span physical lines.
        DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"

    demo = gr.Interface(
        fn=generate_img,
        inputs=[
            Textbox(
                label="Note: If you want to specify a aspect ratio or determine a customized height and width, "
                      "use --ar h:w (or --aspect_ratio h:w) or --hw h:w. If no aspect ratio or hw is given, all setting will be default.",
                placeholder="Please enter your prompt. \n",
            ),
            gr.Radio(
                choices=["iddpm", "dpm-solver"],
                label="Sampler",
                interactive=True,
                value='dpm-solver',
            ),
            gr.Slider(
                label='Sample Steps',
                minimum=1,
                maximum=100,
                value=14,
                step=1,
            ),
            gr.Slider(
                label='Guidance Scale',
                minimum=0.1,
                maximum=30.0,
                value=4.5,
                step=0.1,
            ),
            gr.Slider(
                label="Seed",
                minimum=0,
                maximum=MAX_SEED,
                step=1,
                value=0,
            ),
            gr.Checkbox(label="Randomize seed", value=True),
        ],
        # Order matches the 4-tuple returned by generate_img:
        # image, cleaned prompt, model info, seed.
        outputs=[
            Image(type="numpy", label="Img"),
            Textbox(label="clean prompt"),
            Textbox(label="model info"),
            gr.Slider(label='seed'),
        ],
        title=title,
        description=DESCRIPTION,
        examples=examples,
    )
    demo.launch(server_name="0.0.0.0", server_port=args.port, debug=True)
# Latent scaling constant: generate_img divides by it before vae.decode and
# multiplies the encoded HED condition by it.
vae_scale = 0.18215

DESCRIPTION = """![Logo](https://raw.githubusercontent.com/PixArt-alpha/PixArt-alpha.github.io/master/static/images/logo.png)
# PixArt-Alpha 1024px + ControlNet. This is the demo for ControlNet combined with 1024px PixArt-Alpha.
# The input reference image need to be around 1024x1024. And descriptive prompts also need to be provided.
# You may change the random seed, if you didn't get satisfied results.
"""


def get_args():
    """Parse the command-line options of the ControlNet demo.

    Returns:
        argparse.Namespace with the positional ``config`` path plus the
        sampling, model-path, port and condition-strength options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("config", type=str, help="config")
    parser.add_argument('--num_sampling_steps', default=14, type=int)
    # Guidance scale is fractional (default 4.5); with type=int a value such
    # as `--cfg_scale 5.5` was rejected by argparse.
    parser.add_argument('--cfg_scale', default=4.5, type=float)
    parser.add_argument('--image_size', default=1024, type=int)
    parser.add_argument('--model_path', type=str)
    parser.add_argument('--tokenizer_path', default='output/pretrained_models/sd-vae-ft-ema', type=str)
    parser.add_argument('--llm_model', default='t5', type=str)
    parser.add_argument('--sampling_algo', default='dpm-solver', type=str,
                        choices=['iddpm', 'dpm-solver', 'sa-solver'])
    parser.add_argument('--port', default=7788, type=int)
    parser.add_argument('--condition_strength', default=1, type=float)
    return parser.parse_args()


@torch.no_grad()
def ndarr_image(tensor: Union[torch.Tensor, List[torch.Tensor]], **kwargs) -> np.ndarray:
    """Tile ``tensor`` into one grid image and return it as a uint8 array.

    Args:
        tensor: image tensor, or list of image tensors, passed to ``make_grid``.
        **kwargs: forwarded unchanged to ``make_grid``.

    Returns:
        The grid as a channels-last ``uint8`` NumPy array on the CPU.
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        # NOTE(review): usage is logged under torchvision's `save_image`,
        # presumably because this helper was derived from it.
        _log_api_usage_once(save_image)
    grid = make_grid(tensor, **kwargs)
    # Scale to [0, 255], round via +0.5, clamp, then move channels last.
    ndarr = grid.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy()
    return ndarr


def set_env():
    """Seed torch with 0 and switch autograd off for the whole process."""
    torch.manual_seed(0)
    torch.set_grad_enabled(False)


@torch.inference_mode()
def generate_img(prompt, given_image, seed):
    """Generate one image from ``prompt``, optionally guided by a reference image.

    Reads the module-level globals set up in ``__main__`` (``args``, ``model``,
    ``vae``, ``hed``, ``llm_embed_model``, ``base_ratios``, ``device``,
    ``save_prompt_path``).

    Args:
        prompt: raw user prompt; it is appended to the daily prompt log file.
        given_image: PIL image used as the condition, or ``None``.
        seed: value handed to ``torch.manual_seed``.

    Returns:
        Tuple ``(image, c_vis, prompt_show)``: the generated image as a uint8
        array, the decoded HED condition (or ``given_image`` unchanged when no
        condition was built), and the prompt text to display.
    """
    torch.manual_seed(seed)
    torch.cuda.empty_cache()
    # Honour --condition_strength (default 1) instead of a hard-coded 1.0.
    strength = args.condition_strength
    c_vis = given_image

    prompt_log = os.path.join(save_prompt_path, f'tested_prompts{datetime.now().date()}.txt')
    with open(prompt_log, 'a', encoding='utf-8') as f:
        f.write(prompt + '\n')

    # ar = aspect ratio
    prompt_clean, prompt_show, hw, ar, custom_hw = prepare_prompt_ar(prompt, base_ratios, device=device)
    # .strip() guarantees a str, so the prompt list is always defined.
    prompts = [prompt_clean.strip()]

    caption_embs, emb_masks = llm_embed_model.get_text_embeddings(prompts)
    caption_embs = caption_embs[:, None]
    null_y = model.y_embedder.y_embedding[None].repeat(len(prompts), 1, 1)[:, None]

    # Condition processing: the reference image overrides the prompt-derived
    # aspect ratio and output size.
    if given_image is not None:
        # PIL size is (width, height), so index 1 / index 0 is height / width.
        ar = torch.tensor([given_image.size[1] / given_image.size[0]], device=device)[None]
        custom_hw = torch.tensor([given_image.size[1], given_image.size[0]], device=device)[None]
        closest_hw = base_ratios[min(base_ratios.keys(), key=lambda ratio: abs(float(ratio) - ar))]
        hw = torch.tensor(closest_hw, device=device)[None]
        condition_transform = T.Compose([
            T.Lambda(lambda img: img.convert('RGB')),
            T.Resize(int(min(closest_hw))),
            T.CenterCrop([int(closest_hw[0]), int(closest_hw[1])]),
            T.ToTensor(),
        ])
        given_image = condition_transform(given_image).unsqueeze(0).to(device)
        hed_edge = hed(given_image) * strength
        hed_edge = TF.normalize(hed_edge, [.5], [.5])
        hed_edge = hed_edge.repeat(1, 3, 1, 1)
        posterior = vae.encode(hed_edge).latent_dist
        condition = posterior.sample()
        c = condition * vae_scale
        c_vis = vae.decode(condition)['sample']
        c_vis = torch.clamp(127.5 * c_vis + 128.0, 0, 255).permute(0, 2, 3, 1).to("cpu", dtype=torch.uint8).numpy()[0]
    else:
        c = None

    latent_size_h, latent_size_w = int(hw[0, 0] // 8), int(hw[0, 1] // 8)
    n = len(prompts)
    data_info = {'img_hw': hw, 'aspect_ratio': ar}

    # Sample images:
    if args.sampling_algo == 'iddpm':
        # Noise must match the selected aspect-ratio bucket, like the other
        # samplers; the global square `latent_size` is wrong for non-square
        # buckets.
        z = torch.randn(n, 4, latent_size_h, latent_size_w, device=device).repeat(2, 1, 1, 1)
        model_kwargs = dict(y=torch.cat([caption_embs, null_y]), cfg_scale=args.cfg_scale,
                            data_info=data_info, mask=emb_masks, c=c)
        diffusion = IDDPM(str(args.num_sampling_steps))
        samples = diffusion.p_sample_loop(
            model.forward_with_cfg, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs,
            progress=True, device=device
        )
        samples, _ = samples.chunk(2, dim=0)  # Remove null class samples
    elif args.sampling_algo == 'dpm-solver':
        z = torch.randn(n, 4, latent_size_h, latent_size_w, device=device)
        model_kwargs = dict(data_info=data_info, mask=emb_masks, c=c)
        dpm_solver = DPMS(model.forward_with_dpmsolver,
                          condition=caption_embs,
                          uncondition=null_y,
                          cfg_scale=args.cfg_scale,
                          model_kwargs=model_kwargs)
        samples = dpm_solver.sample(
            z,
            steps=args.num_sampling_steps,
            order=2,
            skip_type="time_uniform",
            method="multistep",
        )
    elif args.sampling_algo == 'sa-solver':
        model_kwargs = dict(data_info=data_info, mask=emb_masks, c=c)
        sas_solver = SASolverSampler(model.forward_with_dpmsolver, device=device)
        samples = sas_solver.sample(
            S=args.num_sampling_steps,
            batch_size=n,
            shape=(4, latent_size_h, latent_size_w),
            eta=1,
            conditioning=caption_embs,
            unconditional_conditioning=null_y,
            unconditional_guidance_scale=args.cfg_scale,
            model_kwargs=model_kwargs,
        )[0]

    samples = vae.decode(samples / vae_scale).sample
    torch.cuda.empty_cache()
    samples = resize_and_crop_tensor(samples, custom_hw[0, 1], custom_hw[0, 0])
    return ndarr_image(samples, normalize=True, value_range=(-1, 1)), c_vis, prompt_show


if __name__ == '__main__':
    # The names below stay module-level because generate_img reads them as
    # globals.
    args = get_args()
    config = read_config(args.config)
    set_env()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    save_prompt_path = 'output/demo/online_demo_prompts/'
    os.makedirs(save_prompt_path, exist_ok=True)

    if args.image_size not in (512, 1024):
        raise ValueError("We only provide pre-trained models for 512x512 and 1024x1024 resolutions.")
    lewei_scale = {512: 1, 1024: 2}
    latent_size = args.image_size // 8
    weight_dtype = torch.float16
    print(f"Inference with {weight_dtype}")

    model = PixArtMS_XL_2(input_size=latent_size, lewei_scale=lewei_scale[args.image_size])
    # The ControlNet wrapper is chosen from the *config* file's image size.
    if config.image_size == 512:
        print('model architecture ControlPixArtHalf and image size is 512')
        model = ControlPixArtHalf(model).to(device)
    elif config.image_size == 1024:
        print('model architecture ControlPixArtMSHalf and image size is 1024')
        model = ControlPixArtMSHalf(model).to(device)
    else:
        # Previously fell through silently, leaving an unwrapped model that
        # was never moved to `device`.
        raise ValueError(f'Unsupported config.image_size: {config.image_size}; expected 512 or 1024.')

    state_dict = find_model(args.model_path)['state_dict']
    if 'pos_embed' in state_dict:
        del state_dict['pos_embed']
    elif 'base_model.pos_embed' in state_dict:
        del state_dict['base_model.pos_embed']
    missing, unexpected = model.load_state_dict(state_dict, strict=False)
    print('Missing keys (missing pos_embed is normal): ', missing)
    print('Unexpected keys', unexpected)
    model.eval()
    model.to(weight_dtype)
    display_model_info = f'model path: {args.model_path},\n base image size: {args.image_size}'
    # Plain namespace lookup of the aspect-ratio table; no need for eval().
    base_ratios = globals()[f'ASPECT_RATIO_{args.image_size}_TEST']

    vae = AutoencoderKL.from_pretrained(args.tokenizer_path).to(device)
    hed = HEDdetector(False).to(device)
    if args.llm_model == 't5':
        print("begin load t5")
        llm_embed_model = T5Embedder(device=device, local_cache=True, cache_dir='data/t5_ckpts',
                                     torch_dtype=torch.float)
        print("finish load t5")
    else:
        print('We support t5 only, please initialize the llm again')
        # Unsupported encoder is an error: exit with a non-zero status.
        sys.exit(1)

    # DESCRIPTION is passed to the Interface itself; a bare gr.Markdown(...)
    # created outside any layout is never rendered.
    demo = gr.Interface(
        fn=generate_img,
        inputs=[
            Textbox(label="Enter a reference image, the resolution of image need around 1024 x 1024",
                    placeholder="Please enter your prompt. \n"),
            Image(type="pil", label="Condition"),
            Slider(minimum=0., maximum=10000., value=0, step=2, label='seed'),
        ],
        outputs=[
            Image(type="numpy", label="Img"),
            Image(type="numpy", label="HED Edge Map"),
            Textbox(label="clean prompt"),
        ],
        description=DESCRIPTION,
    )
    demo.queue(max_size=20).launch(server_name="0.0.0.0", server_port=args.port, debug=True)
import html import inspect import re import urllib.parse as ul from typing import Callable, List, Optional, Tuple, Union import torch import torch.nn.functional as F from transformers import T5EncoderModel, T5Tokenizer from diffusers.image_processor import PipelineImageInput, PixArtImageProcessor, VaeImageProcessor from diffusers.models import AutoencoderKL, Transformer2DModel from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput from diffusers.schedulers import DPMSolverMultistepScheduler from diffusers.utils import ( BACKENDS_MAPPING, deprecate, is_bs4_available, is_ftfy_available, logging, replace_example_docstring, ) from diffusers.utils.torch_utils import randn_tensor logger = logging.get_logger(__name__) # pylint: disable=invalid-name if is_bs4_available(): from bs4 import BeautifulSoup if is_ftfy_available(): import ftfy EXAMPLE_DOC_STRING = """ Examples: ```py >>> import torch >>> from diffusers import PixArtAlphaInpaintPipeline >>> # You can replace the checkpoint id with "PixArt-alpha/PixArt-XL-2-512x512" too. >>> pipe = PixArtAlphaInpaintPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16) >>> # Enable memory optimizations. 
# Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    **kwargs,
):
    """
    Configure `scheduler` through its `set_timesteps` method and hand back the resulting schedule.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler whose timesteps are configured and read.
        num_inference_steps (`int`):
            Number of diffusion steps to request. Only used when `timesteps` is `None`.
        device (`str` or `torch.device`, *optional*):
            Forwarded to `set_timesteps` as `device`.
        timesteps (`List[int]`, *optional*):
            Custom timestep schedule. When given, it is forwarded instead of `num_inference_steps` and the
            returned step count is the length of the scheduler's resulting schedule.
        **kwargs:
            Forwarded unchanged to `scheduler.set_timesteps`.

    Returns:
        `Tuple[torch.Tensor, int]`: the scheduler's timestep schedule and the number of inference steps.

    Raises:
        `ValueError`: if `timesteps` is given but `scheduler.set_timesteps` has no `timesteps` parameter.
    """
    # Default spacing: let the scheduler derive the schedule from the step count.
    if timesteps is None:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        return scheduler.timesteps, num_inference_steps

    # Custom schedule: only legal when the scheduler's signature accepts one.
    supported = inspect.signature(scheduler.set_timesteps).parameters
    if "timesteps" not in supported:
        raise ValueError(
            f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
            f" timestep schedules. Please check whether you are using the correct scheduler."
        )
    scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
    schedule = scheduler.timesteps
    return schedule, len(schedule)


# Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    """
    Pull latents out of an encoder output.

    With a `latent_dist` attribute, `sample_mode="sample"` draws from it using `generator` and
    `sample_mode="argmax"` takes its mode. Otherwise a plain `latents` attribute is returned.

    Raises:
        `AttributeError`: if none of the cases above applies.
    """
    has_distribution = hasattr(encoder_output, "latent_dist")
    if has_distribution and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    if has_distribution and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    if hasattr(encoder_output, "latents"):
        return encoder_output.latents
    raise AttributeError("Could not access latents of provided encoder_output")
# Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/utils.py
def mask_text_embeddings(self, emb, mask):
    """
    Drop or zero the masked positions of a text-embedding tensor.

    Args:
        emb: 4-D embedding tensor; dimension 0 is the batch and dimension 2 is the token axis.
        mask: 2-D `(batch, tokens)` mask, non-zero for tokens to keep.

    Returns:
        `(embeddings, length)`. For a batch of one, the embeddings are truncated along the token axis to the
        mask's sum and `length` is that sum. For larger batches the embeddings are multiplied by the
        broadcast mask and `length` is the unchanged token-axis size.
    """
    if emb.shape[0] == 1:
        # `.item()` is a float for floating-point masks, which is not a valid
        # slice bound; int() keeps both mask dtypes working.
        keep_index = int(mask.sum().item())
        return emb[:, :, :keep_index, :], keep_index
    masked_feature = emb * mask[:, None, :, None]
    return masked_feature, emb.shape[2]
Args: prompt (`str` or `List[str]`, *optional*): prompt to be encoded negative_prompt (`str` or `List[str]`, *optional*): The prompt not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). For PixArt-Alpha, this should be "". do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): whether to use classifier free guidance or not num_images_per_prompt (`int`, *optional*, defaults to 1): number of images that should be generated per prompt device: (`torch.device`, *optional*): torch device to place the resulting embeddings on prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. negative_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated negative text embeddings. For PixArt-Alpha, it's should be the embeddings of the "" string. clean_caption (bool, defaults to `False`): If `True`, the function will preprocess and clean the provided caption before encoding. """ if "mask_feature" in kwargs: deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version." deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False) if device is None: device = self._execution_device if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): batch_size = len(prompt) else: batch_size = prompt_embeds.shape[0] # See Section 3.1. of the paper. 
max_length = 120 if prompt_embeds is None: prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) text_inputs = self.tokenizer( prompt, padding="max_length", max_length=max_length, truncation=True, add_special_tokens=True, return_tensors="pt", ) text_input_ids = text_inputs.input_ids untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( text_input_ids, untruncated_ids ): removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" f" {max_length} tokens: {removed_text}" ) prompt_attention_mask = text_inputs.attention_mask prompt_attention_mask = prompt_attention_mask.to(device) prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=prompt_attention_mask) prompt_embeds = prompt_embeds[0] if self.text_encoder is not None: dtype = self.text_encoder.dtype elif self.transformer is not None: dtype = self.transformer.dtype else: dtype = None prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) prompt_attention_mask = prompt_attention_mask.view(bs_embed, -1) prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: uncond_tokens = [negative_prompt] * batch_size uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( uncond_tokens, padding="max_length", 
# Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
def prepare_extra_step_kwargs(self, generator, eta):
    """
    Collect the optional keyword arguments that `self.scheduler.step` accepts.

    Scheduler `step` signatures differ, so `eta` and `generator` are only passed on when the signature
    declares a parameter of that name.
    eta corresponds to η in the DDIM paper (https://arxiv.org/abs/2010.02502) and should be between [0, 1].

    Returns:
        `dict` holding `"eta"` and/or `"generator"`, or an empty dict when neither is accepted.
    """
    # Inspect the signature once and filter the candidates against it.
    accepted = set(inspect.signature(self.scheduler.step).parameters.keys())
    optional_args = {"eta": eta, "generator": generator}
    return {name: value for name, value in optional_args.items() if name in accepted}
) if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" f" {negative_prompt_embeds}. Please make sure to only forward one of the two." ) if prompt_embeds is not None and prompt_attention_mask is None: raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.") if negative_prompt_embeds is not None and negative_prompt_attention_mask is None: raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.") if prompt_embeds is not None and negative_prompt_embeds is not None: if prompt_embeds.shape != negative_prompt_embeds.shape: raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" f" {negative_prompt_embeds.shape}." ) if prompt_attention_mask.shape != negative_prompt_attention_mask.shape: raise ValueError( "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but" f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`" f" {negative_prompt_attention_mask.shape}." 
) # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing def _text_preprocessing(self, text, clean_caption=False): if clean_caption and not is_bs4_available(): logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False if clean_caption and not is_ftfy_available(): logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False if not isinstance(text, (tuple, list)): text = [text] def process(text: str): if clean_caption: text = self._clean_caption(text) text = self._clean_caption(text) else: text = text.lower().strip() return text return [process(t) for t in text] # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption def _clean_caption(self, caption): caption = str(caption) caption = ul.unquote_plus(caption) caption = caption.strip().lower() caption = re.sub("", "person", caption) # urls: caption = re.sub( r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", caption, ) # regex for urls caption = re.sub( r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", caption, ) # regex for urls # html: caption = BeautifulSoup(caption, features="html.parser").text # @ caption = re.sub(r"@[\w\d]+\b", "", caption) # 31C0—31EF CJK Strokes # 31F0—31FF Katakana Phonetic Extensions # 3200—32FF Enclosed CJK Letters and Months # 3300—33FF CJK Compatibility # 3400—4DBF CJK Unified Ideographs Extension A # 4DC0—4DFF Yijing Hexagram Symbols # 4E00—9FFF CJK Unified Ideographs caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) caption = re.sub(r"[\u3200-\u32ff]+", "", caption) caption = re.sub(r"[\u3300-\u33ff]+", "", caption) caption = 
re.sub(r"[\u3400-\u4dbf]+", "", caption) caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) ####################################################### # все виды тире / all types of dash --> "-" caption = re.sub( r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa "-", caption, ) # кавычки к одному стандарту caption = re.sub(r"[`´«»“”¨]", '"', caption) caption = re.sub(r"[‘’]", "'", caption) # " caption = re.sub(r""?", "", caption) # & caption = re.sub(r"&", "", caption) # ip adresses: caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) # article ids: caption = re.sub(r"\d:\d\d\s+$", "", caption) # \n caption = re.sub(r"\\n", " ", caption) # "#123" caption = re.sub(r"#\d{1,3}\b", "", caption) # "#12345.." caption = re.sub(r"#\d{5,}\b", "", caption) # "123456.." caption = re.sub(r"\b\d{6,}\b", "", caption) # filenames: caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) # caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT caption = re.sub(r"\s+\.\s+", r" ", caption) # " . 
" # this-is-my-cute-cat / this_is_my_cute_cat regex2 = re.compile(r"(?:\-|\_)") if len(re.findall(regex2, caption)) > 3: caption = re.sub(regex2, " ", caption) caption = ftfy.fix_text(caption) caption = html.unescape(html.unescape(caption)) caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) caption = re.sub(r"\bpage\s+\d+\b", "", caption) caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) caption = re.sub(r"\b\s+\:\s+", r": ", caption) caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) caption = re.sub(r"\s+", " ", caption) caption.strip() caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) caption = re.sub(r"^\.\S+$", "", caption) return caption.strip() # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None, image=None, timestep=None, is_strength_max=True, return_image_latents=True, ): shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" f" size of {batch_size}. 
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(
    self,
    batch_size,
    num_channels_latents,
    height,
    width,
    dtype,
    device,
    generator,
    latents=None,
    image=None,
    timestep=None,
    is_strength_max=True,
    return_image_latents=True,
):
    """Build the initial latents, the noise they were derived from, and the image latents.

    Args:
        batch_size: Effective batch size (prompts times images per prompt).
        num_channels_latents: Channel count of the latent tensor.
        height, width: Pixel-space size; divided by `self.vae_scale_factor` for the latent shape.
        dtype, device: Target dtype/device for the image and the sampled noise.
        generator: A generator or a list of generators (one per batch element).
        latents: Optional user-supplied noise. When given it is used as `noise`.
        image: Image tensor. A tensor with 4 channels is taken to be latents already and is
            not encoded again.
        timestep: Timestep at which `image` latents are noised when `is_strength_max` is False.
        is_strength_max: True when denoising starts from pure noise.
        return_image_latents: Whether the image must be encoded so its latents can be returned.

    Returns:
        A tuple `(latents, noise, image_latents)`. `image_latents` is `None` when the image
        did not have to be encoded.

    Raises:
        ValueError: If the generator list length differs from `batch_size`, or if
            `image`/`timestep` is missing while `is_strength_max` is False.
    """
    shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if (image is None or timestep is None) and not is_strength_max:
        raise ValueError(
            "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
            "However, either the image or the noise timestep has not been provided."
        )

    # Bound up front so the return statement never hits an unassigned local when
    # encoding is skipped.
    image_latents = None
    if return_image_latents or (latents is None and not is_strength_max):
        image = image.to(device=device, dtype=dtype)
        if image.shape[1] == 4:
            # Already in latent space: use as-is.
            image_latents = image
        else:
            image_latents = self._encode_vae_image(image=image, generator=generator)
        image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)

    if latents is None:
        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        if is_strength_max:
            # Pure noise start: scale once by the scheduler's initial sigma.
            latents = noise * self.scheduler.init_noise_sigma
        else:
            # Image + noise start: `add_noise` already yields correctly scaled latents.
            latents = self.scheduler.add_noise(image_latents, noise, timestep)
    else:
        noise = latents.to(device)
        latents = noise * self.scheduler.init_noise_sigma

    # NOTE: the scheduler sigma is applied exactly once above. A further multiplication here
    # would square the factor for schedulers whose `init_noise_sigma` differs from 1.
    return latents, noise, image_latents


def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
    """Encode `image` with the VAE and scale the result by `vae.config.scaling_factor`.

    When `generator` is a list, each batch element is encoded separately with its own
    generator and the results are concatenated along the batch dimension.
    """
    if isinstance(generator, list):
        per_sample = [
            retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
            for i in range(image.shape[0])
        ]
        image_latents = torch.cat(per_sample, dim=0)
    else:
        image_latents = retrieve_latents(self.vae.encode(image), generator=generator)

    return self.vae.config.scaling_factor * image_latents
def prepare_mask_latents(
    self, mask, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
):
    """Resize the mask to latent resolution and expand it to the requested batch size.

    The mask is interpolated before the dtype cast (the original authors note this avoids
    problems with cpu offload and half precision). If fewer masks than `batch_size` are
    given they are tiled; with classifier-free guidance the result is duplicated once more.

    Raises:
        ValueError: If `batch_size` is not a multiple of the number of masks passed.
    """
    latent_size = (height // self.vae_scale_factor, width // self.vae_scale_factor)
    mask = torch.nn.functional.interpolate(mask, size=latent_size)
    mask = mask.to(device=device, dtype=dtype)

    num_masks = mask.shape[0]
    if num_masks < batch_size:
        if batch_size % num_masks != 0:
            raise ValueError(
                "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
                f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
                " of masks that you pass is divisible by the total requested batch size."
            )
        mask = mask.repeat(batch_size // num_masks, 1, 1, 1)

    if do_classifier_free_guidance:
        # One copy for the unconditional pass and one for the conditional pass.
        return torch.cat([mask] * 2)
    return mask


# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
def get_timesteps(self, num_inference_steps, strength, device):
    """Return the tail of the scheduler's timesteps selected by `strength`, and its step count.

    `strength` scales how many of the `num_inference_steps` are actually run; the skipped
    steps are dropped from the front of `self.scheduler.timesteps`.
    """
    steps_to_run = min(int(num_inference_steps * strength), num_inference_steps)
    first_step = max(num_inference_steps - steps_to_run, 0)

    offset = first_step * self.scheduler.order
    remaining = self.scheduler.timesteps[offset:]
    return remaining, num_inference_steps - first_step
# NOTE(review): the `Examples:` line inside the docstring is presumably the placeholder that
# `replace_example_docstring` fills in with `EXAMPLE_DOC_STRING` - keep it on its own line.
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
    self,
    prompt: Union[str, List[str]] = None,
    image: PipelineImageInput = None,
    mask_image: PipelineImageInput = None,
    strength: float = 1.0,
    negative_prompt: str = "",
    num_inference_steps: int = 20,
    timesteps: List[int] = None,
    guidance_scale: float = 4.5,
    num_images_per_prompt: Optional[int] = 1,
    height: Optional[int] = None,
    width: Optional[int] = None,
    eta: float = 0.0,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    latents: Optional[torch.FloatTensor] = None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    prompt_attention_mask: Optional[torch.FloatTensor] = None,
    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_prompt_attention_mask: Optional[torch.FloatTensor] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
    callback_steps: int = 1,
    clean_caption: bool = True,
    use_resolution_binning: bool = True,
    **kwargs,
) -> Union[ImagePipelineOutput, Tuple]:
    """
    Function invoked when calling the pipeline for generation.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts to guide the image generation. If not defined, one has to pass
            `prompt_embeds` instead.
        image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
            `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the
            image to be masked out with `mask_image` and repainted according to `prompt`). For both numpy
            array and pytorch tensor, the expected value range is between `[0, 1]`. If it's a tensor or a
            list of tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy
            array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)`. It can
            also accept image latents as `image`, but if passing latents directly it is not encoded again.
        mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
            `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the
            mask are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is
            converted to a single channel (luminance) before use. If it's a numpy array or pytorch tensor,
            it should contain one color channel (L) instead of 3, so the expected shape for pytorch tensor
            would be `(B, 1, H, W)`, `(B, H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for
            `(B, H, W, 1)`, `(B, H, W)`, `(H, W, 1)`, or `(H, W)`.
        strength (`float`, *optional*, defaults to 1.0):
            Fraction of `num_inference_steps` that is actually run. When it equals `1.0` the latents are
            initialised with pure noise; otherwise they are initialised from the noised image latents.
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if
            `guidance_scale` is less than `1`).
        num_inference_steps (`int`, *optional*, defaults to 20):
            The number of denoising steps. More denoising steps usually lead to a higher quality image at
            the expense of slower inference.
        timesteps (`List[int]`, *optional*):
            Custom timesteps to use for the denoising process. If not defined, equal spaced
            `num_inference_steps` timesteps are used. Must be in descending order.
        guidance_scale (`float`, *optional*, defaults to 4.5):
            Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
            `guidance_scale` is defined as `w` of equation 2. of [Imagen
            Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting
            `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely
            linked to the text `prompt`, usually at the expense of lower image quality.
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        height (`int`, *optional*, defaults to `self.transformer.config.sample_size * self.vae_scale_factor`):
            The height in pixels of the generated image.
        width (`int`, *optional*, defaults to `self.transformer.config.sample_size * self.vae_scale_factor`):
            The width in pixels of the generated image.
        eta (`float`, *optional*, defaults to 0.0):
            Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only
            applies to [`schedulers.DDIMScheduler`], will be ignored for others.
        generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
            One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
            to make generation deterministic.
        latents (`torch.FloatTensor`, *optional*):
            Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for
            image generation. Can be used to tweak the same generation with different prompts. If not
            provided, a latents tensor will be generated by sampling using the supplied random `generator`.
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, text embeddings will be generated from `prompt` input argument.
        prompt_attention_mask (`torch.FloatTensor`, *optional*):
            Pre-generated attention mask for text embeddings.
        negative_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative text embeddings. For PixArt-Alpha this negative prompt should be "". If
            not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
        negative_prompt_attention_mask (`torch.FloatTensor`, *optional*):
            Pre-generated attention mask for negative text embeddings.
        output_type (`str`, *optional*, defaults to `"pil"`):
            The output format of the generated image. Choose between
            [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. With
            `"latent"` the raw latents are returned without VAE decoding.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
        callback (`Callable`, *optional*):
            A function that will be called every `callback_steps` steps during inference. The function
            will be called with the following arguments: `callback(step: int, timestep: int, latents:
            torch.FloatTensor)`.
        callback_steps (`int`, *optional*, defaults to 1):
            The frequency at which the `callback` function will be called. If not specified, the callback
            will be called at every step.
        clean_caption (`bool`, *optional*, defaults to `True`):
            Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and
            `ftfy` to be installed. If the dependencies are not installed, the embeddings will be created
            from the raw prompt.
        use_resolution_binning (`bool` defaults to `True`):
            If set to `True`, the requested height and width are first mapped to the closest resolutions
            using `ASPECT_RATIO_1024_BIN` (when the transformer's `sample_size` is 128) or
            `ASPECT_RATIO_512_BIN` (otherwise). After the produced latents are decoded into images, they
            are resized back to the requested resolution. Useful for generating non-square images.

    Examples:

    Returns:
        [`~pipelines.ImagePipelineOutput`] or `tuple`:
            If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple`
            is returned where the first element is a list with the generated images
    """
    if "mask_feature" in kwargs:
        deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version."
        deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False)

    # 1. Resolve the output size, optionally snap it to a resolution bin, then validate inputs
    height = height or self.transformer.config.sample_size * self.vae_scale_factor
    width = width or self.transformer.config.sample_size * self.vae_scale_factor
    if use_resolution_binning:
        aspect_ratio_bin = (
            ASPECT_RATIO_1024_BIN if self.transformer.config.sample_size == 128 else ASPECT_RATIO_512_BIN
        )
        # Remember the requested size so the decoded image can be resized back at the end.
        orig_height, orig_width = height, width
        height, width = self.image_processor.classify_height_width_bin(height, width, ratios=aspect_ratio_bin)

    self.check_inputs(
        prompt,
        height,
        width,
        negative_prompt,
        callback_steps,
        prompt_embeds,
        negative_prompt_embeds,
        prompt_attention_mask,
        negative_prompt_attention_mask,
    )

    # 2. Derive the batch size from the prompt, or from the precomputed embeddings
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    device = self._execution_device

    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    do_classifier_free_guidance = guidance_scale > 1.0

    # 3. Encode input prompt
    (
        prompt_embeds,
        prompt_attention_mask,
        negative_prompt_embeds,
        negative_prompt_attention_mask,
    ) = self.encode_prompt(
        prompt,
        do_classifier_free_guidance,
        negative_prompt=negative_prompt,
        num_images_per_prompt=num_images_per_prompt,
        device=device,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        prompt_attention_mask=prompt_attention_mask,
        negative_prompt_attention_mask=negative_prompt_attention_mask,
        clean_caption=clean_caption,
    )
    if do_classifier_free_guidance:
        # Unconditional half first, conditional half second; `chunk(2)` below relies on this order.
        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
        prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)

    # 4. Prepare timesteps
    timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
    # Drop the leading steps according to `strength`.
    timesteps, num_inference_steps = self.get_timesteps(
        num_inference_steps=num_inference_steps, strength=strength, device=device
    )
    # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
    latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
    # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
    is_strength_max = strength == 1.0

    init_image = self.image_processor.preprocess(image, height=height, width=width)
    init_image = init_image.to(dtype=torch.float32)

    # 5. Prepare latents.
    latent_channels = self.transformer.config.in_channels
    latents_outputs = self.prepare_latents(
        batch_size * num_images_per_prompt,
        latent_channels,
        height,
        width,
        prompt_embeds.dtype,
        device,
        generator,
        latents,
        image=init_image,
        timestep=latent_timestep,
        is_strength_max=is_strength_max,
    )
    latents, noise, image_latents = latents_outputs

    # Binarised single-channel mask, resized to the latent resolution.
    mask_condition = self.mask_processor.preprocess(mask_image, height=height, width=width)
    mask = self.prepare_mask_latents(
        mask_condition,
        batch_size * num_images_per_prompt,
        height,
        width,
        prompt_embeds.dtype,
        device,
        generator,
        do_classifier_free_guidance,
    )

    # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    # 6.1 Prepare micro-conditions (only populated when the transformer's sample_size is 128).
    added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
    if self.transformer.config.sample_size == 128:
        resolution = torch.tensor([height, width]).repeat(batch_size * num_images_per_prompt, 1)
        aspect_ratio = torch.tensor([float(height / width)]).repeat(batch_size * num_images_per_prompt, 1)
        resolution = resolution.to(dtype=prompt_embeds.dtype, device=device)
        aspect_ratio = aspect_ratio.to(dtype=prompt_embeds.dtype, device=device)
        added_cond_kwargs = {"resolution": resolution, "aspect_ratio": aspect_ratio}

    # 7. Denoising loop
    num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)

    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            current_timestep = t
            if not torch.is_tensor(current_timestep):
                # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
                # This would be a good case for the `match` statement (Python 3.10+)
                is_mps = latent_model_input.device.type == "mps"
                if isinstance(current_timestep, float):
                    dtype = torch.float32 if is_mps else torch.float64
                else:
                    dtype = torch.int32 if is_mps else torch.int64
                current_timestep = torch.tensor([current_timestep], dtype=dtype, device=latent_model_input.device)
            elif len(current_timestep.shape) == 0:
                current_timestep = current_timestep[None].to(latent_model_input.device)
            # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
            current_timestep = current_timestep.expand(latent_model_input.shape[0])

            # predict noise model_output
            noise_pred = self.transformer(
                latent_model_input,
                encoder_hidden_states=prompt_embeds,
                encoder_attention_mask=prompt_attention_mask,
                timestep=current_timestep,
                added_cond_kwargs=added_cond_kwargs,
                return_dict=False,
            )[0]

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            # learned sigma: when the model emits twice the latent channels, keep only the first half
            if self.transformer.config.out_channels // 2 == latent_channels:
                noise_pred = noise_pred.chunk(2, dim=1)[0]
            else:
                noise_pred = noise_pred

            # compute previous image: x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

            # Inpainting blend: re-noise the original image latents to the next timestep and keep
            # them wherever the mask is 0; keep the freshly denoised latents where the mask is 1.
            init_latents_proper = image_latents
            if do_classifier_free_guidance:
                # The mask was duplicated for guidance; a single copy is enough here.
                init_mask, _ = mask.chunk(2)
            else:
                init_mask = mask

            if i < len(timesteps) - 1:
                noise_timestep = timesteps[i + 1]
                init_latents_proper = self.scheduler.add_noise(
                    init_latents_proper, noise, torch.tensor([noise_timestep])
                )

            latents = (1 - init_mask) * init_latents_proper + init_mask * latents

            # call the callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                progress_bar.update()
                if callback is not None and i % callback_steps == 0:
                    step_idx = i // getattr(self.scheduler, "order", 1)
                    callback(step_idx, t, latents)

    if not output_type == "latent":
        image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
        if use_resolution_binning:
            # Undo the resolution binning applied in step 1.
            image = self.image_processor.resize_and_crop_tensor(image, orig_width, orig_height)
    else:
        image = latents

    if not output_type == "latent":
        image = self.image_processor.postprocess(image, output_type=output_type)

    # Offload all models
    self.maybe_free_model_hooks()

    if not return_dict:
        return (image,)

    return ImagePipelineOutput(images=image)
import html import inspect import re import urllib.parse as ul from typing import Callable, List, Optional, Tuple, Union from PIL import Image import torch import torch.nn.functional as F from transformers import T5EncoderModel, T5Tokenizer from diffusers.image_processor import VaeImageProcessor, PipelineImageInput from diffusers.models import AutoencoderKL, Transformer2DModel from diffusers.schedulers import DPMSolverMultistepScheduler from diffusers.utils import ( BACKENDS_MAPPING, deprecate, is_bs4_available, is_ftfy_available, logging, replace_example_docstring, ) from diffusers.utils.torch_utils import randn_tensor from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name if is_bs4_available(): from bs4 import BeautifulSoup if is_ftfy_available(): import ftfy EXAMPLE_DOC_STRING = """ Examples: ```py >>> import PIL >>> from io import BytesIO >>> import requests >>> import torch >>> from diffusers import PixArtAlphaReferencePipeline >>> def download_image(url): ... response = requests.get(url) ... return PIL.Image.open(BytesIO(response.content)).convert("RGB") >>> # You can replace the checkpoint id with "PixArt-alpha/PixArt-XL-2-512x512" too. >>> pipe = PixArtAlphaReferencePipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16) >>> pipe = pipe.to('cuda') >>> img_url = "http://p1.qhimgs4.com/t01fef6f9d5e69335dd.jpg" >>> ref_image = download_image(img_url).crop((0, 0, 2160, 2160)).resize((1024, 1024)) >>> image_out = pipe( ... prompt='', ... height=1024, ... width=1024, ... image=ref_image, ... num_inference_steps=20, ... guidance_scale=4.0, ... 
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    **kwargs,
):
    """
    Configure `scheduler` via its `set_timesteps` method and return the resulting schedule.

    Custom `timesteps` take precedence over `num_inference_steps`. Extra keyword arguments are
    forwarded to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            Number of diffusion steps; used only when `timesteps` is `None`.
        device (`str` or `torch.device`, *optional*):
            Device passed on to `set_timesteps`.
        timesteps (`List[int]`, *optional*):
            Custom timestep schedule. Requires a scheduler whose `set_timesteps` accepts a
            `timesteps` argument.

    Returns:
        `Tuple[torch.Tensor, int]`: The scheduler's timestep schedule and the number of
        inference steps (the schedule length when custom `timesteps` were given).

    Raises:
        ValueError: If custom `timesteps` are passed to a scheduler that does not support them.
    """
    if timesteps is None:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        return scheduler.timesteps, num_inference_steps

    supported_params = inspect.signature(scheduler.set_timesteps).parameters
    if "timesteps" not in supported_params:
        raise ValueError(
            f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
            f" timestep schedules. Please check whether you are using the correct scheduler."
        )

    scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
    schedule = scheduler.timesteps
    return schedule, len(schedule)


# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    """Extract latents from an encoder output.

    Prefers `encoder_output.latent_dist` (sampled with `generator` for `"sample"`, or its mode
    for `"argmax"`), then falls back to `encoder_output.latents`.

    Raises:
        AttributeError: If none of the above can be used.
    """
    if hasattr(encoder_output, "latent_dist"):
        if sample_mode == "sample":
            return encoder_output.latent_dist.sample(generator)
        if sample_mode == "argmax":
            return encoder_output.latent_dist.mode()

    if hasattr(encoder_output, "latents"):
        return encoder_output.latents

    raise AttributeError("Could not access latents of provided encoder_output")
def __init__(
    self,
    tokenizer: T5Tokenizer,
    text_encoder: T5EncoderModel,
    vae: AutoencoderKL,
    transformer: Transformer2DModel,
    scheduler: DPMSolverMultistepScheduler,
):
    """Register the pipeline components and build the image and mask processors.

    The VAE scale factor is `2 ** (len(vae.config.block_out_channels) - 1)`. The mask
    processor converts to grayscale and binarises without normalising.
    """
    super().__init__()

    components = {
        "tokenizer": tokenizer,
        "text_encoder": text_encoder,
        "vae": vae,
        "transformer": transformer,
        "scheduler": scheduler,
    }
    self.register_modules(**components)

    num_vae_levels = len(self.vae.config.block_out_channels)
    self.vae_scale_factor = 2 ** (num_vae_levels - 1)

    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
    self.mask_processor = VaeImageProcessor(
        vae_scale_factor=self.vae_scale_factor,
        do_normalize=False,
        do_binarize=True,
        do_convert_grayscale=True,
    )


# Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/utils.py
def mask_text_embeddings(self, emb, mask):
    """Apply an attention mask to text embeddings.

    For a single-item batch the embeddings are truncated along dim 2 to the number of
    set mask entries; for larger batches they are multiplied by the broadcast mask.

    Returns:
        `(embeddings, length)` where `length` is the kept length (single item) or the
        unchanged size of dim 2 (batched).
    """
    if emb.shape[0] != 1:
        broadcast_mask = mask[:, None, :, None]
        return emb * broadcast_mask, emb.shape[2]

    keep_index = mask.sum().item()
    truncated = emb[:, :, :keep_index, :]
    return truncated, keep_index
# Adapted from diffusers.pipelines.deepfloyd_if.pipeline_if.encode_prompt
def encode_prompt(
    self,
    prompt: Union[str, List[str]],
    do_classifier_free_guidance: bool = True,
    negative_prompt: str = "",
    num_images_per_prompt: int = 1,
    device: Optional[torch.device] = None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    prompt_attention_mask: Optional[torch.FloatTensor] = None,
    negative_prompt_attention_mask: Optional[torch.FloatTensor] = None,
    clean_caption: bool = False,
    **kwargs,
):
    r"""
    Encodes the prompt into text encoder hidden states.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            prompt to be encoded
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt not to guide the image generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if
            `guidance_scale` is less than `1`). For PixArt-Alpha, this should be "".
        do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
            whether to use classifier free guidance or not
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            number of images that should be generated per prompt
        device: (`torch.device`, *optional*):
            torch device to place the resulting embeddings on
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative text embeddings. For PixArt-Alpha, it should be the embeddings of
            the "" string.
        prompt_attention_mask (`torch.FloatTensor`, *optional*):
            Attention mask matching `prompt_embeds`; needed when `prompt_embeds` is passed.
        negative_prompt_attention_mask (`torch.FloatTensor`, *optional*):
            Attention mask matching `negative_prompt_embeds`; needed when those are passed.
        clean_caption (bool, defaults to `False`):
            If `True`, the function will preprocess and clean the provided caption before encoding.

    Returns:
        `(prompt_embeds, prompt_attention_mask, negative_prompt_embeds,
        negative_prompt_attention_mask)`. Each tensor has `batch * num_images_per_prompt` rows,
        with the copies of one prompt stored contiguously. The two negative entries are `None`
        when classifier-free guidance is disabled.
    """
    if "mask_feature" in kwargs:
        deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version."
        deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False)

    if device is None:
        device = self._execution_device

    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    # See Section 3.1. of the paper.
    max_length = 120

    if prompt_embeds is None:
        prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
        text_inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        text_input_ids = text_inputs.input_ids
        # Tokenise again without truncation purely to report what was cut off.
        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
            text_input_ids, untruncated_ids
        ):
            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
            logger.warning(
                "The following part of your input was truncated because T5 can only handle sequences up to"
                f" {max_length} tokens: {removed_text}"
            )

        prompt_attention_mask = text_inputs.attention_mask
        prompt_attention_mask = prompt_attention_mask.to(device)

        prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=prompt_attention_mask)
        prompt_embeds = prompt_embeds[0]

    if self.text_encoder is not None:
        dtype = self.text_encoder.dtype
    elif self.transformer is not None:
        dtype = self.transformer.dtype
    else:
        dtype = None

    prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)

    bs_embed, seq_len, _ = prompt_embeds.shape
    # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
    prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
    # Repeat along the sequence axis and then reshape, exactly like the embeddings above, so
    # that row i of the mask belongs to row i of the embeddings. Tiling along the batch axis
    # would interleave prompts and misalign the two tensors for batches larger than one.
    prompt_attention_mask = prompt_attention_mask.view(bs_embed, -1)
    prompt_attention_mask = prompt_attention_mask.repeat(1, num_images_per_prompt)
    prompt_attention_mask = prompt_attention_mask.view(bs_embed * num_images_per_prompt, -1)

    # get unconditional embeddings for classifier free guidance
    if do_classifier_free_guidance and negative_prompt_embeds is None:
        uncond_tokens = [negative_prompt] * batch_size
        uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
        # Pad the negative prompt to the same sequence length as the positive embeddings.
        max_length = prompt_embeds.shape[1]
        uncond_input = self.tokenizer(
            uncond_tokens,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        negative_prompt_attention_mask = uncond_input.attention_mask
        negative_prompt_attention_mask = negative_prompt_attention_mask.to(device)

        negative_prompt_embeds = self.text_encoder(
            uncond_input.input_ids.to(device), attention_mask=negative_prompt_attention_mask
        )
        negative_prompt_embeds = negative_prompt_embeds[0]

    if do_classifier_free_guidance:
        # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
        seq_len = negative_prompt_embeds.shape[1]

        negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)

        negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
        negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

        # Same row ordering as the embeddings (see the positive mask above).
        negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed, -1)
        negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(1, num_images_per_prompt)
        negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed * num_images_per_prompt, -1)
    else:
        negative_prompt_embeds = None
        negative_prompt_attention_mask = None

    return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( self, prompt, image, height, width, negative_prompt, callback_steps, prompt_embeds=None, negative_prompt_embeds=None, prompt_attention_mask=None, negative_prompt_attention_mask=None, ): if height % 8 != 0 or width % 8 != 0: raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" f" {type(callback_steps)}." ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" " only forward one of the two." ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if prompt is not None and negative_prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:" f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
) if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" f" {negative_prompt_embeds}. Please make sure to only forward one of the two." ) if prompt_embeds is not None and prompt_attention_mask is None: raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.") if negative_prompt_embeds is not None and negative_prompt_attention_mask is None: raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.") if prompt_embeds is not None and negative_prompt_embeds is not None: if prompt_embeds.shape != negative_prompt_embeds.shape: raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" f" {negative_prompt_embeds.shape}." ) if prompt_attention_mask.shape != negative_prompt_attention_mask.shape: raise ValueError( "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but" f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`" f" {negative_prompt_attention_mask.shape}." ) if image is None: raise ValueError( "Provide `image`. Cannot leave `image` undefined." 
) # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing def _text_preprocessing(self, text, clean_caption=False): if clean_caption and not is_bs4_available(): logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False if clean_caption and not is_ftfy_available(): logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False if not isinstance(text, (tuple, list)): text = [text] def process(text: str): if clean_caption: text = self._clean_caption(text) text = self._clean_caption(text) else: text = text.lower().strip() return text return [process(t) for t in text] # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption def _clean_caption(self, caption): caption = str(caption) caption = ul.unquote_plus(caption) caption = caption.strip().lower() caption = re.sub("", "person", caption) # urls: caption = re.sub( r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", caption, ) # regex for urls caption = re.sub( r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", caption, ) # regex for urls # html: caption = BeautifulSoup(caption, features="html.parser").text # @ caption = re.sub(r"@[\w\d]+\b", "", caption) # 31C0—31EF CJK Strokes # 31F0—31FF Katakana Phonetic Extensions # 3200—32FF Enclosed CJK Letters and Months # 3300—33FF CJK Compatibility # 3400—4DBF CJK Unified Ideographs Extension A # 4DC0—4DFF Yijing Hexagram Symbols # 4E00—9FFF CJK Unified Ideographs caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) caption = re.sub(r"[\u3200-\u32ff]+", "", caption) caption = re.sub(r"[\u3300-\u33ff]+", "", caption) caption = 
re.sub(r"[\u3400-\u4dbf]+", "", caption) caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) ####################################################### # все виды тире / all types of dash --> "-" caption = re.sub( r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa "-", caption, ) # кавычки к одному стандарту caption = re.sub(r"[`´«»“”¨]", '"', caption) caption = re.sub(r"[‘’]", "'", caption) # " caption = re.sub(r""?", "", caption) # & caption = re.sub(r"&", "", caption) # ip adresses: caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) # article ids: caption = re.sub(r"\d:\d\d\s+$", "", caption) # \n caption = re.sub(r"\\n", " ", caption) # "#123" caption = re.sub(r"#\d{1,3}\b", "", caption) # "#12345.." caption = re.sub(r"#\d{5,}\b", "", caption) # "123456.." caption = re.sub(r"\b\d{6,}\b", "", caption) # filenames: caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) # caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT caption = re.sub(r"\s+\.\s+", r" ", caption) # " . 
# Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(
    self,
    batch_size,
    num_channels_latents,
    height,
    width,
    dtype,
    device,
    generator,
    latents=None,
    image=None,
    timestep=None,
    is_strength_max=True,
    return_image_latents=True,
):
    """Build the initial latents, the noise they derive from, and the image latents.

    Args:
        batch_size: Effective batch size of the returned latents.
        num_channels_latents: Channel count of the latent tensor.
        height, width: Pixel size; divided by ``self.vae_scale_factor`` to get
            the latent spatial size.
        dtype, device: Target dtype/device for sampled noise and for ``image``.
        generator: A generator or a list of generators; a list must have
            exactly ``batch_size`` entries.
        latents: Optional pre-sampled noise. When given it is moved to
            ``device`` and used as the noise instead of sampling.
        image: Reference image tensor. It is used as-is when it already has
            4 channels (``image.shape[1] == 4``), otherwise it is encoded with
            ``self._encode_vae_image``. Required whenever image latents are
            computed (``return_image_latents`` true, or blending is needed).
        timestep: Noise timestep passed to ``self.scheduler.add_noise`` when
            ``is_strength_max`` is false.
        is_strength_max: True starts from pure noise; false starts from the
            image latents noised to ``timestep``.
        return_image_latents: Whether image latents are computed even when they
            are not needed for blending.

    Returns:
        tuple: ``(latents, noise, image_latents)``. ``image_latents`` is
        ``None`` when it was neither requested nor needed.

    Raises:
        ValueError: If a generator list does not match ``batch_size``, or if
            ``is_strength_max`` is false while ``image`` or ``timestep`` is
            missing.
    """
    shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if (image is None or timestep is None) and not is_strength_max:
        raise ValueError(
            "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
            "However, either the image or the noise timestep has not been provided."
        )

    # Always bound, so the return statement cannot raise UnboundLocalError when
    # the image latents are neither requested nor needed.
    image_latents = None
    if return_image_latents or (latents is None and not is_strength_max):
        image = image.to(device=device, dtype=dtype)

        if image.shape[1] == 4:
            image_latents = image
        else:
            image_latents = self._encode_vae_image(image=image, generator=generator)
        # Tile the image latents up to the effective batch size.
        image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)

    if latents is None:
        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        # if strength is 1. then initialise the latents to noise, else initial to image + noise
        latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep)
        # if pure noise then scale the initial latents by the Scheduler's init sigma
        latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
    else:
        noise = latents.to(device)
        latents = noise * self.scheduler.init_noise_sigma

    # Both branches above already apply `init_noise_sigma` where it belongs.
    # The former unconditional second multiplication scaled pure-noise and
    # user-supplied latents twice, and also scaled the image+noise blend that
    # is meant to stay unscaled, so it is intentionally not repeated here.
    return latents, noise, image_latents
def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
    """Encode ``image`` with ``self.vae`` and scale by ``self.vae.config.scaling_factor``.

    When ``generator`` is a list, every batch item ``image[i: i + 1]`` is
    encoded on its own and its latents are retrieved with ``generator[i]``;
    the per-item results are concatenated along dim 0. Otherwise the whole
    batch is encoded in a single call with the given ``generator``.

    Returns:
        The retrieved latents multiplied by the VAE scaling factor.
    """
    if not isinstance(generator, list):
        encoded_batch = retrieve_latents(self.vae.encode(image), generator=generator)
        return self.vae.config.scaling_factor * encoded_batch

    per_item_latents = []
    for index in range(image.shape[0]):
        encoder_output = self.vae.encode(image[index: index + 1])
        per_item_latents.append(retrieve_latents(encoder_output, generator=generator[index]))
    encoded_batch = torch.cat(per_item_latents, dim=0)
    return self.vae.config.scaling_factor * encoded_batch
# Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
def get_timesteps(self, num_inference_steps, strength, device):
    """Select the tail of the scheduler's timesteps that matches ``strength``.

    Args:
        num_inference_steps: Total number of inference steps configured.
        strength: Fraction of the schedule to run; ``int(num_inference_steps *
            strength)`` steps are kept, capped at ``num_inference_steps``.
        device: Unused by this method; kept for interface compatibility.

    Returns:
        tuple: ``(timesteps, steps)`` where ``timesteps`` is
        ``self.scheduler.timesteps`` with the skipped leading entries removed
        (``skipped * self.scheduler.order`` of them) and ``steps`` is the number
        of inference steps that remain.
    """
    steps_to_run = min(int(num_inference_steps * strength), num_inference_steps)
    skipped_steps = max(num_inference_steps - steps_to_run, 0)
    first_index = skipped_steps * self.scheduler.order
    return self.scheduler.timesteps[first_index:], num_inference_steps - skipped_steps
image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): The reference image guides the image generation. negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). num_inference_steps (`int`, *optional*, defaults to 100): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. timesteps (`List[int]`, *optional*): Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` timesteps are used. Must be in descending order. guidance_scale (`float`, *optional*, defaults to 4.5): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. height (`int`, *optional*, defaults to self.unet.config.sample_size): The height in pixels of the generated image. width (`int`, *optional*, defaults to self.unet.config.sample_size): The width in pixels of the generated image. eta (`float`, *optional*, defaults to 0.0): Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to [`schedulers.DDIMScheduler`], will be ignored for others. 
generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. latents (`torch.FloatTensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. prompt_attention_mask (`torch.FloatTensor`, *optional*): Pre-generated attention mask for text embeddings. negative_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated negative text embeddings. For PixArt-Alpha this negative prompt should be "". If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. negative_prompt_attention_mask (`torch.FloatTensor`, *optional*): Pre-generated attention mask for negative text embeddings. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
clean_caption (`bool`, *optional*, defaults to `True`): Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to be installed. If the dependencies are not installed, the embeddings will be created from the raw prompt. use_resolution_binning (`bool` defaults to `True`): If set to `True`, the requested height and width are first mapped to the closest resolutions using `ASPECT_RATIO_1024_BIN`. After the produced latents are decoded into images, they are resized back to the requested resolution. Useful for generating non-square images. Examples: Returns: [`~pipelines.ImagePipelineOutput`] or `tuple`: If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is returned where the first element is a list with the generated images """ if "mask_feature" in kwargs: deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version." deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False) # 1. Check inputs. 
Raise error if not correct height = height or self.transformer.config.sample_size * self.vae_scale_factor width = width or self.transformer.config.sample_size * self.vae_scale_factor width *= 2 ref = image image = Image.new("RGB", (width, height), (255, 255, 255)) image.paste(ref, (0, 0)) mask_image = Image.new("RGB", (width, height), (255, 255, 255)) balck_rect = Image.new("RGB", (width // 2, height), (0, 0, 0)) mask_image.paste(balck_rect, (0, 0)) if use_resolution_binning: aspect_ratio_bin = ( ASPECT_RATIO_1024_BIN if self.transformer.config.sample_size == 128 else ASPECT_RATIO_512_BIN ) orig_height, orig_width = height, width height, width = self.classify_height_width_bin(height, width, ratios=aspect_ratio_bin) self.check_inputs( prompt, image, height, width, negative_prompt, callback_steps, prompt_embeds, negative_prompt_embeds, prompt_attention_mask, negative_prompt_attention_mask, ) # 2. Default height and width to transformer if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): batch_size = len(prompt) else: batch_size = prompt_embeds.shape[0] device = self._execution_device # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` # corresponds to doing no classifier free guidance. do_classifier_free_guidance = guidance_scale > 1.0 # 3. 
Encode input prompt ( prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask, ) = self.encode_prompt( prompt, do_classifier_free_guidance, negative_prompt=negative_prompt, num_images_per_prompt=num_images_per_prompt, device=device, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, prompt_attention_mask=prompt_attention_mask, negative_prompt_attention_mask=negative_prompt_attention_mask, clean_caption=clean_caption, ) if do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0) # 4. Prepare timesteps timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) timesteps, num_inference_steps = self.get_timesteps( num_inference_steps=num_inference_steps, strength=strength, device=device ) # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise is_strength_max = strength == 1.0 init_image = self.image_processor.preprocess(image, height=height, width=width) init_image = init_image.to(dtype=torch.float32) # 5. Prepare latents. latent_channels = self.transformer.config.in_channels latents_outputs = self.prepare_latents( batch_size * num_images_per_prompt, latent_channels, height, width, prompt_embeds.dtype, device, generator, latents, image=init_image, timestep=latent_timestep, is_strength_max=is_strength_max, ) latents, noise, image_latents = latents_outputs mask_condition = self.mask_processor.preprocess(mask_image, height=height, width=width) mask = self.prepare_mask_latents( mask_condition, batch_size * num_images_per_prompt, height, width, prompt_embeds.dtype, device, generator, do_classifier_free_guidance, ) # 6. 
Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 6.1 Prepare micro-conditions. added_cond_kwargs = {"resolution": None, "aspect_ratio": None} if self.transformer.config.sample_size == 128: resolution = torch.tensor([height, width]).repeat(batch_size * num_images_per_prompt, 1) aspect_ratio = torch.tensor([float(height / width)]).repeat(batch_size * num_images_per_prompt, 1) resolution = resolution.to(dtype=prompt_embeds.dtype, device=device) aspect_ratio = aspect_ratio.to(dtype=prompt_embeds.dtype, device=device) added_cond_kwargs = {"resolution": resolution, "aspect_ratio": aspect_ratio} # 7. Denoising loop num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) current_timestep = t if not torch.is_tensor(current_timestep): # TODO: this requires sync between CPU and GPU. 
So try to pass timesteps as tensors if you can # This would be a good case for the `match` statement (Python 3.10+) is_mps = latent_model_input.device.type == "mps" if isinstance(current_timestep, float): dtype = torch.float32 if is_mps else torch.float64 else: dtype = torch.int32 if is_mps else torch.int64 current_timestep = torch.tensor([current_timestep], dtype=dtype, device=latent_model_input.device) elif len(current_timestep.shape) == 0: current_timestep = current_timestep[None].to(latent_model_input.device) # broadcast to batch dimension in a way that's compatible with ONNX/Core ML # predict noise model_output noise_pred = self.transformer( latent_model_input, encoder_hidden_states=prompt_embeds, encoder_attention_mask=prompt_attention_mask, timestep=current_timestep, added_cond_kwargs=added_cond_kwargs, return_dict=False, )[0] # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # learned sigma if self.transformer.config.out_channels // 2 == latent_channels: noise_pred = noise_pred.chunk(2, dim=1)[0] else: noise_pred = noise_pred # compute previous image: x_t -> x_t-1 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] init_latents_proper = image_latents if do_classifier_free_guidance: init_mask, _ = mask.chunk(2) else: init_mask = mask if i < len(timesteps) - 1: noise_timestep = timesteps[i + 1] init_latents_proper = self.scheduler.add_noise( init_latents_proper, noise, torch.tensor([noise_timestep]) ) latents_ = latents latents = (1 - init_mask) * init_latents_proper + init_mask * latents latent_model_input = torch.cat([latents_] + [latents]) if do_classifier_free_guidance else latents # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % 
"""Summarise the per-module timings stored in ``timing_info.json``.

Every entry of the JSON list carries a ``timing_info`` mapping with the sample
lists ``attn_time``, ``cross_attn_time``, ``mlp_time`` and ``block_time``. The
script pools all samples per key, prints their means (the output labels them
as milliseconds) and saves a bar chart to ``module_average_times.png``.
"""
import json
import numpy as np
import matplotlib.pyplot as plt

# Keys read from each entry's 'timing_info' mapping, in processing order.
TIMING_KEYS = ('attn_time', 'cross_attn_time', 'mlp_time', 'block_time')

with open('timing_info.json', 'r') as f:
    data = json.load(f)

# Pool the samples of all entries into one flat list per key.
pooled_times = {key: [] for key in TIMING_KEYS}
for entry in data:
    timing_info = entry['timing_info']
    for key in TIMING_KEYS:
        pooled_times[key].extend(timing_info[key])

# All means are computed before anything is printed.
average_attn_time, average_cross_attn_time, average_mlp_time, average_block_time = (
    np.mean(pooled_times[key]) for key in TIMING_KEYS
)

print(f"Average Attention Time: {average_attn_time:.4f} ms")
print(f"Average Cross Attention Time: {average_cross_attn_time:.4f} ms")
print(f"Average MLP Time: {average_mlp_time:.4f} ms")
print(f"Average Block Time: {average_block_time:.4f} ms")

# One bar per module, in the same order as the printed summary.
labels = ['Attention', 'Cross Attention', 'MLP', 'Block']
avg_times = [average_attn_time, average_cross_attn_time, average_mlp_time, average_block_time]
bar_colors = ['blue', 'green', 'red', 'orange']

plt.bar(labels, avg_times, color=bar_colors)
plt.ylabel('Average Time (ms)')
plt.title('Average Time per Module')
plt.savefig('module_average_times.png')
PixArt-alpha-ToCa/timing_info.json ================================================ [{"timing_info": {"block_time": [10.906271934509277], "attn_time": [7.704576015472412], "cross_attn_time": [0.9379839897155762], "mlp_time": [2.0203518867492676]}, "current": {"num_steps": 20, "step": 0, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.602560043334961], "attn_time": [0.5560320019721985], "cross_attn_time": [0.5662720203399658], "mlp_time": [0.30105599761009216]}, "current": {"num_steps": 20, "step": 0, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4970879554748535], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2969599962234497]}, "current": {"num_steps": 20, "step": 0, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4981119632720947], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 0, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4755840301513672], "attn_time": [0.4925439953804016], "cross_attn_time": [0.52019202709198], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 0, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4776320457458496], "attn_time": [0.48742398619651794], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.2969599962234497]}, "current": {"num_steps": 20, "step": 0, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4428160190582275], "attn_time": [0.4925439953804016], "cross_attn_time": [0.5038080215454102], "mlp_time": [0.2764799892902374]}, "current": {"num_steps": 20, "step": 0, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4407680034637451], "attn_time": [0.4761599898338318], "cross_attn_time": 
[0.5171200037002563], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 0, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.46943998336792], "attn_time": [0.49459201097488403], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 0, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.465343952178955], "attn_time": [0.48230400681495667], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 0, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4632960557937622], "attn_time": [0.48742398619651794], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 0, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4612480401992798], "attn_time": [0.4761599898338318], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 0, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4592000246047974], "attn_time": [0.48230400681495667], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 0, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.435647964477539], "attn_time": [0.47308799624443054], "cross_attn_time": [0.506879985332489], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 0, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.46943998336792], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 0, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": 
{"block_time": [1.5421439409255981], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5631999969482422], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 0, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.474560022354126], "attn_time": [0.4915199875831604], "cross_attn_time": [0.5048320293426514], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 0, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4725120067596436], "attn_time": [0.48947200179100037], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 0, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4499839544296265], "attn_time": [0.48230400681495667], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 0, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4899200201034546], "attn_time": [0.506879985332489], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 0, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4551039934158325], "attn_time": [0.48947200179100037], "cross_attn_time": [0.5120000243186951], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 0, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.46943998336792], "attn_time": [0.48947200179100037], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 0, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5011839866638184], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 
20, "step": 0, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4776320457458496], "attn_time": [0.4853760004043579], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 0, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.457152009010315], "attn_time": [0.4843519926071167], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 0, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4960639476776123], "attn_time": [0.5099520087242126], "cross_attn_time": [0.52019202709198], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 0, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.462272047996521], "attn_time": [0.4843519926071167], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 0, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4888960123062134], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.30105599761009216]}, "current": {"num_steps": 20, "step": 0, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.733855962753296], "attn_time": [0.579584002494812], "cross_attn_time": [0.567296028137207], "mlp_time": [0.3266560137271881]}, "current": {"num_steps": 20, "step": 1, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5523840188980103], "attn_time": [0.5181440114974976], "cross_attn_time": [0.5355520248413086], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 1, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5104000568389893], "attn_time": [0.48742398619651794], "cross_attn_time": 
[0.5232639908790588], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 1, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5196160078048706], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 1, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5360000133514404], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5345280170440674], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 1, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5185920000076294], "attn_time": [0.49459201097488403], "cross_attn_time": [0.5242879986763], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 1, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.481727957725525], "attn_time": [0.48844799399375916], "cross_attn_time": [0.5038080215454102], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 1, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.528831958770752], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 1, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4796799421310425], "attn_time": [0.48127999901771545], "cross_attn_time": [0.5099520087242126], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 1, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 1, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": 
[1.5052800178527832], "attn_time": [0.48947200179100037], "cross_attn_time": [0.5242879986763], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 1, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5001599788665771], "attn_time": [0.4843519926071167], "cross_attn_time": [0.52019202709198], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 1, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5380480289459229], "attn_time": [0.5120000243186951], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 1, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5063040256500244], "attn_time": [0.4904960095882416], "cross_attn_time": [0.5130239725112915], "mlp_time": [0.2959359884262085]}, "current": {"num_steps": 20, "step": 1, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5206400156021118], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 1, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5104000568389893], "attn_time": [0.4904960095882416], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.2959359884262085]}, "current": {"num_steps": 20, "step": 1, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [2.950144052505493], "attn_time": [0.5017600059509277], "cross_attn_time": [1.1509759426116943], "mlp_time": [0.9451519846916199]}, "current": {"num_steps": 20, "step": 1, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.29900801181793213]}, "current": {"num_steps": 20, "step": 1, 
"layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4940160512924194], "attn_time": [0.4904960095882416], "cross_attn_time": [0.506879985332489], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 1, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5267839431762695], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 1, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5011839866638184], "attn_time": [0.4843519926071167], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 1, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4899200201034546], "attn_time": [0.4864000082015991], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 1, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.5099520087242126], "cross_attn_time": [0.52019202709198], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 1, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [3.0791680812835693], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5181440114974976], "mlp_time": [1.8472959995269775]}, "current": {"num_steps": 20, "step": 1, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.6936960220336914], "attn_time": [0.6215680241584778], "cross_attn_time": [0.5591040253639221], "mlp_time": [0.30003198981285095]}, "current": {"num_steps": 20, "step": 1, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5421439409255981], "attn_time": [0.5079039931297302], "cross_attn_time": 
[0.5355520248413086], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 1, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 1, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5452159643173218], "attn_time": [0.5150719881057739], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.2969599962234497]}, "current": {"num_steps": 20, "step": 1, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.6070079803466797], "attn_time": [0.5406720042228699], "cross_attn_time": [0.5591040253639221], "mlp_time": [0.3092480003833771]}, "current": {"num_steps": 20, "step": 2, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.598464012145996], "attn_time": [0.5355520248413086], "cross_attn_time": [0.5427200198173523], "mlp_time": [0.30822399258613586]}, "current": {"num_steps": 20, "step": 2, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5472639799118042], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5437440276145935], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 2, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5370240211486816], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.29900801181793213]}, "current": {"num_steps": 20, "step": 2, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.56876802444458], "attn_time": [0.5191680192947388], "cross_attn_time": [0.5427200198173523], "mlp_time": [0.2969599962234497]}, "current": {"num_steps": 20, "step": 2, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": 
[1.5554560422897339], "attn_time": [0.5140479803085327], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.3041279911994934]}, "current": {"num_steps": 20, "step": 2, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5544320344924927], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5437440276145935], "mlp_time": [0.2969599962234497]}, "current": {"num_steps": 20, "step": 2, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5380480289459229], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 2, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5738879442214966], "attn_time": [0.5263360142707825], "cross_attn_time": [0.536575973033905], "mlp_time": [0.30105599761009216]}, "current": {"num_steps": 20, "step": 2, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.558527946472168], "attn_time": [0.5191680192947388], "cross_attn_time": [0.536575973033905], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 2, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5472639799118042], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5386239886283875], "mlp_time": [0.2969599962234497]}, "current": {"num_steps": 20, "step": 2, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5185920000076294], "attn_time": [0.5007359981536865], "cross_attn_time": [0.52019202709198], "mlp_time": [0.2949120104312897]}, "current": {"num_steps": 20, "step": 2, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.563647985458374], "attn_time": [0.5191680192947388], "cross_attn_time": [0.5396479964256287], "mlp_time": [0.2949120104312897]}, "current": {"num_steps": 20, "step": 2, "layer": 12, 
"is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5441919565200806], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5355520248413086], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 2, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.6773120164871216], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.3491840064525604]}, "current": {"num_steps": 20, "step": 2, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5656960010528564], "attn_time": [0.5191680192947388], "cross_attn_time": [0.5345280170440674], "mlp_time": [0.3041279911994934]}, "current": {"num_steps": 20, "step": 2, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5493119955062866], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5345280170440674], "mlp_time": [0.30105599761009216]}, "current": {"num_steps": 20, "step": 2, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 2, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.546239972114563], "attn_time": [0.5099520087242126], "cross_attn_time": [0.5345280170440674], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 2, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.533951997756958], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5345280170440674], "mlp_time": [0.2969599962234497]}, "current": {"num_steps": 20, "step": 2, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.521664023399353], "attn_time": [0.49459201097488403], "cross_attn_time": [0.5273600220680237], 
"mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 2, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.572864055633545], "attn_time": [0.5283839702606201], "cross_attn_time": [0.5386239886283875], "mlp_time": [0.2949120104312897]}, "current": {"num_steps": 20, "step": 2, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.558527946472168], "attn_time": [0.5191680192947388], "cross_attn_time": [0.5386239886283875], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 2, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 2, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.504256010055542], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5109760165214539], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 2, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.533951997756958], "attn_time": [0.506879985332489], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.2969599962234497]}, "current": {"num_steps": 20, "step": 2, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.539072036743164], "attn_time": [0.5109760165214539], "cross_attn_time": [0.532480001449585], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 2, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.516543984413147], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 2, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": 
[1.5578240156173706], "attn_time": [0.5150719881057739], "cross_attn_time": [0.5457919836044312], "mlp_time": [0.2969599962234497]}, "current": {"num_steps": 20, "step": 3, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5022079944610596], "attn_time": [0.4915199875831604], "cross_attn_time": [0.52019202709198], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 3, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5001599788665771], "attn_time": [0.49459201097488403], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 3, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 3, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5022079944610596], "attn_time": [0.4915199875831604], "cross_attn_time": [0.5140479803085327], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 3, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.491968035697937], "attn_time": [0.4843519926071167], "cross_attn_time": [0.5130239725112915], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 3, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 3, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5185920000076294], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 3, "layer": 7, 
"is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5452159643173218], "attn_time": [0.5140479803085327], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2979840040206909]}, "current": {"num_steps": 20, "step": 3, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4970879554748535], "attn_time": [0.4853760004043579], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 3, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5001599788665771], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 3, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.521664023399353], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 3, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5011839866638184], "attn_time": [0.48742398619651794], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 3, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [2.6859519481658936], "attn_time": [1.0670080184936523], "cross_attn_time": [0.8294399976730347], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 3, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5206400156021118], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 3, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4960639476776123], "attn_time": [0.48947200179100037], "cross_attn_time": [0.5109760165214539], "mlp_time": 
[0.2826240062713623]}, "current": {"num_steps": 20, "step": 3, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5063040256500244], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 3, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5370240211486816], "attn_time": [0.5140479803085327], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 3, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5022079944610596], "attn_time": [0.4915199875831604], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 3, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.4843519926071167], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 3, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5011839866638184], "attn_time": [0.48742398619651794], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 3, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5380480289459229], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.2959359884262085]}, "current": {"num_steps": 20, "step": 3, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.506879985332489], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 3, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5001599788665771], 
"attn_time": [0.4843519926071167], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 3, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4878720045089722], "attn_time": [0.4853760004043579], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 3, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5370240211486816], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 3, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.506879985332489], "cross_attn_time": [0.52019202709198], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 3, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.511423945426941], "attn_time": [0.49459201097488403], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.2969599962234497]}, "current": {"num_steps": 20, "step": 3, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.547327995300293], "attn_time": [0.5232639908790588], "cross_attn_time": [0.5335040092468262], "mlp_time": [0.30105599761009216]}, "current": {"num_steps": 20, "step": 4, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5308799743652344], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 4, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 4, "layer": 2, "is_force_fresh": 
true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 4, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4940160512924194], "attn_time": [0.48947200179100037], "cross_attn_time": [0.5079039931297302], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 4, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 4, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 4, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4960639476776123], "attn_time": [0.4833280146121979], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 4, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4981119632720947], "attn_time": [0.48947200179100037], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 4, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.533951997756958], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 4, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5400960445404053], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5212159752845764], "mlp_time": 
[0.29900801181793213]}, "current": {"num_steps": 20, "step": 4, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.49459201097488403], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 4, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5011839866638184], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 4, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 4, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.521664023399353], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 4, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5083520412445068], "attn_time": [0.4904960095882416], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 4, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.4904960095882416], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2979840040206909]}, "current": {"num_steps": 20, "step": 4, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.499135971069336], "attn_time": [0.4853760004043579], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 4, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.521664023399353], 
"attn_time": [0.5079039931297302], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 4, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 4, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 4, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 4, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5196160078048706], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 4, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5032320022583008], "attn_time": [0.4864000082015991], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 4, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4960639476776123], "attn_time": [0.4833280146121979], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 4, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5022079944610596], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5130239725112915], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 4, "layer": 25, 
"is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.533951997756958], "attn_time": [0.506879985332489], "cross_attn_time": [0.5345280170440674], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 4, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5237120389938354], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 4, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.53711998462677], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5437440276145935], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 5, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5360000133514404], "attn_time": [0.5089280009269714], "cross_attn_time": [0.532480001449585], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 5, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5022079944610596], "attn_time": [0.4915199875831604], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 5, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 5, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5089280009269714], "cross_attn_time": [0.52019202709198], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 5, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.539072036743164], "attn_time": [0.5160959959030151], "cross_attn_time": [0.5191680192947388], "mlp_time": 
[0.29286399483680725]}, "current": {"num_steps": 20, "step": 5, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5411200523376465], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.3020800054073334]}, "current": {"num_steps": 20, "step": 5, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 5, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 5, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5063040256500244], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 5, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5022079944610596], "attn_time": [0.4904960095882416], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 5, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.509376049041748], "attn_time": [0.4925439953804016], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 5, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5206400156021118], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 5, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": 
[0.5089280009269714], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 5, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5083520412445068], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 5, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5001599788665771], "attn_time": [0.48844799399375916], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 5, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5073280334472656], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5242879986763], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 5, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5104000568389893], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 5, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5073280334472656], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 5, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5073280334472656], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 5, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5052800178527832], "attn_time": [0.4925439953804016], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 5, "layer": 20, "is_force_fresh": true, 
"module": "mlp"}}, {"timing_info": {"block_time": [1.491968035697937], "attn_time": [0.4904960095882416], "cross_attn_time": [0.5120000243186951], "mlp_time": [0.2764799892902374]}, "current": {"num_steps": 20, "step": 5, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5196160078048706], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 5, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5011839866638184], "attn_time": [0.4925439953804016], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 5, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.5058559775352478], "cross_attn_time": [0.52019202709198], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 5, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5022079944610596], "attn_time": [0.4925439953804016], "cross_attn_time": [0.5150719881057739], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 5, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.481727957725525], "attn_time": [0.48230400681495667], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 5, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 5, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5565439462661743], "attn_time": [0.5181440114974976], "cross_attn_time": [0.5416960120201111], "mlp_time": 
[0.3041279911994934]}, "current": {"num_steps": 20, "step": 6, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4960639476776123], "attn_time": [0.48742398619651794], "cross_attn_time": [0.52019202709198], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 6, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.521664023399353], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 6, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5011839866638184], "attn_time": [0.4915199875831604], "cross_attn_time": [0.5242879986763], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 6, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5267839431762695], "attn_time": [0.5099520087242126], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 6, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5196160078048706], "attn_time": [0.506879985332489], "cross_attn_time": [0.52019202709198], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 6, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5335040092468262], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 6, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 6, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": 
[0.5048320293426514], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 6, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.509376049041748], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 6, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.509376049041748], "attn_time": [0.5007359981536865], "cross_attn_time": [0.52019202709198], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 6, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5185920000076294], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 6, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5267839431762695], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 6, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4970879554748535], "attn_time": [0.48844799399375916], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2764799892902374]}, "current": {"num_steps": 20, "step": 6, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4735360145568848], "attn_time": [0.48127999901771545], "cross_attn_time": [0.5120000243186951], "mlp_time": [0.2764799892902374]}, "current": {"num_steps": 20, "step": 6, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5032320022583008], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5242879986763], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 6, "layer": 15, "is_force_fresh": true, 
"module": "mlp"}}, {"timing_info": {"block_time": [1.5124479532241821], "attn_time": [0.49459201097488403], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 6, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5257600545883179], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 6, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5063040256500244], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 6, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5083520412445068], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 6, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 6, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5073280334472656], "attn_time": [0.4925439953804016], "cross_attn_time": [0.5242879986763], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 6, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5104000568389893], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 6, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5257600545883179], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5294079780578613], "mlp_time": 
[0.2815999984741211]}, "current": {"num_steps": 20, "step": 6, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 6, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 6, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5472639799118042], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 6, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5237120389938354], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 6, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.535904049873352], "attn_time": [0.5171200037002563], "cross_attn_time": [0.5355520248413086], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 7, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 7, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5196160078048706], "attn_time": [0.5079039931297302], "cross_attn_time": [0.52019202709198], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 7, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.521664023399353], 
"attn_time": [0.506879985332489], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 7, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 7, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.5130239725112915], "cross_attn_time": [0.52019202709198], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 7, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5452159643173218], "attn_time": [0.502784013748169], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 7, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5032320022583008], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 7, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5206400156021118], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 7, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5052800178527832], "attn_time": [0.48947200179100037], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 7, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5130239725112915], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 7, "layer": 10, "is_force_fresh": true, 
"module": "mlp"}}, {"timing_info": {"block_time": [1.5032320022583008], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 7, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.509376049041748], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 7, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5196160078048706], "attn_time": [0.506879985332489], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 7, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5032320022583008], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 7, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 7, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5441919565200806], "attn_time": [0.5140479803085327], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.2959359884262085]}, "current": {"num_steps": 20, "step": 7, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5360000133514404], "attn_time": [0.5150719881057739], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 7, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.5038080215454102], "cross_attn_time": [0.532480001449585], "mlp_time": [0.2826240062713623]}, 
"current": {"num_steps": 20, "step": 7, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5083520412445068], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 7, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5104000568389893], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5120000243186951], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 7, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5089280009269714], "cross_attn_time": [0.52019202709198], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 7, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5099520087242126], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 7, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.502784013748169], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 7, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5534080266952515], "attn_time": [0.5171200037002563], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2969599962234497]}, "current": {"num_steps": 20, "step": 7, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5308799743652344], "attn_time": [0.5109760165214539], "cross_attn_time": [0.52019202709198], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 7, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.528831958770752], "attn_time": [0.506879985332489], 
"cross_attn_time": [0.5181440114974976], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 7, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5411200523376465], "attn_time": [0.5160959959030151], "cross_attn_time": [0.52019202709198], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 7, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.546463966369629], "attn_time": [0.5181440114974976], "cross_attn_time": [0.536575973033905], "mlp_time": [0.3031040132045746]}, "current": {"num_steps": 20, "step": 8, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5206400156021118], "attn_time": [0.5017600059509277], "cross_attn_time": [0.52019202709198], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 8, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [3.84716796875], "attn_time": [0.749567985534668], "cross_attn_time": [0.5457919836044312], "mlp_time": [0.30720001459121704]}, "current": {"num_steps": 20, "step": 8, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5130239725112915], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 8, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5237120389938354], "attn_time": [0.5120000243186951], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 8, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5104000568389893], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 8, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": 
{"block_time": [1.5022079944610596], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 8, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5196160078048706], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 8, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5308799743652344], "attn_time": [0.506879985332489], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 8, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.528831958770752], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 8, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5308799743652344], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 8, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5257600545883179], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 8, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5052800178527832], "attn_time": [0.502784013748169], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 8, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.521664023399353], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 
8, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.511423945426941], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 8, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.528831958770752], "attn_time": [0.506879985332489], "cross_attn_time": [0.5242879986763], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 8, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5022079944610596], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5140479803085327], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 8, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.509376049041748], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 8, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5298559665679932], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 8, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 8, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.511423945426941], "attn_time": [0.49459201097488403], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 8, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5308799743652344], "attn_time": [0.5109760165214539], "cross_attn_time": 
[0.5191680192947388], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 8, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5144959688186646], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 8, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5370240211486816], "attn_time": [0.506879985332489], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 8, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5140479803085327], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 8, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5185920000076294], "attn_time": [0.506879985332489], "cross_attn_time": [0.5150719881057739], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 8, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 8, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 8, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5425920486450195], "attn_time": [0.5160959959030151], "cross_attn_time": [0.5375999808311462], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 9, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": 
{"block_time": [1.5267839431762695], "attn_time": [0.5099520087242126], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 9, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5144959688186646], "attn_time": [0.502784013748169], "cross_attn_time": [0.52019202709198], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 9, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 9, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.528831958770752], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 9, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 9, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5196160078048706], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 9, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5257600545883179], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 9, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5104000568389893], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 9, 
"layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5144959688186646], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 9, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4981119632720947], "attn_time": [0.49459201097488403], "cross_attn_time": [0.52019202709198], "mlp_time": [0.2744320034980774]}, "current": {"num_steps": 20, "step": 9, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4981119632720947], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5140479803085327], "mlp_time": [0.2764799892902374]}, "current": {"num_steps": 20, "step": 9, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5196160078048706], "attn_time": [0.506879985332489], "cross_attn_time": [0.5150719881057739], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 9, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 9, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 9, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5083520412445068], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5150719881057739], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 9, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], "attn_time": [0.5079039931297302], "cross_attn_time": 
[0.5150719881057739], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 9, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5052800178527832], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 9, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.516543984413147], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 9, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5124479532241821], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5150719881057739], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 9, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.509376049041748], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2764799892902374]}, "current": {"num_steps": 20, "step": 9, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.533951997756958], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 9, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5083520412445068], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2764799892902374]}, "current": {"num_steps": 20, "step": 9, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5206400156021118], "attn_time": [0.506879985332489], "cross_attn_time": [0.5140479803085327], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 9, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": 
[1.511423945426941], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 9, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 9, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4981119632720947], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5109760165214539], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 9, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.499135971069336], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 9, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5166079998016357], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 10, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], "attn_time": [0.502784013748169], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 10, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5237120389938354], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 10, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 10, 
"layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5063040256500244], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 10, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5073280334472656], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 10, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 10, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5109760165214539], "cross_attn_time": [0.52019202709198], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 10, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5124479532241821], "attn_time": [0.502784013748169], "cross_attn_time": [0.5130239725112915], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 10, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5073280334472656], "attn_time": [0.502784013748169], "cross_attn_time": [0.5120000243186951], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 10, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5073280334472656], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 10, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5237120389938354], "attn_time": [0.5038080215454102], "cross_attn_time": 
[0.5242879986763], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 10, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.521664023399353], "attn_time": [0.5099520087242126], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 10, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5144959688186646], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 10, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5144959688186646], "attn_time": [0.506879985332489], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 10, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5124479532241821], "attn_time": [0.506879985332489], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 10, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5257600545883179], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 10, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.516543984413147], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 10, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5820800065994263], "attn_time": [0.536575973033905], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.2949120104312897]}, "current": {"num_steps": 20, "step": 10, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": 
{"block_time": [1.6005120277404785], "attn_time": [0.5335040092468262], "cross_attn_time": [0.5478399991989136], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 10, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.551360011100769], "attn_time": [0.5120000243186951], "cross_attn_time": [0.536575973033905], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 10, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5370240211486816], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 10, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.6680959463119507], "attn_time": [0.5765119791030884], "cross_attn_time": [0.5488640069961548], "mlp_time": [0.30003198981285095]}, "current": {"num_steps": 20, "step": 10, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5718400478363037], "attn_time": [0.5171200037002563], "cross_attn_time": [0.5437440276145935], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 10, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5523840188980103], "attn_time": [0.5150719881057739], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.2949120104312897]}, "current": {"num_steps": 20, "step": 10, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5380480289459229], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5335040092468262], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 10, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5360000133514404], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.2887679934501648]}, "current": 
{"num_steps": 20, "step": 10, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.546239972114563], "attn_time": [0.5171200037002563], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2949120104312897]}, "current": {"num_steps": 20, "step": 10, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5750720500946045], "attn_time": [0.5294079780578613], "cross_attn_time": [0.5529599785804749], "mlp_time": [0.29900801181793213]}, "current": {"num_steps": 20, "step": 11, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5237120389938354], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 11, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.502784013748169], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 11, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5544320344924927], "attn_time": [0.5222399830818176], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.2959359884262085]}, "current": {"num_steps": 20, "step": 11, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5380480289459229], "attn_time": [0.5089280009269714], "cross_attn_time": [0.532480001449585], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 11, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.539072036743164], "attn_time": [0.5171200037002563], "cross_attn_time": [0.52019202709198], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 11, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.5017600059509277], 
"cross_attn_time": [0.5375999808311462], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 11, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.539072036743164], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 11, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.528831958770752], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 11, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5335040092468262], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 11, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5646719932556152], "attn_time": [0.5345280170440674], "cross_attn_time": [0.5242879986763], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 11, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5400960445404053], "attn_time": [0.5150719881057739], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 11, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5360000133514404], "attn_time": [0.506879985332489], "cross_attn_time": [0.5335040092468262], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 11, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5242879986763], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 11, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, 
{"timing_info": {"block_time": [1.5411200523376465], "attn_time": [0.5140479803085327], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 11, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5534080266952515], "attn_time": [0.5181440114974976], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.29900801181793213]}, "current": {"num_steps": 20, "step": 11, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.521664023399353], "attn_time": [0.506879985332489], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 11, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [2.9480960369110107], "attn_time": [0.5120000243186951], "cross_attn_time": [0.8601599931716919], "mlp_time": [1.1284480094909668]}, "current": {"num_steps": 20, "step": 11, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5503360033035278], "attn_time": [0.5099520087242126], "cross_attn_time": [0.5406720042228699], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 11, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5335040092468262], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 11, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5104000568389893], "attn_time": [0.4925439953804016], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 11, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5032320022583008], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.2836480140686035]}, 
"current": {"num_steps": 20, "step": 11, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.506879985332489], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 11, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 11, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5083520412445068], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 11, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.516543984413147], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 11, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5140479803085327], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 11, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5493119955062866], "attn_time": [0.5150719881057739], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 11, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5667200088500977], "attn_time": [0.5396479964256287], "cross_attn_time": [0.5335040092468262], "mlp_time": [0.30617600679397583]}, "current": {"num_steps": 20, "step": 12, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5605759620666504], "attn_time": 
[0.5222399830818176], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.2959359884262085]}, "current": {"num_steps": 20, "step": 12, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5482879877090454], "attn_time": [0.5160959959030151], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 12, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5237120389938354], "attn_time": [0.502784013748169], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 12, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5104000568389893], "attn_time": [0.5048320293426514], "cross_attn_time": [0.52019202709198], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 12, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5421439409255981], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2949120104312897]}, "current": {"num_steps": 20, "step": 12, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.539072036743164], "attn_time": [0.5120000243186951], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 12, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5206400156021118], "attn_time": [0.5038080215454102], "cross_attn_time": [0.52019202709198], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 12, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.533951997756958], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 12, "layer": 8, "is_force_fresh": true, "module": 
"mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 12, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5349760055541992], "attn_time": [0.5099520087242126], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 12, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.511423945426941], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 12, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.533951997756958], "attn_time": [0.5120000243186951], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 12, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5257600545883179], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 12, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.8565119504928589], "attn_time": [0.6768640279769897], "cross_attn_time": [0.5652480125427246], "mlp_time": [0.317440003156662]}, "current": {"num_steps": 20, "step": 12, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5615999698638916], "attn_time": [0.5171200037002563], "cross_attn_time": [0.5447679758071899], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 12, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5534080266952515], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5283839702606201], "mlp_time": 
[0.2949120104312897]}, "current": {"num_steps": 20, "step": 12, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5523840188980103], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.2949120104312897]}, "current": {"num_steps": 20, "step": 12, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5441919565200806], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5345280170440674], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 12, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5472639799118042], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5386239886283875], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 12, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.551360011100769], "attn_time": [0.5150719881057739], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 12, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5431679487228394], "attn_time": [0.5120000243186951], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 12, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.502784013748169], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 12, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.546239972114563], "attn_time": [0.506879985332489], "cross_attn_time": [0.536575973033905], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 12, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5441919565200806], 
"attn_time": [0.5089280009269714], "cross_attn_time": [0.5345280170440674], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 12, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.558527946472168], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5335040092468262], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 12, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.551360011100769], "attn_time": [0.5140479803085327], "cross_attn_time": [0.532480001449585], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 12, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5534080266952515], "attn_time": [0.5181440114974976], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 12, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.593727946281433], "attn_time": [0.5375999808311462], "cross_attn_time": [0.5550079941749573], "mlp_time": [0.3051519989967346]}, "current": {"num_steps": 20, "step": 13, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5708160400390625], "attn_time": [0.5171200037002563], "cross_attn_time": [0.5447679758071899], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 13, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5575040578842163], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5396479964256287], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 13, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5677440166473389], "attn_time": [0.5150719881057739], "cross_attn_time": [0.5406720042228699], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 13, "layer": 3, 
"is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5575040578842163], "attn_time": [0.5160959959030151], "cross_attn_time": [0.5386239886283875], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 13, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5595519542694092], "attn_time": [0.5160959959030151], "cross_attn_time": [0.5396479964256287], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 13, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5503360033035278], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5355520248413086], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 13, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5779839754104614], "attn_time": [0.5242879986763], "cross_attn_time": [0.536575973033905], "mlp_time": [0.3031040132045746]}, "current": {"num_steps": 20, "step": 13, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5267839431762695], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 13, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5411200523376465], "attn_time": [0.5099520087242126], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 13, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5360000133514404], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 13, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5411200523376465], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5355520248413086], 
"mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 13, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5605759620666504], "attn_time": [0.5171200037002563], "cross_attn_time": [0.5375999808311462], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 13, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5052800178527832], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 13, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4940160512924194], "attn_time": [0.4915199875831604], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 13, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5380480289459229], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.2949120104312897]}, "current": {"num_steps": 20, "step": 13, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 13, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5308799743652344], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 13, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5298559665679932], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 13, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": 
[1.533951997756958], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 13, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5308799743652344], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 13, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5206400156021118], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 13, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5370240211486816], "attn_time": [0.5120000243186951], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 13, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.502784013748169], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 13, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.511423945426941], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 13, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 13, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5421439409255981], "attn_time": [0.5140479803085327], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 13, 
"layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.504256010055542], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.2764799892902374]}, "current": {"num_steps": 20, "step": 13, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5440959930419922], "attn_time": [0.5212159752845764], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.30003198981285095]}, "current": {"num_steps": 20, "step": 14, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5073280334472656], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5109760165214539], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 14, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5124479532241821], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 14, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.504256010055542], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 14, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5052800178527832], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 14, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.5089280009269714], "cross_attn_time": [0.52019202709198], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 14, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5206400156021118], "attn_time": [0.5058559775352478], "cross_attn_time": 
[0.5171200037002563], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 14, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5130239725112915], "cross_attn_time": [0.52019202709198], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 14, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5349760055541992], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 14, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.516543984413147], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 14, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.504256010055542], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 14, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], "attn_time": [0.5160959959030151], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 14, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5022079944610596], "attn_time": [0.49459201097488403], "cross_attn_time": [0.5120000243186951], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 14, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5063040256500244], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5099520087242126], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 14, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": 
{"block_time": [1.5237120389938354], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 14, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [5.377024173736572], "attn_time": [1.6383999586105347], "cross_attn_time": [1.7756160497665405], "mlp_time": [1.4632960557937622]}, "current": {"num_steps": 20, "step": 14, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5380480289459229], "attn_time": [0.5130239725112915], "cross_attn_time": [0.52019202709198], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 14, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.539072036743164], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 14, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5206400156021118], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 14, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 14, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5349760055541992], "attn_time": [0.5140479803085327], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 14, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5267839431762695], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 
20, "step": 14, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.521664023399353], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 14, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5421439409255981], "attn_time": [0.5120000243186951], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2949120104312897]}, "current": {"num_steps": 20, "step": 14, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 14, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5400960445404053], "attn_time": [0.5140479803085327], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 14, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5360000133514404], "attn_time": [0.5140479803085327], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 14, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5196160078048706], "attn_time": [0.502784013748169], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 14, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.540992021560669], "attn_time": [0.5160959959030151], "cross_attn_time": [0.5335040092468262], "mlp_time": [0.3031040132045746]}, "current": {"num_steps": 20, "step": 15, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5370240211486816], "attn_time": [0.5079039931297302], 
"cross_attn_time": [0.5335040092468262], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 15, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5595519542694092], "attn_time": [0.5171200037002563], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.3031040132045746]}, "current": {"num_steps": 20, "step": 15, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5375999808311462], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 15, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5124479532241821], "attn_time": [0.4904960095882416], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 15, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5120000243186951], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 15, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5185920000076294], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 15, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5400960445404053], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 15, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.2979840040206909]}, "current": {"num_steps": 20, "step": 15, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, 
{"timing_info": {"block_time": [1.4981119632720947], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5130239725112915], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 15, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5185920000076294], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 15, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5237120389938354], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 15, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5380480289459229], "attn_time": [0.5120000243186951], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 15, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5441919565200806], "attn_time": [0.5140479803085327], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 15, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5308799743652344], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 15, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5185920000076294], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 15, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2815999984741211]}, 
"current": {"num_steps": 20, "step": 15, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.516543984413147], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 15, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.528831958770752], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 15, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 15, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5267839431762695], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 15, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5083520412445068], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 15, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5063040256500244], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 15, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5032320022583008], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 15, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5124479532241821], "attn_time": 
[0.5048320293426514], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 15, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5181440114974976], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 15, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 15, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 15, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.540287971496582], "attn_time": [0.5181440114974976], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.2979840040206909]}, "current": {"num_steps": 20, "step": 16, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5120000243186951], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 16, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.509376049041748], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 16, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5185920000076294], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 16, "layer": 3, "is_force_fresh": 
true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5257600545883179], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 16, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 16, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5237120389938354], "attn_time": [0.5099520087242126], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 16, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5242879986763], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 16, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 16, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5144959688186646], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5140479803085327], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 16, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5001599788665771], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 16, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5191680192947388], "mlp_time": 
[0.28672000765800476]}, "current": {"num_steps": 20, "step": 16, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.502784013748169], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 16, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5308799743652344], "attn_time": [0.5099520087242126], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 16, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 16, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4970879554748535], "attn_time": [0.48947200179100037], "cross_attn_time": [0.52019202709198], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 16, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.516543984413147], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 16, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5052800178527832], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5150719881057739], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 16, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5185920000076294], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 16, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5185920000076294], 
"attn_time": [0.5007359981536865], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 16, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5360000133514404], "attn_time": [0.502784013748169], "cross_attn_time": [0.5345280170440674], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 16, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5140479803085327], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 16, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.506879985332489], "cross_attn_time": [0.5120000243186951], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 16, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.506879985332489], "cross_attn_time": [0.5140479803085327], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 16, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.516543984413147], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 16, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 16, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 16, "layer": 26, 
"is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [2.0336639881134033], "attn_time": [0.7659519910812378], "cross_attn_time": [0.6256639957427979], "mlp_time": [0.3164159953594208]}, "current": {"num_steps": 20, "step": 16, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.549888014793396], "attn_time": [0.5150719881057739], "cross_attn_time": [0.5447679758071899], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 17, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.506879985332489], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 17, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.546239972114563], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5386239886283875], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 17, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.4935680031776428], "cross_attn_time": [0.532480001449585], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 17, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5196160078048706], "attn_time": [0.506879985332489], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 17, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.502784013748169], "cross_attn_time": [0.5150719881057739], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 17, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.5120000243186951], "cross_attn_time": [0.52019202709198], "mlp_time": 
[0.28569599986076355]}, "current": {"num_steps": 20, "step": 17, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.5058559775352478], "cross_attn_time": [0.532480001449585], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 17, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 17, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5482879877090454], "attn_time": [0.5160959959030151], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 17, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.546239972114563], "attn_time": [0.5171200037002563], "cross_attn_time": [0.5242879986763], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 17, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5144959688186646], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 17, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4981119632720947], "attn_time": [0.4904960095882416], "cross_attn_time": [0.5150719881057739], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 17, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.521664023399353], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 17, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.509376049041748], 
"attn_time": [0.4864000082015991], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 17, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.5120000243186951], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 17, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 17, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5370240211486816], "attn_time": [0.506879985332489], "cross_attn_time": [0.5335040092468262], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 17, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.5007359981536865], "cross_attn_time": [0.532480001449585], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 17, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 17, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5124479532241821], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 17, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5298559665679932], "attn_time": [0.5089280009269714], "cross_attn_time": [0.52019202709198], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 17, "layer": 21, 
"is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], "attn_time": [0.502784013748169], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 17, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5380480289459229], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 17, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5267839431762695], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 17, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5196160078048706], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 17, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5308799743652344], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 17, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5360000133514404], "attn_time": [0.5160959959030151], "cross_attn_time": [0.52019202709198], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 17, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5399680137634277], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5427200198173523], "mlp_time": [0.2949120104312897]}, "current": {"num_steps": 20, "step": 18, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5257600545883179], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5253120064735413], 
"mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 18, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5400960445404053], "attn_time": [0.5150719881057739], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 18, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.528831958770752], "attn_time": [0.5120000243186951], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 18, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5298559665679932], "attn_time": [0.5120000243186951], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 18, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5144959688186646], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 18, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5124479532241821], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 18, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5237120389938354], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5345280170440674], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 18, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5242879986763], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 18, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": 
[1.5247360467910767], "attn_time": [0.502784013748169], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 18, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5206400156021118], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 18, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.491968035697937], "attn_time": [0.4915199875831604], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 18, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5267839431762695], "attn_time": [0.5017600059509277], "cross_attn_time": [0.532480001449585], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 18, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5083520412445068], "attn_time": [0.502784013748169], "cross_attn_time": [0.5140479803085327], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 18, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 18, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 18, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 
18, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.521664023399353], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 18, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.502784013748169], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 18, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 18, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.499135971069336], "attn_time": [0.49459201097488403], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 18, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [14.97599983215332], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5130239725112915], "mlp_time": [13.750271797180176]}, "current": {"num_steps": 20, "step": 18, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5267839431762695], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 18, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5124479532241821], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5109760165214539], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 18, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5104000568389893], "attn_time": [0.49561598896980286], "cross_attn_time": 
[0.5232639908790588], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 18, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5267839431762695], "attn_time": [0.5099520087242126], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 18, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.502784013748169], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 18, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5380480289459229], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 18, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.5140479803085327], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.2949120104312897]}, "current": {"num_steps": 20, "step": 19, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4960639476776123], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5150719881057739], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 19, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.533951997756958], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 19, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5032320022583008], "attn_time": [0.49459201097488403], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 19, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": 
{"block_time": [1.481727957725525], "attn_time": [0.48025599122047424], "cross_attn_time": [0.5150719881057739], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 19, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.511423945426941], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 19, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.499135971069336], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5140479803085327], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 19, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5063040256500244], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 19, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 19, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 19, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.521664023399353], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 19, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.533951997756958], "attn_time": [0.5007359981536865], "cross_attn_time": [0.532480001449585], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, 
"step": 19, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4950400590896606], "attn_time": [0.48947200179100037], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 19, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5144959688186646], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 19, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 19, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5298559665679932], "attn_time": [0.5120000243186951], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 19, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5257600545883179], "attn_time": [0.5099520087242126], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 19, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], "attn_time": [0.506879985332489], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 19, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5431679487228394], "attn_time": [0.5171200037002563], "cross_attn_time": [0.52019202709198], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 19, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5431679487228394], "attn_time": [0.5140479803085327], 
"cross_attn_time": [0.5253120064735413], "mlp_time": [0.2959359884262085]}, "current": {"num_steps": 20, "step": 19, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5052800178527832], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5058559775352478], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 19, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5144959688186646], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 19, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 19, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5257600545883179], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 19, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5349760055541992], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 19, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5052800178527832], "attn_time": [0.4915199875831604], "cross_attn_time": [0.5150719881057739], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 19, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.516543984413147], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 19, "layer": 26, "is_force_fresh": true, "module": 
def pad2square(image, max_len=800, min_len=400):
    """Downscale ``image`` to fit a side-length budget while keeping its aspect ratio.

    Despite the name, nothing is padded: the image is only resized. The shorter
    side is capped at ``min_len`` and the longer side at ``max_len``; an image
    that is already smaller than both limits keeps its size.

    Args:
        image: Object exposing ``size`` as ``(width, height)`` and a
            ``resize((width, height))`` method (a PIL image in this script).
        max_len: Upper bound for the longer side, in pixels.
        min_len: Upper bound for the shorter side, in pixels.

    Returns:
        Whatever ``image.resize`` returns for the computed ``(width, height)``.
    """
    width, height = image.size
    long_side, short_side = max(width, height), min(width, height)
    aspect_ratio = long_side / short_side

    # Floor at one pixel: for extremely elongated inputs ``max_len / aspect_ratio``
    # truncates to 0, which would request a zero-sized output.
    shortest_edge = max(1, int(min(max_len / aspect_ratio, min_len, short_side)))
    # The upper clamp is only active when the one-pixel floor above kicked in;
    # otherwise ``shortest_edge * aspect_ratio`` already respects ``max_len``.
    longest_edge = min(max_len, max(1, int(shortest_edge * aspect_ratio)))

    if height > width:
        new_width, new_height = shortest_edge, longest_edge
    else:
        new_width, new_height = longest_edge, shortest_edge
    return image.resize((new_width, new_height))
@torch.no_grad()
def caption(tokenizer, model, context_len, images, prompt, prefix):
    """Generate a caption for every image in the batch, yielding each one when it finishes.

    Args:
        tokenizer: Tokenizer used with ``model``; its ``pad_token_id`` and
            ``eos_token_id`` are read here.
        model: Causal language model that accepts ``images=`` on the first
            forward pass and returns ``logits`` and ``past_key_values``.
        context_len: Maximum sequence length; prompts are truncated on the
            left so that prompt plus generated tokens fit.
        images: Batched image tensor, one entry per prompt.
        prompt: Sequence of prompt strings, one per image.
        prefix: Sequence of identifiers; ``prefix[i]`` is yielded with caption ``i``.

    Yields:
        ``(text, prefix[i])`` the first time sample ``i`` produces the EOS
        token, i.e. in completion order rather than batch order.
    """
    images = images.to(model.device, dtype=torch.float16)
    batch_size = images.shape[0]

    # HACK: 256 is the max image token length hacked
    replace_token = DEFAULT_IMAGE_PATCH_TOKEN * 256
    if getattr(model.config, 'mm_use_im_start_end', False):
        replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
    prompt = [p.replace(DEFAULT_IMAGE_TOKEN, replace_token) for p in prompt]

    temperature = 0.2
    max_new_tokens = 1024
    stop_str = '<|im_end|>'

    # Keep only the tail of over-long prompts, then left-pad to a common length
    # so that the last position of every row is the real end of its prompt.
    max_src_len = context_len - max_new_tokens - 8
    input_ids = [ids[-max_src_len:] for ids in tokenizer(prompt).input_ids]
    longest = max(len(ids) for ids in input_ids)
    input_ids = [[tokenizer.pad_token_id] * (longest - len(ids)) + ids for ids in input_ids]

    pred_ids = torch.zeros([batch_size, 0], device=model.device, dtype=torch.long)
    past_key_values = None
    finish = [False] * batch_size

    for i in tqdm(range(max_new_tokens), leave=False):
        if i == 0:
            out = model(
                torch.as_tensor(input_ids).to(model.device),
                use_cache=True,
                images=images)
            # The image features are only needed for the first pass.
            del images
        else:
            attention_mask = torch.ones(1, past_key_values[0][0].shape[-2] + 1, device=model.device)
            out = model(input_ids=token,
                        use_cache=True,
                        attention_mask=attention_mask,
                        past_key_values=past_key_values)
        past_key_values = out.past_key_values
        last_token_logits = out.logits[:, -1]

        if temperature < 1e-4:
            # Greedy decoding must keep the (batch, 1) shape: a bare argmax
            # flattens the whole batch into one scalar, which can neither be
            # concatenated to ``pred_ids`` nor fed back as ``input_ids``.
            token = torch.argmax(last_token_logits, dim=-1, keepdim=True)
        else:
            probs = torch.softmax(last_token_logits / temperature, dim=-1)
            token = torch.multinomial(probs, num_samples=1)
        pred_ids = torch.cat([pred_ids, token], dim=1)

        eos_rows = torch.nonzero(token.cpu() == tokenizer.eos_token_id, as_tuple=True)[0]
        for ii in eos_rows.tolist():
            if finish[ii]:
                continue
            # Drop the EOS token itself before decoding.
            output = tokenizer.decode(pred_ids[ii][:-1]).removesuffix(stop_str)
            finish[ii] = True
            yield output, prefix[ii]
        if all(finish):
            break
    # NOTE(review): samples that never emit EOS within ``max_new_tokens`` are
    # not yielded at all, so the caller writes no caption for them.
def str2bool(value):
    """Parse a command-line boolean such as ``true``/``false``, ``yes``/``no`` or ``1``/``0``.

    ``argparse`` with ``type=bool`` or ``type=str`` treats every non-empty
    string, including ``"False"``, as truthy, so boolean options need an
    explicit parser.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ("true", "t", "yes", "y", "1"):
        return True
    if text in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


def main(args):
    """Convert an original PixArt-alpha checkpoint to the diffusers layout and save it.

    Args:
        args: Parsed command-line namespace providing ``orig_ckpt_path``,
            ``image_size``, ``multi_scale_train``, ``dump_path`` and
            ``only_transformer``.

    Raises:
        KeyError: If an expected key is missing from the source checkpoint.
        AssertionError: If unconverted keys remain after the conversion.
    """
    num_layers = 28
    all_state_dict = torch.load(args.orig_ckpt_path, map_location='cpu')
    state_dict = all_state_dict.pop("state_dict")
    converted_state_dict = {}

    def rename(new_prefix, old_prefix):
        """Move the ``weight`` and ``bias`` of one layer to its diffusers name."""
        for suffix in ("weight", "bias"):
            converted_state_dict[f"{new_prefix}.{suffix}"] = state_dict.pop(f"{old_prefix}.{suffix}")

    def split_fused(new_prefixes, old_prefix):
        """Split a fused projection along dim 0 into one layer per target prefix."""
        for suffix in ("weight", "bias"):
            chunks = torch.chunk(state_dict.pop(f"{old_prefix}.{suffix}"), len(new_prefixes), dim=0)
            for new_prefix, chunk in zip(new_prefixes, chunks):
                converted_state_dict[f"{new_prefix}.{suffix}"] = chunk

    # Patch embeddings.
    rename("pos_embed.proj", "x_embedder.proj")

    # Caption projection.
    rename("caption_projection.linear_1", "y_embedder.y_proj.fc1")
    rename("caption_projection.linear_2", "y_embedder.y_proj.fc2")

    # AdaLN-single LN
    rename("adaln_single.emb.timestep_embedder.linear_1", "t_embedder.mlp.0")
    rename("adaln_single.emb.timestep_embedder.linear_2", "t_embedder.mlp.2")

    if args.image_size == 1024 and args.multi_scale_train:
        # Resolution.
        rename("adaln_single.emb.resolution_embedder.linear_1", "csize_embedder.mlp.0")
        rename("adaln_single.emb.resolution_embedder.linear_2", "csize_embedder.mlp.2")
        # Aspect ratio.
        rename("adaln_single.emb.aspect_ratio_embedder.linear_1", "ar_embedder.mlp.0")
        rename("adaln_single.emb.aspect_ratio_embedder.linear_2", "ar_embedder.mlp.2")

    # Shared norm.
    rename("adaln_single.linear", "t_block.1")

    for depth in range(num_layers):
        new, old = f"transformer_blocks.{depth}", f"blocks.{depth}"

        # Transformer blocks.
        converted_state_dict[f"{new}.scale_shift_table"] = state_dict.pop(f"{old}.scale_shift_table")

        # Self attention: the source stores q, k and v as one fused projection.
        split_fused([f"{new}.attn1.to_q", f"{new}.attn1.to_k", f"{new}.attn1.to_v"], f"{old}.attn.qkv")
        # Projection.
        rename(f"{new}.attn1.to_out.0", f"{old}.attn.proj")

        # Feed-forward.
        rename(f"{new}.ff.net.0.proj", f"{old}.mlp.fc1")
        rename(f"{new}.ff.net.2", f"{old}.mlp.fc2")

        # Cross-attention: q is separate, k and v are fused.
        rename(f"{new}.attn2.to_q", f"{old}.cross_attn.q_linear")
        split_fused([f"{new}.attn2.to_k", f"{new}.attn2.to_v"], f"{old}.cross_attn.kv_linear")
        rename(f"{new}.attn2.to_out.0", f"{old}.cross_attn.proj")

    # Final block.
    rename("proj_out", "final_layer.linear")
    converted_state_dict["scale_shift_table"] = state_dict.pop("final_layer.scale_shift_table")

    # DiT XL/2
    transformer = Transformer2DModel(
        sample_size=args.image_size // 8,
        num_layers=num_layers,
        attention_head_dim=72,
        in_channels=4,
        out_channels=8,
        patch_size=2,
        attention_bias=True,
        num_attention_heads=16,
        cross_attention_dim=1152,
        activation_fn="gelu-approximate",
        num_embeds_ada_norm=1000,
        norm_type="ada_norm_single",
        norm_elementwise_affine=False,
        norm_eps=1e-6,
        caption_channels=4096,
    )
    transformer.load_state_dict(converted_state_dict, strict=True)

    # These two source entries are not copied into the converted state dict;
    # they are dropped so the emptiness check below only flags unexpected keys.
    assert transformer.pos_embed.pos_embed is not None
    state_dict.pop("pos_embed")
    state_dict.pop("y_embedder.y_embedding")
    assert len(state_dict) == 0, f"State dict is not empty, {state_dict.keys()}"

    num_model_params = sum(p.numel() for p in transformer.parameters())
    print(f"Total number of transformer parameters: {num_model_params}")

    if args.only_transformer:
        transformer.save_pretrained(os.path.join(args.dump_path, "transformer"))
    else:
        scheduler = DPMSolverMultistepScheduler()
        vae = AutoencoderKL.from_pretrained(ckpt_id, subfolder="sd-vae-ft-ema")
        tokenizer = T5Tokenizer.from_pretrained(ckpt_id, subfolder="t5-v1_1-xxl")
        text_encoder = T5EncoderModel.from_pretrained(ckpt_id, subfolder="t5-v1_1-xxl")
        pipeline = PixArtAlphaPipeline(
            tokenizer=tokenizer, text_encoder=text_encoder, transformer=transformer, vae=vae, scheduler=scheduler
        )
        pipeline.save_pretrained(args.dump_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # set multi_scale_train=True if using PixArtMS structure during training else set it to False
    parser.add_argument(
        "--multi_scale_train", default=True, type=str2bool, required=True,
        help="If use Multi-Scale PixArtMS structure during training.",
    )
    parser.add_argument(
        "--orig_ckpt_path", default=None, type=str, required=False, help="Path to the checkpoint to convert."
    )
    parser.add_argument(
        "--image_size",
        default=1024,
        type=int,
        choices=[256, 512, 1024],
        required=False,
        help="Image size of pretrained model, one of 256, 512 or 1024.",
    )
    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output pipeline.")
    parser.add_argument("--only_transformer", default=True, type=str2bool, required=True)

    args = parser.parse_args()
    main(args)
def download_other(model_name, model_zoo, output_dir):
    """Download one auxiliary checkpoint file unless it already exists locally.

    Args:
        model_name: Repository-relative file path; must be a member of ``model_zoo``.
        model_zoo: Collection of known file names that ``model_name`` is checked against.
        output_dir: Local root directory under which ``model_name`` is mirrored.
    """
    assert model_name in model_zoo
    local_path = os.path.join(output_dir, model_name)
    if os.path.isfile(local_path):
        return

    # Download into the directory that actually contains ``local_path``, so the
    # existence check above finds the file on the next run. Taking only the
    # first path component is wrong for flat names: it would create a directory
    # named after the file itself.
    target_dir = os.path.dirname(local_path)
    os.makedirs(target_dir, exist_ok=True)
    web_path = f'https://huggingface.co/PixArt-alpha/PixArt-alpha/resolve/main/{model_name}'
    print(web_path)
    download_url(web_path, target_dir)
def get_closest_ratio(height: float, width: float, ratios: dict):
    """Look up the entry of ``ratios`` whose key is nearest to ``height / width``.

    Args:
        height: Image height.
        width: Image width.
        ratios: Mapping whose keys are convertible with ``float()``.

    Returns:
        A ``(value, ratio)`` tuple: the value stored under the nearest key and
        that key converted to ``float``. On a tie the key that comes first in
        the mapping's iteration order wins.
    """
    target = height / width
    # Distance of every candidate key from the requested aspect ratio.
    gaps = {key: abs(float(key) - target) for key in ratios}
    nearest = min(gaps, key=gaps.get)
    return ratios[nearest], float(nearest)
item['ratio'] <= 4] self.meta_data_clean.extend(meta_data_clean) self.img_samples.extend([os.path.join(self.root.replace(self.json_dir_name, self.img_dir_name), item['path']) for item in meta_data_clean]) self.img_samples = self.img_samples[start_index: end_index] # scan the dataset for ratio static for i, info in enumerate(self.meta_data_clean[:len(self.meta_data_clean)//3]): ori_h, ori_w = info['height'], info['width'] closest_size, closest_ratio = get_closest_ratio(ori_h, ori_w, self.aspect_ratio) self.ratio_nums[closest_ratio] += 1 if len(self.ratio_index[closest_ratio]) == 0: self.ratio_index[closest_ratio].append(i) # Set loader and extensions if self.load_vae_feat: raise ValueError("No VAE loader here") self.loader = default_loader def __getitem__(self, idx): data_info = {} for _ in range(20): try: img_path = self.img_samples[idx] img = self.loader(img_path) if self.transform: img = self.transform(img) # Calculate closest aspect ratio and resize & crop image[w, h] if isinstance(img, Image.Image): h, w = (img.size[1], img.size[0]) assert h, w == (self.meta_data_clean[idx]['height'], self.meta_data_clean[idx]['width']) closest_size, closest_ratio = get_closest_ratio(h, w, self.aspect_ratio) closest_size = list(map(lambda x: int(x), closest_size)) transform = T.Compose([ T.Lambda(lambda img: img.convert('RGB')), T.Resize(closest_size, interpolation=InterpolationMode.BICUBIC), # Image.BICUBIC T.CenterCrop(closest_size), T.ToTensor(), T.Normalize([.5], [.5]), ]) img = transform(img) data_info['img_hw'] = torch.tensor([h, w], dtype=torch.float32) data_info['aspect_ratio'] = closest_ratio # change the path according to your data structure return img, '_'.join(self.img_samples[idx].rsplit('/', 2)[-2:]) # change from 'serial-number-of-dir/serial-number-of-image.png' ---> 'serial-number-of-dir_serial-number-of-image.png' except Exception as e: print(f"Error details: {str(e)}") idx = np.random.randint(len(self)) raise RuntimeError('Too many bad data.') def 
def extract_caption_t5_job(item):
    """Embed the caption of one dataset entry with T5 and store it as ``.npz``.

    Reads the module-level ``mutex``, ``t5`` and ``t5_save_dir``. Entries whose
    output file already exists are skipped. Failures are printed and swallowed
    so that one bad entry does not stop the remaining jobs.

    Args:
        item: Mapping with a ``'prompt'`` string and a ``'path'`` whose stem
            names the output file.
    """
    with torch.no_grad():
        caption = item['prompt'].strip()
        if isinstance(caption, str):
            caption = [caption]

        save_path = os.path.join(t5_save_dir, Path(item['path']).stem)
        if os.path.exists(f"{save_path}.npz"):
            return
        try:
            # ``with`` releases the lock even when the encoder raises. A bare
            # acquire()/release() pair would leave the lock held after an
            # exception and block every other worker thread forever.
            with mutex:
                caption_emb, emb_mask = t5.get_text_embeddings(caption)
            emb_dict = {
                'caption_feature': caption_emb.float().cpu().data.numpy(),
                'attention_mask': emb_mask.cpu().data.numpy(),
            }
            np.savez_compressed(save_path, **emb_dict)
        except Exception as e:
            print(e)
def save_results(results, paths, signature, work_dir):
    """Write each result array to ``<work_dir>/<signature>/<name>.npy`` and record its path.

    Args:
        results: Iterable of arrays to save, aligned with ``paths``.
        paths: Iterable of source file names; the part before the first ``.``
            becomes the output file name.
        signature: Sub-folder name, also used in the index file name.
        work_dir: Root output directory.

    Raises:
        FileExistsError: If an output file already exists, so earlier results
            are not overwritten silently.
    """
    timer = SimpleTimer(len(results), log_interval=100, desc="Saving Results")
    # save to npy
    new_paths = []
    os.umask(0o000)  # file permission: 666; dir permission: 777
    save_folder = os.path.join(work_dir, signature)
    os.makedirs(save_folder, exist_ok=True)
    for res, p in zip(results, paths):
        file_name = p.split('.')[0] + '.npy'
        target = os.path.join(save_folder, file_name)
        # Guard the individual file, not the folder: a folder-level check
        # fires on the folder this function created for the previous item.
        if os.path.exists(target):
            raise FileExistsError(f"{target} exists. BE careful not to overwrite your files. Comment this error raising for overwriting!!")
        new_paths.append(os.path.join(signature, file_name))
        np.save(target, res)
        timer.log()
    # save paths
    # Extend the index instead of rewriting it, so that calling this function
    # once per batch keeps the entries of earlier batches.
    index_path = os.path.join(work_dir, f"VAE-{signature}.txt")
    has_entries = os.path.exists(index_path) and os.path.getsize(index_path) > 0
    with open(index_path, 'a') as f:
        if new_paths:
            f.write(('\n' if has_entries else '') + '\n'.join(new_paths))
action='store_true', default=False, help="multi-scale feature extraction") parser.add_argument("--img_size", default=512, type=int, help="image scale for multi-scale feature extraction") parser.add_argument('--start_index', default=0, type=int) parser.add_argument('--end_index', default=1000000, type=int) parser.add_argument('--json_path', type=str) parser.add_argument('--t5_save_root', default='data/data_toy/caption_feature_wmask', type=str) parser.add_argument('--vae_save_root', default='data/data_toy/img_vae_features', type=str) parser.add_argument('--dataset_root', default='data/data_toy', type=str) parser.add_argument('--pretrained_models_dir', default='output/pretrained_models', type=str) ### for multi-scale(ms) vae feauture extraction parser.add_argument('--json_file', type=str) return parser.parse_args() if __name__ == '__main__': args = get_args() device = "cuda" if torch.cuda.is_available() else "cpu" image_resize = args.img_size # prepare extracted caption t5 features for training extract_caption_t5() # prepare extracted image vae features for training if args.multi_scale: print(f'Extracting Multi-scale Image Resolution based on {image_resize}') extract_img_vae_multiscale(bs=1) # recommend bs = 1 for AspectRatioBatchSampler else: print(f'Extracting Single Image Resolution {image_resize}') extract_img_vae() ================================================ FILE: PixArt-alpha-ToCa/train.sh ================================================ CUDA_VISIBLE_DEVICES=5,6,7 python -m torch.distributed.launch --nproc_per_node=3 \ --master_port=26662 train_scripts/train_controlnet.py \ configs/pixart_app_config/PixArt_xl2_img1024_controlHed_Half.py \ --work-dir output/debug ================================================ FILE: PixArt-alpha-ToCa/train_latents.py ================================================ import os import sys import types from pathlib import Path current_file_path = Path(__file__).resolve() sys.path.insert(0, str(current_file_path.parent.parent)) 
def ema_update(model_dest: nn.Module, model_src: nn.Module, rate):
    """Blend the parameters of ``model_src`` into ``model_dest`` in place.

    Every destination parameter becomes ``rate * dest + (1 - rate) * src``.
    Parameters are paired by name, so both modules must expose the same
    parameter names. With ``rate=0.`` the source weights are copied outright.

    Args:
        model_dest: Module holding the averaged weights; modified in place.
        model_src: Module supplying the current weights; left untouched.
        rate: Decay factor applied to the existing destination weights.

    Raises:
        KeyError: If a destination parameter name is absent from ``model_src``.
        AssertionError: If a paired source and destination parameter are the
            very same tensor object.
    """
    param_dict_src = dict(model_src.named_parameters())
    for p_name, p_dest in model_dest.named_parameters():
        p_src = param_dict_src[p_name]
        assert p_src is not p_dest
        # ``alpha`` scales the source inside the in-place add, so no full-size
        # temporary ``(1 - rate) * p_src`` is allocated for every parameter.
        p_dest.data.mul_(rate).add_(p_src.data, alpha=1 - rate)
Start to detect overflow during training.') time_start, last_tic = time.time(), time.time() log_buffer = LogBuffer() start_step = start_epoch * len(train_dataloader) global_step = 0 total_steps = len(train_dataloader) * config.num_epochs # load_vae_feat = getattr(train_dataloader.dataset, 'load_vae_feat', False) # Now you train the model for epoch in range(start_epoch + 1, config.num_epochs + 1): data_time_start= time.time() data_time_all = 0 for step, batch in enumerate(train_dataloader): data_time_all += time.time() - data_time_start # if load_vae_feat: z = batch[0] # else: # with torch.no_grad(): # with torch.cuda.amp.autocast(enabled=config.mixed_precision == 'fp16'): # posterior = vae.encode(batch[0]).latent_dist # if config.sample_posterior: # z = posterior.sample() # else: # z = posterior.mode() clean_images = z * config.scale_factor y = batch[1] y_mask = batch[2] data_info = batch[3] # Sample a random timestep for each image bs = clean_images.shape[0] timesteps = torch.randint(0, config.train_sampling_steps, (bs,), device=clean_images.device).long() grad_norm = None with accelerator.accumulate(model): # Predict the noise residual optimizer.zero_grad() loss_term = train_diffusion.training_losses(model, clean_images, timesteps, model_kwargs=dict(y=y, mask=y_mask, data_info=data_info)) loss = loss_term['loss'].mean() accelerator.backward(loss) if accelerator.sync_gradients: grad_norm = accelerator.clip_grad_norm_(model.parameters(), config.gradient_clip) optimizer.step() lr_scheduler.step() if accelerator.sync_gradients: ema_update(model_ema, model, config.ema_rate) lr = lr_scheduler.get_last_lr()[0] logs = {args.loss_report_name: accelerator.gather(loss).mean().item()} if grad_norm is not None: logs.update(grad_norm=accelerator.gather(grad_norm).mean().item()) log_buffer.update(logs) # logging on terminal if (step + 1) % config.log_interval == 0 or (step + 1) == 1: t = (time.time() - last_tic) / config.log_interval t_d = data_time_all / config.log_interval 
avg_time = (time.time() - time_start) / (global_step + 1) eta = str(datetime.timedelta(seconds=int(avg_time * (total_steps - start_step - global_step - 1)))) eta_epoch = str(datetime.timedelta(seconds=int(avg_time * (len(train_dataloader) - step - 1)))) # avg_loss = sum(loss_buffer) / len(loss_buffer) log_buffer.average() info = f"Step/Epoch [{(epoch-1)*len(train_dataloader)+step+1}/{epoch}][{step + 1}/{len(train_dataloader)}]:total_eta: {eta}, " \ f"epoch_eta:{eta_epoch}, time_all:{t:.3f}, time_data:{t_d:.3f}, lr:{lr:.3e}, s:({model.module.h}, {model.module.w}), " info += ', '.join([f"{k}:{v:.4f}" for k, v in log_buffer.output.items()]) logger.info(info) last_tic = time.time() log_buffer.clear() data_time_all = 0 logs.update(lr=lr) accelerator.log(logs, step=global_step + start_step) global_step += 1 data_time_start= time.time() synchronize() if accelerator.is_main_process: if ((epoch - 1) * len(train_dataloader) + step + 1) % config.save_model_steps == 0: os.umask(0o000) save_checkpoint(os.path.join(config.work_dir, 'checkpoints'), epoch=epoch, step=(epoch - 1) * len(train_dataloader) + step + 1, model=accelerator.unwrap_model(model), model_ema=accelerator.unwrap_model(model_ema), optimizer=optimizer, lr_scheduler=lr_scheduler ) synchronize() synchronize() if accelerator.is_main_process: if epoch % config.save_model_epochs == 0 or epoch == config.num_epochs: os.umask(0o000) save_checkpoint(os.path.join(config.output_dir, 'checkpoints'), epoch=epoch, step=(epoch - 1) * len(train_dataloader) + step + 1, model=accelerator.unwrap_model(model), model_ema=accelerator.unwrap_model(model_ema), optimizer=optimizer, lr_scheduler=lr_scheduler ) ########### EVAL ################### if epoch % config.save_image_epochs == 0 or epoch == config.num_epochs: if config.validation_prompts is not None: logger.info("Running inference for collecting generated images...") assert config.eval_sampler in ['iddpm', 'dpm-solver', 'sa-solver'] sample_steps_dict = {'iddpm': 100, 'dpm-solver': 
20, 'sa-solver': 25} sample_steps = config.eval_steps if config.eval_steps != -1 else sample_steps_dict[config.eval_sampler] # base_ratios = eval(f'ASPECT_RATIO_{config.image_size}_TEST') eval_dir = os.path.join(config.output_dir, 'eval') os.makedirs(eval_dir, exist_ok=True) save_path = os.path.join(eval_dir, f'{epoch}_{global_step}.png') model.eval() images = [] # device = t5.device for ip, prompt in enumerate(config.validation_prompts): prompts = [prompt] # prompts = [] # prompt_clean, _, hw, ar, custom_hw = prepare_prompt_ar(prompt, base_ratios, device=device, show=False) # ar for aspect ratio # if config.image_size == 1024: # latent_size_h, latent_size_w = int(hw[0, 0] // 8), int(hw[0, 1] // 8) # else: # hw = torch.tensor([[config.image_size, config.image_size]], dtype=torch.float, device=device).repeat(bs, 1) # ar = torch.tensor([[1.]], device=device).repeat(bs, 1) # latent_size_h, latent_size_w = latent_size, latent_size # prompts.append(prompt_clean.strip()) null_y = model.module.y_embedder.y_embedding[None].repeat(len(prompts), 1, 1)[:, None] with torch.no_grad(): caption_embs, emb_masks, len_prompts = val_txt_embs[ip] # caption_embs, emb_masks = t5.get_text_embeddings(prompts) # caption_embs = caption_embs.float()[:, None] print(f'finish embedding') n = len_prompts if config.eval_sampler == 'iddpm': # Create sampling noise: z = torch.randn(n, 4, latent_size_h, latent_size_w, device=device).repeat(2, 1, 1, 1) model_kwargs = dict(y=torch.cat([caption_embs, null_y]), cfg_scale=config.cfg_scale, data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks) diffusion = IDDPM(str(sample_steps)) # Sample images: samples = diffusion.p_sample_loop( model.module.forward_with_cfg, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=True, device=device ) samples, _ = samples.chunk(2, dim=0) # Remove null class samples elif config.eval_sampler == 'dpm-solver': # Create sampling noise: z = torch.randn(n, 4, latent_size_h, latent_size_w, device=device) 
model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks) dpm_solver = DPMS(model.module.forward_with_dpmsolver, condition=caption_embs, uncondition=null_y, cfg_scale=config.cfg_scale, model_kwargs=model_kwargs) samples = dpm_solver.sample( z, steps=sample_steps, order=2, skip_type="time_uniform", method="multistep", ) elif config.eval_sampler == 'sa-solver': # Create sampling noise: model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks) sa_solver = SASolverSampler(model.module.forward_with_dpmsolver, device=device) samples = sa_solver.sample( S=25, batch_size=n, shape=(4, latent_size_h, latent_size_w), eta=1, conditioning=caption_embs, unconditional_conditioning=null_y, unconditional_guidance_scale=config.cfg_scale, model_kwargs=model_kwargs, )[0] samples = vae.decode(samples / 0.18215).sample # decode image image = make_grid(samples, nrow=1, normalize=True, value_range=(-1, 1)) image = image.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy() image = Image.fromarray(image) images.append(image) image_grid = make_image_grid(images, 2, len(images)//2) image_grid.save(save_path) for tracker in accelerator.trackers: if tracker.name == "tensorboard": np_images = np.stack([np.asarray(img) for img in images]) tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC") elif tracker.name == "comet_ml": logger.info('Logging validation images') tracker.writer.log_image(image_grid, name=f"{epoch}", step=global_step) else: logger.warn(f"image logging not implemented for {tracker.name}") del images, image, samples, image_grid torch.cuda.empty_cache() model.train() synchronize() def parse_args(): parser = argparse.ArgumentParser(description="Process some integers.") parser.add_argument("config", type=str, help="config") parser.add_argument("--cloud", action='store_true', default=False, help="cloud or local machine") parser.add_argument('--work-dir', help='the dir to save logs and 
if __name__ == '__main__':
    # Entry point of the latent-feature training script. Everything assigned
    # here is a module-level global that train() reads directly.
    args = parse_args()
    config = read_config(args.config)
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        config.work_dir = args.work_dir
    if args.cloud:
        config.data_root = '/data/data'
    if args.resume_from is not None:
        # Resuming takes precedence over loading a plain checkpoint.
        config.load_from = None
        config.resume_from = dict(
            checkpoint=args.resume_from,
            load_ema=False,
            resume_optimizer=True,
            resume_lr_scheduler=True)
    if args.debug:
        config.log_interval = 1
        config.train_batch_size = 8
        config.valid_num = 100

    os.umask(0o000)
    config.output_dir = os.path.join(config.work_dir, f"""{config.model}_{config.dataset_alias}_{config.image_size}_batch{config.train_batch_size}_{config.lr_schedule}_lr{config.optimizer['lr']}_warmup{config.lr_schedule_args['num_warmup_steps']}_gas{config.gradient_accumulation_steps}""")
    os.makedirs(config.output_dir, exist_ok=True)

    init_handler = InitProcessGroupKwargs()
    init_handler.timeout = datetime.timedelta(seconds=5400)  # change timeout to avoid a strange NCCL bug
    # Initialize accelerator and tensorboard logging
    if config.use_fsdp:
        init_train = 'FSDP'
        from accelerate import FullyShardedDataParallelPlugin
        from torch.distributed.fsdp.fully_sharded_data_parallel import FullStateDictConfig
        set_fsdp_env()
        fsdp_plugin = FullyShardedDataParallelPlugin(state_dict_config=FullStateDictConfig(offload_to_cpu=False, rank0_only=False),)
    else:
        init_train = 'DDP'
        fsdp_plugin = None

    # Multi-scale training uses a custom batch sampler, so batches are not
    # evened out across ranks. The previous `even_batches=False,` (note the
    # trailing comma) produced the truthy tuple `(False,)`, so the flag never
    # took effect.
    # NOTE(review): with uneven batches, ranks may see different batch counts
    # at the end of an epoch -- confirm the sampler keeps them aligned.
    even_batches = not config.multi_scale

    if args.report_to == "comet_ml":
        import comet_ml
        comet_ml.init(
            project_name=args.tracker_project_name,
        )

    accelerator = Accelerator(
        mixed_precision=config.mixed_precision,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        log_with=args.report_to,
        project_dir=os.path.join(config.output_dir, "logs"),
        fsdp_plugin=fsdp_plugin,
        even_batches=even_batches,
        kwargs_handlers=[init_handler]
    )

    logger = get_root_logger(os.path.join(config.output_dir, 'train_log.log'))

    config.seed = init_random_seed(config.get('seed', None))
    set_random_seed(config.seed)

    if accelerator.is_main_process:
        config.dump(os.path.join(config.output_dir, 'config.py'))

    logger.info(f"Config: \n{config.pretty_text}")
    logger.info(f"World_size: {get_world_size()}, seed: {config.seed}")
    logger.info(f"Initializing: {init_train} for training")
    image_size = config.image_size  # @param [256, 512]
    latent_size = int(image_size) // 8
    pred_sigma = getattr(config, 'pred_sigma', True)
    learn_sigma = getattr(config, 'learn_sigma', True) and pred_sigma
    model_kwargs={"window_block_indexes": config.window_block_indexes, "window_size": config.window_size,
                  "use_rel_pos": config.use_rel_pos, "lewei_scale": config.lewei_scale, 'config':config,
                  'model_max_length': config.model_max_length}

    if config.validation_prompts is not None:
        # Embed the validation prompts once up front so the T5 encoder can be
        # freed before the diffusion model is built.
        logger.info('Precompute validation prompt embeddings')
        from diffusion.model.utils import prepare_prompt_ar
        from diffusion import IDDPM, DPMS, SASolverSampler
        from diffusion.model.t5 import T5Embedder
        from diffusion.data.datasets import ASPECT_RATIO_256_TEST, ASPECT_RATIO_512_TEST, ASPECT_RATIO_1024_TEST
        from diffusers.utils import make_image_grid
        from torchvision.utils import make_grid
        t5 = T5Embedder(device="cuda", local_cache=True, cache_dir='output/pretrained_models/t5_ckpts', torch_dtype=torch.float)
        device = t5.device
        # Explicit table instead of eval() on a config-derived string.
        aspect_ratio_tables = {
            256: ASPECT_RATIO_256_TEST,
            512: ASPECT_RATIO_512_TEST,
            1024: ASPECT_RATIO_1024_TEST,
        }
        base_ratios = aspect_ratio_tables[int(config.image_size)]
        pbs = 1
        val_txt_embs = []
        for prompt in config.validation_prompts:
            prompts = []
            prompt_clean, _, hw, ar, custom_hw = prepare_prompt_ar(prompt, base_ratios, device=device, show=False)  # ar for aspect ratio
            if config.image_size == 1024:
                latent_size_h, latent_size_w = int(hw[0, 0] // 8), int(hw[0, 1] // 8)
            else:
                hw = torch.tensor([[config.image_size, config.image_size]], dtype=torch.float, device=device).repeat(pbs, 1)
                ar = torch.tensor([[1.]], device=device).repeat(pbs, 1)
                latent_size_h, latent_size_w = latent_size, latent_size
            prompts.append(prompt_clean.strip())

            with torch.no_grad():
                caption_embs, emb_masks = t5.get_text_embeddings(prompts)
                caption_embs = caption_embs.float()[:, None]
                val_txt_embs.append([caption_embs, emb_masks, len(prompts)])

        del t5
        import gc  # garbage collect library
        gc.collect()
        torch.cuda.empty_cache()
        logger.info('[ DONE ]')

    # build models
    train_diffusion = IDDPM(str(config.train_sampling_steps), learn_sigma=learn_sigma, pred_sigma=pred_sigma, snr=config.snr_loss)
    model = build_model(config.model,
                        config.grad_checkpointing,
                        config.get('fp32_attention', False),
                        input_size=latent_size,
                        learn_sigma=learn_sigma,
                        pred_sigma=pred_sigma,
                        **model_kwargs).train()
    logger.info(f"{model.__class__.__name__} Model Parameters: {sum(p.numel() for p in model.parameters()):,}")
    logger.info(f"T5 max token length: {config.model_max_length}")
    model_ema = deepcopy(model).eval()

    if config.load_from is not None:
        if args.load_from is not None:
            config.load_from = args.load_from
        missing, unexpected = load_checkpoint(config.load_from, model, load_ema=config.get('load_ema', False))
        logger.warning(f'Missing keys: {missing}')
        logger.warning(f'Unexpected keys: {unexpected}')

    # rate=0. copies the (possibly just loaded) weights into the EMA model.
    ema_update(model_ema, model, 0.)
    # train() decodes validation samples with `vae`, so the VAE is also needed
    # whenever validation prompts are configured, not only when raw images
    # are fed to training.
    if not config.data.load_vae_feat or config.validation_prompts is not None:
        vae = AutoencoderKL.from_pretrained(config.vae_pretrained).cuda()

    # prepare for FSDP clip grad norm calculation
    if accelerator.distributed_type == DistributedType.FSDP:
        for m in accelerator._models:
            m.clip_grad_norm_ = types.MethodType(clip_grad_norm_, m)

    # build dataloader
    set_data_root(config.data_root)
    dataset = build_dataset(config.data, resolution=image_size, aspect_ratio_type=config.aspect_ratio_type)
    if config.multi_scale:
        batch_sampler = AspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
                                                batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio, drop_last=True,
                                                ratio_nums=dataset.ratio_nums, config=config, valid_num=config.valid_num)
        # used for balanced sampling
        # batch_sampler = BalancedAspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
        #                                                 batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio,
        #                                                 ratio_nums=dataset.ratio_nums)
        train_dataloader = build_dataloader(dataset, batch_sampler=batch_sampler, num_workers=config.num_workers)
    else:
        logger.info(f'Batch size {config.train_batch_size}')
        train_dataloader = build_dataloader(dataset, num_workers=config.num_workers, batch_size=config.train_batch_size, shuffle=True)

    # build optimizer and lr scheduler
    lr_scale_ratio = 1
    if config.get('auto_lr', None):
        lr_scale_ratio = auto_scale_lr(config.train_batch_size * get_world_size() * config.gradient_accumulation_steps,
                                       config.optimizer, **config.auto_lr)
    optimizer = build_optimizer(model, config.optimizer)
    lr_scheduler = build_lr_scheduler(config, optimizer, train_dataloader, lr_scale_ratio)

    timestamp = time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())

    if accelerator.is_main_process:
        tracker_config = dict(vars(config))
        accelerator.init_trackers(args.tracker_project_name, tracker_config)
        # Tag the run only when a comet_ml tracker was actually created; with
        # the default `--report_to tensorboard` there is none to look up.
        for tracker in accelerator.trackers:
            if tracker.name == "comet_ml":
                tracker.writer.add_tags([config.model,
                                         config.dataset_alias,
                                         config.image_size,
                                         config.lr_schedule,
                                         f'bs{config.train_batch_size}',
                                         f'gs{config.gradient_accumulation_steps}'
                                         ])

    start_epoch = 0
    if config.resume_from is not None and config.resume_from['checkpoint'] is not None:
        start_epoch, missing, unexpected = load_checkpoint(**config.resume_from,
                                                           model=model,
                                                           model_ema=model_ema,
                                                           optimizer=optimizer,
                                                           lr_scheduler=lr_scheduler,
                                                           )

        logger.warning(f'Missing keys: {missing}')
        logger.warning(f'Unexpected keys: {unexpected}')
    # Prepare everything
    # There is no specific order to remember, you just need to unpack the
    # objects in the same order you gave them to the prepare method.
    model, model_ema = accelerator.prepare(model, model_ema)
    optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler)
    train()
def train():
    """Run the training loop over the module-level model, data and optimizer.

    Relies on globals set up by the ``__main__`` block of this script
    (``config``, ``args``, ``accelerator``, ``model``, ``model_ema``,
    ``optimizer``, ``lr_scheduler``, ``train_dataloader``, ``train_diffusion``,
    ``logger``, ``start_epoch`` and, when raw images are used, ``vae``).

    Each step computes the diffusion training loss, steps the optimizer and
    scheduler, updates the EMA model when gradients are synchronised, logs
    metrics, and saves checkpoints every ``config.save_model_steps`` steps and
    every ``config.save_model_epochs`` epochs (plus the final epoch).
    """
    if config.get('debug_nan', False):
        DebugUnderflowOverflow(model)
        logger.info('NaN debugger registered. Start to detect overflow during training.')
    time_start, last_tic = time.time(), time.time()
    log_buffer = LogBuffer()

    start_step = start_epoch * len(train_dataloader)
    global_step = 0
    total_steps = len(train_dataloader) * config.num_epochs

    load_vae_feat = getattr(train_dataloader.dataset, 'load_vae_feat', False)
    # Now you train the model
    for epoch in range(start_epoch + 1, config.num_epochs + 1):
        data_time_start= time.time()
        data_time_all = 0
        for step, batch in enumerate(train_dataloader):
            data_time_all += time.time() - data_time_start
            if load_vae_feat:
                # The batch already carries pre-extracted VAE latents.
                z = batch[0]
            else:
                # Encode the images on the fly; no gradients flow into the VAE.
                with torch.no_grad():
                    with torch.cuda.amp.autocast(enabled=config.mixed_precision == 'fp16'):
                        posterior = vae.encode(batch[0]).latent_dist
                        if config.sample_posterior:
                            z = posterior.sample()
                        else:
                            z = posterior.mode()
            clean_images = z * config.scale_factor
            y = batch[1]
            y_mask = batch[2]
            data_info = batch[3]

            # Sample a random timestep for each image
            bs = clean_images.shape[0]
            timesteps = torch.randint(0, config.train_sampling_steps, (bs,), device=clean_images.device).long()
            grad_norm = None
            with accelerator.accumulate(model):
                # Predict the noise residual
                optimizer.zero_grad()
                loss_term = train_diffusion.training_losses(model, clean_images, timesteps, model_kwargs=dict(y=y, mask=y_mask, data_info=data_info))
                loss = loss_term['loss'].mean()
                accelerator.backward(loss)
                if accelerator.sync_gradients:
                    grad_norm = accelerator.clip_grad_norm_(model.parameters(), config.gradient_clip)
                optimizer.step()
                lr_scheduler.step()
                if accelerator.sync_gradients:
                    ema_update(model_ema, model, config.ema_rate)

            lr = lr_scheduler.get_last_lr()[0]
            logs = {args.loss_report_name: accelerator.gather(loss).mean().item()}
            if grad_norm is not None:
                logs.update(grad_norm=accelerator.gather(grad_norm).mean().item())
            log_buffer.update(logs)
            if (step + 1) % config.log_interval == 0 or (step + 1) == 1:
                t = (time.time() - last_tic) / config.log_interval
                t_d = data_time_all / config.log_interval
                avg_time = (time.time() - time_start) / (global_step + 1)
                eta = str(datetime.timedelta(seconds=int(avg_time * (total_steps - start_step - global_step - 1))))
                eta_epoch = str(datetime.timedelta(seconds=int(avg_time * (len(train_dataloader) - step - 1))))
                # avg_loss = sum(loss_buffer) / len(loss_buffer)
                log_buffer.average()
                # The prepared model only has a `.module` attribute when it is
                # wrapped for distributed training; unwrap_model handles both
                # the wrapped and the plain single-process case.
                raw_model = accelerator.unwrap_model(model)
                info = f"Step/Epoch [{(epoch-1)*len(train_dataloader)+step+1}/{epoch}][{step + 1}/{len(train_dataloader)}]:total_eta: {eta}, " \
                       f"epoch_eta:{eta_epoch}, time_all:{t:.3f}, time_data:{t_d:.3f}, lr:{lr:.3e}, s:({raw_model.h}, {raw_model.w}), "
                info += ', '.join([f"{k}:{v:.4f}" for k, v in log_buffer.output.items()])
                logger.info(info)
                last_tic = time.time()
                log_buffer.clear()
                data_time_all = 0
            logs.update(lr=lr)
            accelerator.log(logs, step=global_step + start_step)

            global_step += 1
            data_time_start= time.time()

            # Step-based checkpoint: all ranks wait, the main process writes.
            if ((epoch - 1) * len(train_dataloader) + step + 1) % config.save_model_steps == 0:
                accelerator.wait_for_everyone()
                if accelerator.is_main_process:
                    os.umask(0o000)
                    save_checkpoint(os.path.join(config.work_dir, 'checkpoints'),
                                    epoch=epoch,
                                    step=(epoch - 1) * len(train_dataloader) + step + 1,
                                    model=accelerator.unwrap_model(model),
                                    model_ema=accelerator.unwrap_model(model_ema),
                                    optimizer=optimizer,
                                    lr_scheduler=lr_scheduler
                                    )

        # Epoch-based checkpoint, always taken after the final epoch.
        if epoch % config.save_model_epochs == 0 or epoch == config.num_epochs:
            accelerator.wait_for_everyone()
            if accelerator.is_main_process:
                os.umask(0o000)
                save_checkpoint(os.path.join(config.work_dir, 'checkpoints'),
                                epoch=epoch,
                                step=(epoch - 1) * len(train_dataloader) + step + 1,
                                model=accelerator.unwrap_model(model),
                                model_ema=accelerator.unwrap_model(model_ema),
                                optimizer=optimizer,
                                lr_scheduler=lr_scheduler
                                )
if __name__ == '__main__':
    # Entry point of the training script. Everything assigned here is a
    # module-level global that train() reads directly.
    args = parse_args()
    config = read_config(args.config)
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        config.work_dir = args.work_dir
    if args.cloud:
        config.data_root = '/data/data'
    if args.resume_from is not None:
        # Resuming takes precedence over loading a plain checkpoint.
        config.load_from = None
        config.resume_from = dict(
            checkpoint=args.resume_from,
            load_ema=False,
            resume_optimizer=True,
            resume_lr_scheduler=True)
    if args.debug:
        config.log_interval = 1
        config.train_batch_size = 8
        config.valid_num = 100

    os.umask(0o000)
    os.makedirs(config.work_dir, exist_ok=True)

    init_handler = InitProcessGroupKwargs()
    init_handler.timeout = datetime.timedelta(seconds=5400)  # change timeout to avoid a strange NCCL bug
    # Initialize accelerator and tensorboard logging
    if config.use_fsdp:
        init_train = 'FSDP'
        from accelerate import FullyShardedDataParallelPlugin
        from torch.distributed.fsdp.fully_sharded_data_parallel import FullStateDictConfig
        set_fsdp_env()
        fsdp_plugin = FullyShardedDataParallelPlugin(state_dict_config=FullStateDictConfig(offload_to_cpu=False, rank0_only=False),)
    else:
        init_train = 'DDP'
        fsdp_plugin = None

    # Multi-scale training uses a custom batch sampler, so batches are not
    # evened out across ranks. The previous `even_batches=False,` (note the
    # trailing comma) produced the truthy tuple `(False,)`, so the flag never
    # took effect.
    # NOTE(review): with uneven batches, ranks may see different batch counts
    # at the end of an epoch -- confirm the sampler keeps them aligned.
    even_batches = not config.multi_scale

    accelerator = Accelerator(
        mixed_precision=config.mixed_precision,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        log_with=args.report_to,
        project_dir=os.path.join(config.work_dir, "logs"),
        fsdp_plugin=fsdp_plugin,
        even_batches=even_batches,
        kwargs_handlers=[init_handler]
    )

    logger = get_root_logger(os.path.join(config.work_dir, 'train_log.log'))

    config.seed = init_random_seed(config.get('seed', None))
    set_random_seed(config.seed)

    if accelerator.is_main_process:
        config.dump(os.path.join(config.work_dir, 'config.py'))

    logger.info(f"Config: \n{config.pretty_text}")
    logger.info(f"World_size: {get_world_size()}, seed: {config.seed}")
    logger.info(f"Initializing: {init_train} for training")
    image_size = config.image_size  # @param [256, 512, 1024]
    latent_size = int(image_size) // 8
    pred_sigma = getattr(config, 'pred_sigma', True)
    learn_sigma = getattr(config, 'learn_sigma', True) and pred_sigma
    model_kwargs={"window_block_indexes": config.window_block_indexes, "window_size": config.window_size,
                  "use_rel_pos": config.use_rel_pos, "lewei_scale": config.lewei_scale, 'config':config,
                  'model_max_length': config.model_max_length}

    # build models
    train_diffusion = IDDPM(str(config.train_sampling_steps), learn_sigma=learn_sigma, pred_sigma=pred_sigma, snr=config.snr_loss)
    model = build_model(config.model,
                        config.grad_checkpointing,
                        config.get('fp32_attention', False),
                        input_size=latent_size,
                        learn_sigma=learn_sigma,
                        pred_sigma=pred_sigma,
                        **model_kwargs).train()
    logger.info(f"{model.__class__.__name__} Model Parameters: {sum(p.numel() for p in model.parameters()):,}")
    model_ema = deepcopy(model).eval()

    if config.load_from is not None:
        if args.load_from is not None:
            config.load_from = args.load_from
        missing, unexpected = load_checkpoint(config.load_from, model, load_ema=config.get('load_ema', False))
        logger.warning(f'Missing keys: {missing}')
        logger.warning(f'Unexpected keys: {unexpected}')

    # rate=0. copies the (possibly just loaded) weights into the EMA model.
    ema_update(model_ema, model, 0.)
    if not config.data.load_vae_feat:
        vae = AutoencoderKL.from_pretrained(config.vae_pretrained).cuda()

    # prepare for FSDP clip grad norm calculation
    if accelerator.distributed_type == DistributedType.FSDP:
        for m in accelerator._models:
            m.clip_grad_norm_ = types.MethodType(clip_grad_norm_, m)

    # build dataloader
    set_data_root(config.data_root)
    dataset = build_dataset(config.data, resolution=image_size, aspect_ratio_type=config.aspect_ratio_type)
    if config.multi_scale:
        batch_sampler = AspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
                                                batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio, drop_last=True,
                                                ratio_nums=dataset.ratio_nums, config=config, valid_num=config.valid_num)
        # used for balanced sampling
        # batch_sampler = BalancedAspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
        #                                                 batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio,
        #                                                 ratio_nums=dataset.ratio_nums)
        train_dataloader = build_dataloader(dataset, batch_sampler=batch_sampler, num_workers=config.num_workers)
    else:
        train_dataloader = build_dataloader(dataset, num_workers=config.num_workers, batch_size=config.train_batch_size, shuffle=True)

    # build optimizer and lr scheduler
    lr_scale_ratio = 1
    if config.get('auto_lr', None):
        lr_scale_ratio = auto_scale_lr(config.train_batch_size * get_world_size() * config.gradient_accumulation_steps,
                                       config.optimizer, **config.auto_lr)
    optimizer = build_optimizer(model, config.optimizer)
    lr_scheduler = build_lr_scheduler(config, optimizer, train_dataloader, lr_scale_ratio)

    timestamp = time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())

    if accelerator.is_main_process:
        tracker_config = dict(vars(config))
        try:
            accelerator.init_trackers(args.tracker_project_name, tracker_config)
        except Exception as err:
            # Fall back to a tracker without the config payload, but keep
            # Ctrl-C / SystemExit working and record why the first try failed.
            logger.warning(f'init_trackers with config failed ({err!r}); falling back to tb_{timestamp}')
            accelerator.init_trackers(f"tb_{timestamp}")

    start_epoch = 0
    if config.resume_from is not None and config.resume_from['checkpoint'] is not None:
        start_epoch, missing, unexpected = load_checkpoint(**config.resume_from,
                                                           model=model,
                                                           model_ema=model_ema,
                                                           optimizer=optimizer,
                                                           lr_scheduler=lr_scheduler,
                                                           )

        logger.warning(f'Missing keys: {missing}')
        logger.warning(f'Unexpected keys: {unexpected}')
    # Prepare everything
    # There is no specific order to remember, you just need to unpack the
    # objects in the same order you gave them to the prepare method.
    model, model_ema = accelerator.prepare(model, model_ema)
    optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler)
    train()
def train():
    """Run the ControlNet training loop on the module-level training state.

    Reads the module globals ``config``, ``model``, ``optimizer``,
    ``lr_scheduler``, ``train_dataloader``, ``train_diffusion``,
    ``accelerator``, ``logger`` and ``start_epoch``. Iterates epochs
    ``start_epoch + 1 .. config.num_epochs`` and writes checkpoints to
    ``<config.work_dir>/checkpoints`` every ``config.save_model_steps`` steps
    and every ``config.save_model_epochs`` epochs (plus the final epoch).

    Raises:
        ValueError: if ``train_dataloader.dataset`` has no truthy
            ``load_vae_feat`` attribute (only pre-extracted VAE features are
            supported here).
    """
    if config.get('debug_nan', False):
        DebugUnderflowOverflow(model)
        logger.info('NaN debugger registered. Start to detect overflow during training.')
    time_start, last_tic = time.time(), time.time()
    log_buffer = LogBuffer()

    # Steps already covered by the resumed epochs; only used to offset the
    # tracker step and the ETA estimate. ``global_step`` counts this run only.
    start_step = start_epoch * len(train_dataloader)
    global_step = 0
    total_steps = len(train_dataloader) * config.num_epochs

    load_vae_feat = getattr(train_dataloader.dataset, 'load_vae_feat', False)
    if not load_vae_feat:
        raise ValueError("Only support load vae features for now.")
    # Now you train the model
    for epoch in range(start_epoch + 1, config.num_epochs + 1):
        data_time_start = time.time()
        data_time_all = 0
        for step, batch in enumerate(train_dataloader):
            # Time spent waiting for the dataloader since the previous step ended.
            data_time_all += time.time() - data_time_start
            z = batch[0]  # 4 x 4 x 128 x 128 z:vae output, 3x1024x1024->vae->4x128x128
            clean_images = z * config.scale_factor  # vae needed scale factor
            y = batch[1]  # 4 x 1 x 120 x 4096 # T5 extracted feature of caption, 120 token, 4096
            y_mask = batch[2]  # 4 x 1 x 1 x 120 # caption indicate whether valid
            data_info = batch[3]

            # Sample a random timestep for each image
            bs = clean_images.shape[0]
            timesteps = torch.randint(0, config.train_sampling_steps, (bs,), device=clean_images.device).long()
            grad_norm = None
            with accelerator.accumulate(model):
                # Predict the noise residual
                optimizer.zero_grad()
                # The control condition is scaled by the same factor as the latents.
                loss_term = train_diffusion.training_losses(model, clean_images, timesteps, model_kwargs=dict(y=y, mask=y_mask, data_info=data_info, c=data_info['condition'] * config.scale_factor))
                loss = loss_term['loss'].mean()
                accelerator.backward(loss)
                # Clip only on steps where gradients are actually synchronized.
                if accelerator.sync_gradients:
                    grad_norm = accelerator.clip_grad_norm_(model.parameters(), config.gradient_clip)
                optimizer.step()
                lr_scheduler.step()

            lr = lr_scheduler.get_last_lr()[0]
            logs = {"loss": accelerator.gather(loss).mean().item()}
            if grad_norm is not None:
                logs.update(grad_norm=accelerator.gather(grad_norm).mean().item())
            log_buffer.update(logs)
            # Log on the first step of every epoch and then every ``log_interval`` steps.
            if (step + 1) % config.log_interval == 0 or (step + 1) == 1:
                t = (time.time() - last_tic) / config.log_interval
                t_d = data_time_all / config.log_interval
                avg_time = (time.time() - time_start) / (global_step + 1)
                eta = str(datetime.timedelta(seconds=int(avg_time * (total_steps - start_step - global_step - 1))))
                eta_epoch = str(datetime.timedelta(seconds=int(avg_time * (len(train_dataloader) - step - 1))))
                # avg_loss = sum(loss_buffer) / len(loss_buffer)
                log_buffer.average()
                info = f"Step/Epoch [{(epoch - 1) * len(train_dataloader) + step + 1}/{epoch}][{step + 1}/{len(train_dataloader)}]:total_eta: {eta}, " \
                       f"epoch_eta:{eta_epoch}, time_all:{t:.3f}, time_data:{t_d:.3f}, lr:{lr:.3e}, s:({data_info['img_hw'][0][0].item()}, {data_info['img_hw'][0][1].item()}), "
                info += ', '.join([f"{k}:{v:.4f}" for k, v in log_buffer.output.items()])
                logger.info(info)
                last_tic = time.time()
                log_buffer.clear()
                data_time_all = 0
            logs.update(lr=lr)
            accelerator.log(logs, step=global_step + start_step)

            if (global_step + 1) % 1000 == 0 and config.s3_work_dir is not None:
                logger.info(f"s3_work_dir: {config.s3_work_dir}")

            global_step += 1
            data_time_start = time.time()

            # Barriers on both sides of the main-process-only checkpoint write.
            synchronize()
            if accelerator.is_main_process:
                if ((epoch - 1) * len(train_dataloader) + step + 1) % config.save_model_steps == 0:
                    os.umask(0o000)  # file permission: 666; dir permission: 777
                    save_checkpoint(os.path.join(config.work_dir, 'checkpoints'),
                                    epoch=epoch,
                                    step=(epoch - 1) * len(train_dataloader) + step + 1,
                                    model=accelerator.unwrap_model(model),
                                    optimizer=optimizer,
                                    lr_scheduler=lr_scheduler
                                    )
            synchronize()

        synchronize()
        # After each epoch you optionally sample some demo images with evaluate() and save the model
        # NOTE(review): ``step`` below is the last index of the inner loop; it is
        # undefined if the dataloader yields no batches — confirm that cannot happen.
        if accelerator.is_main_process:
            if epoch % config.save_model_epochs == 0 or epoch == config.num_epochs:
                os.umask(0o000)  # file permission: 666; dir permission: 777
                save_checkpoint(os.path.join(config.work_dir, 'checkpoints'),
                                epoch=epoch,
                                step=(epoch - 1) * len(train_dataloader) + step + 1,
                                model=accelerator.unwrap_model(model),
                                optimizer=optimizer,
                                lr_scheduler=lr_scheduler
                                )
        synchronize()
def parse_args():
    """Parse the command-line arguments of the ControlNet training script.

    Returns:
        argparse.Namespace with ``config`` (positional), ``cloud``,
        ``work_dir``, ``resume_from``, ``local_rank``, ``debug``,
        ``report_to``, ``tracker_project_name``, ``lr``, ``data_root``,
        ``resume_optimizer`` and ``resume_lr_scheduler``.
    """
    parser = argparse.ArgumentParser(description="Process some integers.")
    parser.add_argument("config", type=str, help="config")
    parser.add_argument("--cloud", action='store_true', default=False, help="cloud or local machine")
    parser.add_argument('--work-dir', help='the dir to save logs and models')
    # The help text used to be a copy of --work-dir's; this option is the
    # checkpoint path handed to load_checkpoint().
    parser.add_argument('--resume_from', help='the checkpoint to resume the training from')
    # Both spellings are accepted and write to the same ``local_rank`` attribute.
    parser.add_argument('--local-rank', type=int, default=-1)
    parser.add_argument('--local_rank', type=int, default=-1)
    parser.add_argument('--debug', action='store_true')
    parser.add_argument(
        "--report_to",
        type=str,
        default="tensorboard",
        help=(
            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
        ),
    )
    parser.add_argument(
        "--tracker_project_name",
        type=str,
        default="text2image-fine-tune",
        help=(
            "The `project_name` argument passed to Accelerator.init_trackers for"
            " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
        ),
    )
    parser.add_argument('--lr', type=float, default=2e-4)
    parser.add_argument('--data_root', type=str, default=None)
    parser.add_argument('--resume_optimizer', action='store_true')
    parser.add_argument('--resume_lr_scheduler', action='store_true')
    return parser.parse_args()
if __name__ == '__main__':
    # Entry point of the ControlNet training script: merge CLI overrides into
    # the config, create the Accelerator, build the ControlNet-wrapped model,
    # the dataloader, optimizer and scheduler, optionally resume, then train.
    args = parse_args()
    config = read_config(args.config)
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        config.work_dir = args.work_dir
    if args.cloud:
        config.data_root = '/data/data'
    if args.data_root:
        config.data_root = args.data_root
    if args.resume_from is not None:
        # Resuming replaces loading the base PixArt weights.
        config.load_from = None
        config.resume_from = dict(
            checkpoint=args.resume_from,
            load_ema=False,
            resume_optimizer=args.resume_optimizer,
            resume_lr_scheduler=args.resume_lr_scheduler)
    if args.debug:
        config.log_interval = 1
        config.train_batch_size = 6
        # NOTE(review): the --lr override is applied only in debug mode here;
        # confirm this nesting matches the intended behaviour.
        config.optimizer.update({'lr': args.lr})

    os.umask(0o000)  # file permission: 666; dir permission: 777
    os.makedirs(config.work_dir, exist_ok=True)

    init_handler = InitProcessGroupKwargs()
    init_handler.timeout = datetime.timedelta(seconds=9600)  # change timeout to avoid a strange NCCL bug
    # Initialize accelerator and tensorboard logging
    if config.use_fsdp:
        init_train = 'FSDP'
        from accelerate import FullyShardedDataParallelPlugin
        from torch.distributed.fsdp.fully_sharded_data_parallel import FullStateDictConfig
        set_fsdp_env()
        fsdp_plugin = FullyShardedDataParallelPlugin(state_dict_config=FullStateDictConfig(offload_to_cpu=False, rank0_only=False),)
    else:
        init_train = 'DDP'
        fsdp_plugin = None

    # Multi-scale batches come from a custom batch sampler, so batches are not
    # evened out across processes. The previous ``even_batches=False,`` (with a
    # trailing comma) produced the tuple ``(False,)``, which is truthy.
    # NOTE(review): with a real ``False`` ranks may see different batch counts
    # at the end of an epoch — confirm the per-step barriers tolerate that.
    even_batches = not config.multi_scale

    accelerator = Accelerator(
        mixed_precision=config.mixed_precision,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        log_with=args.report_to,
        project_dir=os.path.join(config.work_dir, "logs"),
        fsdp_plugin=fsdp_plugin,
        even_batches=even_batches,
        kwargs_handlers=[init_handler]
    )

    logger = get_root_logger(os.path.join(config.work_dir, 'train_log.log'))

    config.seed = init_random_seed(config.get('seed', None))
    set_random_seed(config.seed)

    if accelerator.is_main_process:
        config.dump(os.path.join(config.work_dir, 'config.py'))

    logger.info(f"Config: \n{config.pretty_text}")
    logger.info(f"World_size: {get_world_size()}, seed: {config.seed}")
    logger.info(f"Initializing: {init_train} for training")
    image_size = config.image_size  # @param [512, 1024]
    latent_size = int(image_size) // 8
    pred_sigma = getattr(config, 'pred_sigma', True)
    learn_sigma = getattr(config, 'learn_sigma', True) and pred_sigma
    model_kwargs = {"window_block_indexes": config.window_block_indexes, "window_size": config.window_size,
                    "use_rel_pos": config.use_rel_pos, "lewei_scale": config.lewei_scale, 'config': config,
                    'model_max_length': config.model_max_length}

    # build models
    train_diffusion = IDDPM(str(config.train_sampling_steps))
    model: PixArtMS = build_model(config.model,
                                  config.grad_checkpointing,
                                  config.get('fp32_attention', False),
                                  input_size=latent_size,
                                  learn_sigma=learn_sigma,
                                  pred_sigma=pred_sigma,
                                  **model_kwargs)

    if config.load_from is not None and args.resume_from is None:
        # load from PixArt model
        missing, unexpected = load_checkpoint(config.load_from, model)
        logger.warning(f'Missing keys: {missing}')
        logger.warning(f'Unexpected keys: {unexpected}')

    # Wrap the base model with the ControlNet variant that matches the resolution.
    if image_size == 1024:
        model: ControlPixArtMSHalf = ControlPixArtMSHalf(model, copy_blocks_num=config.copy_blocks_num).train()
    else:
        model: ControlPixArtHalf = ControlPixArtHalf(model, copy_blocks_num=config.copy_blocks_num).train()

    logger.info(f"{model.__class__.__name__} Model Parameters: {sum(p.numel() for p in model.parameters()):,}")
    logger.info(f"T5 max token length: {config.model_max_length}")

    # prepare for FSDP clip grad norm calculation
    if accelerator.distributed_type == DistributedType.FSDP:
        for m in accelerator._models:
            m.clip_grad_norm_ = types.MethodType(clip_grad_norm_, m)

    # build dataloader
    set_data_root(config.data_root)
    dataset = build_dataset(config.data, resolution=image_size, aspect_ratio_type=config.aspect_ratio_type, train_ratio=config.train_ratio)
    if config.multi_scale:
        batch_sampler = AspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
                                                batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio, drop_last=True,
                                                ratio_nums=dataset.ratio_nums, config=config, valid_num=1)
        train_dataloader = build_dataloader(dataset, batch_sampler=batch_sampler, num_workers=config.num_workers)
    else:
        train_dataloader = build_dataloader(dataset, num_workers=config.num_workers, batch_size=config.train_batch_size, shuffle=True)

    # build optimizer and lr scheduler
    lr_scale_ratio = 1
    if config.get('auto_lr', None):
        lr_scale_ratio = auto_scale_lr(config.train_batch_size * get_world_size() * config.gradient_accumulation_steps,
                                       config.optimizer,
                                       **config.auto_lr)
    # Only the ControlNet branch is handed to the optimizer.
    optimizer = build_optimizer(model.controlnet, config.optimizer)
    lr_scheduler = build_lr_scheduler(config, optimizer, train_dataloader, lr_scale_ratio)

    timestamp = time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())

    if accelerator.is_main_process:
        tracker_config = dict(vars(config))
        try:
            accelerator.init_trackers(args.tracker_project_name, tracker_config)
        except Exception:
            # Best-effort fallback to a timestamped tracker name without the
            # config payload. Narrowed from a bare ``except:`` so Ctrl-C and
            # SystemExit are no longer swallowed.
            accelerator.init_trackers(f"tb_{timestamp}")

    start_epoch = 0
    if config.resume_from is not None and config.resume_from['checkpoint'] is not None:
        if not (args.resume_optimizer and args.resume_lr_scheduler):
            # Weights-only resume: the epoch counter restarts at 0.
            missing, unexpected = load_checkpoint(args.resume_from, model)
        else:
            start_epoch, missing, unexpected = load_checkpoint(**config.resume_from,
                                                               model=model,
                                                               optimizer=optimizer,
                                                               lr_scheduler=lr_scheduler,
                                                               )

        logger.warning(f'Missing keys: {missing}')
        logger.warning(f'Unexpected keys: {unexpected}')
    # Prepare everything
    # There is no specific order to remember, you just need to unpack the
    # objects in the same order you gave them to the prepare method.
    model = accelerator.prepare(model,)
    optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler)
    train()
def set_fsdp_env():
    """Export the FSDP-related environment variables used by this script."""
    os.environ["ACCELERATE_USE_FSDP"] = 'true'
    os.environ["FSDP_AUTO_WRAP_POLICY"] = 'TRANSFORMER_BASED_WRAP'
    os.environ["FSDP_BACKWARD_PREFETCH"] = 'BACKWARD_PRE'
    os.environ["FSDP_TRANSFORMER_CLS_TO_WRAP"] = 'Transformer2DModel'


def ema_update(model_dest: nn.Module, model_src: nn.Module, rate):
    """Update ``model_dest`` in place as an exponential moving average.

    For every named parameter: ``dest = rate * dest + (1 - rate) * src``.

    Raises:
        KeyError: if ``model_dest`` has a parameter name missing in ``model_src``.
        AssertionError: if both models share the same parameter object.
    """
    param_dict_src = dict(model_src.named_parameters())
    for p_name, p_dest in model_dest.named_parameters():
        p_src = param_dict_src[p_name]
        assert p_src is not p_dest
        p_dest.data.mul_(rate).add_((1 - rate) * p_src.data)


def token_drop(y, y_mask, force_drop_ids=None):
    """
    Drops labels to enable classifier-free guidance.

    Rows selected for dropping are replaced by the module-level
    ``uncond_prompt_embeds`` / ``uncond_prompt_attention_mask``.

    Args:
        y: caption embeddings, indexed as (batch, token, channel) by the
            broadcasting below.
        y_mask: caption attention mask, indexed as (batch, token).
        force_drop_ids: optional per-sample tensor; entries equal to 1 are
            dropped. When ``None`` each sample is dropped with probability
            ``config.class_dropout_prob``.

    Returns:
        Tuple ``(y, y_mask)`` with the selected rows replaced.
    """
    if force_drop_ids is None:
        # Draw on CPU (same RNG stream as before) and move to wherever ``y``
        # lives, instead of hard-coding ``.cuda()``.
        drop_ids = torch.rand(y.shape[0]).to(y.device) < config.class_dropout_prob
    else:
        drop_ids = force_drop_ids == 1
    y = torch.where(drop_ids[:, None, None], uncond_prompt_embeds, y)
    y_mask = torch.where(drop_ids[:, None], uncond_prompt_attention_mask, y_mask)
    return y, y_mask


def get_null_embed(npz_file, max_length=120):
    """Return the empty-prompt T5 embedding and mask, cached in ``npz_file``.

    If ``npz_file`` exists and ends with ``.npz`` or ``.pth`` it is loaded with
    ``torch.load``. Otherwise the empty prompt is encoded with the tokenizer
    and text encoder from ``args.pipeline_load_from`` and the result is saved
    to ``npz_file``.

    Args:
        npz_file: cache path (read with ``torch.load`` despite the name).
        max_length: padded token length of the empty prompt.

    Returns:
        Tuple ``(uncond_prompt_embeds, uncond_prompt_attention_mask)`` on
        ``accelerator.device``.
    """
    if os.path.exists(npz_file) and npz_file.endswith(('.npz', '.pth')):
        # Load to CPU first; the tensors are moved to the target device below.
        data = torch.load(npz_file, map_location='cpu')
        uncond_prompt_embeds = data['uncond_prompt_embeds'].to(accelerator.device)
        uncond_prompt_attention_mask = data['uncond_prompt_attention_mask'].to(accelerator.device)
    else:
        tokenizer = T5Tokenizer.from_pretrained(args.pipeline_load_from, subfolder="tokenizer")
        text_encoder = T5EncoderModel.from_pretrained(args.pipeline_load_from, subfolder="text_encoder")
        uncond = tokenizer("", max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
        # The embedding is a constant. Without no_grad it would keep the whole
        # encoder graph alive and tie every training step's backward pass to it.
        with torch.no_grad():
            uncond_prompt_embeds = text_encoder(uncond.input_ids, attention_mask=uncond.attention_mask)[0]

        # torch.save does not create missing parent directories.
        os.makedirs(os.path.dirname(npz_file) or '.', exist_ok=True)
        torch.save({
            'uncond_prompt_embeds': uncond_prompt_embeds.cpu(),
            'uncond_prompt_attention_mask': uncond.attention_mask.cpu()
        }, npz_file)

        uncond_prompt_embeds = uncond_prompt_embeds.to(accelerator.device)
        uncond_prompt_attention_mask = uncond.attention_mask.to(accelerator.device)

    return uncond_prompt_embeds, uncond_prompt_attention_mask
def prepare_vis():
    """Pre-compute and cache T5 embeddings of the validation prompts.

    Runs on the main process only. One file per prompt is written to
    ``output/tmp/{prompt}_{max_length}token.pth``; nothing is encoded when all
    files already exist. Reads the module globals ``accelerator``, ``logger``,
    ``args`` and ``max_length``.
    """
    if not accelerator.is_main_process:
        return

    # preparing embeddings for visualization. We put it here for saving GPU memory
    validation_prompts = [
        "dog",
        "portrait photo of a girl, photograph, highly detailed face, depth of field",
        "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k",
        "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
        "A photo of beautiful mountain with realistic sunset and blue lake, highly detailed, masterpiece",
    ]
    logger.info("Preparing Visualization prompt embeddings...")
    logger.info(f"Loading text encoder and tokenizer from {args.pipeline_load_from} ...")
    all_cached = all(
        os.path.exists(f'output/tmp/{prompt}_{max_length}token.pth') for prompt in validation_prompts
    )
    if all_cached:
        return

    print("Saving visualizate prompt text embedding at output/tmp/")
    # torch.save does not create the directory; a fresh checkout has none.
    os.makedirs('output/tmp', exist_ok=True)
    tokenizer = T5Tokenizer.from_pretrained(args.pipeline_load_from, subfolder="tokenizer")
    text_encoder = T5EncoderModel.from_pretrained(
        args.pipeline_load_from, subfolder="text_encoder").to(accelerator.device)
    for prompt in validation_prompts:
        caption_token = tokenizer(
            prompt, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
        ).to(accelerator.device)
        # Embeddings are only cached to disk, so no autograd graph is needed.
        with torch.no_grad():
            caption_emb = text_encoder(caption_token.input_ids, attention_mask=caption_token.attention_mask)[0]
        torch.save(
            {'caption_embeds': caption_emb, 'emb_mask': caption_token.attention_mask},
            f'output/tmp/{prompt}_{max_length}token.pth')
    # Drop the last references before flush() so the encoder can be reclaimed.
    # NOTE(review): assumes flush() triggers gc / CUDA cache release — confirm.
    del caption_token, caption_emb, text_encoder, tokenizer
    flush()


@torch.inference_mode()
def log_validation(model, accelerator, weight_dtype, step):
    """Generate images for the fixed validation prompts and log them.

    Builds a ``PixArtAlphaPipeline`` around the unwrapped ``model`` (without a
    tokenizer or text encoder), samples latents from the prompt embeddings
    cached under ``output/tmp/``, decodes them with the pipeline's VAE and
    sends the images to every tensorboard / wandb tracker.

    Args:
        model: the (possibly wrapped) transformer being trained.
        accelerator: the ``Accelerator`` that owns the device and trackers.
        weight_dtype: dtype for the pipeline and for latents before decoding.
        step: global step recorded with the tensorboard images.

    Returns:
        List of ``{"validation_prompt": str, "images": [PIL images]}`` dicts.
    """
    logger.info("Running validation... ")

    model = accelerator.unwrap_model(model)
    pipeline = PixArtAlphaPipeline.from_pretrained(
        args.pipeline_load_from,
        transformer=model,
        tokenizer=None,
        text_encoder=None,
        torch_dtype=weight_dtype,
    )
    pipeline = pipeline.to(accelerator.device)
    pipeline.set_progress_bar_config(disable=True)
    # One generator seeded once and shared by all prompts.
    generator = torch.Generator(device=accelerator.device).manual_seed(0)
    validation_prompts = [
        "dog",
        "portrait photo of a girl, photograph, highly detailed face, depth of field",
        "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k",
        "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
        "A photo of beautiful mountain with realistic sunset and blue lake, highly detailed, masterpiece",
    ]
    image_logs = []
    images = []
    latents = []

    for prompt in validation_prompts:
        embed = torch.load(f'output/tmp/{prompt}_{max_length}token.pth', map_location='cpu')
        caption_embs = embed['caption_embeds'].to(accelerator.device)
        emb_masks = embed['emb_mask'].to(accelerator.device)
        latents.append(pipeline(
            num_inference_steps=14,
            num_images_per_prompt=1,
            generator=generator,
            guidance_scale=4.5,
            prompt_embeds=caption_embs,
            prompt_attention_mask=emb_masks,
            negative_prompt=None,
            negative_prompt_embeds=uncond_prompt_embeds,
            negative_prompt_attention_mask=uncond_prompt_attention_mask,
            output_type="latent",
        ).images)
    flush()
    # Decode after all sampling is finished.
    for latent in latents:
        images.append(pipeline.vae.decode(
            latent.to(weight_dtype) / pipeline.vae.config.scaling_factor, return_dict=False)[0])
    for prompt, image in zip(validation_prompts, images):
        image = pipeline.image_processor.postprocess(image, output_type="pil")
        image_logs.append({"validation_prompt": prompt, "images": image})

    for tracker in accelerator.trackers:
        if tracker.name == "tensorboard":
            for log in image_logs:
                formatted_images = np.stack([np.asarray(image) for image in log["images"]])
                tracker.writer.add_images(log["validation_prompt"], formatted_images, step, dataformats="NHWC")
        elif tracker.name == "wandb":
            import wandb
            formatted_images = [
                wandb.Image(image, caption=log["validation_prompt"])
                for log in image_logs
                for image in log["images"]
            ]
            tracker.log({"validation": formatted_images})
        else:
            # Logger.warn is a deprecated alias; use warning().
            logger.warning(f"image logging not implemented for {tracker.name}")

    del pipeline
    gc.collect()
    torch.cuda.empty_cache()
    return image_logs
def train(model):
    """Run the diffusers-based training loop.

    Besides ``model`` this reads the module globals ``config``, ``args``,
    ``accelerator``, ``optimizer``, ``lr_scheduler``, ``train_dataloader``,
    ``train_diffusion``, ``weight_dtype``, ``start_epoch``, ``start_step``,
    ``total_steps``, ``logger`` and, when VAE features are not pre-extracted,
    ``vae``.

    Accelerator state is saved every ``config.save_model_steps`` global steps
    and validation runs every ``config.eval_sampling_steps`` steps (and on the
    first step of each epoch), both on the main process. The transformer is
    exported with ``save_pretrained`` every ``config.save_model_epochs`` epochs
    and after the last epoch.

    Args:
        model: the transformer returned by ``accelerator.prepare``.
    """
    if config.get('debug_nan', False):
        DebugUnderflowOverflow(model)
        logger.info('NaN debugger registered. Start to detect overflow during training.')
    time_start, last_tic = time.time(), time.time()
    log_buffer = LogBuffer()

    global_step = start_step + 1

    load_vae_feat = getattr(train_dataloader.dataset, 'load_vae_feat', False)

    # Now you train the model
    for epoch in range(start_epoch + 1, config.num_epochs + 1):
        data_time_start = time.time()
        data_time_all = 0
        for step, batch in enumerate(train_dataloader):
            data_time_all += time.time() - data_time_start
            if load_vae_feat:
                z = batch[0]
            else:
                # Encode raw images on the fly; the VAE is never trained.
                with torch.no_grad():
                    with torch.cuda.amp.autocast(enabled=config.mixed_precision == 'fp16'):
                        posterior = vae.encode(batch[0]).latent_dist
                        if config.sample_posterior:
                            z = posterior.sample()
                        else:
                            z = posterior.mode()
            latents = (z * config.scale_factor).to(weight_dtype)
            y = batch[1].squeeze(1).to(weight_dtype)
            y_mask = batch[2].squeeze(1).squeeze(1).to(weight_dtype)
            y, y_mask = token_drop(y, y_mask)  # classifier-free guidance
            data_info = {'resolution': batch[3]['img_hw'].to(weight_dtype),
                         'aspect_ratio': batch[3]['aspect_ratio'].to(weight_dtype),}

            # Sample a random timestep for each image
            bs = latents.shape[0]
            timesteps = torch.randint(0, config.train_sampling_steps, (bs,), device=latents.device).long()
            grad_norm = None
            with accelerator.accumulate(model):
                # Predict the noise residual
                optimizer.zero_grad()
                loss_term = train_diffusion.training_losses_diffusers(
                    model, latents, timesteps,
                    model_kwargs=dict(encoder_hidden_states=y, encoder_attention_mask=y_mask, added_cond_kwargs=data_info),
                )
                loss = loss_term['loss'].mean()
                accelerator.backward(loss)
                # Clip only on steps where gradients are actually synchronized.
                if accelerator.sync_gradients:
                    grad_norm = accelerator.clip_grad_norm_(model.parameters(), config.gradient_clip)
                optimizer.step()
                lr_scheduler.step()
                # EMA is currently disabled:
                # if accelerator.sync_gradients:
                #     ema_update(model_ema, accelerator.unwrap_model(model), config.ema_rate)

            lr = lr_scheduler.get_last_lr()[0]
            logs = {args.loss_report_name: accelerator.gather(loss).mean().item()}
            if grad_norm is not None:
                logs.update(grad_norm=accelerator.gather(grad_norm).mean().item())
            log_buffer.update(logs)
            # Log on the first step of every epoch and then every ``log_interval`` steps.
            if (step + 1) % config.log_interval == 0 or (step + 1) == 1:
                t = (time.time() - last_tic) / config.log_interval
                t_d = data_time_all / config.log_interval
                avg_time = (time.time() - time_start) / (global_step - start_step)
                eta = str(datetime.timedelta(seconds=int(avg_time * (total_steps - global_step - 1))))
                eta_epoch = str(datetime.timedelta(seconds=int(avg_time * (len(train_dataloader) - step - 1))))
                log_buffer.average()
                info = f"Step/Epoch [{global_step}/{epoch}][{step + 1}/{len(train_dataloader)}]:total_eta: {eta}, " \
                       f"epoch_eta:{eta_epoch}, time_all:{t:.3f}, time_data:{t_d:.3f}, lr:{lr:.3e}," \
                       f"s:({data_info['resolution'][0][0].item()}, {data_info['resolution'][0][1].item()}), "
                info += ', '.join([f"{k}:{v:.4f}" for k, v in log_buffer.output.items()])
                logger.info(info)
                last_tic = time.time()
                log_buffer.clear()
                data_time_all = 0
            logs.update(lr=lr)
            accelerator.log(logs, step=global_step)

            global_step += 1
            data_time_start = time.time()

            accelerator.wait_for_everyone()
            if accelerator.is_main_process:
                if global_step % config.save_model_steps == 0:
                    save_path = os.path.join(os.path.join(config.work_dir, 'checkpoints'), f"checkpoint-{global_step}")
                    os.umask(0o000)
                    logger.info(f"Start to save state to {save_path}")
                    accelerator.save_state(save_path)
                    logger.info(f"Saved state to {save_path}")
                if global_step % config.eval_sampling_steps == 0 or (step + 1) == 1:
                    log_validation(model, accelerator, weight_dtype, global_step)

        accelerator.wait_for_everyone()
        if epoch % config.save_model_epochs == 0 or epoch == config.num_epochs:
            os.umask(0o000)
            save_path = os.path.join(os.path.join(config.work_dir, 'checkpoints'), f"checkpoint-{global_step}")
            logger.info(f"Start to save state to {save_path}")
            # Use a separate name: rebinding ``model`` to the unwrapped module
            # (as this code used to do) made every later epoch train and
            # accumulate on the unwrapped model instead of the prepared one.
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(save_path)
            logger.info(f"Saved state to {save_path}")
def parse_args():
    """Build the CLI parser of the diffusers training script and parse ``sys.argv``.

    Returns:
        argparse.Namespace with ``config`` (positional), ``cloud``,
        ``work_dir``, ``resume_from``, ``load_from``, ``local_rank``,
        ``debug``, ``pipeline_load_from``, ``report_to``,
        ``tracker_project_name`` and ``loss_report_name``.
    """
    report_to_help = (
        'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
        ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
    )
    tracker_help = (
        "The `project_name` argument passed to Accelerator.init_trackers for"
        " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
    )

    cli = argparse.ArgumentParser(description="Process some integers.")
    add = cli.add_argument

    # Positional config file and run-location switches.
    add("config", type=str, help="config")
    add("--cloud", action='store_true', default=False, help="cloud or local machine")
    add('--work-dir', help='the dir to save logs and models')
    add('--resume-from', help='the dir to resume the training')
    add('--load-from', default=None, help='the dir to load a ckpt for training')

    # Both spellings of the launcher-provided rank write to ``local_rank``.
    add('--local-rank', type=int, default=-1)
    add('--local_rank', type=int, default=-1)
    add('--debug', action='store_true')

    add("--pipeline_load_from",
        default='output/pretrained_models/pixart_omega_sdxl_256px_diffusers_from512',
        type=str,
        help="path for loading text_encoder, tokenizer and vae")

    # Experiment tracking options.
    add("--report_to", type=str, default="tensorboard", help=report_to_help)
    add("--tracker_project_name", type=str, default="text2image-pixart-omega", help=tracker_help)
    add("--loss_report_name", type=str, default="loss")

    return cli.parse_args()
if __name__ == '__main__':
    # Entry point of the diffusers training script: merge CLI overrides into
    # the config, create the Accelerator, cache the null / validation prompt
    # embeddings, build the model, data pipeline and optimizer, optionally
    # resume from a saved accelerator state, then train.
    args = parse_args()
    config = read_config(args.config)
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        config.work_dir = args.work_dir
    if args.cloud:
        config.data_root = '/data/data'
    if args.resume_from is not None:
        config.resume_from = args.resume_from
    if args.debug:
        config.log_interval = 1
        config.train_batch_size = 32
        config.valid_num = 100

    os.umask(0o000)
    os.makedirs(config.work_dir, exist_ok=True)

    init_handler = InitProcessGroupKwargs()
    init_handler.timeout = datetime.timedelta(seconds=5400)  # change timeout to avoid a strange NCCL bug
    # Initialize accelerator and tensorboard logging
    if config.use_fsdp:
        init_train = 'FSDP'
        from accelerate import FullyShardedDataParallelPlugin
        from torch.distributed.fsdp.fully_sharded_data_parallel import FullStateDictConfig
        set_fsdp_env()
        fsdp_plugin = FullyShardedDataParallelPlugin(state_dict_config=FullStateDictConfig(offload_to_cpu=False, rank0_only=False),)
    else:
        init_train = 'DDP'
        fsdp_plugin = None

    # Multi-scale batches come from a custom batch sampler, so batches are not
    # evened out across processes. The previous ``even_batches=False,`` (with a
    # trailing comma) produced the tuple ``(False,)``, which is truthy.
    # NOTE(review): with a real ``False`` ranks may see different batch counts
    # at the end of an epoch — confirm the training loop tolerates that.
    even_batches = not config.multi_scale

    accelerator = Accelerator(
        mixed_precision=config.mixed_precision,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        log_with=args.report_to,
        project_dir=os.path.join(config.work_dir, "logs"),
        fsdp_plugin=fsdp_plugin,
        even_batches=even_batches,
        kwargs_handlers=[init_handler]
    )

    log_name = 'train_log.log'
    if accelerator.is_main_process:
        # Keep the previous run's log by renaming it before a new one is opened.
        if os.path.exists(os.path.join(config.work_dir, log_name)):
            rename_file_with_creation_time(os.path.join(config.work_dir, log_name))
    logger = get_root_logger(os.path.join(config.work_dir, log_name))
    logger.info(accelerator.state)

    config.seed = init_random_seed(config.get('seed', None))
    set_random_seed(config.seed)

    if accelerator.is_main_process:
        config.dump(os.path.join(config.work_dir, 'config.py'))

    logger.info(f"Config: \n{config.pretty_text}")
    logger.info(f"World_size: {get_world_size()}, seed: {config.seed}")
    logger.info(f"Initializing: {init_train} for training")
    image_size = config.image_size  # @param [256, 512, 1024]
    latent_size = int(image_size) // 8
    relative_to_1024 = float(image_size / 1024)
    pred_sigma = getattr(config, 'pred_sigma', True)
    learn_sigma = getattr(config, 'learn_sigma', True) and pred_sigma

    # Create for unconditional prompt embedding for classifier free guidance
    logger.info("Embedding for classifier free guidance")
    max_length = config.model_max_length
    uncond_prompt_embeds, uncond_prompt_attention_mask = get_null_embed(
        f'output/pretrained_models/null_embed_diffusers_{max_length}token.pth', max_length=max_length
    )

    # preparing embeddings for visualization. We put it here for saving GPU memory
    prepare_vis()

    # build models
    train_diffusion = IDDPM(str(config.train_sampling_steps), learn_sigma=learn_sigma, pred_sigma=pred_sigma, snr=config.snr_loss)
    model = Transformer2DModel.from_pretrained(config.load_from, subfolder="transformer").train()
    logger.info(f"{model.__class__.__name__} Model Parameters: {sum(p.numel() for p in model.parameters()):,}")
    logger.info(f"lewei scale: {model.pos_embed.interpolation_scale} base size: {model.pos_embed.base_size}")
    # model_ema = deepcopy(model).eval()

    # 9. Handle mixed precision and device placement
    # For mixed precision training we cast all non-trainable weigths to half-precision
    # as these weights are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
        weight_dtype = torch.float16
    elif accelerator.mixed_precision == "bf16":
        weight_dtype = torch.bfloat16

    # 11. Enable optimizations
    # model.enable_xformers_memory_efficient_attention()    # not available for now

    # 10. Handle saving and loading of checkpoints
    # `accelerate` 0.16.0 will have better support for customized saving
    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
        def save_model_hook(models, weights, output_dir):
            if accelerator.is_main_process:
                transformer_ = accelerator.unwrap_model(models[0])
                # save weights in peft format to be able to load them back
                transformer_.save_pretrained(output_dir)
                for _ in models:
                    # make sure to pop weight so that corresponding model is not saved again
                    weights.pop()

        def load_model_hook(models, input_dir):
            for _ in range(len(models)):
                # pop models so that they are not loaded again
                model = models.pop()

                # load diffusers style into model
                load_model = Transformer2DModel.from_pretrained(input_dir)
                model.register_to_config(**load_model.config)

                model.load_state_dict(load_model.state_dict())
                del load_model

        accelerator.register_save_state_pre_hook(save_model_hook)
        accelerator.register_load_state_pre_hook(load_model_hook)

    if config.grad_checkpointing:
        model.enable_gradient_checkpointing()

    if not config.data.load_vae_feat:
        vae = AutoencoderKL.from_pretrained(config.vae_pretrained).cuda()

    # prepare for FSDP clip grad norm calculation
    if accelerator.distributed_type == DistributedType.FSDP:
        for m in accelerator._models:
            m.clip_grad_norm_ = types.MethodType(clip_grad_norm_, m)

    # build dataloader
    set_data_root(config.data_root)
    logger.info(f"ratio of real user prompt: {config.real_prompt_ratio}")
    dataset = build_dataset(
        config.data, resolution=image_size, aspect_ratio_type=config.aspect_ratio_type,
        real_prompt_ratio=config.real_prompt_ratio, max_length=max_length, config=config,
    )
    if config.multi_scale:
        batch_sampler = AspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
                                                batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio, drop_last=True,
                                                ratio_nums=dataset.ratio_nums, config=config, valid_num=config.valid_num)
        train_dataloader = build_dataloader(dataset, batch_sampler=batch_sampler, num_workers=config.num_workers)
    else:
        train_dataloader = build_dataloader(dataset, num_workers=config.num_workers, batch_size=config.train_batch_size, shuffle=True)

    # build optimizer and lr scheduler
    lr_scale_ratio = 1
    if config.get('auto_lr', None):
        lr_scale_ratio = auto_scale_lr(config.train_batch_size * get_world_size() * config.gradient_accumulation_steps,
                                       config.optimizer, **config.auto_lr)
    optimizer = build_optimizer(model, config.optimizer)
    lr_scheduler = build_lr_scheduler(config, optimizer, train_dataloader, lr_scale_ratio)

    timestamp = time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())

    if accelerator.is_main_process:
        accelerator.init_trackers(f"tb_{timestamp}_{args.tracker_project_name}")
        logger.info(f"Training tracker at tb_{timestamp}_{args.tracker_project_name}")

    start_epoch = 0
    start_step = 0
    # Computed before accelerator.prepare() wraps the dataloader.
    total_steps = len(train_dataloader) * config.num_epochs

    # Prepare everything
    # There is no specific order to remember, you just need to unpack the
    # objects in the same order you gave them to the prepare method.
    model = accelerator.prepare(model)
    optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler)

    if config.resume_from is not None:
        if config.resume_from != "latest":
            path = os.path.basename(config.resume_from)
        else:
            # Get the most recent checkpoint. A missing checkpoints directory
            # means "nothing to resume" rather than a FileNotFoundError.
            checkpoint_root = os.path.join(config.work_dir, 'checkpoints')
            dirs = os.listdir(checkpoint_root) if os.path.isdir(checkpoint_root) else []
            dirs = [d for d in dirs if d.startswith("checkpoint")]
            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
            path = dirs[-1] if dirs else None

        if path is None:
            accelerator.print(f"Checkpoint '{config.resume_from}' does not exist. Starting a new training run.")
            config.resume_from = None
        else:
            accelerator.print(f"Resuming from checkpoint {path}")
            accelerator.load_state(os.path.join(config.work_dir, 'checkpoints', path))
            # Directory names are "checkpoint-<global_step>".
            start_step = int(path.split("-")[1])
            start_epoch = start_step // len(train_dataloader)

    train(model)
def ema_update(model_dest: nn.Module, model_src: nn.Module, rate):
    """Blend the weights of ``model_src`` into ``model_dest`` in place.

    Each parameter of ``model_dest`` is overwritten with
    ``rate * dest + (1 - rate) * src``, where ``src`` is the parameter that
    carries the same name in ``model_src``. With ``rate == 0`` the destination
    simply becomes a copy of the source weights.

    Args:
        model_dest: module whose parameters are updated (typically the EMA copy).
        model_src: module providing the fresh parameters.
        rate: weight kept from the current destination value.

    Raises:
        KeyError: a destination parameter name does not exist in ``model_src``.
        AssertionError: source and destination parameter are the same object.
    """
    source_params = dict(model_src.named_parameters())
    src_weight = 1 - rate
    for name, target in model_dest.named_parameters():
        source = source_params[name]
        # Blending a tensor with itself would mean the two models share weights.
        assert source is not target
        # Work on .data so the update is not recorded by autograd.
        decayed = target.data.mul_(rate)
        decayed.add_(src_weight * source.data)
for each image bs = clean_images.shape[0] timesteps = torch.randint(0, config.train_sampling_steps, (bs,), device=clean_images.device).long() grad_norm = None with accelerator.accumulate(model): # Predict the noise residual optimizer.zero_grad() loss_term = train_diffusion.training_losses(model, clean_images, timesteps, model_kwargs=dict(y=y, mask=y_mask, data_info=data_info)) loss = loss_term['loss'].mean() accelerator.backward(loss) if accelerator.sync_gradients: grad_norm = accelerator.clip_grad_norm_(model.parameters(), config.gradient_clip) optimizer.step() lr_scheduler.step() if accelerator.sync_gradients: ema_update(model_ema, model, config.ema_rate) lr = lr_scheduler.get_last_lr()[0] logs = {"loss": accelerator.gather(loss).mean().item()} if grad_norm is not None: logs.update(grad_norm=accelerator.gather(grad_norm).mean().item()) log_buffer.update(logs) if (step + 1) % config.log_interval == 0: t = (time.time() - last_tic) / config.log_interval t_d = data_time_all / config.log_interval avg_time = (time.time() - time_start) / (global_step + 1) eta = str(datetime.timedelta(seconds=int(avg_time * (total_steps - start_step - global_step - 1)))) eta_epoch = str(datetime.timedelta(seconds=int(avg_time * (len(train_dataloader) - step - 1)))) # avg_loss = sum(loss_buffer) / len(loss_buffer) log_buffer.average() info = f"Steps [{(epoch-1)*len(train_dataloader)+step+1}][{step + 1}/{len(train_dataloader)}]:total_eta: {eta}, " \ f"epoch_eta:{eta_epoch}, time_all:{t:.3f}, time_data:{t_d:.3f}, lr:{lr:.3e}, s:({model.module.h}, {model.module.w}), " info += ', '.join([f"{k}:{v:.4f}" for k, v in log_buffer.output.items()]) logger.info(info) last_tic = time.time() log_buffer.clear() data_time_all = 0 logs.update(lr=lr) accelerator.log(logs, step=global_step + start_step) global_step += 1 data_time_start= time.time() synchronize() if accelerator.is_main_process: if ((epoch - 1) * len(train_dataloader) + step + 1) % config.save_model_steps == 0: os.umask(0o000) 
if __name__ == '__main__':
    # Script entry point for DreamBooth fine-tuning.
    #
    # Reads the config file, applies CLI overrides, creates the Accelerator,
    # builds the model / EMA copy / dataloader / optimizer / LR scheduler,
    # optionally restores a checkpoint, and finally calls train().
    # train() takes no arguments and reads the names defined here (config,
    # accelerator, logger, model, model_ema, train_diffusion, train_dataloader,
    # optimizer, lr_scheduler, start_epoch) as module-level globals, so none of
    # them may be renamed.
    args = parse_args()
    config = read_config(args.config)
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        config.work_dir = args.work_dir
    if args.resume_from is not None:
        config.resume_from = dict(
            checkpoint=args.resume_from,
            load_ema=False,
            resume_optimizer=True,
            resume_lr_scheduler=True)
    if args.debug:
        config.log_interval = 1
        config.train_batch_size = 1

    # DreamBooth-specific CLI options always override the config file.
    config.save_model_steps=args.save_step
    config.data.update({'prompt': [args.prompt], 'root': args.train_class})
    config.optimizer.update({'lr': args.lr})

    os.umask(0o000)
    os.makedirs(config.work_dir, exist_ok=True)

    init_handler = InitProcessGroupKwargs()
    init_handler.timeout = datetime.timedelta(seconds=5400)  # change timeout to avoid a strange NCCL bug
    # Initialize accelerator and tensorboard logging
    if config.use_fsdp:
        init_train = 'FSDP'
        from accelerate import FullyShardedDataParallelPlugin
        from torch.distributed.fsdp.fully_sharded_data_parallel import FullStateDictConfig
        set_fsdp_env()
        fsdp_plugin = FullyShardedDataParallelPlugin(state_dict_config=FullStateDictConfig(offload_to_cpu=False, rank0_only=False),)
    else:
        init_train = 'DDP'
        fsdp_plugin = None

    even_batches = True
    if config.multi_scale:
        # Must be a plain bool. The previous `even_batches=False,` (trailing
        # comma) built the tuple (False,), which is truthy, so even batching
        # was never actually disabled for the multi-scale batch sampler.
        # NOTE(review): with uneven batches, ranks may run a different number
        # of steps per epoch - confirm the sampler keeps ranks in lockstep.
        even_batches = False

    accelerator = Accelerator(
        mixed_precision=config.mixed_precision,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        log_with="tensorboard",
        project_dir=os.path.join(config.work_dir, "logs"),
        fsdp_plugin=fsdp_plugin,
        even_batches=even_batches,
        kwargs_handlers=[init_handler]
    )

    logger = get_root_logger(os.path.join(config.work_dir, 'train_log.log'))

    config.seed = init_random_seed(config.get('seed', None))
    set_random_seed(config.seed)

    if accelerator.is_main_process:
        config.dump(os.path.join(config.work_dir, 'config.py'))

    logger.info(f"Config: \n{config.pretty_text}")
    logger.info(f"World_size: {get_world_size()}, seed: {config.seed}")
    logger.info(f"Initializing: {init_train} for training")
    image_size = config.image_size  # @param [256, 512]
    latent_size = int(image_size) // 8  # latent grid is 1/8 of the image side
    pred_sigma = getattr(config, 'pred_sigma', True)
    # learn_sigma is only honoured when sigma is predicted at all
    learn_sigma = getattr(config, 'learn_sigma', True) and pred_sigma
    model_kwargs={"window_block_indexes": config.window_block_indexes, "window_size": config.window_size,
                  "use_rel_pos": config.use_rel_pos, "lewei_scale": config.lewei_scale, 'config':config,
                  'model_max_length': config.model_max_length}

    # build models
    train_diffusion = IDDPM(str(config.train_sampling_steps))
    eval_diffusion = IDDPM(str(config.eval_sampling_steps))
    model = build_model(config.model,
                        config.grad_checkpointing,
                        config.get('fp32_attention', False),
                        input_size=latent_size,
                        learn_sigma=learn_sigma,
                        pred_sigma=pred_sigma,
                        **model_kwargs).train()
    logger.info(f"{config.model} Model Parameters: {sum(p.numel() for p in model.parameters()):,}")
    model_ema = deepcopy(model).eval()

    if config.load_from is not None:
        # A CLI --load-from replaces the config value (only consulted when the
        # config itself already names a checkpoint).
        if args.load_from is not None:
            config.load_from = args.load_from
        missing, unexpected = load_checkpoint(config.load_from, model, load_ema=config.get('load_ema', False))
        # model.reparametrize()
        if accelerator.is_main_process:
            print('Warning Missing keys: ', missing)
            print('Warning Unexpected keys', unexpected)

    # rate 0 copies the (possibly just loaded) weights into the EMA model
    ema_update(model_ema, model, 0.)

    # prepare for FSDP clip grad norm calculation
    if accelerator.distributed_type == DistributedType.FSDP:
        for m in accelerator._models:
            m.clip_grad_norm_ = types.MethodType(clip_grad_norm_, m)

    # build dataloader
    logger.warning(f"Training prompt: {config.data['prompt']}, Training data class: {config.data['root']}")
    set_data_root(config.data_root)
    dataset = build_dataset(config.data, resolution=image_size, aspect_ratio_type=config.aspect_ratio_type)
    if config.multi_scale:
        batch_sampler = AspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
                                                batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio, drop_last=True,
                                                ratio_nums=dataset.ratio_nums, config=config, valid_num=1)
        # batch_sampler = BalancedAspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
        #                                                 batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio,
        #                                                 ratio_nums=dataset.ratio_nums)
        train_dataloader = build_dataloader(dataset, batch_sampler=batch_sampler, num_workers=config.num_workers)
    else:
        train_dataloader = build_dataloader(dataset, num_workers=config.num_workers, batch_size=config.train_batch_size, shuffle=True)

    # build optimizer and lr scheduler
    lr_scale_ratio = 1
    if config.get('auto_lr', None):
        # scale the LR by the effective global batch size
        lr_scale_ratio = auto_scale_lr(config.train_batch_size * get_world_size() * config.gradient_accumulation_steps,
                                       config.optimizer,
                                       **config.auto_lr)
    optimizer = build_optimizer(model, config.optimizer)
    lr_scheduler = build_lr_scheduler(config, optimizer, train_dataloader, lr_scale_ratio)

    timestamp = time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())

    if accelerator.is_main_process:
        accelerator.init_trackers(f"tb_{timestamp}")

    start_epoch = 0
    if config.resume_from is not None and config.resume_from['checkpoint'] is not None:
        start_epoch, missing, unexpected = load_checkpoint(**config.resume_from,
                                                           model=model,
                                                           model_ema=model_ema,
                                                           optimizer=optimizer,
                                                           lr_scheduler=lr_scheduler,
                                                           )

        if accelerator.is_main_process:
            print('Warning Missing keys: ', missing)
            print('Warning Unexpected keys', unexpected)
    # Prepare everything
    # There is no specific order to remember, you just need to unpack the
    # objects in the same order you gave them to the prepare method.
    model, model_ema = accelerator.prepare(model, model_ema)
    optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler)
    train()
# From LCMScheduler.get_scalings_for_boundary_condition_discrete
def scalings_for_boundary_conditions(timestep, sigma_data=0.5, timestep_scaling=10.0):
    """Return the consistency-model boundary scalings ``(c_skip, c_out)``.

    With ``s = timestep_scaling * timestep`` the result is::

        c_skip = sigma_data**2 / (s**2 + sigma_data**2)
        c_out  = s / sqrt(s**2 + sigma_data**2)

    so that ``timestep == 0`` yields ``c_skip == 1`` and ``c_out == 0``.

    Args:
        timestep: scalar or tensor of timesteps; the arithmetic is elementwise.
        sigma_data: data scale constant used in both scalings.
        timestep_scaling: factor applied to ``timestep`` before the scalings
            are computed. Previously this argument was accepted but ignored
            (the body hard-coded ``timestep / 0.1``); the default of 10.0
            reproduces that behaviour.

    Returns:
        Tuple ``(c_skip, c_out)`` with the same shape as ``timestep``.
    """
    scaled_timestep = timestep_scaling * timestep
    denominator = scaled_timestep ** 2 + sigma_data ** 2
    c_skip = sigma_data ** 2 / denominator
    c_out = scaled_timestep / denominator ** 0.5
    return c_skip, c_out
class DDIMSolver:
    """DDIM step helper working on a sub-sampled timestep schedule.

    The full schedule of ``timesteps`` steps is divided into ``ddim_timesteps``
    strides of ``timesteps // ddim_timesteps`` steps each; the last index of
    every stride is kept. For each kept index the solver stores the cumulative
    alpha at that index and at the previously kept index (the first entry uses
    ``alpha_cumprods[0]``).
    """

    def __init__(self, alpha_cumprods, timesteps=1000, ddim_timesteps=50):
        """Pre-compute the sub-sampled schedule.

        Args:
            alpha_cumprods: numpy array indexed by timestep.
            timesteps: length of the full schedule.
            ddim_timesteps: number of steps kept in the sub-sampled schedule.
        """
        # DDIM sampling parameters
        stride = timesteps // ddim_timesteps
        kept_steps = (np.arange(1, ddim_timesteps + 1) * stride).round().astype(np.int64) - 1
        alphas = alpha_cumprods[kept_steps]
        # Shift by one kept step; the very first step falls back to index 0.
        previous = [alpha_cumprods[0]]
        previous = previous + alpha_cumprods[kept_steps[:-1]].tolist()
        alphas_prev = np.asarray(previous)

        # convert to torch tensors
        self.ddim_timesteps = torch.from_numpy(kept_steps).long()
        self.ddim_alpha_cumprods = torch.from_numpy(alphas)
        self.ddim_alpha_cumprods_prev = torch.from_numpy(alphas_prev)

    def to(self, device):
        """Move all schedule tensors to ``device`` and return ``self``."""
        for attr in ("ddim_timesteps", "ddim_alpha_cumprods", "ddim_alpha_cumprods_prev"):
            setattr(self, attr, getattr(self, attr).to(device))
        return self

    def ddim_step(self, pred_x0, pred_noise, timestep_index):
        """Return ``sqrt(a_prev) * pred_x0 + sqrt(1 - a_prev) * pred_noise``.

        ``a_prev`` is looked up in the "previous" alpha table at
        ``timestep_index`` and reshaped to broadcast against ``pred_x0``.
        # presumably timestep_index holds one schedule index per batch item - confirm with caller
        """
        a_prev = extract_into_tensor(self.ddim_alpha_cumprods_prev, timestep_index, pred_x0.shape)
        noise_term = (1.0 - a_prev).sqrt() * pred_noise
        signal_term = a_prev.sqrt() * pred_x0
        return signal_term + noise_term
logger.info("Running validation... ") # 7. LCM MultiStep Sampling Loop: for i, t in tqdm(list(enumerate(infer_timesteps))): ts = torch.full((1,), t, device=device, dtype=torch.long) # model prediction (v-prediction, eps, x) model_pred = model(infer_latents, ts, caption_embs, **model_kwargs)[:, :4] # compute the previous noisy sample x_t -> x_t-1 infer_latents, denoised = scheduler.step(model_pred, i, t, infer_latents, return_dict=False) samples = vae.decode(denoised / 0.18215).sample torch.cuda.empty_cache() save_image(samples[0], f'output_cv/vis/{step}.jpg', nrow=1, normalize=True, value_range=(-1, 1)) def train(): if config.get('debug_nan', False): DebugUnderflowOverflow(model) logger.info('NaN debugger registered. Start to detect overflow during training.') time_start, last_tic = time.time(), time.time() log_buffer = LogBuffer() start_step = start_epoch * len(train_dataloader) global_step = 0 total_steps = len(train_dataloader) * config.num_epochs load_vae_feat = getattr(train_dataloader.dataset, 'load_vae_feat', False) # Create uncond embeds for classifier free guidance uncond_prompt_embeds = model.module.y_embedder.y_embedding.repeat(config.train_batch_size, 1, 1, 1) # Now you train the model for epoch in range(start_epoch + 1, config.num_epochs + 1): data_time_start= time.time() data_time_all = 0 for step, batch in enumerate(train_dataloader): data_time_all += time.time() - data_time_start if load_vae_feat: z = batch[0] else: with torch.no_grad(): with torch.cuda.amp.autocast(enabled=config.mixed_precision == 'fp16'): posterior = vae.encode(batch[0]).latent_dist if config.sample_posterior: z = posterior.sample() else: z = posterior.mode() latents = z * config.scale_factor y = batch[1] y_mask = batch[2] data_info = batch[3] # Sample a random timestep for each image grad_norm = None with accelerator.accumulate(model): # Predict the noise residual optimizer.zero_grad() # Sample noise that we'll add to the latents noise = torch.randn_like(latents) bsz = 
latents.shape[0] # Sample a random timestep for each image t_n ~ U[0, N - k - 1] without bias. topk = config.train_sampling_steps // config.num_ddim_timesteps index = torch.randint(0, config.num_ddim_timesteps, (bsz,), device=latents.device).long() start_timesteps = solver.ddim_timesteps[index] timesteps = start_timesteps - topk timesteps = torch.where(timesteps < 0, torch.zeros_like(timesteps), timesteps) # Get boundary scalings for start_timesteps and (end) timesteps. c_skip_start, c_out_start = scalings_for_boundary_conditions(start_timesteps) c_skip_start, c_out_start = [append_dims(x, latents.ndim) for x in [c_skip_start, c_out_start]] c_skip, c_out = scalings_for_boundary_conditions(timesteps) c_skip, c_out = [append_dims(x, latents.ndim) for x in [c_skip, c_out]] # Sample a random guidance scale w from U[w_min, w_max] and embed it # w = (config.w_max - config.w_min) * torch.rand((bsz,)) + config.w_min w = config.cfg_scale * torch.ones((bsz,)) w = w.reshape(bsz, 1, 1, 1) w = w.to(device=latents.device, dtype=latents.dtype) # Get online LCM prediction on z_{t_{n + k}}, w, c, t_{n + k} _, pred_x_0, noisy_model_input = train_diffusion.training_losses(model, latents, start_timesteps, model_kwargs=dict(y=y, mask=y_mask, data_info=data_info), noise=noise) model_pred = c_skip_start * noisy_model_input + c_out_start * pred_x_0 # Use the ODE solver to predict the kth step in the augmented PF-ODE trajectory after # noisy_latents with both the conditioning embedding c and unconditional embedding 0 # Get teacher model prediction on noisy_latents and conditional embedding with torch.no_grad(): with torch.autocast("cuda"): cond_teacher_output, cond_pred_x0, _ = train_diffusion.training_losses(model_teacher, latents, start_timesteps, model_kwargs=dict(y=y, mask=y_mask, data_info=data_info), noise=noise) # Get teacher model prediction on noisy_latents and unconditional embedding uncond_teacher_output, uncond_pred_x0, _ = train_diffusion.training_losses(model_teacher, 
latents, start_timesteps, model_kwargs=dict(y=uncond_prompt_embeds, mask=y_mask, data_info=data_info), noise=noise) # Perform "CFG" to get x_prev estimate (using the LCM paper's CFG formulation) pred_x0 = cond_pred_x0 + w * (cond_pred_x0 - uncond_pred_x0) pred_noise = cond_teacher_output + w * (cond_teacher_output - uncond_teacher_output) x_prev = solver.ddim_step(pred_x0, pred_noise, index) # Get target LCM prediction on x_prev, w, c, t_n with torch.no_grad(): with torch.autocast("cuda", enabled=True): _, pred_x_0, _ = train_diffusion.training_losses(model_ema, x_prev.float(), timesteps, model_kwargs=dict(y=y, mask=y_mask, data_info=data_info), skip_noise=True) target = c_skip * x_prev + c_out * pred_x_0 # Calculate loss if config.loss_type == "l2": loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") elif config.loss_type == "huber": loss = torch.mean(torch.sqrt((model_pred.float() - target.float()) ** 2 + config.huber_c**2) - config.huber_c) # Backpropagation on the online student model (`model`) accelerator.backward(loss) if accelerator.sync_gradients: grad_norm = accelerator.clip_grad_norm_(model.parameters(), config.gradient_clip) optimizer.step() lr_scheduler.step() optimizer.zero_grad(set_to_none=True) if accelerator.sync_gradients: ema_update(model_ema, model, config.ema_decay) lr = lr_scheduler.get_last_lr()[0] logs = {"loss": accelerator.gather(loss).mean().item()} if grad_norm is not None: logs.update(grad_norm=accelerator.gather(grad_norm).mean().item()) log_buffer.update(logs) if (step + 1) % config.log_interval == 0 or (step + 1) == 1: t = (time.time() - last_tic) / config.log_interval t_d = data_time_all / config.log_interval avg_time = (time.time() - time_start) / (global_step + 1) eta = str(datetime.timedelta(seconds=int(avg_time * (total_steps - start_step - global_step - 1)))) eta_epoch = str(datetime.timedelta(seconds=int(avg_time * (len(train_dataloader) - step - 1)))) # avg_loss = sum(loss_buffer) / len(loss_buffer) 
log_buffer.average() info = f"Step/Epoch [{(epoch-1)*len(train_dataloader)+step+1}/{epoch}][{step + 1}/{len(train_dataloader)}]:total_eta: {eta}, " \ f"epoch_eta:{eta_epoch}, time_all:{t:.3f}, time_data:{t_d:.3f}, lr:{lr:.3e}, s:({data_info['resolution'][0][0].item()}, {data_info['resolution'][0][1].item()}), " info += ', '.join([f"{k}:{v:.4f}" for k, v in log_buffer.output.items()]) logger.info(info) last_tic = time.time() log_buffer.clear() data_time_all = 0 logs.update(lr=lr) accelerator.log(logs, step=global_step + start_step) global_step += 1 data_time_start= time.time() synchronize() torch.cuda.empty_cache() if accelerator.is_main_process: # log_validation(model_ema, step, model.device) if ((epoch - 1) * len(train_dataloader) + step + 1) % config.save_model_steps == 0: os.umask(0o000) save_checkpoint(os.path.join(config.work_dir, 'checkpoints'), epoch=epoch, step=(epoch - 1) * len(train_dataloader) + step + 1, model=accelerator.unwrap_model(model), model_ema=accelerator.unwrap_model(model_ema), optimizer=optimizer, lr_scheduler=lr_scheduler ) synchronize() synchronize() if accelerator.is_main_process: if epoch % config.save_model_epochs == 0 or epoch == config.num_epochs: os.umask(0o000) save_checkpoint(os.path.join(config.work_dir, 'checkpoints'), epoch=epoch, step=(epoch - 1) * len(train_dataloader) + step + 1, model=accelerator.unwrap_model(model), model_ema=accelerator.unwrap_model(model_ema), optimizer=optimizer, lr_scheduler=lr_scheduler ) synchronize() def parse_args(): parser = argparse.ArgumentParser(description="Process some integers.") parser.add_argument("config", type=str, help="config") parser.add_argument("--cloud", action='store_true', default=False, help="cloud or local machine") parser.add_argument('--work-dir', help='the dir to save logs and models') parser.add_argument('--resume-from', help='the dir to resume the training') parser.add_argument('--load-from', default=None, help='the dir to load a ckpt for training') 
if __name__ == '__main__':
    # Script entry point for LCM distillation training.
    #
    # Reads the config file, applies CLI overrides, creates the Accelerator,
    # builds the student model plus EMA and teacher copies, the optional VAE,
    # the dataloader, optimizer, LR scheduler and DDIM solver, optionally
    # restores a checkpoint, and finally calls train().
    # train() takes no arguments and reads the names defined here (config,
    # accelerator, logger, model, model_ema, model_teacher, vae, solver,
    # train_diffusion, train_dataloader, optimizer, lr_scheduler, start_epoch)
    # as module-level globals, so none of them may be renamed.
    args = parse_args()
    config = read_config(args.config)
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        config.work_dir = args.work_dir
    if args.cloud:
        config.data_root = '/data/data'
    if args.resume_from is not None:
        # resuming replaces any plain weight initialisation
        config.load_from = None
        config.resume_from = dict(
            checkpoint=args.resume_from,
            load_ema=False,
            resume_optimizer=True,
            resume_lr_scheduler=True)
    if args.debug:
        config.log_interval = 1
        config.train_batch_size = 11
        config.valid_num = 100
        config.load_from = None

    os.umask(0o000)
    os.makedirs(config.work_dir, exist_ok=True)

    init_handler = InitProcessGroupKwargs()
    init_handler.timeout = datetime.timedelta(seconds=5400)  # change timeout to avoid a strange NCCL bug
    # Initialize accelerator and tensorboard logging
    if config.use_fsdp:
        init_train = 'FSDP'
        from accelerate import FullyShardedDataParallelPlugin
        from torch.distributed.fsdp.fully_sharded_data_parallel import FullStateDictConfig
        set_fsdp_env()
        fsdp_plugin = FullyShardedDataParallelPlugin(state_dict_config=FullStateDictConfig(offload_to_cpu=False, rank0_only=False),)
    else:
        init_train = 'DDP'
        fsdp_plugin = None

    even_batches = True
    if config.multi_scale:
        # Must be a plain bool. The previous `even_batches=False,` (trailing
        # comma) built the tuple (False,), which is truthy, so even batching
        # was never actually disabled for the multi-scale batch sampler.
        # NOTE(review): with uneven batches, ranks may run a different number
        # of steps per epoch - confirm the sampler keeps ranks in lockstep.
        even_batches = False

    accelerator = Accelerator(
        mixed_precision=config.mixed_precision,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        log_with="tensorboard",
        project_dir=os.path.join(config.work_dir, "logs"),
        fsdp_plugin=fsdp_plugin,
        even_batches=even_batches,
        kwargs_handlers=[init_handler]
    )

    logger = get_root_logger(os.path.join(config.work_dir, 'train_log.log'))

    config.seed = init_random_seed(config.get('seed', None))
    set_random_seed(config.seed)

    if accelerator.is_main_process:
        config.dump(os.path.join(config.work_dir, 'config.py'))

    logger.info(f"Config: \n{config.pretty_text}")
    logger.info(f"World_size: {get_world_size()}, seed: {config.seed}")
    logger.info(f"Initializing: {init_train} for training")
    image_size = config.image_size  # @param [256, 512]
    latent_size = int(image_size) // 8  # latent grid is 1/8 of the image side
    pred_sigma = getattr(config, 'pred_sigma', True)
    # learn_sigma is only honoured when sigma is predicted at all
    learn_sigma = getattr(config, 'learn_sigma', True) and pred_sigma
    model_kwargs={"window_block_indexes": config.window_block_indexes, "window_size": config.window_size,
                  "use_rel_pos": config.use_rel_pos, "lewei_scale": config.lewei_scale, 'config':config,
                  'model_max_length': config.model_max_length}

    # build models
    train_diffusion = IDDPM(str(config.train_sampling_steps), learn_sigma=learn_sigma, pred_sigma=pred_sigma,
                            snr=config.snr_loss, return_startx=True)
    model = build_model(config.model,
                        config.grad_checkpointing,
                        config.get('fp32_attention', False),
                        input_size=latent_size,
                        learn_sigma=learn_sigma,
                        pred_sigma=pred_sigma,
                        **model_kwargs).train()
    logger.info(f"{model.__class__.__name__} Model Parameters: {sum(p.numel() for p in model.parameters()):,}")

    if config.load_from is not None:
        # A CLI --load-from replaces the config value (only consulted when the
        # config itself already names a checkpoint).
        if args.load_from is not None:
            config.load_from = args.load_from
        missing, unexpected = load_checkpoint(config.load_from, model, load_ema=config.get('load_ema', False))
        logger.warning(f'Missing keys: {missing}')
        logger.warning(f'Unexpected keys: {unexpected}')

    # EMA target and frozen teacher both start from the (possibly loaded) student weights
    model_ema = deepcopy(model).eval()
    model_teacher = deepcopy(model).eval()

    if not config.data.load_vae_feat:
        # raw images are fed in, so train() needs the VAE to encode them
        vae = AutoencoderKL.from_pretrained(config.vae_pretrained).cuda()

    # prepare for FSDP clip grad norm calculation
    if accelerator.distributed_type == DistributedType.FSDP:
        for m in accelerator._models:
            m.clip_grad_norm_ = types.MethodType(clip_grad_norm_, m)

    # build dataloader
    set_data_root(config.data_root)
    dataset = build_dataset(config.data, resolution=image_size, aspect_ratio_type=config.aspect_ratio_type)
    if config.multi_scale:
        batch_sampler = AspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
                                                batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio, drop_last=True,
                                                ratio_nums=dataset.ratio_nums, config=config, valid_num=config.valid_num)
        # used for balanced sampling
        # batch_sampler = BalancedAspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
        #                                                 batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio,
        #                                                 ratio_nums=dataset.ratio_nums)
        train_dataloader = build_dataloader(dataset, batch_sampler=batch_sampler, num_workers=config.num_workers)
    else:
        train_dataloader = build_dataloader(dataset, num_workers=config.num_workers, batch_size=config.train_batch_size, shuffle=True)

    # build optimizer and lr scheduler
    lr_scale_ratio = 1
    if config.get('auto_lr', None):
        # scale the LR by the effective global batch size
        lr_scale_ratio = auto_scale_lr(config.train_batch_size * get_world_size() * config.gradient_accumulation_steps,
                                       config.optimizer,
                                       **config.auto_lr)
    optimizer = build_optimizer(model, config.optimizer)
    lr_scheduler = build_lr_scheduler(config, optimizer, train_dataloader, lr_scale_ratio)

    timestamp = time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())

    if accelerator.is_main_process:
        accelerator.init_trackers(f"tb_{timestamp}")

    start_epoch = 0
    if config.resume_from is not None and config.resume_from['checkpoint'] is not None:
        start_epoch, missing, unexpected = load_checkpoint(**config.resume_from,
                                                           model=model,
                                                           model_ema=model_ema,
                                                           optimizer=optimizer,
                                                           lr_scheduler=lr_scheduler,
                                                           )

        logger.warning(f'Missing keys: {missing}')
        logger.warning(f'Unexpected keys: {unexpected}')

    solver = DDIMSolver(train_diffusion.alphas_cumprod, timesteps=config.train_sampling_steps, ddim_timesteps=config.num_ddim_timesteps)
    solver.to(accelerator.device)
    # Prepare everything
    # There is no specific order to remember, you just need to unpack the
    # objects in the same order you gave them to the prepare method.
    model, model_ema, model_teacher = accelerator.prepare(model, model_ema, model_teacher)
    # model, model_ema = accelerator.prepare(model, model_ema)
    optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler)
    train()
def _f(dictionary): return {k: v for k, v in dictionary.items() if k in key_set} return _f def append_dims(x, target_dims): """Appends dimensions to the end of a tensor until it has target_dims dimensions.""" dims_to_append = target_dims - x.ndim if dims_to_append < 0: raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less") return x[(...,) + (None,) * dims_to_append] # From LCMScheduler.get_scalings_for_boundary_condition_discrete def scalings_for_boundary_conditions(timestep, sigma_data=0.5, timestep_scaling=10.0): c_skip = sigma_data**2 / ((timestep / 0.1) ** 2 + sigma_data**2) c_out = (timestep / 0.1) / ((timestep / 0.1) ** 2 + sigma_data**2) ** 0.5 return c_skip, c_out # Compare LCMScheduler.step, Step 4 def predicted_origin(model_output, timesteps, sample, prediction_type, alphas, sigmas): if prediction_type == "epsilon": sigmas = extract_into_tensor(sigmas, timesteps, sample.shape) alphas = extract_into_tensor(alphas, timesteps, sample.shape) pred_x_0 = (sample - sigmas * model_output) / alphas elif prediction_type == "v_prediction": sigmas = extract_into_tensor(sigmas, timesteps, sample.shape) alphas = extract_into_tensor(alphas, timesteps, sample.shape) pred_x_0 = alphas * sample - sigmas * model_output else: raise ValueError(f"Prediction type {prediction_type} currently not supported.") return pred_x_0 def extract_into_tensor(a, t, x_shape): b, *_ = t.shape out = a.gather(-1, t) return out.reshape(b, *((1,) * (len(x_shape) - 1))) class DDIMSolver: def __init__(self, alpha_cumprods, timesteps=1000, ddim_timesteps=50): # DDIM sampling parameters step_ratio = timesteps // ddim_timesteps self.ddim_timesteps = (np.arange(1, ddim_timesteps + 1) * step_ratio).round().astype(np.int64) - 1 self.ddim_alpha_cumprods = alpha_cumprods[self.ddim_timesteps] self.ddim_alpha_cumprods_prev = np.asarray( [alpha_cumprods[0]] + alpha_cumprods[self.ddim_timesteps[:-1]].tolist() ) # convert to torch tensors self.ddim_timesteps = 
torch.from_numpy(self.ddim_timesteps).long() self.ddim_alpha_cumprods = torch.from_numpy(self.ddim_alpha_cumprods) self.ddim_alpha_cumprods_prev = torch.from_numpy(self.ddim_alpha_cumprods_prev) def to(self, device): self.ddim_timesteps = self.ddim_timesteps.to(device) self.ddim_alpha_cumprods = self.ddim_alpha_cumprods.to(device) self.ddim_alpha_cumprods_prev = self.ddim_alpha_cumprods_prev.to(device) return self def ddim_step(self, pred_x0, pred_noise, timestep_index): alpha_cumprod_prev = extract_into_tensor(self.ddim_alpha_cumprods_prev, timestep_index, pred_x0.shape) dir_xt = (1.0 - alpha_cumprod_prev).sqrt() * pred_noise x_prev = alpha_cumprod_prev.sqrt() * pred_x0 + dir_xt return x_prev def train(model): if config.get('debug_nan', False): DebugUnderflowOverflow(model) logger.info('NaN debugger registered. Start to detect overflow during training.') time_start, last_tic = time.time(), time.time() log_buffer = LogBuffer() global_step = start_step load_vae_feat = getattr(train_dataloader.dataset, 'load_vae_feat', False) # Create uncond embeds for classifier free guidance uncond_prompt_embeds = torch.load('output/pretrained_models/null_embed.pth', map_location='cpu').to(accelerator.device).repeat(config.train_batch_size, 1, 1, 1) # Now you train the model for epoch in range(start_epoch + 1, config.num_epochs + 1): data_time_start= time.time() data_time_all = 0 for step, batch in enumerate(train_dataloader): data_time_all += time.time() - data_time_start if load_vae_feat: z = batch[0] else: with torch.no_grad(): with torch.cuda.amp.autocast(enabled=config.mixed_precision == 'fp16'): posterior = vae.encode(batch[0]).latent_dist if config.sample_posterior: z = posterior.sample() else: z = posterior.mode() latents = (z * config.scale_factor).to(weight_dtype) y = batch[1].squeeze(1).to(weight_dtype) y_mask = batch[2].squeeze(1).squeeze(1).to(weight_dtype) data_info = {'resolution': batch[3]['img_hw'].to(weight_dtype), 'aspect_ratio': 
batch[3]['aspect_ratio'].to(weight_dtype),} # Sample a random timestep for each image grad_norm = None with accelerator.accumulate(model): # Predict the noise residual optimizer.zero_grad() # Sample noise that we'll add to the latents noise = torch.randn_like(latents) bsz = latents.shape[0] # Sample a random timestep for each image t_n ~ U[0, N - k - 1] without bias. topk = config.train_sampling_steps // config.num_ddim_timesteps index = torch.randint(0, config.num_ddim_timesteps, (bsz,), device=latents.device).long() start_timesteps = solver.ddim_timesteps[index] timesteps = start_timesteps - topk timesteps = torch.where(timesteps < 0, torch.zeros_like(timesteps), timesteps) # Get boundary scalings for start_timesteps and (end) timesteps. c_skip_start, c_out_start = scalings_for_boundary_conditions(start_timesteps) c_skip_start, c_out_start = [append_dims(x, latents.ndim) for x in [c_skip_start, c_out_start]] c_skip, c_out = scalings_for_boundary_conditions(timesteps) c_skip, c_out = [append_dims(x, latents.ndim) for x in [c_skip, c_out]] # Sample a random guidance scale w from U[w_min, w_max] and embed it # w = (config.w_max - config.w_min) * torch.rand((bsz,)) + config.w_min w = config.cfg_scale * torch.ones((bsz,)) w = w.reshape(bsz, 1, 1, 1) w = w.to(device=latents.device, dtype=latents.dtype) # Get online LCM prediction on z_{t_{n + k}}, w, c, t_{n + k} _, pred_x_0, noisy_model_input = train_diffusion.training_losses_diffusers( model, latents, start_timesteps, model_kwargs=dict(encoder_hidden_states=y, encoder_attention_mask=y_mask, added_cond_kwargs=data_info), noise=noise ) model_pred = c_skip_start * noisy_model_input + c_out_start * pred_x_0 with torch.no_grad(): with torch.autocast("cuda"): cond_teacher_output, cond_pred_x0, _ = train_diffusion.training_losses_diffusers( model_teacher, latents, start_timesteps, model_kwargs=dict(encoder_hidden_states=y, encoder_attention_mask=y_mask, added_cond_kwargs=data_info), noise=noise ) # Get teacher model 
prediction on noisy_latents and unconditional embedding uncond_teacher_output, uncond_pred_x0, _ = train_diffusion.training_losses_diffusers( model_teacher, latents, start_timesteps, model_kwargs=dict(encoder_hidden_states=uncond_prompt_embeds, encoder_attention_mask=y_mask, added_cond_kwargs=data_info), noise=noise ) # Perform "CFG" to get x_prev estimate (using the LCM paper's CFG formulation) pred_x0 = cond_pred_x0 + w * (cond_pred_x0 - uncond_pred_x0) pred_noise = cond_teacher_output + w * (cond_teacher_output - uncond_teacher_output) x_prev = solver.ddim_step(pred_x0, pred_noise, index) # Get target LCM prediction on x_prev, w, c, t_n with torch.no_grad(): with torch.autocast("cuda", enabled=True): _, pred_x_0, _ = train_diffusion.training_losses_diffusers( model, x_prev.float(), timesteps, model_kwargs=dict(encoder_hidden_states=y, encoder_attention_mask=y_mask, added_cond_kwargs=data_info), skip_noise=True ) target = c_skip * x_prev + c_out * pred_x_0 # Calculate loss if config.loss_type == "l2": loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") elif config.loss_type == "huber": loss = torch.mean(torch.sqrt((model_pred.float() - target.float()) ** 2 + config.huber_c**2) - config.huber_c) accelerator.backward(loss) if accelerator.sync_gradients: grad_norm = accelerator.clip_grad_norm_(model.parameters(), config.gradient_clip) optimizer.step() lr_scheduler.step() optimizer.zero_grad(set_to_none=True) lr = lr_scheduler.get_last_lr()[0] logs = {"loss": accelerator.gather(loss).mean().item()} if grad_norm is not None: logs.update(grad_norm=accelerator.gather(grad_norm).mean().item()) log_buffer.update(logs) if (step + 1) % config.log_interval == 0 or (step + 1) == 1: t = (time.time() - last_tic) / config.log_interval t_d = data_time_all / config.log_interval avg_time = (time.time() - time_start) / (global_step + 1) eta = str(datetime.timedelta(seconds=int(avg_time * (total_steps - start_step - global_step - 1)))) eta_epoch = 
def parse_args():
    """Parse the command-line arguments of the LCM-LoRA training script.

    Returns:
        argparse.Namespace with ``config``, ``cloud``, ``work_dir``, ``resume_from``,
        ``local_rank``, ``debug`` and ``lora_rank``.
    """
    parser = argparse.ArgumentParser(description="Process some integers.")
    parser.add_argument("config", type=str, help="config")
    parser.add_argument("--cloud", action='store_true', default=False, help="cloud or local machine")
    parser.add_argument("--work-dir", default='output', help='the dir to save logs and models')
    parser.add_argument("--resume-from",
                        help='checkpoint to resume from: a checkpoint directory under '
                             '<work-dir>/checkpoints, or "latest" for the most recent one')
    # Both spellings are accepted; argparse stores them under the same ``local_rank`` dest.
    parser.add_argument("--local-rank", type=int, default=-1)
    parser.add_argument("--local_rank", type=int, default=-1)
    parser.add_argument("--debug", action='store_true')
    parser.add_argument("--lora_rank", type=int, default=64, help="The rank of the LoRA projection matrix.", )

    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = parse_args()
    config = read_config(args.config)
    config.resume_from = None
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        config.work_dir = args.work_dir
    if args.cloud:
        config.data_root = '/data/data'
    if args.resume_from is not None:
        config.resume_from = args.resume_from
    if args.debug:
        config.log_interval = 1
        config.train_batch_size = 4
        config.valid_num = 10
        config.save_model_steps = 10

    os.umask(0o000)
    os.makedirs(config.work_dir, exist_ok=True)

    init_handler = InitProcessGroupKwargs()
    init_handler.timeout = datetime.timedelta(seconds=5400)  # change timeout to avoid a strange NCCL bug
    # Initialize accelerator and tensorboard logging
    if config.use_fsdp:
        init_train = 'FSDP'
        from accelerate import FullyShardedDataParallelPlugin
        from torch.distributed.fsdp.fully_sharded_data_parallel import FullStateDictConfig
        set_fsdp_env()
        fsdp_plugin = FullyShardedDataParallelPlugin(state_dict_config=FullStateDictConfig(offload_to_cpu=False, rank0_only=False),)
    else:
        init_train = 'DDP'
        fsdp_plugin = None

    # Multi-scale batches come from a custom batch sampler, so even_batches is
    # turned off for them. This must be a real bool: the previous
    # ``even_batches=False,`` (trailing comma) bound the truthy tuple ``(False,)``.
    even_batches = not config.multi_scale

    accelerator = Accelerator(
        mixed_precision=config.mixed_precision,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        log_with="tensorboard",
        project_dir=os.path.join(config.work_dir, "logs"),
        fsdp_plugin=fsdp_plugin,
        even_batches=even_batches,
        kwargs_handlers=[init_handler]
    )

    logger = get_root_logger(os.path.join(config.work_dir, 'train_log.log'))
    logger.info(accelerator.state)

    config.seed = init_random_seed(config.get('seed', None))
    set_random_seed(config.seed)

    if accelerator.is_main_process:
        config.dump(os.path.join(config.work_dir, 'config.py'))

    logger.info(f"Config: \n{config.pretty_text}")
    logger.info(f"World_size: {get_world_size()}, seed: {config.seed}")
    logger.info(f"Initializing: {init_train} for training")
    image_size = config.image_size  # @param [256, 512]
    latent_size = int(image_size) // 8
    pred_sigma = getattr(config, 'pred_sigma', True)
    learn_sigma = getattr(config, 'learn_sigma', True) and pred_sigma

    # prepare null_embedding for training
    # NOTE(review): train() calls ``.to(...)`` directly on the object loaded from this
    # file; if ``encode_prompt`` returns a tuple (as in recent diffusers) only the
    # prompt-embedding tensor should be saved here -- confirm against the installed version.
    if not os.path.exists('output/pretrained_models/null_embed.pth'):
        logger.info(f"Creating output/pretrained_models/null_embed.pth")
        os.makedirs('output/pretrained_models/', exist_ok=True)
        pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16, use_safetensors=True,).to("cuda")
        torch.save(pipe.encode_prompt(""), 'output/pretrained_models/null_embed.pth')
        del pipe
        torch.cuda.empty_cache()

    # build models
    train_diffusion = IDDPM(str(config.train_sampling_steps), learn_sigma=learn_sigma, pred_sigma=pred_sigma, return_startx=True)
    model_teacher = Transformer2DModel.from_pretrained(config.load_from, subfolder="transformer")
    model_teacher.requires_grad_(False)

    model = Transformer2DModel.from_pretrained(config.load_from, subfolder="transformer").train()
    logger.info(f"{model.__class__.__name__} Model Parameters: {sum(p.numel() for p in model.parameters()):}")

    # NOTE(review): the rank is read from the config file; the ``--lora_rank`` CLI
    # flag parsed above is not consulted here -- confirm which one should win.
    lora_config = LoraConfig(
        r=config.lora_rank,
        target_modules=[
            "to_q",
            "to_k",
            "to_v",
            "to_out.0",
            "proj_in",
            "proj_out",
            "ff.net.0.proj",
            "ff.net.2",
            "proj",
            "linear",
            "linear_1",
            "linear_2",
            # "scale_shift_table",  # not available due to the implementation in huggingface/peft, working on it.
        ],
    )
    print(lora_config)
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # 9. Handle mixed precision and device placement
    # For mixed precision training we cast all non-trainable weights to half-precision
    # as these weights are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
        weight_dtype = torch.float16
    elif accelerator.mixed_precision == "bf16":
        weight_dtype = torch.bfloat16

    # 11. Enable optimizations
    # model.enable_xformers_memory_efficient_attention()
    # model_teacher.enable_xformers_memory_efficient_attention()

    lora_layers = filter(lambda p: p.requires_grad, model.parameters())

    # 10. Handle saving and loading of checkpoints
    # `accelerate` 0.16.0 will have better support for customized saving
    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
        def save_model_hook(models, weights, output_dir):
            """Save only the LoRA adapter of the first prepared model."""
            if accelerator.is_main_process:
                transformer_ = accelerator.unwrap_model(models[0])
                lora_state_dict = get_peft_model_state_dict(transformer_, adapter_name="default")
                StableDiffusionPipeline.save_lora_weights(os.path.join(output_dir, "transformer_lora"), lora_state_dict)
                # save weights in peft format to be able to load them back
                transformer_.save_pretrained(output_dir)

                for _ in models:
                    # make sure to pop weight so that corresponding model is not saved again
                    weights.pop()

        def load_model_hook(models, input_dir):
            """Load the LoRA adapter back into the first prepared model."""
            # load the LoRA into the model
            transformer_ = accelerator.unwrap_model(models[0])
            transformer_.load_adapter(input_dir, "default", is_trainable=True)

            for _ in range(len(models)):
                # pop models so that they are not loaded again
                models.pop()

        accelerator.register_save_state_pre_hook(save_model_hook)
        accelerator.register_load_state_pre_hook(load_model_hook)

    if config.grad_checkpointing:
        model.enable_gradient_checkpointing()

    if not config.data.load_vae_feat:
        vae = AutoencoderKL.from_pretrained(config.vae_pretrained).cuda()

    # prepare for FSDP clip grad norm calculation
    if accelerator.distributed_type == DistributedType.FSDP:
        for m in accelerator._models:
            m.clip_grad_norm_ = types.MethodType(clip_grad_norm_, m)

    # build dataloader
    set_data_root(config.data_root)
    dataset = build_dataset(config.data, resolution=image_size, aspect_ratio_type=config.aspect_ratio_type)
    if config.multi_scale:
        batch_sampler = AspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
                                                batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio, drop_last=True,
                                                ratio_nums=dataset.ratio_nums, config=config, valid_num=config.valid_num)
        # used for balanced sampling
        # batch_sampler = BalancedAspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
        #                                                 batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio,
        #                                                 ratio_nums=dataset.ratio_nums)
        train_dataloader = build_dataloader(dataset, batch_sampler=batch_sampler, num_workers=config.num_workers)
    else:
        train_dataloader = build_dataloader(dataset, num_workers=config.num_workers, batch_size=config.train_batch_size, shuffle=True)

    # build optimizer and lr scheduler
    lr_scale_ratio = 1
    if config.get('auto_lr', None):
        lr_scale_ratio = auto_scale_lr(config.train_batch_size * get_world_size() * config.gradient_accumulation_steps,
                                       config.optimizer, **config.auto_lr)
    optimizer = build_optimizer(model, config.optimizer)
    lr_scheduler = build_lr_scheduler(config, optimizer, train_dataloader, lr_scale_ratio)

    timestamp = time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())

    if accelerator.is_main_process:
        accelerator.init_trackers(f"tb_{timestamp}")

    start_epoch = 0
    start_step = 0
    total_steps = len(train_dataloader) * config.num_epochs

    solver = DDIMSolver(train_diffusion.alphas_cumprod, timesteps=config.train_sampling_steps, ddim_timesteps=config.num_ddim_timesteps)
    solver.to(accelerator.device)
    # Prepare everything
    # There is no specific order to remember, you just need to unpack the
    # objects in the same order you gave them to the prepare method.
    model, model_teacher = accelerator.prepare(model, model_teacher)
    optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler)

    if config.resume_from is not None:
        if config.resume_from != "latest":
            path = os.path.basename(config.resume_from)
        else:
            # Get the most recent checkpoint
            ckpt_root = os.path.join(config.work_dir, 'checkpoints')
            # A fresh work_dir has no checkpoints directory yet; treat that like
            # "no checkpoint found" instead of crashing in os.listdir.
            dirs = os.listdir(ckpt_root) if os.path.isdir(ckpt_root) else []
            dirs = [d for d in dirs if d.startswith("checkpoint")]
            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
            path = dirs[-1] if len(dirs) > 0 else None

        if path is None:
            accelerator.print(f"Checkpoint '{config.resume_from}' does not exist. Starting a new training run.")
            config.resume_from = None
        else:
            accelerator.print(f"Resuming from checkpoint {path}")
            accelerator.load_state(os.path.join(config.work_dir, 'checkpoints', path))
            # Checkpoint directories are named "checkpoint-<global step>".
            start_step = int(path.split("-")[1])
            start_epoch = start_step // len(train_dataloader)

    train(model)
# See the License for the specific language governing permissions and # limitations under the License. """Fine-tuning script for Stable Diffusion for text2image with support for LoRA.""" import argparse import logging import math import os import random import shutil from pathlib import Path from typing import List, Union import datasets import numpy as np import torch import torch.nn.functional as F import torch.utils.checkpoint import transformers import accelerate from accelerate import Accelerator from accelerate.logging import get_logger from accelerate.utils import ProjectConfiguration, set_seed from datasets import load_dataset from huggingface_hub import create_repo, upload_folder from packaging import version from peft import LoraConfig, get_peft_model_state_dict, get_peft_model, PeftModel from torchvision import transforms from tqdm.auto import tqdm import diffusers from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, StableDiffusionPipeline, PixArtAlphaPipeline, Transformer2DModel from transformers import T5EncoderModel, T5Tokenizer from diffusers.optimization import get_scheduler from diffusers.training_utils import compute_snr from diffusers.utils import check_min_version, is_wandb_available from diffusers.utils.import_utils import is_xformers_available # Will error if the minimal version of diffusers is not installed. Remove at your own risks. 
# TODO: This function should be removed once training scripts are rewritten in PEFT
def text_encoder_lora_state_dict(text_encoder):
    """Collect the LoRA layer weights of a CLIP text encoder into one flat state dict.

    Keys have the form
    ``text_model.encoder.layers.{i}.self_attn.{proj}.lora_linear_layer.{param}``
    for ``proj`` in q_proj, k_proj, v_proj, out_proj. Encoders that are not
    ``CLIPTextModel`` / ``CLIPTextModelWithProjection`` yield an empty dict.
    """
    state_dict = {}

    def text_encoder_attn_modules(text_encoder):
        from transformers import CLIPTextModel, CLIPTextModelWithProjection

        attn_modules = []

        if isinstance(text_encoder, (CLIPTextModel, CLIPTextModelWithProjection)):
            for i, layer in enumerate(text_encoder.text_model.encoder.layers):
                name = f"text_model.encoder.layers.{i}.self_attn"
                mod = layer.self_attn
                attn_modules.append((name, mod))

        return attn_modules

    for name, module in text_encoder_attn_modules(text_encoder):
        # Same extraction for each attention projection, in the original key order.
        for proj_name in ("q_proj", "k_proj", "v_proj", "out_proj"):
            lora_layer = getattr(module, proj_name).lora_linear_layer
            for k, v in lora_layer.state_dict().items():
                state_dict[f"{name}.{proj_name}.lora_linear_layer.{k}"] = v

    return state_dict


def save_model_card(repo_id: str, images=None, base_model=str, dataset_name=str, repo_folder=None):
    """Write example images and a ``README.md`` model card into ``repo_folder``.

    Args:
        repo_id: Repository identifier shown in the card title.
        images: Optional iterable of objects with a ``save(path)`` method; each is
            written as ``image_{i}.png`` and linked from the card. ``None`` means
            no example images.
        base_model: Name of the base model the LoRA weights adapt.
        dataset_name: Name of the dataset used for fine-tuning.
        repo_folder: Existing directory that receives the images and the README.
    """
    img_str = ""
    # ``images`` defaults to None; iterating it directly raised TypeError.
    for i, image in enumerate(images or []):
        image.save(os.path.join(repo_folder, f"image_{i}.png"))
        img_str += f"![img_{i}](./image_{i}.png)\n"

    yaml = f"""
---
license: creativeml-openrail-m
base_model: {base_model}
tags:
- stable-diffusion
- stable-diffusion-diffusers
- text-to-image
- diffusers
- lora
inference: true
---
    """
    model_card = f"""
# LoRA text2image fine-tuning - {repo_id}
These are LoRA adaption weights for {base_model}. The weights were fine-tuned on the {dataset_name} dataset. You can find some example images in the following. \n
{img_str}
"""
    # Explicit encoding so non-ASCII repo or dataset names do not depend on the locale.
    with open(os.path.join(repo_folder, "README.md"), "w", encoding="utf-8") as f:
        f.write(yaml + model_card)
) parser.add_argument( "--num_validation_images", type=int, default=4, help="Number of images that should be generated during validation with `validation_prompt`.", ) parser.add_argument( "--validation_epochs", type=int, default=1, help=( "Run fine-tuning validation every X epochs. The validation process consists of running the prompt" " `args.validation_prompt` multiple times: `args.num_validation_images`." ), ) parser.add_argument( "--max_train_samples", type=int, default=None, help=( "For debugging purposes or quicker training, truncate the number of training examples to this " "value if set." ), ) parser.add_argument( "--output_dir", type=str, default="sd-model-finetuned-lora", help="The output directory where the model predictions and checkpoints will be written.", ) parser.add_argument( "--cache_dir", type=str, default=None, help="The directory where the downloaded models and datasets will be stored.", ) parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") parser.add_argument( "--resolution", type=int, default=512, help=( "The resolution for input images, all the images in the train/validation dataset will be resized to this" " resolution" ), ) parser.add_argument( "--center_crop", default=False, action="store_true", help=( "Whether to center crop the input images to the resolution. If not set, the images will be randomly" " cropped. The images will be resized to the resolution first before cropping." ), ) parser.add_argument( "--random_flip", action="store_true", help="whether to randomly flip images horizontally", ) parser.add_argument( "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader." ) parser.add_argument("--num_train_epochs", type=int, default=100) parser.add_argument( "--max_train_steps", type=int, default=None, help="Total number of training steps to perform. 
If provided, overrides num_train_epochs.", ) parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.", ) parser.add_argument( "--gradient_checkpointing", action="store_true", help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", ) parser.add_argument( "--learning_rate", type=float, default=1e-6, help="Initial learning rate (after the potential warmup period) to use.", ) parser.add_argument( "--scale_lr", action="store_true", default=False, help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", ) parser.add_argument( "--lr_scheduler", type=str, default="constant", help=( 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' ' "constant", "constant_with_warmup"]' ), ) parser.add_argument( "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." ) parser.add_argument( "--snr_gamma", type=float, default=None, help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. " "More details here: https://arxiv.org/abs/2303.09556.", ) parser.add_argument( "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." ) parser.add_argument( "--use_dora", action="store_true", default=False, help="Whether or not to use Dora. For more information, see" " https://huggingface.co/docs/peft/package_reference/lora#peft.LoraConfig.use_dora" ) parser.add_argument( "--use_rslora", action="store_true", default=False, help="Whether or not to use RS Lora. For more information, see" " https://huggingface.co/docs/peft/package_reference/lora#peft.LoraConfig.use_rslora" ) parser.add_argument( "--allow_tf32", action="store_true", help=( "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. 
For more information, see" " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" ), ) parser.add_argument( "--dataloader_num_workers", type=int, default=0, help=( "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." ), ) parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") # ----Diffusion Training Arguments---- parser.add_argument( "--proportion_empty_prompts", type=float, default=0, help="Proportion of image prompts to be replaced with empty strings. Defaults to 0 (no prompt replacement).", ) parser.add_argument( "--prediction_type", type=str, default=None, help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.", ) parser.add_argument( "--hub_model_id", type=str, default=None, help="The name of the repository to keep in sync with the local `output_dir`.", ) parser.add_argument( "--logging_dir", type=str, default="logs", help=( "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." 
), ) parser.add_argument( "--mixed_precision", type=str, default=None, choices=["no", "fp16", "bf16"], help=( "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the" " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." ), ) parser.add_argument( "--report_to", type=str, default="tensorboard", help=( 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' ), ) parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument( "--checkpointing_steps", type=int, default=500, help=( "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming" " training using `--resume_from_checkpoint`." ), ) parser.add_argument( "--checkpoints_total_limit", type=int, default=None, help=("Max number of checkpoints to store."), ) parser.add_argument( "--resume_from_checkpoint", type=str, default=None, help=( "Whether training should be resumed from a previous checkpoint. Use a path saved by" ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' ), ) parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." 
) parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.") parser.add_argument( "--rank", type=int, default=4, help=("The dimension of the LoRA update matrices."), ) parser.add_argument("--local-rank", type=int, default=-1) args = parser.parse_args() env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) if env_local_rank != -1 and env_local_rank != args.local_rank: args.local_rank = env_local_rank # Sanity checks if args.dataset_name is None and args.train_data_dir is None: raise ValueError("Need either a dataset name or a training folder.") if args.proportion_empty_prompts < 0 or args.proportion_empty_prompts > 1: raise ValueError("`--proportion_empty_prompts` must be in the range [0, 1].") return args DATASET_NAME_MAPPING = {"lambdalabs/pokemon-blip-captions": ("image", "text"),} def main(): args = parse_args() logging_dir = Path(args.output_dir, args.logging_dir) accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.report_to, project_config=accelerator_project_config, ) if args.report_to == "wandb": if not is_wandb_available(): raise ImportError("Make sure to install wandb if you want to use it for logging during training.") import wandb # Make one log on every process with the configuration for debugging. 
logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state, main_process_only=False) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_warning() diffusers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() diffusers.utils.logging.set_verbosity_error() # If passed along, set the training seed now. if args.seed is not None: set_seed(args.seed) # Handle the repository creation if accelerator.is_main_process: if args.output_dir is not None: os.makedirs(args.output_dir, exist_ok=True) if args.push_to_hub: repo_id = create_repo(repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token).repo_id # See Section 3.1. of the paper. max_length = 120 # For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora transformer) to half-precision # as these weights are only used for inference, keeping weights in full precision is not required. weight_dtype = torch.float32 if accelerator.mixed_precision == "fp16": weight_dtype = torch.float16 elif accelerator.mixed_precision == "bf16": weight_dtype = torch.bfloat16 # Load scheduler, tokenizer and models. 
noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler", torch_dtype=weight_dtype) tokenizer = T5Tokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision, torch_dtype=weight_dtype) text_encoder = T5EncoderModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, torch_dtype=weight_dtype) text_encoder.requires_grad_(False) text_encoder.to(accelerator.device) vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant, torch_dtype=weight_dtype) vae.requires_grad_(False) vae.to(accelerator.device) transformer = Transformer2DModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="transformer", torch_dtype=weight_dtype) # freeze parameters of models to save more memory transformer.requires_grad_(False) # Freeze the transformer parameters before adding adapters for param in transformer.parameters(): param.requires_grad_(False) lora_config = LoraConfig( r=args.rank, init_lora_weights="gaussian", target_modules=[ "to_k", "to_q", "to_v", "to_out.0", "proj_in", "proj_out", "ff.net.0.proj", "ff.net.2", "proj", "linear", "linear_1", "linear_2", # "scale_shift_table", # not available due to the implementation in huggingface/peft, working on it. 
], use_dora = args.use_dora, use_rslora = args.use_rslora ) # Move transformer, vae and text_encoder to device and cast to weight_dtype transformer.to(accelerator.device) def cast_training_params(model: Union[torch.nn.Module, List[torch.nn.Module]], dtype=torch.float32): if not isinstance(model, list): model = [model] for m in model: for param in m.parameters(): # only upcast trainable parameters into fp32 if param.requires_grad: param.data = param.to(dtype) transformer = get_peft_model(transformer, lora_config) if args.mixed_precision == "fp16": # only upcast trainable parameters (LoRA) into fp32 cast_training_params(transformer, dtype=torch.float32) transformer.print_trainable_parameters() # 10. Handle saving and loading of checkpoints # `accelerate` 0.16.0 will have better support for customized saving if version.parse(accelerate.__version__) >= version.parse("0.16.0"): # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format def save_model_hook(models, weights, output_dir): if accelerator.is_main_process: transformer_ = accelerator.unwrap_model(transformer) lora_state_dict = get_peft_model_state_dict(transformer_, adapter_name="default") StableDiffusionPipeline.save_lora_weights(os.path.join(output_dir, "transformer_lora"), lora_state_dict) # save weights in peft format to be able to load them back transformer_.save_pretrained(output_dir) for _, model in enumerate(models): # make sure to pop weight so that corresponding model is not saved again weights.pop() def load_model_hook(models, input_dir): # load the LoRA into the model transformer_ = accelerator.unwrap_model(transformer) transformer_.load_adapter(input_dir, "default", is_trainable=True) for _ in range(len(models)): # pop models so that they are not loaded again models.pop() accelerator.register_save_state_pre_hook(save_model_hook) accelerator.register_load_state_pre_hook(load_model_hook) if args.enable_xformers_memory_efficient_attention: if 
is_xformers_available(): import xformers xformers_version = version.parse(xformers.__version__) if xformers_version == version.parse("0.0.16"): logger.warn( "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." ) transformer.enable_xformers_memory_efficient_attention() else: raise ValueError("xformers is not available. Make sure it is installed correctly") lora_layers = filter(lambda p: p.requires_grad, transformer.parameters()) # Enable TF32 for faster training on Ampere GPUs, # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices if args.allow_tf32: torch.backends.cuda.matmul.allow_tf32 = True if args.gradient_checkpointing: transformer.enable_gradient_checkpointing() if args.scale_lr: args.learning_rate = args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes # Initialize the optimizer if args.use_8bit_adam: try: import bitsandbytes as bnb except ImportError: raise ImportError("Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`") optimizer_cls = bnb.optim.AdamW8bit else: optimizer_cls = torch.optim.AdamW optimizer = optimizer_cls( lora_layers, lr=args.learning_rate, betas=(args.adam_beta1, args.adam_beta2), weight_decay=args.adam_weight_decay, eps=args.adam_epsilon, ) # Get the datasets: you can either provide your own training and evaluation files (see below) # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub). # In distributed training, the load_dataset function guarantees that only one local process can concurrently # download the dataset. if args.dataset_name is not None: # Downloading and loading a dataset from the hub. 
dataset = load_dataset( args.dataset_name, args.dataset_config_name, cache_dir=args.cache_dir, data_dir=args.train_data_dir, ) else: data_files = {} if args.train_data_dir is not None: data_files["train"] = os.path.join(args.train_data_dir, "**") dataset = load_dataset( "imagefolder", data_files=data_files, cache_dir=args.cache_dir, ) # See more about loading custom images at # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder # Preprocessing the datasets. # We need to tokenize inputs and targets. column_names = dataset["train"].column_names # 6. Get the column names for input/target. dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None) if args.image_column is None: image_column = dataset_columns[0] if dataset_columns is not None else column_names[0] else: image_column = args.image_column if image_column not in column_names: raise ValueError( f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}" ) if args.caption_column is None: caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1] else: caption_column = args.caption_column if caption_column not in column_names: raise ValueError( f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}" ) # Preprocessing the datasets. # We need to tokenize input captions and transform the images. def tokenize_captions(examples, is_train=True, proportion_empty_prompts=0., max_length=120): captions = [] for caption in examples[caption_column]: if random.random() < proportion_empty_prompts: captions.append("") elif isinstance(caption, str): captions.append(caption) elif isinstance(caption, (list, np.ndarray)): # take a random caption if there are multiple captions.append(random.choice(caption) if is_train else caption[0]) else: raise ValueError( f"Caption column `{caption_column}` should contain either strings or lists of strings." 
) inputs = tokenizer(captions, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt") return inputs.input_ids, inputs.attention_mask # Preprocessing the datasets. train_transforms = transforms.Compose( [ transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR), transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution), transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x), transforms.ToTensor(), transforms.Normalize([0.5], [0.5]), ] ) def preprocess_train(examples): images = [image.convert("RGB") for image in examples[image_column]] examples["pixel_values"] = [train_transforms(image) for image in images] examples["input_ids"], examples['prompt_attention_mask'] = tokenize_captions(examples, proportion_empty_prompts=args.proportion_empty_prompts, max_length=max_length) return examples with accelerator.main_process_first(): if args.max_train_samples is not None: dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples)) # Set the training transforms train_dataset = dataset["train"].with_transform(preprocess_train) def collate_fn(examples): pixel_values = torch.stack([example["pixel_values"] for example in examples]) pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() input_ids = torch.stack([example["input_ids"] for example in examples]) prompt_attention_mask = torch.stack([example["prompt_attention_mask"] for example in examples]) return {"pixel_values": pixel_values, "input_ids": input_ids, 'prompt_attention_mask': prompt_attention_mask} # DataLoaders creation: train_dataloader = torch.utils.data.DataLoader( train_dataset, shuffle=True, collate_fn=collate_fn, batch_size=args.train_batch_size, num_workers=args.dataloader_num_workers, ) # Scheduler and math around the number of training steps. 
overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch overrode_max_train_steps = True lr_scheduler = get_scheduler( args.lr_scheduler, optimizer=optimizer, num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes, num_training_steps=args.max_train_steps * accelerator.num_processes, ) # Prepare everything with our `accelerator`. transformer, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(transformer, optimizer, train_dataloader, lr_scheduler) # We need to recalculate our total training steps as the size of the training dataloader may have changed. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if overrode_max_train_steps: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch # Afterwards we recalculate our number of training epochs args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) # We need to initialize the trackers we use, and also store our configuration. # The trackers initializes automatically on the main process. if accelerator.is_main_process: accelerator.init_trackers("text2image-fine-tune", config=vars(args)) # Train! total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") logger.info(f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_batch_size}") logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") global_step = 0 first_epoch = 0 # Potentially load in the weights and states from a previous save if args.resume_from_checkpoint: if args.resume_from_checkpoint != "latest": path = os.path.basename(args.resume_from_checkpoint) else: # Get the most recent checkpoint dirs = os.listdir(args.output_dir) dirs = [d for d in dirs if d.startswith("checkpoint")] dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) path = dirs[-1] if len(dirs) > 0 else None if path is None: accelerator.print( f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run." ) args.resume_from_checkpoint = None initial_global_step = 0 else: accelerator.print(f"Resuming from checkpoint {path}") accelerator.load_state(os.path.join(args.output_dir, path)) global_step = int(path.split("-")[1]) initial_global_step = global_step first_epoch = global_step // num_update_steps_per_epoch else: initial_global_step = 0 progress_bar = tqdm( range(0, args.max_train_steps), initial=initial_global_step, desc="Steps", # Only show the progress bar once on each machine. 
disable=not accelerator.is_local_main_process, ) for epoch in range(first_epoch, args.num_train_epochs): transformer.train() train_loss = 0.0 for step, batch in enumerate(train_dataloader): with accelerator.accumulate(transformer): # Convert images to latent space latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample() latents = latents * vae.config.scaling_factor # Sample noise that we'll add to the latents noise = torch.randn_like(latents) if args.noise_offset: # https://www.crosslabs.org//blog/diffusion-with-offset-noise noise += args.noise_offset * torch.randn((latents.shape[0], latents.shape[1], 1, 1), device=latents.device) bsz = latents.shape[0] # Sample a random timestep for each image timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device) timesteps = timesteps.long() # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) # Get the text embedding for conditioning prompt_embeds = text_encoder(batch["input_ids"], attention_mask=batch['prompt_attention_mask'])[0] prompt_attention_mask = batch['prompt_attention_mask'] # Get the target for loss depending on the prediction type if args.prediction_type is not None: # set prediction_type of scheduler if defined noise_scheduler.register_to_config(prediction_type=args.prediction_type) if noise_scheduler.config.prediction_type == "epsilon": target = noise elif noise_scheduler.config.prediction_type == "v_prediction": target = noise_scheduler.get_velocity(latents, noise, timesteps) else: raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") # Prepare micro-conditions. 
added_cond_kwargs = {"resolution": None, "aspect_ratio": None} if getattr(transformer, 'module', transformer).config.sample_size == 128: resolution = torch.tensor([args.resolution, args.resolution]).repeat(bsz, 1) aspect_ratio = torch.tensor([float(args.resolution / args.resolution)]).repeat(bsz, 1) resolution = resolution.to(dtype=weight_dtype, device=latents.device) aspect_ratio = aspect_ratio.to(dtype=weight_dtype, device=latents.device) added_cond_kwargs = {"resolution": resolution, "aspect_ratio": aspect_ratio} # Predict the noise residual and compute loss model_pred = transformer(noisy_latents, encoder_hidden_states=prompt_embeds, encoder_attention_mask=prompt_attention_mask, timestep=timesteps, added_cond_kwargs=added_cond_kwargs).sample.chunk(2, 1)[0] if args.snr_gamma is None: loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") else: # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556. # Since we predict the noise instead of x_0, the original formulation is slightly changed. # This is discussed in Section 4.2 of the same paper. snr = compute_snr(noise_scheduler, timesteps) if noise_scheduler.config.prediction_type == "v_prediction": # Velocity objective requires that we add one to SNR values before we divide by them. snr = snr + 1 mse_loss_weights = (torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / snr) loss = F.mse_loss(model_pred.float(), target.float(), reduction="none") loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights loss = loss.mean() # Gather the losses across all processes for logging (if we use distributed training). 
avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean() train_loss += avg_loss.item() / args.gradient_accumulation_steps # Backpropagate accelerator.backward(loss) if accelerator.sync_gradients: params_to_clip = lora_layers accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) optimizer.step() lr_scheduler.step() optimizer.zero_grad() # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: progress_bar.update(1) global_step += 1 accelerator.log({"train_loss": train_loss}, step=global_step) train_loss = 0.0 if global_step % args.checkpointing_steps == 0: if accelerator.is_main_process: # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` if args.checkpoints_total_limit is not None: checkpoints = os.listdir(args.output_dir) checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints if len(checkpoints) >= args.checkpoints_total_limit: num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 removing_checkpoints = checkpoints[0:num_to_remove] logger.info(f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints") logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") for removing_checkpoint in removing_checkpoints: removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint) shutil.rmtree(removing_checkpoint) save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") accelerator.save_state(save_path) unwrapped_transformer = accelerator.unwrap_model(transformer, keep_fp32_wrapper=False) transformer_lora_state_dict = get_peft_model_state_dict(unwrapped_transformer) StableDiffusionPipeline.save_lora_weights( save_directory=save_path, unet_lora_layers=transformer_lora_state_dict, 
safe_serialization=True, ) logger.info(f"Saved state to {save_path}") logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} progress_bar.set_postfix(**logs) if global_step >= args.max_train_steps: break if accelerator.is_main_process: if args.validation_prompt is not None and epoch % args.validation_epochs == 0: logger.info( f"Running validation... \n Generating {args.num_validation_images} images with prompt:" f" {args.validation_prompt}." ) # create pipeline pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, transformer=accelerator.unwrap_model(transformer, keep_fp32_wrapper=False), text_encoder=text_encoder, vae=vae, torch_dtype=weight_dtype, ) pipeline = pipeline.to(accelerator.device) pipeline.set_progress_bar_config(disable=True) # run inference generator = torch.Generator(device=accelerator.device) if args.seed is not None: generator = generator.manual_seed(args.seed) images = [] for _ in range(args.num_validation_images): images.append(pipeline(args.validation_prompt, num_inference_steps=20, generator=generator).images[0]) for tracker in accelerator.trackers: if tracker.name == "tensorboard": np_images = np.stack([np.asarray(img) for img in images]) tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC") if tracker.name == "wandb": tracker.log( { "validation": [wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images)] } ) del pipeline torch.cuda.empty_cache() # Save the lora layers accelerator.wait_for_everyone() if accelerator.is_main_process: transformer = accelerator.unwrap_model(transformer, keep_fp32_wrapper=False) transformer.save_pretrained(args.output_dir) lora_state_dict = get_peft_model_state_dict(transformer) StableDiffusionPipeline.save_lora_weights(os.path.join(args.output_dir, "transformer_lora"), lora_state_dict) if args.push_to_hub: save_model_card( repo_id, images=images, base_model=args.pretrained_model_name_or_path, 
dataset_name=args.dataset_name, repo_folder=args.output_dir, ) upload_folder( repo_id=repo_id, folder_path=args.output_dir, commit_message="End of training", ignore_patterns=["step_*", "epoch_*"], ) # Final inference # Load previous transformer transformer = Transformer2DModel.from_pretrained(args.pretrained_model_name_or_path, subfolder='transformer', torch_dtype=weight_dtype) # load lora weight transformer = PeftModel.from_pretrained(transformer, args.output_dir) # Load previous pipeline pipeline = DiffusionPipeline.from_pretrained(args.pretrained_model_name_or_path, transformer=transformer, text_encoder=text_encoder, vae=vae, torch_dtype=weight_dtype,) pipeline = pipeline.to(accelerator.device) del transformer torch.cuda.empty_cache() # run inference generator = torch.Generator(device=accelerator.device) if args.seed is not None: generator = generator.manual_seed(args.seed) images = [] for _ in range(args.num_validation_images): images.append(pipeline(args.validation_prompt, num_inference_steps=20, generator=generator).images[0]) if accelerator.is_main_process: for tracker in accelerator.trackers: if len(images) != 0: if tracker.name == "tensorboard": np_images = np.stack([np.asarray(img) for img in images]) tracker.writer.add_images("test", np_images, epoch, dataformats="NHWC") if tracker.name == "wandb": tracker.log( { "test": [ wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images) ] } ) accelerator.end_training() if __name__ == "__main__": main() ================================================ FILE: PixArt-alpha-ToCa-tools/clip_score.py ================================================ import os import torch from PIL import Image from torchvision.transforms import ToTensor from torchmetrics.multimodal.clip_score import CLIPScore from tqdm import tqdm import torch.multiprocessing as mp # Load prompts file def load_prompts(txt_file): with open(txt_file, "r") as f: prompts = f.read().splitlines() return prompts # Find 
# matching image file: first, directly use the prompt as the filename,
# and if not found, match using a prefix
def find_image_file(image_folder, prompt):
    """Return the path of the image that belongs to ``prompt``, or ``None``.

    Lookup order:
      1. ``<image_folder>/<prompt>.jpg`` (exact file name).
      2. The first file (in sorted order) whose name starts with the first
         20 characters of ``prompt``.

    Args:
        image_folder: Directory that holds the generated images.
        prompt: Caption text the image was generated from.

    Returns:
        The matching file path as a string, or ``None`` when nothing matches
        or ``prompt`` is empty.

    Raises:
        OSError: If ``image_folder`` cannot be listed.
    """
    if not prompt:
        # An empty prefix matches every file name, so a blank line in the
        # prompt file would otherwise be scored against an arbitrary image.
        return None

    img_path = os.path.join(image_folder, prompt + ".jpg")  # Assume filename is {prompt}.jpg
    if os.path.exists(img_path):
        return img_path

    # If direct match fails, fall back to prefix matching.
    prefix = prompt[:20]
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # selected file reproducible when several names share the prefix.
    for file_name in sorted(os.listdir(image_folder)):
        if file_name.startswith(prefix):
            return os.path.join(image_folder, file_name)
    return None


# Load a batch of images and convert them to Tensors
def load_images(image_folder, prompts_batch):
    """Load the images for ``prompts_batch`` into one stacked tensor.

    Prompts without a matching or readable image are reported on stdout and
    skipped. ``torch.cat`` along dim 0 requires every loaded image to have the
    same height and width.

    Args:
        image_folder: Directory that holds the generated images.
        prompts_batch: Sequence of prompt strings.

    Returns:
        ``(images_tensor, valid_prompts)`` where ``images_tensor`` has shape
        ``(N, C, H, W)`` and ``valid_prompts`` lists the N prompts that were
        loaded, or ``(None, None)`` when no image could be loaded.
    """
    to_tensor = ToTensor()  # build the transform once, not once per image
    images = []
    valid_prompts = []
    for prompt in prompts_batch:
        img_path = find_image_file(image_folder, prompt)
        if not img_path:
            print(f"No image found for prompt: {prompt}")
            continue
        try:
            # The context manager closes the underlying file handle as soon
            # as the pixel data has been converted.
            with Image.open(img_path) as img:
                image_tensor = to_tensor(img.convert("RGB")).unsqueeze(0)  # Shape (1, C, H, W)
        except Exception as e:  # best effort: one unreadable file must not abort the batch
            print(f"Error loading image {img_path}: {e}")
            continue
        images.append(image_tensor)
        valid_prompts.append(prompt)

    if not images:
        return None, None
    # Combine into a single batch (N, C, H, W)
    return torch.cat(images, dim=0), valid_prompts


# One CLIPScore metric per (model_path, device), created lazily. Loading the
# CLIP weights is far more expensive than scoring a batch, so the metric is
# reused across calls within the same process instead of being rebuilt.
_CLIP_METRIC_CACHE = {}


def _get_clip_metric(model_path, device):
    """Return a CLIPScore metric with empty state for ``model_path`` on ``device``."""
    key = (model_path, str(device))
    metric = _CLIP_METRIC_CACHE.get(key)
    if metric is None:
        metric = CLIPScore(model_name_or_path=model_path).to(device)
        _CLIP_METRIC_CACHE[key] = metric
    else:
        # Drop the state accumulated by the previous batch.
        metric.reset()
    return metric


# Single task: process a batch of prompts and corresponding images, and calculate CLIP Score
def process_batch(prompts_batch, image_folder, model_path, device):
    """Compute the CLIP score of one batch of prompts and their images.

    Args:
        prompts_batch: Sequence of prompt strings.
        image_folder: Directory that holds the generated images.
        model_path: Name or path passed to ``CLIPScore(model_name_or_path=...)``.
        device: Device the metric and the images are moved to.

    Returns:
        The metric value for this batch as a Python float, or ``None`` when no
        image of the batch could be loaded.
    """
    # Load image batch
    images_tensor, valid_prompts = load_images(image_folder, prompts_batch)
    if images_tensor is None:
        return None

    clip_score_metric = _get_clip_metric(model_path, device)
    images_tensor = images_tensor.to(device)
    # NOTE(review): ToTensor() yields float images in [0, 1]; confirm this is
    # the value range the installed torchmetrics CLIPScore expects.
    with torch.no_grad():  # Avoid building computation graph, reducing memory consumption
        # Calculate CLIP Score for each image and prompt
        for i, prompt in enumerate(valid_prompts):
            clip_score_metric.update(images_tensor[i].unsqueeze(0).float(), prompt)

    # Release memory
    del images_tensor
    torch.cuda.empty_cache()
    return clip_score_metric.compute().item()


# Split data into
# batches
def chunked(iterable, batch_size):
    """Yield successive ``batch_size``-sized slices of a sequence.

    ``iterable`` must support ``len()`` and slicing (for example a list).

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    for i in range(0, len(iterable), batch_size):
        yield iterable[i:i + batch_size]


def split_among_workers(items, num_workers):
    """Split ``items`` into ``num_workers`` contiguous shards without losing any.

    The first ``len(items) % num_workers`` shards receive one extra element,
    so every item lands in exactly one shard. Shards may be empty when there
    are fewer items than workers.

    Raises:
        ValueError: If ``num_workers`` is not a positive integer.
    """
    if num_workers <= 0:
        raise ValueError("num_workers must be a positive integer")
    base, extra = divmod(len(items), num_workers)
    shards = []
    start = 0
    for rank in range(num_workers):
        end = start + base + (1 if rank < extra else 0)
        shards.append(items[start:end])
        start = end
    return shards


# Default CLIP checkpoint used when the caller does not pass ``model_path``.
DEFAULT_CLIP_MODEL_PATH = "/root/autodl-tmp/pretrained_models/clip-vit-large-patch14"


# Main processing function
def main_worker(rank, prompts, image_folder, model_path, device, batch_size, queue):
    """Score one worker's share of the prompts and report through ``queue``.

    Queue protocol: the integer ``1`` is put after every processed batch as a
    progress signal, and exactly one list with the per-batch scores is put
    last.

    Args:
        rank: Index of this worker (not used by the computation).
        prompts: The prompts assigned to this worker.
        image_folder: Directory that holds the generated images.
        model_path: CLIP model name or path forwarded to ``process_batch``.
        device: Device forwarded to ``process_batch``.
        batch_size: Number of prompts per batch.
        queue: Object with a ``put`` method shared with the main process.
    """
    clip_scores = []
    try:
        for batch in chunked(prompts, batch_size):
            score = process_batch(batch, image_folder, model_path, device)
            if score is not None:
                clip_scores.append(score)
            # After processing each batch, send a progress signal to the main process
            queue.put(1)
    finally:
        # Always send the final list, even after a failure, so the main
        # process can account for this worker and keep the partial scores.
        queue.put(clip_scores)


def main(prompt_file="prompts.txt", image_folder="images", batch_size=64, num_workers=4,
         model_path=DEFAULT_CLIP_MODEL_PATH):
    """Compute and print the average CLIP score for a folder of images.

    Args:
        prompt_file: Text file with one prompt per line.
        image_folder: Directory that holds the generated images.
        batch_size: Number of prompts scored per batch.
        num_workers: Number of worker processes to spawn.
        model_path: CLIP model name or path.

    Raises:
        ValueError: If ``batch_size`` or ``num_workers`` is not positive.
    """
    from queue import Empty  # raised by Queue.get(timeout=...) when nothing arrives

    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Load prompts
    prompts = load_prompts(prompt_file)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Create multiprocessing queue
    result_queue = mp.Queue()

    # Distribute every prompt, including the remainder that plain integer
    # division would drop; workers with nothing to do are not started.
    shards = [shard for shard in split_among_workers(prompts, num_workers) if shard]
    # Each worker batches its own shard, so the number of progress signals is
    # the sum of the per-shard batch counts.
    total_batches = sum((len(shard) + batch_size - 1) // batch_size for shard in shards)

    # Start multiple processes
    processes = []
    for rank, worker_prompts in enumerate(shards):
        p = mp.Process(
            target=main_worker,
            args=(rank, worker_prompts, image_folder, model_path, device, batch_size, result_queue),
        )
        p.start()
        processes.append(p)

    all_scores = []
    pending_results = len(processes)
    # Use tqdm to create a progress bar
    with tqdm(total=total_batches, desc="Processing batches") as pbar:
        # Wait for one result list per worker. Stopping on the batch count
        # alone would abandon the list the last worker sends after its final
        # progress signal.
        while pending_results > 0:
            try:
                result = result_queue.get(timeout=5)
            except Empty:
                if any(p.is_alive() for p in processes):
                    continue
                # Every worker has exited and nothing is left to read.
                print(f"Warning: {pending_results} worker(s) exited without reporting results.")
                break
            if isinstance(result, list):  # If it's a list, it means final scores
                all_scores.extend(result)
                pending_results -= 1
            else:
                pbar.update(1)  # Update progress bar

    # Wait for subprocesses to end
    for p in processes:
        p.join()

    # Calculate final result
    # NOTE: this is the unweighted mean of the per-batch scores.
    if all_scores:
        final_clip_score = sum(all_scores) / len(all_scores)
        print(f"Final averaged CLIP Score for folder '{image_folder}': {final_clip_score}")
    else:
        print(f"No valid images found in folder '{image_folder}'.")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Calculate CLIP Score for images and prompts with batch parallel processing.")
    parser.add_argument("--prompt_file", type=str, default="/root/autodl-tmp/COCO/COCO_caption_prompts_30k.txt", help="Path to the prompts text file.")
    parser.add_argument("--image_folder", type=str, default="/root/autodl-tmp/vis/2024-09-04_custom_epochunknown_stepunknown_scale4.5_step20_size256_bs100_sampdpm-solver_seed0", help="Path to the folder containing images.")
    parser.add_argument("--batch_size", type=int, default=64, help="Number of images to process in each batch.")
    parser.add_argument("--num_workers", type=int, default=4, help="Number of parallel workers.")
    parser.add_argument("--model_path", type=str, default=DEFAULT_CLIP_MODEL_PATH, help="Name or path of the CLIP model.")
    args = parser.parse_args()

    # Set multiprocessing start method to 'spawn', suitable for CUDA
    mp.set_start_method('spawn', force=True)

    main(prompt_file=args.prompt_file, image_folder=args.image_folder, batch_size=args.batch_size,
         num_workers=args.num_workers, model_path=args.model_path)

# ================================================ FILE: README.md ================================================
# **[ICLR 2025]** *ToCa*: Accelerating Diffusion Transformers with *To*ken-wise Feature *Ca*ching

## 🔥 News * `2025/03/10` 🚀🚀 Our latest work "From Reusing to Forecasting: Accelerating Diffusion Models with TaylorSeers" is released! Code is available at [TaylorSeer](https://github.com/Shenyi-Z/TaylorSeer)! TaylorSeer supports lossless compression at a rate of 4.99x on FLUX.1-dev (with a latency speedup of 3.53x) and high-quality acceleration at a compression rate of 5.00x on HunyuanVideo (with a latency speedup of 4.65x)! We hope *TaylorSeer* can move the paradigm of feature caching methods from reusing to forecasting. For more details, please refer to our latest research [paper](https://arxiv.org/abs/2503.06923). * `2025/02/19` 🚀🚀 ToCa solution for **FLUX** has been officially released after adjustments, now achieving up to **3.14× lossless acceleration**! * `2025/01/22` 💥💥 ToCa is honored to be accepted by ICLR 2025! * `2024/12/29` 🚀🚀 We release our work [DuCa](https://arxiv.org/abs/2412.18911) about accelerating diffusion transformers for FREE, which achieves nearly lossless acceleration of **2.50×** on [OpenSora](https://github.com/hpcaitech/Open-Sora)! 🎉 **DuCa also overcomes the limitation of ToCa by fully supporting FlashAttention, enabling broader compatibility and efficiency improvements.** * `2024/12/24` 🤗🤗 We release an open-source repo "[Awesome-Token-Reduction-for-Model-Compression](https://github.com/xuyang-liu16/Awesome-Token-Reduction-for-Model-Compression)", which collects recent awesome token reduction papers! Feel free to contribute your suggestions! * `2024/12/20` 💥💥 Our ToCa has achieved nearly lossless acceleration of **1.51×** on [FLUX](https://huggingface.co/spaces/black-forest-labs/FLUX.1-schnell), feel free to check the latest version of our [paper](https://arxiv.org/pdf/2410.05317#page=19)! * `2024/12/10` 💥💥 Our team's recent work, **SiTo** (https://github.com/EvelynZhang-epiclab/SiTo), has been accepted to **AAAI 2025**. It accelerates diffusion models through adaptive **Token Pruning**.
* `2024/10/16` 🤗🤗 Users with autodl accounts can now quickly experience [OpenSora-ToCa](https://www.codewithgpu.com/i/Shenyi-Z/ToCa/OpenSora-ToCa) by directly using our publicly available image! * `2024/10/12` 🚀🚀 We release our work [ToCa](https://arxiv.org/abs/2410.05317) about accelerating diffusion transformers for FREE, which achieves nearly lossless acceleration of **2.36×** on [OpenSora](https://github.com/hpcaitech/Open-Sora)! * `2024/07/15` 🤗🤗 We release an open-source repo "[Awesome-Generation-Acceleration](https://github.com/xuyang-liu16/Awesome-Generation-Acceleration)", which collects recent awesome generation acceleration papers! Feel free to contribute your suggestions! ## TODO: - [x] Support for FLOPs calculation - [x] Add the FLUX version of ToCa - [ ] Further optimize the code logic to reduce the time consumption of tensor operations ## Dependencies ``` cmd Python>=3.9 CUDA>=11.8 ``` ## 🛠 Installation ``` cmd git clone https://github.com/Shenyi-Z/ToCa.git ``` ### Environment Settings #### Original Models (recommended) We evaluated our model under the same environments as the original models, so you can set up each environment by following the requirements of the corresponding original model. Links: | Original Models | urls | | :--------------: | :------------------------------------------: | | DiT | https://github.com/facebookresearch/DiT | | PixArt-α | https://github.com/PixArt-alpha/PixArt-alpha | | OpenSora | https://github.com/hpcaitech/Open-Sora | | FLUX | https://github.com/black-forest-labs/flux | Besides, we provide a replica for our environment here:
From our environment.yaml ##### DiT ```bash cd DiT-ToCa conda env create -f environment-dit.yml ``` ##### PixArt-α ```bash cd PixArt-alpha-ToCa conda env create -f environment-pixart.yml ``` ##### OpenSora ```bash cd Open-Sora conda env create -f environment-opensora.yml pip install -v . # for development mode, `pip install -v -e .` ```
## 🚀 Run and evaluation ### Run DiT-ToCa #### DDPM-250 Steps sample images for **visualization** ```bash cd DiT-ToCa python sample.py --image-size 256 --num-sampling-steps 250 --cache-type attention --fresh-threshold 4 --fresh-ratio 0.07 --ratio-scheduler ToCa-ddpm250 --force-fresh global --soft-fresh-weight 0.25 ``` sample images for **evaluation** (e.g 50k) ```bash cd DiT-ToCa torchrun --nnodes=1 --nproc_per_node=6 sample_ddp.py --model DiT-XL/2 --per-proc-batch-size 150 --image-size 256 --cfg-scale 1.5 --num-sampling-steps 250 --cache-type attention --fresh-ratio 0.07 --ratio-scheduler ToCa-ddpm250 --force-fresh global --fresh-threshold 4 --soft-fresh-weight 0.25 --num-fid-samples 50000 ``` #### DDIM-50 Steps sample images for **visualization** ```bash cd DiT-ToCa python sample.py --image-size 256 --num-sampling-steps 50 --cache-type attention --fresh-threshold 3 --fresh-ratio 0.07 --ratio-scheduler ToCa-ddim50 --force-fresh global --soft-fresh-weight 0.25 --ddim-sample ``` sample images for **evaluation** (e.g 50k) ```bash cd DiT-ToCa torchrun --nnodes=1 --nproc_per_node=6 sample_ddp.py --model DiT-XL/2 --per-proc-batch-size 150 --image-size 256 --cfg-scale 1.5 --num-sampling-steps 50 --cache-type attention --fresh-ratio 0.07 --ratio-scheduler ToCa-ddim50 --force-fresh global --fresh-threshold 3 --soft-fresh-weight 0.25 --num-fid-samples 50000 --ddim-sample ``` #### test FLOPs Just add --test-FLOPs, here an example: ```bash cd DiT-ToCa python sample.py --image-size 256 --num-sampling-steps 50 --cache-type attention --fresh-threshold 3 --fresh-ratio 0.07 --ratio-scheduler ToCa-ddim50 --force-fresh global --soft-fresh-weight 0.25 --ddim-sample --test-FLOPs ``` ### Run PixArt-α-ToCa sample images for **visualization** ```bash cd PixArt-alpha-ToCa python scripts/inference.py --model_path /root/autodl-tmp/pretrained_models/PixArt-XL-2-256x256.pth --image_size 256 --bs 100 --txt_file /root/autodl-tmp/test.txt --fresh_threshold 3 --fresh_ratio 0.30 --cache_type 
attention --force_fresh global --soft_fresh_weight 0.25 --ratio_scheduler ToCa ``` sample images for **evaluation** (e.g., 30k for COCO, 1.6k for PartiPrompts) ```bash cd PixArt-alpha-ToCa torchrun --nproc_per_node=6 scripts/inference_ddp.py --model_path /root/autodl-tmp/pretrained_models/PixArt-XL-2-256x256.pth --image_size 256 --bs 100 --txt_file /root/autodl-tmp/COCO/COCO_caption_prompts_30k.txt --fresh_threshold 3 --fresh_ratio 0.30 --cache_type attention --force_fresh global --soft_fresh_weight 0.25 --ratio_scheduler ToCa ``` (If you need our npz file, it is available here: https://drive.google.com/file/d/1vUdoSgdIvtXo1cAS_aOFCJ1-XC_i1KEQ/view?usp=sharing) ### Run OpenSora-ToCa sample video for **visualization** ```bash cd Open-Sora python scripts/inference.py configs/opensora-v1-2/inference/sample.py --num-frames 2s --resolution 480p --aspect-ratio 9:16 --prompt "a beautiful waterfall" ``` sample video for **VBench evaluation** ```bash cd Open-Sora bash eval/vbench/launch.sh /root/autodl-tmp/pretrained_models/hpcai-tech/OpenSora-STDiT-v3/model.safetensors 51 opensora-ToCa 480p 9:16 ``` (remember to replace "/root/autodl-tmp/pretrained_models/hpcai-tech/OpenSora-STDiT-v3/model.safetensors" with your own path!) ### Run FLUX-ToCa First, you need to enter the environment adapted for FLUX. While the official documentation uses `venv` to build the environment, you can also set it up using `conda`, which you might be more familiar with.
How to build a conda environment for FLUX? ```bash cd flux-ToCa conda create -n flux python=3.10 conda activate flux pip install -e ".[all]" ```
For interactive sampling run ```bash python -m flux --name <name> --loop ``` Or to generate a single sample run ```bash python -m flux --name <name> \ --height <height> --width <width> \ --prompt "<prompt>" ``` Typically, `<name>` should be set to `flux-dev`. Generate image samples with a txt file ```bash python src/sample.py --prompt_file <prompt_file> --width 1024 --height 1024 --model_name flux-dev --add_sampling_metadata --output_dir <output_dir> --num_steps 50 ``` The `--add_sampling_metadata` parameter is used to control whether the prompt is added to the image's EXIF metadata. We also provide a mode for FLOPs testing, but **in this mode, no generated samples are given**. ```bash python src/sample.py --prompt_file <prompt_file> --width 1024 --height 1024 --model_name flux-dev --add_sampling_metadata --output_dir <output_dir> --num_steps 50 --test_FLOPs ``` Use the framework of Geneval for evaluation ```bash python src/geneval_flux.py /root/geneval/prompts/evaluation_metadata.jsonl --model_name flux-dev --n_samples 4 --steps 50 --width 1024 --height 1024 --seed 42 --output_dir /root/autodl-tmp/samples/flux-ToCa ```
How to prepare environment for geneval? The environment required for Geneval's metric computation is somewhat specific. As of February 2025, it is not yet possible to set up the environment directly using the default method provided in the project. However, we can follow the guidance in this Geneval issue [https://github.com/djghosh13/geneval/issues/12](https://github.com/djghosh13/geneval/issues/12) to set up the environment. The instructions are very detailed.
#### Awesome acceleration results for the Latest Version of ToCa on FLUX | Method | Geneval $\uparrow$
overall score | ImageReward $\uparrow$
DrawBench200 | FLOPs $\downarrow$ | Latency $\downarrow$ | Compress Ratio $\uparrow$ | Speed Up $\uparrow$ | | ------------ | :-----------------------------------: | :-------------------------------------: | :----------------: | :------------------: | :-----------------------: | :-----------------: | | **original** | 0.6752 | 0.9898 | 3719.50 | 33.87s | 1.00 | 1.00 | | 60% steps | 0.6700 | 0.9739 | 2231.70 | 20.49s | 1.67 | 1.65 | | 50% steps | 0.6656 | 0.9429 | 1859.75 | 17.12s | 2.00 | 1.98 | | 40% steps | 0.6606 | 0.9317 | 1487.80 | 13.77s | 2.62 | 2.45 | | **FORA3** | 0.6594 | 0.9227 | 1320.07 | 12.98s | 2.82 | 2.61 | | **ToCa4-01** | 0.6748 | **0.9798** | 1263.22 | 11.91s | 2.94 | 2.84 | | **ToCa5-01** | **0.6750** | 0.9731 | 1126.76 | 10.80s | 3.30 | 3.14 | | **ToCa6-01** | 0.6653 | 0.9493 | 990.30 | 9.48s | 3.76 | 3.57 |
Explanation of the Improved ToCa The **acceleration effect has significantly improved while maintaining generation quality** compared with the previous version. This is because, in the current version of the code, we have further optimized ToCa and adopted more reliable metrics (Image Reward on DrawBench200, Geneval).
## 👍 Acknowledgements - Thanks to [DiT](https://github.com/facebookresearch/DiT) for their great work and codebase upon which we build DiT-ToCa. - Thanks to [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha) for their great work and codebase upon which we build PixArt-α-ToCa. - Thanks to [OpenSora](https://github.com/hpcaitech/Open-Sora) for their great work and codebase upon which we build OpenSora-ToCa. - Thanks to [FLUX](https://github.com/black-forest-labs/flux) for their great work and codebase upon which we build FLUX-ToCa. ## 📌 Citation ```bibtex @article{zou2024accelerating, title={Accelerating Diffusion Transformers with Token-wise Feature Caching}, author={Zou, Chang and Liu, Xuyang and Liu, Ting and Huang, Siteng and Zhang, Linfeng}, journal={arXiv preprint arXiv:2410.05317}, year={2024} } ``` ## :e-mail: Contact If you have any questions, please email [`shenyizou@outlook.com`](mailto:shenyizou@outlook.com). ================================================ FILE: flux-ToCa/.gitignore ================================================ # Created by https://www.toptal.com/developers/gitignore/api/linux,windows,macos,visualstudiocode,python # Edit at https://www.toptal.com/developers/gitignore?templates=linux,windows,macos,visualstudiocode,python ### Linux ### *~ # temporary files which can be created if a process still has a handle open of a deleted file .fuse_hidden* # KDE directory preferences .directory # Linux trash folder which might appear on any partition or disk .Trash-* # .nfs files are created when an open file is removed but is still being accessed .nfs* ### macOS ### # General .DS_Store .AppleDouble .LSOverride # Icon must end with two \r Icon # Thumbnails ._* # Files that might appear in the root of a volume .DocumentRevisions-V100 .fseventsd .Spotlight-V100 .TemporaryItems .Trashes .VolumeIcon.icns .com.apple.timemachine.donotpresent # Directories potentially created on remote AFP share .AppleDB .AppleDesktop Network Trash Folder Temporary 
Items .apdisk ### Python ### # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ ### VisualStudioCode ### .vscode/* !.vscode/settings.json !.vscode/tasks.json !.vscode/launch.json !.vscode/extensions.json *.code-workspace # Local History for Visual Studio Code .history/ ### VisualStudioCode Patch ### # Ignore all local history of files .history .ionide ### Windows ### # Windows thumbnail cache files Thumbs.db Thumbs.db:encryptable ehthumbs.db ehthumbs_vista.db # Dump file *.stackdump # Folder config file [Dd]esktop.ini # Recycle Bin used on file shares $RECYCLE.BIN/ # Windows Installer files *.cab *.msi *.msix *.msm *.msp # Windows shortcuts *.lnk # End of https://www.toptal.com/developers/gitignore/api/linux,windows,macos,visualstudiocode,python ================================================ FILE: flux-ToCa/LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. 
For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: flux-ToCa/README.md ================================================ # FLUX by Black Forest Labs: https://blackforestlabs.ai. Documentation for our API can be found here: [docs.bfl.ml](https://docs.bfl.ml/). ![grid](assets/grid.jpg) This repo contains minimal inference code to run image generation & editing with our Flux models. ## Local installation ```bash cd $HOME && git clone https://github.com/black-forest-labs/flux cd $HOME/flux # Using pyvenv python3.10 -m venv .venv source .venv/bin/activate pip install -e ".[all]" ``` ### Models We are offering an extensive suite of models. For more information about the individual models, please refer to the link under **Usage**. 
| Name | Usage | HuggingFace repo | License | | --------------------------- | ---------------------------------------------------------- | -------------------------------------------------------------- | --------------------------------------------------------------------- | | `FLUX.1 [schnell]` | [Text to Image](docs/text-to-image.md) | https://huggingface.co/black-forest-labs/FLUX.1-schnell | [apache-2.0](model_licenses/LICENSE-FLUX1-schnell) | | `FLUX.1 [dev]` | [Text to Image](docs/text-to-image.md) | https://huggingface.co/black-forest-labs/FLUX.1-dev | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) | | `FLUX.1 Fill [dev]` | [In/Out-painting](docs/fill.md) | https://huggingface.co/black-forest-labs/FLUX.1-Fill-dev | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) | | `FLUX.1 Canny [dev]` | [Structural Conditioning](docs/structural-conditioning.md) | https://huggingface.co/black-forest-labs/FLUX.1-Canny-dev | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) | | `FLUX.1 Depth [dev]` | [Structural Conditioning](docs/structural-conditioning.md) | https://huggingface.co/black-forest-labs/FLUX.1-Depth-dev | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) | | `FLUX.1 Canny [dev] LoRA` | [Structural Conditioning](docs/structural-conditioning.md) | https://huggingface.co/black-forest-labs/FLUX.1-Canny-dev-lora | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) | | `FLUX.1 Depth [dev] LoRA` | [Structural Conditioning](docs/structural-conditioning.md) | https://huggingface.co/black-forest-labs/FLUX.1-Depth-dev-lora | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) | | `FLUX.1 Redux [dev]` | [Image variation](docs/image-variation.md) | https://huggingface.co/black-forest-labs/FLUX.1-Redux-dev | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) | | `FLUX.1 [pro]` | [Text to Image](docs/text-to-image.md) | [Available in our 
API.](https://docs.bfl.ml/) | | | `FLUX1.1 [pro]` | [Text to Image](docs/text-to-image.md) | [Available in our API.](https://docs.bfl.ml/) | | | `FLUX1.1 [pro] Ultra/raw` | [Text to Image](docs/text-to-image.md) | [Available in our API.](https://docs.bfl.ml/) | | | `FLUX.1 Fill [pro]` | [In/Out-painting](docs/fill.md) | [Available in our API.](https://docs.bfl.ml/) | | | `FLUX.1 Canny [pro]` | [Structural Conditioning](docs/structural-conditioning.md) | [Available in our API.](https://docs.bfl.ml/) | | | `FLUX.1 Depth [pro]` | [Structural Conditioning](docs/structural-conditioning.md) | [Available in our API.](https://docs.bfl.ml/) | | | `FLUX1.1 Redux [pro]` | [Image variation](docs/image-variation.md) | [Available in our API.](https://docs.bfl.ml/) | | | `FLUX1.1 Redux [pro] Ultra` | [Image variation](docs/image-variation.md) | [Available in our API.](https://docs.bfl.ml/) | | The weights of the autoencoder are also released under [apache-2.0](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/apache-2.0.md) and can be found in the HuggingFace repos above. ## API usage Our API offers access to our models. It is documented here: [docs.bfl.ml](https://docs.bfl.ml/). In this repository we also offer an easy python interface. To use this, you first need to register with the API on [api.bfl.ml](https://api.bfl.ml/), and create a new API key. To use the API key either run `export BFL_API_KEY=` or provide it via the `api_key=` parameter. It is also expected that you have installed the package as above. Usage from python: ```python from flux.api import ImageRequest # this will create an api request directly but not block until the generation is finished request = ImageRequest("A beautiful beach", name="flux.1.1-pro") # or: request = ImageRequest("A beautiful beach", name="flux.1.1-pro", api_key="your_key_here") # any of the following will block until the generation is finished request.url # -> https:<...>/sample.jpg request.bytes # -> b"..." 
# Generated images whose "nsfw" classifier score reaches this value are withheld.
NSFW_THRESHOLD = 0.85


def get_models(name: str, device: torch.device, offload: bool, is_schnell: bool):
    """Load every network used by the Gradio demo.

    Args:
        name: Model name forwarded to ``load_flow_model`` and ``load_ae``.
        device: Device for the T5/CLIP encoders and the NSFW classifier.
        offload: When true, the flow model and the autoencoder are loaded on
            the CPU instead of ``device``.
        is_schnell: Selects the T5 ``max_length`` (256 when true, else 512).

    Returns:
        The tuple ``(model, ae, t5, clip, nsfw_classifier)``.
    """
    t5 = load_t5(device, max_length=256 if is_schnell else 512)
    clip = load_clip(device)
    model = load_flow_model(name, device="cpu" if offload else device)
    ae = load_ae(name, device="cpu" if offload else device)
    nsfw_classifier = pipeline("image-classification", model="Falconsai/nsfw_image_detection", device=device)
    return model, ae, t5, clip, nsfw_classifier


class FluxGenerator:
    """Holds the loaded networks and runs one text-to-image / image-to-image sample."""

    def __init__(self, model_name: str, device: str, offload: bool):
        """Load all models for ``model_name`` via ``get_models``.

        Args:
            model_name: Model to load; ``"flux-schnell"`` switches on the
                schnell-specific settings (``self.is_schnell``).
            device: Device string, converted with ``torch.device``.
            offload: Keep networks on the CPU while they are not in use.
        """
        self.device = torch.device(device)
        self.offload = offload
        self.model_name = model_name
        self.is_schnell = model_name == "flux-schnell"
        self.model, self.ae, self.t5, self.clip, self.nsfw_classifier = get_models(
            model_name,
            device=self.device,
            offload=self.offload,
            is_schnell=self.is_schnell,
        )

    @torch.inference_mode()
    def generate_image(
        self,
        width,
        height,
        num_steps,
        guidance,
        seed,
        prompt,
        init_image=None,
        image2image_strength=0.0,
        add_sampling_metadata=True,
    ):
        """Generate one image and save it unless the NSFW classifier flags it.

        Args:
            width: Output width passed to ``SamplingOptions``.
            height: Output height passed to ``SamplingOptions``.
            num_steps: Number of sampling steps.
            guidance: Guidance value forwarded to the denoiser.
            seed: Anything ``int()`` accepts; ``-1`` requests a random seed.
            prompt: Text prompt.
            init_image: Optional start image for image-to-image. A NumPy array
                is treated as height x width x channels with values in
                [0, 255]. Any other value is used as-is and is presumably
                already a ``(1, C, H, W)`` tensor in [-1, 1] -- TODO confirm
                against callers.
            image2image_strength: Fraction of the schedule that is still run
                when ``init_image`` is given; larger values keep more of the
                timesteps (and therefore more noise).
            add_sampling_metadata: Store the prompt in the EXIF
                ``ImageDescription`` tag of the saved file.

        Returns:
            ``(image, seed_string, filename, None)`` when the image passes the
            NSFW check, otherwise ``(None, seed_string, None, warning_text)``.
        """
        seed = int(seed)
        if seed == -1:
            seed = None

        opts = SamplingOptions(
            prompt=prompt,
            width=width,
            height=height,
            num_steps=num_steps,
            guidance=guidance,
            seed=seed,
        )

        if opts.seed is None:
            opts.seed = torch.Generator(device="cpu").seed()
        print(f"Generating '{opts.prompt}' with seed {opts.seed}")
        t0 = time.perf_counter()

        if init_image is not None:
            if isinstance(init_image, np.ndarray):
                # The decode path below treats autoencoder pixels as [-1, 1]
                # (clamp(-1, 1) followed by 127.5 * (x + 1.0)), so the encoder
                # input must use the same range. The previous "/ 255.0" fed
                # it [0, 1] instead.
                init_image = torch.from_numpy(init_image).permute(2, 0, 1).float() / 127.5 - 1.0
                init_image = init_image.unsqueeze(0)
            init_image = init_image.to(self.device)
            init_image = torch.nn.functional.interpolate(init_image, (opts.height, opts.width))
            if self.offload:
                self.ae.encoder.to(self.device)
            # init_image already lives on self.device; the former argument-less
            # ``.to()`` call did nothing and has been dropped.
            init_image = self.ae.encode(init_image)
            if self.offload:
                self.ae = self.ae.cpu()
                torch.cuda.empty_cache()

        # prepare input
        x = get_noise(
            1,
            opts.height,
            opts.width,
            device=self.device,
            dtype=torch.bfloat16,
            seed=opts.seed,
        )
        timesteps = get_schedule(
            opts.num_steps,
            x.shape[-1] * x.shape[-2] // 4,
            shift=(not self.is_schnell),
        )
        if init_image is not None:
            # Skip the first part of the schedule and blend noise with the
            # encoded start image at the timestep where sampling resumes.
            t_idx = int((1 - image2image_strength) * num_steps)
            t = timesteps[t_idx]
            timesteps = timesteps[t_idx:]
            x = t * x + (1.0 - t) * init_image.to(x.dtype)

        if self.offload:
            self.t5, self.clip = self.t5.to(self.device), self.clip.to(self.device)
        inp = prepare(t5=self.t5, clip=self.clip, img=x, prompt=opts.prompt)

        # offload TEs to CPU, load model to gpu
        if self.offload:
            self.t5, self.clip = self.t5.cpu(), self.clip.cpu()
            torch.cuda.empty_cache()
            self.model = self.model.to(self.device)

        # denoise initial noise
        x = denoise_cache(self.model, **inp, timesteps=timesteps, guidance=opts.guidance)

        # offload model, load autoencoder to gpu
        if self.offload:
            self.model.cpu()
            torch.cuda.empty_cache()
            self.ae.decoder.to(x.device)

        # decode latents to pixel space
        x = unpack(x.float(), opts.height, opts.width)
        with torch.autocast(device_type=self.device.type, dtype=torch.bfloat16):
            x = self.ae.decode(x)

        if self.offload:
            self.ae.decoder.cpu()
            torch.cuda.empty_cache()

        t1 = time.perf_counter()

        print(f"Done in {t1 - t0:.1f}s.")

        # bring into PIL format
        x = x.clamp(-1, 1)
        x = embed_watermark(x.float())
        x = rearrange(x[0], "c h w -> h w c")

        img = Image.fromarray((127.5 * (x + 1.0)).cpu().byte().numpy())
        # ``pred`` (not ``x``) so the loop variable does not reuse the latent's name.
        nsfw_score = [pred["score"] for pred in self.nsfw_classifier(img) if pred["label"] == "nsfw"][0]

        if nsfw_score < NSFW_THRESHOLD:
            filename = f"output/gradio/{uuid.uuid4()}.jpg"
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            exif_data = Image.Exif()
            if init_image is None:
                exif_data[ExifTags.Base.Software] = "AI generated;txt2img;flux"
            else:
                exif_data[ExifTags.Base.Software] = "AI generated;img2img;flux"
            exif_data[ExifTags.Base.Make] = "Black Forest Labs"
            exif_data[ExifTags.Base.Model] = self.model_name
            if add_sampling_metadata:
                exif_data[ExifTags.Base.ImageDescription] = prompt

            img.save(filename, format="jpeg", exif=exif_data, quality=95, subsampling=0)

            return img, str(opts.seed), filename, None
        else:
            return None, str(opts.seed), None, "Your generated image may contain NSFW content."


def create_demo(
    model_name: str, device: str = "cuda" if torch.cuda.is_available() else "cpu", offload: bool = False
):
    """Build the Gradio UI around a ``FluxGenerator``.

    Args:
        model_name: Model to load; for ``"flux-schnell"`` the image-to-image
            checkbox and the guidance slider are non-interactive and the
            default step count is 4 instead of 50.
        device: Device string handed to ``FluxGenerator``.
        offload: Forwarded to ``FluxGenerator``.

    Returns:
        The ``gr.Blocks`` demo, not yet launched.
    """
    generator = FluxGenerator(model_name, device, offload)
    is_schnell = model_name == "flux-schnell"

    with gr.Blocks() as demo:
        gr.Markdown(f"# Flux Image Generation Demo - Model: {model_name}")

        with gr.Row():
            with gr.Column():
                prompt = gr.Textbox(
                    label="Prompt",
                    value='a photo of a forest with mist swirling around the tree trunks. The word "FLUX" is painted over it in big, red brush strokes with visible texture',
                )
                do_img2img = gr.Checkbox(label="Image to Image", value=False, interactive=not is_schnell)
                # Hidden until the "Image to Image" checkbox is ticked (see update_img2img).
                init_image = gr.Image(label="Input Image", visible=False)
                image2image_strength = gr.Slider(
                    0.0, 1.0, 0.8, step=0.1, label="Noising strength", visible=False
                )

                with gr.Accordion("Advanced Options", open=False):
                    width = gr.Slider(128, 8192, 1360, step=16, label="Width")
                    height = gr.Slider(128, 8192, 768, step=16, label="Height")
                    num_steps = gr.Slider(1, 50, 4 if is_schnell else 50, step=1, label="Number of steps")
                    guidance = gr.Slider(
                        1.0, 10.0, 3.5, step=0.1, label="Guidance", interactive=not is_schnell
                    )
                    seed = gr.Textbox(-1, label="Seed (-1 for random)")
                    add_sampling_metadata = gr.Checkbox(
                        label="Add sampling parameters to metadata?", value=True
                    )

                generate_btn = gr.Button("Generate")

            with gr.Column():
                output_image = gr.Image(label="Generated Image")
                seed_output = gr.Number(label="Used Seed")
                warning_text = gr.Textbox(label="Warning", visible=False)
                download_btn = gr.File(label="Download full-resolution")

        def update_img2img(do_img2img):
            """Show or hide the image-to-image inputs to match the checkbox."""
            return {
                init_image: gr.update(visible=do_img2img),
                image2image_strength: gr.update(visible=do_img2img),
            }

        do_img2img.change(update_img2img, do_img2img, [init_image, image2image_strength])

        # Input order must match the positional parameters of generate_image.
        generate_btn.click(
            fn=generator.generate_image,
            inputs=[
                width,
                height,
                num_steps,
                guidance,
                seed,
                prompt,
                init_image,
                image2image_strength,
                add_sampling_metadata,
            ],
            outputs=[output_image, seed_output, download_btn, warning_text],
        )

    return demo
@st.cache_resource()
def get_models(name: str, device: torch.device, offload: bool, is_schnell: bool):
    """Load the networks used by the Streamlit demo (cached by Streamlit).

    Args:
        name: Model name forwarded to ``load_flow_model`` and ``load_ae``.
        device: Device for the T5/CLIP encoders and the NSFW classifier.
        offload: When true, the flow model and autoencoder are loaded on the
            CPU rather than on ``device``.
        is_schnell: Chooses the T5 ``max_length`` (256 when true, else 512).

    Returns:
        The tuple ``(model, ae, t5, clip, nsfw_classifier)``.
    """
    t5_max_length = 256 if is_schnell else 512
    weights_device = "cpu" if offload else device

    t5 = load_t5(device, max_length=t5_max_length)
    clip = load_clip(device)
    model = load_flow_model(name, device=weights_device)
    ae = load_ae(name, device=weights_device)
    nsfw_classifier = pipeline("image-classification", model="Falconsai/nsfw_image_detection", device=device)
    return model, ae, t5, clip, nsfw_classifier


def get_image() -> torch.Tensor | None:
    """Ask the user for an image upload and return it as a batched tensor.

    Returns:
        ``None`` while nothing has been uploaded. Otherwise the RGB image
        after ``ToTensor`` and the rescaling ``2.0 * x - 1.0``, with a leading
        batch dimension added.
    """
    uploaded = st.file_uploader("Input", type=["jpg", "JPEG", "png"])
    if uploaded is None:
        return None

    def rescale(pixels):
        # Same arithmetic as the original inline lambda.
        return 2.0 * pixels - 1.0

    preprocess = transforms.Compose([transforms.ToTensor(), transforms.Lambda(rescale)])
    rgb_image = Image.open(uploaded).convert("RGB")
    img: torch.Tensor = preprocess(rgb_image)
    # Equivalent to img[None, ...]: prepend a batch axis.
    return img.unsqueeze(0)
@torch.inference_mode()
def main(
    device: str = "cuda" if torch.cuda.is_available() else "cpu",
    offload: bool = False,
    output_dir: str = "output",
):
    """Streamlit page for text-to-image and image-to-image sampling.

    The function is the whole script body: it draws the widgets, and when a
    sample is requested it runs noise -> denoise -> decode, checks the result
    with the NSFW classifier, optionally writes it to ``output_dir`` and keeps
    it in ``st.session_state["samples"]`` for display and download.

    Args:
        device: Device string turned into the ``torch.device`` used for sampling.
        offload: When true, modules are moved to ``device`` only while they are
            needed and moved back to the CPU afterwards.
        output_dir: Directory that receives ``img_<idx>.jpg`` files.
    """
    torch_device = torch.device(device)
    names = list(configs.keys())
    name = st.selectbox("Which model to load?", names)
    # Nothing is loaded until the user explicitly ticks the checkbox.
    if name is None or not st.checkbox("Load model", False):
        return

    is_schnell = name == "flux-schnell"
    model, ae, t5, clip, nsfw_classifier = get_models(
        name,
        device=torch_device,
        offload=offload,
        is_schnell=is_schnell,
    )

    # The checkbox is disabled for flux-schnell and its value is ignored there.
    do_img2img = (
        st.checkbox(
            "Image to Image",
            False,
            disabled=is_schnell,
            help="Partially noise an image and denoise again to get variations.\n\nOnly works for flux-dev",
        )
        and not is_schnell
    )
    if do_img2img:
        init_image = get_image()
        if init_image is None:
            st.warning("Please add an image to do image to image")
        image2image_strength = st.number_input("Noising strength", min_value=0.0, max_value=1.0, value=0.8)
        if init_image is not None:
            h, w = init_image.shape[-2:]
            st.write(f"Got image of size {w}x{h} ({h*w/1e6:.2f}MP)")
        # Without an uploaded image the width/height inputs stay enabled.
        resize_img = st.checkbox("Resize image", False) or init_image is None
    else:
        init_image = None
        resize_img = True
        image2image_strength = 0.0

    # allow for packing and conversion to latent space
    # (both sizes are floored to a multiple of 16)
    width = int(
        16 * (st.number_input("Width", min_value=128, value=1360, step=16, disabled=not resize_img) // 16)
    )
    height = int(
        16 * (st.number_input("Height", min_value=128, value=768, step=16, disabled=not resize_img) // 16)
    )
    num_steps = int(st.number_input("Number of steps", min_value=1, value=(4 if is_schnell else 50)))
    guidance = float(st.number_input("Guidance", min_value=1.0, value=3.5, disabled=is_schnell))
    seed_str = st.text_input("Seed", disabled=is_schnell)
    if seed_str.isdecimal():
        seed = int(seed_str)
    else:
        st.info("No seed set, set to positive integer to enable")
        seed = None
    save_samples = st.checkbox("Save samples?", not is_schnell)
    add_sampling_metadata = st.checkbox("Add sampling parameters to metadata?", True)

    default_prompt = (
        "a photo of a forest with mist swirling around the tree trunks. The word "
        '"FLUX" is painted over it in big, red brush strokes with visible texture'
    )
    prompt = st_keyup("Enter a prompt", value=default_prompt, debounce=300, key="interactive_text")

    # Next free index: one past the largest img_<n>.jpg already in output_dir.
    output_name = os.path.join(output_dir, "img_{idx}.jpg")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        idx = 0
    else:
        fns = [fn for fn in iglob(output_name.format(idx="*")) if re.search(r"img_[0-9]+\.jpg$", fn)]
        if len(fns) > 0:
            idx = max(int(fn.split("_")[-1].split(".")[0]) for fn in fns) + 1
        else:
            idx = 0

    rng = torch.Generator(device="cpu")

    # The session seed is what flux-schnell samples with; the two buttons
    # below step it up and down.
    if "seed" not in st.session_state:
        st.session_state.seed = rng.seed()

    def increment_counter():
        st.session_state.seed += 1

    def decrement_counter():
        if st.session_state.seed > 0:
            st.session_state.seed -= 1

    opts = SamplingOptions(
        prompt=prompt,
        width=width,
        height=height,
        num_steps=num_steps,
        guidance=guidance,
        seed=seed,
    )

    if name == "flux-schnell":
        cols = st.columns([5, 1, 1, 5])
        with cols[1]:
            st.button("↩", on_click=increment_counter)
        with cols[2]:
            st.button("↪", on_click=decrement_counter)
    # flux-schnell samples on every script run; other models wait for the button.
    if is_schnell or st.button("Sample"):
        if is_schnell:
            opts.seed = st.session_state.seed
        elif opts.seed is None:
            opts.seed = rng.seed()
        print(f"Generating '{opts.prompt}' with seed {opts.seed}")
        t0 = time.perf_counter()

        if init_image is not None:
            if resize_img:
                init_image = torch.nn.functional.interpolate(init_image, (opts.height, opts.width))
            else:
                # Keep the upload's own size, cropped down to a multiple of 16,
                # and sample at exactly that size.
                h, w = init_image.shape[-2:]
                init_image = init_image[..., : 16 * (h // 16), : 16 * (w // 16)]
                opts.height = init_image.shape[-2]
                opts.width = init_image.shape[-1]
            if offload:
                ae.encoder.to(torch_device)
            init_image = ae.encode(init_image.to(torch_device))
            if offload:
                ae = ae.cpu()
                torch.cuda.empty_cache()

        # prepare input
        x = get_noise(
            1,
            opts.height,
            opts.width,
            device=torch_device,
            dtype=torch.bfloat16,
            seed=opts.seed,
        )
        # divide pixel space by 16**2 to account for latent space conversion
        timesteps = get_schedule(
            opts.num_steps,
            (x.shape[-1] * x.shape[-2]) // 4,
            shift=(not is_schnell),
        )

        if init_image is not None:
            # Skip the first part of the schedule and start from a blend of
            # noise and the encoded image at the timestep where sampling resumes.
            t_idx = int((1 - image2image_strength) * num_steps)
            t = timesteps[t_idx]
            timesteps = timesteps[t_idx:]
            x = t * x + (1.0 - t) * init_image.to(x.dtype)

        if offload:
            t5, clip = t5.to(torch_device), clip.to(torch_device)
        inp = prepare(t5=t5, clip=clip, img=x, prompt=opts.prompt)

        # offload TEs to CPU, load model to gpu
        if offload:
            t5, clip = t5.cpu(), clip.cpu()
            torch.cuda.empty_cache()
            model = model.to(torch_device)

        # denoise initial noise
        x = denoise_cache(model, **inp, timesteps=timesteps, guidance=opts.guidance)

        # offload model, load autoencoder to gpu
        if offload:
            model.cpu()
            torch.cuda.empty_cache()
            ae.decoder.to(x.device)

        # decode latents to pixel space
        x = unpack(x.float(), opts.height, opts.width)
        with torch.autocast(device_type=torch_device.type, dtype=torch.bfloat16):
            x = ae.decode(x)

        if offload:
            ae.decoder.cpu()
            torch.cuda.empty_cache()

        t1 = time.perf_counter()

        fn = output_name.format(idx=idx)
        print(f"Done in {t1 - t0:.1f}s.")
        # bring into PIL format and save
        x = x.clamp(-1, 1)
        x = embed_watermark(x.float())
        x = rearrange(x[0], "c h w -> h w c")

        # Map [-1, 1] to [0, 255] bytes for PIL.
        img = Image.fromarray((127.5 * (x + 1.0)).cpu().byte().numpy())
        # Takes the first entry labelled "nsfw"; raises IndexError if the
        # classifier returns no such label.
        nsfw_score = [x["score"] for x in nsfw_classifier(img) if x["label"] == "nsfw"][0]

        if nsfw_score < NSFW_THRESHOLD:
            buffer = BytesIO()
            exif_data = Image.Exif()
            if init_image is None:
                exif_data[ExifTags.Base.Software] = "AI generated;txt2img;flux"
            else:
                exif_data[ExifTags.Base.Software] = "AI generated;img2img;flux"
            exif_data[ExifTags.Base.Make] = "Black Forest Labs"
            exif_data[ExifTags.Base.Model] = name
            if add_sampling_metadata:
                exif_data[ExifTags.Base.ImageDescription] = prompt
            img.save(buffer, format="jpeg", exif=exif_data, quality=95, subsampling=0)

            img_bytes = buffer.getvalue()
            if save_samples:
                print(f"Saving {fn}")
                with open(fn, "wb") as file:
                    file.write(img_bytes)
                idx += 1

            st.session_state["samples"] = {
                "prompt": opts.prompt,
                "img": img,
                "seed": opts.seed,
                "bytes": img_bytes,
            }
            opts.seed = None
        else:
            # Flagged results are neither saved nor shown.
            st.warning("Your generated image may contain NSFW content.")
            st.session_state["samples"] = None

    # Show the most recent accepted sample (it survives reruns via session state).
    samples = st.session_state.get("samples", None)
    if samples is not None:
        st.image(samples["img"], caption=samples["prompt"])
        # NOTE(review): "image/jpg" is not a registered media type; the
        # standard name is "image/jpeg" - confirm before changing.
        st.download_button(
            "Download full-resolution",
            samples["bytes"],
            file_name="generated.jpg",
            mime="image/jpg",
        )
        st.write(f"Seed: {samples['seed']}")
@st.cache_resource()
def get_models(name: str, device: torch.device, offload: bool):
    """Load the networks used by the fill demo (cached across Streamlit reruns).

    Args:
        name: Model configuration name passed to the flow-model and
            autoencoder loaders.
        device: Device used for the text encoders and the NSFW classifier.
        offload: When true, the flow model and autoencoder are loaded on the
            CPU instead of ``device``.

    Returns:
        The tuple ``(model, ae, t5, clip, nsfw_classifier)``.
    """
    t5 = load_t5(device, max_length=128)
    clip = load_clip(device)
    model = load_flow_model(name, device="cpu" if offload else device)
    ae = load_ae(name, device="cpu" if offload else device)
    nsfw_classifier = pipeline("image-classification", model="Falconsai/nsfw_image_detection", device=device)
    return model, ae, t5, clip, nsfw_classifier


def _snap_to_32(value: float) -> int:
    """Round *value* to the nearest multiple of 32, never going below 32."""
    return max(32, int(32 * round(value / 32)))


def resize(img: Image.Image, min_mp: float = 0.5, max_mp: float = 2.0) -> Image.Image:
    """Bring an image into the supported megapixel range with 32-aligned sides.

    Args:
        img: Image to check.
        min_mp: Smallest allowed size in megapixels.
        max_mp: Largest allowed size in megapixels.

    Returns:
        ``img`` itself when it is already in range and 32-aligned, otherwise a
        LANCZOS-resampled copy whose sides are multiples of 32.
    """
    width, height = img.size
    mp = (width * height) / 1_000_000  # Current megapixels

    if min_mp <= mp <= max_mp:
        # Even if MP is in range, ensure dimensions are multiples of 32
        scale = 1.0
    elif mp < min_mp:
        scale = (min_mp / mp) ** 0.5
    else:  # mp > max_mp
        scale = (max_mp / mp) ** 0.5

    # A side shorter than 16 px used to round to 0 and make PIL raise;
    # _snap_to_32 keeps at least one 32-pixel block per side.
    new_width = _snap_to_32(width * scale)
    new_height = _snap_to_32(height * scale)

    if (new_width, new_height) == (width, height):
        return img
    return img.resize((new_width, new_height), Image.Resampling.LANCZOS)


def clear_canvas_state():
    """Clear all canvas-related state"""
    for key in ("canvas", "last_image_dims"):
        if key in st.session_state:
            del st.session_state[key]


def set_new_image(img: Image.Image):
    """Store *img* as the current image, clear canvas state and rerun the script.

    ``st.rerun()`` stops the current script run, so this call does not return
    to its caller.
    """
    st.session_state["current_image"] = img
    clear_canvas_state()
    st.rerun()


def downscale_image(img: Image.Image, scale_factor: float) -> Image.Image:
    """Downscale image by a given factor while maintaining 32-pixel multiple dimensions"""
    if scale_factor >= 1.0:
        return img

    width, height = img.size
    # Ensure minimum dimensions of 64 pixels per side
    new_width = max(64, _snap_to_32(width * scale_factor))
    new_height = max(64, _snap_to_32(height * scale_factor))

    return img.resize((new_width, new_height), Image.Resampling.LANCZOS)


@torch.inference_mode()
def main(
    device: str = "cuda" if torch.cuda.is_available() else "cpu",
    offload: bool = False,
    output_dir: str = "output",
):
    """Streamlit page for inpainting and outpainting with ``flux-dev-fill``.

    Draws the widgets, builds the conditioning image and mask (canvas strokes
    for inpainting, a generated border for outpainting), runs the sampler when
    "Generate" is pressed, and shows/downloads the last accepted result.

    Args:
        device: Device string turned into the ``torch.device`` used for sampling.
        offload: When true, modules are moved to ``device`` only while they are
            needed and moved back to the CPU afterwards.
        output_dir: Directory that receives ``img_<idx>.jpg`` files.
    """
    torch_device = torch.device(device)
    st.title("Flux Fill: Inpainting & Outpainting")

    # Model selection and loading
    name = "flux-dev-fill"
    if not st.checkbox("Load model", False):
        return

    try:
        model, ae, t5, clip, nsfw_classifier = get_models(
            name,
            device=torch_device,
            offload=offload,
        )
    except Exception as e:
        st.error(f"Error loading models: {e}")
        return

    # Mode selection
    mode = st.radio("Select Mode", ["Inpainting", "Outpainting"])

    # Image handling - either from previous generation or new upload
    if "input_image" in st.session_state:
        image = st.session_state["input_image"]
        del st.session_state["input_image"]
        # set_new_image() reruns the script, so nothing after it executes.
        set_new_image(image)
    else:
        uploaded_image = st.file_uploader("Upload image", type=["jpg", "jpeg", "png"])
        if uploaded_image is None:
            st.warning("Please upload an image")
            return

        if (
            "current_image_name" not in st.session_state
            or st.session_state["current_image_name"] != uploaded_image.name
        ):
            try:
                image = Image.open(uploaded_image).convert("RGB")
                st.session_state["current_image_name"] = uploaded_image.name
                set_new_image(image)
            except Exception as e:
                st.error(f"Error loading image: {e}")
                return
        else:
            image = st.session_state.get("current_image")
            if image is None:
                st.error("Error: Image state is invalid. Please reupload the image.")
                clear_canvas_state()
                return

    # Add downscale control
    with st.expander("Image Size Control"):
        current_mp = (image.size[0] * image.size[1]) / 1_000_000
        st.write(f"Current image size: {image.size[0]}x{image.size[1]} ({current_mp:.1f}MP)")

        scale_factor = st.slider(
            "Downscale Factor",
            min_value=0.1,
            max_value=1.0,
            value=1.0,
            step=0.1,
            help="1.0 = original size, 0.5 = half size, etc.",
        )

        if scale_factor < 1.0 and st.button("Apply Downscaling"):
            image = downscale_image(image, scale_factor)
            # set_new_image() already triggers the rerun.
            set_new_image(image)

    # Resize image with validation
    try:
        original_mp = (image.size[0] * image.size[1]) / 1_000_000
        image = resize(image)
        width, height = image.size
        current_mp = (width * height) / 1_000_000

        if width % 32 != 0 or height % 32 != 0:
            st.error("Error: Image dimensions must be multiples of 32")
            return

        st.write(f"Image dimensions: {width}x{height} pixels")
        if original_mp != current_mp:
            st.write(
                f"Image has been resized from {original_mp:.1f}MP to {current_mp:.1f}MP to stay within bounds (0.5MP - 2MP)"
            )
    except Exception as e:
        st.error(f"Error processing image: {e}")
        return

    if mode == "Outpainting":
        # Outpainting controls
        zoom_all = st.slider("Zoom Out Amount (All Sides)", min_value=1.0, max_value=3.0, value=1.0, step=0.1)

        with st.expander("Advanced Zoom Controls"):
            st.info("These controls add additional zoom to specific sides")
            col1, col2 = st.columns(2)
            with col1:
                zoom_left = st.slider("Left", min_value=0.0, max_value=1.0, value=0.0, step=0.1)
                zoom_right = st.slider("Right", min_value=0.0, max_value=1.0, value=0.0, step=0.1)
            with col2:
                zoom_up = st.slider("Up", min_value=0.0, max_value=1.0, value=0.0, step=0.1)
                zoom_down = st.slider("Down", min_value=0.0, max_value=1.0, value=0.0, step=0.1)

        overlap = st.slider("Overlap", min_value=0.01, max_value=0.25, value=0.01, step=0.01)

        # Generate bordered image and mask
        image_for_generation, mask = add_border_and_mask(
            image,
            zoom_all=zoom_all,
            zoom_left=zoom_left,
            zoom_right=zoom_right,
            zoom_up=zoom_up,
            zoom_down=zoom_down,
            overlap=overlap,
        )
        # Sampling happens at the size of the bordered image.
        width, height = image_for_generation.size

        # Show preview
        col1, col2 = st.columns(2)
        with col1:
            st.image(image_for_generation, caption="Image with Border")
        with col2:
            st.image(mask, caption="Mask (white areas will be generated)")

    else:  # Inpainting mode
        # Canvas setup with dimension tracking: the widget key changes with the
        # image size, and stale canvas state is dropped when the size changes.
        canvas_key = f"canvas_{width}_{height}"
        if "last_image_dims" not in st.session_state:
            st.session_state.last_image_dims = (width, height)
        elif st.session_state.last_image_dims != (width, height):
            clear_canvas_state()
            st.session_state.last_image_dims = (width, height)
            st.rerun()

        try:
            canvas_result = st_canvas(
                fill_color="rgba(255, 255, 255, 0.0)",
                stroke_width=st.slider("Brush size", 1, 500, 50),
                stroke_color="#fff",
                background_image=image,
                height=height,
                width=width,
                drawing_mode="freedraw",
                key=canvas_key,
                display_toolbar=True,
            )
        except Exception as e:
            st.error(f"Error creating canvas: {e}")
            clear_canvas_state()
            st.rerun()
            return

    # Sampling parameters
    num_steps = int(st.number_input("Number of steps", min_value=1, value=50))
    guidance = float(st.number_input("Guidance", min_value=1.0, value=30.0))
    seed_str = st.text_input("Seed")
    if seed_str.isdecimal():
        seed = int(seed_str)
    else:
        st.info("No seed set, using random seed")
        seed = None

    save_samples = st.checkbox("Save samples?", True)
    add_sampling_metadata = st.checkbox("Add sampling parameters to metadata?", True)

    # Prompt input
    prompt = st_keyup("Enter a prompt", value="", debounce=300, key="interactive_text")

    # Setup output path. The next index is one past the largest existing
    # img_<n>.jpg; counting files instead would reuse (and overwrite) an
    # existing name as soon as the numbering has gaps.
    output_name = os.path.join(output_dir, "img_{idx}.jpg")
    os.makedirs(output_dir, exist_ok=True)
    taken = [
        int(match.group(1))
        for fn in iglob(output_name.format(idx="*"))
        if (match := re.search(r"img_([0-9]+)\.jpg$", fn))
    ]
    idx = max(taken, default=-1) + 1

    if st.button("Generate"):
        valid_input = False

        if mode == "Inpainting" and canvas_result.image_data is not None:
            valid_input = True
            # Create mask from canvas: every pixel the user painted (alpha > 0)
            # becomes white, everything else black.
            try:
                mask = Image.fromarray(canvas_result.image_data)
                mask = mask.getchannel("A")  # Get alpha channel
                mask_array = np.array(mask)
                mask_array = (mask_array > 0).astype(np.uint8) * 255
                mask = Image.fromarray(mask_array)
                image_for_generation = image
            except Exception as e:
                st.error(f"Error creating mask: {e}")
                return

        elif mode == "Outpainting":
            valid_input = True
            # image_for_generation and mask are already set above

        if not valid_input:
            st.error("Please draw a mask or configure outpainting settings")
            return

        # Create temporary files. Only the paths are needed, so the handles are
        # closed immediately; PIL then reopens the paths for writing.
        with (
            tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_img,
            tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_mask,
        ):
            tmp_paths = (tmp_img.name, tmp_mask.name)

        try:
            try:
                image_for_generation.save(tmp_img.name)
                mask.save(tmp_mask.name)
            except Exception as e:
                st.error(f"Error saving temporary files: {e}")
                return

            try:
                # Generate inpainting/outpainting
                rng = torch.Generator(device="cpu")
                if seed is None:
                    seed = rng.seed()

                print(f"Generating with seed {seed}:\n{prompt}")
                t0 = time.perf_counter()

                x = get_noise(
                    1,
                    height,
                    width,
                    device=torch_device,
                    dtype=torch.bfloat16,
                    seed=seed,
                )

                if offload:
                    t5, clip, ae = t5.to(torch_device), clip.to(torch_device), ae.to(torch_device)

                inp = prepare_fill(
                    t5,
                    clip,
                    x,
                    prompt=prompt,
                    ae=ae,
                    img_cond_path=tmp_img.name,
                    mask_path=tmp_mask.name,
                )

                timesteps = get_schedule(num_steps, inp["img"].shape[1], shift=True)

                # offload encoders to CPU, load the flow model to the device
                if offload:
                    t5, clip, ae = t5.cpu(), clip.cpu(), ae.cpu()
                    torch.cuda.empty_cache()
                    model = model.to(torch_device)

                x = denoise_cache(model, **inp, timesteps=timesteps, guidance=guidance)

                # offload the flow model, load the decoder to the device
                if offload:
                    model.cpu()
                    torch.cuda.empty_cache()
                    ae.decoder.to(x.device)

                x = unpack(x.float(), height, width)
                with torch.autocast(device_type=torch_device.type, dtype=torch.bfloat16):
                    x = ae.decode(x)

                # Return the decoder to the CPU as well, so an offloaded run
                # does not keep it resident on the device between samples.
                if offload:
                    ae.decoder.cpu()
                    torch.cuda.empty_cache()

                t1 = time.perf_counter()
                print(f"Done in {t1 - t0:.1f}s")

                # Process and display result
                x = x.clamp(-1, 1)
                x = embed_watermark(x.float())
                x = rearrange(x[0], "c h w -> h w c")
                img = Image.fromarray((127.5 * (x + 1.0)).cpu().byte().numpy())

                nsfw_score = [x["score"] for x in nsfw_classifier(img) if x["label"] == "nsfw"][0]

                if nsfw_score < NSFW_THRESHOLD:
                    buffer = BytesIO()
                    exif_data = Image.Exif()
                    exif_data[ExifTags.Base.Software] = "AI generated;inpainting;flux"
                    exif_data[ExifTags.Base.Make] = "Black Forest Labs"
                    exif_data[ExifTags.Base.Model] = name
                    if add_sampling_metadata:
                        exif_data[ExifTags.Base.ImageDescription] = prompt
                    img.save(buffer, format="jpeg", exif=exif_data, quality=95, subsampling=0)

                    img_bytes = buffer.getvalue()
                    if save_samples:
                        fn = output_name.format(idx=idx)
                        print(f"Saving {fn}")
                        with open(fn, "wb") as file:
                            file.write(img_bytes)

                    st.session_state["samples"] = {
                        "prompt": prompt,
                        "img": img,
                        "seed": seed,
                        "bytes": img_bytes,
                    }
                else:
                    st.warning("Your generated image may contain NSFW content.")
                    st.session_state["samples"] = None

            except Exception as e:
                st.error(f"Error during generation: {e}")
                return
        finally:
            # Clean up temporary files on every exit path; each file is removed
            # independently so one failure does not leave the other behind.
            for tmp_path in tmp_paths:
                try:
                    os.unlink(tmp_path)
                except OSError as e:
                    print(f"Error cleaning up temporary files: {e}")

    # Display results
    samples = st.session_state.get("samples", None)
    if samples is not None:
        st.image(samples["img"], caption=samples["prompt"])
        col1, col2 = st.columns(2)
        with col1:
            st.download_button(
                "Download full-resolution",
                samples["bytes"],
                file_name="generated.jpg",
                mime="image/jpeg",  # registered media type for JPEG data
            )
        with col2:
            if st.button("Continue from this image"):
                # Store the generated image
                new_image = samples["img"]
                # Clear ALL canvas state
                clear_canvas_state()
                if "samples" in st.session_state:
                    del st.session_state["samples"]
                # Set as current image
                st.session_state["current_image"] = new_image
                st.rerun()

        st.write(f"Seed: {samples['seed']}")
We also provide an interactive streamlit demo. The demo can be run via ```bash streamlit run demo_st_fill.py ``` ================================================ FILE: flux-ToCa/docs/image-variation.md ================================================ ## Models FLUX.1 Redux is an adapter for the FLUX.1 text-to-image base models, FLUX.1 [dev] and FLUX.1 [schnell], which can be used to generate image variations. In addition, FLUX.1 Redux [pro] is available in our API and, augmenting the [dev] adapter, the API endpoint allows users to modify an image given a textual description. The feature is supported in our latest model FLUX1.1 [pro] Ultra, allowing for combining input images and text prompts to create high-quality 4-megapixel outputs with flexible aspect ratios. | Name | HuggingFace repo | License | sha256sum | | --------------------------- | ----------------------------------------------------------------------------------------------- | --------------------------------------------------------------------- | ---------------------------------------------------------------- | | `FLUX.1 Redux [dev]` | https://huggingface.co/black-forest-labs/FLUX.1-Redux-dev | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) | a1b3bdcb4bdc58ce04874b9ca776d61fc3e914bb6beab41efb63e4e2694dca45 | | `FLUX.1 Redux [pro]` | [Available in our API.](https://docs.bfl.ml/) Supports image variations. | | `FLUX1.1 Redux [pro] Ultra` | [Available in our API.](https://docs.bfl.ml/) Supports image variations based on a text prompt. | ## Examples ![redux](../assets/docs/redux.png) ## Open-weights usage The text-to-image base model weights and the autoencoder weights will be downloaded automatically from HuggingFace once you start the demo. To download `FLUX.1 [dev]`, you will need to be logged in, see [here](https://huggingface.co/docs/huggingface_hub/guides/cli#huggingface-cli-login). 
You need to manually download the adapter weights from [here](https://huggingface.co/black-forest-labs/FLUX.1-Redux-dev) and specify them via an environment variable `export FLUX_REDUX=<path_to_flux_redux_sft_file>`. In general, you may specify any manually downloaded weights via environment variables: ```bash export FLUX_REDUX=<path_to_flux_redux_sft_file> export FLUX_SCHNELL=<path_to_flux_schnell_sft_file> export FLUX_DEV=<path_to_flux_dev_sft_file> export AE=<path_to_ae_sft_file> ``` For interactive sampling run ```bash python -m src.flux.cli_redux --loop --name <name> ``` where `name` is one of `flux-dev` or `flux-schnell`. ================================================ FILE: flux-ToCa/docs/structural-conditioning.md ================================================ ## Models Structural conditioning uses Canny edge or depth detection to maintain precise control during image transformations. By preserving the original image's structure through edge or depth maps, users can make text-guided edits while keeping the core composition intact. This is particularly effective for retexturing images. We release four variations: two based on edge maps (full model and LoRA for FLUX.1 [dev]) and two based on depth maps (full model and LoRA for FLUX.1 [dev]). 
| Name | HuggingFace repo | License | sha256sum | | ------------------------- | -------------------------------------------------------------- | --------------------------------------------------------------------- | ---------------------------------------------------------------- | | `FLUX.1 Canny [dev]` | https://huggingface.co/black-forest-labs/FLUX.1-Canny-dev | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) | 996876670169591cb412b937fbd46ea14cbed6933aef17c48a2dcd9685c98cdb | | `FLUX.1 Depth [dev]` | https://huggingface.co/black-forest-labs/FLUX.1-Depth-dev | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) | 41360d1662f44ca45bc1b665fe6387e91802f53911001630d970a4f8be8dac21 | | `FLUX.1 Canny [dev] LoRA` | https://huggingface.co/black-forest-labs/FLUX.1-Canny-dev-lora | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) | 8eaa21b9c43d5e7242844deb64b8cf22ae9010f813f955ca8c05f240b8a98f7e | | `FLUX.1 Depth [dev] LoRA` | https://huggingface.co/black-forest-labs/FLUX.1-Depth-dev-lora | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) | 1938b38ea0fdd98080fa3e48beb2bedfbc7ad102d8b65e6614de704a46d8b907 | | `FLUX.1 Canny [pro]` | [Available in our API](https://docs.bfl.ml/). | | `FLUX.1 Depth [pro]` | [Available in our API](https://docs.bfl.ml/). | ## Examples ![canny](../assets/docs/canny.png) ![depth](../assets/docs/depth.png) ## Open-weights usage The full model weights (`FLUX.1 Canny [dev]`, `FLUX.1 Depth [dev]`, `FLUX.1 [dev]`, and the autoencoder) will be downloaded automatically from HuggingFace once you start one of the demos. To download them, you will need to be logged in, see [here](https://huggingface.co/docs/huggingface_hub/guides/cli#huggingface-cli-login). 
The LoRA weights are not downloaded automatically, but can be downloaded manually [here (Canny)](https://huggingface.co/black-forest-labs/FLUX.1-Canny-dev-lora) and [here (Depth)](https://huggingface.co/black-forest-labs/FLUX.1-Depth-dev-lora). You may specify any manually downloaded weights via environment variables (**necessary for LoRAs**): ```bash export FLUX_DEV_DEPTH=<path_to_flux_dev_depth_sft_file> export FLUX_DEV_CANNY=<path_to_flux_dev_canny_sft_file> export FLUX_DEV_DEPTH_LORA=<path_to_flux_dev_depth_lora_sft_file> export FLUX_DEV_CANNY_LORA=<path_to_flux_dev_canny_lora_sft_file> export FLUX_REDUX=<path_to_flux_redux_sft_file> export FLUX_SCHNELL=<path_to_flux_schnell_sft_file> export FLUX_DEV=<path_to_flux_dev_sft_file> export AE=<path_to_ae_sft_file> ``` For interactive sampling run ```bash python -m src.flux.cli_control --loop --name <name> ``` where `name` is one of `flux-dev-canny`, `flux-dev-depth`, `flux-dev-canny-lora`, or `flux-dev-depth-lora`. ## Diffusers usage Flux Control (including the LoRAs) is also compatible with the `diffusers` Python library. Check out the [documentation](https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux) to learn more. ================================================ FILE: flux-ToCa/docs/text-to-image.md ================================================ ## Models We currently offer four text-to-image models. `FLUX1.1 [pro]` is our most capable model, which can generate images at up to 4MP while maintaining an impressive generation time of only 10 seconds per sample. 
| Name | HuggingFace repo | License | sha256sum | | ------------------------- | ------------------------------------------------------- | --------------------------------------------------------------------- | ---------------------------------------------------------------- | | `FLUX.1 [schnell]` | https://huggingface.co/black-forest-labs/FLUX.1-schnell | [apache-2.0](model_licenses/LICENSE-FLUX1-schnell) | 9403429e0052277ac2a87ad800adece5481eecefd9ed334e1f348723621d2a0a | | `FLUX.1 [dev]` | https://huggingface.co/black-forest-labs/FLUX.1-dev | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) | 4610115bb0c89560703c892c59ac2742fa821e60ef5871b33493ba544683abd7 | | `FLUX.1 [pro]` | [Available in our API](https://docs.bfl.ml/). | | `FLUX1.1 [pro]` | [Available in our API](https://docs.bfl.ml/). | | `FLUX1.1 [pro] Ultra/raw` | [Available in our API](https://docs.bfl.ml/). | ## Open-weights usage The weights will be downloaded automatically from HuggingFace once you start one of the demos. To download `FLUX.1 [dev]`, you will need to be logged in, see [here](https://huggingface.co/docs/huggingface_hub/guides/cli#huggingface-cli-login). If you have downloaded the model weights manually, you can specify the downloaded paths via environment variables: ```bash export FLUX_SCHNELL=<path_to_flux_schnell_sft_file> export FLUX_DEV=<path_to_flux_dev_sft_file> export AE=<path_to_ae_sft_file> ``` For interactive sampling run ```bash python -m flux --name <name> --loop ``` Or to generate a single sample run ```bash python -m flux --name <name> \ --height <height> --width <width> \ --prompt "<prompt>" ``` We also provide a Streamlit demo that does both text-to-image and image-to-image. The demo can be run via ```bash streamlit run demo_st.py ``` We also offer a Gradio-based demo for an interactive experience. 
To run the Gradio demo: ```bash python demo_gr.py --name flux-schnell --device cuda ``` Options: - `--name`: Choose the model to use (options: "flux-schnell", "flux-dev") - `--device`: Specify the device to use (default: "cuda" if available, otherwise "cpu") - `--offload`: Offload model to CPU when not in use - `--share`: Create a public link to your demo To run the demo with the dev model and create a public link: ```bash python demo_gr.py --name flux-dev --share ``` ## Diffusers integration `FLUX.1 [schnell]` and `FLUX.1 [dev]` are integrated with the [🧨 diffusers](https://github.com/huggingface/diffusers) library. To use it with diffusers, install it: ```shell pip install git+https://github.com/huggingface/diffusers.git ``` Then you can use `FluxPipeline` to run the model ```python import torch from diffusers import FluxPipeline model_id = "black-forest-labs/FLUX.1-schnell" #you can also use `black-forest-labs/FLUX.1-dev` pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16) pipe.enable_model_cpu_offload() #save some VRAM by offloading the model to CPU. Remove this if you have enough GPU power prompt = "A cat holding a sign that says hello world" seed = 42 image = pipe( prompt, output_type="pil", num_inference_steps=4, #use a larger number if you are using [dev] generator=torch.Generator("cpu").manual_seed(seed) ).images[0] image.save("flux-schnell.png") ``` To learn more check out the [diffusers](https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux) documentation ================================================ FILE: flux-ToCa/model_cards/FLUX.1-dev.md ================================================ ![FLUX.1 [dev] Grid](../assets/dev_grid.jpg) `FLUX.1 [dev]` is a 12 billion parameter rectified flow transformer capable of generating images from text descriptions. For more information, please read our [blog post](https://blackforestlabs.ai/announcing-black-forest-labs/). # Key Features 1. 
Cutting-edge output quality, second only to our state-of-the-art model `FLUX.1 [pro]`. 2. Competitive prompt following, matching the performance of closed source alternatives. 3. Trained using guidance distillation, making `FLUX.1 [dev]` more efficient. 4. Open weights to drive new scientific research, and empower artists to develop innovative workflows. 5. Generated outputs can be used for personal, scientific, and commercial purposes, as described in the [flux-1-dev-non-commercial-license](./licence.md). # Usage We provide a reference implementation of `FLUX.1 [dev]`, as well as sampling code, in a dedicated [github repository](https://github.com/black-forest-labs/flux). Developers and creatives looking to build on top of `FLUX.1 [dev]` are encouraged to use this as a starting point. ## API Endpoints The FLUX.1 models are also available via API from the following sources 1. [bfl.ml](https://docs.bfl.ml/) (currently `FLUX.1 [pro]`) 2. [replicate.com](https://replicate.com/collections/flux) 3. [fal.ai](https://fal.ai/models/fal-ai/flux/dev) ## ComfyUI `FLUX.1 [dev]` is also available in [Comfy UI](https://github.com/comfyanonymous/ComfyUI) for local inference with a node-based workflow. --- # Limitations - This model is not intended or able to provide factual information. - As a statistical model this checkpoint might amplify existing societal biases. - The model may fail to generate output that matches the prompts. - Prompt following is heavily influenced by the prompting-style. # Out-of-Scope Use The model and its derivatives may not be used - In any way that violates any applicable national, federal, state, local or international law or regulation. - For the purpose of exploiting, harming or attempting to exploit or harm minors in any way; including but not limited to the solicitation, creation, acquisition, or dissemination of child exploitative content. - To generate or disseminate verifiably false information and/or content with the purpose of harming others. 
- To generate or disseminate personally identifiable information that can be used to harm an individual. - To harass, abuse, threaten, stalk, or bully individuals or groups of individuals. - To create non-consensual nudity or illegal pornographic content. - For fully automated decision making that adversely impacts an individual's legal rights or otherwise creates or modifies a binding, enforceable obligation. - To generate or facilitate large-scale disinformation campaigns. # License This model falls under the [`FLUX.1 [dev]` Non-Commercial License](https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/LICENSE.md). ================================================ FILE: flux-ToCa/model_cards/FLUX.1-schnell.md ================================================ ![FLUX.1 [schnell] Grid](../assets/schnell_grid.jpg) `FLUX.1 [schnell]` is a 12 billion parameter rectified flow transformer capable of generating images from text descriptions. For more information, please read our [blog post](https://blackforestlabs.ai/announcing-black-forest-labs/). # Key Features 1. Cutting-edge output quality and competitive prompt following, matching the performance of closed source alternatives. 2. Trained using latent adversarial diffusion distillation, `FLUX.1 [schnell]` can generate high-quality images in only 1 to 4 steps. 3. Released under the `apache-2.0` licence, the model can be used for personal, scientific, and commercial purposes. # Usage We provide a reference implementation of `FLUX.1 [schnell]`, as well as sampling code, in a dedicated [github repository](https://github.com/black-forest-labs/flux). Developers and creatives looking to build on top of `FLUX.1 [schnell]` are encouraged to use this as a starting point. ## API Endpoints The FLUX.1 models are also available via API from the following sources: 1. [bfl.ml](https://docs.bfl.ml/) (currently `FLUX.1 [pro]`) 2. [replicate.com](https://replicate.com/collections/flux) 3.
[fal.ai](https://fal.ai/models/fal-ai/flux/schnell) ## ComfyUI `FLUX.1 [schnell]` is also available in [Comfy UI](https://github.com/comfyanonymous/ComfyUI) for local inference with a node-based workflow. --- # Limitations - This model is not intended or able to provide factual information. - As a statistical model this checkpoint might amplify existing societal biases. - The model may fail to generate output that matches the prompts. - Prompt following is heavily influenced by the prompting-style. # Out-of-Scope Use The model and its derivatives may not be used - In any way that violates any applicable national, federal, state, local or international law or regulation. - For the purpose of exploiting, harming or attempting to exploit or harm minors in any way; including but not limited to the solicitation, creation, acquisition, or dissemination of child exploitative content. - To generate or disseminate verifiably false information and/or content with the purpose of harming others. - To generate or disseminate personal identifiable information that can be used to harm an individual. - To harass, abuse, threaten, stalk, or bully individuals or groups of individuals. - To create non-consensual nudity or illegal pornographic content. - For fully automated decision making that adversely impacts an individual's legal rights or otherwise creates or modifies a binding, enforceable obligation. - Generating or facilitating large-scale disinformation campaigns. ================================================ FILE: flux-ToCa/model_licenses/LICENSE-FLUX1-dev ================================================ FLUX.1 [dev] Non-Commercial License Black Forest Labs, Inc. (“we” or “our” or “Company”) is pleased to make available the weights, parameters and inference code for the FLUX.1 [dev] Model (as defined below) freely available for your non-commercial and non-production use as set forth in this FLUX.1 [dev] Non-Commercial License (“License”). 
The “FLUX.1 [dev] Model” means the FLUX.1 [dev] AI models, including FLUX.1 [dev], FLUX.1 Fill [dev], FLUX.1 Depth [dev], FLUX.1 Canny [dev], FLUX.1 Redux [dev], FLUX.1 Canny [dev] LoRA and FLUX.1 Depth [dev] LoRA, and their elements which includes algorithms, software, checkpoints, parameters, source code (inference code, evaluation code, and if applicable, fine-tuning code) and any other materials associated with the FLUX.1 [dev] AI models made available by Company under this License, including if any, the technical documentation, manuals and instructions for the use and operation thereof (collectively, “FLUX.1 [dev] Model”). By downloading, accessing, use, Distributing (as defined below), or creating a Derivative (as defined below) of the FLUX.1 [dev] Model, you agree to the terms of this License. If you do not agree to this License, then you do not have any rights to access, use, Distribute or create a Derivative of the FLUX.1 [dev] Model and you must immediately cease using the FLUX.1 [dev] Model. If you are agreeing to be bound by the terms of this License on behalf of your employer or other entity, you represent and warrant to us that you have full legal authority to bind your employer or such entity to this License. If you do not have the requisite authority, you may not accept the License or access the FLUX.1 [dev] Model on behalf of your employer or other entity. 1. Definitions. Capitalized terms used in this License but not defined herein have the following meanings: a. “Derivative” means any (i) modified version of the FLUX.1 [dev] Model (including but not limited to any customized or fine-tuned version thereof), (ii) work based on the FLUX.1 [dev] Model, or (iii) any other derivative work thereof. For the avoidance of doubt, Outputs are not considered Derivatives under this License. b. 
“Distribution” or “Distribute” or “Distributing” means providing or making available, by any means, a copy of the FLUX.1 [dev] Models and/or the Derivatives as the case may be. c. “Non-Commercial Purpose” means any of the following uses, but only so far as you do not receive any direct or indirect payment arising from the use of the model or its output: (i) personal use for research, experiment, and testing for the benefit of public knowledge, personal study, private entertainment, hobby projects, or otherwise not directly or indirectly connected to any commercial activities, business operations, or employment responsibilities; (ii) use by commercial or for-profit entities for testing, evaluation, or non-commercial research and development in a non-production environment, (iii) use by any charitable organization for charitable purposes, or for testing or evaluation. For clarity, use for revenue-generating activity or direct interactions with or impacts on end users, or use to train, fine tune or distill other models for commercial use is not a Non-Commercial purpose. d. “Outputs” means any content generated by the operation of the FLUX.1 [dev] Models or the Derivatives from a prompt (i.e., text instructions) provided by users. For the avoidance of doubt, Outputs do not include any components of a FLUX.1 [dev] Models, such as any fine-tuned versions of the FLUX.1 [dev] Models, the weights, or parameters. e. “you” or “your” means the individual or entity entering into this License with Company. 2. License Grant. a. License. Subject to your compliance with this License, Company grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty free and limited license to access, use, create Derivatives of, and Distribute the FLUX.1 [dev] Models solely for your Non-Commercial Purposes. 
The foregoing license is personal to you, and you may not assign or sublicense this License or any other rights or obligations under this License without Company’s prior written consent; any such assignment or sublicense will be void and will automatically and immediately terminate this License. Any restrictions set forth herein in regarding the FLUX.1 [dev] Model also applies to any Derivative you create or that are created on your behalf. b. Non-Commercial Use Only. You may only access, use, Distribute, or creative Derivatives of or the FLUX.1 [dev] Model or Derivatives for Non-Commercial Purposes. If You want to use a FLUX.1 [dev] Model a Derivative for any purpose that is not expressly authorized under this License, such as for a commercial activity, you must request a license from Company, which Company may grant to you in Company’s sole discretion and which additional use may be subject to a fee, royalty or other revenue share. Please contact Company at the following e-mail address if you want to discuss such a license: info@blackforestlabs.ai. c. Reserved Rights. The grant of rights expressly set forth in this License are the complete grant of rights to you in the FLUX.1 [dev] Model, and no other licenses are granted, whether by waiver, estoppel, implication, equity or otherwise. Company and its licensors reserve all rights not expressly granted by this License. d. Outputs. We claim no ownership rights in and to the Outputs. You are solely responsible for the Outputs you generate and their subsequent uses in accordance with this License. You may use Output for any purpose (including for commercial purposes), except as expressly prohibited herein. You may not use the Output to train, fine-tune or distill a model that is competitive with the FLUX.1 [dev] Model. 3. Distribution. Subject to this License, you may Distribute copies of the FLUX.1 [dev] Model and/or Derivatives made by you, under the following conditions: a. 
you must make available a copy of this License to third-party recipients of the FLUX.1 [dev] Models and/or Derivatives you Distribute, and specify that any rights to use the FLUX.1 [dev] Models and/or Derivatives shall be directly granted by Company to said third-party recipients pursuant to this License; b. you must make prominently display the following notice alongside the Distribution of the FLUX.1 [dev] Model or Derivative (such as via a “Notice” text file distributed as part of such FLUX.1 [dev] Model or Derivative) (the “Attribution Notice”): “The FLUX.1 [dev] Model is licensed by Black Forest Labs. Inc. under the FLUX.1 [dev] Non-Commercial License. Copyright Black Forest Labs. Inc. IN NO EVENT SHALL BLACK FOREST LABS, INC. BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH USE OF THIS MODEL.” c. in the case of Distribution of Derivatives made by you, you must also include in the Attribution Notice a statement that you have modified the applicable FLUX.1 [dev] Model; and d. in the case of Distribution of Derivatives made by you, any terms and conditions you impose on any third-party recipients relating to Derivatives made by or for you shall neither limit such third-party recipients’ use of the FLUX.1 [dev] Model or any Derivatives made by or for Company in accordance with this License nor conflict with any of its terms and conditions. e. In the case of Distribution of Derivatives made by you, you must not misrepresent or imply, through any means, that the Derivatives made by or for you and/or any modified version of the FLUX.1 [dev] Model you Distribute under your name and responsibility is an official product of the Company or has been endorsed, approved or validated by the Company, unless you are authorized by Company to do so in writing. 4. Restrictions. You will not, and will not permit, assist or cause any third party to a. 
use, modify, copy, reproduce, create Derivatives of, or Distribute the FLUX.1 [dev] Model (or any Derivative thereof, or any data produced by the FLUX.1 [dev] Model), in whole or in part, for (i) any commercial or production purposes, (ii) military purposes, (iii) purposes of surveillance, including any research or development relating to surveillance, (iv) biometric processing, (v) in any manner that infringes, misappropriates, or otherwise violates any third-party rights, or (vi) in any manner that violates any applicable law and violating any privacy or security laws, rules, regulations, directives, or governmental requirements (including the General Data Privacy Regulation (Regulation (EU) 2016/679), the California Consumer Privacy Act, and any and all laws governing the processing of biometric information), as well as all amendments and successor laws to any of the foregoing; b. alter or remove copyright and other proprietary notices which appear on or in any portion of the FLUX.1 [dev] Model; c. utilize any equipment, device, software, or other means to circumvent or remove any security or protection used by Company in connection with the FLUX.1 [dev] Model, or to circumvent or remove any usage restrictions, or to enable functionality disabled by FLUX.1 [dev] Model; or d. offer or impose any terms on the FLUX.1 [dev] Model that alter, restrict, or are inconsistent with the terms of this License. e. violate any applicable U.S. and non-U.S. export control and trade sanctions laws (“Export Laws”) in connection with your use or Distribution of any FLUX.1 [dev] Model; f. directly or indirectly Distribute, export, or otherwise transfer FLUX.1 [dev] Model (a) to any individual, entity, or country prohibited by Export Laws; (b) to anyone on U.S. or non-U.S. 
government restricted parties lists; or (c) for any purpose prohibited by Export Laws, including nuclear, chemical or biological weapons, or missile technology applications; 3) use or download FLUX.1 [dev] Model if you or they are (a) located in a comprehensively sanctioned jurisdiction, (b) currently listed on any U.S. or non-U.S. restricted parties list, or (c) for any purpose prohibited by Export Laws; and (4) will not disguise your location through IP proxying or other methods. 5. DISCLAIMERS. THE FLUX.1 [dev] MODEL IS PROVIDED “AS IS” AND “WITH ALL FAULTS” WITH NO WARRANTY OF ANY KIND, EXPRESS OR IMPLIED. COMPANY EXPRESSLY DISCLAIMS ALL REPRESENTATIONS AND WARRANTIES, EXPRESS OR IMPLIED, WHETHER BY STATUTE, CUSTOM, USAGE OR OTHERWISE AS TO ANY MATTERS RELATED TO THE FLUX.1 [dev] MODEL, INCLUDING BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE, SATISFACTORY QUALITY, OR NON-INFRINGEMENT. COMPANY MAKES NO WARRANTIES OR REPRESENTATIONS THAT THE FLUX.1 [dev] MODEL WILL BE ERROR FREE OR FREE OF VIRUSES OR OTHER HARMFUL COMPONENTS, OR PRODUCE ANY PARTICULAR RESULTS. 6. LIMITATION OF LIABILITY. TO THE FULLEST EXTENT PERMITTED BY LAW, IN NO EVENT WILL COMPANY BE LIABLE TO YOU OR YOUR EMPLOYEES, AFFILIATES, USERS, OFFICERS OR DIRECTORS (A) UNDER ANY THEORY OF LIABILITY, WHETHER BASED IN CONTRACT, TORT, NEGLIGENCE, STRICT LIABILITY, WARRANTY, OR OTHERWISE UNDER THIS LICENSE, OR (B) FOR ANY INDIRECT, CONSEQUENTIAL, EXEMPLARY, INCIDENTAL, PUNITIVE OR SPECIAL DAMAGES OR LOST PROFITS, EVEN IF COMPANY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 
THE FLUX.1 [dev] MODEL, ITS CONSTITUENT COMPONENTS, AND ANY OUTPUT (COLLECTIVELY, “MODEL MATERIALS”) ARE NOT DESIGNED OR INTENDED FOR USE IN ANY APPLICATION OR SITUATION WHERE FAILURE OR FAULT OF THE MODEL MATERIALS COULD REASONABLY BE ANTICIPATED TO LEAD TO SERIOUS INJURY OF ANY PERSON, INCLUDING POTENTIAL DISCRIMINATION OR VIOLATION OF AN INDIVIDUAL’S PRIVACY RIGHTS, OR TO SEVERE PHYSICAL, PROPERTY, OR ENVIRONMENTAL DAMAGE (EACH, A “HIGH-RISK USE”). IF YOU ELECT TO USE ANY OF THE MODEL MATERIALS FOR A HIGH-RISK USE, YOU DO SO AT YOUR OWN RISK. YOU AGREE TO DESIGN AND IMPLEMENT APPROPRIATE DECISION-MAKING AND RISK-MITIGATION PROCEDURES AND POLICIES IN CONNECTION WITH A HIGH-RISK USE SUCH THAT EVEN IF THERE IS A FAILURE OR FAULT IN ANY OF THE MODEL MATERIALS, THE SAFETY OF PERSONS OR PROPERTY AFFECTED BY THE ACTIVITY STAYS AT A LEVEL THAT IS REASONABLE, APPROPRIATE, AND LAWFUL FOR THE FIELD OF THE HIGH-RISK USE. 7. INDEMNIFICATION You will indemnify, defend and hold harmless Company and our subsidiaries and affiliates, and each of our respective shareholders, directors, officers, employees, agents, successors, and assigns (collectively, the “Company Parties”) from and against any losses, liabilities, damages, fines, penalties, and expenses (including reasonable attorneys’ fees) incurred by any Company Party in connection with any claim, demand, allegation, lawsuit, proceeding, or investigation (collectively, “Claims”) arising out of or related to (a) your access to or use of the FLUX.1 [dev] Model (as well as any Output, results or data generated from such access or use), including any High-Risk Use (defined below); (b) your violation of this License; or (c) your violation, misappropriation or infringement of any rights of another (including intellectual property or other proprietary rights and privacy rights). You will promptly notify the Company Parties of any such Claims, and cooperate with Company Parties in defending such Claims. 
You will also grant the Company Parties sole control of the defense or settlement, at Company’s sole option, of any Claims. This indemnity is in addition to, and not in lieu of, any other indemnities or remedies set forth in a written agreement between you and Company or the other Company Parties. 8. Termination; Survival. a. This License will automatically terminate upon any breach by you of the terms of this License. b. We may terminate this License, in whole or in part, at any time upon notice (including electronic) to you. c. If You initiate any legal action or proceedings against Company or any other entity (including a cross-claim or counterclaim in a lawsuit), alleging that the FLUX.1 [dev] Model or any Derivative, or any part thereof, infringe upon intellectual property or other rights owned or licensable by you, then any licenses granted to you under this License will immediately terminate as of the date such legal action or claim is filed or initiated. d. Upon termination of this License, you must cease all use, access or Distribution of the FLUX.1 [dev] Model and any Derivatives. The following sections survive termination of this License 2(c), 2(d), 4-11. 9. Third Party Materials. The FLUX.1 [dev] Model may contain third-party software or other components (including free and open source software) (all of the foregoing, “Third Party Materials”), which are subject to the license terms of the respective third-party licensors. Your dealings or correspondence with third parties and your use of or interaction with any Third Party Materials are solely between you and the third party. Company does not control or endorse, and makes no representations or warranties regarding, any Third Party Materials, and your access to and use of such Third Party Materials are at your own risk. 10. Trademarks. 
You have not been granted any trademark license as part of this License and may not use any name or mark associated with Company without the prior written permission of Company, except to the extent necessary to make the reference required in the Attribution Notice as specified above or as is reasonably necessary in describing the FLUX.1 [dev] Model and its creators. 11. General. This License will be governed and construed under the laws of the State of Delaware without regard to conflicts of law provisions. If any provision or part of a provision of this License is unlawful, void or unenforceable, that provision or part of the provision is deemed severed from this License, and will not affect the validity and enforceability of any remaining provisions. The failure of Company to exercise or enforce any right or provision of this License will not operate as a waiver of such right or provision. This License does not confer any third-party beneficiary rights upon any other person or entity. This License, together with the Documentation, contains the entire understanding between you and Company regarding the subject matter of this License, and supersedes all other written or oral agreements and understandings between you and Company regarding such subject matter. No change or addition to any provision of this License will be binding unless it is in writing and signed by an authorized representative of both you and Company. ================================================ FILE: flux-ToCa/model_licenses/LICENSE-FLUX1-schnell ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: You must give any other recipients of the Work or Derivative Works a copy of this License; and You must cause any modified files to carry prominent notices stating that You changed the files; and You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; 
within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 
END OF TERMS AND CONDITIONS ================================================ FILE: flux-ToCa/pyproject.toml ================================================ [project] name = "flux" authors = [ { name = "Black Forest Labs", email = "support@blackforestlabs.ai" }, ] description = "Inference codebase for FLUX" readme = "README.md" requires-python = ">=3.10" license = { file = "LICENSE.md" } dynamic = ["version"] dependencies = [ "torch == 2.5.1", "torchvision", "einops", "fire >= 0.6.0", "huggingface-hub", "safetensors", "sentencepiece", "transformers", "tokenizers", "protobuf", "requests", "invisible-watermark", "ruff == 0.6.8", ] [project.optional-dependencies] streamlit = [ "streamlit", "streamlit-drawable-canvas", "streamlit-keyup", ] gradio = [ "gradio", ] all = [ "flux[streamlit]", "flux[gradio]", ] [project.scripts] flux = "flux.cli:app" [build-system] build-backend = "setuptools.build_meta" requires = ["setuptools>=64", "wheel", "setuptools_scm>=8"] [tool.ruff] line-length = 110 target-version = "py310" extend-exclude = ["/usr/lib/*"] [tool.ruff.lint] ignore = [ "E501", # line too long - will be fixed in format ] [tool.ruff.format] quote-style = "double" indent-style = "space" line-ending = "auto" skip-magic-trailing-comma = false docstring-code-format = true exclude = [ "src/flux/_version.py", # generated by setuptools_scm ] [tool.ruff.lint.isort] combine-as-imports = true force-wrap-aliases = true known-local-folder = ["src"] known-first-party = ["flux"] [tool.pyright] include = ["src"] exclude = [ "**/__pycache__", # cache directories "./typings", # generated type stubs ] stubPath = "./typings" [tool.tomlsort] in_place = true no_sort_tables = true spaces_before_inline_comment = 1 spaces_indent_inline_array = 2 trailing_comma_inline_array = true sort_first = [ "project", "build-system", "tool.setuptools", ] # needs to be last for CI reasons [tool.setuptools_scm] write_to = "src/flux/_version.py" parentdir_prefix_version = "flux-" fallback_version = "0.0.0" 
version_scheme = "post-release" ================================================ FILE: flux-ToCa/setup.py ================================================ import setuptools setuptools.setup() ================================================ FILE: flux-ToCa/src/flux/__init__.py ================================================ try: from ._version import ( version as __version__, # type: ignore version_tuple, ) except ImportError: __version__ = "unknown (no version information available)" version_tuple = (0, 0, "unknown", "noinfo") from pathlib import Path PACKAGE = __package__.replace("_", "-") PACKAGE_ROOT = Path(__file__).parent ================================================ FILE: flux-ToCa/src/flux/__main__.py ================================================ from .cli import app if __name__ == "__main__": app() ================================================ FILE: flux-ToCa/src/flux/_version.py ================================================ # file generated by setuptools_scm # don't change, don't track in version control TYPE_CHECKING = False if TYPE_CHECKING: from typing import Tuple, Union VERSION_TUPLE = Tuple[Union[int, str], ...] 
else: VERSION_TUPLE = object version: str __version__: str __version_tuple__: VERSION_TUPLE version_tuple: VERSION_TUPLE __version__ = version = '0.0.post49+gd06f828.d20250206' __version_tuple__ = version_tuple = (0, 0, 'gd06f828.d20250206') ================================================ FILE: flux-ToCa/src/flux/api.py ================================================ import io import os import time from pathlib import Path import requests from PIL import Image API_URL = "https://api.bfl.ml" API_ENDPOINTS = { "flux.1-pro": "flux-pro", "flux.1-dev": "flux-dev", "flux.1.1-pro": "flux-pro-1.1", } class ApiException(Exception): def __init__(self, status_code: int, detail: str | list[dict] | None = None): super().__init__() self.detail = detail self.status_code = status_code def __str__(self) -> str: return self.__repr__() def __repr__(self) -> str: if self.detail is None: message = None elif isinstance(self.detail, str): message = self.detail else: message = "[" + ",".join(d["msg"] for d in self.detail) + "]" return f"ApiException({self.status_code=}, {message=}, detail={self.detail})" class ImageRequest: def __init__( self, # api inputs prompt: str, name: str = "flux.1.1-pro", width: int | None = None, height: int | None = None, num_steps: int | None = None, prompt_upsampling: bool | None = None, seed: int | None = None, guidance: float | None = None, interval: float | None = None, safety_tolerance: int | None = None, # behavior of this class validate: bool = True, launch: bool = True, api_key: str | None = None, ): """ Manages an image generation request to the API. All parameters not specified will use the API defaults. Args: prompt: Text prompt for image generation. width: Width of the generated image in pixels. Must be a multiple of 32. height: Height of the generated image in pixels. Must be a multiple of 32. name: Which model version to use num_steps: Number of steps for the image generation process. 
prompt_upsampling: Whether to perform upsampling on the prompt. seed: Optional seed for reproducibility. guidance: Guidance scale for image generation. safety_tolerance: Tolerance level for input and output moderation. Between 0 and 6, 0 being most strict, 6 being least strict. validate: Run input validation launch: Directly launches request api_key: Your API key if not provided by the environment Raises: ValueError: For invalid input, when `validate` ApiException: For errors raised from the API """ if validate: if name not in API_ENDPOINTS.keys(): raise ValueError(f"Invalid model {name}") elif width is not None and width % 32 != 0: raise ValueError(f"width must be divisible by 32, got {width}") elif width is not None and not (256 <= width <= 1440): raise ValueError(f"width must be between 256 and 1440, got {width}") elif height is not None and height % 32 != 0: raise ValueError(f"height must be divisible by 32, got {height}") elif height is not None and not (256 <= height <= 1440): raise ValueError(f"height must be between 256 and 1440, got {height}") elif num_steps is not None and not (1 <= num_steps <= 50): raise ValueError(f"steps must be between 1 and 50, got {num_steps}") elif guidance is not None and not (1.5 <= guidance <= 5.0): raise ValueError(f"guidance must be between 1.5 and 4, got {guidance}") elif interval is not None and not (1.0 <= interval <= 4.0): raise ValueError(f"interval must be between 1 and 4, got {interval}") elif safety_tolerance is not None and not (0 <= safety_tolerance <= 6.0): raise ValueError(f"safety_tolerance must be between 0 and 6, got {interval}") if name == "flux.1-dev": if interval is not None: raise ValueError("Interval is not supported for flux.1-dev") if name == "flux.1.1-pro": if interval is not None or num_steps is not None or guidance is not None: raise ValueError("Interval, num_steps and guidance are not supported for " "flux.1.1-pro") self.name = name self.request_json = { "prompt": prompt, "width": width, "height": 
height, "steps": num_steps, "prompt_upsampling": prompt_upsampling, "seed": seed, "guidance": guidance, "interval": interval, "safety_tolerance": safety_tolerance, } self.request_json = {key: value for key, value in self.request_json.items() if value is not None} self.request_id: str | None = None self.result: dict | None = None self._image_bytes: bytes | None = None self._url: str | None = None if api_key is None: self.api_key = os.environ.get("BFL_API_KEY") else: self.api_key = api_key if launch: self.request() def request(self): """ Request to generate the image. """ if self.request_id is not None: return response = requests.post( f"{API_URL}/v1/{API_ENDPOINTS[self.name]}", headers={ "accept": "application/json", "x-key": self.api_key, "Content-Type": "application/json", }, json=self.request_json, ) result = response.json() if response.status_code != 200: raise ApiException(status_code=response.status_code, detail=result.get("detail")) self.request_id = response.json()["id"] def retrieve(self) -> dict: """ Wait for the generation to finish and retrieve response. """ if self.request_id is None: self.request() while self.result is None: response = requests.get( f"{API_URL}/v1/get_result", headers={ "accept": "application/json", "x-key": self.api_key, }, params={ "id": self.request_id, }, ) result = response.json() if "status" not in result: raise ApiException(status_code=response.status_code, detail=result.get("detail")) elif result["status"] == "Ready": self.result = result["result"] elif result["status"] == "Pending": time.sleep(0.5) else: raise ApiException(status_code=200, detail=f"API returned status '{result['status']}'") return self.result @property def bytes(self) -> bytes: """ Generated image as bytes. 
""" if self._image_bytes is None: response = requests.get(self.url) if response.status_code == 200: self._image_bytes = response.content else: raise ApiException(status_code=response.status_code) return self._image_bytes @property def url(self) -> str: """ Public url to retrieve the image from """ if self._url is None: result = self.retrieve() self._url = result["sample"] return self._url @property def image(self) -> Image.Image: """ Load the image as a PIL Image """ return Image.open(io.BytesIO(self.bytes)) def save(self, path: str): """ Save the generated image to a local path """ suffix = Path(self.url).suffix if not path.endswith(suffix): path = path + suffix Path(path).resolve().parent.mkdir(parents=True, exist_ok=True) with open(path, "wb") as file: file.write(self.bytes) if __name__ == "__main__": from fire import Fire Fire(ImageRequest) ================================================ FILE: flux-ToCa/src/flux/cli.py ================================================ import os import re import time from dataclasses import dataclass from glob import iglob import torch from fire import Fire from transformers import pipeline from flux.sampling import denoise, get_noise, get_schedule, prepare, unpack from flux.ideas import denoise_cache from flux.util import configs, load_ae, load_clip, load_flow_model, load_t5, save_image NSFW_THRESHOLD = 0.85 @dataclass class SamplingOptions: prompt: str width: int height: int num_steps: int guidance: float seed: int | None def parse_prompt(options: SamplingOptions) -> SamplingOptions | None: user_question = "Next prompt (write /h for help, /q to quit and leave empty to repeat):\n" usage = ( "Usage: Either write your prompt directly, leave this field empty " "to repeat the prompt or write a command starting with a slash:\n" "- '/w ' will set the width of the generated image\n" "- '/h ' will set the height of the generated image\n" "- '/s ' sets the next seed\n" "- '/g ' sets the guidance (flux-dev only)\n" "- '/n ' sets the 
number of steps\n" "- '/q' to quit" ) while (prompt := input(user_question)).startswith("/"): if prompt.startswith("/w"): if prompt.count(" ") != 1: print(f"Got invalid command '{prompt}'\n{usage}") continue _, width = prompt.split() options.width = 16 * (int(width) // 16) print( f"Setting resolution to {options.width} x {options.height} " f"({options.height *options.width/1e6:.2f}MP)" ) elif prompt.startswith("/h"): if prompt.count(" ") != 1: print(f"Got invalid command '{prompt}'\n{usage}") continue _, height = prompt.split() options.height = 16 * (int(height) // 16) print( f"Setting resolution to {options.width} x {options.height} " f"({options.height *options.width/1e6:.2f}MP)" ) elif prompt.startswith("/g"): if prompt.count(" ") != 1: print(f"Got invalid command '{prompt}'\n{usage}") continue _, guidance = prompt.split() options.guidance = float(guidance) print(f"Setting guidance to {options.guidance}") elif prompt.startswith("/s"): if prompt.count(" ") != 1: print(f"Got invalid command '{prompt}'\n{usage}") continue _, seed = prompt.split() options.seed = int(seed) print(f"Setting seed to {options.seed}") elif prompt.startswith("/n"): if prompt.count(" ") != 1: print(f"Got invalid command '{prompt}'\n{usage}") continue _, steps = prompt.split() options.num_steps = int(steps) print(f"Setting number of steps to {options.num_steps}") elif prompt.startswith("/q"): print("Quitting") return None else: if not prompt.startswith("/h"): print(f"Got invalid command '{prompt}'\n{usage}") print(usage) if prompt != "": options.prompt = prompt return options @torch.inference_mode() def main( name: str = "flux-schnell", width: int = 1360, height: int = 768, seed: int | None = None, prompt: str = ( "a photo of a forest with mist swirling around the tree trunks. 
The word " '"FLUX" is painted over it in big, red brush strokes with visible texture' ), device: str = "cuda" if torch.cuda.is_available() else "cpu", num_steps: int | None = None, loop: bool = False, guidance: float = 3.5, offload: bool = False, output_dir: str = "output", add_sampling_metadata: bool = True, ): """ Sample the flux model. Either interactively (set `--loop`) or run for a single image. Args: name: Name of the model to load height: height of the sample in pixels (should be a multiple of 16) width: width of the sample in pixels (should be a multiple of 16) seed: Set a seed for sampling output_name: where to save the output image, `{idx}` will be replaced by the index of the sample prompt: Prompt used for sampling device: Pytorch device num_steps: number of sampling steps (default 4 for schnell, 50 for guidance distilled) loop: start an interactive session and sample multiple times guidance: guidance value used for guidance distillation add_sampling_metadata: Add the prompt to the image Exif metadata """ nsfw_classifier = pipeline("image-classification", model="Falconsai/nsfw_image_detection", device=device) if name not in configs: available = ", ".join(configs.keys()) raise ValueError(f"Got unknown model name: {name}, chose from {available}") torch_device = torch.device(device) if num_steps is None: num_steps = 4 if name == "flux-schnell" else 50 # allow for packing and conversion to latent space height = 16 * (height // 16) width = 16 * (width // 16) output_name = os.path.join(output_dir, "img_{idx}.jpg") if not os.path.exists(output_dir): os.makedirs(output_dir) idx = 0 else: fns = [fn for fn in iglob(output_name.format(idx="*")) if re.search(r"img_[0-9]+\.jpg$", fn)] if len(fns) > 0: idx = max(int(fn.split("_")[-1].split(".")[0]) for fn in fns) + 1 else: idx = 0 # init all components t5 = load_t5(torch_device, max_length=256 if name == "flux-schnell" else 512) clip = load_clip(torch_device) model = load_flow_model(name, device="cpu" if offload else 
torch_device) ae = load_ae(name, device="cpu" if offload else torch_device) rng = torch.Generator(device="cpu") opts = SamplingOptions( prompt=prompt, width=width, height=height, num_steps=num_steps, guidance=guidance, seed=seed, ) if loop: opts = parse_prompt(opts) while opts is not None: if opts.seed is None: opts.seed = rng.seed() print(f"Generating with seed {opts.seed}:\n{opts.prompt}") t0 = time.perf_counter() # prepare input x = get_noise( 1, opts.height, opts.width, device=torch_device, dtype=torch.bfloat16, seed=opts.seed, ) opts.seed = None if offload: ae = ae.cpu() torch.cuda.empty_cache() t5, clip = t5.to(torch_device), clip.to(torch_device) inp = prepare(t5, clip, x, prompt=opts.prompt) timesteps = get_schedule(opts.num_steps, inp["img"].shape[1], shift=(name != "flux-schnell")) # offload TEs to CPU, load model to gpu if offload: t5, clip = t5.cpu(), clip.cpu() torch.cuda.empty_cache() model = model.to(torch_device) # denoise initial noise x = denoise_cache(model, **inp, timesteps=timesteps, guidance=opts.guidance) # offload model, load autoencoder to gpu if offload: model.cpu() torch.cuda.empty_cache() ae.decoder.to(x.device) # decode latents to pixel space x = unpack(x.float(), opts.height, opts.width) with torch.autocast(device_type=torch_device.type, dtype=torch.bfloat16): x = ae.decode(x) if torch.cuda.is_available(): torch.cuda.synchronize() t1 = time.perf_counter() fn = output_name.format(idx=idx) print(f"Done in {t1 - t0:.1f}s. 
Saving {fn}") idx = save_image(nsfw_classifier, name, output_name, idx, x, add_sampling_metadata, prompt) if loop: print("-" * 80) opts = parse_prompt(opts) else: opts = None def app(): Fire(main) if __name__ == "__main__": app() ================================================ FILE: flux-ToCa/src/flux/cli_control.py ================================================ import os import re import time from dataclasses import dataclass from glob import iglob import torch from fire import Fire from transformers import pipeline from flux.modules.image_embedders import CannyImageEncoder, DepthImageEncoder from flux.sampling import denoise, get_noise, get_schedule, prepare_control, unpack from flux.ideas import denoise_cache from flux.util import configs, load_ae, load_clip, load_flow_model, load_t5, save_image @dataclass class SamplingOptions: prompt: str width: int height: int num_steps: int guidance: float seed: int | None img_cond_path: str lora_scale: float | None def parse_prompt(options: SamplingOptions) -> SamplingOptions | None: user_question = "Next prompt (write /h for help, /q to quit and leave empty to repeat):\n" usage = ( "Usage: Either write your prompt directly, leave this field empty " "to repeat the prompt or write a command starting with a slash:\n" "- '/w ' will set the width of the generated image\n" "- '/h ' will set the height of the generated image\n" "- '/s ' sets the next seed\n" "- '/g ' sets the guidance (flux-dev only)\n" "- '/n ' sets the number of steps\n" "- '/q' to quit" ) while (prompt := input(user_question)).startswith("/"): if prompt.startswith("/w"): if prompt.count(" ") != 1: print(f"Got invalid command '{prompt}'\n{usage}") continue _, width = prompt.split() options.width = 16 * (int(width) // 16) print( f"Setting resolution to {options.width} x {options.height} " f"({options.height *options.width/1e6:.2f}MP)" ) elif prompt.startswith("/h"): if prompt.count(" ") != 1: print(f"Got invalid command '{prompt}'\n{usage}") continue _, 
height = prompt.split() options.height = 16 * (int(height) // 16) print( f"Setting resolution to {options.width} x {options.height} " f"({options.height *options.width/1e6:.2f}MP)" ) elif prompt.startswith("/g"): if prompt.count(" ") != 1: print(f"Got invalid command '{prompt}'\n{usage}") continue _, guidance = prompt.split() options.guidance = float(guidance) print(f"Setting guidance to {options.guidance}") elif prompt.startswith("/s"): if prompt.count(" ") != 1: print(f"Got invalid command '{prompt}'\n{usage}") continue _, seed = prompt.split() options.seed = int(seed) print(f"Setting seed to {options.seed}") elif prompt.startswith("/n"): if prompt.count(" ") != 1: print(f"Got invalid command '{prompt}'\n{usage}") continue _, steps = prompt.split() options.num_steps = int(steps) print(f"Setting number of steps to {options.num_steps}") elif prompt.startswith("/q"): print("Quitting") return None else: if not prompt.startswith("/h"): print(f"Got invalid command '{prompt}'\n{usage}") print(usage) if prompt != "": options.prompt = prompt return options def parse_img_cond_path(options: SamplingOptions | None) -> SamplingOptions | None: if options is None: return None user_question = "Next conditioning image (write /h for help, /q to quit and leave empty to repeat):\n" usage = ( "Usage: Either write your prompt directly, leave this field empty " "to repeat the conditioning image or write a command starting with a slash:\n" "- '/q' to quit" ) while True: img_cond_path = input(user_question) if img_cond_path.startswith("/"): if img_cond_path.startswith("/q"): print("Quitting") return None else: if not img_cond_path.startswith("/h"): print(f"Got invalid command '{img_cond_path}'\n{usage}") print(usage) continue if img_cond_path == "": break if not os.path.isfile(img_cond_path) or not img_cond_path.lower().endswith( (".jpg", ".jpeg", ".png", ".webp") ): print(f"File '{img_cond_path}' does not exist or is not a valid image file") continue options.img_cond_path = 
img_cond_path break return options def parse_lora_scale(options: SamplingOptions | None) -> tuple[SamplingOptions | None, bool]: changed = False if options is None: return None, changed user_question = "Next lora scale (write /h for help, /q to quit and leave empty to repeat):\n" usage = ( "Usage: Either write your prompt directly, leave this field empty " "to repeat the lora scale or write a command starting with a slash:\n" "- '/q' to quit" ) while (prompt := input(user_question)).startswith("/"): if prompt.startswith("/q"): print("Quitting") return None, changed else: if not prompt.startswith("/h"): print(f"Got invalid command '{prompt}'\n{usage}") print(usage) if prompt != "": options.lora_scale = float(prompt) changed = True return options, changed @torch.inference_mode() def main( name: str, width: int = 1024, height: int = 1024, seed: int | None = None, prompt: str = "a robot made out of gold", device: str = "cuda" if torch.cuda.is_available() else "cpu", num_steps: int = 50, loop: bool = False, guidance: float | None = None, offload: bool = False, output_dir: str = "output", add_sampling_metadata: bool = True, img_cond_path: str = "assets/robot.webp", lora_scale: float | None = 0.85, ): """ Sample the flux model. Either interactively (set `--loop`) or run for a single image. 
Args: height: height of the sample in pixels (should be a multiple of 16) width: width of the sample in pixels (should be a multiple of 16) seed: Set a seed for sampling output_name: where to save the output image, `{idx}` will be replaced by the index of the sample prompt: Prompt used for sampling device: Pytorch device num_steps: number of sampling steps (default 4 for schnell, 50 for guidance distilled) loop: start an interactive session and sample multiple times guidance: guidance value used for guidance distillation add_sampling_metadata: Add the prompt to the image Exif metadata img_cond_path: path to conditioning image (jpeg/png/webp) """ nsfw_classifier = pipeline("image-classification", model="Falconsai/nsfw_image_detection", device=device) assert name in [ "flux-dev-canny", "flux-dev-depth", "flux-dev-canny-lora", "flux-dev-depth-lora", ], f"Got unknown model name: {name}" if guidance is None: if name in ["flux-dev-canny", "flux-dev-canny-lora"]: guidance = 30.0 elif name in ["flux-dev-depth", "flux-dev-depth-lora"]: guidance = 10.0 else: raise NotImplementedError() if name not in configs: available = ", ".join(configs.keys()) raise ValueError(f"Got unknown model name: {name}, chose from {available}") torch_device = torch.device(device) output_name = os.path.join(output_dir, "img_{idx}.jpg") if not os.path.exists(output_dir): os.makedirs(output_dir) idx = 0 else: fns = [fn for fn in iglob(output_name.format(idx="*")) if re.search(r"img_[0-9]+\.jpg$", fn)] if len(fns) > 0: idx = max(int(fn.split("_")[-1].split(".")[0]) for fn in fns) + 1 else: idx = 0 # init all components t5 = load_t5(torch_device, max_length=512) clip = load_clip(torch_device) model = load_flow_model(name, device="cpu" if offload else torch_device) ae = load_ae(name, device="cpu" if offload else torch_device) # set lora scale if "lora" in name and lora_scale is not None: for _, module in model.named_modules(): if hasattr(module, "set_scale"): module.set_scale(lora_scale) if name in 
["flux-dev-depth", "flux-dev-depth-lora"]: img_embedder = DepthImageEncoder(torch_device) elif name in ["flux-dev-canny", "flux-dev-canny-lora"]: img_embedder = CannyImageEncoder(torch_device) else: raise NotImplementedError() rng = torch.Generator(device="cpu") opts = SamplingOptions( prompt=prompt, width=width, height=height, num_steps=num_steps, guidance=guidance, seed=seed, img_cond_path=img_cond_path, lora_scale=lora_scale, ) if loop: opts = parse_prompt(opts) opts = parse_img_cond_path(opts) if "lora" in name: opts, changed = parse_lora_scale(opts) if changed: # update the lora scale: for _, module in model.named_modules(): if hasattr(module, "set_scale"): module.set_scale(opts.lora_scale) while opts is not None: if opts.seed is None: opts.seed = rng.seed() print(f"Generating with seed {opts.seed}:\n{opts.prompt}") t0 = time.perf_counter() # prepare input x = get_noise( 1, opts.height, opts.width, device=torch_device, dtype=torch.bfloat16, seed=opts.seed, ) opts.seed = None if offload: t5, clip, ae = t5.to(torch_device), clip.to(torch_device), ae.to(torch_device) inp = prepare_control( t5, clip, x, prompt=opts.prompt, ae=ae, encoder=img_embedder, img_cond_path=opts.img_cond_path, ) timesteps = get_schedule(opts.num_steps, inp["img"].shape[1], shift=(name != "flux-schnell")) # offload TEs and AE to CPU, load model to gpu if offload: t5, clip, ae = t5.cpu(), clip.cpu(), ae.cpu() torch.cuda.empty_cache() model = model.to(torch_device) # denoise initial noise x = denoise_cache(model, **inp, timesteps=timesteps, guidance=opts.guidance) # offload model, load autoencoder to gpu if offload: model.cpu() torch.cuda.empty_cache() ae.decoder.to(x.device) # decode latents to pixel space x = unpack(x.float(), opts.height, opts.width) with torch.autocast(device_type=torch_device.type, dtype=torch.bfloat16): x = ae.decode(x) if torch.cuda.is_available(): torch.cuda.synchronize() t1 = time.perf_counter() print(f"Done in {t1 - t0:.1f}s") idx = save_image(nsfw_classifier, name, 
output_name, idx, x, add_sampling_metadata, prompt) if loop: print("-" * 80) opts = parse_prompt(opts) opts = parse_img_cond_path(opts) if "lora" in name: opts, changed = parse_lora_scale(opts) if changed: # update the lora scale: for _, module in model.named_modules(): if hasattr(module, "set_scale"): module.set_scale(opts.lora_scale) else: opts = None def app(): Fire(main) if __name__ == "__main__": app() ================================================ FILE: flux-ToCa/src/flux/cli_fill.py ================================================ import os import re import time from dataclasses import dataclass from glob import iglob import torch from fire import Fire from PIL import Image from transformers import pipeline from flux.sampling import denoise, get_noise, get_schedule, prepare_fill, unpack from flux.ideas import denoise_cache from flux.util import configs, load_ae, load_clip, load_flow_model, load_t5, save_image @dataclass class SamplingOptions: prompt: str width: int height: int num_steps: int guidance: float seed: int | None img_cond_path: str img_mask_path: str def parse_prompt(options: SamplingOptions) -> SamplingOptions | None: user_question = "Next prompt (write /h for help, /q to quit and leave empty to repeat):\n" usage = ( "Usage: Either write your prompt directly, leave this field empty " "to repeat the prompt or write a command starting with a slash:\n" "- '/s ' sets the next seed\n" "- '/g ' sets the guidance (flux-dev only)\n" "- '/n ' sets the number of steps\n" "- '/q' to quit" ) while (prompt := input(user_question)).startswith("/"): if prompt.startswith("/g"): if prompt.count(" ") != 1: print(f"Got invalid command '{prompt}'\n{usage}") continue _, guidance = prompt.split() options.guidance = float(guidance) print(f"Setting guidance to {options.guidance}") elif prompt.startswith("/s"): if prompt.count(" ") != 1: print(f"Got invalid command '{prompt}'\n{usage}") continue _, seed = prompt.split() options.seed = int(seed) print(f"Setting seed to 
{options.seed}") elif prompt.startswith("/n"): if prompt.count(" ") != 1: print(f"Got invalid command '{prompt}'\n{usage}") continue _, steps = prompt.split() options.num_steps = int(steps) print(f"Setting number of steps to {options.num_steps}") elif prompt.startswith("/q"): print("Quitting") return None else: if not prompt.startswith("/h"): print(f"Got invalid command '{prompt}'\n{usage}") print(usage) if prompt != "": options.prompt = prompt return options def parse_img_cond_path(options: SamplingOptions | None) -> SamplingOptions | None: if options is None: return None user_question = "Next conditioning image (write /h for help, /q to quit and leave empty to repeat):\n" usage = ( "Usage: Either write your prompt directly, leave this field empty " "to repeat the conditioning image or write a command starting with a slash:\n" "- '/q' to quit" ) while True: img_cond_path = input(user_question) if img_cond_path.startswith("/"): if img_cond_path.startswith("/q"): print("Quitting") return None else: if not img_cond_path.startswith("/h"): print(f"Got invalid command '{img_cond_path}'\n{usage}") print(usage) continue if img_cond_path == "": break if not os.path.isfile(img_cond_path) or not img_cond_path.lower().endswith( (".jpg", ".jpeg", ".png", ".webp") ): print(f"File '{img_cond_path}' does not exist or is not a valid image file") continue else: with Image.open(img_cond_path) as img: width, height = img.size if width % 32 != 0 or height % 32 != 0: print(f"Image dimensions must be divisible by 32, got {width}x{height}") continue options.img_cond_path = img_cond_path break return options def parse_img_mask_path(options: SamplingOptions | None) -> SamplingOptions | None: if options is None: return None user_question = "Next conditioning mask (write /h for help, /q to quit and leave empty to repeat):\n" usage = ( "Usage: Either write your prompt directly, leave this field empty " "to repeat the conditioning mask or write a command starting with a slash:\n" "- '/q' to 
quit" ) while True: img_mask_path = input(user_question) if img_mask_path.startswith("/"): if img_mask_path.startswith("/q"): print("Quitting") return None else: if not img_mask_path.startswith("/h"): print(f"Got invalid command '{img_mask_path}'\n{usage}") print(usage) continue if img_mask_path == "": break if not os.path.isfile(img_mask_path) or not img_mask_path.lower().endswith( (".jpg", ".jpeg", ".png", ".webp") ): print(f"File '{img_mask_path}' does not exist or is not a valid image file") continue else: with Image.open(img_mask_path) as img: width, height = img.size if width % 32 != 0 or height % 32 != 0: print(f"Image dimensions must be divisible by 32, got {width}x{height}") continue else: with Image.open(options.img_cond_path) as img_cond: img_cond_width, img_cond_height = img_cond.size if width != img_cond_width or height != img_cond_height: print( f"Mask dimensions must match conditioning image, got {width}x{height} and {img_cond_width}x{img_cond_height}" ) continue options.img_mask_path = img_mask_path break return options @torch.inference_mode() def main( seed: int | None = None, prompt: str = "a white paper cup", device: str = "cuda" if torch.cuda.is_available() else "cpu", num_steps: int = 50, loop: bool = False, guidance: float = 30.0, offload: bool = False, output_dir: str = "output", add_sampling_metadata: bool = True, img_cond_path: str = "assets/cup.png", img_mask_path: str = "assets/cup_mask.png", ): """ Sample the flux model. Either interactively (set `--loop`) or run for a single image. This demo assumes that the conditioning image and mask have the same shape and that height and width are divisible by 32. 
Args: seed: Set a seed for sampling output_name: where to save the output image, `{idx}` will be replaced by the index of the sample prompt: Prompt used for sampling device: Pytorch device num_steps: number of sampling steps (default 4 for schnell, 50 for guidance distilled) loop: start an interactive session and sample multiple times guidance: guidance value used for guidance distillation add_sampling_metadata: Add the prompt to the image Exif metadata img_cond_path: path to conditioning image (jpeg/png/webp) img_mask_path: path to conditioning mask (jpeg/png/webp """ nsfw_classifier = pipeline("image-classification", model="Falconsai/nsfw_image_detection", device=device) name = "flux-dev-fill" if name not in configs: available = ", ".join(configs.keys()) raise ValueError(f"Got unknown model name: {name}, chose from {available}") torch_device = torch.device(device) output_name = os.path.join(output_dir, "img_{idx}.jpg") if not os.path.exists(output_dir): os.makedirs(output_dir) idx = 0 else: fns = [fn for fn in iglob(output_name.format(idx="*")) if re.search(r"img_[0-9]+\.jpg$", fn)] if len(fns) > 0: idx = max(int(fn.split("_")[-1].split(".")[0]) for fn in fns) + 1 else: idx = 0 # init all components t5 = load_t5(torch_device, max_length=128) clip = load_clip(torch_device) model = load_flow_model(name, device="cpu" if offload else torch_device) ae = load_ae(name, device="cpu" if offload else torch_device) rng = torch.Generator(device="cpu") with Image.open(img_cond_path) as img: width, height = img.size opts = SamplingOptions( prompt=prompt, width=width, height=height, num_steps=num_steps, guidance=guidance, seed=seed, img_cond_path=img_cond_path, img_mask_path=img_mask_path, ) if loop: opts = parse_prompt(opts) opts = parse_img_cond_path(opts) with Image.open(opts.img_cond_path) as img: width, height = img.size opts.height = height opts.width = width opts = parse_img_mask_path(opts) while opts is not None: if opts.seed is None: opts.seed = rng.seed() 
print(f"Generating with seed {opts.seed}:\n{opts.prompt}") t0 = time.perf_counter() # prepare input x = get_noise( 1, opts.height, opts.width, device=torch_device, dtype=torch.bfloat16, seed=opts.seed, ) opts.seed = None if offload: t5, clip, ae = t5.to(torch_device), clip.to(torch_device), ae.to(torch_device) inp = prepare_fill( t5, clip, x, prompt=opts.prompt, ae=ae, img_cond_path=opts.img_cond_path, mask_path=opts.img_mask_path, ) timesteps = get_schedule(opts.num_steps, inp["img"].shape[1], shift=(name != "flux-schnell")) # offload TEs and AE to CPU, load model to gpu if offload: t5, clip, ae = t5.cpu(), clip.cpu(), ae.cpu() torch.cuda.empty_cache() model = model.to(torch_device) # denoise initial noise x = denoise_cache(model, **inp, timesteps=timesteps, guidance=opts.guidance) # offload model, load autoencoder to gpu if offload: model.cpu() torch.cuda.empty_cache() ae.decoder.to(x.device) # decode latents to pixel space x = unpack(x.float(), opts.height, opts.width) with torch.autocast(device_type=torch_device.type, dtype=torch.bfloat16): x = ae.decode(x) if torch.cuda.is_available(): torch.cuda.synchronize() t1 = time.perf_counter() print(f"Done in {t1 - t0:.1f}s") idx = save_image(nsfw_classifier, name, output_name, idx, x, add_sampling_metadata, prompt) if loop: print("-" * 80) opts = parse_prompt(opts) opts = parse_img_cond_path(opts) with Image.open(opts.img_cond_path) as img: width, height = img.size opts.height = height opts.width = width opts = parse_img_mask_path(opts) else: opts = None def app(): Fire(main) if __name__ == "__main__": app() ================================================ FILE: flux-ToCa/src/flux/cli_redux.py ================================================ import os import re import time from dataclasses import dataclass from glob import iglob import torch from fire import Fire from transformers import pipeline from flux.modules.image_embedders import ReduxImageEncoder from flux.sampling import denoise, get_noise, get_schedule, 
prepare_redux, unpack from flux.ideas import denoise_cache from flux.util import configs, load_ae, load_clip, load_flow_model, load_t5, save_image @dataclass class SamplingOptions: prompt: str width: int height: int num_steps: int guidance: float seed: int | None img_cond_path: str def parse_prompt(options: SamplingOptions) -> SamplingOptions | None: user_question = "Write /h for help, /q to quit and leave empty to repeat):\n" usage = ( "Usage: Leave this field empty to do nothing " "or write a command starting with a slash:\n" "- '/w ' will set the width of the generated image\n" "- '/h ' will set the height of the generated image\n" "- '/s ' sets the next seed\n" "- '/g ' sets the guidance (flux-dev only)\n" "- '/n ' sets the number of steps\n" "- '/q' to quit" ) while (prompt := input(user_question)).startswith("/"): if prompt.startswith("/w"): if prompt.count(" ") != 1: print(f"Got invalid command '{prompt}'\n{usage}") continue _, width = prompt.split() options.width = 16 * (int(width) // 16) print( f"Setting resolution to {options.width} x {options.height} " f"({options.height *options.width/1e6:.2f}MP)" ) elif prompt.startswith("/h"): if prompt.count(" ") != 1: print(f"Got invalid command '{prompt}'\n{usage}") continue _, height = prompt.split() options.height = 16 * (int(height) // 16) print( f"Setting resolution to {options.width} x {options.height} " f"({options.height *options.width/1e6:.2f}MP)" ) elif prompt.startswith("/g"): if prompt.count(" ") != 1: print(f"Got invalid command '{prompt}'\n{usage}") continue _, guidance = prompt.split() options.guidance = float(guidance) print(f"Setting guidance to {options.guidance}") elif prompt.startswith("/s"): if prompt.count(" ") != 1: print(f"Got invalid command '{prompt}'\n{usage}") continue _, seed = prompt.split() options.seed = int(seed) print(f"Setting seed to {options.seed}") elif prompt.startswith("/n"): if prompt.count(" ") != 1: print(f"Got invalid command '{prompt}'\n{usage}") continue _, steps = 
def parse_img_cond_path(options: SamplingOptions | None) -> SamplingOptions | None:
    """Interactively ask for the next conditioning image path.

    Args:
        options: Current sampling options, or None if the user already quit
            in an earlier prompt.

    Returns:
        `options` (with `img_cond_path` updated when a new valid image path
        was entered, unchanged when the line was left empty), or None when
        `options` was None or the user typed '/q'.

    Input starting with '/' is treated as a command only when it does not name
    an existing file, so absolute POSIX paths such as '/home/me/cat.png' are
    accepted as image paths instead of being rejected as unknown commands.
    """
    if options is None:
        return None

    user_question = "Next conditioning image (write /h for help, /q to quit and leave empty to repeat):\n"
    usage = (
        "Usage: Either write the path to a conditioning image, leave this field empty "
        "to repeat the conditioning image or write a command starting with a slash:\n"
        "- '/q' to quit"
    )
    image_suffixes = (".jpg", ".jpeg", ".png", ".webp")

    while True:
        img_cond_path = input(user_question)

        # Empty line: keep the previous conditioning image.
        if img_cond_path == "":
            return options

        is_file = os.path.isfile(img_cond_path)

        # Slash-prefixed input that is not a real file is a command.
        if img_cond_path.startswith("/") and not is_file:
            if img_cond_path.startswith("/q"):
                print("Quitting")
                return None
            if not img_cond_path.startswith("/h"):
                print(f"Got invalid command '{img_cond_path}'")
            # Print the usage exactly once, for '/h' and for unknown commands.
            print(usage)
            continue

        if not is_file or not img_cond_path.lower().endswith(image_suffixes):
            print(f"File '{img_cond_path}' does not exist or is not a valid image file")
            continue

        options.img_cond_path = img_cond_path
        return options
Args: name: Name of the model to load height: height of the sample in pixels (should be a multiple of 16) width: width of the sample in pixels (should be a multiple of 16) seed: Set a seed for sampling output_name: where to save the output image, `{idx}` will be replaced by the index of the sample prompt: Prompt used for sampling device: Pytorch device num_steps: number of sampling steps (default 4 for schnell, 50 for guidance distilled) loop: start an interactive session and sample multiple times guidance: guidance value used for guidance distillation add_sampling_metadata: Add the prompt to the image Exif metadata img_cond_path: path to conditioning image (jpeg/png/webp) """ nsfw_classifier = pipeline("image-classification", model="Falconsai/nsfw_image_detection", device=device) if name not in configs: available = ", ".join(configs.keys()) raise ValueError(f"Got unknown model name: {name}, chose from {available}") torch_device = torch.device(device) if num_steps is None: num_steps = 4 if name == "flux-schnell" else 50 output_name = os.path.join(output_dir, "img_{idx}.jpg") if not os.path.exists(output_dir): os.makedirs(output_dir) idx = 0 else: fns = [fn for fn in iglob(output_name.format(idx="*")) if re.search(r"img_[0-9]+\.jpg$", fn)] if len(fns) > 0: idx = max(int(fn.split("_")[-1].split(".")[0]) for fn in fns) + 1 else: idx = 0 # init all components t5 = load_t5(torch_device, max_length=256 if name == "flux-schnell" else 512) clip = load_clip(torch_device) model = load_flow_model(name, device="cpu" if offload else torch_device) ae = load_ae(name, device="cpu" if offload else torch_device) img_embedder = ReduxImageEncoder(torch_device) rng = torch.Generator(device="cpu") prompt = "" opts = SamplingOptions( prompt=prompt, width=width, height=height, num_steps=num_steps, guidance=guidance, seed=seed, img_cond_path=img_cond_path, ) if loop: opts = parse_prompt(opts) opts = parse_img_cond_path(opts) while opts is not None: if opts.seed is None: opts.seed = 
def denoise_cache(
    model: Flux,
    # model input
    img: Tensor,
    img_ids: Tensor,
    txt: Tensor,
    txt_ids: Tensor,
    vec: Tensor,
    # sampling parameters
    timesteps: list[float],
    guidance: float = 4.0,
):
    """Integrate the model's prediction over `timesteps` with cache bookkeeping.

    A cache dictionary and a `current` state dictionary are created once via
    `cache_init` and passed to every model call. For each consecutive pair
    (t_curr, t_prev) of `timesteps` the latent is advanced by
    `img + (t_prev - t_curr) * pred`.

    Args:
        model: Network called with the latents, ids, conditioning and cache state.
        img, img_ids, txt, txt_ids, vec: Model inputs, forwarded unchanged
            (`vec` is passed as `y`).
        timesteps: Sampling schedule; one model call per consecutive pair.
        guidance: Scalar broadcast to one guidance value per batch element.

    Returns:
        The latent tensor after the last step.
    """
    cache_dic, current = cache_init(timesteps)

    batch = img.shape[0]
    # this is ignored for schnell
    guidance_vec = torch.full((batch,), guidance, device=img.device, dtype=img.dtype)

    total_steps = len(timesteps) - 1
    current['step'] = 0
    current['num_steps'] = total_steps

    # Arguments that stay the same for every model call.
    static_inputs = dict(
        img_ids=img_ids,
        txt=txt,
        txt_ids=txt_ids,
        y=vec,
        cache_dic=cache_dic,
        current=current,
        guidance=guidance_vec,
    )

    for idx in range(total_steps):
        t_curr = timesteps[idx]
        t_prev = timesteps[idx + 1]

        t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
        current['t'] = t_curr

        pred = model(img=img, timesteps=t_vec, **static_inputs)

        img = img + (t_prev - t_curr) * pred
        current['step'] += 1

    return img
def dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0,
        is_causal=False, scale=None, enable_gqa=False) -> tuple[torch.Tensor, torch.Tensor]:
    """Explicit scaled dot-product attention that also returns a token score.

    Unlike the fused `scaled_dot_product_attention`, the softmaxed attention
    map is materialised so that it can be reduced into a per-token score.

    Args:
        query: Tensor whose last two dims are (L, E).
        key: Tensor whose last two dims are (S, E).
        value: Tensor whose last two dims are (S, Ev).
        attn_mask: Optional mask broadcastable to the attention logits.
            Boolean masks keep positions that are True; other dtypes are
            added to the logits.
        dropout_p: Dropout probability applied to the attention map. Dropout
            is always applied in training mode, so 0.0 disables it.
        is_causal: Apply a lower-triangular causal mask; `attn_mask` must be
            None in that case.
        scale: Logit scale; defaults to 1 / sqrt(E).
        enable_gqa: Repeat key/value heads along dim -3 to match the query.

    Returns:
        `(output, score)`: `output` is the attention result and `score` is
        the pre-dropout attention map averaged over dim 1 twice (heads, then
        query positions, for a (B, H, L, S) map), i.e. shape (B, S).
    """
    L, S = query.size(-2), key.size(-2)
    scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale

    if enable_gqa:
        key = key.repeat_interleave(query.size(-3) // key.size(-3), -3)
        value = value.repeat_interleave(query.size(-3) // value.size(-3), -3)

    attn_weight = torch.matmul(query, key.transpose(-2, -1)) * scale_factor

    if is_causal:
        assert attn_mask is None
        # Build the mask on the query's device so GPU inputs do not hit a
        # CPU/GPU mismatch.
        causal_mask = torch.ones(L, S, dtype=torch.bool, device=query.device).tril(diagonal=0)
        attn_weight = attn_weight.masked_fill(causal_mask.logical_not(), float("-inf"))

    if attn_mask is not None:
        # Out-of-place ops let masks with batch/head dims broadcast against
        # the logits instead of failing against a fixed (L, S) bias buffer.
        if attn_mask.dtype == torch.bool:
            attn_weight = attn_weight.masked_fill(attn_mask.logical_not(), float("-inf"))
        else:
            attn_weight = attn_weight + attn_mask.to(attn_weight.dtype)

    attn_map = torch.softmax(attn_weight, dim=-1)
    attn_weight = torch.dropout(attn_map, dropout_p, train=True)
    return torch.matmul(attn_weight, value), attn_map.mean(dim=1).mean(dim=1)
@dataclass
class FluxParams:
    """Hyper-parameters consumed by `Flux.__init__`."""

    # Input feature size of the `img_in` linear projection.
    in_channels: int
    # Output channel count handed to the final `LastLayer`.
    out_channels: int
    # Input size of the `vector_in` MLP embedder (applied to `y` in forward).
    vec_in_dim: int
    # Input feature size of the `txt_in` linear projection.
    context_in_dim: int
    # Transformer width; must be divisible by `num_heads`.
    hidden_size: int
    # Forwarded as `mlp_ratio` to every double- and single-stream block.
    mlp_ratio: float
    # Number of attention heads; `hidden_size // num_heads` is the positional dim.
    num_heads: int
    # Number of `DoubleStreamBlock`s.
    depth: int
    # Number of `SingleStreamBlock`s.
    depth_single_blocks: int
    # Per-axis positional-embedding sizes; their sum must equal
    # `hidden_size // num_heads`.
    axes_dim: list[int]
    # `theta` argument passed to the `EmbedND` positional embedder.
    theta: int
    # Forwarded as `qkv_bias` to the double-stream blocks.
    qkv_bias: bool
    # When True a guidance MLP embedder is created and `forward` requires
    # `guidance`; otherwise `guidance_in` is an identity module.
    guidance_embed: bool
SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio) for _ in range(params.depth_single_blocks) ] ) self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels) def forward( self, img: Tensor, img_ids: Tensor, txt: Tensor, txt_ids: Tensor, timesteps: Tensor, y: Tensor, guidance: Tensor | None = None, *args, **kwargs, ) -> Tensor: if img.ndim != 3 or txt.ndim != 3: raise ValueError("Input img and txt tensors must have 3 dimensions.") cache_dic = kwargs.get('cache_dic', None) current = kwargs.get('current', None) # running on sequences img img = self.img_in(img) vec = self.time_in(timestep_embedding(timesteps, 256)) if self.params.guidance_embed: if guidance is None: raise ValueError("Didn't get guidance strength for guidance distilled model.") vec = vec + self.guidance_in(timestep_embedding(guidance, 256)) vec = vec + self.vector_in(y) txt = self.txt_in(txt) ids = torch.cat((txt_ids, img_ids), dim=1) pe = self.pe_embedder(ids) cal_type(cache_dic=cache_dic, current=current) for i, block in enumerate(self.double_blocks): current['layer'] = i img, txt = block(img=img, txt=txt, vec=vec, pe=pe, cache_dic=cache_dic, current=current) img = torch.cat((txt, img), 1) for i, block in enumerate(self.single_blocks): current['layer'] = i img = block(img, vec=vec, pe=pe, cache_dic=cache_dic, current=current) img = img[:, txt.shape[1] :, ...] 
img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels) return img class FluxLoraWrapper(Flux): def __init__( self, lora_rank: int = 128, lora_scale: float = 1.0, *args, **kwargs, ) -> None: super().__init__(*args, **kwargs) self.lora_rank = lora_rank replace_linear_with_lora( self, max_rank=lora_rank, scale=lora_scale, ) def set_lora_scale(self, scale: float) -> None: for module in self.modules(): if isinstance(module, LinearLora): module.set_scale(scale=scale) ================================================ FILE: flux-ToCa/src/flux/modules/autoencoder.py ================================================ from dataclasses import dataclass import torch from einops import rearrange from torch import Tensor, nn @dataclass class AutoEncoderParams: resolution: int in_channels: int ch: int out_ch: int ch_mult: list[int] num_res_blocks: int z_channels: int scale_factor: float shift_factor: float def swish(x: Tensor) -> Tensor: return x * torch.sigmoid(x) class AttnBlock(nn.Module): def __init__(self, in_channels: int): super().__init__() self.in_channels = in_channels self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1) self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1) self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1) self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1) def attention(self, h_: Tensor) -> Tensor: h_ = self.norm(h_) q = self.q(h_) k = self.k(h_) v = self.v(h_) b, c, h, w = q.shape q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous() k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous() v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous() h_ = nn.functional.scaled_dot_product_attention(q, k, v) return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b) def forward(self, x: Tensor) -> Tensor: return x + self.proj_out(self.attention(x)) class ResnetBlock(nn.Module): def __init__(self, in_channels: 
int, out_channels: int): super().__init__() self.in_channels = in_channels out_channels = in_channels if out_channels is None else out_channels self.out_channels = out_channels self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True) self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) if self.in_channels != self.out_channels: self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) def forward(self, x): h = x h = self.norm1(h) h = swish(h) h = self.conv1(h) h = self.norm2(h) h = swish(h) h = self.conv2(h) if self.in_channels != self.out_channels: x = self.nin_shortcut(x) return x + h class Downsample(nn.Module): def __init__(self, in_channels: int): super().__init__() # no asymmetric padding in torch conv, must do it ourselves self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0) def forward(self, x: Tensor): pad = (0, 1, 0, 1) x = nn.functional.pad(x, pad, mode="constant", value=0) x = self.conv(x) return x class Upsample(nn.Module): def __init__(self, in_channels: int): super().__init__() self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) def forward(self, x: Tensor): x = nn.functional.interpolate(x, scale_factor=2.0, mode="nearest") x = self.conv(x) return x class Encoder(nn.Module): def __init__( self, resolution: int, in_channels: int, ch: int, ch_mult: list[int], num_res_blocks: int, z_channels: int, ): super().__init__() self.ch = ch self.num_resolutions = len(ch_mult) self.num_res_blocks = num_res_blocks self.resolution = resolution self.in_channels = in_channels # downsampling self.conv_in = nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1) curr_res = resolution in_ch_mult = (1,) + tuple(ch_mult) 
self.in_ch_mult = in_ch_mult self.down = nn.ModuleList() block_in = self.ch for i_level in range(self.num_resolutions): block = nn.ModuleList() attn = nn.ModuleList() block_in = ch * in_ch_mult[i_level] block_out = ch * ch_mult[i_level] for _ in range(self.num_res_blocks): block.append(ResnetBlock(in_channels=block_in, out_channels=block_out)) block_in = block_out down = nn.Module() down.block = block down.attn = attn if i_level != self.num_resolutions - 1: down.downsample = Downsample(block_in) curr_res = curr_res // 2 self.down.append(down) # middle self.mid = nn.Module() self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in) self.mid.attn_1 = AttnBlock(block_in) self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in) # end self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True) self.conv_out = nn.Conv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1) def forward(self, x: Tensor) -> Tensor: # downsampling hs = [self.conv_in(x)] for i_level in range(self.num_resolutions): for i_block in range(self.num_res_blocks): h = self.down[i_level].block[i_block](hs[-1]) if len(self.down[i_level].attn) > 0: h = self.down[i_level].attn[i_block](h) hs.append(h) if i_level != self.num_resolutions - 1: hs.append(self.down[i_level].downsample(hs[-1])) # middle h = hs[-1] h = self.mid.block_1(h) h = self.mid.attn_1(h) h = self.mid.block_2(h) # end h = self.norm_out(h) h = swish(h) h = self.conv_out(h) return h class Decoder(nn.Module): def __init__( self, ch: int, out_ch: int, ch_mult: list[int], num_res_blocks: int, in_channels: int, resolution: int, z_channels: int, ): super().__init__() self.ch = ch self.num_resolutions = len(ch_mult) self.num_res_blocks = num_res_blocks self.resolution = resolution self.in_channels = in_channels self.ffactor = 2 ** (self.num_resolutions - 1) # compute in_ch_mult, block_in and curr_res at lowest res block_in = ch * ch_mult[self.num_resolutions - 1] curr_res 
class DiagonalGaussian(nn.Module):
    """Turn stacked (mean, log-variance) features into a latent.

    The input is split in two halves along `chunk_dim`. The first half is the
    mean and the second the log-variance. With `sample=True` a reparameterised
    sample `mean + std * eps` is returned, otherwise the mean itself.
    """

    def __init__(self, sample: bool = True, chunk_dim: int = 1):
        super().__init__()
        self.sample = sample
        self.chunk_dim = chunk_dim

    def forward(self, z: Tensor) -> Tensor:
        mu, log_variance = torch.chunk(z, 2, dim=self.chunk_dim)

        # Deterministic mode: the mean is the latent.
        if not self.sample:
            return mu

        sigma = torch.exp(0.5 * log_variance)
        noise = torch.randn_like(mu)
        return mu + sigma * noise
def cached_attention_forward(
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        attn_bias=None,
        p: float = 0.0,
        scale: Optional[float] = None
        ) -> tuple[torch.Tensor, torch.Tensor]:
    """Attention that also returns the head-averaged attention map.

    Args:
        query, key, value: Tensors whose dims 1 and 2 are swapped before the
            matmul (presumably (batch, tokens, heads, head_dim) — confirm
            against the caller).
        attn_bias: Optional additive bias. Objects exposing `materialize`
            (as xformers attention-bias classes do) are materialised to the
            logits' shape; plain tensors are added directly.
        p: Dropout probability applied to the attention map; `F.dropout` is
            called with its default training mode.
        scale: Logit scale; defaults to 1 / sqrt(head_dim). An explicit value
            was previously overwritten and ignored.

    Returns:
        `(output, attn_map)`: `output` has dims 1 and 2 swapped back and is
        contiguous; `attn_map` is the pre-dropout softmax map averaged over
        dim 1 (the head dim after the swap).
    """
    if scale is None:
        scale = 1.0 / query.shape[-1] ** 0.5
    query = query * scale

    query = query.transpose(1, 2)
    key = key.transpose(1, 2)
    value = value.transpose(1, 2)
    attn = query @ key.transpose(-2, -1)

    if attn_bias is not None:
        if hasattr(attn_bias, "materialize"):
            attn_bias = attn_bias.materialize(shape=attn.shape, dtype=attn.dtype, device=attn.device)
        else:
            # Plain tensor bias: align dtype/device and rely on broadcasting.
            attn_bias = attn_bias.to(dtype=attn.dtype, device=attn.device)
        attn = attn + attn_bias

    attn_map = attn.softmax(-1)
    attn = F.dropout(attn_map, p)
    attn = attn @ value
    return attn.transpose(1, 2).contiguous(), attn_map.mean(dim=1)
def local_selection_with_bonus(score, bonus_ratio, grid_size=2):
    """Boost the best-scoring token of every `grid_size` x `grid_size` block.

    The (batch, tokens) score is interpreted as a square token grid of side
    `int(tokens ** 0.5)` and tiled into non-overlapping blocks. In each block
    the maximum score is increased by `bonus_ratio` times itself; all other
    scores are returned unchanged, in the original token order.
    """
    batch_size, num_tokens = score.shape
    image_size = int(num_tokens ** 0.5)
    block_size = grid_size * grid_size

    assert num_tokens % block_size == 0, "The number of tokens must be divisible by the block size."

    blocks_per_side = image_size // grid_size

    # Gather the tokens of each block into the last dimension:
    # (batch, num_blocks, block_size).
    blocks = (
        score.view(batch_size, blocks_per_side, grid_size, blocks_per_side, grid_size)
        .permute(0, 1, 3, 2, 4)
        .contiguous()
        .view(batch_size, -1, block_size)
    )

    # Per-block winner and a one-hot mask marking its position.
    block_max, block_argmax = blocks.max(dim=-1, keepdim=True)
    winner_mask = torch.zeros_like(blocks)
    winner_mask.scatter_(-1, block_argmax, 1)

    # Only the winners receive the bonus.
    boosted = blocks + (winner_mask * block_max * bonus_ratio)

    # Undo the block grouping to restore the flat token order.
    restored = (
        boosted.view(batch_size, blocks_per_side, blocks_per_side, grid_size, grid_size)
        .permute(0, 1, 3, 2, 4)
        .contiguous()
        .view(batch_size, num_tokens)
    )
    return restored
''' cache_dic = {} cache = {} cache_index = {} cache[-1]={} cache_index[-1]={} cache_index['layer_index']={} cache_dic['attn_map'] = {} cache_dic['attn_map'][-1] = {} cache_dic['attn_map'][-1]['double_stream'] = {} cache_dic['attn_map'][-1]['single_stream'] = {} cache_dic['k-norm'] = {} cache_dic['k-norm'][-1] = {} cache_dic['k-norm'][-1]['double_stream'] = {} cache_dic['k-norm'][-1]['single_stream'] = {} cache_dic['v-norm'] = {} cache_dic['v-norm'][-1] = {} cache_dic['v-norm'][-1]['double_stream'] = {} cache_dic['v-norm'][-1]['single_stream'] = {} cache_dic['cross_attn_map'] = {} cache_dic['cross_attn_map'][-1] = {} cache[-1]['double_stream']={} cache[-1]['single_stream']={} cache_dic['cache_counter'] = 0 for j in range(19): cache[-1]['double_stream'][j] = {} cache_index[-1][j] = {} cache_dic['attn_map'][-1]['double_stream'][j] = {} cache_dic['attn_map'][-1]['double_stream'][j]['total'] = {} cache_dic['attn_map'][-1]['double_stream'][j]['txt_mlp'] = {} cache_dic['attn_map'][-1]['double_stream'][j]['img_mlp'] = {} cache_dic['k-norm'][-1]['double_stream'][j] = {} cache_dic['k-norm'][-1]['double_stream'][j]['txt_mlp'] = {} cache_dic['k-norm'][-1]['double_stream'][j]['img_mlp'] = {} cache_dic['v-norm'][-1]['double_stream'][j] = {} cache_dic['v-norm'][-1]['double_stream'][j]['txt_mlp'] = {} cache_dic['v-norm'][-1]['double_stream'][j]['img_mlp'] = {} for j in range(38): cache[-1]['single_stream'][j] = {} cache_index[-1][j] = {} cache_dic['attn_map'][-1]['single_stream'][j] = {} cache_dic['attn_map'][-1]['single_stream'][j]['total'] = {} cache_dic['k-norm'][-1]['single_stream'][j] = {} cache_dic['k-norm'][-1]['single_stream'][j]['total'] = {} cache_dic['v-norm'][-1]['single_stream'][j] = {} cache_dic['v-norm'][-1]['single_stream'][j]['total'] = {} mode = 'ToCa' if mode == 'original': cache_dic['cache_type'] = 'random' # model_kwargs['cache_type'] # no use cache_dic['cache_index'] = cache_index cache_dic['cache'] = cache cache_dic['fresh_ratio_schedule'] = 'ToCa' # 
def cal_type(cache_dic, current):
    '''
    Decide whether the current step is a full computation or a cached one.

    Writes `current['type']` ('full' or 'ToCa') and maintains
    `cache_dic['cache_counter']`. On a full step the counter is reset and
    `force_scheduler` is called.
    '''
    # FORA (fresh_ratio == 0.0) treats only step 0 as a warm-up step, while
    # ToCa treats the first three steps (0, 1, 2) as warm-up.
    uniform = cache_dic['fresh_ratio'] == 0.0
    step = current['step']
    in_warmup = (step == 0) if uniform else (step <= 2)

    # Value is unused; the read is retained so a cache_dic without this key
    # still fails here.
    _force_fresh = cache_dic['force_fresh']

    interval = cache_dic['fresh_threshold'] if in_warmup else cache_dic['cal_threshold']

    # Cached step: not in warm-up and the counter has not reached the interval.
    if not in_warmup and not (cache_dic['cache_counter'] == interval - 1):
        cache_dic['cache_counter'] += 1
        current['type'] = 'ToCa'
        return

    current['type'] = 'full'
    cache_dic['cache_counter'] = 0
    force_scheduler(cache_dic, current)
def force_scheduler(cache_dic, current):
    '''
    Compute the interval between full steps and store it in
    cache_dic['cal_threshold'].

    The interval is `fresh_threshold / step_factor`, rounded, and is stored
    as a tensor. `step_factor` varies linearly with the step index according
    to `linear_step_weight`; both modes currently use a weight of 0.0, so the
    factor is 1.
    '''
    # FORA (fresh_ratio == 0) and TokenCache use the same weight for now.
    linear_step_weight = 0.0 if cache_dic['fresh_ratio'] == 0 else 0.0

    progress_term = 2 * linear_step_weight * current['step'] / current['num_steps']
    step_factor = torch.tensor(1 - linear_step_weight + progress_term)

    # no force constrain for sensitive steps, cause the performance is good enough.
    # you may have a try.
    cache_dic['cal_threshold'] = torch.round(cache_dic['fresh_threshold'] / step_factor)
''' fresh_ratio = cache_dic['fresh_ratio'] fresh_ratio_schedule = cache_dic['fresh_ratio_schedule'] step = current['step'] num_steps = current['num_steps'] threshold = cache_dic['fresh_threshold'] weight = 0.9 if fresh_ratio_schedule == 'constant': return fresh_ratio elif fresh_ratio_schedule == 'linear': return fresh_ratio * (1 + weight - 2 * weight * step / num_steps) elif fresh_ratio_schedule == 'exp': #return 0.5 * (0.052 ** (step/num_steps)) return fresh_ratio * (weight ** (step / num_steps)) elif fresh_ratio_schedule == 'linear-mode': mode = (step % threshold)/threshold - 0.5 mode_weight = 0.1 return fresh_ratio * (1 + weight - 2 * weight * step / num_steps + mode_weight * mode) elif fresh_ratio_schedule == 'layerwise': return fresh_ratio * (1 + weight - 2 * weight * current['layer'] / 27) elif fresh_ratio_schedule == 'linear-layerwise': step_weight = -0.9 #0.9 step_factor = 1 - step_weight + 2 * step_weight * step / num_steps #if current['layer'] == 2: # return 1.0 #sigmoid #sigmoid_weight = 0.13 #layer_factor = 2 * torch.sigmoid(torch.tensor([sigmoid_weight * (13.5 - current['layer'])])) layer_weight = 0.6 layer_factor = 1 + layer_weight - 2 * layer_weight * current['layer'] / 27 module_weight = 1.0 #TokenCache N=8 2.5 N=6 2.5 #N=4 2.1 module_time_weight = 0.6 module_factor = (1 - (1-module_time_weight) * module_weight) if current['module']=='cross-attn' else (1 + module_time_weight * module_weight) return fresh_ratio * layer_factor * step_factor * module_factor elif fresh_ratio_schedule == 'ToCa': step_weight = 0.0 #0.9 step_factor = 1 - step_weight + 2 * step_weight * step / num_steps layer_weight = 0.5 layer_factor = 1 + layer_weight - 2 * layer_weight * current['layer'] / 27 #module_weight = 1.0 #module_time_weight = 0.6 # this means 60*x% cross-attn computation, and 160*x% mlp computation. This is designed for cross-attn has best temporal redundancy, and mlp has worse. # so cross-attn compute less and mlp compute more. 
def global_force_fresh(cache_dic, current):
    '''
    Return whether to force fresh tokens globally.

    cache_dic['force_fresh'] selects the strategy:
      - 'global': force on the first step and on every step that is a
        multiple of cache_dic['cal_threshold'].
      - 'local' / 'none': force on the first step only.

    Raises ValueError for any other strategy.
    '''
    first_step = (current['step'] == 0)
    force_fresh = cache_dic['force_fresh']

    if force_fresh == 'global':
        # `or` short-circuits, so 'cal_threshold' is only read after step 0.
        return first_step or (current['step'] % cache_dic['cal_threshold'] == 0)
    if force_fresh in ('local', 'none'):
        return first_step
    raise ValueError("unrecognized force fresh strategy", force_fresh)
# force_fresh_mask = torch.as_tensor((cache_dic['cache_index'][-1][current['layer']][current['module']] >= 2 * cache_dic['fresh_threshold']), dtype = int) # 2 because the threshold is for step, not module # force_len = force_fresh_mask.sum(dim=1) # force_indices = force_fresh_mask.argsort(dim = -1, descending = True)[:, :force_len.min()] # force_indices = force_indices[:, torch.randperm(force_indices.shape[1])] # Just see more explanation in the version of DiT-ToCa if needed. if cache_dic['cache_type'] == 'random': score = torch.rand(tokens.shape[0], tokens.shape[1], device=tokens.device) elif cache_dic['cache_type'] == 'straight': score = torch.ones(tokens.shape[0], tokens.shape[1]).to(tokens.device) elif cache_dic['cache_type'] == 'attention': # cache_dic['attn_map'][step][layer] (B, N, N), the last dimention has get softmaxed score = attn_score(cache_dic, current) #score = score + 0.0 * torch.rand_like(score, device= score.device) elif cache_dic['cache_type'] == 'similarity': score = similarity_score(cache_dic, current, tokens) elif cache_dic['cache_type'] == 'norm': score = norm_score(cache_dic, current, tokens) elif cache_dic['cache_type'] == 'k-norm': score = k_norm_score(cache_dic, current) elif cache_dic['cache_type'] == 'v-norm': score = v_norm_score(cache_dic, current) elif cache_dic['cache_type'] == 'compress': score1 = torch.rand(int(tokens.shape[0]*0.5), tokens.shape[1]) score1 = torch.cat([score1, score1], dim=0).to(tokens.device) score2 = cache_dic['attn_map'][-1][current['layer']].sum(dim=1)#.mean(dim=0) # (B, N) # normalize score2 = score2 / score2.max(dim=1, keepdim=True)[0] score = 0.5 * score1 + 0.5 * score2 # abandoned the branch, if you want to explore the local force fresh strategy, this may help. 
#if ((not current['is_force_fresh']) and (cache_dic['force_fresh'] == 'local')): # current['is_force_fresh'] is False, cause when it is True, no cut and fresh are needed # #print(torch.ones_like(force_indices, dtype=float, device=force_indices.device).dtype) # score.scatter_(dim=1, index=force_indices, src=torch.ones_like(force_indices, dtype=torch.float32, # device=force_indices.device)) if (True and (cache_dic['force_fresh'] == 'global')): soft_step_score = cache_dic['cache_index'][-1][current['layer']][current['module']].float() / (cache_dic['fresh_threshold']) #soft_layer_score = cache_dic['cache_index']['layer_index'][current['module']].float() / (27) score = score + cache_dic['soft_fresh_weight'] * soft_step_score #+ 0.1 *soft_layer_score return score.to(tokens.device) ================================================ FILE: flux-ToCa/src/flux/modules/cache_functions/scores.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F def attn_score(cache_dic, current): #self_attn_score = 1- cache_dic['attn_map'][-1][current['layer']].diagonal(dim1=1, dim2=2) #self_attn_score = F.normalize(self_attn_score, dim=1, p=2) #attention_score = F.normalize(cache_dic['attn_map'][-1][current['layer']].sum(dim=1), dim=1, p=2) #cross_attn_map = F.threshold(cache_dic['cross_attn_map'][-1][current['layer']],threshold=0.0, value=0.0) #cross_attention_score = F.normalize(cross_attn_map.sum(dim=-1), dim=-1, p=2) # Note: It is important to give a same selection method for cfg and no cfg. # Because the influence of **Cross-Attention** in text-contidional models makes cfg and no cfg a BIG difference. 
def similarity_score(cache_dic, current, tokens):
    '''
    Score tokens by how much they differ from their cached counterparts.

    The score is 1 - cosine similarity between each token and the cached
    tensor for the current layer/module, L2-normalised over the last
    dimension of the score tensor. Larger means "changed more".

    The cache is looked up as cache[-1][stream][layer][module] when
    current['stream'] is set and present in the cache, which is how the
    rest of this package writes it. Without a stream it falls back to
    cache[-1][layer][module].
    '''
    step_cache = cache_dic['cache'][-1]
    stream = current.get('stream')
    if stream is not None and stream in step_cache:
        step_cache = step_cache[stream]
    cached_tokens = step_cache[current['layer']][current['module']]

    cosine_sim = F.cosine_similarity(tokens, cached_tokens, dim=-1)
    return F.normalize(1 - cosine_sim, dim=-1, p=2)
def k_norm_score(cache_dic, current):
    '''
    Return the L2-normalised key-norm score for the current layer.

    For 'double_stream' the entry for current['module'] is used; for
    'single_stream' the 'total' entry is used. The stored tensor is
    normalised over its last dimension.

    Raises ValueError when current['stream'] is neither of the two.
    '''
    stream = current['stream']
    if stream == 'double_stream':
        module = current['module']
    elif stream == 'single_stream':
        module = 'total'
    else:
        # Fail with a clear message instead of an UnboundLocalError on return.
        raise ValueError("unrecognized stream", stream)

    k_norm = cache_dic['k-norm'][-1][stream][current['layer']][module]
    return F.normalize(k_norm, dim=-1, p=2)
def token_merge(cache_dic, tokens, current, fresh_indices, stale_indices):
    '''
    Pair near-duplicate stale tokens with their most similar fresh token.

    An abandoned branch exploring whether token merging helps; it did not,
    at least for the training-free strategy.

    tokens is gathered along dim 1 with fresh_indices / stale_indices.
    Stale tokens whose best cosine similarity to a fresh token exceeds
    0.995 are kept; the number kept is the minimum such count over the
    batch. Results are written to cache_dic['merged_stale_fresh_indices']
    and cache_dic['merged_stale_sequence']; nothing is returned.
    '''
    # Always true for integer layer indices; acts as a per-layer switch.
    if current['layer'] % 1 != 0:
        return

    hidden = tokens.shape[-1]
    fresh_tokens = tokens.gather(1, fresh_indices.unsqueeze(-1).expand(-1, -1, hidden))
    stale_tokens = tokens.gather(1, stale_indices.unsqueeze(-1).expand(-1, -1, hidden))

    # Cosine similarity between every stale token and every fresh token.
    fresh_unit = torch.nn.functional.normalize(fresh_tokens, p=2, dim=-1)
    stale_unit = torch.nn.functional.normalize(stale_tokens, p=2, dim=-1)
    similarity = torch.matmul(stale_unit, fresh_unit.transpose(1, 2))
    best_similarity, best_fresh_slot = torch.max(similarity, dim=2)

    # Keep the same number of merges for every batch element.
    num_merged = int((best_similarity > 0.995).sum(dim=1).min())
    ranking = torch.sort(best_similarity, dim=1, descending=True)[1]
    merged_slots = ranking[:, :num_merged]

    matched_fresh_slots = best_fresh_slot.gather(1, merged_slots)
    cache_dic['merged_stale_fresh_indices'] = fresh_indices.gather(1, matched_fresh_slots)
    cache_dic['merged_stale_sequence'] = stale_indices.gather(1, merged_slots)
class HFEmbedder(nn.Module):
    """Frozen Hugging Face text encoder wrapper.

    A ``version`` string containing ``"openai"`` selects the CLIP tokenizer
    and text model, whose ``pooler_output`` is returned; anything else
    selects T5, whose ``last_hidden_state`` is returned. The encoder is put
    in eval mode with gradients disabled.
    """

    def __init__(self, version: str, max_length: int, **hf_kwargs):
        super().__init__()
        self.is_clip = "openai" in version
        self.max_length = max_length

        if self.is_clip:
            self.output_key = "pooler_output"
            tokenizer_cls, encoder_cls = CLIPTokenizer, CLIPTextModel
        else:
            self.output_key = "last_hidden_state"
            tokenizer_cls, encoder_cls = T5Tokenizer, T5EncoderModel

        self.tokenizer = tokenizer_cls.from_pretrained(version, max_length=max_length)
        encoder = encoder_cls.from_pretrained(version, **hf_kwargs)
        self.hf_module = encoder.eval().requires_grad_(False)

    def forward(self, text: list[str]) -> Tensor:
        """Tokenize ``text`` (padded/truncated to ``max_length``) and encode it."""
        batch_encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            return_length=False,
            return_overflowing_tokens=False,
            padding="max_length",
            return_tensors="pt",
        )
        input_ids = batch_encoding["input_ids"].to(self.hf_module.device)

        # No attention mask is passed: padded positions are encoded as well.
        outputs = self.hf_module(
            input_ids=input_ids,
            attention_mask=None,
            output_hidden_states=False,
        )
        return outputs[self.output_key]
class CannyImageEncoder:
    """Turn a single image into a 3-channel Canny edge map.

    The input is clamped to [-1, 1] and mapped to uint8 before edge
    detection. The edge map is rescaled with ``/ 127.5 - 1.0``, repeated
    over 3 channels and moved to ``device``.
    """

    def __init__(
        self,
        device,
        min_t: int = 50,
        max_t: int = 200,
    ):
        self.device = device
        # Lower / upper hysteresis thresholds passed to cv2.Canny.
        self.min_t = min_t
        self.max_t = max_t

    def __call__(self, img: torch.Tensor) -> torch.Tensor:
        """Return the edge map for ``img`` of shape (1, c, h, w).

        Raises AssertionError when the batch size is not 1.
        """
        assert img.shape[0] == 1, "Only batch size 1 is supported"

        img = rearrange(img[0], "c h w -> h w c")
        img = torch.clamp(img, -1.0, 1.0)

        # Tensor.numpy() only accepts CPU tensors that do not require grad and
        # have a NumPy-compatible dtype, so detach, upcast and move to the CPU
        # first. This is a no-op for float32 CPU inputs.
        img_np = ((img + 1.0) * 127.5).detach().float().cpu().numpy().astype(np.uint8)

        # Apply Canny edge detection
        canny = cv2.Canny(img_np, self.min_t, self.max_t)

        # Convert back to torch tensor and reshape
        canny = torch.from_numpy(canny).float() / 127.5 - 1.0
        canny = rearrange(canny, "h w -> 1 1 h w")
        canny = repeat(canny, "b 1 ... -> b 3 ...")
        return canny.to(self.device)
def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
    """
    Create sinusoidal timestep embeddings.

    :param t: a 1-D Tensor of N indices, one per batch element.
              These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :param time_factor: multiplier applied to t before embedding.
    :return: an (N, D) Tensor of positional embeddings: cosines first, then
             sines, with one zero column appended when dim is odd.
    """
    scaled_t = time_factor * t
    half = dim // 2

    # Geometric frequency ladder from 1 down towards 1 / max_period.
    positions = torch.arange(start=0, end=half, dtype=torch.float32)
    freqs = torch.exp(-math.log(max_period) * positions / half).to(scaled_t.device)

    angles = scaled_t[:, None].float() * freqs[None]
    parts = [torch.cos(angles), torch.sin(angles)]
    embedding = torch.cat(parts, dim=-1)

    if dim % 2:
        # Pad odd dimensions with one zero column.
        zero_column = torch.zeros_like(embedding[:, :1])
        embedding = torch.cat([embedding, zero_column], dim=-1)

    # Match the dtype/device of a floating-point input.
    if torch.is_floating_point(scaled_t):
        return embedding.to(scaled_t)
    return embedding
class Modulation(nn.Module):
    """Project a conditioning vector into shift/scale/gate modulation terms.

    A "double" modulation yields two (shift, scale, gate) triples; a single
    one yields one triple and ``None``.
    """

    def __init__(self, dim: int, double: bool):
        super().__init__()
        self.is_double = double
        if double:
            self.multiplier = 6
        else:
            self.multiplier = 3
        self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)

    def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
        projected = self.lin(nn.functional.silu(vec))
        # Insert a broadcast axis at dim 1, then split into `multiplier` chunks.
        chunks = projected[:, None, :].chunk(self.multiplier, dim=-1)

        first = ModulationOut(*chunks[:3])
        if not self.is_double:
            return first, None
        return first, ModulationOut(*chunks[3:])
nn.GELU(approximate="tanh"), nn.Linear(mlp_hidden_dim, hidden_size, bias=True), ) def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, **kwargs) -> tuple[Tensor, Tensor]: cache_dic = kwargs.get('cache_dic', None) current = kwargs.get('current', None) if cache_dic is None: img_mod1, img_mod2 = self.img_mod(vec) txt_mod1, txt_mod2 = self.txt_mod(vec) # prepare image for attention img_modulated = self.img_norm1(img) img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift img_qkv = self.img_attn.qkv(img_modulated) img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) img_q, img_k = self.img_attn.norm(img_q, img_k, img_v) # prepare txt for attention txt_modulated = self.txt_norm1(txt) txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift txt_qkv = self.txt_attn.qkv(txt_modulated) txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v) # run actual attention q = torch.cat((txt_q, img_q), dim=2) k = torch.cat((txt_k, img_k), dim=2) v = torch.cat((txt_v, img_v), dim=2) attn = attention(q, k, v, pe=pe) txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :] # calculate the img bloks img = img + img_mod1.gate * self.img_attn.proj(img_attn) img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift) # calculate the txt bloks txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn) txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift) else: current['stream'] = 'double_stream' if current['type'] == 'full': img_mod1, img_mod2 = self.img_mod(vec) txt_mod1, txt_mod2 = self.txt_mod(vec) # prepare image for attention img_modulated = self.img_norm1(img) img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift img_qkv = self.img_attn.qkv(img_modulated) img_q, img_k, img_v = rearrange(img_qkv, "B L 
(K H D) -> K B H L D", K=3, H=self.num_heads) if cache_dic['cache_type'] == 'k-norm': img_k_norm = img_k.norm(dim=-1, p=2).mean(dim=1) cache_dic['k-norm'][-1][current['stream']][current['layer']]['img_mlp'] = img_k_norm elif cache_dic['cache_type'] == 'v-norm': img_v_norm = img_v.norm(dim=-1, p=2).mean(dim=1) cache_dic['v-norm'][-1][current['stream']][current['layer']]['img_mlp'] = img_v_norm img_q, img_k = self.img_attn.norm(img_q, img_k, img_v) # prepare txt for attention txt_modulated = self.txt_norm1(txt) txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift txt_qkv = self.txt_attn.qkv(txt_modulated) txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) if cache_dic['cache_type'] == 'k-norm': txt_k_norm = txt_k.norm(dim=-1, p=2).mean(dim=1) cache_dic['k-norm'][-1][current['stream']][current['layer']]['txt_mlp'] = txt_k_norm elif cache_dic['cache_type'] == 'v-norm': txt_v_norm = txt_v.norm(dim=-1, p=2).mean(dim=1) cache_dic['v-norm'][-1][current['stream']][current['layer']]['txt_mlp'] = txt_v_norm txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v) # run actual attention q = torch.cat((txt_q, img_q), dim=2) k = torch.cat((txt_k, img_k), dim=2) v = torch.cat((txt_v, img_v), dim=2) attn = attention(q, k, v, pe=pe, cache_dic=cache_dic, current=current) cache_dic['cache'][-1]['double_stream'][current['layer']]['attn'] = attn txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :] cache_dic['txt_shape'] = txt.shape[1] if cache_dic['cache_type'] == 'attention': cache_dic['attn_map'][-1][current['stream']][current['layer']]['txt_mlp'] = cache_dic['attn_map'][-1][current['stream']][current['layer']]['total'][:, : txt.shape[1]] cache_dic['attn_map'][-1][current['stream']][current['layer']]['img_mlp'] = cache_dic['attn_map'][-1][current['stream']][current['layer']]['total'][:, txt.shape[1] :] current['module'] = 'img_mlp' force_init(cache_dic=cache_dic, current=current, tokens=img) # calculate the img 
bloks img = img + img_mod1.gate * self.img_attn.proj(img_attn) cache_dic['cache'][-1]['double_stream'][current['layer']]['img_mlp'] = self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift) img = img + img_mod2.gate * cache_dic['cache'][-1]['double_stream'][current['layer']]['img_mlp'] current['module'] = 'txt_mlp' force_init(cache_dic=cache_dic, current=current, tokens=txt) # calculate the txt bloks txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn) cache_dic['cache'][-1]['double_stream'][current['layer']]['txt_mlp'] = self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift) txt = txt + txt_mod2.gate * cache_dic['cache'][-1]['double_stream'][current['layer']]['txt_mlp'] elif current['type'] == 'ToCa': img_mod1, img_mod2 = self.img_mod(vec) txt_mod1, txt_mod2 = self.txt_mod(vec) attn = cache_dic['cache'][-1]['double_stream'][current['layer']]['attn'] txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :] current['module'] = 'img_mlp' # calculate the img bloks img = img + img_mod1.gate * self.img_attn.proj(img_attn) fresh_indices, fresh_tokens_img = cache_cutfresh(cache_dic=cache_dic, tokens=img, current=current) fresh_tokens_img = self.img_mlp((1 + img_mod2.scale) * self.img_norm2(fresh_tokens_img) + img_mod2.shift) update_cache(fresh_indices=fresh_indices, fresh_tokens=fresh_tokens_img, cache_dic=cache_dic, current=current) cache_dic['cache'][-1]['double_stream'][current['layer']]['img_mlp'] img = img + img_mod2.gate * cache_dic['cache'][-1]['double_stream'][current['layer']]['img_mlp'] current['module'] = 'txt_mlp' # calculate the txt bloks txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn) fresh_indices, fresh_tokens_txt = cache_cutfresh(cache_dic=cache_dic, tokens=txt, current=current) fresh_tokens_txt = self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(fresh_tokens_txt) + txt_mod2.shift) update_cache(fresh_indices=fresh_indices, fresh_tokens=fresh_tokens_txt, cache_dic=cache_dic, current=current) 
txt = txt + txt_mod2.gate * cache_dic['cache'][-1]['double_stream'][current['layer']]['txt_mlp'] elif current['type'] == 'FORA': img_mod1, img_mod2 = self.img_mod(vec) txt_mod1, txt_mod2 = self.txt_mod(vec) img = img + img_mod2.gate * cache_dic['cache'][-1]['double_stream'][current['layer']]['img_mlp'] txt = txt + txt_mod2.gate * cache_dic['cache'][-1]['double_stream'][current['layer']]['txt_mlp'] elif current['type'] == 'aggressive': current['module'] = 'skipped' else: raise ValueError("Unknown cache type.") return img, txt class SingleStreamBlock(nn.Module): """ A DiT block with parallel linear layers as described in https://arxiv.org/abs/2302.05442 and adapted modulation interface. """ def __init__( self, hidden_size: int, num_heads: int, mlp_ratio: float = 4.0, qk_scale: float | None = None, ): super().__init__() self.hidden_dim = hidden_size self.num_heads = num_heads head_dim = hidden_size // num_heads self.scale = qk_scale or head_dim**-0.5 self.mlp_hidden_dim = int(hidden_size * mlp_ratio) # qkv and mlp_in self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim) # proj and mlp_out self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size) self.norm = QKNorm(head_dim) self.hidden_size = hidden_size self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) self.mlp_act = nn.GELU(approximate="tanh") self.modulation = Modulation(hidden_size, double=False) # mlp_in self.mlp_in = nn.Linear(hidden_size, self.mlp_hidden_dim) def load_mlp_in_weights(self, linear1_weight: torch.Tensor, linear1_bias: Optional[torch.Tensor] = None): """ Split and load the weights of the original `linear1` layer, keeping only the MLP hidden layer part. 
Parameters: - linear1_weight: Tensor, with shape (hidden_size * 3 + mlp_hidden_dim, hidden_size) - linear1_bias: Tensor, with shape (hidden_size * 3 + mlp_hidden_dim,) or None """ hidden_size = self.hidden_size mlp_hidden_dim = self.mlp_hidden_dim device = self.linear1.weight.device # target device self.mlp_in.weight = torch.nn.Parameter(linear1_weight[hidden_size * 3:, :].to(device)) if linear1_bias is not None: self.mlp_in.bias = torch.nn.Parameter(linear1_bias[hidden_size * 3:].to(device)) def forward(self, x: Tensor, vec: Tensor, pe: Tensor, **kwargs) -> Tensor: cache_dic = kwargs.get('cache_dic', None) current = kwargs.get('current', None) mod, _ = self.modulation(vec) if cache_dic is None: x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1) q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) q, k = self.norm(q, k, v) # compute attention attn = attention(q, k, v, pe=pe, cache_dic=cache_dic, current=current) # compute activation in mlp stream, cat again and run second linear layer output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)) else: current['stream'] = 'single_stream' if current['type'] == 'full': #if (current['layer'] == 0): # print(current['step']) x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1) cache_dic['cache'][-1]['single_stream'][current['layer']]['mlp'] = mlp current['module'] = 'attn' q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) if cache_dic['cache_type'] == 'k-norm': cache_dic['k-norm'][-1][current['stream']][current['layer']]['total'] = k.norm(dim=-1, p=2).mean(dim=1) elif cache_dic['cache_type'] == 'v-norm': cache_dic['v-norm'][-1][current['stream']][current['layer']]['total'] = v.norm(dim=-1, p=2).mean(dim=1) q, k = self.norm(q, k, v) # compute attention attn = attention(q, k, 
v, pe=pe, cache_dic=cache_dic, current=current) force_init(cache_dic=cache_dic, current=current, tokens=attn) cache_dic['cache'][-1]['single_stream'][current['layer']]['attn'] = attn # compute activation in mlp stream, cat again and run second linear layer current['module'] = 'mlp' output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)) force_init(cache_dic=cache_dic, current=current, tokens=output) current['module'] = 'total' cache_dic['cache'][-1]['single_stream'][current['layer']]['total'] = output elif current['type'] == 'ToCa': self.load_mlp_in_weights(self.linear1.weight, self.linear1.bias) current['module'] = 'mlp' fresh_indices, fresh_tokens_mlp = cache_cutfresh(cache_dic=cache_dic, tokens=x, current=current) x_mod = (1 + mod.scale) * self.pre_norm(fresh_tokens_mlp) + mod.shift #cache_dic['cache'][-1]['single_stream'][current['layer']]['mlp'] mlp_fresh = self.mlp_in(x_mod) #_, mlp_fresh1 = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1) update_cache(fresh_indices=fresh_indices, fresh_tokens=mlp_fresh, cache_dic=cache_dic, current=current) # compute attention fake_fresh_attn = torch.gather(input = cache_dic['cache'][-1]['single_stream'][current['layer']]['attn'], dim = 1, index = fresh_indices.unsqueeze(-1).expand(-1, -1, cache_dic['cache'][-1]['single_stream'][current['layer']]['attn'].shape[-1])) current['module'] = 'total' fresh_tokens_output = self.linear2(torch.cat((fake_fresh_attn, self.mlp_act(mlp_fresh)), 2)) update_cache(fresh_indices=fresh_indices, fresh_tokens=fresh_tokens_output, cache_dic=cache_dic, current=current) #attn = cache_dic['cache'][-1]['single_stream'][current['layer']]['attn'] #mlp = cache_dic['cache'][-1]['single_stream'][current['layer']]['mlp'] # compute activation in mlp stream, cat again and run second linear layer #output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)) output = cache_dic['cache'][-1]['single_stream'][current['layer']]['total'] elif current['type'] == 'FORA': 
output = cache_dic['cache'][-1]['single_stream'][current['layer']]['total'] elif current['type'] == 'aggressive': current['module'] = 'skipped' if current['layer'] == 37: x = cache_dic['cache'][-1]['aggressive_output'] return x else: raise ValueError("Unknown cache type.") if current['layer'] == 37: cache_dic['cache'][-1]['aggressive_output'] = x return x + mod.gate * output class LastLayer(nn.Module): def __init__(self, hidden_size: int, patch_size: int, out_channels: int): super().__init__() self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True) self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True)) def forward(self, x: Tensor, vec: Tensor) -> Tensor: shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1) x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :] x = self.linear(x) return x ================================================ FILE: flux-ToCa/src/flux/modules/lora.py ================================================ import torch from torch import nn def replace_linear_with_lora( module: nn.Module, max_rank: int, scale: float = 1.0, ) -> None: for name, child in module.named_children(): if isinstance(child, nn.Linear): new_lora = LinearLora( in_features=child.in_features, out_features=child.out_features, bias=child.bias, rank=max_rank, scale=scale, dtype=child.weight.dtype, device=child.weight.device, ) new_lora.weight = child.weight new_lora.bias = child.bias if child.bias is not None else None setattr(module, name, new_lora) else: replace_linear_with_lora( module=child, max_rank=max_rank, scale=scale, ) class LinearLora(nn.Linear): def __init__( self, in_features: int, out_features: int, bias: bool, rank: int, dtype: torch.dtype, device: torch.device, lora_bias: bool = True, scale: float = 1.0, *args, **kwargs, ) -> None: super().__init__( in_features=in_features, 
out_features=out_features, bias=bias is not None, device=device, dtype=dtype, *args, **kwargs, ) assert isinstance(scale, float), "scale must be a float" self.scale = scale self.rank = rank self.lora_bias = lora_bias self.dtype = dtype self.device = device if rank > (new_rank := min(self.out_features, self.in_features)): self.rank = new_rank self.lora_A = nn.Linear( in_features=in_features, out_features=self.rank, bias=False, dtype=dtype, device=device, ) self.lora_B = nn.Linear( in_features=self.rank, out_features=out_features, bias=self.lora_bias, dtype=dtype, device=device, ) def set_scale(self, scale: float) -> None: assert isinstance(scale, float), "scalar value must be a float" self.scale = scale def forward(self, input: torch.Tensor) -> torch.Tensor: base_out = super().forward(input) _lora_out_B = self.lora_B(self.lora_A(input)) lora_update = _lora_out_B * self.scale return base_out + lora_update ================================================ FILE: flux-ToCa/src/flux/sampling.py ================================================ import math from typing import Callable import numpy as np import torch from einops import rearrange, repeat from PIL import Image from torch import Tensor from .model import Flux from .modules.autoencoder import AutoEncoder from .modules.conditioner import HFEmbedder from .modules.image_embedders import CannyImageEncoder, DepthImageEncoder, ReduxImageEncoder from .modules.cache_functions import cache_init def get_noise( num_samples: int, height: int, width: int, device: torch.device, dtype: torch.dtype, seed: int, ): return torch.randn( num_samples, 16, # allow for packing 2 * math.ceil(height / 16), 2 * math.ceil(width / 16), device=device, dtype=dtype, generator=torch.Generator(device=device).manual_seed(seed), ) def prepare(t5: HFEmbedder, clip: HFEmbedder, img: Tensor, prompt: str | list[str]) -> dict[str, Tensor]: bs, c, h, w = img.shape if bs == 1 and not isinstance(prompt, str): bs = len(prompt) img = rearrange(img, "b c (h 
def prepare_control(
    t5: HFEmbedder,
    clip: HFEmbedder,
    img: Tensor,
    prompt: str | list[str],
    ae: AutoEncoder,
    encoder: DepthImageEncoder | CannyImageEncoder,
    img_cond_path: str,
) -> dict[str, Tensor]:
    """Build model inputs for structure-conditioned sampling.

    Loads the conditioning image from ``img_cond_path``, resizes it to 8x the
    height/width of ``img`` (presumably the autoencoder's downsampling factor
    -- confirm against the AE config), runs it through ``encoder`` and
    ``ae.encode``, packs it into 2x2 patches and stores it under
    ``"img_cond"`` in the dict returned by ``prepare``.

    Args:
        t5: Text embedder forwarded to ``prepare``.
        clip: Text embedder forwarded to ``prepare``.
        img: 4-D noise tensor; only its shape is read here.
        prompt: One prompt or a list of prompts. A list with a single-sample
            ``img`` sets the effective batch size to ``len(prompt)``.
        ae: Autoencoder whose ``encode`` produces the conditioning latent.
        encoder: Depth or Canny preprocessor applied before ``ae.encode``.
        img_cond_path: Path of the conditioning image.

    Returns:
        The ``prepare`` dict with an extra ``"img_cond"`` entry (bfloat16).
    """
    batch, _, latent_h, latent_w = img.shape
    if not isinstance(prompt, str) and batch == 1:
        batch = len(prompt)

    # Load the conditioning image at the pixel resolution matching the latent.
    target_size = (latent_w * 8, latent_h * 8)
    pil_cond = Image.open(img_cond_path).convert("RGB").resize(target_size, Image.LANCZOS)

    # uint8 [0, 255] -> float [-1, 1], then HWC -> 1CHW.
    pixels = torch.from_numpy(np.array(pil_cond)).float() / 127.5 - 1.0
    cond = rearrange(pixels, "h w c -> 1 c h w")

    with torch.no_grad():
        cond = ae.encode(encoder(cond))

    # Pack 2x2 spatial patches into the channel axis, as done for `img`.
    cond = rearrange(
        cond.to(torch.bfloat16),
        "b c (h ph) (w pw) -> b (h w) (c ph pw)",
        ph=2,
        pw=2,
    )
    if batch > 1 and cond.shape[0] == 1:
        cond = repeat(cond, "1 ... -> bs ...", bs=batch)

    result = prepare(t5, clip, img, prompt)
    result["img_cond"] = cond
    return result
-> bs ...", bs=bs) img_cond = torch.cat((img_cond, mask), dim=-1) return_dict = prepare(t5, clip, img, prompt) return_dict["img_cond"] = img_cond.to(img.device) return return_dict def prepare_redux( t5: HFEmbedder, clip: HFEmbedder, img: Tensor, prompt: str | list[str], encoder: ReduxImageEncoder, img_cond_path: str, ) -> dict[str, Tensor]: bs, _, h, w = img.shape if bs == 1 and not isinstance(prompt, str): bs = len(prompt) img_cond = Image.open(img_cond_path).convert("RGB") with torch.no_grad(): img_cond = encoder(img_cond) img_cond = img_cond.to(torch.bfloat16) if img_cond.shape[0] == 1 and bs > 1: img_cond = repeat(img_cond, "1 ... -> bs ...", bs=bs) img = rearrange(img, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2) if img.shape[0] == 1 and bs > 1: img = repeat(img, "1 ... -> bs ...", bs=bs) img_ids = torch.zeros(h // 2, w // 2, 3) img_ids[..., 1] = img_ids[..., 1] + torch.arange(h // 2)[:, None] img_ids[..., 2] = img_ids[..., 2] + torch.arange(w // 2)[None, :] img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs) if isinstance(prompt, str): prompt = [prompt] txt = t5(prompt) txt = torch.cat((txt, img_cond.to(txt)), dim=-2) if txt.shape[0] == 1 and bs > 1: txt = repeat(txt, "1 ... -> bs ...", bs=bs) txt_ids = torch.zeros(bs, txt.shape[1], 3) vec = clip(prompt) if vec.shape[0] == 1 and bs > 1: vec = repeat(vec, "1 ... 
-> bs ...", bs=bs) return { "img": img, "img_ids": img_ids.to(img.device), "txt": txt.to(img.device), "txt_ids": txt_ids.to(img.device), "vec": vec.to(img.device), } def time_shift(mu: float, sigma: float, t: Tensor): return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma) def get_lin_function( x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15 ) -> Callable[[float], float]: m = (y2 - y1) / (x2 - x1) b = y1 - m * x1 return lambda x: m * x + b def get_schedule( num_steps: int, image_seq_len: int, base_shift: float = 0.5, max_shift: float = 1.15, shift: bool = True, ) -> list[float]: # extra step for zero timesteps = torch.linspace(1, 0, num_steps + 1) # shifting the schedule to favor high timesteps for higher signal images if shift: # estimate mu based on linear estimation between two points mu = get_lin_function(y1=base_shift, y2=max_shift)(image_seq_len) timesteps = time_shift(mu, 1.0, timesteps) return timesteps.tolist() def denoise( model: Flux, # model input img: Tensor, img_ids: Tensor, txt: Tensor, txt_ids: Tensor, vec: Tensor, # sampling parameters timesteps: list[float], guidance: float = 4.0, # extra img tokens img_cond: Tensor | None = None, ): # this is ignored for schnell guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype) for t_curr, t_prev in zip(timesteps[:-1], timesteps[1:]): t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device) pred = model( img=torch.cat((img, img_cond), dim=-1) if img_cond is not None else img, #img_ids=img_ids[1] if small else img_ids[0], img_ids=img_ids[0], txt=txt, txt_ids=txt_ids, y=vec, timesteps=t_vec, guidance=guidance_vec, ) img = img + (t_prev - t_curr) * pred return img def unpack(x: Tensor, height: int, width: int) -> Tensor: return rearrange( x, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=math.ceil(height / 16), w=math.ceil(width / 16), ph=2, pw=2, ) 
#################################################################################################### from calflops import calculate_flops def denoise_test_FLOPs( model: Flux, # model input img: Tensor, img_ids: Tensor, txt: Tensor, txt_ids: Tensor, vec: Tensor, # sampling parameters timesteps: list[float], guidance: float = 4.0, ): # init cache cache_dic, current = cache_init(timesteps) # this is ignored for schnell guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype) current['step']=0 current['num_steps'] = len(timesteps)-1 total_flops = 0 for t_curr, t_prev in zip(timesteps[:-1], timesteps[1:]): t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device) inputs=dict( img=img, img_ids=img_ids, txt=txt, txt_ids=txt_ids, y=vec, timesteps=t_vec, cache_dic = cache_dic, current = current, guidance=guidance_vec, ) flops, macs, params = calculate_flops(model=model, kwargs = inputs, print_results=False) total_flops += convert_flops(flops) current['step'] += 1 print(f"Total {total_flops * 10 **(-12)} TFLOPs." 
def convert_flops(flops_str):
    """Convert a FLOPS string such as ``'12.34 GFLOPS'`` into a plain number.

    Args:
        flops_str: A string made of a decimal number followed by an optional
            SI prefix (K, M, G, T or P, case-insensitive) and ``FLOPS``.
            Numeric input is returned unchanged as a ``float``, so the helper
            also works when the profiler is asked for raw numbers.

    Returns:
        The number of floating-point operations as a ``float``.

    Raises:
        ValueError: If the string cannot be parsed or uses an unknown prefix.
    """
    if isinstance(flops_str, (int, float)):
        return float(flops_str)

    # Power of ten for each supported prefix; "" is a bare "FLOPS".
    exponents = {"": 0, "K": 3, "M": 6, "G": 9, "T": 12, "P": 15}

    # Match the number and the unit (optional one-letter prefix + "FLOPS").
    match = re.match(r"([\d.]+)\s*(([A-Z]?)FLOPS)", flops_str.strip(), re.IGNORECASE)
    if not match:
        raise ValueError(f"无法解析 FLOPS 字符串: {flops_str}")

    value = float(match.group(1))
    unit = match.group(2).upper()
    prefix = match.group(3).upper()

    # Previously only G and T were handled, so "MFLOPS"/"KFLOPS" failed to
    # parse and a bare "FLOPS" was rejected as an unknown unit.
    if prefix not in exponents:
        raise ValueError(f"未知的 FLOPS 单位: {unit}")
    return value * 10 ** exponents[prefix]
params: FluxParams ae_params: AutoEncoderParams ckpt_path: str | None lora_path: str | None ae_path: str | None repo_id: str | None repo_flow: str | None repo_ae: str | None configs = { "flux-dev": ModelSpec( repo_id="black-forest-labs/FLUX.1-dev", repo_flow="flux1-dev.safetensors", repo_ae="ae.safetensors", ckpt_path=os.getenv("FLUX_DEV"), lora_path=None, params=FluxParams( in_channels=64, out_channels=64, vec_in_dim=768, context_in_dim=4096, hidden_size=3072, mlp_ratio=4.0, num_heads=24, depth=19, depth_single_blocks=38, axes_dim=[16, 56, 56], theta=10_000, qkv_bias=True, guidance_embed=True, ), ae_path=os.getenv("AE"), ae_params=AutoEncoderParams( resolution=256, in_channels=3, ch=128, out_ch=3, ch_mult=[1, 2, 4, 4], num_res_blocks=2, z_channels=16, scale_factor=0.3611, shift_factor=0.1159, ), ), "flux-schnell": ModelSpec( repo_id="black-forest-labs/FLUX.1-schnell", repo_flow="flux1-schnell.safetensors", repo_ae="ae.safetensors", ckpt_path=os.getenv("FLUX_SCHNELL"), lora_path=None, params=FluxParams( in_channels=64, out_channels=64, vec_in_dim=768, context_in_dim=4096, hidden_size=3072, mlp_ratio=4.0, num_heads=24, depth=19, depth_single_blocks=38, axes_dim=[16, 56, 56], theta=10_000, qkv_bias=True, guidance_embed=False, ), ae_path=os.getenv("AE"), ae_params=AutoEncoderParams( resolution=256, in_channels=3, ch=128, out_ch=3, ch_mult=[1, 2, 4, 4], num_res_blocks=2, z_channels=16, scale_factor=0.3611, shift_factor=0.1159, ), ), "flux-dev-canny": ModelSpec( repo_id="black-forest-labs/FLUX.1-Canny-dev", repo_flow="flux1-canny-dev.safetensors", repo_ae="ae.safetensors", ckpt_path=os.getenv("FLUX_DEV_CANNY"), lora_path=None, params=FluxParams( in_channels=128, out_channels=64, vec_in_dim=768, context_in_dim=4096, hidden_size=3072, mlp_ratio=4.0, num_heads=24, depth=19, depth_single_blocks=38, axes_dim=[16, 56, 56], theta=10_000, qkv_bias=True, guidance_embed=True, ), ae_path=os.getenv("AE"), ae_params=AutoEncoderParams( resolution=256, in_channels=3, ch=128, out_ch=3, 
ch_mult=[1, 2, 4, 4], num_res_blocks=2, z_channels=16, scale_factor=0.3611, shift_factor=0.1159, ), ), "flux-dev-canny-lora": ModelSpec( repo_id="black-forest-labs/FLUX.1-dev", repo_flow="flux1-dev.safetensors", repo_ae="ae.safetensors", ckpt_path=os.getenv("FLUX_DEV"), lora_path=os.getenv("FLUX_DEV_CANNY_LORA"), params=FluxParams( in_channels=128, out_channels=64, vec_in_dim=768, context_in_dim=4096, hidden_size=3072, mlp_ratio=4.0, num_heads=24, depth=19, depth_single_blocks=38, axes_dim=[16, 56, 56], theta=10_000, qkv_bias=True, guidance_embed=True, ), ae_path=os.getenv("AE"), ae_params=AutoEncoderParams( resolution=256, in_channels=3, ch=128, out_ch=3, ch_mult=[1, 2, 4, 4], num_res_blocks=2, z_channels=16, scale_factor=0.3611, shift_factor=0.1159, ), ), "flux-dev-depth": ModelSpec( repo_id="black-forest-labs/FLUX.1-Depth-dev", repo_flow="flux1-depth-dev.safetensors", repo_ae="ae.safetensors", ckpt_path=os.getenv("FLUX_DEV_DEPTH"), lora_path=None, params=FluxParams( in_channels=128, out_channels=64, vec_in_dim=768, context_in_dim=4096, hidden_size=3072, mlp_ratio=4.0, num_heads=24, depth=19, depth_single_blocks=38, axes_dim=[16, 56, 56], theta=10_000, qkv_bias=True, guidance_embed=True, ), ae_path=os.getenv("AE"), ae_params=AutoEncoderParams( resolution=256, in_channels=3, ch=128, out_ch=3, ch_mult=[1, 2, 4, 4], num_res_blocks=2, z_channels=16, scale_factor=0.3611, shift_factor=0.1159, ), ), "flux-dev-depth-lora": ModelSpec( repo_id="black-forest-labs/FLUX.1-dev", repo_flow="flux1-dev.safetensors", repo_ae="ae.safetensors", ckpt_path=os.getenv("FLUX_DEV"), lora_path=os.getenv("FLUX_DEV_DEPTH_LORA"), params=FluxParams( in_channels=128, out_channels=64, vec_in_dim=768, context_in_dim=4096, hidden_size=3072, mlp_ratio=4.0, num_heads=24, depth=19, depth_single_blocks=38, axes_dim=[16, 56, 56], theta=10_000, qkv_bias=True, guidance_embed=True, ), ae_path=os.getenv("AE"), ae_params=AutoEncoderParams( resolution=256, in_channels=3, ch=128, out_ch=3, ch_mult=[1, 2, 4, 
def print_load_warning(missing: list[str], unexpected: list[str]) -> None:
    """Print the missing and unexpected keys reported by a state-dict load.

    Each non-empty list is printed as a header line followed by one
    tab-indented key per line. When both lists are non-empty the two reports
    are separated by a 79-dash rule. Nothing is printed when both are empty.
    """
    reports = []
    if len(missing) > 0:
        reports.append(f"Got {len(missing)} missing keys:\n\t" + "\n\t".join(missing))
    if len(unexpected) > 0:
        reports.append(f"Got {len(unexpected)} unexpected keys:\n\t" + "\n\t".join(unexpected))

    for position, report in enumerate(reports):
        if position > 0:
            # Visual separator between the two reports.
            print("\n" + "-" * 79 + "\n")
        print(report)
def optionally_expand_state_dict(model: torch.nn.Module, state_dict: dict) -> dict:
    """Zero-pad checkpoint tensors whose shape differs from the model's.

    For every model parameter that is also present in ``state_dict`` with a
    different shape, a zero tensor shaped like the parameter is created on the
    checkpoint tensor's device and the checkpoint values are copied into its
    leading corner. ``state_dict`` is modified in place and returned.
    """
    for param_name, param in model.named_parameters():
        if param_name not in state_dict:
            continue
        loaded = state_dict[param_name]
        if loaded.shape == param.shape:
            continue

        print(
            f"Expanding '{param_name}' with shape {loaded.shape} to model parameter with shape {param.shape}."
        )
        # Start from zeros and overwrite the region covered by the checkpoint.
        padded = torch.zeros_like(param, device=loaded.device)
        corner = tuple(slice(0, extent) for extent in loaded.shape)
        padded[corner] = loaded
        state_dict[param_name] = padded

    return state_dict
n = image.shape[0] image_np = rearrange((255 * image).detach().cpu(), "n b c h w -> (n b) h w c").numpy()[:, :, :, ::-1] # torch (b, c, h, w) in [0, 1] -> numpy (b, h, w, c) [0, 255] # watermarking libary expects input as cv2 BGR format for k in range(image_np.shape[0]): image_np[k] = self.encoder.encode(image_np[k], "dwtDct") image = torch.from_numpy(rearrange(image_np[:, :, :, ::-1], "(n b) h w c -> n b c h w", n=n)).to( image.device ) image = torch.clamp(image / 255, min=0.0, max=1.0) if squeeze: image = image[0] image = 2 * image - 1 return image # A fixed 48-bit message that was chosen at random WATERMARK_MESSAGE = 0b001010101111111010000111100111001111010100101110 # bin(x)[2:] gives bits of x as str, use int to convert them to 0/1 WATERMARK_BITS = [int(bit) for bit in bin(WATERMARK_MESSAGE)[2:]] embed_watermark = WatermarkEmbedder(WATERMARK_BITS) ================================================ FILE: flux-ToCa/src/geneval_flux.py ================================================ import argparse import json import os import torch import numpy as np from PIL import Image, ExifTags from tqdm import tqdm, trange from einops import rearrange from torchvision.utils import make_grid from torchvision.transforms import ToTensor # --- Imports related to FLUX module --- from flux.sampling import ( denoise_test_FLOPs, get_noise, get_schedule, prepare, unpack, ) from flux.ideas import denoise_cache from flux.util import ( embed_watermark, load_ae, load_clip, load_flow_model, load_t5, ) from transformers import pipeline # NSFW threshold (adjustable as needed) NSFW_THRESHOLD = 0.85 def parse_args(): parser = argparse.ArgumentParser(description="Generate images using the FLUX model within the Geneval framework") # Required: input JSONL metadata file, each line must contain at least the "prompt" key parser.add_argument( "metadata_file", type=str, help="JSONL file containing metadata for each prompt, each line is a JSON object" ) # FLUX model related parameters 
parser.add_argument( "--model_name", type=str, default="flux-schnell", choices=["flux-dev", "flux-schnell"], help="FLUX model name" ) parser.add_argument( "--n_samples", type=int, default=1, help="Number of images to generate per prompt" ) parser.add_argument( "--steps", type=int, default=None, help="Number of sampling steps (if not specified: 4 for flux-schnell, 50 for flux-dev)" ) parser.add_argument( "--width", type=int, default=1360, help="Width of the generated image (pixels)" ) parser.add_argument( "--height", type=int, default=768, help="Height of the generated image (pixels)" ) parser.add_argument( "--guidance", type=float, default=3.5, help="Conditional guidance scale" ) parser.add_argument( "--seed", type=int, default=42, help="Random seed" ) parser.add_argument( "--batch_size", type=int, default=1, help="Number of samples per batch during image generation" ) # Output related parameters parser.add_argument( "--output_dir", type=str, default="outputs", help="Output directory to save the generated results" ) parser.add_argument( "--skip_grid", action="store_true", help="Skip saving the overall grid image" ) # Other options parser.add_argument( "--add_sampling_metadata", action="store_true", help="Add the prompt text to the metadata of the generated images" ) parser.add_argument( "--use_nsfw_filter", action="store_true", help="Enable NSFW content filtering (requires downloading the relevant model)" ) parser.add_argument( "--test_FLOPs", action="store_true", help="Test inference FLOPs only (no images will be generated)" ) return parser.parse_args() def main(args): # Read the metadata file, each line is a JSON object (must contain at least the "prompt" field) with open(args.metadata_file, "r", encoding="utf-8") as fp: metadatas = [json.loads(line) for line in fp if line.strip()] # Set device device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # If NSFW filtering is enabled, load the corresponding classifier (please modify the model path or 
name accordingly) if args.use_nsfw_filter: nsfw_classifier = pipeline( "image-classification", model="/path/to/your/nsfw_model", # Please replace with the actual NSFW model path device=0 if torch.cuda.is_available() else -1 ) else: nsfw_classifier = None # If sampling steps are not specified, set default steps based on the model name if args.steps is None: args.steps = 4 if args.model_name == "flux-schnell" else 50 # Ensure the image width and height are multiples of 16 (required by FLUX) args.width = 16 * (args.width // 16) args.height = 16 * (args.height // 16) # Load FLUX model components onto the device (T5, CLIP, Flow model, autoencoder) t5 = load_t5(device, max_length=256 if args.model_name == "flux-schnell" else 512) clip = load_clip(device) model = load_flow_model(args.model_name, device=device) ae = load_ae(args.model_name, device=device) # Generate results for each prompt: # Each prompt corresponds to a subfolder (e.g., outputs/00000/), inside which samples and (optionally) a grid image grid.png are saved, # along with the prompt's metadata saved in a metadata.jsonl file. 
for idx, metadata in enumerate(metadatas): prompt = metadata.get("prompt", "") print(f"Processing prompt {idx + 1}/{len(metadatas)}: '{prompt}'") # Define output directory and samples directory outpath = os.path.join(args.output_dir, f"{idx:05d}") sample_path = os.path.join(outpath, "samples") # If the output directory already exists, check the number of PNG files already in the samples folder existing_samples = [] sample_count = 0 if os.path.exists(sample_path): files = sorted( fname for fname in os.listdir(sample_path) if fname.endswith(".png") and fname != "grid.png" ) sample_count = len(files) # Load existing images (to be used later for generating the grid image) for fname in files: full_path = os.path.join(sample_path, fname) try: img = Image.open(full_path).convert("RGB") existing_samples.append(ToTensor()(img)) except Exception as e: print(f"Failed to read existing image {full_path}: {e}") # If the number of generated images is sufficient, skip generation if sample_count >= args.n_samples: print(f"Samples for prompt {idx + 1} already exist ({sample_count} images), skipping generation.") continue # Create output directory and samples subdirectory os.makedirs(outpath, exist_ok=True) os.makedirs(sample_path, exist_ok=True) # Save the current prompt's metadata to metadata.jsonl with open(os.path.join(outpath, "metadata.jsonl"), "w", encoding="utf-8") as fp: json.dump(metadata, fp) # Initialize: use the number of existing images as the starting count, and copy existing samples for later grid generation local_index = sample_count all_samples = existing_samples.copy() # The initial value of the progress bar is the number of existing samples pbar = tqdm(total=args.n_samples, initial=sample_count, desc="Sampling") # For the current prompt, only generate the missing images while local_index < args.n_samples: current_bs = min(args.batch_size, args.n_samples - local_index) # Set seed for the current batch (using the number of images already present in the prompt as 
offset) seed = args.seed + local_index # Generate random noise x = get_noise(current_bs, args.height, args.width, device=device, dtype=torch.bfloat16, seed=seed) prompt_list = [prompt] * current_bs # Prepare input (prompt encoding, initial image noise, etc.) inp = prepare(t5, clip, x, prompt=prompt_list) # Compute denoising schedule based on the input shape (note: the second parameter is the number of latent channels) timesteps = get_schedule(args.steps, inp["img"].shape[1], shift=(args.model_name != "flux-schnell")) with torch.no_grad(): if args.test_FLOPs: latent = denoise_test_FLOPs(model, **inp, timesteps=timesteps, guidance=args.guidance) else: latent = denoise_cache(model, **inp, timesteps=timesteps, guidance=args.guidance) # Unpack latent to a shape suitable for the decoder input latent = unpack(latent.float(), args.height, args.width) # Decode to image with automatic mixed precision with torch.autocast(device_type=device.type, dtype=torch.bfloat16): decoded = ae.decode(latent) # Post-processing: clamp, embed watermark, and rearrange to [B, H, W, C] format decoded = decoded.clamp(-1, 1) decoded = embed_watermark(decoded.float()) images_tensor = rearrange(decoded, "b c h w -> b h w c") # Iterate over each generated image in the current batch for i in range(current_bs): img_array = (127.5 * (images_tensor[i] + 1.0)).cpu().numpy().astype(np.uint8) img = Image.fromarray(img_array) # NSFW filtering (if enabled) if nsfw_classifier is not None: nsfw_result = nsfw_classifier(img) nsfw_score = next((res["score"] for res in nsfw_result if res["label"] == "nsfw"), 0.0) else: nsfw_score = 0.0 if nsfw_score < NSFW_THRESHOLD: # Add sampling metadata (EXIF info); note: PNG format may not fully support EXIF if args.add_sampling_metadata: exif_data = Image.Exif() exif_data[ExifTags.Base.Software] = "AI generated;txt2img;flux" exif_data[ExifTags.Base.Make] = "Black Forest Labs" exif_data[ExifTags.Base.Model] = args.model_name exif_data[ExifTags.Base.ImageDescription] = prompt 
@dataclass
class SamplingOptions:
    """Settings consumed by ``main`` for one sampling run.

    ``main`` rounds ``width`` and ``height`` down to multiples of 16 and
    replaces a ``num_steps`` of ``None`` with 4 for ``flux-schnell`` or 50
    otherwise; a ``seed`` of ``None`` makes it draw a random base seed.
    """

    prompts: list[str]  # List of prompts
    width: int  # Image width
    height: int  # Image height
    num_steps: int | None  # Number of sampling steps (None selects the per-model default)
    guidance: float  # Guidance value
    seed: int | None  # Random seed
    num_images_per_prompt: int  # Number of images generated per prompt
    batch_size: int  # Batch size (number of prompts per batch)
    model_name: str  # Model name
    output_dir: str  # Output directory
    add_sampling_metadata: bool  # Whether to add metadata
    use_nsfw_filter: bool  # Whether to enable NSFW filter
    test_FLOPs: bool  # Whether in FLOPs testing mode (in which case no images are generated)
// opts.batch_size for batch_idx in range(num_prompt_batches): prompt_start = batch_idx * opts.batch_size prompt_end = min(prompt_start + opts.batch_size, len(prompts)) batch_prompts = prompts[prompt_start:prompt_end] num_prompts_in_batch = len(batch_prompts) # For each prompt, generate the corresponding number of images for image_idx in range(opts.num_images_per_prompt): # Prepare random seed seed = base_seed + idx # Set a different seed for each image idx += num_prompts_in_batch # Update image index # Prepare input batch_size = num_prompts_in_batch x = get_noise( batch_size, opts.height, opts.width, device=torch_device, dtype=torch.bfloat16, seed=seed, ) # Prepare prompts # batch_prompts is a list containing the prompts for the current batch inp = prepare(t5, clip, x, prompt=batch_prompts) timesteps = get_schedule(opts.num_steps, inp["img"].shape[1], shift=(model_name != "flux-schnell")) # Denoise with torch.no_grad(): if opts.test_FLOPs: x = denoise_test_FLOPs(model, **inp, timesteps=timesteps, guidance=opts.guidance) else: x = denoise_cache(model, **inp, timesteps=timesteps, guidance=opts.guidance) # Decode latent variables x = unpack(x.float(), opts.height, opts.width) with torch.autocast(device_type=torch_device.type, dtype=torch.bfloat16): x = ae.decode(x) # Convert to PIL format and save x = x.clamp(-1, 1) x = embed_watermark(x.float()) x = rearrange(x, "b c h w -> b h w c") for i in range(batch_size): img_array = x[i] img = Image.fromarray((127.5 * (img_array + 1.0)).cpu().byte().numpy()) # Optional NSFW filtering if opts.use_nsfw_filter: nsfw_result = nsfw_classifier(img) nsfw_score = next((res["score"] for res in nsfw_result if res["label"] == "nsfw"), 0.0) else: nsfw_score = 0.0 # If filter is not enabled, consider safe if nsfw_score < NSFW_THRESHOLD: exif_data = Image.Exif() exif_data[ExifTags.Base.Software] = "AI generated;txt2img;flux" exif_data[ExifTags.Base.Make] = "Black Forest Labs" exif_data[ExifTags.Base.Model] = model_name if 
def read_prompts(prompt_file: str) -> list:
    """Read prompts from a text file, one prompt per line.

    Each line is stripped of surrounding whitespace and lines that are empty
    after stripping are dropped.

    Args:
        prompt_file: Path to a UTF-8 encoded text file.

    Returns:
        The non-empty, stripped lines in file order.
    """
    # 'utf-8-sig' decodes plain UTF-8 identically but also swallows a leading
    # byte-order mark, which str.strip() would otherwise leave glued to the
    # first prompt.
    with open(prompt_file, 'r', encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        # Materialise inside the with-block: the generator reads lazily.
        return [text for text in stripped_lines if text]
def app():
    """Command-line entry point.

    Parses the sampling arguments, reads the prompt file, packs everything into
    a ``SamplingOptions`` instance and hands it to ``main``.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Generate images using the flux model.")

    # Options that take a value: (flag, value type, remaining keyword arguments).
    valued_options = (
        ('--prompt_file', str, {'required': True, 'help': 'Path to the prompt text file.'}),
        ('--width', int, {'default': 1360, 'help': 'Width of the generated image.'}),
        ('--height', int, {'default': 768, 'help': 'Height of the generated image.'}),
        ('--num_steps', int, {'default': None, 'help': 'Number of sampling steps.'}),
        ('--guidance', float, {'default': 3.5, 'help': 'Guidance value.'}),
        ('--seed', int, {'default': 0, 'help': 'Random seed.'}),
        ('--num_images_per_prompt', int,
         {'default': 1, 'help': 'Number of images generated per prompt.'}),
        ('--batch_size', int,
         {'default': 1, 'help': 'Batch size (number of prompts per batch).'}),
        ('--model_name', str,
         {'default': 'flux-schnell', 'choices': ['flux-dev', 'flux-schnell'], 'help': 'Model name.'}),
        ('--output_dir', str,
         {'default': '/root/autodl-tmp/samples', 'help': 'Directory to save images.'}),
    )
    for flag, value_type, extra in valued_options:
        cli.add_argument(flag, type=value_type, **extra)

    # Boolean switches: (flag, help text).
    switches = (
        ('--add_sampling_metadata', 'Whether to add prompts to image metadata.'),
        ('--use_nsfw_filter', 'Enable NSFW filter.'),
        ('--test_FLOPs', 'Test inference FLOPs.'),
    )
    for flag, description in switches:
        cli.add_argument(flag, action='store_true', help=description)

    namespace = cli.parse_args()

    # Every parsed argument except the prompt file path maps one-to-one onto a
    # SamplingOptions field of the same name.
    settings = dict(vars(namespace))
    prompt_path = settings.pop('prompt_file')
    loaded_prompts = read_prompts(prompt_path)

    main(SamplingOptions(prompts=loaded_prompts, **settings))


if __name__ == '__main__':
    app()