Repository: yumingj/Talk-to-Edit Branch: main Commit: 72c45e109006 Files: 120 Total size: 516.1 KB Directory structure: gitextract_av2x8cy4/ ├── .gitignore ├── README.md ├── configs/ │ ├── attributes_5.json │ ├── editing/ │ │ ├── editing_with_dialog.yml │ │ └── editing_wo_dialog.yml │ └── train/ │ ├── field_1024_bangs.yml │ ├── field_1024_beard.yml │ ├── field_1024_eyeglasses.yml │ ├── field_1024_smiling.yml │ ├── field_1024_young.yml │ ├── field_128_bangs.yml │ ├── field_128_beard.yml │ ├── field_128_eyeglasses.yml │ ├── field_128_smiling.yml │ └── field_128_young.yml ├── data/ │ ├── __init__.py │ └── latent_code_dataset.py ├── editing_quantitative.py ├── editing_with_dialog.py ├── editing_wo_dialog.py ├── environment.yml ├── language/ │ ├── accuracy.py │ ├── build_vocab.py │ ├── dataset.py │ ├── generate_feedback.py │ ├── generate_training_request.py │ ├── language_utils.py │ ├── lstm.py │ ├── preprocess_request.py │ ├── run_encoder.py │ ├── templates/ │ │ ├── attr_wise_caption_templates.json │ │ ├── feedback.json │ │ ├── gender.json │ │ ├── metadata_fsm.json │ │ ├── overall_caption_templates.json │ │ ├── pool.json │ │ ├── system_mode.json │ │ ├── user_fsm.json │ │ ├── user_old_templates.json │ │ └── vocab.json │ ├── train_encoder.py │ └── utils/ │ ├── __init__.py │ ├── eval.py │ ├── logger.py │ ├── lr_schedule.py │ ├── misc.py │ ├── numerical.py │ ├── progress/ │ │ ├── .gitignore │ │ ├── LICENSE │ │ ├── MANIFEST.in │ │ ├── README.rst │ │ ├── progress/ │ │ │ ├── __init__.py │ │ │ ├── bar.py │ │ │ ├── counter.py │ │ │ ├── helpers.py │ │ │ └── spinner.py │ │ ├── setup.py │ │ └── test_progress.py │ ├── setup_logger.py │ └── visualize.py ├── models/ │ ├── __init__.py │ ├── archs/ │ │ ├── __init__.py │ │ ├── attribute_predictor_arch.py │ │ ├── field_function_arch.py │ │ └── stylegan2/ │ │ ├── .gitignore │ │ ├── LICENSE │ │ ├── LICENSE-FID │ │ ├── LICENSE-LPIPS │ │ ├── LICENSE-NVIDIA │ │ ├── __init__.py │ │ ├── apply_factor.py │ │ ├── calc_inception.py │ │ ├── 
checkpoint/ │ │ │ └── .gitignore │ │ ├── convert_weight.py │ │ ├── dataset.py │ │ ├── distributed.py │ │ ├── fid.py │ │ ├── generate.py │ │ ├── inception.py │ │ ├── inversion.py │ │ ├── lpips/ │ │ │ ├── __init__.py │ │ │ ├── base_model.py │ │ │ ├── dist_model.py │ │ │ ├── networks_basic.py │ │ │ ├── pretrained_networks.py │ │ │ └── weights/ │ │ │ ├── v0.0/ │ │ │ │ ├── alex.pth │ │ │ │ ├── squeeze.pth │ │ │ │ └── vgg.pth │ │ │ └── v0.1/ │ │ │ ├── alex.pth │ │ │ ├── squeeze.pth │ │ │ └── vgg.pth │ │ ├── model.py │ │ ├── non_leaking.py │ │ ├── op/ │ │ │ ├── __init__.py │ │ │ ├── fused_act.py │ │ │ ├── fused_bias_act.cpp │ │ │ ├── fused_bias_act_kernel.cu │ │ │ ├── upfirdn2d.cpp │ │ │ ├── upfirdn2d.py │ │ │ └── upfirdn2d_kernel.cu │ │ ├── ppl.py │ │ ├── sample/ │ │ │ └── .gitignore │ │ └── train.py │ ├── base_model.py │ ├── field_function_model.py │ ├── losses/ │ │ ├── __init__.py │ │ ├── arcface_loss.py │ │ └── discriminator_loss.py │ └── utils.py ├── quantitative_results.py ├── train.py └── utils/ ├── __init__.py ├── crop_img.py ├── dialog_edit_utils.py ├── editing_utils.py ├── inversion_utils.py ├── logger.py ├── numerical_metrics.py ├── options.py └── util.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ experiments/ results/ tb_logger/ *.pyc .vscode/ download download/* *.sh ================================================ FILE: README.md ================================================ # Talk-to-Edit (ICCV2021) ![Python 3.7](https://img.shields.io/badge/python-3.7-green.svg?style=plastic) ![pytorch 1.6.0](https://img.shields.io/badge/pytorch-1.6.0-green.svg?style=plastic) This repository contains the implementation of the following paper: > **Talk-to-Edit: Fine-Grained Facial Editing via Dialog**
> Yuming Jiang, Ziqi Huang, Xingang Pan, Chen Change Loy, Ziwei Liu
> IEEE International Conference on Computer Vision (**ICCV**), 2021
[[Paper](https://arxiv.org/abs/2109.04425)] [[Project Page](https://www.mmlab-ntu.com/project/talkedit/)] [[CelebA-Dialog Dataset](https://github.com/ziqihuangg/CelebA-Dialog)] [[Poster](https://drive.google.com/file/d/1KaojezBNqDrkwcT0yOkvAgqW1grwUDed/view?usp=sharing)] [[Video](https://www.youtube.com/watch?v=ZKMkQhkMXPI)] You can try our colab demo here. Enjoy! 1. Editing with dialog: google colab logo 1. Editing without dialog: google colab logo ## Overview ![overall_structure](./assets/teaser.png) ## Dependencies and Installation 1. Clone Repo ```bash git clone git@github.com:yumingj/Talk-to-Edit.git ``` 1. Create Conda Environment and Install Dependencies ```bash conda env create -f environment.yml conda activate talk_edit ``` - Python >= 3.7 - PyTorch >= 1.6 - CUDA 10.1 - GCC 5.4.0 ## Get Started ## Editing We provide scripts for editing using our pretrained models. 1. First, download the pretrained models from this [link](https://drive.google.com/drive/folders/1W9dvjz8bUolEIG524o8ZvM62uEWKJ5do?usp=sharing) and put them under `./download/pretrained_models` as follows: ``` ./download/pretrained_models ├── 1024_field │ ├── Bangs.pth │ ├── Eyeglasses.pth │ ├── No_Beard.pth │ ├── Smiling.pth │ └── Young.pth ├── 128_field │ ├── Bangs.pth │ ├── Eyeglasses.pth │ ├── No_Beard.pth │ ├── Smiling.pth │ └── Young.pth ├── arcface_resnet18_110.pth ├── language_encoder.pth.tar ├── predictor_1024.pth.tar ├── predictor_128.pth.tar ├── stylegan2_1024.pth ├── stylegan2_128.pt ├── StyleGAN2_FFHQ1024_discriminator.pth └── eval_predictor.pth.tar ``` 1. You can try pure image editing without dialog instructions: ```bash python editing_wo_dialog.py \ --opt ./configs/editing/editing_wo_dialog.yml \ --attr 'Bangs' \ --target_val 5 ``` The editing results will be saved in `./results`. You can change `attr` to one of the following attributes: `Bangs`, `Eyeglasses`, `Beard`, `Smiling`, and `Young(i.e. Age)`. And the `target_val` can be `[0, 1, 2, 3, 4, 5]`. 1. 
You can also try dialog-based editing, where you talk to the system through the command prompt: ```bash python editing_with_dialog.py --opt ./configs/editing/editing_with_dialog.yml ``` The editing results will be saved in `./results`. **How to talk to the system:** * Our system is able to edit five facial attributes: `Bangs`, `Eyeglasses`, `Beard`, `Smiling`, and `Young(i.e. Age)`. * When prompted with `"Enter your request (Press enter when you finish):"`, you can enter an editing request about one of the five attributes. For example, you can say `"Make the bangs longer."` * To respond to the system's feedback, just talk as if you were talking to a real person. For example, if the system asks `"Is the length of the bangs just right?"` after one round of editing, You can say things like `"Yes."` / `"No."` / `"Yes, and I also want her to smile more happily."`. * To end the conversation, just tell the system things like `"That's all"` / `"Nothing else, thank you."` 1. By default, the above editing would be performed on the teaser image. You may change the image to be edited in two ways: 1) change `line 11: latent_code_index` to other values ranging from `0` to `99`; 2) set `line 10: latent_code_path` to `~`, so that an image would be randomly generated. 1. If you want to try editing on real images, you may download the real images from this [link](https://drive.google.com/drive/folders/1BunrwvlwCBZJnb9QqeUp_uIXMxeXXJrY?usp=sharing) and put them under `./download/real_images`. You could also provide other real images at your choice. You need to change `line 12: img_path` in `editing_with_dialog.yml` or `editing_wo_dialog.yml` according to the path to the real image and set `line 11: is_real_image` as `True`. 1. You can switch the default image size to `128 x 128` by setting `line 3: img_res` to `128` in config files. ## Train the Semantic Field 1. 
To train the Semantic Field, a number of sampled latent codes should be prepared and then we use the attribute predictor to predict the facial attributes for their corresponding images. The attribute predictor is trained using fine-grained annotations in [CelebA-Dialog](https://github.com/ziqihuangg/CelebA-Dialog) dataset. Here, we provide the latent codes we used. You can download the train data from this [link](https://drive.google.com/drive/folders/1CYBpLIwts3ZVFiFAPb4TTnqYH3NBR63p?usp=sharing) and put them under `./download/train_data` as follows: ``` ./download/train_data ├── 1024 │ ├── Bangs │ ├── Eyeglasses │ ├── No_Beard │ ├── Smiling │ └── Young └── 128 ├── Bangs ├── Eyeglasses ├── No_Beard ├── Smiling └── Young ``` 1. We will also use some editing latent codes to monitor the training phase. You can download the editing latent code from this [link](https://drive.google.com/drive/folders/1G-0srCePEXcPq9HY38Il_4FTVHX_rOa-?usp=sharing) and put them under `./download/editing_data` as follows: ``` ./download/editing_data ├── 1024 │ ├── Bangs.npz.npy │ ├── Eyeglasses.npz.npy │ ├── No_Beard.npz.npy │ ├── Smiling.npz.npy │ └── Young.npz.npy └── 128 ├── Bangs.npz.npy ├── Eyeglasses.npz.npy ├── No_Beard.npz.npy ├── Smiling.npz.npy └── Young.npz.npy ``` 1. All logging files in the training process, *e.g.*, log message, checkpoints, and snapshots, will be saved to `./experiments` and `./tb_logger` directory. 1. There are 10 configuration files under `./configs/train`, named in the format of `field__`. Choose the corresponding configuration file for the attribute and resolution you want. 1. For example, to train the semantic field which edits the attribute `Bangs` in `128x128` image resolution, simply run: ```bash python train.py --opt ./configs/train/field_128_Bangs.yml ``` ## Quantitative Results We provide codes for quantitative results shown in Table 1. Here we use `Bangs` in `128x128` resolution as an example. 1. Use the trained semantic field to edit images. 
```bash python editing_quantitative.py \ --opt ./configs/train/field_128_bangs.yml \ --pretrained_path ./download/pretrained_models/128_field/Bangs.pth ``` 2. Evaluate the edited images using quantitative metircs. Change `image_num` for different attribute accordingly: `Bangs: 148`, `Eyeglasses: 82`, `Beard: 129`, `Smiling: 140`, `Young: 61`. ```bash python quantitative_results.py \ --attribute Bangs \ --work_dir ./results/field_128_bangs \ --image_dir ./results/field_128_bangs/visualization \ --image_num 148 ``` ## Qualitative Results ![result](./assets/1024_results_updated.png) ## CelebA-Dialog Dataset ![result](./assets/celeba_dialog.png) Our [**CelebA-Dialog Dataset**](https://github.com/ziqihuangg/CelebA-Dialog) is available for [Download](https://drive.google.com/drive/folders/18nejI_hrwNzWyoF6SW8bL27EYnM4STAs?usp=sharing). **CelebA-Dialog** is a large-scale visual-language face dataset with the following features: - Facial images are annotated with rich **fine-grained labels**, which classify one attribute into multiple degrees according to its semantic meaning. - Accompanied with each image, there are **captions** describing the attributes and a **user request** sample. ![result](./assets/dataset.png) The dataset can be employed as the training and test sets for the following computer vision tasks: fine-grained facial attribute recognition, fine-grained facial manipulation, text-based facial generation and manipulation, face image captioning, and broader natural language based facial recognition and manipulation tasks. 
## Citation If you find our repo useful for your research, please consider citing our paper: ```bibtex @inproceedings{jiang2021talk, title={Talk-to-Edit: Fine-Grained Facial Editing via Dialog}, author={Jiang, Yuming and Huang, Ziqi and Pan, Xingang and Loy, Chen Change and Liu, Ziwei}, booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, pages={13799--13808}, year={2021} } @article{jiang2023talk, title={Talk-to-edit: Fine-grained 2d and 3d facial editing via dialog}, author={Jiang, Yuming and Huang, Ziqi and Wu, Tianxing and Pan, Xingang and Loy, Chen Change and Liu, Ziwei}, journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, year={2023}, publisher={IEEE} } ``` ## Contact If you have any question, please feel free to contact us via `yuming002@ntu.edu.sg` or `hu0007qi@ntu.edu.sg`. ## Acknowledgement The codebase is maintained by [Yuming Jiang](https://yumingj.github.io/) and [Ziqi Huang](https://ziqihuangg.github.io/). Part of the code is borrowed from [stylegan2-pytorch](https://github.com/rosinality/stylegan2-pytorch), [IEP](https://github.com/facebookresearch/clevr-iep) and [face-attribute-prediction](https://github.com/d-li14/face-attribute-prediction). 
================================================ FILE: configs/attributes_5.json ================================================ { "attr_info":{ "6": { "name": "Bangs", "value":[0, 1, 2, 3, 4, 5], "idx_scale": 1, "idx_bias": 0 }, "16": { "name": "Eyeglasses", "value":[0, 1, 2, 3, 4, 5], "idx_scale": 1, "idx_bias": 0 }, "25": { "name": "No_Beard", "value":[0, 1, 2, 3, 4, 5], "idx_scale": -1, "idx_bias": 5 }, "32": { "name": "Smiling", "value":[0, 1, 2, 3, 4, 5], "idx_scale": 1, "idx_bias": 0 }, "40": { "name": "Young", "value":[0, 1, 2, 3, 4, 5], "idx_scale": -1, "idx_bias": 5 } }, "newIdx_to_attrIdx":{ "0": "6", "1": "16", "2": "25", "3": "32", "4": "40" }, "newIdx_to_attrName":{ "0": "Bangs", "1": "Eyeglasses", "2": "No_Beard", "3": "Smiling", "4": "Young" }, "attrName_to_newIdx":{ "Bangs": "0", "Eyeglasses": "1", "No_Beard": "2", "Smiling": "3", "Young": "4" }, "attrIdx_to_newIdx":{ "6": 0, "16": 1, "25": 2, "32": 3, "40": 4 } } ================================================ FILE: configs/editing/editing_with_dialog.yml ================================================ name: dialog_editing img_res: 1024 # 128 # latent code latent_code_path: ./download/editing_data/teaser_latent_code.npz.npy latent_code_index: 38 # inversion inversion: is_real_image: False # False img_path: ./download/real_images/annehathaway.png crop_img: True device: cuda img_mse_weight: 1.0 step: 600 noise: 0.05 noise_ramp: 0.75 lr: 0.1 lr_gen: !!float 1e-4 use_tb_logger: true set_CUDA_VISIBLE_DEVICES: ~ gpu_ids: [3] attribute: Eyeglasses model_type: FieldFunctionModel fix_layers: true replaced_layers_128: 8 replaced_layers_1024: 10 manual_seed: 2021 # editing configs confidence_thresh: 0 max_cls_num: 5 min_cls_num: 0 max_trials_num: 100 print_every: False transform_z_to_w: False # field_function configs num_layer: 8 hidden_dim: 512 leaky_relu_neg_slope: 0.2 # predictor attr_file: ./configs/attributes_5.json baseline: classification use_sigmoid: True gt_remapping_file: ~ predictor_ckpt_128: 
./download/pretrained_models/predictor_128.pth.tar predictor_ckpt_1024: ./download/pretrained_models/predictor_1024.pth.tar # stylegan configs latent_dim: 512 n_mlp: 8 channel_multiplier_128: 1 channel_multiplier_1024: 2 generator_ckpt_128: ./download/pretrained_models/stylegan2_128.pt generator_ckpt_1024: ./download/pretrained_models/stylegan2_1024.pth latent_space: w # ---------- Dialog Editing ----------- has_dialog: True device_name: gpu # pretrained field pretrained_field_128: Bangs: ./download/pretrained_models/128_field/Bangs.pth Eyeglasses: ./download/pretrained_models/128_field/Eyeglasses.pth No_Beard: ./download/pretrained_models/128_field/No_Beard.pth Smiling: ./download/pretrained_models/128_field/Smiling.pth Young: ./download/pretrained_models/128_field/Young.pth pretrained_field_1024: Bangs: ./download/pretrained_models/1024_field/Bangs.pth Eyeglasses: ./download/pretrained_models/1024_field/Eyeglasses.pth No_Beard: ./download/pretrained_models/1024_field/No_Beard.pth Smiling: ./download/pretrained_models/1024_field/Smiling.pth Young: ./download/pretrained_models/1024_field/Young.pth attr_to_idx: Bangs: 0 Eyeglasses: 1 No_Beard: 2 Smiling: 3 Young: 4 # language template files set up feedback_templates_file: ./language/templates/feedback.json metadata_file: ./language/templates/metadata_fsm.json pool_file: ./language/templates/pool.json system_mode_file: ./language/templates/system_mode.json input_vocab_file: ./language/templates/vocab.json # dialog setting postfix_prob: 0.3 whether_enough_general_prob: 0.2 allow_unknown: 1 verbose: 0 # pretrained language encoder pretrained_language_encoder: ./download/pretrained_models/language_encoder.pth.tar language_encoder: word_embedding_dim: 300 text_embed_size: 1024 linear_hidden_size: 256 linear_dropout_rate: 0 ================================================ FILE: configs/editing/editing_wo_dialog.yml ================================================ name: editing_wo_dialog img_res: 1024 # 128 # latent code 
latent_code_path: ./download/editing_data/teaser_latent_code.npz.npy latent_code_index: 38 # inversion inversion: is_real_image: False # False img_path: ./download/real_images/annehathaway.png crop_img: True device: cuda img_mse_weight: 1.0 step: 600 noise: 0.05 noise_ramp: 0.75 lr: 0.1 lr_gen: !!float 1e-4 use_tb_logger: true set_CUDA_VISIBLE_DEVICES: ~ gpu_ids: [3] attribute: Eyeglasses model_type: FieldFunctionModel fix_layers: true replaced_layers_128: 8 replaced_layers_1024: 10 manual_seed: 2021 # editing configs confidence_thresh: 0 max_cls_num: 5 min_cls_num: 0 max_trials_num: 100 print_every: False transform_z_to_w: False # field_function configs num_layer: 8 hidden_dim: 512 leaky_relu_neg_slope: 0.2 # predictor attr_file: ./configs/attributes_5.json baseline: classification use_sigmoid: True gt_remapping_file: ~ predictor_ckpt_128: ./download/pretrained_models/predictor_128.pth.tar predictor_ckpt_1024: ./download/pretrained_models/predictor_1024.pth.tar # stylegan configs latent_dim: 512 n_mlp: 8 channel_multiplier_128: 1 channel_multiplier_1024: 2 generator_ckpt_128: ./download/pretrained_models/stylegan2_128.pt generator_ckpt_1024: ./download/pretrained_models/stylegan2_1024.pth latent_space: w # ---------- Dialog Editing ----------- has_dialog: False device_name: gpu # pretrained field pretrained_field_128: Bangs: ./download/pretrained_models/128_field/Bangs.pth Eyeglasses: ./download/pretrained_models/128_field/Eyeglasses.pth No_Beard: ./download/pretrained_models/128_field/No_Beard.pth Smiling: ./download/pretrained_models/128_field/Smiling.pth Young: ./download/pretrained_models/128_field/Young.pth pretrained_field_1024: Bangs: ./download/pretrained_models/1024_field/Bangs.pth Eyeglasses: ./download/pretrained_models/1024_field/Eyeglasses.pth No_Beard: ./download/pretrained_models/1024_field/No_Beard.pth Smiling: ./download/pretrained_models/1024_field/Smiling.pth Young: ./download/pretrained_models/1024_field/Young.pth attr_to_idx: Bangs: 0 
Eyeglasses: 1 No_Beard: 2 Smiling: 3 Young: 4 ================================================ FILE: configs/train/field_1024_bangs.yml ================================================ name: field_1024_bangs use_tb_logger: true set_CUDA_VISIBLE_DEVICES: ~ gpu_ids: [3] attribute: Bangs model_type: FieldFunctionModel fix_layers: true replaced_layers: 10 # dataset configs batch_size: 8 num_workers: 8 input_latent_dir: ./download/train_data/1024/Bangs editing_latent_code_path: ./download/editing_data/1024/Bangs.npz.npy num_attr: 5 val_on_train_subset: true val_on_valset: true # training configs val_freq: 1 print_freq: 100 weight_decay: 0 manual_seed: 2021 num_epochs: 500 lr: !!float 1e-4 lr_decay: step gamma: 0.1 step: 100 # editing configs confidence_thresh: 0.8 max_cls_num: 5 max_trials_num: 100 print_every: False # field_function configs num_layer: 8 hidden_dim: 512 leaky_relu_neg_slope: 0.2 # loss configs # predictor loss edited_attribute_weight: 1.0 attr_file: ./configs/attributes_5.json predictor_ckpt: ./download/pretrained_models/predictor_1024.pth.tar # arcface loss pretrained_arcface: ./download/pretrained_models/arcface_resnet18_110.pth arcface_weight: 5.0 arcface_loss_type: l1 # disciminator loss disc_weight: 1.0 discriminator_ckpt: ./download/pretrained_models/StyleGAN2_FFHQ1024_discriminator.pth # stylegan configs img_res: 1024 latent_dim: 512 n_mlp: 8 channel_multiplier: 2 generator_ckpt: ./download/pretrained_models/stylegan2_1024.pth latent_space: w ================================================ FILE: configs/train/field_1024_beard.yml ================================================ name: field_1024_beard use_tb_logger: true set_CUDA_VISIBLE_DEVICES: ~ gpu_ids: [3] attribute: No_Beard model_type: FieldFunctionModel fix_layers: true replaced_layers: 10 # dataset configs batch_size: 8 num_workers: 8 input_latent_dir: ./download/train_data/1024/No_Beard editing_latent_code_path: ./download/editing_data/1024/No_Beard.npz.npy num_attr: 5 
val_on_train_subset: true val_on_valset: true # training configs val_freq: 1 print_freq: 100 weight_decay: 0 manual_seed: 2021 num_epochs: 30 lr: !!float 1e-4 lr_decay: step gamma: 0.1 step: 100 # editing configs confidence_thresh: 0.8 max_cls_num: 5 max_trials_num: 100 print_every: False # field_function configs num_layer: 8 hidden_dim: 512 leaky_relu_neg_slope: 0.2 # loss configs # predictor loss edited_attribute_weight: 1.0 attr_file: ./configs/attributes_5.json predictor_ckpt: ./download/pretrained_models/predictor_1024.pth.tar # arcface loss pretrained_arcface: ./download/pretrained_models/arcface_resnet18_110.pth arcface_weight: 10.0 arcface_loss_type: l1 # disciminator loss disc_weight: 1.0 discriminator_ckpt: ./download/pretrained_models/StyleGAN2_FFHQ1024_discriminator.pth # stylegan configs img_res: 1024 latent_dim: 512 n_mlp: 8 channel_multiplier: 2 generator_ckpt: ./download/pretrained_models/stylegan2_1024.pth latent_space: w ================================================ FILE: configs/train/field_1024_eyeglasses.yml ================================================ name: field_1024_eyeglasses use_tb_logger: true set_CUDA_VISIBLE_DEVICES: ~ gpu_ids: [3] attribute: Eyeglasses model_type: FieldFunctionModel fix_layers: true replaced_layers: 10 # dataset configs batch_size: 8 num_workers: 8 input_latent_dir: ./download/train_data/1024/Eyeglasses editing_latent_code_path: ./download/editing_data/1024/Eyeglasses.npz.npy num_attr: 5 val_on_train_subset: true val_on_valset: true # training configs val_freq: 1 print_freq: 100 weight_decay: 0 manual_seed: 2021 num_epochs: 30 lr: !!float 1e-4 lr_decay: step gamma: 0.1 step: 100 # editing configs confidence_thresh: 0.8 max_cls_num: 5 max_trials_num: 100 print_every: False # field_function configs num_layer: 8 hidden_dim: 512 leaky_relu_neg_slope: 0.2 # loss configs # predictor loss edited_attribute_weight: 1.0 attr_file: ./configs/attributes_5.json predictor_ckpt: 
./download/pretrained_models/predictor_1024.pth.tar # arcface loss pretrained_arcface: ./download/pretrained_models/arcface_resnet18_110.pth arcface_weight: 10.0 arcface_loss_type: l1 # disciminator loss disc_weight: 1.0 discriminator_ckpt: ./download/pretrained_models/StyleGAN2_FFHQ1024_discriminator.pth # stylegan configs img_res: 1024 latent_dim: 512 n_mlp: 8 channel_multiplier: 2 generator_ckpt: ./download/pretrained_models/stylegan2_1024.pth latent_space: w ================================================ FILE: configs/train/field_1024_smiling.yml ================================================ name: field_1024_smiling use_tb_logger: true set_CUDA_VISIBLE_DEVICES: ~ gpu_ids: [3] attribute: Smiling model_type: FieldFunctionModel fix_layers: true replaced_layers: 10 # dataset configs batch_size: 8 num_workers: 8 input_latent_dir: ./download/train_data/1024/Smiling editing_latent_code_path: ./download/editing_data/1024/Smiling.npz.npy num_attr: 5 val_on_train_subset: true val_on_valset: true # training configs val_freq: 1 print_freq: 100 weight_decay: 0 manual_seed: 2021 num_epochs: 30 lr: !!float 1e-4 lr_decay: step gamma: 0.1 step: 100 # editing configs confidence_thresh: 0.8 max_cls_num: 5 max_trials_num: 100 print_every: False # field_function configs num_layer: 8 hidden_dim: 512 leaky_relu_neg_slope: 0.2 # loss configs # predictor loss edited_attribute_weight: 1.0 attr_file: ./configs/attributes_5.json predictor_ckpt: ./download/pretrained_models/predictor_1024.pth.tar # arcface loss pretrained_arcface: ./download/pretrained_models/arcface_resnet18_110.pth arcface_weight: 5.0 arcface_loss_type: l1 # disciminator loss disc_weight: 1.0 discriminator_ckpt: ./download/pretrained_models/StyleGAN2_FFHQ1024_discriminator.pth # stylegan configs img_res: 1024 latent_dim: 512 n_mlp: 8 channel_multiplier: 2 generator_ckpt: ./download/pretrained_models/stylegan2_1024.pth latent_space: w ================================================ FILE: 
configs/train/field_1024_young.yml ================================================ name: field_1024_young use_tb_logger: true set_CUDA_VISIBLE_DEVICES: ~ gpu_ids: [3] attribute: Young model_type: FieldFunctionModel fix_layers: true replaced_layers: 10 # dataset configs batch_size: 8 num_workers: 8 input_latent_dir: ./download/train_data/1024/Young editing_latent_code_path: ./download/editing_data/1024/Young.npz.npy num_attr: 5 val_on_train_subset: true val_on_valset: true # training configs val_freq: 1 print_freq: 100 weight_decay: 0 manual_seed: 2021 num_epochs: 30 lr: !!float 1e-4 lr_decay: step gamma: 0.1 step: 100 # editing configs confidence_thresh: 0.8 max_cls_num: 5 max_trials_num: 100 print_every: False # field_function configs num_layer: 8 hidden_dim: 512 leaky_relu_neg_slope: 0.2 # loss configs # predictor loss edited_attribute_weight: 1.0 attr_file: ./configs/attributes_5.json predictor_ckpt: ./download/pretrained_models/predictor_1024.pth.tar # arcface loss pretrained_arcface: ./download/pretrained_models/arcface_resnet18_110.pth arcface_weight: 10.0 arcface_loss_type: l1 # disciminator loss disc_weight: 1.0 discriminator_ckpt: ./download/pretrained_models/StyleGAN2_FFHQ1024_discriminator.pth # stylegan configs img_res: 1024 latent_dim: 512 n_mlp: 8 channel_multiplier: 2 generator_ckpt: ./download/pretrained_models/stylegan2_1024.pth latent_space: w ================================================ FILE: configs/train/field_128_bangs.yml ================================================ name: field_128_bangs use_tb_logger: true set_CUDA_VISIBLE_DEVICES: ~ gpu_ids: [3] attribute: Bangs model_type: FieldFunctionModel fix_layers: true replaced_layers: 8 # dataset configs batch_size: 32 num_workers: 8 input_latent_dir: ./download/train_data/128/Bangs editing_latent_code_path: ./download/editing_data/128/Bangs.npz.npy num_attr: 5 val_on_train_subset: true val_on_valset: true # training configs val_freq: 1 print_freq: 100 weight_decay: 0 manual_seed: 2021 
num_epochs: 30 lr: !!float 1e-4 lr_decay: step gamma: 0.1 step: 100 # editing configs confidence_thresh: 0.8 max_cls_num: 5 max_trials_num: 100 print_every: False # field_function configs num_layer: 8 hidden_dim: 512 leaky_relu_neg_slope: 0.2 # loss configs # predictor loss edited_attribute_weight: 1.0 attr_file: ./configs/attributes_5.json predictor_ckpt: ./download/pretrained_models/predictor_128.pth.tar # arcface loss pretrained_arcface: ./download/pretrained_models/arcface_resnet18_110.pth arcface_weight: 5.0 arcface_loss_type: l1 # disciminator loss disc_weight: 1.0 discriminator_ckpt: ./download/pretrained_models/stylegan2_128.pt # stylegan configs img_res: 128 latent_dim: 512 n_mlp: 8 channel_multiplier: 1 generator_ckpt: ./download/pretrained_models/stylegan2_128.pt latent_space: w ================================================ FILE: configs/train/field_128_beard.yml ================================================ name: field_128_beard use_tb_logger: true set_CUDA_VISIBLE_DEVICES: ~ gpu_ids: [3] attribute: No_Beard model_type: FieldFunctionModel fix_layers: true replaced_layers: 8 # dataset configs batch_size: 32 num_workers: 8 input_latent_dir: ./download/train_data/128/No_Beard editing_latent_code_path: ./download/editing_data/128/No_Beard.npz.npy num_attr: 5 val_on_train_subset: true val_on_valset: true # training configs val_freq: 1 print_freq: 100 weight_decay: 0 manual_seed: 2021 num_epochs: 30 lr: !!float 1e-4 lr_decay: step gamma: 0.1 step: 100 # editing configs confidence_thresh: 0.8 max_cls_num: 5 max_trials_num: 100 print_every: False # field_function configs num_layer: 8 hidden_dim: 512 leaky_relu_neg_slope: 0.2 # loss configs # predictor loss edited_attribute_weight: 1.0 attr_file: ./configs/attributes_5.json predictor_ckpt: ./download/pretrained_models/predictor_128.pth.tar # arcface loss pretrained_arcface: ./download/pretrained_models/arcface_resnet18_110.pth arcface_weight: 5.0 arcface_loss_type: l1 # disciminator loss disc_weight: 1.0 
discriminator_ckpt: ./download/pretrained_models/stylegan2_128.pt # stylegan configs img_res: 128 latent_dim: 512 n_mlp: 8 channel_multiplier: 1 generator_ckpt: ./download/pretrained_models/stylegan2_128.pt latent_space: w ================================================ FILE: configs/train/field_128_eyeglasses.yml ================================================ name: field_128_eyeglasses use_tb_logger: true set_CUDA_VISIBLE_DEVICES: ~ gpu_ids: [3] attribute: Eyeglasses model_type: FieldFunctionModel fix_layers: true replaced_layers: 8 # dataset configs batch_size: 32 num_workers: 8 input_latent_dir: ./download/train_data/128/Eyeglasses editing_latent_code_path: ./download/editing_data/128/Eyeglasses.npz.npy num_attr: 5 val_on_train_subset: true val_on_valset: true # training configs val_freq: 1 print_freq: 100 weight_decay: 0 manual_seed: 2021 num_epochs: 30 lr: !!float 1e-4 lr_decay: step gamma: 0.1 step: 100 # editing configs confidence_thresh: 0.8 max_cls_num: 5 max_trials_num: 100 print_every: False # field_function configs num_layer: 8 hidden_dim: 512 leaky_relu_neg_slope: 0.2 # loss configs # predictor loss edited_attribute_weight: 1.0 attr_file: ./configs/attributes_5.json predictor_ckpt: ./download/pretrained_models/predictor_128.pth.tar # arcface loss pretrained_arcface: ./download/pretrained_models/arcface_resnet18_110.pth arcface_weight: 5.0 arcface_loss_type: l1 # disciminator loss disc_weight: 1.0 discriminator_ckpt: ./download/pretrained_models/stylegan2_128.pt # stylegan configs img_res: 128 latent_dim: 512 n_mlp: 8 channel_multiplier: 1 generator_ckpt: ./download/pretrained_models/stylegan2_128.pt latent_space: w ================================================ FILE: configs/train/field_128_smiling.yml ================================================ name: field_128_smiling use_tb_logger: true set_CUDA_VISIBLE_DEVICES: ~ gpu_ids: [3] attribute: Smiling model_type: FieldFunctionModel fix_layers: true replaced_layers: 8 # dataset configs batch_size: 
32 num_workers: 8 input_latent_dir: ./download/train_data/128/Smiling editing_latent_code_path: ./download/editing_data/128/Smiling.npz.npy num_attr: 5 val_on_train_subset: true val_on_valset: true # training configs val_freq: 1 print_freq: 100 weight_decay: 0 manual_seed: 2021 num_epochs: 30 lr: !!float 1e-4 lr_decay: step gamma: 0.1 step: 100 # editing configs confidence_thresh: 0.8 max_cls_num: 5 max_trials_num: 100 print_every: False # field_function configs num_layer: 8 hidden_dim: 512 leaky_relu_neg_slope: 0.2 # loss configs # predictor loss edited_attribute_weight: 1.0 attr_file: ./configs/attributes_5.json predictor_ckpt: ./download/pretrained_models/predictor_128.pth.tar # arcface loss pretrained_arcface: ./download/pretrained_models/arcface_resnet18_110.pth arcface_weight: 5.0 arcface_loss_type: l1 # disciminator loss disc_weight: 1.0 discriminator_ckpt: ./download/pretrained_models/stylegan2_128.pt # stylegan configs img_res: 128 latent_dim: 512 n_mlp: 8 channel_multiplier: 1 generator_ckpt: ./download/pretrained_models/stylegan2_128.pt ================================================ FILE: configs/train/field_128_young.yml ================================================ name: field_128_young use_tb_logger: true set_CUDA_VISIBLE_DEVICES: ~ gpu_ids: [3] attribute: Young model_type: FieldFunctionModel fix_layers: true replaced_layers: 8 # dataset configs batch_size: 32 num_workers: 8 input_latent_dir: ./download/train_data/128/Young editing_latent_code_path: ./download/editing_data/128/Young.npz.npy num_attr: 5 val_on_train_subset: true val_on_valset: true # training configs val_freq: 1 print_freq: 100 weight_decay: 0 manual_seed: 2021 num_epochs: 30 lr: !!float 1e-4 lr_decay: step gamma: 0.1 step: 100 # editing configs confidence_thresh: 0.5 max_cls_num: 5 max_trials_num: 100 print_every: False # field_function configs num_layer: 8 hidden_dim: 512 leaky_relu_neg_slope: 0.2 # loss configs # predictor loss edited_attribute_weight: 1.0 attr_file: 
class LatentCodeDataset(data.Dataset):
    """Dataset of StyleGAN latent codes with predicted attribute labels.

    Expects three row-aligned ``.npy`` files inside ``input_dir``:
    ``selected_latent_code.npy``, ``selected_pred_class.npy`` and
    ``selected_pred_scores.npy``.
    """

    def __init__(self, input_dir, subset_samples=None):
        assert os.path.exists(input_dir)

        def _read(name, dtype):
            # Load one aligned array from disk with the requested dtype.
            return np.load(os.path.join(input_dir, name)).astype(dtype)

        self.latent_codes = torch.FloatTensor(
            _read('selected_latent_code.npy', float))
        self.labels = torch.LongTensor(_read('selected_pred_class.npy', int))
        self.scores = torch.FloatTensor(
            _read('selected_pred_scores.npy', float))

        # optionally keep only a random subset of the training samples
        if subset_samples is not None and len(
                self.latent_codes) > subset_samples:
            chosen = random.sample(
                list(range(len(self.latent_codes))), subset_samples)
            self.latent_codes = [self.latent_codes[i] for i in chosen]
            self.labels = [self.labels[i] for i in chosen]
            self.scores = [self.scores[i] for i in chosen]

        # the three containers must stay aligned sample-by-sample
        assert len(self.latent_codes) == len(self.labels)
        assert len(self.labels) == len(self.scores)

    def __getitem__(self, index):
        # one sample: (latent code, predicted class, prediction score)
        return (self.latent_codes[index], self.labels[index],
                self.scores[index])

    def __len__(self):
        return len(self.latent_codes)
def parse_args():
    """Parses arguments."""
    parser = argparse.ArgumentParser(description='')
    parser.add_argument(
        '--opt', default=None, type=str, help='Path to option YAML file.')
    return parser.parse_args()


def main():
    """Run an interactive dialog-based facial-editing session.

    Loads options and language templates, builds the field model, obtains a
    starting latent code (by GAN inversion, random sampling, or from a saved
    file), then hands control to ``dialog_with_real_user``.
    """
    # ---------- Set up -----------
    args = parse_args()
    opt = parse(args.opt, is_train=False)
    opt = parse_opt_wrt_resolution(opt)
    # NOTE(review): presumably copies option values onto the argparse
    # namespace -- confirm against utils/options.py
    args = parse_args_from_opt(args, opt)
    make_exp_dirs(opt)
    # convert to NoneDict, which returns None for missing keys
    opt = dict_to_nonedict(opt)
    # set up logger
    save_log_path = f'{opt["path"]["log"]}'
    dialog_logger = get_root_logger(
        logger_name='dialog',
        log_level=logging.INFO,
        log_file=f'{save_log_path}/dialog.log')
    dialog_logger.info(dict2str(opt))
    save_image_path = f'{opt["path"]["visualization"]}'
    os.makedirs(save_image_path)
    # ---------- Load files -----------
    dialog_logger.info('loading template files')
    with open(opt['feedback_templates_file'], 'r') as f:
        args.feedback_templates = json.load(f)
        args.feedback_replacement = args.feedback_templates['replacement']
    with open(opt['pool_file'], 'r') as f:
        pool = json.load(f)
        args.synonyms_dict = pool["synonyms"]
    # ---------- create model ----------
    field_model = create_model(opt)
    # ---------- load latent code ----------
    if opt['inversion']['is_real_image']:
        # invert a real photograph into the latent space
        latent_code = inversion(opt, field_model)
    else:
        if opt['latent_code_path'] is None:
            # sample a fresh random code and map it through the generator's
            # latent mapping (get_latent); saved for reproducibility
            latent_code = torch.randn(1, 512, device=torch.device('cuda'))
            with torch.no_grad():
                latent_code = field_model.stylegan_gen.get_latent(latent_code)
            latent_code = latent_code.cpu().numpy()
            np.save(f'{opt["path"]["visualization"]}/latent_code.npz.npy',
                    latent_code)
        else:
            # load a pre-saved latent code, keyed by zero-padded image name
            i = opt['latent_code_index']
            latent_code = np.load(
                opt['latent_code_path'],
                allow_pickle=True).item()[f"{str(i).zfill(7)}.png"]
            latent_code = torch.from_numpy(latent_code).to(
                torch.device('cuda'))
            with torch.no_grad():
                latent_code = field_model.stylegan_gen.get_latent(latent_code)
            latent_code = latent_code.cpu().numpy()
            np.save(f'{opt["path"]["visualization"]}/latent_code.npz.npy',
                    latent_code)
    # ---------- Perform dialog-based editing with user -----------
    dialog_overall_log = dialog_with_real_user(field_model, latent_code, opt,
                                               args, dialog_logger)
    # ---------- Log the dialog history -----------
    for (key, value) in dialog_overall_log.items():
        dialog_logger.info(f'{key}: {value}')
    dialog_logger.info('successfully end.')


if __name__ == '__main__':
    main()
create model ---------- field_model = create_model(opt) # ---------- load latent code ---------- if opt['inversion']['is_real_image']: latent_code = inversion(opt, field_model) else: if opt['latent_code_path'] is None: latent_code = torch.randn(1, 512, device=torch.device('cuda')) with torch.no_grad(): latent_code = field_model.stylegan_gen.get_latent(latent_code) latent_code = latent_code.cpu().numpy() np.save(f'{opt["path"]["visualization"]}/latent_code.npz.npy', latent_code) else: i = opt['latent_code_index'] latent_code = np.load( opt['latent_code_path'], allow_pickle=True).item()[f"{str(i).zfill(7)}.png"] latent_code = torch.from_numpy(latent_code).to( torch.device('cuda')) with torch.no_grad(): latent_code = field_model.stylegan_gen.get_latent(latent_code) latent_code = latent_code.cpu().numpy() # ---------- synthesize images ---------- with torch.no_grad(): start_image, start_label, start_score = \ field_model.synthesize_and_predict(torch.from_numpy(latent_code).to(torch.device('cuda'))) # noqa save_image(start_image, f'{opt["path"]["visualization"]}/start_image.png') # initialize attribtue_dict attribute_dict = { "Bangs": start_label[0], "Eyeglasses": start_label[1], "No_Beard": start_label[2], "Smiling": start_label[3], "Young": start_label[4], } edit_label = {'attribute': args.attr, 'target_score': args.target_val} edited_latent_code = None print_intermediate_result = True round_idx = 0 attribute_dict, exception_mode, latent_code, edited_latent_code = edit_target_attribute( opt, attribute_dict, edit_label, round_idx, latent_code, edited_latent_code, field_model, editing_logger, print_intermediate_result) if exception_mode != 'normal': if exception_mode == 'already_at_target_class': editing_logger.info("This attribute is already at the degree that you want. Let's try a different attribute degree or another attribute.") elif exception_mode == 'max_edit_num_reached': editing_logger.info("Sorry, we are unable to edit this attribute. 
Perhaps we can try something else.") if __name__ == '__main__': main() ================================================ FILE: environment.yml ================================================ name: talk_edit channels: - pytorch - conda-forge - anaconda - defaults dependencies: - _libgcc_mutex=0.1=main - absl-py=0.11.0=pyhd3eb1b0_1 - aiohttp=3.7.3=py37h27cfd23_1 - async-timeout=3.0.1=py37h06a4308_0 - attrs=20.3.0=pyhd3eb1b0_0 - backcall=0.2.0=py_0 - blas=1.0=mkl - blinker=1.4=py37h06a4308_0 - blosc=1.21.0=h8c45485_0 - brotli=1.0.9=he6710b0_2 - brotlipy=0.7.0=py37h27cfd23_1003 - brunsli=0.1=h2531618_0 - bzip2=1.0.8=h7b6447c_0 - c-ares=1.17.1=h27cfd23_0 - ca-certificates=2021.7.5=h06a4308_1 - cachetools=4.2.1=pyhd3eb1b0_0 - certifi=2021.5.30=py37h06a4308_0 - cffi=1.14.4=py37h261ae71_0 - chardet=3.0.4=py37h06a4308_1003 - charls=2.1.0=he6710b0_2 - click=7.1.2=pyhd3eb1b0_0 - cloudpickle=1.6.0=py_0 - cryptography=2.9.2=py37h1ba5d50_0 - cudatoolkit=10.1.243=h6bb024c_0 - cycler=0.10.0=py_2 - cytoolz=0.11.0=py37h7b6447c_0 - dask-core=2021.3.0=pyhd3eb1b0_0 - decorator=4.4.2=pyhd3eb1b0_0 - freetype=2.10.4=h5ab3b9f_0 - giflib=5.1.4=h14c3975_1 - google-auth=1.24.0=pyhd3eb1b0_0 - google-auth-oauthlib=0.4.2=pyhd3eb1b0_2 - grpcio=1.31.0=py37hf8bcb03_0 - icu=67.1=he1b5a44_0 - idna=2.10=pyhd3eb1b0_0 - imagecodecs=2021.1.11=py37h581e88b_1 - imageio=2.9.0=py_0 - intel-openmp=2020.2=254 - ipython=7.18.1=py37h5ca1d4c_0 - ipython_genutils=0.2.0=py37_0 - jedi=0.18.0=py37h06a4308_1 - joblib=1.0.0=pyhd3eb1b0_0 - jpeg=9b=h024ee3a_2 - jxrlib=1.1=h7b6447c_2 - kiwisolver=1.3.1=py37hc928c03_0 - lcms2=2.11=h396b838_0 - ld_impl_linux-64=2.33.1=h53a641e_7 - lerc=2.2.1=h2531618_0 - libaec=1.0.4=he6710b0_1 - libdeflate=1.7=h27cfd23_5 - libedit=3.1.20191231=h14c3975_1 - libffi=3.3=he6710b0_2 - libgcc-ng=9.1.0=hdf63c60_0 - libgfortran-ng=7.3.0=hdf63c60_0 - libpng=1.6.37=hbc83047_0 - libprotobuf=3.13.0.1=h8b12597_0 - libstdcxx-ng=9.1.0=hdf63c60_0 - libtiff=4.1.0=h2733197_1 - libwebp=1.0.1=h8e7db2f_0 - 
libzopfli=1.0.3=he6710b0_0 - lz4-c=1.9.3=h2531618_0 - markdown=3.3.3=py37h06a4308_0 - matplotlib=3.2.2=1 - matplotlib-base=3.2.2=py37h1d35a4c_1 - mkl=2020.2=256 - mkl-service=2.3.0=py37he8ac12f_0 - mkl_fft=1.2.0=py37h23d657b_0 - mkl_random=1.1.1=py37h0573a6f_0 - multidict=4.7.6=py37h7b6447c_1 - ncurses=6.2=he6710b0_1 - networkx=2.5=py_0 - ninja=1.10.2=py37hff7bd54_0 - numpy=1.19.2=py37h54aff64_0 - numpy-base=1.19.2=py37hfa32c7d_0 - oauthlib=3.1.0=py_0 - olefile=0.46=py37_0 - openjpeg=2.3.0=h05c96fa_1 - openssl=1.1.1k=h27cfd23_0 - parso=0.8.0=py_0 - pexpect=4.8.0=py37_1 - pickleshare=0.7.5=py37_1001 - pillow=8.2.0=py37he98fc37_0 - pip=20.3.3=py37h06a4308_0 - prompt-toolkit=3.0.8=py_0 - protobuf=3.13.0.1=py37he6710b0_1 - ptyprocess=0.6.0=py37_0 - pyasn1=0.4.8=py_0 - pyasn1-modules=0.2.8=py_0 - pycparser=2.20=py_2 - pygments=2.7.1=py_0 - pyjwt=2.0.1=py37h06a4308_0 - pyopenssl=20.0.1=pyhd3eb1b0_1 - pyparsing=2.4.7=pyh9f0ad1d_0 - pysocks=1.7.1=py37_1 - python=3.7.9=h7579374_0 - python-dateutil=2.8.1=py_0 - python_abi=3.7=1_cp37m - pytorch=1.6.0=py3.7_cuda10.1.243_cudnn7.6.3_0 - pywavelets=1.1.1=py37h7b6447c_2 - pyyaml=5.4.1=py37h27cfd23_1 - readline=8.0=h7b6447c_0 - requests=2.25.1=pyhd3eb1b0_0 - requests-oauthlib=1.3.0=py_0 - rsa=4.7=pyhd3eb1b0_1 - scikit-image=0.17.2=py37hdf5156a_0 - scikit-learn=0.23.2=py37h0573a6f_0 - scipy=1.6.2=py37h91f5cce_0 - setuptools=52.0.0=py37h06a4308_0 - six=1.15.0=py37h06a4308_0 - snappy=1.1.8=he6710b0_0 - sqlite=3.33.0=h62c20be_0 - tensorboard=2.3.0=pyh4dce500_0 - tensorboard-plugin-wit=1.6.0=py_0 - tensorboardx=2.1=py_0 - threadpoolctl=2.1.0=pyh5ca1d4c_0 - tifffile=2021.3.5=pyhd3eb1b0_1 - tk=8.6.10=hbc83047_0 - toolz=0.11.1=pyhd3eb1b0_0 - torchvision=0.7.0=py37_cu101 - tornado=6.1=py37h4abf009_0 - tqdm=4.55.1=pyhd3eb1b0_0 - traitlets=5.0.5=py_0 - typing-extensions=3.7.4.3=hd3eb1b0_0 - typing_extensions=3.7.4.3=pyh06a4308_0 - urllib3=1.26.3=pyhd3eb1b0_0 - wcwidth=0.2.5=py_0 - werkzeug=1.0.1=pyhd3eb1b0_0 - wheel=0.36.2=pyhd3eb1b0_0 - 
xz=5.2.5=h7b6447c_0 - yaml=0.2.5=h7b6447c_0 - yarl=1.5.1=py37h7b6447c_0 - zfp=0.5.5=h2531618_4 - zipp=3.4.0=pyhd3eb1b0_0 - zlib=1.2.11=h7b6447c_3 - zstd=1.4.5=h9ceee32_0 - pip: - cmake==3.21.2 - dlib==19.22.1 - facenet-pytorch==2.5.2 - flake8==3.8.4 - future==0.18.2 - importlib-metadata==3.4.0 - isort==5.7.0 - lpips==0.1.4 - mccabe==0.6.1 - opencv-python==4.5.1.48 - pycodestyle==2.6.0 - pyflakes==2.2.0 - yapf==0.30.0 ================================================ FILE: language/accuracy.py ================================================ import torch def head_accuracy(output, target, unlabeled_value=999): """ Computes the precision@k for the specified values of k output: batch_size * num_cls (for a specific attribute) target: batch_size * 1 (for a specific attribute) return res: res = 100 * num_correct / batch_size, for a specific attribute for a batch """ with torch.no_grad(): batch_size = target.size(0) # _ = the largest score, pred = cls_idx with the largest score _, pred = output.topk(1, 1, True, True) pred = pred.reshape(-1) # acc = float(torch.sum(pred == target)) / float(batch_size) * 100 return_dict = {} if unlabeled_value is not None: correct_count = torch.sum( (target != unlabeled_value) * (pred == target)) labeled_count = torch.sum(target != unlabeled_value) if labeled_count: labeled_acc = float(correct_count) / float(labeled_count) * 100 else: labeled_acc = 0 return_dict['acc'] = labeled_acc return_dict['labeled_count'] = labeled_count else: return_dict['acc'] = acc # noqa return_dict['labeled_count'] = batch_size return return_dict ================================================ FILE: language/build_vocab.py ================================================ import argparse import json import os import sys sys.path.append('.') from language_utils import * # noqa """ Build vocabulary from all instantiated templates """ def parse_args(): """Parses arguments.""" parser = argparse.ArgumentParser(description='Build vocabulary') parser.add_argument( 
'--input_data_path', required=True, type=str, help='path to the input data file') parser.add_argument( '--output_dir', required=True, type=str, help='folder to save the output vocabulary file') return parser.parse_args() def main(): args = parse_args() # prepare output directory if not os.path.isdir(args.output_dir): os.makedirs(args.output_dir, exist_ok=True) # load text data print("Loading text data from", args.input_data_path) with open(args.input_data_path, 'r') as f: input_data = json.load(f) # gather a list of text print("Building vocabulary from", len(input_data), "text data samples") text_list = [] for idx, data_sample in enumerate(input_data): if idx % 10000 == 0: print('loaded', idx, '/', len(input_data)) text = data_sample['text'] text_list.append(text) # build vocabulary text_token_to_idx = build_vocab(text_list=text_list) # noqa vocab = { 'text_token_to_idx': text_token_to_idx, } # save vocabulary print("Saving vocabulary file to", os.path.join(args.output_dir, 'vocab.json')) with open(os.path.join(args.output_dir, 'vocab.json'), 'w') as f: json.dump(vocab, f, indent=4) if __name__ == '__main__': main() ================================================ FILE: language/dataset.py ================================================ import os.path import numpy as np from torch.utils.data import Dataset class EncoderDataset(Dataset): def __init__(self, preprocessed_dir): # load text text_path = os.path.join(preprocessed_dir, 'text.npy') self.text = np.load(text_path) # load system_mode system_mode_path = os.path.join(preprocessed_dir, 'system_mode.npy') self.system_mode = np.load(system_mode_path) # load labels labels_path = os.path.join(preprocessed_dir, 'labels.npy') self.labels = np.load(labels_path) def __getitem__(self, index): # retrieve text text = self.text[index] # retrieve system_mode system_mode = self.system_mode[index] # retrieve labels labels = self.labels[index] return text, system_mode, labels def __len__(self): return len(self.text) def main(): 
""" Testing the Dataset""" encoderdataset = EncoderDataset( preprocessed_dir= # noqa '' # noqa ) print('len(encoderdataset):', len(encoderdataset)) print('encoderdataset[0]:', encoderdataset[0]) if __name__ == '__main__': main() ================================================ FILE: language/generate_feedback.py ================================================ import argparse import json import os.path import random import numpy as np from .language_utils import proper_capitalize def parse_args(): """Parses arguments.""" parser = argparse.ArgumentParser(description='') parser.add_argument( '--feedback_templates_file', default='./templates/feedback.json', type=str, help='directory to the request templates file') parser.add_argument( '--pool_file', default='./templates/pool.json', type=str, help='directory to the word pool file') parser.add_argument( '--num_feedback', default=100, type=int, help='number of feedback data to generate') parser.add_argument( '--output_file_dir', required=True, type=str, help='folder to save the output request file') parser.add_argument( '--output_file_name', required=True, type=str, help='name of the output request file') parser.add_argument( '--whether_enough_general_prob', default=0.2, type=float, help='probability of using general templates in whether_enough mode') return parser.parse_args() def main(): args = parse_args() if not os.path.isdir(args.output_file_dir): os.makedirs(args.output_file_dir, exist_ok=True) # load template files print('loading template files') with open(args.feedback_templates_file, 'r') as f: args.feedback_templates = json.load(f) args.feedback_replacement = args.feedback_templates['replacement'] with open(args.pool_file, 'r') as f: pool = json.load(f) args.synonyms_dict = pool["synonyms"] system_mode_list = ['whats_next', 'whether_enough', 'suggestion'] attribute_list = ['Bangs', "Eyeglasses", "No_Beard", "Smiling", "Young"] feedback_list = [] output_txt = [] # instantiate feedback for index in 
def instantiate_feedback(args,
                         system_mode=None,
                         attribute=None,
                         exception_mode='normal'):
    """ Given the feedback mode (i.e. system_mode) and the attribute
    (if any), return a feedback.

    Returns a dict with keys "text", "system_mode" and "attribute".
    In a non-normal exception_mode, a template for that exception is used
    and system_mode is overwritten to 'whats_next' before returning.
    """
    if exception_mode != 'normal':
        # exception feedback (e.g. cannot edit): pick directly from the
        # templates registered under the exception name
        candidate_templates = args.feedback_templates[exception_mode]
        template = random.choice(candidate_templates)
        attribute = attribute
    else:
        # ---------- STEP 1: 1st part of feedback: 'ok' template ----------
        # instantiate the feedback prefix like "ok";
        # with prob 0.7 a prefix is used at all, and within (0.3, 0.7)
        # the first two prefix groups are skipped
        ok_distribution_prob = random.uniform(0, 1)
        ok_template = ''
        if ok_distribution_prob < 0.7:
            ok_templates = args.feedback_templates['ok']
            for idx, templates in enumerate(ok_templates):
                if 0.3 < ok_distribution_prob < 0.7 and (idx == 0
                                                         or idx == 1):
                    continue
                ok_template += random.choice(templates)
                ok_template += ' '
            ok_template = ok_template[0].capitalize() + ok_template[1:]
        # ---------- STEP 2: 2nd part of feedback: content template ----------
        # feedback is trivial like "what's next?"
        if system_mode == 'whats_next':
            candidate_templates = args.feedback_templates['whats_next']
            template = random.choice(candidate_templates)
        # feedback asks whether the editing extent is enough
        elif system_mode == 'whether_enough':
            # fall back to the general templates either with a configured
            # probability or when the attribute has no specific templates
            whether_enough_general_prob = random.uniform(0, 1)
            if whether_enough_general_prob < args.whether_enough_general_prob \
                    or args.feedback_templates[
                        'whether_enough'][attribute] == []:
                candidate_templates = args.feedback_templates[
                    'whether_enough']['general']
            else:
                candidate_templates = args.feedback_templates[
                    'whether_enough'][attribute]
            template = random.choice(candidate_templates)
        # feedback provides suggestion on the next edit
        elif system_mode == 'suggestion':
            candidate_templates = args.feedback_templates['suggestion']
            template = random.choice(candidate_templates)
        else:
            raise KeyError('System mode "%s" not recognized' % system_mode)
    # ---------- STEP 3: Postprocess the instantiated template sentence ---------- # noqa
    # replace placeholder words in the template with
    # proper attribute-specific words.
    # this is not applicable to 'whats_next' type of feedback
    # (attribute is None there, so the lookup below would fail)
    if system_mode != 'whats_next':
        for word in args.feedback_replacement:
            new_word_dict = args.feedback_replacement[word]
            new_word = new_word_dict[attribute]
            template = template.replace(word, new_word)
    # to lower case
    template = template.lower()
    # randomly replace words with synonyms
    for word in args.synonyms_dict:
        replacing_word = random.choice(args.synonyms_dict[word])
        template = template.replace(word, replacing_word)
    # capitalize
    template = proper_capitalize(template)
    if exception_mode != 'normal':
        # after given feedback of cannot_edit
        # encode user request by pretending that
        # the system_mode is 'whats_next'
        system_mode = 'whats_next'
    else:
        template = ok_template + template
    # ---------- STEP 4: Return the feedback and its annotations ----------
    feedback = {
        "text": template,
        "system_mode": system_mode,
        "attribute": attribute
    }
    return feedback
def main():
    """Generate a synthetic dataset of user editing requests.

    Randomly samples editing parameters (attribute, direction, value,
    target score) per request, instantiates each request from the
    templates, and dumps the annotated list to training_request.json.
    """
    args = parse_args()
    if not os.path.isdir(args.output_file_dir):
        os.makedirs(args.output_file_dir, exist_ok=False)
    # load template files
    print('loading template files')
    with open(args.user_templates_file, 'r') as f:
        args.user_templates = json.load(f)
    with open(args.pool_file, 'r') as f:
        pool = json.load(f)
        args.synonyms_dict = pool["synonyms"]
        args.postfix_list = pool["postfix"]
    with open(args.metadata_file, 'r') as f:
        args.metadata = json.load(f)
    with open(args.system_mode_file, 'r') as f:
        args.system_mode_dict = json.load(f)
    # collect the system-mode names (dict keys)
    args.system_mode_list = []
    for key, value in args.system_mode_dict.items():
        args.system_mode_list.append(key)
    # candidate pools for the random editing parameters
    attribute_list = ['Bangs', "Eyeglasses", "No_Beard", "Smiling", "Young"]
    target_score_list = [0, 1, 2, 3, 4, 5]
    score_change_direction_list = ['positive', 'negative']
    score_change_value_list = [1, 2, 3, 4, 5]
    request_list = []
    # instantiate requests
    for index in range(args.num_request):
        if index % 1000 == 0:
            print('generated', index, '/', args.num_request, 'requests')
        # randomly choose the semantic editing parameters
        system_mode = random.choice(args.system_mode_list)
        user_mode_list = list(args.metadata[system_mode].keys())
        user_mode = random.choice(user_mode_list)
        attribute = random.choice(attribute_list)
        score_change_value = random.choice(score_change_value_list)
        score_change_direction = random.choice(score_change_direction_list)
        target_score = random.choice(target_score_list)
        # instantiate a request according to the
        # chosen semantic editing parameters
        request = instantiate_training_request(
            args,
            attribute=attribute,
            user_mode=user_mode,
            score_change_direction=score_change_direction,
            score_change_value=score_change_value,
            target_score=target_score)
        request['system_mode'] = system_mode
        # assign each system_mode's user_mode: the active mode gets the
        # sampled user_mode, every other mode is annotated None
        for mode in args.system_mode_list:
            if system_mode == mode:
                request[mode] = request['user_mode']
            else:
                request[mode] = None
        request['index'] = index
        request_list.append(request)
    # save request dataset
    if not os.path.isdir(args.output_file_dir):
        os.makedirs(args.output_file_dir, exist_ok=True)
    with open(
            os.path.join(args.output_file_dir, 'training_request.json'),
            'w') as f:
        json.dump(request_list, f, indent=4)
    print('successfully saved.')
import numpy as np
import torch

# global variables
PUNCTUATION_TO_KEEP = ['?', ';']
PUNCTUATION_TO_REMOVE = ['.', '!', ',']
# Special vocabulary entries reserved at fixed indices.
# Bug fix: in the corrupted source all four keys had collapsed to the
# empty string '' (duplicate dict keys), which silently reduced the dict
# to one entry. Restored the conventional token names, which the
# start/end/unknown handling below relies on.
SPECIAL_TOKENS = {
    '<NULL>': 0,
    '<START>': 1,
    '<END>': 2,
    '<UNK>': 3,
}


def build_vocab(text_list,
                min_token_count=1,
                delimiter=' ',
                punct_to_keep=None,
                punct_to_remove=None,
                print_every=10000):
    """ Build token to index mapping from a list of text strings
    -- Input: a list of text string
    -- Output: a dict which is a mapping from token to index

    punct_to_keep / punct_to_remove override the module-level punctuation
    defaults when given; SPECIAL_TOKENS always occupy indices 0-3.
    """
    token_to_count = {}
    # tokenize text and add tokens to token_to_count dict
    for text_idx, text in enumerate(text_list):
        if text_idx % print_every == 0:
            print('tokenized', text_idx, '/', len(text_list))
        # Bug fix: punct_to_keep / punct_to_remove were accepted but
        # never forwarded to tokenize(); forward them, falling back to
        # the module defaults when None so existing callers are
        # unaffected.
        text_tokens = tokenize(
            text=text,
            delimiter=delimiter,
            punctuation_to_keep=(punct_to_keep if punct_to_keep is not None
                                 else PUNCTUATION_TO_KEEP),
            punctuation_to_remove=(punct_to_remove
                                   if punct_to_remove is not None
                                   else PUNCTUATION_TO_REMOVE))
        for token in text_tokens:
            if token in token_to_count:
                token_to_count[token] += 1
            else:
                token_to_count[token] = 1
    token_to_idx = {}
    print('Mapping tokens to indices')
    # reserve indices for special tokens (must-have tokens)
    for token, idx in SPECIAL_TOKENS.items():
        token_to_idx[token] = idx
    # assign indices to tokens (sorted for a deterministic vocabulary)
    for token, count in sorted(token_to_count.items()):
        if count >= min_token_count:
            token_to_idx[token] = len(token_to_idx)
    return token_to_idx


def tokenize(text,
             delimiter=' ',
             add_start_token=False,
             add_end_token=False,
             punctuation_to_keep=PUNCTUATION_TO_KEEP,
             punctuation_to_remove=PUNCTUATION_TO_REMOVE):
    """ Tokenize a text string
    -- Input: a text string
    -- Output: a list of tokens, each token is still a string
    (usually an english word)
    """
    # (1) Optionally keep or remove certain punctuation
    if punctuation_to_keep is not None:
        # kept punctuation becomes its own token by prefixing a delimiter
        for punctuation in punctuation_to_keep:
            text = text.replace(punctuation,
                                '%s%s' % (delimiter, punctuation))
    if punctuation_to_remove is not None:
        for punctuation in punctuation_to_remove:
            text = text.replace(punctuation, '')
    # (2) Split the text string into a list of tokens
    text = text.lower()
    tokens = text.split(delimiter)
    # (3) Optionally add start and end tokens
    if add_start_token:
        tokens.insert(0, '<START>')
    if add_end_token:
        tokens.append('<END>')
    return tokens


def encode(text_tokens, token_to_idx, allow_unk=False):
    """Map a list of token strings to their vocabulary indices.

    Raises KeyError on out-of-vocabulary tokens unless allow_unk is True,
    in which case the <UNK> index is substituted.
    """
    text_encoded = []
    for token in text_tokens:
        if token not in token_to_idx:
            if allow_unk:
                token = '<UNK>'
            else:
                raise KeyError('Token "%s" not in vocab' % token)
        text_encoded.append(token_to_idx[token])
    return text_encoded


def decode(seq_idx, idx_to_token, delim=None, stop_at_end=True):
    """Map a sequence of indices back to tokens.

    Stops after the first <END> token when stop_at_end is True; returns a
    list of tokens, or a single string joined by ``delim`` when given.
    """
    tokens = []
    for idx in seq_idx:
        tokens.append(idx_to_token[idx])
        if stop_at_end and tokens[-1] == '<END>':
            break
    if delim is None:
        return tokens
    else:
        return delim.join(tokens)


def reverse_dict(input_dict):
    """Return a new dict with keys and values swapped."""
    reversed_dict = {}
    for key in input_dict.keys():
        val = input_dict[key]
        reversed_dict[val] = key
    return reversed_dict


def to_long_tensor(dset):
    """Convert a (nested) sequence of ints to a torch.LongTensor."""
    arr = np.asarray(dset, dtype=np.int64)
    tensor = torch.LongTensor(arr)
    return tensor
text[0].capitalize() + text[1:] for idx, char in enumerate(text): if char in ['.', '!', '?'] and (idx + 2) < len(text): text = text[:idx + 2] + text[idx + 2].capitalize() + text[idx + 3:] text = text.replace(' i ', ' I ') text = text.replace(',i ', ',I ') text = text.replace('.i ', '.I ') text = text.replace('!i ', '!I ') return text ================================================ FILE: language/lstm.py ================================================ """ LSTM Input: batch_size x max_text_length (tokenized questions) Output: batch_size x lstm_hidden_size (question embedding) Details: Tokenized text are first word-embedded (300-D), then passed to 2-layer LSTM, where each cell has is 1024-D. For each text, output the hidden state of the last non-null token. """ from __future__ import print_function import json import torch import torch.nn as nn from torch.autograd import Variable class Encoder(nn.Module): def __init__(self, token_to_idx, word_embedding_dim=300, text_embed_size=1024, metadata_file='./templates/metadata_fsm.json', linear_hidden_size=256, linear_dropout_rate=0): super(Encoder, self).__init__() # LSTM (shared) self.lstm = LSTM( token_to_idx=token_to_idx, word_embedding_dim=word_embedding_dim, lstm_hidden_size=text_embed_size) # classifiers (not shared) with open(metadata_file, 'r') as f: self.metadata = json.load(f) self.classifier_names = [] for idx, (key, val) in enumerate(self.metadata.items()): num_val = len(val.items()) classifier_name = key self.classifier_names.append(classifier_name) setattr( self, classifier_name, nn.Sequential( fc_block(text_embed_size, linear_hidden_size, linear_dropout_rate), nn.Linear(linear_hidden_size, num_val))) def forward(self, text): # LSTM (shared) # Input: batch_size x max_text_length # Output: batch_size x text_embed_size text_embedding = self.lstm(text) # classifiers (not shared) output = [] for classifier_name in self.classifier_names: classifier = getattr(self, classifier_name) 
class LSTM(nn.Module):
    """Sentence encoder: word embedding + multi-layer unidirectional LSTM.

    Input:  LongTensor of token indices, [batch_size x max_text_length].
    Output: FloatTensor [batch_size x lstm_hidden_size] — the hidden state
    taken at the last non-null token of each sequence.
    """

    def __init__(self,
                 token_to_idx,
                 word_embedding_dim=300,
                 lstm_hidden_size=1024,
                 lstm_num_layers=2,
                 lstm_dropout=0):
        """
        Args:
            token_to_idx (dict): vocabulary mapping token -> index.
            word_embedding_dim (int): size of each word embedding vector.
            lstm_hidden_size (int): hidden size of each LSTM cell.
            lstm_num_layers (int): number of stacked LSTM layers.
            lstm_dropout (float): dropout between LSTM layers.
        """
        super(LSTM, self).__init__()

        # token
        # NOTE(review): the lookup keys below read as empty strings — the
        # special-token names (presumably '<NULL>', '<START>', '<END>') look
        # stripped by an extraction step; confirm against the original file.
        self.token_to_idx = token_to_idx
        self.NULL = token_to_idx['']
        self.START = token_to_idx['']
        self.END = token_to_idx['']

        # word embedding: vocabulary-sized lookup table of dense vectors
        self.word2vec = nn.Embedding(
            num_embeddings=len(token_to_idx),
            embedding_dim=word_embedding_dim)

        # LSTM (batch_first so inputs/outputs are [batch, seq, feature])
        self.rnn = nn.LSTM(
            input_size=word_embedding_dim,
            hidden_size=lstm_hidden_size,
            num_layers=lstm_num_layers,
            bias=True,
            batch_first=True,
            dropout=lstm_dropout,
            bidirectional=False)

    def forward(self, x):
        """Encode a batch of padded token sequences.

        Args:
            x (LongTensor): [batch_size x max_text_length] token indices,
                right-padded with the NULL token.

        Returns:
            FloatTensor: [batch_size x lstm_hidden_size], the LSTM hidden
            state at each sequence's last non-null token.
        """
        batch_size, max_text_length = x.size()

        # Find the last non-null element in each sequence, store in idx.
        # Default is the final position (covers sequences with no padding);
        # the scan runs on CPU, one Python loop per sequence.
        idx = torch.LongTensor(batch_size).fill_(max_text_length - 1)
        x_cpu = x.data.cpu()
        for text_idx in range(batch_size):
            for token_idx in range(max_text_length - 1):
                # boundary: a non-null token immediately followed by padding
                if (x_cpu[text_idx, token_idx] != self.NULL
                    ) and x_cpu[text_idx, token_idx + 1] == self.NULL:  # noqa
                    idx[text_idx] = token_idx
                    break
        idx = idx.type_as(x.data).long()
        # NOTE(review): `Variable` is a legacy (pre-0.4) torch wrapper kept
        # here for compatibility; plain tensors behave the same today.
        idx = Variable(idx, requires_grad=False)

        # reduce memory access time
        self.rnn.flatten_parameters()

        # hs: all hidden states
        # [batch_size x max_text_length x hidden_size]
        # h_n: [2 x batch_size x hidden_size]
        # c_n: [2 x batch_size x hidden_size]
        hidden_states, (_, _) = self.rnn(self.word2vec(x))

        # Expand idx so gather() can select one timestep per sequence across
        # the full hidden dimension.
        idx = idx.view(batch_size, 1,
                       1).expand(batch_size, 1, hidden_states.size(2))
        hidden_size = hidden_states.size(2)

        # only retrieve the hidden state of the last non-null element
        # [batch_size x 1 x hidden_size]
        hidden_state_at_last_token = hidden_states.gather(1, idx)
        # [batch_size x hidden_size]
        hidden_state_at_last_token = hidden_state_at_last_token.view(
            batch_size, hidden_size)
        return hidden_state_at_last_token
def main():
    """Smoke test: push random token batches through LSTM and Encoder.

    Builds a toy vocabulary, feeds a batch of random token indices through
    the shared LSTM and the multi-head Encoder, and prints tensor shapes.
    Run from language/ so './templates/metadata_fsm.json' resolves.
    """
    # ################### LSTM #########################
    # BUGFIX: the original dict literal ended with a trailing comma, which
    # turned `question_token_to_idx` into a 1-tuple that then had to be
    # unwrapped with `[0]` at every use. It is now a plain dict.
    # NOTE(review): several keys below read as the empty string — the special
    # token names (e.g. '<NULL>') appear stripped by extraction; duplicate ''
    # keys collapse so the last one listed ('' -> 0) wins, which is what the
    # LSTM's token_to_idx[''] lookups rely on here.
    question_token_to_idx = {
        ".": 4, "missing": 34, "large": 28, "is": 26, "cubes": 19,
        "cylinder": 21, "what": 54, "": 1, "green": 24, "": 2,
        "object": 35, "things": 51, "": 3, "matte": 31, "rubber": 41,
        "tiny": 52, "yellow": 55, "red": 40, "visible": 53, "color": 17,
        "size": 44, "balls": 11, "the": 48, "any": 8, "blocks": 14,
        "ball": 10, "a": 6, "it": 27, "an": 7, "one": 38, "purple": 39,
        "how": 25, "thing": 50, "?": 5, "objects": 36, "blue": 15,
        "block": 13, "small": 45, "shiny": 43, "material": 30,
        "cylinders": 22, "": 0, "many": 29, "of": 37, "cube": 18,
        "metallic": 33, "gray": 23, "brown": 16, "spheres": 47,
        "there": 49, "sphere": 46, "shape": 42, "are": 9, "metal": 32,
        "cyan": 20, "big": 12
    }

    batch_size = 64
    print('batch size:', batch_size)

    # questions=torch.ones(batch_size, 15, dtype=torch.long)
    questions = torch.randint(0, 10, (batch_size, 15), dtype=torch.long)
    print('input size:', questions.size())  # fixed 'intput' typo

    lstm = LSTM(token_to_idx=question_token_to_idx)
    output = lstm(questions)
    print('output size:', output.size())

    # ################### Language Encoder #########################
    encoder = Encoder(
        token_to_idx=question_token_to_idx,
        metadata_file='./templates/metadata_fsm.json')
    output = encoder(questions)
    print('output length:', len(output))
    for classifier in output:
        print('classifier.size():', classifier.size())
def parse_args():
    """Define and parse the command-line options for request preprocessing.

    Returns:
        argparse.Namespace: the parsed arguments.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '--input_vocab_path',
        required=True,
        type=str,
        help='path to the input vocabulary file')
    arg_parser.add_argument(
        '--input_data_path',
        required=True,
        type=str,
        help='path to the input data file')
    arg_parser.add_argument(
        '--metadata_file',
        type=str,
        default='./templates/metadata_fsm.json',
        help='directory to the metadata file')
    arg_parser.add_argument(
        '--system_mode_file',
        type=str,
        default='./templates/system_mode.json',
        help='directory to the system_mode file')
    arg_parser.add_argument(
        '--allow_unknown',
        default=0,
        type=int,
        help='whether allow unknown tokens (i.e. words)')
    arg_parser.add_argument(
        '--expand_vocab',
        default=0,
        type=int,
        help='whether expand vocabularies')
    arg_parser.add_argument(
        '--output_dir',
        required=True,
        type=str,
        help='folder to save the output vocabulary file')
    arg_parser.add_argument(
        '--unlabeled_value',
        default=999,
        type=int,
        help='value to represent unlabeled value')
    parsed = arg_parser.parse_args()
    return parsed
system_mode system_mode = data_sample['system_mode'] system_mode_encoded = system_mode_file[system_mode] system_mode_encoded_list.append(system_mode_encoded) # encode labels labels_encoded = [] for idx, (key, val) in enumerate(metadata.items()): label = data_sample[key] if label is None: # use args.unlabeled_value to represent missing labels label_encoded = args.unlabeled_value else: label_encoded = val[str(label)] labels_encoded.append(label_encoded) labels_encoded_list.append(labels_encoded) # Pad encoded text to equal length print('Padding tokens') text_encoded_padded_list = [] max_text_length = max(len(text) for text in text_encoded_list) for text_encoded in text_encoded_list: while len(text_encoded) < max_text_length: text_encoded.append(text_token_to_idx['']) text_encoded_padded_list.append(text_encoded) # save processed text np.save( os.path.join(args.output_dir, 'text.npy'), text_encoded_padded_list) np.savetxt( os.path.join(args.output_dir, 'text.txt'), text_encoded_padded_list, fmt='%.0f') # save processed system_mode np.save( os.path.join(args.output_dir, 'system_mode.npy'), system_mode_encoded_list) np.savetxt( os.path.join(args.output_dir, 'system_mode.txt'), system_mode_encoded_list, fmt='%.0f') # save processed labels np.save(os.path.join(args.output_dir, 'labels.npy'), labels_encoded_list) np.savetxt( os.path.join(args.output_dir, 'labels.txt'), labels_encoded_list, fmt='%.0f') if __name__ == '__main__': main() ================================================ FILE: language/run_encoder.py ================================================ import argparse import json import random import torch from .language_utils import * # noqa from .lstm import Encoder def parse_args(): """Parses arguments.""" parser = argparse.ArgumentParser() parser.add_argument( '--input_vocab_file', required=True, type=str, help='path to the input vocabulary file') parser.add_argument( '--allow_unknown', default=1, type=int, help='whether allow unknown tokens (i.e. 
def encode_request(args, system_mode=None, dialog_logger=None):
    """Read one user request, run the LSTM encoder, and return its labels.

    Args:
        args: parsed options (device_name, input_vocab_file, metadata_file,
            system_mode_file, pretrained_checkpoint, verbose, and LSTM
            hyperparameters).
        system_mode (str | None): current dialog system mode. Must be None
            when run as a script (one is sampled randomly) and non-None when
            called from another module.
        dialog_logger: optional logger; when given, normal output is
            suppressed and only compulsory messages go through the logger.

    Returns:
        dict: `valid_semantic_labels` — the subset of encoder predictions
        that is meaningful for the predicted user/request mode, plus the
        raw input text under 'text'.
    """
    # set up
    if args.device_name == 'cpu':
        args.device = torch.device('cpu')
    elif args.device_name == 'gpu':
        args.device = torch.device('cuda')

    if dialog_logger is None:
        output_function = print
        # BUGFIX: `compulsory_output_function` was only assigned in the
        # `else` branch, so running this module as a script raised a
        # NameError at the first prompt below.
        compulsory_output_function = print
    else:
        # output_function = dialog_logger.info
        # BUGFIX: the suppressor previously took a single positional
        # parameter (named `input`, shadowing the builtin), so the
        # two-argument verbose calls below raised TypeError.
        def output_function(*unused_args, **unused_kwargs):
            # suppress output when called by other scripts
            pass

        compulsory_output_function = dialog_logger.info

    # ---------------- STEP 1: Input the Request ----------------
    # choose system_mode
    with open(args.system_mode_file, 'r') as f:
        system_mode_dict = json.load(f)
    system_mode_list = list(system_mode_dict.keys())
    if __name__ == '__main__':
        # script mode: sample a system_mode at random
        assert system_mode is None
        system_mode = random.choice(system_mode_list)
        output_function(' PREDEFINED system_mode:', system_mode)
    else:
        # library mode: the caller must supply the current system_mode
        assert system_mode is not None

    # input request (the `else` branch is a kept debugging stub)
    if True:
        compulsory_output_function(
            'Enter your request (Press enter when you finish):')
        input_text = input()
    else:
        input_text = 'make the bangs slightly longer.'
    compulsory_output_function('USER INPUT >>> ' + input_text)

    # ---------------- STEP 2: Preprocess Request ----------------
    # output_function(" The system is trying to understand your request:")
    # output_function(" ########################################")

    # load vocabulary
    with open(args.input_vocab_file, 'r') as f:
        vocab = json.load(f)
    text_token_to_idx = vocab['text_token_to_idx']

    # tokenize and index the raw text, then batch it as a single sequence
    text_tokens = tokenize(text=input_text)  # noqa
    text_encoded = encode(  # noqa
        text_tokens=text_tokens,
        token_to_idx=text_token_to_idx,
        allow_unk=args.allow_unknown)
    text_encoded = to_long_tensor([text_encoded]).to(args.device)  # noqa

    # ---------------- STEP 3: Encode Request ----------------
    # prepare encoder
    encoder = Encoder(
        token_to_idx=text_token_to_idx,
        word_embedding_dim=args.word_embedding_dim,
        text_embed_size=args.text_embed_size,
        metadata_file=args.metadata_file,
        linear_hidden_size=args.linear_hidden_size,
        linear_dropout_rate=args.linear_dropout_rate)
    encoder = encoder.to(args.device)
    # BUGFIX: map_location so a GPU-saved checkpoint also loads when
    # args.device is CPU.
    checkpoint = torch.load(
        args.pretrained_checkpoint, map_location=args.device)
    encoder.load_state_dict(checkpoint['state_dict'], True)
    encoder.eval()

    # forward pass
    output = encoder(text_encoded)

    # ---------------- STEP 4: Process Encoder Output ----------------
    # one argmax label per classifier head
    output_labels = []
    for head_idx in range(len(output)):
        _, pred = torch.max(output[head_idx], 1)
        head_label = pred.cpu().numpy()[0]
        output_labels.append(head_label)

    # load metadata file
    with open(args.metadata_file, 'r') as f:
        metadata = json.load(f)

    # find mapping from value to label
    reversed_metadata = {}
    for key, val in metadata.items():
        reversed_metadata[key] = reverse_dict(val)  # noqa
    if args.verbose:
        output_function('reversed_metadata:', reversed_metadata)

    # convert predicted values to a dict of predicted labels
    output_semantic_labels = {}  # from LSTM output
    valid_semantic_labels = {}  # useful information among LSTM output
    for idx, key in enumerate(reversed_metadata):
        output_semantic_labels[key] = reversed_metadata[key][
            output_labels[idx]]
        valid_semantic_labels[key] = None
    if args.verbose:
        output_function('output_semantic_labels:', output_semantic_labels)

    # extract predicted labels
    user_mode = output_semantic_labels[system_mode]
    valid_semantic_labels[system_mode] = user_mode
    request_mode = output_semantic_labels['request_mode']
    attribute = output_semantic_labels['attribute']
    score_change_direction = output_semantic_labels['score_change_direction']
    if output_semantic_labels['score_change_value'] is None:
        score_change_value = None
    else:
        score_change_value = int(output_semantic_labels['score_change_value'])
    if output_semantic_labels['target_score'] is None:
        target_score = None
    else:
        target_score = int(output_semantic_labels['target_score'])

    # print to screen
    output_function(' ENCODED user_mode:' + ' ' + user_mode)
    valid_semantic_labels['user_mode'] = user_mode
    if 'pureRequest' in user_mode:
        output_function(' ENCODED request_mode: ' + ' ' + request_mode)
        valid_semantic_labels['request_mode'] = request_mode
        output_function(' ENCODED attribute:' + ' ' + attribute)
        valid_semantic_labels['attribute'] = attribute

        # only output_function labels valid for this request_mode
        if request_mode == 'change_definite':
            output_function(' ENCODED score_change_direction:' + ' ' +
                            score_change_direction)
            valid_semantic_labels[
                'score_change_direction'] = score_change_direction
            output_function(' ENCODED score_change_value:' + ' ' +
                            str(score_change_value))
            valid_semantic_labels['score_change_value'] = score_change_value
        elif request_mode == 'change_indefinite':
            output_function(' ENCODED score_change_direction:' + ' ' +
                            score_change_direction)
            valid_semantic_labels[
                'score_change_direction'] = score_change_direction
        elif request_mode == 'target':
            output_function(' ENCODED target_score:' + ' ' +
                            str(target_score))
            valid_semantic_labels['target_score'] = target_score

    valid_semantic_labels['text'] = input_text
    if args.verbose:
        output_function('valid_semantic_labels:' + ' ' +
                        str(valid_semantic_labels))
    # output_function(" ########################################")
    return valid_semantic_labels
], "1": [ " face is covered with short pointed beard.", " face is covered with his stubble.", " face has a rough growth of stubble.", " has a rough growth of stubble.", "There should be stubble covering cheeks and chin." ], "2": [ " face is covered with short beard." ], "3": [ " face is covered with beard of medium length.", " has beard of medium length." ], "4": [ " has a big mustache on his face.", " has a bushy beard." ], "5": [ " has very long beard.", " has full beard.", " has very thick beard.", " has a very bushy beard." ] }, "Smiling": { "0": [ " looks serious with no smile in face." ], "1": [ " smiles with corners of the mouth turned up.", " smiles with corners of mouth turned up.", " turns up the corners of mouth." ], "2": [ "This corners of mouth curve up and we can see some teeth.", " smiles broadly and shows some teeth." ], "3": [ "The entire face of this is beamed with happiness.", " has a beaming face.", " is smiling with teeth visible.", " entire face is beamed with happiness." ], "4": [ " has a big smile.", " has a big smile on face.", " is smiling with mouth slightly open.", " is smiling with mouth slightly open and teeth visible." ], "5": [ "This in the image is laughing happily.", " has a deep rumbling laugh.", " has a very big smile.", " has a very big smile on face.", " is smiling with mouth wide open.", " is smiling with mouth wide open and teeth visible." ] }, "Young": { "0": [ "This is a young kid.", "This is a young child." ], "1": [ " is a teenager.", " looks very young." ], "2": [ " is a young adult.", " is in thirties." ], "3": [ " is in forties.", " is in middle age." ], "4": [ " is in sixties.", " is in fifties.", " looks like an elderly." ], "5": [ " is in eighties.", "This old is in eighties.", " is in seventies.", "This old is in seventies.", " looks very old." 
] } } ================================================ FILE: language/templates/feedback.json ================================================ { "replacement": { "": { "Bangs": "bangs", "Eyeglasses": "glasses", "No_Beard": "beard", "Smiling": "smile", "Young": "age" }, "": { "Bangs": "them", "Eyeglasses": "them", "No_Beard": "it", "Smiling": "it", "Young": "it" }, "": { "Bangs": "are", "Eyeglasses": "are", "No_Beard": "is", "Smiling": "is", "Young": "is" }, "": { "Bangs": "length", "Eyeglasses": "style", "No_Beard": "shape", "Smiling": "degree", "Young": "level" } }, "suggestion": [ "Do you want to try manipulating the ?", "Do you want to try manipulating the instead?", "Do you want to try manipulating the as well?", "Do you want to try editing the ?", "Do you want to try editing the instead?", "Do you want to try editing the as well?", "What about the ? Do you want to play with ?", "Do you want to play with the ?", "What about the ? Do you want to edit ?", "Do you want to edit the ?", "What about the ? Do you want to manipulate ?", "Do you want to manipulate the ?" ], "whether_enough": { "general": [ "Is this enough?", "Is this good enough?", " the just right now?", " the what you want now?", " the of the person just right now?", " the of the person what you want now?", " the of proper degree now?", " the of the ok now?", " the of the okay now?" ], "Bangs": [ "Are the bangs in proper shape now?", "Is the length of the bangs ok now?" ], "Eyeglasses": [], "No_Beard": [], "Smiling": [], "Young": [ "Is the age of the person ok now?" ] }, "whats_next": [ "What's next?", "What else do you want to play with?", "What else do you want to manipulate?", "What else do you want to edit?", "What else do you want to change?", "What else do you want to try?" ], "ok": [ [ "Okay", "Ok", "Well", "Okie" ], [ " ", ", " ], [ "done.", "it's done.", "bingo.", "finished.", "that's it.", "this is it." ] ], "max_edit_num_reached": [ "It is infeasible to edit this attribute. 
Let's try another attribute.", "We cannot edit this attribute. Let's try something else.", "Oops, it is hard to edit this attribute. Let's try something else.", "Sorry, we are unable to edit this attribute. Perhaps we can try something else." ], "already_at_target_class": [ "This attribute is already at the degree that you want. Let's try a different attribute degree or another attribute." ] } ================================================ FILE: language/templates/gender.json ================================================ { "male": { "": [ "person", "guy", "gentleman" ], "": [ "he", "he", "this person", "this guy", "this gentleman", "this man" ], "": [ "his", "the" ], "": [ "him" ], "": [ "boy" ] }, "female": { "": [ "person", "lady", "female" ], "": [ "she", "she", "this lady", "this person", "this female", "this woman" ], "": [ "her", "the" ], "": [ "her" ], "": [ "girl" ] } } ================================================ FILE: language/templates/metadata_fsm.json ================================================ { "start": { "start_pureRequest": 0 }, "suggestion": { "yes": 0, "yes_pureRequest": 1, "no": 2, "no_pureRequest": 3, "no_end": 4 }, "whether_enough": { "yes": 0, "yes_pureRequest": 1, "yes_end": 2, "no": 3, "no_pureRequest": 4 }, "whats_next": { "pureRequest": 0, "end": 1 }, "attribute": { "Bangs": 0, "Eyeglasses": 1, "No_Beard": 2, "Smiling": 3, "Young": 4 }, "score_change_direction": { "negative": 0, "positive": 1 }, "score_change_value": { "1": 0, "2": 1, "3": 2, "4": 3, "5": 4 }, "target_score": { "0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5 }, "request_mode": { "change_definite": 0, "change_indefinite": 1, "target": 2, "end": 3 } } ================================================ FILE: language/templates/overall_caption_templates.json ================================================ { "attr_order_mapping": { "Bangs": { "0": [ "has", "sentence" ], "1": [ "has" ], "2": [ "has" ], "3": [ "has" ], "4": [ "has", "sentence" ] }, "No_Beard": { "0": 
[ "has", "sentence" ], "1": [ "has" ], "2": [ "has" ], "3": [ "has" ], "4": [ "has", "sentence" ] }, "Eyeglasses": { "0": [ "has", "sentence" ], "1": [ "has" ], "2": [ "has" ], "3": [ "has" ], "4": [ "has", "sentence" ] }, "Smiling": { "0": [ "has", "sentence" ], "1": [ "has" ], "2": [ "has" ], "3": [ "has" ], "4": [ "has", "sentence" ] }, "Young": { "0": [ "start" ], "1": [ "sentence" ], "2": [ "sentence" ], "3": [ "sentence" ], "4": [ "sentence" ] } }, "has": { "Bangs": { "0": [ "no bangs" ], "1": [ "very short bangs", "very short bangs which only covers a tiny portion of forehead" ], "2": [ "short bangs", "short bangs that covers a small portion of forehead", "short bangs that only covers a small portion of forehead" ], "3": [ "medium bangs", "medium bangs that covers half of forehead", "bangs of medium length that covers half of forehead", "bangs of medium length that leaves half of forehead visible" ], "4": [ "long bangs", "long bangs that almost covers all of forehead", "long bangs that almost covers This entire forehead" ], "5": [ "extremely long bangs", "extremely long bangs that almost covers all of forehead", "extremely long bangs that almost covers This entire forehead" ] }, "Eyeglasses": { "0": [ "no eyeglasses" ], "1": [ "rimless eyeglasses" ], "2": [ "eyeglasses with thin frame", "thin frame eyeglasses" ], "3": [ "eyeglasses with thick frame", "thick frame eyeglasses" ], "4": [ "sunglasses with thin frame", "thin frame sunglasses" ], "5": [ "sunglasses with thick frame", "thick frame sunglasses" ] }, "No_Beard": { "0": [ "no beard", "no beard at all" ], "1": [ "short pointed beard", "stubble", "a rough growth of stubble", "stubble covering cheeks and chin" ], "2": [ "short beard" ], "3": [ "beard of medium length" ], "4": [ "a big mustache on his face", "a bushy beard" ], "5": [ "very long beard", "full beard", "very thick beard", "a very bushy beard" ] }, "Smiling": { "0": [ "no smile" ], "1": [ "a very mild smile" ], "2": [ "a mild smile" ], "3": [ 
"a beaming face", "a smile with teeth visible", "a face that is beamed with happiness", "a smile" ], "4": [ "a big smile", "a big smile on face", "a big smile with mouth slightly open", "a big smile with mouth slightly open and teeth visible" ], "5": [ "a deep rumbling laugh", "a very big smile", "a very big smile on face", "a very big smile with mouth wide open", "a very big smile with mouth wide open and teeth visible" ] } }, "start": { "Young": { "0": [ "This young kid", "This young child", "This little " ], "1": [ "This teenager", "This young ", "This young " ], "2": [ "This young adult", "This in thirties" ], "3": [ "This in forties", "This in middle age", "This middle-aged " ], "4": [ "This in sixties", "This in fifties", "This elderly " ], "5": [ "This old ", "This in eighties", "This old in eighties", "This in seventies", "This old in seventies", "This very old " ] } }, "has_prefix": [ "This has ", " has " ] } ================================================ FILE: language/templates/pool.json ================================================ { "synonyms": { " can ": [ " can ", " could ", " should " ], "i'm": [ "i'm", "i am" ], "it's": [ "it's", "it is" ], "bangs": [ "bangs", "fringe" ], "slightly": [ "slightly", "a little bit", "a tiny little bit", "a little", "a bit", "only a little", "just a little bit" ], "somewhat": [ "somewhat", "relatively", "to some extent", "to some degree", "moderately", "partially", "sort of", "kind of", "considerably" ], "very": [ "very", "extremely" ], "entire": [ "entire", "whole", "full" ], "child": [ "child", "schoolchild" ], "teenager": [ "teenager", "teen" ], "beard": [ "beard", "mustache" ], "i think": [ "i think", "i think that", "i feel", "i feel that", "i kind of think", "i kind of think that", "i kind of feel", "i kind of feel that", "i guess", "i guess that" ], "i want": [ "i want", "i kind of want", "i would like" ], "let's try": [ "let's try", "how about trying", "what about trying" ], "but not too much": [ "but not 
too much", "just not too much", "just that not too much", "just don't go too much" ], "only": [ "only", "simply", "just" ], "eyeglasses": [ "eyeglasses", "glasses" ], "pokerface": [ "pokerface", "poker face" ], "what's": [ "what's", "what is" ], "how's": [ "how's", "how is" ], "do you want to": [ "do you want to", "would you like to", "perhaps you would like to", "perhaps you might want to", "maybe you would like to", "maybe you might want to" ], "want to": [ "want to", "would like to" ], "manipulate": [ "manipulate", "edit" ], "manipulating": [ "manipulating", "editing", "playing with" ] }, "prefix": [ "Actually,", "To be honest,", "Well,", "Well", "Emm", "Emmm", "Emmmm", "Emm,", "Emmm,", "Emmmm,", "Hi,", "Hello,", "Let me think about it.", "I'm not too sure but", "What about this?", "Can we try this?", "It looks okay now but", "It looks better now, but still,", "It looks nice, but still,", "Let me have a look. Well,", "Let me have a look. Well", "Let me have a look. Emm,", "Let me have a look. Emmm,", "Let me have a look. Emmmm,", "Let me have a look. Emm", "Let me have a look. Emmm", "Let me have a look. Emmmm", "Let me take a look. Well,", "Let me take a look. Well", "Let me take a look. Emm,", "Let me take a look. Emmm,", "Let me take a look. Emmmm,", "Let me take a look. Emm", "Let me take a look. Emmm", "Let me take a look. Emmmm" ], "postfix": [ "Thanks!", "Thank you!", "Is that possible?", "and emmm... 
well let's try this first.", "I guess it will probably get better this way.", "I'm not too sure, let's see how it goes first.", "It would be nicer in that way.", "It would be nicer in that way, I think.", "It would be nicer in that way, I guess.", "I think it would be nicer in that way.", "I guess it would be nicer in that way.", "It would be nicer this way.", "It would be nicer this way, I think.", "It would be nicer this way, I guess.", "I think it would be nicer this way.", "I guess it would be nicer this way.", "It might be nicer in that way.", "It might be nicer in that way, I think.", "It might be nicer in that way, I guess.", "I think it might be nicer in that way.", "I guess it might be nicer in that way.", "It might be nicer this way.", "It might be nicer this way, I think.", "It might be nicer this way, I guess.", "I think it might be nicer this way.", "I guess it might be nicer this way.", "It would look better in that way.", "It would look better in that way, I think.", "It would look better in that way, I guess.", "I think it would look better in that way.", "I guess it would look better in that way.", "It would look better this way.", "It would look better this way, I think.", "It would look better this way, I guess.", "I think it would look better this way.", "I guess it would look better this way.", "It might look better in that way.", "It might look better in that way, I think.", "It might look better in that way, I guess.", "I think it might look better in that way.", "I guess it might look better in that way.", "It might look better this way.", "It might look better this way, I think.", "It might look better this way, I guess.", "I think it might look better this way.", "I guess it might look better this way." 
] } ================================================ FILE: language/templates/system_mode.json ================================================ { "start": 0, "suggestion": 1, "whether_enough": 2, "whats_next": 3 } ================================================ FILE: language/templates/user_fsm.json ================================================ { "start": [ [ "Hi.", "Hello." ] ], "pureRequest": { "Bangs": { "target": { "0": [ "No bangs.", "Remove all the bangs.", "Cut off all the bangs.", "I don't want the bangs at all.", "I don't want any bangs.", "I don't want any bangs visible.", "The bangs doesn't look good, let's remove it.", "The bangs covers the forehead, but I want the entire forehead visible." ], "1": [ "Add very short bangs.", "I want very short bangs.", "Add very short bangs that leaves most of the forehead uncovered.", "I want very short bangs that leaves most of the forehead uncovered." ], "2": [ "Add short bangs.", "Let's try short bangs.", "Add short bangs that covers only a small portion of the forehead.", "Let's try short bangs that covers only a small portion of the forehead." ], "3": [ "Add medium bangs.", "Add bangs of medium length.", "Let's try bangs of medium length.", "Let's try bangs that leaves half of the forehead visible." ], "4": [ "Add long bangs.", "Let's try long bangs.", "Add long bangs but don't cover the entire forehead.", "Let's try long bangs but don't cover the entire forehead." ], "5": [ "Add extremely long bangs.", "Let's try extremely long bangs.", "Add extremely long bangs that covers the entire forehead.", "Let's try extremely long bangs that covers the entire forehead.", "Indeed, the bangs can be much longer. Let's cover the eyebrows." ] }, "change": { "positive": { "definite": { "1": [ "The bangs can be slightly longer.", "Make the bangs slightly longer." ], "2": [ "The bangs can be somewhat longer, but not too much.", "Make the bangs somewhat longer, but not too much." 
], "3": [ "Make the bangs longer, but not too much." ], "4": [ "The bangs can be longer.", "Make the bangs longer." ], "5": [ "The bangs can be much longer.", "Make the bangs much longer." ] }, "indefinite": [ "Longer bangs.", "Add bangs.", "The bangs can be longer.", "Let's add some bangs.", "Maybe the bangs can be longer.", "Let's try adding longer bangs.", "What about adding longer bangs?", "Emm, I think the bangs can be longer.", "Let's make the bangs longer.", "Hi, I want to see how my friend looks like with some bangs." ] }, "negative": { "definite": { "1": [ "The bangs can be slightly shorter.", "Make the bangs slightly shorter." ], "2": [ "The bangs can be somewhat shorter, but not too much.", "Make the bangs somewhat shorter, but not too much." ], "3": [ "The bangs can be shorter.", "Make the bangs shorter." ], "4": [ "The bangs can be much shorter.", "Make the bangs much shorter." ], "5": [ "Remove all the bangs.", "I don't want the bangs at all.", "I don't want any bangs at all." ] }, "indefinite": [ "Less bangs", "Remove bangs.", "Remove the bangs.", "Let's cut off the bangs.", "Let's cut the bangs short.", "Let's cut the bangs off.", "I don't like the bangs, let's remove it.", "I don't like the bangs, let's cut it off.", "The bangs is too long, let's remove it.", "The bangs is too long, let's cut it off." ] } } }, "Eyeglasses": { "target": { "0": [ "No eyeglass", "No eyeglasses please.", "No eyeglasses.", "Remove eyeglasses.", "Remove the eyeglasses.", "I don't want to see the eyeglasses.", "I think there shouldn't be any eyeglasses." ], "1": [ "The eyeglasses should be rimless.", "Let's try rimless eyeglasses." ], "2": [ "The eyeglasses should have thin frame.", "Let's try thin frame eyeglasses." ], "3": [ "The eyeglasses should have thick frame.", "Let's try thick frame eyeglasses." ], "4": [ "Let's try thin frame sunglasses.", "It should be sunglasses with thin frame." 
], "5": [ "Let's try thick frame sunglasses.", "It should be sunglasses with thick frame." ] }, "change": { "positive": { "definite": { "1": [ "Make the eyeglasses slightly more obvious.", "The eyeglasses can be slightly more obvious." ], "2": [ "Make the eyeglasses somewhat more obvious.", "The eyeglasses can be somewhat more obvious." ], "3": [ "Make the eyeglasses more obvious.", "The eyeglasses can be more obvious." ], "4": [ "Let's try eyeglasses with thicker frame and darker color." ], "5": [ "Let's try thick frame sunglasses.", "It should be sunglasses with thick frame." ] }, "indefinite": [ "Add glasses", "Use eyeglasses", "Try eyeglasses.", "Add eyeglasses.", "Add eyeglasses to the face.", "Add eyeglasses please.", "Let's add eyeglasses.", "The eyeglasses can be more obvious.", "The eyeglasses are not obvious enough.", "I can't see the eyeglasses clearly, let's make them more obvious.", "The eyeglasses frame can be thicker.", "The glass color can be darker." ] }, "negative": { "definite": { "1": [ "Make the eyeglasses slightly less obvious.", "The eyeglasses can be slightly less obvious." ], "2": [ "Make the eyeglasses somewhat less obvious.", "The eyeglasses can be somewhat less obvious." ], "3": [ "Make the eyeglasses less obvious.", "The eyeglasses can be less obvious." ], "4": [ "The eyeglasses are too obvious, let's make it much less obvious.", "The eyeglasses are too obvious, let's try make it much less obvious." ], "5": [ "Remove eyeglasses.", "Remove the eyeglasses.", "I don't like the eyeglasses.", "I don't want to see the eyeglasses.", "There shouldn't be any eyeglasses." ] }, "indefinite": [ "Remove eyeglasses.", "No eyeglasses.", "The eyeglasses can be less obvious.", "The eyeglasses are too obvious.", "Let's make the eyeglasses more obvious.", "The eyeglasses frame can be thinner.", "The glass color can be lighter." 
] } } }, "No_Beard": { "target": { "0": [ "Let's see what he looks like without his beard.", "Let's shave the beard off.", "No beard" ], "1": [ "His face should be covered with short pointed beard.", "His face should be covered with the stubble.", "His face has a rough growth of stubble.", "There should be stubble covering his cheeks and chin." ], "2": [ "His face should be covered with short beard.", "Let's add short beard to his face.", "Let's try short beard on his face." ], "3": [ "His face should be covered with beard of medium length.", "Let's add medium-length beard to his face.", "Let's try medium-length beard on his face." ], "4": [ "Let's try a big mustache on his face.", "He should have a bushy beard." ], "5": [ "Let's add very long beard.", "Let's add a full beard.", "He should have very thick beard.", "He should have a very bushy beard." ] }, "change": { "positive": { "definite": { "1": [ "The beard can be slightly longer.", "Make the beard slightly longer.", "Slightly add more beard." ], "2": [ "The beard can be somewhat longer, but not too much.", "Make the beard somewhat longer, but not too much." ], "3": [ "The beard can be longer.", "Make the beard longer." ], "4": [ "The beard can be much longer.", "Make the beard much longer." ], "5": [ "Let's add very long beard.", "Let's add a full beard.", "He should have very thick beard", "He has a very bushy beard." ] }, "indefinite": [ "Add beard.", "Add some beard.", "Longer beard.", "Let's add more beard.", "I want some more beard on the face." ] }, "negative": { "definite": { "1": [ "The beard can be slightly shorter.", "Make the beard slightly shorter.", "Slightly remove some beard." ], "2": [ "The beard can be somewhat shorter, but not too much.", "Make the beard somewhat shorter, but not too much." ], "3": [ "The beard can be shorter.", "Make the beard shorter." ], "4": [ "The beard can be much shorter.", "Make the beard much shorter." 
], "5": [ "Let's see what he looks like without his beard.", "Let's shave the beard off." ] }, "indefinite": [ "Less beard.", "Remove beard.", "Remove the beard.", "The beard should be gone.", "Let's try to remove the beard.", "I don't like the beard.", "Let's try shorter beard." ] } } }, "Smiling": { "target": { "0": [ "I think the person shouldn't be smiling.", "I don't like the smile.", "I don't want the smile.", "No smile.", "Remove the smile." ], "1": [ "Turn up the corners of the mouth.", "The corners of the mouth should curve up." ], "2": [ "The corners of the mouth should curve up and show some teeth.", "Smile broadly and show some teeth." ], "3": [ "I want a beaming face.", "I want the face to be smiling with teeth visible.", "The entire face should be beamed with happiness." ], "4": [ "It can be a big smile.", "I want a big smile on the face.", "I want the face to be smiling with the mouth slightly open.", "I want the face to be smiling with the mouth slightly open. We should be able to see the teeth.", "I want the face to be smiling with the mouth slightly open so that we can see the teeth." ], "5": [ "I want a deep rumbling laugh.", "It can be laughing happily.", "It can be a very big smile.", "I want a very big smile on the face.", "I want the face to be smiling with the mouth wide open.", "I want the face to be smiling with the mouth wide open. We should be able to see the teeth." ] }, "change": { "positive": { "definite": { "1": [ "Smile slightly more.", "The smile can be slightly bigger.", "Make the smile slightly bigger.", "The person can look slightly happier.", "The person can smile slightly more happily." ], "2": [ "The smile can be somewhat bigger, but not too much.", "Make the smile somewhat bigger, but not too much.", "The person can look somewhat happier.", "The person can smile somewhat more happily." 
], "3": [ "Smile more.", "The smile can be bigger.", "Make the smile bigger.", "The person can be happier.", "The person can smile more happily." ], "4": [ "The smile can be much bigger.", "Make the smile much bigger.", "The person can be a lot happier.", "The person can smile a lot more happily." ], "5": [ "I want a deep rumbling laugh.", "It can be laughing happily.", "It can be a very big smile.", "I want a very big smile on the face.", "I want the face to be smiling with the mouth wide open.", "I want the face to be smiling with the mouth wide open. We should be able to see the teeth.", "The person can smile very happily." ] }, "indefinite": [ "Look not so serious.", "Look less serious.", "Too serious, be happier.", "Add smile.", "Add some smiling please.", "The smile is not big enough.", "I want a bigger smile.", "I want the face to smile more.", "I want to change the pokerface face to a smiling face.", "The person can smile more happily.", "Can look happier." ] }, "negative": { "definite": { "1": [ "I want the smile to be slightly less obvious.", "The smile can be slightly less obvious.", "The person can smile slightly less happily." ], "2": [ "I want the smile to be less obvious.", "The smile can be less obvious.", "The person can smile somewhat less happily." ], "3": [ "I want the smile to be much less obvious.", "The smile can be much less obvious.", "The person can smile less happily." ], "4": [ "I want to make the smile almost vanish.", "The person can smile a lot less happily." ], "5": [ "I want the smile to vanish.", "I don't like the smile, let's remove it." ] }, "indefinite": [ "Not serious enough.", "More serious.", "No smiling.", "No smile.", "Remove smiling.", "Remove the smiling.", "Remove smile.", "Remove the smile.", "Smile less happily.", "Don't be so happy.", "The smile is too much.", "Can we have a gentler smile? This smile is too big.", "I want to change the smiling face to a pokerface." 
] } } }, "Young": { "target": { "0": [ "Let's make the face a child one.", "Let's make the face very young." ], "1": [ "Let's make the face a teenager one.", "Let's make the face relatively young.", "The person should be in the twenties." ], "2": [ "Let's make the face a young one.", "It should be a young adult.", "The person should be in the thirties." ], "3": [ "Let's make the face a middle age one.", "The person should be in the forties." ], "4": [ "Let's make the face slightly older than middle age.", "Let's make the face the one of a senior.", "Let's make the face the one of an elderly.", "The person should be in the sixties.", "The person should be in the fifties." ], "5": [ "Let's make the face a very old one.", "The person should be in the seventies.", "The person should be in the eighties." ] }, "change": { "positive": { "definite": { "1": [ "The face can be slightly older.", "Make the face slightly older." ], "2": [ "Somewhat older", "The face can be somewhat older, just not too much.", "Make the face somewhat older, but not too much." ], "3": [ "Make the face older, but not too much.", "Make the face older, but not too much." ], "4": [ "The face can be older.", "Make the face older." ], "5": [ "The face can be much older.", "Make the face much older.", "Let's make the face a very old one." ] }, "indefinite": [ "Older.", "Make it older.", "The face can be older.", "This face is too young, let's make it older.", "Let's make the face older.", "What about making the face look older?" ] }, "negative": { "definite": { "1": [ "The face can be slightly younger.", "Make the face slightly younger." ], "2": [ "Somewhat younger.", "The face can be somewhat younger, but not too much.", "Make the face somewhat younger, but not too much." ], "3": [ "The face can be younger.", "Make the face younger.", "Younger face." ], "4": [ "Much younger.", "The face can be much younger.", "Make the face much younger." ], "5": [ "Let's make the face a child one." 
] }, "indefinite": [ "Younger face.", "Younger.", "Look younger", "Make it younger.", "Be younger.", "Less old.", "The face can be younger.", "This face is too old, let's make it younger.", "Let's make the face younger.", "What about making it younger?", "Can you make the person look younger?" ] } } } }, "yes": [ [ "Yes", "Yep", "Yeep", "Yep sure", "Yes sure", "Sure", "Ok" ], [ "." ] ], "no": [ [ "No", "Nope" ], [ "." ] ], "end": [ [ "End.", "Nothing.", "Nothing else.", "Nothing else for now.", "It's all good now.", "I don't want any further edits.", "Actually it's all good now.", "No need for further edits.", "I don't need any further edits.", "That's all.", "This is it.", "That is it.", "That is all.", "No." ], [ " Thanks!", " Thank you!", " Thanks a lot!", "" ] ] } ================================================ FILE: language/templates/user_old_templates.json ================================================ { "start": [ [ "Hi.", "Hello." ], [ " " ] ], "requests": { "Bangs": { "target": { "0": [ "No bangs.", "Remove all the bangs.", "Cut off all the bangs.", "I don't want the bangs at all.", "I don't want any bangs.", "I don't want any bangs visible.", "The bangs doesn't look good, let's remove it.", "The bangs covers the forehead, but I want the entire forehead visible." ], "1": [ "Add very short bangs.", "I want very short bangs.", "Add very short bangs that leaves most of the forehead uncovered.", "I want very short bangs that leaves most of the forehead uncovered." ], "2": [ "Add short bangs.", "Let's try short bangs.", "Add short bangs that covers only a small portion of the forehead.", "Let's try short bangs that covers only a small portion of the forehead." ], "3": [ "Add medium bangs.", "Add bangs of medium length.", "Let's try bangs of medium length.", "Let's try bangs that leaves half of the forehead visible." 
], "4": [ "Add long bangs.", "Let's try long bangs.", "Add long bangs but don't cover the entire forehead.", "Let's try long bangs but don't cover the entire forehead." ], "5": [ "Add extremely long bangs.", "Let's try extremely long bangs.", "Add extremely long bangs that covers the entire forehead.", "Let's try extremely long bangs that covers the entire forehead.", "Indeed, the bangs can be much longer. Let's cover the eyebrows." ] }, "change": { "positive": { "definite": { "1": [ "The bangs can be slightly longer.", "Make the bangs slightly longer." ], "2": [ "The bangs can be somewhat longer, but not too much.", "Make the bangs somewhat longer, but not too much." ], "3": [ "Make the bangs longer, but not too much." ], "4": [ "The bangs can be longer.", "Make the bangs longer." ], "5": [ "The bangs can be much longer.", "Make the bangs much longer." ] }, "indefinite": [ "The bangs can be longer.", "Let's add some bangs.", "Maybe the bangs can be longer.", "Let's try adding longer bangs.", "What about adding longer bangs?", "Emm, I think the bangs can be longer.", "Let's make the bangs longer.", "Hi, I want to see how my friend looks like with some bangs." ] }, "negative": { "definite": { "1": [ "The bangs can be slightly shorter.", "Make the bangs slightly shorter." ], "2": [ "The bangs can be somewhat shorter, but not too much.", "Make the bangs somewhat shorter, but not too much." ], "3": [ "The bangs can be shorter.", "Make the bangs shorter." ], "4": [ "The bangs can be much shorter.", "Make the bangs much shorter." ], "5": [ "Remove all the bangs.", "I don't want the bangs at all.", "I don't want any bangs at all." ] }, "indefinite": [ "Remove bangs.", "Remove the bangs.", "Let's cut off the bangs.", "Let's cut the bangs short.", "Let's cut the bangs off.", "I don't like the bangs, let's remove it.", "I don't like the bangs, let's cut it off.", "The bangs is too long, let's remove it.", "The bangs is too long, let's cut it off." 
] } } }, "Eyeglasses": { "target": { "0": [ "No eyeglasses please.", "No eyeglasses.", "Remove eyeglasses.", "Remove the eyeglasses.", "I don't want to see the eyeglasses.", "I think there shouldn't be any eyeglasses." ], "1": [ "The eyeglasses should be rimless.", "Let's try rimless eyeglasses." ], "2": [ "The eyeglasses should have thin frame.", "Let's try thin frame eyeglasses." ], "3": [ "The eyeglasses should have thick frame.", "Let's try thick frame eyeglasses." ], "4": [ "Let's try thin frame sunglasses.", "It should be sunglasses with thin frame." ], "5": [ "Let's try thick frame sunglasses.", "It should be sunglasses with thick frame." ] }, "change": { "positive": { "definite": { "1": [ "Make the eyeglasses slightly more obvious.", "The eyeglasses can be slightly more obvious." ], "2": [ "Make the eyeglasses somewhat more obvious.", "The eyeglasses can be somewhat more obvious." ], "3": [ "Make the eyeglasses more obvious.", "The eyeglasses can be more obvious." ], "4": [ "Let's try eyeglasses with thicker frame and darker color." ], "5": [ "Let's try thick frame sunglasses.", "It should be sunglasses with thick frame." ] }, "indefinite": [ "Try eyeglasses.", "Add eyeglasses.", "Add eyeglasses to the face.", "Add eyeglasses please.", "Let's add eyeglasses.", "The eyeglasses can be more obvious.", "The eyeglasses are not obvious enough.", "I can't see the eyeglasses clearly, let's make them more obvious.", "The eyeglasses frame can be thicker.", "The glass color can be darker." ] }, "negative": { "definite": { "1": [ "Make the eyeglasses slightly less obvious.", "The eyeglasses can be slightly less obvious." ], "2": [ "Make the eyeglasses somewhat less obvious.", "The eyeglasses can be somewhat less obvious." ], "3": [ "Make the eyeglasses less obvious.", "The eyeglasses can be less obvious." ], "4": [ "The eyeglasses are too obvious, let's make it much less obvious.", "The eyeglasses are too obvious, let's try make it much less obvious." 
], "5": [ "Remove eyeglasses.", "Remove the eyeglasses.", "I don't like the eyeglasses.", "I don't want to see the eyeglasses.", "There shouldn't be any eyeglasses." ] }, "indefinite": [ "The eyeglasses can be less obvious.", "The eyeglasses are too obvious.", "Let's make the eyeglasses more obvious.", "The eyeglasses frame can be thinner.", "The glass color can be lighter." ] } } }, "No_Beard": { "target": { "0": [ "Let's see what he looks like without his beard.", "Let's shave the beard off." ], "1": [ "His face should be covered with short pointed beard.", "His face should be covered with the stubble.", "His face has a rough growth of stubble.", "There should be stubble covering his cheeks and chin." ], "2": [ "His face should be covered with short beard.", "Let's add short beard to his face.", "Let's try short beard on his face." ], "3": [ "His face should be covered with beard of medium length.", "Let's add medium-length beard to his face.", "Let's try medium-length beard on his face." ], "4": [ "Let's try a big mustache on his face.", "He should have a bushy beard." ], "5": [ "Let's add very long beard.", "Let's add a full beard.", "He should have very thick beard.", "He should have a very bushy beard." ] }, "change": { "positive": { "definite": { "1": [ "The beard can be slightly longer.", "Make the beard slightly longer.", "Slightly add more beard." ], "2": [ "The beard can be somewhat longer, but not too much.", "Make the beard somewhat longer, but not too much." ], "3": [ "The beard can be longer.", "Make the beard longer." ], "4": [ "The beard can be much longer.", "Make the beard much longer." ], "5": [ "Let's add very long beard.", "Let's add a full beard.", "He should have very thick beard", "He has a very bushy beard." ] }, "indefinite": [ "Add beard.", "Add some beard.", "Longer beard.", "Let's add more beard.", "I want some more beard on the face." 
] }, "negative": { "definite": { "1": [ "The beard can be slightly shorter.", "Make the beard slightly shorter.", "Slightly remove some beard." ], "2": [ "The beard can be somewhat shorter, but not too much.", "Make the beard somewhat shorter, but not too much." ], "3": [ "The beard can be shorter.", "Make the beard shorter." ], "4": [ "The beard can be much shorter.", "Make the beard much shorter." ], "5": [ "Let's see what he looks like without his beard.", "Let's shave the beard off." ] }, "indefinite": [ "Remove beard.", "Remove the beard.", "The beard should be gone.", "Let's try to remove the beard.", "I don't like the beard.", "Let's try shorter beard" ] } } }, "Smiling": { "target": { "0": [ "I think the person shouldn't be smiling.", "I don't like the smile.", "I don't want the smile" ], "1": [ "Turn up the corners of the mouth", "The corners of the mouth curve up." ], "2": [ "The corners of the mouth curve up and show some teeth.", "Smile broadly and show some teeth." ], "3": [ "I want a beaming face.", "I want the face to be smiling with teeth visible.", "The entire face should be beamed with happiness." ], "4": [ "It can be a big smile.", "I want a big smile on the face.", "I want the face to be smiling with the mouth slightly open.", "I want the face to be smiling with the mouth slightly open. We should be able to see the teeth.", "I want the face to be smiling with the mouth slightly open so that we can see the teeth." ], "5": [ "I want a deep rumbling laugh.", "It can be laughing happily.", "It can be a very big smile.", "I want a very big smile on the face.", "I want the face to be smiling with the mouth wide open.", "I want the face to be smiling with the mouth wide open. We should be able to see the teeth." ] }, "change": { "positive": { "definite": { "1": [ "Smile slightly more.", "The smile can be slightly bigger.", "Make the smile slightly bigger.", "The person can look slightly happier.", "The person can smile slightly more happily." 
], "2": [ "The smile can be somewhat bigger, but not too much.", "Make the smile somewhat bigger, but not too much.", "The person can look somewhat happier.", "The person can smile somewhat more happily." ], "3": [ "Smile more.", "The smile can be bigger.", "Make the smile bigger.", "The person can be happier.", "The person can smile more happily." ], "4": [ "The smile can be much bigger.", "Make the smile much bigger.", "The person can be a lot happier.", "The person can smile a lot more happily." ], "5": [ "I want a deep rumbling laugh.", "It can be laughing happily.", "It can be a very big smile.", "I want a very big smile on the face.", "I want the face to be smiling with the mouth wide open.", "I want the face to be smiling with the mouth wide open. We should be able to see the teeth.", "The person can smile very happily." ] }, "indefinite": [ "Add some smiling please.", "The smile is not big enough.", "I want a bigger smile.", "I want the face to smile more.", "I want to change the pokerface face to a smiling face.", "The person can smile more happily.", "Can look happier." ] }, "negative": { "definite": { "1": [ "I want the smile to be slightly less obvious.", "The smile can be slightly less obvious.", "The person can smile slightly less happily." ], "2": [ "I want the smile to be less obvious.", "The smile can be less obvious.", "The person can smile somewhat less happily." ], "3": [ "I want the smile to be much less obvious.", "The smile can be much less obvious.", "The person can smile less happily." ], "4": [ "I want to make the smile almost vanish.", "The person can smile a lot less happily." ], "5": [ "I want the smile to vanish.", "I don't like the smile, let's remove it." ] }, "indefinite": [ "No smiling.", "No smile.", "Remove smiling.", "Remove the smiling.", "Remove smile.", "Remove the smile.", "Smile less happily.", "Don't be so happy.", "The smile is too much.", "Can we have a gentler smile? 
This smile is too big.", "I want to change the smiling face to a pokerface." ] } } }, "Young": { "target": { "0": [ "Let's make the face a child one.", "Let's make the face very young." ], "1": [ "Let's make the face a teenager one.", "Let's make the face relatively young.", "The person should be in the twenties." ], "2": [ "Let's make the face a young one.", "It should be a young adult.", "The person should be in the thirties." ], "3": [ "Let's make the face a middle age one.", "The person should be in the forties." ], "4": [ "Let's make the face slightly older than middle age.", "Let's make the face the one of a senior.", "Let's make the face the one of an elderly.", "The person should be in the sixties.", "The person should be in the fifties." ], "5": [ "Let's make the face a very old one.", "The person should be in the seventies.", "The person should be in the eighties." ] }, "change": { "positive": { "definite": { "1": [ "The face can be slightly older.", "Make the face slightly older." ], "2": [ "Somewhat older", "The face can be somewhat older, just not too much.", "Make the face somewhat older, but not too much." ], "3": [ "Make the face older, but not too much.", "Make the face older, but not too much." ], "4": [ "The face can be older.", "Make the face older." ], "5": [ "The face can be much older.", "Make the face much older.", "Let's make the face a very old one." ] }, "indefinite": [ "Older.", "Make it older.", "The face can be older.", "This face is too young, let's make it older.", "Let's make the face older.", "What about making the face look older?" ] }, "negative": { "definite": { "1": [ "The face can be slightly younger.", "Make the face slightly younger." ], "2": [ "Somewhat younger.", "The face can be somewhat younger, but not too much.", "Make the face somewhat younger, but not too much." ], "3": [ "The face can be younger.", "Make the face younger.", "Younger face." 
], "4": [ "Much younger.", "The face can be much younger.", "Make the face much younger." ], "5": [ "Let's make the face a child one." ] }, "indefinite": [ "Younger face.", "Younger.", "Make it younger.", "Be younger.", "Less old.", "The face can be younger.", "This face is too old, let's make it younger.", "Let's make the face younger.", "What about making it younger?" ] } } } }, "yes_enough": [ [ "Emmm, yep", "Emmm, yes", "Emmm, yeep", "Yes", "Yep", "Yeep", "Yep sure" ], [ ", ", ". ", "! " ], [ "That's good enough now.", "That's nice.", "That's perfect.", "This is great." ], [ " " ] ], "no_enough": [ [ "Actually,", "To be honest,", "Well,", "Well", "Emm", "Emmm", "Emmmm", "Emm,", "Emmm,", "Emmmm,", "I'm not too sure but", "It looks okay now but", "It looks better now, but still,", "It looks nice, but still,", "Let me have a look. Well,", "Let me have a look. Well", "Let me have a look. Emm,", "Let me have a look. Emmm,", "Let me have a look. Emmmm,", "Let me have a look. Emm", "Let me have a look. Emmm", "Let me have a look. Emmmm", "Let me take a look. Well,", "Let me take a look. Well", "Let me take a look. Emm,", "Let me take a look. Emmm,", "Let me take a look. Emmmm,", "Let me take a look. Emm", "Let me take a look. Emmm", "Let me take a look. Emmmm" ], [ " " ] ], "yes_suggestion": [ [ "Emmm, yep", "Emmm, yes", "Emmm, yeep", "Yes", "Yep", "Yeep", "Yep sure", "Yes sure" ], [ ",", ".", "!" ], [ " " ] ], "no_suggestion": [ [ "Well,", "Well", "Emm,", "Emmm", "Emmmm", "Emm,", "Emmm,", "Emmmm,", "I'm not too sure so", "It looks okay now so", "It looks nice, so,", "Let me have a look. Well,", "Let me have a look. Well", "Let me have a look. Emm,", "Let me have a look. Emmm,", "Let me have a look. Emmmm,", "Let me have a look. Emm", "Let me have a look. Emmm", "Let me have a look. Emmmm", "Let me take a look. Well,", "Let me take a look. Well", "Let me take a look. Emm,", "Let me take a look. Emmm,", "Let me take a look. Emmmm,", "Let me take a look. 
Emm", "Let me take a look. Emmm", "Let me take a look. Emmmm" ], [ " " ], [ "Not really.", "Not really actually.", "No actually." ], [ " " ] ], "end": [ [ "Nothing else.", "Nothing else for now.", "It's all good now.", "I don't want any further edits.", "Actually it's all good now.", "No need for further edits.", "I don't need any further edits.", "That's all.", "This is it.", "That is it.", "That is all.", "No." ], [ " " ], [ "Thanks!", "Thank you!", "Thanks a lot!" ] ] } ================================================ FILE: language/templates/vocab.json ================================================ { "text_token_to_idx": { "": 0, "": 1, "": 2, "": 3, "?": 4, "a": 5, "able": 6, "about": 7, "actually": 8, "add": 9, "adding": 10, "adult": 11, "age": 12, "all": 13, "almost": 14, "an": 15, "and": 16, "any": 17, "are": 18, "at": 19, "bangs": 20, "be": 21, "beamed": 22, "beaming": 23, "beard": 24, "big": 25, "bigger": 26, "bit": 27, "broadly": 28, "bushy": 29, "but": 30, "can": 31, "can't": 32, "change": 33, "cheeks": 34, "child": 35, "chin": 36, "clearly": 37, "color": 38, "considerably": 39, "corners": 40, "could": 41, "cover": 42, "covered": 43, "covering": 44, "covers": 45, "curve": 46, "cut": 47, "darker": 48, "deep": 49, "degree": 50, "doesn't": 51, "don't": 52, "edits": 53, "eighties": 54, "elderly": 55, "else": 56, "emm": 57, "end": 58, "enough": 59, "entire": 60, "extent": 61, "extremely": 62, "eyebrows": 63, "eyeglass": 64, "eyeglasses": 65, "face": 66, "feel": 67, "fifties": 68, "for": 69, "forehead": 70, "forties": 71, "frame": 72, "friend": 73, "fringe": 74, "full": 75, "further": 76, "gentler": 77, "glass": 78, "glasses": 79, "go": 80, "gone": 81, "good": 82, "growth": 83, "guess": 84, "half": 85, "happier": 86, "happily": 87, "happiness": 88, "happy": 89, "has": 90, "have": 91, "he": 92, "hello": 93, "hi": 94, "his": 95, "how": 96, "i": 97, "in": 98, "indeed": 99, "is": 100, "it": 101, "it's": 102, "just": 103, "kind": 104, "laugh": 105, "laughing": 
106, "leaves": 107, "length": 108, "less": 109, "let's": 110, "lighter": 111, "like": 112, "little": 113, "long": 114, "longer": 115, "look": 116, "looks": 117, "lot": 118, "make": 119, "making": 120, "maybe": 121, "medium": 122, "medium-length": 123, "middle": 124, "moderately": 125, "more": 126, "most": 127, "mouth": 128, "much": 129, "mustache": 130, "my": 131, "need": 132, "no": 133, "nope": 134, "not": 135, "nothing": 136, "now": 137, "obvious": 138, "of": 139, "off": 140, "ok": 141, "old": 142, "older": 143, "on": 144, "one": 145, "only": 146, "open": 147, "partially": 148, "person": 149, "please": 150, "pointed": 151, "poker": 152, "pokerface": 153, "portion": 154, "relatively": 155, "remove": 156, "rimless": 157, "rough": 158, "rumbling": 159, "schoolchild": 160, "see": 161, "senior": 162, "serious": 163, "seventies": 164, "shave": 165, "short": 166, "shorter": 167, "should": 168, "shouldn't": 169, "show": 170, "simply": 171, "sixties": 172, "slightly": 173, "small": 174, "smile": 175, "smiling": 176, "so": 177, "some": 178, "somewhat": 179, "sort": 180, "stubble": 181, "sunglasses": 182, "sure": 183, "teen": 184, "teenager": 185, "teeth": 186, "than": 187, "thank": 188, "thanks": 189, "that": 190, "that's": 191, "the": 192, "them": 193, "there": 194, "thick": 195, "thicker": 196, "thin": 197, "think": 198, "thinner": 199, "thirties": 200, "this": 201, "tiny": 202, "to": 203, "too": 204, "try": 205, "trying": 206, "turn": 207, "twenties": 208, "uncovered": 209, "up": 210, "use": 211, "vanish": 212, "very": 213, "visible": 214, "want": 215, "we": 216, "what": 217, "whole": 218, "wide": 219, "with": 220, "without": 221, "would": 222, "yeep": 223, "yep": 224, "yes": 225, "you": 226, "young": 227, "younger": 228 } } ================================================ FILE: language/train_encoder.py ================================================ import argparse import json import sys import time import torch import torch.nn as nn import torch.utils.data 
sys.path.append('.')

from accuracy import head_accuracy  # noqa
from dataset import EncoderDataset  # noqa
from lstm import Encoder  # noqa
from utils import AverageMeter, dict2str, save_checkpoint  # noqa
from utils.setup_logger import setup_logger  # noqa


def parse_args():
    """Parses command-line arguments for training the language encoder."""
    parser = argparse.ArgumentParser(description='Train the language encoder')

    # mode
    parser.add_argument('--debug', type=int, default=0)

    # training
    parser.add_argument('--batch_size', type=int, default=2048)
    parser.add_argument('--val_batch', type=int, default=1024)

    # learning rate scheme
    parser.add_argument('--num_epochs', default=20, type=int)
    parser.add_argument('--lr', default=1e-3, type=float)
    parser.add_argument('--weight_decay', default=0, type=float)

    # LSTM hyperparameters
    parser.add_argument('--word_embedding_dim', default=300, type=int)
    parser.add_argument('--text_embed_size', default=1024, type=int)
    parser.add_argument('--linear_hidden_size', default=256, type=int)
    parser.add_argument('--linear_dropout_rate', default=0, type=float)

    # input directories
    parser.add_argument(
        '--vocab_file', required=True, type=str, help='path to vocab file.')
    parser.add_argument(
        '--metadata_file',
        default='./templates/metadata_fsm.json',
        type=str,
        help='path to metadata file.')
    parser.add_argument(
        '--train_set_dir', required=True, type=str, help='path to train data.')
    parser.add_argument(
        '--val_set_dir', required=True, type=str, help='path to val data.')

    # output directories
    parser.add_argument(
        '--work_dir',
        required=True,
        type=str,
        help='path to save checkpoint and log files.')

    # misc
    parser.add_argument(
        '--unlabeled_value',
        default=999,
        type=int,
        help='value to represent unlabeled value')
    parser.add_argument('--num_workers', default=8, type=int)

    return parser.parse_args()


# Best validation accuracy so far and the epoch it was reached at, shared
# between main() and the epoch loop.  (The unused ``current_iters`` global
# from the original has been removed.)
best_val_acc, best_epoch = 0, 0


def main():
    """Main function: set up data, model and optimizer, then run epochs."""
    # ################### Set Up #######################
    global args, best_val_acc, best_epoch
    args = parse_args()

    logger = setup_logger(
        args.work_dir, logger_name='train.txt', debug=args.debug)

    # fall back to CPU when CUDA is unavailable instead of crashing later
    args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    logger.info('Saving arguments.')
    logger.info(dict2str(args.__dict__))

    # ################### Metadata #######################
    with open(args.metadata_file, 'r') as f:
        args.metadata = json.load(f)
    # one classification head per metadata entry
    args.num_head = len(args.metadata)
    logger.info(f'args.num_head: {args.num_head}, ')
    logger.info(f'args.metadata: {args.metadata}.')

    # ################### Language Encoder #######################
    # load vocab file
    with open(args.vocab_file, 'r') as f:
        vocab = json.load(f)
    text_token_to_idx = vocab['text_token_to_idx']

    encoder = Encoder(
        token_to_idx=text_token_to_idx,
        word_embedding_dim=args.word_embedding_dim,
        text_embed_size=args.text_embed_size,
        metadata_file=args.metadata_file,
        linear_hidden_size=args.linear_hidden_size,
        linear_dropout_rate=args.linear_dropout_rate)
    encoder = encoder.to(args.device)

    # ################### DataLoader #######################
    logger.info('Preparing train_dataset')
    train_dataset = EncoderDataset(preprocessed_dir=args.train_set_dir)
    logger.info('Preparing train_loader')
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        pin_memory=False,
        sampler=None)

    logger.info('Preparing val_dataset')
    val_dataset = EncoderDataset(preprocessed_dir=args.val_set_dir)
    logger.info('Preparing val_loader')
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.val_batch,
        shuffle=False,
        num_workers=args.num_workers,
        pin_memory=False)

    logger.info(f'Number of train text: {len(train_dataset)}, '
                f'Number of val text: {len(val_dataset)}.')

    data_loader = {
        'train': train_loader,
        'val': val_loader,
    }

    # ################### Optimizer #######################
    optimizer = torch.optim.Adam(
        encoder.parameters(), args.lr, weight_decay=args.weight_decay)

    # ################### Loss Function #######################
    # samples labeled with ``unlabeled_value`` contribute nothing to the loss
    criterion = nn.CrossEntropyLoss(
        reduction='mean', ignore_index=args.unlabeled_value)

    # ################### Epochs #######################
    for epoch in range(args.num_epochs):
        logger.info(
            '----------- Training: Epoch '
            f'({epoch + 1} / {args.num_epochs}), LR: {args.lr:.4f}. ---------')
        train_per_head_acc_avg, train_overall_acc = train(
            args, 'train', encoder, data_loader['train'], criterion,
            optimizer, logger)
        logger.info(
            'Train accuracy '
            f'({epoch + 1} / {args.num_epochs}), '
            f'{[str(round(i, 2))+"%" for i in train_per_head_acc_avg]}')

        val_per_head_acc_avg, val_overall_acc = train(
            args, 'val', encoder, data_loader['val'], criterion, optimizer,
            logger)
        logger.info('Validation accuracy '
                    f'({epoch + 1} / {args.num_epochs}), '
                    f'{[str(round(i, 2))+"%" for i in val_per_head_acc_avg]}')

        # whether this epoch has the highest val acc so far
        is_best = val_overall_acc > best_val_acc
        if is_best:
            best_epoch = epoch + 1
            best_val_acc = val_overall_acc
        logger.info(
            f'Best Epoch: {best_epoch}, best acc: {best_val_acc: .4f}.')

        save_checkpoint(
            args, {
                'epoch': epoch + 1,
                'best_epoch_so_far': best_epoch,
                'state_dict': encoder.state_dict(),
                'best_val_acc': best_val_acc,
                'optimizer': optimizer.state_dict(),
            },
            is_best,
            checkpoint=args.work_dir)

    logger.info('successful')


def train(args, phase, encoder, data_loader, criterion, optimizer, logger):
    """Runs one epoch of training (phase='train') or evaluation (phase='val').

    Returns:
        (list, float): per-head average accuracy (percent) and their mean.
    """
    if phase == 'train':
        encoder.train()
    else:
        encoder.eval()

    # record time
    batch_time = AverageMeter()
    data_time = AverageMeter()
    end = time.time()

    # one accuracy meter per classification head
    per_head_acc_list = [AverageMeter() for _ in range(args.num_head)]

    for batch_idx, batch_data in enumerate(data_loader):
        data_time.update(time.time() - end)

        text, system_mode, labels = batch_data
        text = text.to(args.device)
        system_mode = system_mode.to(args.device)
        labels = labels.to(args.device)

        if phase == 'train':
            output = encoder(text)
        else:
            # no gradients needed during validation
            with torch.no_grad():
                output = encoder(text)

        loss_list = []
        # Labels: loss and acc per head.  (The original iterated
        # metadata.items() without ever using the key/value pair.)
        for head_idx in range(args.num_head):
            loss = criterion(output[head_idx], labels[:, head_idx])
            loss_list.append(loss)

            acc_dict = head_accuracy(
                output=output[head_idx],
                target=labels[:, head_idx],
                unlabeled_value=args.unlabeled_value)
            acc = acc_dict['acc']
            labeled_count = int(acc_dict['labeled_count'])
            # only update the meter when the batch has labeled samples
            if labeled_count > 0:
                per_head_acc_list[head_idx].update(acc, labeled_count)

        loss_avg = sum(loss_list) / len(loss_list)

        if phase == 'train':
            optimizer.zero_grad()
            loss_avg.backward()
            optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        logger.info(
            f'Batch: {batch_idx+1}, '
            f'Data time: {data_time.avg:.3f}s, Batch time: {batch_time.avg:.3f}s, '  # noqa
            f'loss: {loss_avg:.4f}.')

    per_head_acc_avg = [meter.avg for meter in per_head_acc_list]
    overall_acc = sum(per_head_acc_avg) / args.num_head
    return per_head_acc_avg, overall_acc


if __name__ == '__main__':
    main()
def regression_accuracy(output,
                        target,
                        margin=0.2,
                        uni_neg=True,
                        class_wise=False,
                        num_cls=6,
                        excluded_cls_idx=None,
                        max_cls_value=5):
    """
    Computes the regression accuracy.

    A prediction is counted as correct when it lies within ``margin`` of the
    ground-truth score; the accuracy is the percentage of correct
    predictions.

    Args:
        output (Tensor): predicted scores, flattened to 1-D.
        target (Tensor): ground-truth integer scores, same length as output.
        margin (float): tolerance around the ground-truth value.
        uni_neg (bool): if True, clamp predictions near the two ends of the
            scale onto the end classes 0 and ``max_cls_value``.
        class_wise: if True, also report class-wise accuracy and counts.
        num_cls (int): number of classes reported when class_wise.
        excluded_cls_idx (int | None): class treated as "unlabeled" and
            excluded from 'labeled_acc' / 'labeled_count'.
        max_cls_value (int): largest valid class value.

    Returns:
        dict with keys 'acc', 'labeled_acc', 'labeled_count' and, when
        class_wise, 'acc_class_wise' and 'per_class_count'.

    Fixes vs. the original:
    * dead ``labeled_acc = acc`` assignment in the else-branch removed;
    * boolean masks combined with ``&`` instead of the deprecated ``*``.
    """
    output = output.clone().reshape(-1)
    if uni_neg:
        # snap near-boundary predictions onto the end classes so that e.g.
        # a prediction of 6.0 for ground truth 5 still counts as correct
        output[(output <= 0 + margin) & (target == 0)] = 0
        output[(output >= max_cls_value - margin)
               & (target == max_cls_value)] = max_cls_value

    distance = torch.absolute(target - output) - margin

    predicted_class = torch.zeros_like(target)
    # if distance <= 0, assign ground truth class
    predicted_class[distance <= 0] = target[distance <= 0]
    # if distance > 0, assign an invalid value
    predicted_class[distance > 0] = -1

    acc = float(torch.sum(predicted_class == target)) / float(
        target.size(0)) * 100
    return_dict = {'acc': acc}

    if excluded_cls_idx is not None:
        correct_count = torch.sum(
            (predicted_class == target) & (target != excluded_cls_idx))
        labeled_count = torch.sum(target != excluded_cls_idx)
        if labeled_count:
            labeled_acc = float(correct_count) / float(labeled_count) * 100
        else:
            labeled_acc = 0
        return_dict['labeled_acc'] = labeled_acc
        return_dict['labeled_count'] = labeled_count
    else:
        # everything counts as labeled
        return_dict['labeled_acc'] = acc
        return_dict['labeled_count'] = target.size(0)

    if class_wise:
        acc_class_wise = []
        per_class_count = []
        for i in range(num_cls):
            total_sample_cls_i = torch.sum(target == i)
            if total_sample_cls_i:
                correct_samples_cls_i = torch.sum(
                    (predicted_class == i) & (target == i))
                acc_class_wise.append(
                    float(correct_samples_cls_i) /
                    float(total_sample_cls_i) * 100)
            else:
                acc_class_wise.append(0)
            per_class_count.append(total_sample_cls_i)
        return_dict['acc_class_wise'] = acc_class_wise
        return_dict['per_class_count'] = per_class_count

    return return_dict
from __future__ import absolute_import

import datetime
import logging
import time

import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
import numpy as np

# from mmcv.runner import get_dist_info, master_only

__all__ = [
    'Logger', 'LoggerMonitor', 'savefig', 'MessageLogger', 'init_tb_logger',
    'get_root_logger', 'dict2str'
]


def savefig(fname, dpi=None):
    # Save the current matplotlib figure; defaults to 150 dpi.
    dpi = 150 if dpi is None else dpi
    plt.savefig(fname, dpi=dpi)


def plot_overlap(logger, names=None):
    """Plot the named series of ``logger`` onto the current axes.

    Returns the legend labels ('<title>(<name>)') for the plotted series.
    """
    names = logger.names if names is None else names
    numbers = logger.numbers
    for _, name in enumerate(names):
        x = np.arange(len(numbers[name]))
        plt.plot(x, np.asarray(numbers[name]))
    return [logger.title + '(' + name + ')' for name in names]


class Logger(object):
    '''Save training process to log file with simple plot function.

    The log file is tab-separated: the first line holds the column names
    (written by ``set_names``), every later line one row of values
    (written by ``append``).
    '''

    def __init__(self, fpath, title=None, resume=False):
        # resume=True re-reads an existing log so new rows append to the
        # previously recorded series; otherwise the file is truncated.
        self.file = None
        self.resume = resume
        self.title = '' if title is None else title
        if fpath is not None:
            if resume:
                self.file = open(fpath, 'r')
                name = self.file.readline()
                self.names = name.rstrip().split('\t')
                self.numbers = {}
                for _, name in enumerate(self.names):
                    self.numbers[name] = []

                # note: resumed values are kept as strings, not parsed back
                for numbers in self.file:
                    numbers = numbers.rstrip().split('\t')
                    for i in range(0, len(numbers)):
                        self.numbers[self.names[i]].append(numbers[i])
                self.file.close()
                self.file = open(fpath, 'a')
            else:
                self.file = open(fpath, 'w')

    def set_names(self, names):
        # Write the header row and (re)initialize the in-memory series.
        if self.resume:
            pass
        # initialize numbers as empty list
        self.numbers = {}
        self.names = names
        for _, name in enumerate(self.names):
            self.file.write(name)
            self.file.write('\t')
            self.numbers[name] = []
        self.file.write('\n')
        self.file.flush()

    def append(self, numbers):
        # Append one row of values, one value per column set by set_names.
        assert len(self.names) == len(numbers), 'Numbers do not match names'
        for index, num in enumerate(numbers):
            if type(num) == int:
                self.file.write(str(num))
            elif type(num) == float:
                # floats are written with six decimal places
                self.file.write("{0:.6f}".format(num))
            else:  # str
                self.file.write(str(num))
            self.file.write('\t')
            self.numbers[self.names[index]].append(num)
        self.file.write('\n')
        self.file.flush()

    def plot(self, out_file, names=None):
        # Plot every named series against its step index and save to
        # ``out_file``.
        names = self.names if names is None else names
        numbers = self.numbers
        fig, ax = plt.subplots(1, 1)
        for _, name in enumerate(names):
            x = np.arange(len(numbers[name]))
            ax.plot(x, numbers[name])
            # whether add data labels to each point in the plot
            # (disabled; flip the literal below to enable)
            if False:
                for i in range(len(x)):
                    y = numbers[name][i]
                    # text = round(y, 2)
                    # below 4 line are added by ziqi
                    if type(y) == int or type(y) == float:
                        text = round(y, 2)
                    else:
                        text = y
                    ax.text(x[i], y, text)
        ax.legend([self.title + '(' + name + ')' for name in names])
        loc = plticker.MultipleLocator(
            base=1.0
        )  # this locator puts ticks at regular intervals  # ziqi added
        ax.xaxis.set_major_locator(loc)
        ax.grid(True)
        plt.savefig(out_file)
        plt.close()

    def close(self):
        if self.file is not None:
            self.file.close()

    def get_numbers(self):
        # Return {column name: list of recorded values}.
        stats = {}
        for name in self.names:
            stats[name] = self.numbers[name]
        return stats


class LoggerMonitor(object):
    '''Load and visualize multiple logs.'''

    def __init__(self, paths):
        '''paths is a dictionary with {name: filepath} pairs'''
        self.loggers = []
        for title, path in paths.items():
            logger = Logger(path, title=title, resume=True)
            self.loggers.append(logger)

    def plot(self, names=None):
        # Overlay all loggers' series on one subplot with a shared legend.
        plt.figure()
        plt.subplot(121)
        legend_text = []
        for logger in self.loggers:
            legend_text += plot_overlap(logger, names)
        plt.legend(
            legend_text, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        plt.grid(True)


class MessageLogger():
    """Message logger for printing.

    Args:
        opt (dict): Config. It contains the following keys:
            name (str): Exp name.
            logger (dict): Contains 'print_freq' (str) for logger interval.
            train (dict): Contains 'niter' (int) for total iters.
            use_tb_logger (bool): Use tensorboard logger.
        start_iter (int): Start iter. Default: 1.
        tb_logger (obj:`tb_logger`): Tensorboard logger. Default: None.
    """

    def __init__(self, opt, start_iter=1, tb_logger=None):
        self.exp_name = opt['name']
        self.interval = opt['logger']['print_freq']
        self.start_iter = start_iter
        self.max_iters = opt['train']['niter']
        self.use_tb_logger = opt['use_tb_logger']
        self.tb_logger = tb_logger
        self.start_time = time.time()
        self.logger = get_root_logger()

    # @master_only
    def __call__(self, log_vars):
        """Format logging message.

        Args:
            log_vars (dict): It contains the following keys:
                epoch (int): Epoch number.
                iter (int): Current iter.
                lrs (list): List for learning rates.
                time (float): Iter time.
                data_time (float): Data time for each iter.
        """
        # epoch, iter, learning rates
        epoch = log_vars.pop('epoch')
        current_iter = log_vars.pop('iter')
        lrs = log_vars.pop('lrs')

        message = (f'[{self.exp_name[:5]}..][epoch:{epoch:3d}, '
                   f'iter:{current_iter:8,d}, lr:(')
        for v in lrs:
            message += f'{v:.3e},'
        message += ')] '

        # time and estimated time
        if 'time' in log_vars.keys():
            iter_time = log_vars.pop('time')
            data_time = log_vars.pop('data_time')

            # ETA is extrapolated from the average wall time per iter
            # since this logger was created
            total_time = time.time() - self.start_time
            time_sec_avg = total_time / (current_iter - self.start_iter + 1)
            eta_sec = time_sec_avg * (self.max_iters - current_iter - 1)
            eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
            message += f'[eta: {eta_str}, '
            message += f'time: {iter_time:.3f}, data_time: {data_time:.3f}] '

        # other items, especially losses
        for k, v in log_vars.items():
            message += f'{k}: {v:.4e} '
            # tensorboard logger
            if self.use_tb_logger and 'debug' not in self.exp_name:
                self.tb_logger.add_scalar(k, v, current_iter)
        self.logger.info(message)


# @master_only
def init_tb_logger(log_dir):
    # Imported lazily so tensorboard is only required when actually used.
    from torch.utils.tensorboard import SummaryWriter
    tb_logger = SummaryWriter(log_dir=log_dir)
    return tb_logger


def get_root_logger(logger_name='base', log_level=logging.INFO, log_file=None):
    """Get the root logger.

    The logger will be initialized if it has not been initialized. By default
    a StreamHandler will be added. If `log_file` is specified, a FileHandler
    will also be added.

    Args:
        logger_name (str): root logger name. Default: base.
        log_file (str | None): The log filename. If specified, a FileHandler
            will be added to the root logger.
        log_level (int): The root logger level. Note that only the process of
            rank 0 is affected, while other processes will set the level to
            "Error" and be silent most of the time.

    Returns:
        logging.Logger: The root logger.
    """
    logger = logging.getLogger(logger_name)
    # if the logger has been initialized, just return it
    if logger.hasHandlers():
        return logger

    format_str = '%(asctime)s.%(msecs)03d - %(levelname)s: %(message)s'
    logging.basicConfig(format=format_str, level=log_level)
    if log_file is not None:
        file_handler = logging.FileHandler(log_file, 'w')
        file_handler.setFormatter(logging.Formatter(format_str))
        file_handler.setLevel(log_level)
        logger.addHandler(file_handler)
    return logger
def dict2str(opt, indent_level=1):
    """dict to string for printing options.

    Args:
        opt (dict): Option dict.
        indent_level (int): Indent level. Default: 1.

    Return:
        (str): Option string for printing.
    """
    msg = ''
    for k, v in opt.items():
        if isinstance(v, dict):
            # recurse one level deeper for nested option dicts
            msg += ' ' * (indent_level * 2) + k + ':[\n'
            msg += dict2str(v, indent_level + 1)
            msg += ' ' * (indent_level * 2) + ']\n'
        else:
            msg += ' ' * (indent_level * 2) + k + ': ' + str(v) + '\n'
    return msg


def adjust_learning_rate(args, optimizer, epoch):
    """Sets the learning rate according to the ``args.lr_decay`` schedule.

    (Fix: in the original this docstring was placed after the first
    statement, so it was an unattached string literal, not a docstring.)

    Supported schedules: 'step', 'cos', 'linear', 'linear2exp', 'schedule'.
    The new rate is written into every param group and also returned.

    Raises:
        ValueError: for an unknown ``args.lr_decay`` mode.
    """
    lr = optimizer.param_groups[0]['lr']
    if args.lr_decay == 'step':
        lr = args.lr * (args.gamma**(epoch // args.step))
    elif args.lr_decay == 'cos':
        lr = args.lr * (1 + math.cos(math.pi * epoch / args.epochs)) / 2
    elif args.lr_decay == 'linear':
        lr = args.lr * (1 - epoch / args.epochs)
    elif args.lr_decay == 'linear2exp':
        if epoch < args.turning_point + 1:
            # learning rate decay as 95%
            # at the turning point (1 / 95% = 1.0526)
            lr = args.lr * (1 - epoch / int(args.turning_point * 1.0526))
        else:
            lr *= args.gamma
    elif args.lr_decay == 'schedule':
        if epoch in args.schedule:
            lr *= args.gamma
    else:
        raise ValueError('Unknown lr mode {}'.format(args.lr_decay))

    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr
import errno
import os

import torch
import torch.nn as nn
import torch.nn.init as init

__all__ = [
    'get_mean_and_std', 'init_params', 'mkdir_p', 'save_checkpoint',
    'AverageMeter'
]


def get_mean_and_std(dataset):
    '''Compute the per-channel mean and std of a 3-channel image dataset.'''
    # (removed the redundant ``trainloader`` alias from the original)
    dataloader = torch.utils.data.DataLoader(
        dataset, batch_size=1, shuffle=True, num_workers=2)
    mean = torch.zeros(3)
    std = torch.zeros(3)
    print('==> Computing mean and std..')
    for inputs, targets in dataloader:
        for i in range(3):
            mean[i] += inputs[:, i, :, :].mean()
            std[i] += inputs[:, i, :, :].std()
    mean.div_(len(dataset))
    std.div_(len(dataset))
    return mean, std


def init_params(net):
    '''Init layer parameters: Kaiming for conv weights, constants for
    batch-norm, small normal noise for linear weights.

    Fixes vs. the original:
    * ``if m.bias:`` raised "truth value of a tensor is ambiguous" for
      multi-element bias tensors; replaced with ``is not None``.
    * the deprecated ``init.kaiming_normal`` / ``init.constant`` /
      ``init.normal`` calls use their in-place ``*_`` equivalents.
    '''
    for m in net.modules():
        if isinstance(m, nn.Conv2d):
            init.kaiming_normal_(m.weight, mode='fan_out')
            if m.bias is not None:
                init.constant_(m.bias, 0)
        elif isinstance(m, nn.BatchNorm2d):
            init.constant_(m.weight, 1)
            init.constant_(m.bias, 0)
        elif isinstance(m, nn.Linear):
            init.normal_(m.weight, std=1e-3)
            if m.bias is not None:
                init.constant_(m.bias, 0)


def mkdir_p(path):
    '''make dir if not exist (like ``mkdir -p``)'''
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise


def save_checkpoint(args,
                    state,
                    is_best,
                    checkpoint='checkpoint',
                    filename='checkpoint.pth.tar'):
    '''Save ``state`` as checkpoint_<epoch>.pth.tar and, when ``is_best``,
    additionally as model_best.pth.tar.

    NOTE(review): the 'checkpoints' directory is created under
    ``args.work_dir`` while the files are written under ``checkpoint`` —
    the caller in train_encoder.py passes checkpoint=args.work_dir so the
    two coincide; confirm before calling with a different ``checkpoint``.
    '''
    epoch = str(state['epoch']).zfill(2)
    save_every_epoch = True
    if not os.path.exists(os.path.join(args.work_dir, 'checkpoints')):
        os.makedirs(os.path.join(args.work_dir, 'checkpoints'))
    if save_every_epoch:
        filename = 'checkpoint_' + epoch + '.pth.tar'
        filepath = os.path.join(checkpoint, 'checkpoints', filename)
        torch.save(state, filepath)
    if is_best:
        filename = 'model_best.pth.tar'
        filepath = os.path.join(checkpoint, 'checkpoints', filename)
        torch.save(state, filepath)
    # shutil.copyfile(filepath, os.path.join(checkpoint, \
    #     'model_best_'+epoch+'.pth.tar'))


class AverageMeter(object):
    """
    Computes and stores the average and current value
    Imported from
    https://github.com/pytorch/examples/blob/master/imagenet/main.py#L247-L262
    """

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0  # running average = running sum / running count
        self.sum = 0  # running sum
        self.count = 0  # running count

    def update(self, val, n=1):
        # n = batch_size
        # val = batch accuracy for an attribute
        # (``self.val = val`` was commented out upstream; kept that way)
        # self.val = val
        # sum = 100 * accumulative correct predictions for this attribute
        self.sum += val * n
        # count = total samples so far
        self.count += n
        # avg = 100 * avg accuracy for this attribute
        # for all the batches so far
        self.avg = self.sum / self.count
def transpose_and_format(args, input):
    """Transpose an attribute-major table into a class-major table.

    ``input`` is a list with one inner sequence per attribute, each holding
    one value per class label (values expose ``.item()``, e.g. tensors).
    The result is a list of rows:

        attr_val  Bangs  Smiling  Young
        0         #      #        #
        1         #      #        #
        ...

    The header names are read from ``args.attr_file`` (JSON with an
    'attr_info' mapping whose entries carry a 'name'); values are rounded
    to two decimals.  The parameter is named ``input`` (shadowing the
    builtin) to stay call-compatible with existing callers.
    """
    with open(args.attr_file, 'r') as f:
        attr_info = json.load(f)['attr_info']

    # header row: 'attr_val' followed by each attribute's display name
    header = ['attr_val']
    for info in attr_info.values():
        header.append(info["name"])

    table = [header]
    num_cls = len(input[0])
    for cls_idx in range(num_cls):
        row = [cls_idx]
        for attr_idx in range(args.num_attr):
            row.append(round(input[attr_idx][cls_idx].item(), 2))
        table.append(row)
    return table
language/utils/progress/.gitignore ================================================ ================================================ FILE: language/utils/progress/LICENSE ================================================ # Copyright (c) 2012 Giorgos Verigakis # # Permission to use, copy, modify, and distribute this software for any # purpose with or without fee is hereby granted, provided that the above # copyright notice and this permission notice appear in all copies. # # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ================================================ FILE: language/utils/progress/MANIFEST.in ================================================ include README.rst LICENSE ================================================ FILE: language/utils/progress/README.rst ================================================ Easy progress reporting for Python ================================== |pypi| |demo| .. |pypi| image:: https://img.shields.io/pypi/v/progress.svg .. |demo| image:: https://raw.github.com/verigak/progress/master/demo.gif :alt: Demo Bars ---- There are 7 progress bars to choose from: - ``Bar`` - ``ChargingBar`` - ``FillingSquaresBar`` - ``FillingCirclesBar`` - ``IncrementalBar`` - ``PixelBar`` - ``ShadyBar`` To use them, just call ``next`` to advance and ``finish`` to finish: .. 
code-block:: python from progress.bar import Bar bar = Bar('Processing', max=20) for i in range(20): # Do some work bar.next() bar.finish() The result will be a bar like the following: :: Processing |#############                   | 42/100 To simplify the common case where the work is done in an iterator, you can use the ``iter`` method: .. code-block:: python for i in Bar('Processing').iter(it): # Do some work Progress bars are very customizable, you can change their width, their fill character, their suffix and more: .. code-block:: python bar = Bar('Loading', fill='@', suffix='%(percent)d%%') This will produce a bar like the following: :: Loading |@@@@@@@@@@@@@                   | 42% You can use a number of template arguments in ``message`` and ``suffix``: ========== ================================ Name Value ========== ================================ index current value max maximum value remaining max - index progress index / max percent progress * 100 avg simple moving average time per item (in seconds) elapsed elapsed time in seconds elapsed_td elapsed as a timedelta (useful for printing as a string) eta avg * remaining eta_td eta as a timedelta (useful for printing as a string) ========== ================================ Instead of passing all configuration options on instantiation, you can create your custom subclass: .. code-block:: python class FancyBar(Bar): message = 'Loading' fill = '*' suffix = '%(percent).1f%% - %(eta)ds' You can also override any of the arguments or create your own: .. code-block:: python class SlowBar(Bar): suffix = '%(remaining_hours)d hours remaining' @property def remaining_hours(self): return self.eta // 3600 Spinners ======== For actions with an unknown number of steps you can use a spinner: ..
code-block:: python from progress.spinner import Spinner spinner = Spinner('Loading ') while state != 'FINISHED': # Do some work spinner.next() There are 5 predefined spinners: - ``Spinner`` - ``PieSpinner`` - ``MoonSpinner`` - ``LineSpinner`` - ``PixelSpinner`` Other ===== There are a number of other classes available too, please check the source or subclass one of them to create your own. License ======= progress is licensed under ISC ================================================ FILE: language/utils/progress/progress/__init__.py ================================================ # Copyright (c) 2012 Giorgos Verigakis # # Permission to use, copy, modify, and distribute this software for any # purpose with or without fee is hereby granted, provided that the above # copyright notice and this permission notice appear in all copies. # # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
from __future__ import division

from collections import deque
from datetime import timedelta
from math import ceil
from sys import stderr
from time import time

__version__ = '1.3'


class Infinite(object):
    """Base progress indicator with no known total.

    Tracks an ``index``, wall-clock timing, and a simple moving average of
    seconds-per-step over the last ``sma_window`` calls to ``next``.
    """

    file = stderr
    sma_window = 10  # Simple Moving Average window

    def __init__(self, *args, **kwargs):
        self.index = 0
        self.start_ts = time()
        self.avg = 0
        self._ts = self.start_ts
        self._xput = deque(maxlen=self.sma_window)
        # every keyword argument becomes an attribute (message=, max=, ...)
        for name, value in kwargs.items():
            setattr(self, name, value)

    def __getitem__(self, key):
        # Lets the instance serve as a mapping for '%(index)d'-style
        # templates; private names are hidden.
        if key.startswith('_'):
            return None
        return getattr(self, key, None)

    @property
    def elapsed(self):
        # Whole seconds since construction.
        return int(time() - self.start_ts)

    @property
    def elapsed_td(self):
        return timedelta(seconds=self.elapsed)

    def update_avg(self, n, dt):
        # Fold the latest per-item duration into the moving average.
        if n > 0:
            self._xput.append(dt / n)
            self.avg = sum(self._xput) / len(self._xput)

    def update(self):
        pass

    def start(self):
        pass

    def finish(self):
        pass

    def next(self, n=1):
        now = time()
        self.update_avg(n, now - self._ts)
        self._ts = now
        self.index += n
        self.update()

    def iter(self, it):
        # Advance once per yielded element; always finish, even on error.
        try:
            for item in it:
                yield item
                self.next()
        finally:
            self.finish()


class Progress(Infinite):
    """Progress indicator with a known total ``max`` (default 100)."""

    def __init__(self, *args, **kwargs):
        super(Progress, self).__init__(*args, **kwargs)
        self.max = kwargs.get('max', 100)

    @property
    def eta(self):
        # Seconds remaining, extrapolated from the moving average.
        return int(ceil(self.avg * self.remaining))

    @property
    def eta_td(self):
        return timedelta(seconds=self.eta)

    @property
    def percent(self):
        return self.progress * 100

    @property
    def progress(self):
        # Clamped to 1 so overshooting ``max`` never exceeds 100%.
        return min(1, self.index / self.max)

    @property
    def remaining(self):
        return max(self.max - self.index, 0)

    def start(self):
        self.update()

    def goto(self, index):
        # Jump to an absolute position by advancing the difference.
        self.next(index - self.index)

    def iter(self, it):
        # Adopt the iterable's length as the total when it has one.
        try:
            self.max = len(it)
        except TypeError:
            pass

        try:
            for item in it:
                yield item
                self.next()
        finally:
            self.finish()
# # Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

from __future__ import unicode_literals

from . import Progress
from .helpers import WritelnMixin


class Bar(WritelnMixin, Progress):
    """Classic '#'-filled progress bar redrawn in place on a tty."""

    width = 32                    # number of fill cells between the rails
    message = ''                  # text printed before the bar
    suffix = '%(index)d/%(max)d'  # template %-formatted against the bar state
    bar_prefix = ' |'
    bar_suffix = '| '
    empty_fill = ' '
    fill = '#'
    hide_cursor = True

    def update(self):
        # Render one frame: "message |#####     | suffix".  ``self`` acts
        # as the mapping for %-formatting via Progress.__getitem__.
        filled_length = int(self.width * self.progress)
        empty_length = self.width - filled_length

        message = self.message % self
        bar = self.fill * filled_length
        empty = self.empty_fill * empty_length
        suffix = self.suffix % self
        line = ''.join([message, self.bar_prefix, bar, empty, self.bar_suffix,
                        suffix])
        self.writeln(line)


class ChargingBar(Bar):
    # Percent-suffixed bar drawn with solid/dotted unicode cells.
    suffix = '%(percent)d%%'
    bar_prefix = ' '
    bar_suffix = ' '
    empty_fill = '∙'
    fill = '█'


class FillingSquaresBar(ChargingBar):
    empty_fill = '▢'
    fill = '▣'


class FillingCirclesBar(ChargingBar):
    empty_fill = '◯'
    fill = '◉'


class IncrementalBar(Bar):
    # Bar whose last cell steps through sub-character "phases", giving
    # finer-than-one-cell resolution.
    phases = (' ', '▏', '▎', '▍', '▌', '▋', '▊', '▉', '█')

    def update(self):
        nphases = len(self.phases)
        filled_len = self.width * self.progress
        nfull = int(filled_len)  # Number of full chars
        phase = int((filled_len - nfull) * nphases)  # Phase of last char
        nempty = self.width - nfull  # Number of empty chars

        message = self.message % self
        bar = self.phases[-1] * nfull
        current = self.phases[phase] if phase > 0 else ''
        empty = self.empty_fill * max(0, nempty - len(current))
        suffix = self.suffix % self
        line = ''.join([message, self.bar_prefix, bar, current, empty,
                        self.bar_suffix, suffix])
        self.writeln(line)


class PixelBar(IncrementalBar):
    phases = ('⡀', '⡄', '⡆', '⡇', '⣇', '⣧', '⣷', '⣿')


class ShadyBar(IncrementalBar):
    phases = (' ', '░', '▒', '▓', '█')
import Infinite, Progress from .helpers import WriteMixin class Counter(WriteMixin, Infinite): message = '' hide_cursor = True def update(self): self.write(str(self.index)) class Countdown(WriteMixin, Progress): hide_cursor = True def update(self): self.write(str(self.remaining)) class Stack(WriteMixin, Progress): phases = (' ', '▁', '▂', '▃', '▄', '▅', '▆', '▇', '█') hide_cursor = True def update(self): nphases = len(self.phases) i = min(nphases - 1, int(self.progress * nphases)) self.write(self.phases[i]) class Pie(Stack): phases = ('○', '◔', '◑', '◕', '●') ================================================ FILE: language/utils/progress/progress/helpers.py ================================================ # Copyright (c) 2012 Giorgos Verigakis # # Permission to use, copy, modify, and distribute this software for any # purpose with or without fee is hereby granted, provided that the above # copyright notice and this permission notice appear in all copies. # # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
from __future__ import print_function HIDE_CURSOR = '\x1b[?25l' SHOW_CURSOR = '\x1b[?25h' class WriteMixin(object): hide_cursor = False def __init__(self, message=None, **kwargs): super(WriteMixin, self).__init__(**kwargs) self._width = 0 if message: self.message = message if self.file.isatty(): if self.hide_cursor: print(HIDE_CURSOR, end='', file=self.file) print(self.message, end='', file=self.file) self.file.flush() def write(self, s): if self.file.isatty(): b = '\b' * self._width c = s.ljust(self._width) print(b + c, end='', file=self.file) self._width = max(self._width, len(s)) self.file.flush() def finish(self): if self.file.isatty() and self.hide_cursor: print(SHOW_CURSOR, end='', file=self.file) class WritelnMixin(object): hide_cursor = False def __init__(self, message=None, **kwargs): super(WritelnMixin, self).__init__(**kwargs) if message: self.message = message if self.file.isatty() and self.hide_cursor: print(HIDE_CURSOR, end='', file=self.file) def clearln(self): if self.file.isatty(): print('\r\x1b[K', end='', file=self.file) def writeln(self, line): if self.file.isatty(): self.clearln() print(line, end='', file=self.file) self.file.flush() def finish(self): if self.file.isatty(): print(file=self.file) if self.hide_cursor: print(SHOW_CURSOR, end='', file=self.file) from signal import signal, SIGINT from sys import exit class SigIntMixin(object): """Registers a signal handler that calls finish on SIGINT""" def __init__(self, *args, **kwargs): super(SigIntMixin, self).__init__(*args, **kwargs) signal(SIGINT, self._sigint_handler) def _sigint_handler(self, signum, frame): self.finish() exit(0) ================================================ FILE: language/utils/progress/progress/spinner.py ================================================ # -*- coding: utf-8 -*- # Copyright (c) 2012 Giorgos Verigakis # # Permission to use, copy, modify, and distribute this software for any # purpose with or without fee is hereby granted, provided that the above # 
copyright notice and this permission notice appear in all copies. # # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. from __future__ import unicode_literals from . import Infinite from .helpers import WriteMixin class Spinner(WriteMixin, Infinite): message = '' phases = ('-', '\\', '|', '/') hide_cursor = True def update(self): i = self.index % len(self.phases) self.write(self.phases[i]) class PieSpinner(Spinner): phases = ['◷', '◶', '◵', '◴'] class MoonSpinner(Spinner): phases = ['◑', '◒', '◐', '◓'] class LineSpinner(Spinner): phases = ['⎺', '⎻', '⎼', '⎽', '⎼', '⎻'] class PixelSpinner(Spinner): phases = ['⣾','⣷', '⣯', '⣟', '⡿', '⢿', '⣻', '⣽'] ================================================ FILE: language/utils/progress/setup.py ================================================ #!/usr/bin/env python from setuptools import setup import progress setup( name='progress', version=progress.__version__, description='Easy to use progress bars', long_description=open('README.rst').read(), author='Giorgos Verigakis', author_email='verigak@gmail.com', url='http://github.com/verigak/progress/', license='ISC', packages=['progress'], classifiers=[ 'Environment :: Console', 'Intended Audience :: Developers', 'License :: OSI Approved :: ISC License (ISCL)', 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', ] ) 
================================================ FILE: language/utils/progress/test_progress.py ================================================ #!/usr/bin/env python from __future__ import print_function import random import time from progress.bar import (Bar, ChargingBar, FillingSquaresBar, FillingCirclesBar, IncrementalBar, PixelBar, ShadyBar) from progress.spinner import (Spinner, PieSpinner, MoonSpinner, LineSpinner, PixelSpinner) from progress.counter import Counter, Countdown, Stack, Pie def sleep(): t = 0.01 t += t * random.uniform(-0.1, 0.1) # Add some variance time.sleep(t) for bar_cls in (Bar, ChargingBar, FillingSquaresBar, FillingCirclesBar): suffix = '%(index)d/%(max)d [%(elapsed)d / %(eta)d / %(eta_td)s]' bar = bar_cls(bar_cls.__name__, suffix=suffix) for i in bar.iter(range(200)): sleep() for bar_cls in (IncrementalBar, PixelBar, ShadyBar): suffix = '%(percent)d%% [%(elapsed_td)s / %(eta)d / %(eta_td)s]' bar = bar_cls(bar_cls.__name__, suffix=suffix) for i in bar.iter(range(200)): sleep() for spin in (Spinner, PieSpinner, MoonSpinner, LineSpinner, PixelSpinner): for i in spin(spin.__name__ + ' ').iter(range(100)): sleep() print() for singleton in (Counter, Countdown, Stack, Pie): for i in singleton(singleton.__name__ + ' ').iter(range(100)): sleep() print() bar = IncrementalBar('Random', suffix='%(index)d') for i in range(100): bar.goto(random.randint(0, 100)) sleep() bar.finish() ================================================ FILE: language/utils/setup_logger.py ================================================ # python3.7 """Utility functions for logging.""" import logging import os import sys __all__ = ['setup_logger'] def setup_logger(work_dir=None, logfile_name='log.txt', logger_name='logger', debug=0): """Sets up logger from target work directory. The function will sets up a logger with `DEBUG` log level. Two handlers will be added to the logger automatically. 
One is the `sys.stdout` stream, with `INFO` log level, which will print improtant messages on the screen. The other is used to save all messages to file `$WORK_DIR/$LOGFILE_NAME`. Messages will be added time stamp and log level before logged. NOTE: If `work_dir` or `logfile_name` is empty, the file stream will be skipped. Args: work_dir: The work directory. All intermediate files will be saved here. (default: None) logfile_name: Name of the file to save log message. (default: `log.txt`) logger_name: Unique name for the logger. (default: `logger`) Returns: A `logging.Logger` object. Raises: SystemExit: If the work directory has already existed, of the logger with specified name `logger_name` has already existed. """ logger = logging.getLogger(logger_name) if logger.hasHandlers(): # Already existed raise SystemExit( f'Logger name `{logger_name}` has already been set up!\n' f'Please use another name, or otherwise the messages ' f'may be mixed between these two loggers.') logger.setLevel(logging.DEBUG) formatter = logging.Formatter("[%(asctime)s][%(levelname)s] %(message)s") # Print log message with `INFO` level or above onto the screen. sh = logging.StreamHandler(stream=sys.stdout) sh.setLevel(logging.INFO) sh.setFormatter(formatter) logger.addHandler(sh) if not work_dir or not logfile_name: return logger if os.path.exists(work_dir) and debug == 0: raise SystemExit(f'Work directory `{work_dir}` has already existed!\n' f'Please specify another one.') os.makedirs(work_dir, exist_ok=debug) # Save log message with all levels in log file. 
fh = logging.FileHandler(os.path.join(work_dir, logfile_name)) fh.setLevel(logging.DEBUG) fh.setFormatter(formatter) logger.addHandler(fh) return logger ================================================ FILE: language/utils/visualize.py ================================================ import matplotlib.pyplot as plt import torch import torch.nn as nn import torchvision import torchvision.transforms as transforms import numpy as np from .misc import * __all__ = ['make_image', 'show_batch', 'show_mask', 'show_mask_single'] # functions to show an image def make_image(img, mean=(0,0,0), std=(1,1,1)): for i in range(0, 3): img[i] = img[i] * std[i] + mean[i] # unnormalize npimg = img.numpy() return np.transpose(npimg, (1, 2, 0)) def gauss(x,a,b,c): return torch.exp(-torch.pow(torch.add(x,-b),2).div(2*c*c)).mul(a) def colorize(x): ''' Converts a one-channel grayscale image to a color heatmap image ''' if x.dim() == 2: torch.unsqueeze(x, 0, out=x) if x.dim() == 3: cl = torch.zeros([3, x.size(1), x.size(2)]) cl[0] = gauss(x,.5,.6,.2) + gauss(x,1,.8,.3) cl[1] = gauss(x,1,.5,.3) cl[2] = gauss(x,1,.2,.3) cl[cl.gt(1)] = 1 elif x.dim() == 4: cl = torch.zeros([x.size(0), 3, x.size(2), x.size(3)]) cl[:,0,:,:] = gauss(x,.5,.6,.2) + gauss(x,1,.8,.3) cl[:,1,:,:] = gauss(x,1,.5,.3) cl[:,2,:,:] = gauss(x,1,.2,.3) return cl def show_batch(images, Mean=(2, 2, 2), Std=(0.5,0.5,0.5)): images = make_image(torchvision.utils.make_grid(images), Mean, Std) plt.imshow(images) plt.show() def show_mask_single(images, mask, Mean=(2, 2, 2), Std=(0.5,0.5,0.5)): im_size = images.size(2) # save for adding mask im_data = images.clone() for i in range(0, 3): im_data[:,i,:,:] = im_data[:,i,:,:] * Std[i] + Mean[i] # unnormalize images = make_image(torchvision.utils.make_grid(images), Mean, Std) plt.subplot(2, 1, 1) plt.imshow(images) plt.axis('off') # for b in range(mask.size(0)): # mask[b] = (mask[b] - mask[b].min())/(mask[b].max() - mask[b].min()) mask_size = mask.size(2) # print('Max %f Min %f' % 
(mask.max(), mask.min())) mask = (upsampling(mask, scale_factor=im_size/mask_size)) # mask = colorize(upsampling(mask, scale_factor=im_size/mask_size)) # for c in range(3): # mask[:,c,:,:] = (mask[:,c,:,:] - Mean[c])/Std[c] # print(mask.size()) mask = make_image(torchvision.utils.make_grid(0.3*im_data+0.7*mask.expand_as(im_data))) # mask = make_image(torchvision.utils.make_grid(0.3*im_data+0.7*mask), Mean, Std) plt.subplot(2, 1, 2) plt.imshow(mask) plt.axis('off') def show_mask(images, masklist, Mean=(2, 2, 2), Std=(0.5,0.5,0.5)): im_size = images.size(2) # save for adding mask im_data = images.clone() for i in range(0, 3): im_data[:,i,:,:] = im_data[:,i,:,:] * Std[i] + Mean[i] # unnormalize images = make_image(torchvision.utils.make_grid(images), Mean, Std) plt.subplot(1+len(masklist), 1, 1) plt.imshow(images) plt.axis('off') for i in range(len(masklist)): mask = masklist[i].data.cpu() # for b in range(mask.size(0)): # mask[b] = (mask[b] - mask[b].min())/(mask[b].max() - mask[b].min()) mask_size = mask.size(2) # print('Max %f Min %f' % (mask.max(), mask.min())) mask = (upsampling(mask, scale_factor=im_size/mask_size)) # mask = colorize(upsampling(mask, scale_factor=im_size/mask_size)) # for c in range(3): # mask[:,c,:,:] = (mask[:,c,:,:] - Mean[c])/Std[c] # print(mask.size()) mask = make_image(torchvision.utils.make_grid(0.3*im_data+0.7*mask.expand_as(im_data))) # mask = make_image(torchvision.utils.make_grid(0.3*im_data+0.7*mask), Mean, Std) plt.subplot(1+len(masklist), 1, i+2) plt.imshow(mask) plt.axis('off') # x = torch.zeros(1, 3, 3) # out = colorize(x) # out_im = make_image(out) # plt.imshow(out_im) # plt.show() ================================================ FILE: models/__init__.py ================================================ import glob import importlib import logging import os.path as osp # automatically scan and import model modules # scan all the files under the 'models' folder and collect files ending with # '_model.py' model_folder = 
osp.dirname(osp.abspath(__file__)) model_filenames = [ osp.splitext(osp.basename(v))[0] for v in glob.glob(f'{model_folder}/*_model.py') ] # import all the model modules _model_modules = [ importlib.import_module(f'models.{file_name}') for file_name in model_filenames ] def create_model(opt): """Create model. Args: opt (dict): Configuration. It constains: model_type (str): Model type. """ model_type = opt['model_type'] # dynamically instantiation for module in _model_modules: model_cls = getattr(module, model_type, None) if model_cls is not None: break if model_cls is None: raise ValueError(f'Model {model_type} is not found.') model = model_cls(opt) logger = logging.getLogger('base') logger.info(f'Model [{model.__class__.__name__}] is created.') return model ================================================ FILE: models/archs/__init__.py ================================================ ================================================ FILE: models/archs/attribute_predictor_arch.py ================================================ import json import torch.nn as nn import torch.utils.model_zoo as model_zoo __all__ = ['ResNet', 'resnet50'] model_urls = { 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', } def conv3x3(in_planes, out_planes, stride=1): """3x3 convolution with padding""" return nn.Conv2d( in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False) def conv1x1(in_planes, out_planes, stride=1): """1x1 convolution""" return nn.Conv2d( in_planes, out_planes, kernel_size=1, stride=stride, bias=False) class BasicBlock(nn.Module): expansion = 1 def __init__(self, inplanes, planes, stride=1, downsample=None): super(BasicBlock, 
self).__init__() self.conv1 = conv3x3(inplanes, planes, stride) self.bn1 = nn.BatchNorm2d(planes) self.relu = nn.ReLU(inplace=True) self.conv2 = conv3x3(planes, planes) self.bn2 = nn.BatchNorm2d(planes) self.downsample = downsample self.stride = stride def forward(self, x): identity = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) if self.downsample is not None: identity = self.downsample(x) out += identity out = self.relu(out) return out class Bottleneck(nn.Module): expansion = 4 def __init__(self, inplanes, planes, stride=1, downsample=None): super(Bottleneck, self).__init__() self.conv1 = conv1x1(inplanes, planes) self.bn1 = nn.BatchNorm2d(planes) self.conv2 = conv3x3(planes, planes, stride) self.bn2 = nn.BatchNorm2d(planes) self.conv3 = conv1x1(planes, planes * self.expansion) self.bn3 = nn.BatchNorm2d(planes * self.expansion) self.relu = nn.ReLU(inplace=True) self.downsample = downsample self.stride = stride def forward(self, x): identity = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) out = self.relu(out) out = self.conv3(out) out = self.bn3(out) if self.downsample is not None: identity = self.downsample(x) out += identity out = self.relu(out) return out class fc_block(nn.Module): def __init__(self, inplanes, planes, drop_rate=0.15): super(fc_block, self).__init__() self.fc = nn.Linear(inplanes, planes) self.bn = nn.BatchNorm1d(planes) if drop_rate > 0: self.dropout = nn.Dropout(drop_rate) self.relu = nn.ReLU(inplace=True) self.drop_rate = drop_rate def forward(self, x): x = self.fc(x) x = self.bn(x) if self.drop_rate > 0: x = self.dropout(x) x = self.relu(x) return x class ResNet(nn.Module): def __init__(self, block, layers, attr_file, zero_init_residual=False, dropout_rate=0): super(ResNet, self).__init__() self.inplanes = 64 self.conv1 = nn.Conv2d( 3, 64, kernel_size=7, stride=2, padding=3, bias=False) self.bn1 = nn.BatchNorm2d(64) self.relu 
= nn.ReLU(inplace=True) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) self.layer1 = self._make_layer(block, 64, layers[0]) self.layer2 = self._make_layer(block, 128, layers[1], stride=2) self.layer3 = self._make_layer(block, 256, layers[2], stride=2) self.layer4 = self._make_layer(block, 512, layers[3], stride=2) self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) self.stem = fc_block(512 * block.expansion, 512, dropout_rate) # construct classifier heads according to the number of values of # each attribute self.attr_file = attr_file with open(self.attr_file, 'r') as f: attr_f = json.load(f) self.attr_info = attr_f['attr_info'] for idx, (key, val) in enumerate(self.attr_info.items()): num_val = int(len(val["value"])) setattr( self, 'classifier' + str(key).zfill(2) + val["name"], nn.Sequential( fc_block(512, 256, dropout_rate), nn.Linear(256, num_val))) for m in self.modules(): if isinstance(m, nn.Conv2d): nn.init.kaiming_normal_( m.weight, mode='fan_out', nonlinearity='relu') elif isinstance(m, nn.BatchNorm2d): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) # Zero-initialize the last BN in each residual branch, # so that the residual branch starts with zeros, and each residual # block behaves like an identity. 
# This improves the model by 0.2~0.3% according # to https://arxiv.org/abs/1706.02677 if zero_init_residual: for m in self.modules(): if isinstance(m, Bottleneck): nn.init.constant_(m.bn3.weight, 0) elif isinstance(m, BasicBlock): nn.init.constant_(m.bn2.weight, 0) def _make_layer(self, block, planes, blocks, stride=1): downsample = None if stride != 1 or self.inplanes != planes * block.expansion: downsample = nn.Sequential( conv1x1(self.inplanes, planes * block.expansion, stride), nn.BatchNorm2d(planes * block.expansion), ) layers = [] layers.append(block(self.inplanes, planes, stride, downsample)) self.inplanes = planes * block.expansion for _ in range(1, blocks): layers.append(block(self.inplanes, planes)) return nn.Sequential(*layers) def forward(self, x): x = self.conv1(x) x = self.bn1(x) x = self.relu(x) x = self.maxpool(x) x = self.layer1(x) x = self.layer2(x) x = self.layer3(x) x = self.layer4(x) x = self.avgpool(x) x = x.view(x.size(0), -1) x = self.stem(x) y = [] for idx, (key, val) in enumerate(self.attr_info.items()): classifier = getattr( self, 'classifier' + str(key).zfill(2) + val["name"]) y.append(classifier(x)) return y def resnet50(pretrained=True, **kwargs): """Constructs a ResNet-50 model. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet """ model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) if pretrained: init_pretrained_weights(model, model_urls['resnet50']) return model def init_pretrained_weights(model, model_url): """ Initialize model with pretrained weights. Layers that don't match with pretrained layers in name or size are kept unchanged. 
""" pretrain_dict = model_zoo.load_url(model_url) model_dict = model.state_dict() pretrain_dict = { k: v for k, v in pretrain_dict.items() if k in model_dict and model_dict[k].size() == v.size() } model_dict.update(pretrain_dict) model.load_state_dict(model_dict) print( "Initialized model with pretrained weights from {}".format(model_url)) ================================================ FILE: models/archs/field_function_arch.py ================================================ import torch import torch.nn as nn class FieldFunction(nn.Module): def __init__( self, num_layer=4, latent_dim=512, hidden_dim=512, leaky_relu_neg_slope=0.2, ): super(FieldFunction, self).__init__() layers = [] # first layer linear_layer = LinearLayer( in_dim=latent_dim, out_dim=hidden_dim, activation=True, negative_slope=leaky_relu_neg_slope) layers.append(linear_layer) # hidden layers for i in range(num_layer - 2): linear_layer = LinearLayer( in_dim=hidden_dim, out_dim=hidden_dim, activation=True, negative_slope=leaky_relu_neg_slope) layers.append(linear_layer) # final layers linear_layer = LinearLayer( in_dim=hidden_dim, out_dim=latent_dim, activation=False) layers.append(linear_layer) self.field = nn.Sequential(*layers) def forward(self, x): x = self.field(x) return x class LinearLayer(nn.Module): def __init__( self, in_dim=512, out_dim=512, activation=True, negative_slope=0.2, ): super(LinearLayer, self).__init__() self.Linear = nn.Linear( in_features=in_dim, out_features=out_dim, bias=True) self.activation = activation if activation: self.leaky_relu = nn.LeakyReLU( negative_slope=negative_slope, inplace=False) def forward(self, x): x = self.Linear(x) if self.activation: x = self.leaky_relu(x) return x class Normalization(nn.Module): def __init__(self, ): super(Normalization, self).__init__() self.mean = torch.tensor([0.485, 0.456, 0.406 ]).unsqueeze(-1).unsqueeze(-1).to('cuda') print(self.mean.shape) self.std = torch.tensor([0.229, 0.224, 0.225]).unsqueeze(-1).unsqueeze(-1).to('cuda') 
def forward(self, x): x = torch.sub(x, self.mean) x = torch.div(x, self.std) return x ================================================ FILE: models/archs/stylegan2/.gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ wandb/ *.lmdb/ *.pkl ================================================ FILE: models/archs/stylegan2/LICENSE ================================================ MIT License Copyright (c) 2019 Kim Seonghyeon Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: models/archs/stylegan2/LICENSE-FID ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. 
"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: models/archs/stylegan2/LICENSE-LPIPS ================================================ Copyright (c) 2018, Richard Zhang, Phillip Isola, Alexei A. Efros, Eli Shechtman, Oliver Wang All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ================================================ FILE: models/archs/stylegan2/LICENSE-NVIDIA ================================================ Copyright (c) 2019, NVIDIA Corporation. All rights reserved. Nvidia Source Code License-NC ======================================================================= 1. Definitions "Licensor" means any person or entity that distributes its Work. "Software" means the original work of authorship made available under this License. "Work" means the Software and any additions to or derivative works of the Software that are made available under this License. "Nvidia Processors" means any central processing unit (CPU), graphics processing unit (GPU), field-programmable gate array (FPGA), application-specific integrated circuit (ASIC) or any combination thereof designed, made, sold, or provided by Nvidia or its affiliates. The terms "reproduce," "reproduction," "derivative works," and "distribution" have the meaning as provided under U.S. copyright law; provided, however, that for the purposes of this License, derivative works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work. 
Works, including the Software, are "made available" under this License by including in or with the Work either (a) a copyright notice referencing the applicability of this License to the Work, or (b) a copy of this License. 2. License Grants 2.1 Copyright Grant. Subject to the terms and conditions of this License, each Licensor grants to you a perpetual, worldwide, non-exclusive, royalty-free, copyright license to reproduce, prepare derivative works of, publicly display, publicly perform, sublicense and distribute its Work and any resulting derivative works in any form. 3. Limitations 3.1 Redistribution. You may reproduce or distribute the Work only if (a) you do so under this License, (b) you include a complete copy of this License with your distribution, and (c) you retain without modification any copyright, patent, trademark, or attribution notices that are present in the Work. 3.2 Derivative Works. You may specify that additional or different terms apply to the use, reproduction, and distribution of your derivative works of the Work ("Your Terms") only if (a) Your Terms provide that the use limitation in Section 3.3 applies to your derivative works, and (b) you identify the specific derivative works that are subject to Your Terms. Notwithstanding Your Terms, this License (including the redistribution requirements in Section 3.1) will continue to apply to the Work itself. 3.3 Use Limitation. The Work and any derivative works thereof only may be used or intended for use non-commercially. The Work or derivative works thereof may be used or intended for use by Nvidia or its affiliates commercially or non-commercially. As used herein, "non-commercially" means for research or evaluation purposes only. 3.4 Patent Claims. 
If you bring or threaten to bring a patent claim against any Licensor (including any claim, cross-claim or counterclaim in a lawsuit) to enforce any patents that you allege are infringed by any Work, then your rights under this License from such Licensor (including the grants in Sections 2.1 and 2.2) will terminate immediately. 3.5 Trademarks. This License does not grant any rights to use any Licensor's or its affiliates' names, logos, or trademarks, except as necessary to reproduce the notices described in this License. 3.6 Termination. If you violate any term of this License, then your rights under this License (including the grants in Sections 2.1 and 2.2) will terminate immediately. 4. Disclaimer of Warranty. THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE. 5. Limitation of Liability. EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 
======================================================================= ================================================ FILE: models/archs/stylegan2/__init__.py ================================================ ================================================ FILE: models/archs/stylegan2/apply_factor.py ================================================ import argparse import torch from torchvision import utils from model import Generator if __name__ == "__main__": torch.set_grad_enabled(False) parser = argparse.ArgumentParser(description="Apply closed form factorization") parser.add_argument( "-i", "--index", type=int, default=0, help="index of eigenvector" ) parser.add_argument( "-d", "--degree", type=float, default=5, help="scalar factors for moving latent vectors along eigenvector", ) parser.add_argument( "--channel_multiplier", type=int, default=2, help='channel multiplier factor. config-f = 2, else = 1', ) parser.add_argument("--ckpt", type=str, required=True, help="stylegan2 checkpoints") parser.add_argument( "--size", type=int, default=256, help="output image size of the generator" ) parser.add_argument( "-n", "--n_sample", type=int, default=7, help="number of samples created" ) parser.add_argument( "--truncation", type=float, default=0.7, help="truncation factor" ) parser.add_argument( "--device", type=str, default="cuda", help="device to run the model" ) parser.add_argument( "--out_prefix", type=str, default="factor", help="filename prefix to result samples", ) parser.add_argument( "factor", type=str, help="name of the closed form factorization result factor file", ) args = parser.parse_args() eigvec = torch.load(args.factor)["eigvec"].to(args.device) ckpt = torch.load(args.ckpt) g = Generator(args.size, 512, 8, channel_multiplier=args.channel_multiplier).to(args.device) g.load_state_dict(ckpt["g_ema"], strict=False) trunc = g.mean_latent(4096) latent = torch.randn(args.n_sample, 512, device=args.device) latent = g.get_latent(latent) direction = args.degree 
* eigvec[:, args.index].unsqueeze(0) img, _ = g( [latent], truncation=args.truncation, truncation_latent=trunc, input_is_latent=True, ) img1, _ = g( [latent + direction], truncation=args.truncation, truncation_latent=trunc, input_is_latent=True, ) img2, _ = g( [latent - direction], truncation=args.truncation, truncation_latent=trunc, input_is_latent=True, ) grid = utils.save_image( torch.cat([img1, img, img2], 0), f"{args.out_prefix}_index-{args.index}_degree-{args.degree}.png", normalize=True, range=(-1, 1), nrow=args.n_sample, ) ================================================ FILE: models/archs/stylegan2/calc_inception.py ================================================ import argparse import pickle import os import torch from torch import nn from torch.nn import functional as F from torch.utils.data import DataLoader from torchvision import transforms from torchvision.models import inception_v3, Inception3 import numpy as np from tqdm import tqdm from inception import InceptionV3 from dataset import MultiResolutionDataset class Inception3Feature(Inception3): def forward(self, x): if x.shape[2] != 299 or x.shape[3] != 299: x = F.interpolate(x, size=(299, 299), mode="bilinear", align_corners=True) x = self.Conv2d_1a_3x3(x) # 299 x 299 x 3 x = self.Conv2d_2a_3x3(x) # 149 x 149 x 32 x = self.Conv2d_2b_3x3(x) # 147 x 147 x 32 x = F.max_pool2d(x, kernel_size=3, stride=2) # 147 x 147 x 64 x = self.Conv2d_3b_1x1(x) # 73 x 73 x 64 x = self.Conv2d_4a_3x3(x) # 73 x 73 x 80 x = F.max_pool2d(x, kernel_size=3, stride=2) # 71 x 71 x 192 x = self.Mixed_5b(x) # 35 x 35 x 192 x = self.Mixed_5c(x) # 35 x 35 x 256 x = self.Mixed_5d(x) # 35 x 35 x 288 x = self.Mixed_6a(x) # 35 x 35 x 288 x = self.Mixed_6b(x) # 17 x 17 x 768 x = self.Mixed_6c(x) # 17 x 17 x 768 x = self.Mixed_6d(x) # 17 x 17 x 768 x = self.Mixed_6e(x) # 17 x 17 x 768 x = self.Mixed_7a(x) # 17 x 17 x 768 x = self.Mixed_7b(x) # 8 x 8 x 1280 x = self.Mixed_7c(x) # 8 x 8 x 2048 x = F.avg_pool2d(x, kernel_size=8) # 8 x 8 
x 2048 return x.view(x.shape[0], x.shape[1]) # 1 x 1 x 2048 def load_patched_inception_v3(): # inception = inception_v3(pretrained=True) # inception_feat = Inception3Feature() # inception_feat.load_state_dict(inception.state_dict()) inception_feat = InceptionV3([3], normalize_input=False) return inception_feat @torch.no_grad() def extract_features(loader, inception, device): pbar = tqdm(loader) feature_list = [] for img in pbar: img = img.to(device) feature = inception(img)[0].view(img.shape[0], -1) feature_list.append(feature.to("cpu")) features = torch.cat(feature_list, 0) return features if __name__ == "__main__": device = torch.device("cuda" if torch.cuda.is_available() else "cpu") parser = argparse.ArgumentParser( description="Calculate Inception v3 features for datasets" ) parser.add_argument( "--size", type=int, default=256, help="image sizes used for embedding calculation", ) parser.add_argument( "--batch", default=64, type=int, help="batch size for inception networks" ) parser.add_argument( "--n_sample", type=int, default=50000, help="number of samples used for embedding calculation", ) parser.add_argument( "--flip", action="store_true", help="apply random flipping to real images" ) parser.add_argument("path", metavar="PATH", help="path to datset lmdb file") args = parser.parse_args() inception = load_patched_inception_v3() inception = nn.DataParallel(inception).eval().to(device) transform = transforms.Compose( [ transforms.RandomHorizontalFlip(p=0.5 if args.flip else 0), transforms.ToTensor(), transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]), ] ) dset = MultiResolutionDataset(args.path, transform=transform, resolution=args.size) loader = DataLoader(dset, batch_size=args.batch, num_workers=4) features = extract_features(loader, inception, device).numpy() features = features[: args.n_sample] print(f"extracted {features.shape[0]} features") mean = np.mean(features, 0) cov = np.cov(features, rowvar=False) name = 
os.path.splitext(os.path.basename(args.path))[0] with open(f"inception_{name}.pkl", "wb") as f: pickle.dump({"mean": mean, "cov": cov, "size": args.size, "path": args.path}, f) ================================================ FILE: models/archs/stylegan2/checkpoint/.gitignore ================================================ *.pt ================================================ FILE: models/archs/stylegan2/convert_weight.py ================================================ import argparse import math import os import pickle import sys import numpy as np import torch from torchvision import utils from model import Discriminator, Generator def convert_modconv(vars, source_name, target_name, flip=False): weight = vars[source_name + "/weight"].value().eval() mod_weight = vars[source_name + "/mod_weight"].value().eval() mod_bias = vars[source_name + "/mod_bias"].value().eval() noise = vars[source_name + "/noise_strength"].value().eval() bias = vars[source_name + "/bias"].value().eval() dic = { "conv.weight": np.expand_dims(weight.transpose((3, 2, 0, 1)), 0), "conv.modulation.weight": mod_weight.transpose((1, 0)), "conv.modulation.bias": mod_bias + 1, "noise.weight": np.array([noise]), "activate.bias": bias, } dic_torch = {} for k, v in dic.items(): dic_torch[target_name + "." 
def convert_modconv(vars, source_name, target_name, flip=False):
    """Map one TF modulated-conv layer to PyTorch state-dict entries."""
    weight = vars[source_name + "/weight"].value().eval()
    mod_weight = vars[source_name + "/mod_weight"].value().eval()
    mod_bias = vars[source_name + "/mod_bias"].value().eval()
    noise = vars[source_name + "/noise_strength"].value().eval()
    bias = vars[source_name + "/bias"].value().eval()

    dic = {
        "conv.weight": np.expand_dims(weight.transpose((3, 2, 0, 1)), 0),
        "conv.modulation.weight": mod_weight.transpose((1, 0)),
        "conv.modulation.bias": mod_bias + 1,
        "noise.weight": np.array([noise]),
        "activate.bias": bias,
    }

    dic_torch = {}

    for k, v in dic.items():
        dic_torch[target_name + "." + k] = torch.from_numpy(v)

    if flip:
        # Upsampling convs need their kernels flipped to match PyTorch's
        # transposed-conv orientation.
        dic_torch[target_name + ".conv.weight"] = torch.flip(
            dic_torch[target_name + ".conv.weight"], [3, 4])

    return dic_torch


def convert_conv(vars, source_name, target_name, bias=True, start=0):
    """Map a plain TF conv layer to PyTorch state-dict entries.

    `start` is the index of the conv module inside the target nn.Sequential;
    the bias lives on the activation module that follows it (start + 1).
    """
    weight = vars[source_name + "/weight"].value().eval()

    dic = {"weight": weight.transpose((3, 2, 0, 1))}

    if bias:
        dic["bias"] = vars[source_name + "/bias"].value().eval()

    dic_torch = {}

    dic_torch[target_name + f".{start}.weight"] = torch.from_numpy(dic["weight"])

    if bias:
        dic_torch[target_name + f".{start + 1}.bias"] = torch.from_numpy(dic["bias"])

    return dic_torch


def convert_torgb(vars, source_name, target_name):
    """Map a TF ToRGB layer to PyTorch state-dict entries."""
    weight = vars[source_name + "/weight"].value().eval()
    mod_weight = vars[source_name + "/mod_weight"].value().eval()
    mod_bias = vars[source_name + "/mod_bias"].value().eval()
    bias = vars[source_name + "/bias"].value().eval()

    dic = {
        "conv.weight": np.expand_dims(weight.transpose((3, 2, 0, 1)), 0),
        "conv.modulation.weight": mod_weight.transpose((1, 0)),
        "conv.modulation.bias": mod_bias + 1,
        "bias": bias.reshape((1, 3, 1, 1)),
    }

    dic_torch = {}

    for k, v in dic.items():
        dic_torch[target_name + "." + k] = torch.from_numpy(v)

    return dic_torch


def convert_dense(vars, source_name, target_name):
    """Map a TF dense (fully-connected) layer to PyTorch state-dict entries."""
    weight = vars[source_name + "/weight"].value().eval()
    bias = vars[source_name + "/bias"].value().eval()

    dic = {"weight": weight.transpose((1, 0)), "bias": bias}

    dic_torch = {}

    for k, v in dic.items():
        dic_torch[target_name + "." + k] = torch.from_numpy(v)

    return dic_torch


def update(state_dict, new):
    """Copy entries of `new` into `state_dict` in place.

    Raises KeyError for unknown keys and ValueError on shape mismatch so a
    bad conversion fails loudly instead of producing a silently broken model.
    """
    for k, v in new.items():
        if k not in state_dict:
            raise KeyError(k + " is not found")

        if v.shape != state_dict[k].shape:
            raise ValueError(f"Shape mismatch: {v.shape} vs {state_dict[k].shape}")

        state_dict[k] = v


def discriminator_fill_statedict(statedict, vars, size):
    """Fill a PyTorch Discriminator state dict from TF variables."""
    log_size = int(math.log(size, 2))

    update(statedict, convert_conv(vars, f"{size}x{size}/FromRGB", "convs.0"))

    conv_i = 1

    for i in range(log_size - 2, 0, -1):
        reso = 4 * 2**i
        update(
            statedict,
            convert_conv(vars, f"{reso}x{reso}/Conv0", f"convs.{conv_i}.conv1"),
        )
        update(
            statedict,
            convert_conv(
                vars, f"{reso}x{reso}/Conv1_down", f"convs.{conv_i}.conv2", start=1),
        )
        update(
            statedict,
            convert_conv(
                vars, f"{reso}x{reso}/Skip", f"convs.{conv_i}.skip", start=1,
                bias=False),
        )
        conv_i += 1

    update(statedict, convert_conv(vars, f"4x4/Conv", "final_conv"))
    update(statedict, convert_dense(vars, f"4x4/Dense0", "final_linear.0"))
    update(statedict, convert_dense(vars, f"Output", "final_linear.1"))

    return statedict


def fill_statedict(state_dict, vars, size, n_mlp):
    """Fill a PyTorch Generator state dict from TF variables."""
    log_size = int(math.log(size, 2))

    for i in range(n_mlp):
        update(state_dict,
               convert_dense(vars, f"G_mapping/Dense{i}", f"style.{i + 1}"))

    update(
        state_dict,
        {
            "input.input": torch.from_numpy(
                vars["G_synthesis/4x4/Const/const"].value().eval())
        },
    )

    update(state_dict, convert_torgb(vars, "G_synthesis/4x4/ToRGB", "to_rgb1"))

    for i in range(log_size - 2):
        reso = 4 * 2**(i + 1)
        update(
            state_dict,
            convert_torgb(vars, f"G_synthesis/{reso}x{reso}/ToRGB", f"to_rgbs.{i}"),
        )

    update(state_dict, convert_modconv(vars, "G_synthesis/4x4/Conv", "conv1"))

    conv_i = 0

    for i in range(log_size - 2):
        reso = 4 * 2**(i + 1)
        update(
            state_dict,
            convert_modconv(
                vars,
                f"G_synthesis/{reso}x{reso}/Conv0_up",
                f"convs.{conv_i}",
                flip=True,
            ),
        )
        update(
            state_dict,
            convert_modconv(vars, f"G_synthesis/{reso}x{reso}/Conv1",
                            f"convs.{conv_i + 1}"))
        conv_i += 2

    for i in range(0, (log_size - 2) * 2 + 1):
        update(
            state_dict,
            {
                f"noises.noise_{i}":
                torch.from_numpy(vars[f"G_synthesis/noise{i}"].value().eval())
            },
        )

    return state_dict


if __name__ == "__main__":
    device = "cuda"

    parser = argparse.ArgumentParser(
        description="Tensorflow to pytorch model checkpoint converter")
    parser.add_argument(
        "--repo",
        type=str,
        required=True,
        help="path to the offical StyleGAN2 repository with dnnlib/ folder",
    )
    parser.add_argument(
        "--gen", action="store_true", help="convert the generator weights")
    parser.add_argument(
        "--disc", action="store_true", help="convert the discriminator weights")
    parser.add_argument(
        "--channel_multiplier",
        type=int,
        default=2,
        help="channel multiplier factor. config-f = 2, else = 1",
    )
    parser.add_argument("path", metavar="PATH", help="path to the tensorflow weights")

    args = parser.parse_args()

    sys.path.append(args.repo)

    import dnnlib
    from dnnlib import tflib

    tflib.init_tf()

    with open(args.path, "rb") as f:
        generator, discriminator, g_ema = pickle.load(f)

    size = g_ema.output_shape[2]
    print(size)

    # BUGFIX: a stray, misspelled "raie NotImplementedError" statement used to
    # sit here; it raised a NameError and made the converter unusable. Removed.
    n_mlp = 0
    mapping_layers_names = g_ema.__getstate__(
    )['components']['mapping'].list_layers()
    for layer in mapping_layers_names:
        if layer[0].startswith('Dense'):
            n_mlp += 1

    g = Generator(size, 512, n_mlp, channel_multiplier=args.channel_multiplier)
    state_dict = g.state_dict()
    state_dict = fill_statedict(state_dict, g_ema.vars, size, n_mlp)

    g.load_state_dict(state_dict)

    latent_avg = torch.from_numpy(g_ema.vars["dlatent_avg"].value().eval())

    ckpt = {"g_ema": state_dict, "latent_avg": latent_avg}

    if args.gen:
        g_train = Generator(
            size, 512, n_mlp, channel_multiplier=args.channel_multiplier)
        g_train_state = g_train.state_dict()
        # BUGFIX: the required n_mlp argument was missing here, so --gen
        # crashed with a TypeError before converting anything.
        g_train_state = fill_statedict(g_train_state, generator.vars, size, n_mlp)
        ckpt["g"] = g_train_state

    if args.disc:
        disc = Discriminator(size, channel_multiplier=args.channel_multiplier)
        d_state = disc.state_dict()
        d_state = discriminator_fill_statedict(d_state, discriminator.vars, size)
        ckpt["d"] = d_state

    name = os.path.splitext(os.path.basename(args.path))[0]
    torch.save(ckpt, name + ".pt")

    batch_size = {256: 16, 512: 9, 1024: 4}
    n_sample = batch_size.get(size, 25)

    g = g.to(device)

    z = np.random.RandomState(0).randn(n_sample, 512).astype("float32")

    with torch.no_grad():
        img_pt, _ = g(
            [torch.from_numpy(z).to(device)],
            truncation=0.5,
            truncation_latent=latent_avg.to(device),
            randomize_noise=False,
        )

    Gs_kwargs = dnnlib.EasyDict()
    Gs_kwargs.randomize_noise = False
    img_tf = g_ema.run(z, None, **Gs_kwargs)
    img_tf = torch.from_numpy(img_tf).to(device)

    img_diff = ((img_pt + 1) / 2).clamp(0.0, 1.0) - (
        (img_tf.to(device) + 1) / 2).clamp(0.0, 1.0)

    img_concat = torch.cat((img_tf, img_pt, img_diff), dim=0)

    print(img_diff.abs().max())

    utils.save_image(
        img_concat, name + ".png", nrow=n_sample, normalize=True, range=(-1, 1))
os.path.splitext(os.path.basename(args.path))[0] torch.save(ckpt, name + ".pt") batch_size = {256: 16, 512: 9, 1024: 4} n_sample = batch_size.get(size, 25) g = g.to(device) z = np.random.RandomState(0).randn(n_sample, 512).astype("float32") with torch.no_grad(): img_pt, _ = g( [torch.from_numpy(z).to(device)], truncation=0.5, truncation_latent=latent_avg.to(device), randomize_noise=False, ) Gs_kwargs = dnnlib.EasyDict() Gs_kwargs.randomize_noise = False img_tf = g_ema.run(z, None, **Gs_kwargs) img_tf = torch.from_numpy(img_tf).to(device) img_diff = ((img_pt + 1) / 2).clamp(0.0, 1.0) - ( (img_tf.to(device) + 1) / 2).clamp(0.0, 1.0) img_concat = torch.cat((img_tf, img_pt, img_diff), dim=0) print(img_diff.abs().max()) utils.save_image( img_concat, name + ".png", nrow=n_sample, normalize=True, range=(-1, 1)) ================================================ FILE: models/archs/stylegan2/dataset.py ================================================ from io import BytesIO import lmdb from PIL import Image from torch.utils.data import Dataset class MultiResolutionDataset(Dataset): def __init__(self, path, transform, resolution=256): self.env = lmdb.open( path, max_readers=32, readonly=True, lock=False, readahead=False, meminit=False, ) if not self.env: raise IOError('Cannot open lmdb dataset', path) with self.env.begin(write=False) as txn: self.length = int(txn.get('length'.encode('utf-8')).decode('utf-8')) self.resolution = resolution self.transform = transform def __len__(self): return self.length def __getitem__(self, index): with self.env.begin(write=False) as txn: key = f'{self.resolution}-{str(index).zfill(5)}'.encode('utf-8') img_bytes = txn.get(key) buffer = BytesIO(img_bytes) img = Image.open(buffer) img = self.transform(img) return img ================================================ FILE: models/archs/stylegan2/distributed.py ================================================ import math import pickle import torch from torch import distributed as dist from 
from torch.utils.data.sampler import Sampler


def get_rank():
    """Rank of this process, or 0 when not running under torch.distributed."""
    if not (dist.is_available() and dist.is_initialized()):
        return 0
    return dist.get_rank()


def synchronize():
    """Barrier across all workers; a no-op outside multi-process runs."""
    if not (dist.is_available() and dist.is_initialized()):
        return
    if dist.get_world_size() == 1:
        return
    dist.barrier()


def get_world_size():
    """Number of participating processes (1 when not distributed)."""
    if not (dist.is_available() and dist.is_initialized()):
        return 1
    return dist.get_world_size()


def reduce_sum(tensor):
    """Element-wise sum of `tensor` over all workers; identity otherwise."""
    if not (dist.is_available() and dist.is_initialized()):
        return tensor

    summed = tensor.clone()
    dist.all_reduce(summed, op=dist.ReduceOp.SUM)
    return summed


def gather_grad(params):
    """Average the gradients of `params` across workers, in place."""
    world = get_world_size()
    if world == 1:
        return

    for param in params:
        if param.grad is not None:
            dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
            param.grad.data.div_(world)


def all_gather(data):
    """Gather an arbitrary picklable object from every worker into a list."""
    world = get_world_size()
    if world == 1:
        return [data]

    # Serialize to a byte tensor on the GPU.
    payload = pickle.dumps(data)
    storage = torch.ByteStorage.from_buffer(payload)
    tensor = torch.ByteTensor(storage).to('cuda')

    # Exchange sizes so every rank can pad to the common maximum.
    local_size = torch.IntTensor([tensor.numel()]).to('cuda')
    size_list = [torch.IntTensor([0]).to('cuda') for _ in range(world)]
    dist.all_gather(size_list, local_size)
    size_list = [int(sz.item()) for sz in size_list]
    max_size = max(size_list)

    tensor_list = [torch.ByteTensor(size=(max_size,)).to('cuda') for _ in size_list]

    if local_size.item() != max_size:
        padding = torch.ByteTensor(size=(max_size - local_size,)).to('cuda')
        tensor = torch.cat((tensor, padding), 0)

    dist.all_gather(tensor_list, tensor)

    # Trim the padding back off and deserialize each worker's object.
    gathered = []
    for size, buf in zip(size_list, tensor_list):
        gathered.append(pickle.loads(buf.cpu().numpy().tobytes()[:size]))

    return gathered


def reduce_loss_dict(loss_dict):
    """Average a dict of scalar losses over workers (rank 0 gets the mean)."""
    world = get_world_size()
    if world < 2:
        return loss_dict

    with torch.no_grad():
        names = sorted(loss_dict.keys())
        stacked = torch.stack([loss_dict[k] for k in names], 0)

        dist.reduce(stacked, dst=0)

        if dist.get_rank() == 0:
            stacked /= world

        return dict(zip(names, stacked))
torch.stack(losses, 0) dist.reduce(losses, dst=0) if dist.get_rank() == 0: losses /= world_size reduced_losses = {k: v for k, v in zip(keys, losses)} return reduced_losses ================================================ FILE: models/archs/stylegan2/fid.py ================================================ import argparse import pickle import torch from torch import nn import numpy as np from scipy import linalg from tqdm import tqdm from model import Generator from calc_inception import load_patched_inception_v3 @torch.no_grad() def extract_feature_from_samples( generator, inception, truncation, truncation_latent, batch_size, n_sample, device ): n_batch = n_sample // batch_size resid = n_sample - (n_batch * batch_size) batch_sizes = [batch_size] * n_batch + [resid] features = [] for batch in tqdm(batch_sizes): latent = torch.randn(batch, 512, device=device) img, _ = g([latent], truncation=truncation, truncation_latent=truncation_latent) feat = inception(img)[0].view(img.shape[0], -1) features.append(feat.to("cpu")) features = torch.cat(features, 0) return features def calc_fid(sample_mean, sample_cov, real_mean, real_cov, eps=1e-6): cov_sqrt, _ = linalg.sqrtm(sample_cov @ real_cov, disp=False) if not np.isfinite(cov_sqrt).all(): print("product of cov matrices is singular") offset = np.eye(sample_cov.shape[0]) * eps cov_sqrt = linalg.sqrtm((sample_cov + offset) @ (real_cov + offset)) if np.iscomplexobj(cov_sqrt): if not np.allclose(np.diagonal(cov_sqrt).imag, 0, atol=1e-3): m = np.max(np.abs(cov_sqrt.imag)) raise ValueError(f"Imaginary component {m}") cov_sqrt = cov_sqrt.real mean_diff = sample_mean - real_mean mean_norm = mean_diff @ mean_diff trace = np.trace(sample_cov) + np.trace(real_cov) - 2 * np.trace(cov_sqrt) fid = mean_norm + trace return fid if __name__ == "__main__": device = "cuda" parser = argparse.ArgumentParser(description="Calculate FID scores") parser.add_argument("--truncation", type=float, default=1, help="truncation factor") parser.add_argument( 
"--truncation_mean", type=int, default=4096, help="number of samples to calculate mean for truncation", ) parser.add_argument( "--batch", type=int, default=64, help="batch size for the generator" ) parser.add_argument( "--n_sample", type=int, default=50000, help="number of the samples for calculating FID", ) parser.add_argument( "--size", type=int, default=256, help="image sizes for generator" ) parser.add_argument( "--inception", type=str, default=None, required=True, help="path to precomputed inception embedding", ) parser.add_argument( "ckpt", metavar="CHECKPOINT", help="path to generator checkpoint" ) args = parser.parse_args() ckpt = torch.load(args.ckpt) g = Generator(args.size, 512, 8).to(device) g.load_state_dict(ckpt["g_ema"]) g = nn.DataParallel(g) g.eval() if args.truncation < 1: with torch.no_grad(): mean_latent = g.mean_latent(args.truncation_mean) else: mean_latent = None inception = nn.DataParallel(load_patched_inception_v3()).to(device) inception.eval() features = extract_feature_from_samples( g, inception, args.truncation, mean_latent, args.batch, args.n_sample, device ).numpy() print(f"extracted {features.shape[0]} features") sample_mean = np.mean(features, 0) sample_cov = np.cov(features, rowvar=False) with open(args.inception, "rb") as f: embeds = pickle.load(f) real_mean = embeds["mean"] real_cov = embeds["cov"] fid = calc_fid(sample_mean, sample_cov, real_mean, real_cov) print("fid:", fid) ================================================ FILE: models/archs/stylegan2/generate.py ================================================ import argparse import os import sys import numpy as np import torch from torchvision import utils from tqdm import tqdm sys.path.append('..') from stylegan2_pytorch.model import Generator def generate(args, g_ema, device, mean_latent): if not os.path.exists(args.synthetic_image_dir): os.makedirs(args.synthetic_image_dir) latent_code = {} w_space_code = {} with torch.no_grad(): g_ema.eval() for i in 
tqdm(range(args.pics)): sample_z = torch.randn(args.sample, args.latent, device=device) sample, w_space = g_ema([sample_z], truncation=args.truncation, truncation_latent=mean_latent, return_latents=True, randomize_noise=False) utils.save_image( sample, os.path.join(args.synthetic_image_dir, f"{str(i).zfill(7)}.png"), nrow=1, normalize=True, range=(-1, 1), ) latent_code[f"{str(i).zfill(7)}.png"] = sample_z.cpu().numpy() w_space_code[f"{str(i).zfill(7)}.png"] = w_space.cpu().numpy() # save latent code np.save(f'{args.synthetic_image_dir}/latent_code.npz', latent_code) np.save(f'{args.synthetic_image_dir}/w_space_code.npz', w_space_code) if __name__ == "__main__": device = "cuda" parser = argparse.ArgumentParser( description="Generate samples from the generator") parser.add_argument( "--size", type=int, default=1024, help="output image size of the generator") parser.add_argument( "--sample", type=int, default=1, help="number of samples to be generated for each image", ) parser.add_argument( "--pics", type=int, default=20, help="number of images to be generated") parser.add_argument( "--truncation", type=float, default=1, help="truncation ratio") parser.add_argument( "--truncation_mean", type=int, default=4096, help="number of vectors to calculate mean for the truncation", ) parser.add_argument( "--ckpt", type=str, default="stylegan2-ffhq-config-f.pt", help="path to the model checkpoint", ) parser.add_argument( "--channel_multiplier", type=int, default=2, help="channel multiplier of the generator. config-f = 2, else = 1", ) parser.add_argument( "--synthetic_image_dir", default='', help="channel multiplier of the generator. 
config-f = 2, else = 1", ) args = parser.parse_args() args.latent = 512 args.n_mlp = 8 g_ema = Generator( args.size, args.latent, args.n_mlp, channel_multiplier=args.channel_multiplier).to(device) checkpoint = torch.load(args.ckpt) g_ema.load_state_dict(checkpoint["g_ema"]) if args.truncation < 1: with torch.no_grad(): mean_latent = g_ema.mean_latent(args.truncation_mean) else: mean_latent = None generate(args, g_ema, device, mean_latent) ================================================ FILE: models/archs/stylegan2/inception.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F from torchvision import models try: from torchvision.models.utils import load_state_dict_from_url except ImportError: from torch.utils.model_zoo import load_url as load_state_dict_from_url # Inception weights ported to Pytorch from # http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz FID_WEIGHTS_URL = 'https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth' class InceptionV3(nn.Module): """Pretrained InceptionV3 network returning feature maps""" # Index of default block of inception to return, # corresponds to output of final average pooling DEFAULT_BLOCK_INDEX = 3 # Maps feature dimensionality to their output blocks indices BLOCK_INDEX_BY_DIM = { 64: 0, # First max pooling features 192: 1, # Second max pooling featurs 768: 2, # Pre-aux classifier features 2048: 3 # Final average pooling features } def __init__(self, output_blocks=[DEFAULT_BLOCK_INDEX], resize_input=True, normalize_input=True, requires_grad=False, use_fid_inception=True): """Build pretrained InceptionV3 Parameters ---------- output_blocks : list of int Indices of blocks to return features of. 
            Possible values are:
                - 0: corresponds to output of first max pooling
                - 1: corresponds to output of second max pooling
                - 2: corresponds to output which is fed to aux classifier
                - 3: corresponds to output of final average pooling
        resize_input : bool
            If true, bilinearly resizes input to width and height 299 before
            feeding input to model. As the network without fully connected
            layers is fully convolutional, it should be able to handle inputs
            of arbitrary size, so resizing might not be strictly needed
        normalize_input : bool
            If true, scales the input from range (0, 1) to the range the
            pretrained Inception network expects, namely (-1, 1)
        requires_grad : bool
            If true, parameters of the model require gradients. Possibly
            useful for finetuning the network
        use_fid_inception : bool
            If true, uses the pretrained Inception model used in Tensorflow's
            FID implementation. If false, uses the pretrained Inception model
            available in torchvision. The FID Inception model has different
            weights and a slightly different structure from torchvision's
            Inception model. If you want to compute FID scores, you are
            strongly advised to set this parameter to true to get comparable
            results.
        """
        super(InceptionV3, self).__init__()

        self.resize_input = resize_input
        self.normalize_input = normalize_input
        self.output_blocks = sorted(output_blocks)
        self.last_needed_block = max(output_blocks)

        assert self.last_needed_block <= 3, \
            'Last possible output block index is 3'

        # Each entry in self.blocks maps to one of the four feature stages
        # described in BLOCK_INDEX_BY_DIM; only stages up to the last
        # requested block are constructed.
        self.blocks = nn.ModuleList()

        if use_fid_inception:
            inception = fid_inception_v3()
        else:
            inception = models.inception_v3(pretrained=True)

        # Block 0: input to maxpool1
        block0 = [
            inception.Conv2d_1a_3x3, inception.Conv2d_2a_3x3,
            inception.Conv2d_2b_3x3,
            nn.MaxPool2d(kernel_size=3, stride=2)
        ]
        self.blocks.append(nn.Sequential(*block0))

        # Block 1: maxpool1 to maxpool2
        if self.last_needed_block >= 1:
            block1 = [
                inception.Conv2d_3b_1x1, inception.Conv2d_4a_3x3,
                nn.MaxPool2d(kernel_size=3, stride=2)
            ]
            self.blocks.append(nn.Sequential(*block1))

        # Block 2: maxpool2 to aux classifier
        if self.last_needed_block >= 2:
            block2 = [
                inception.Mixed_5b,
                inception.Mixed_5c,
                inception.Mixed_5d,
                inception.Mixed_6a,
                inception.Mixed_6b,
                inception.Mixed_6c,
                inception.Mixed_6d,
                inception.Mixed_6e,
            ]
            self.blocks.append(nn.Sequential(*block2))

        # Block 3: aux classifier to final avgpool
        if self.last_needed_block >= 3:
            block3 = [
                inception.Mixed_7a, inception.Mixed_7b, inception.Mixed_7c,
                nn.AdaptiveAvgPool2d(output_size=(1, 1))
            ]
            self.blocks.append(nn.Sequential(*block3))

        # Freeze (or unfreeze) everything according to requires_grad.
        for param in self.parameters():
            param.requires_grad = requires_grad

    def forward(self, inp):
        """Get Inception feature maps

        Parameters
        ----------
        inp : torch.autograd.Variable
            Input tensor of shape Bx3xHxW. Values are expected to be in range
            (0, 1)

        Returns
        -------
        List of torch.autograd.Variable, corresponding to the selected output
        block, sorted ascending by index
        """
        outp = []
        x = inp

        if self.resize_input:
            x = F.interpolate(
                x, size=(299, 299), mode='bilinear', align_corners=False)

        if self.normalize_input:
            x = 2 * x - 1  # Scale from range (0, 1) to range (-1, 1)

        for idx, block in enumerate(self.blocks):
            x = block(x)
            if idx in self.output_blocks:
                outp.append(x)

            # Stop early once every requested block has been produced.
            if idx == self.last_needed_block:
                break

        return outp


def fid_inception_v3():
    """Build pretrained Inception model for FID computation

    The Inception model for FID computation uses a different set of weights
    and has a slightly different structure than torchvision's Inception.

    This method first constructs torchvision's Inception and then patches the
    necessary parts that are different in the FID Inception model.
    """
    inception = models.inception_v3(
        num_classes=1008, aux_logits=False, pretrained=False)
    # Swap in the FID-specific variants of the mixed blocks *before* loading
    # the ported TensorFlow weights, so the state dict matches the structure.
    inception.Mixed_5b = FIDInceptionA(192, pool_features=32)
    inception.Mixed_5c = FIDInceptionA(256, pool_features=64)
    inception.Mixed_5d = FIDInceptionA(288, pool_features=64)
    inception.Mixed_6b = FIDInceptionC(768, channels_7x7=128)
    inception.Mixed_6c = FIDInceptionC(768, channels_7x7=160)
    inception.Mixed_6d = FIDInceptionC(768, channels_7x7=160)
    inception.Mixed_6e = FIDInceptionC(768, channels_7x7=192)
    inception.Mixed_7b = FIDInceptionE_1(1280)
    inception.Mixed_7c = FIDInceptionE_2(2048)

    state_dict = load_state_dict_from_url(FID_WEIGHTS_URL, progress=True)
    inception.load_state_dict(state_dict)
    return inception


class FIDInceptionA(models.inception.InceptionA):
    """InceptionA block patched for FID computation"""

    def __init__(self, in_channels, pool_features):
        super(FIDInceptionA, self).__init__(in_channels, pool_features)

    def forward(self, x):
        branch1x1 = self.branch1x1(x)

        branch5x5 = self.branch5x5_1(x)
        branch5x5 = self.branch5x5_2(branch5x5)

        branch3x3dbl = self.branch3x3dbl_1(x)
        branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
        branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)

        # Patch: Tensorflow's average pool does not use the padded zero's in
        # its average calculation
        branch_pool = F.avg_pool2d(
            x, kernel_size=3, stride=1, padding=1, count_include_pad=False)
        branch_pool = self.branch_pool(branch_pool)

        outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool]
        return torch.cat(outputs, 1)


class FIDInceptionC(models.inception.InceptionC):
    """InceptionC block patched for FID computation"""

    def __init__(self, in_channels, channels_7x7):
        super(FIDInceptionC, self).__init__(in_channels, channels_7x7)

    def forward(self, x):
        branch1x1 = self.branch1x1(x)

        branch7x7 = self.branch7x7_1(x)
        branch7x7 = self.branch7x7_2(branch7x7)
        branch7x7 = self.branch7x7_3(branch7x7)

        branch7x7dbl = self.branch7x7dbl_1(x)
        branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl)
        branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl)
        branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl)
        branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl)

        # Patch: Tensorflow's average pool does not use the padded zero's in
        # its average calculation
        branch_pool = F.avg_pool2d(
            x, kernel_size=3, stride=1, padding=1, count_include_pad=False)
        branch_pool = self.branch_pool(branch_pool)

        outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool]
        return torch.cat(outputs, 1)


class FIDInceptionE_1(models.inception.InceptionE):
    """First InceptionE block patched for FID computation"""

    def __init__(self, in_channels):
        super(FIDInceptionE_1, self).__init__(in_channels)

    def forward(self, x):
        branch1x1 = self.branch1x1(x)

        branch3x3 = self.branch3x3_1(x)
        branch3x3 = [
            self.branch3x3_2a(branch3x3),
            self.branch3x3_2b(branch3x3),
        ]
        branch3x3 = torch.cat(branch3x3, 1)

        branch3x3dbl = self.branch3x3dbl_1(x)
        branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
        branch3x3dbl = [
            self.branch3x3dbl_3a(branch3x3dbl),
            self.branch3x3dbl_3b(branch3x3dbl),
        ]
        branch3x3dbl = torch.cat(branch3x3dbl, 1)

        # Patch: Tensorflow's average pool does not use the padded zero's in
        # its average calculation
        branch_pool = F.avg_pool2d(
            x, kernel_size=3, stride=1, padding=1, count_include_pad=False)
        branch_pool = self.branch_pool(branch_pool)

        outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
        return torch.cat(outputs, 1)


class FIDInceptionE_2(models.inception.InceptionE):
    """Second InceptionE block patched for FID computation"""

    def __init__(self, in_channels):
        super(FIDInceptionE_2, self).__init__(in_channels)

    def forward(self, x):
        branch1x1 = self.branch1x1(x)

        branch3x3 = self.branch3x3_1(x)
        branch3x3 = [
            self.branch3x3_2a(branch3x3),
            self.branch3x3_2b(branch3x3),
        ]
        branch3x3 = torch.cat(branch3x3, 1)

        branch3x3dbl = self.branch3x3dbl_1(x)
        branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
        branch3x3dbl = [
            self.branch3x3dbl_3a(branch3x3dbl),
            self.branch3x3dbl_3b(branch3x3dbl),
        ]
        branch3x3dbl = torch.cat(branch3x3dbl, 1)

        # Patch: The FID Inception model uses max pooling instead of average
        # pooling. This is likely an error in this specific Inception
        # implementation, as other Inception models use average pooling here
        # (which matches the description in the paper).
branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1) branch_pool = self.branch_pool(branch_pool) outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool] return torch.cat(outputs, 1) ================================================ FILE: models/archs/stylegan2/inversion.py ================================================ import argparse import math import os import numpy as np import torch from PIL import Image from torch import optim from torch.nn import functional as F from torchvision import transforms from tqdm import tqdm import lpips from model import Generator def noise_regularize(noises): loss = 0 for noise in noises: size = noise.shape[2] while True: loss = ( loss + (noise * torch.roll(noise, shifts=1, dims=3)).mean().pow(2) + (noise * torch.roll(noise, shifts=1, dims=2)).mean().pow(2)) if size <= 8: break noise = noise.reshape([-1, 1, size // 2, 2, size // 2, 2]) noise = noise.mean([3, 5]) size //= 2 return loss def noise_normalize_(noises): for noise in noises: mean = noise.mean() std = noise.std() noise.data.add_(-mean).div_(std) def get_lr(t, initial_lr, rampdown=0.25, rampup=0.05): lr_ramp = min(1, (1 - t) / rampdown) lr_ramp = 0.5 - 0.5 * math.cos(lr_ramp * math.pi) lr_ramp = lr_ramp * min(1, t / rampup) return initial_lr * lr_ramp def latent_noise(latent, strength): noise = torch.randn_like(latent) * strength return latent + noise def make_image(tensor): return (tensor.detach().clamp_(min=-1, max=1).add(1).div_(2).mul(255).type( torch.uint8).permute(0, 2, 3, 1).to("cpu").numpy()) if __name__ == "__main__": device = "cuda" parser = argparse.ArgumentParser( description="Image projector to the generator latent spaces") parser.add_argument( "--ckpt", type=str, required=True, help="path to the model checkpoint") parser.add_argument( "--size", type=int, default=256, help="output image sizes of the generator") parser.add_argument( "--lr_rampup", type=float, default=0.05, help="duration of the learning rate warmup", ) parser.add_argument( 
"--lr_rampdown", type=float, default=0.25, help="duration of the learning rate decay", ) parser.add_argument("--lr", type=float, default=0.1, help="learning rate") parser.add_argument( "--noise", type=float, default=0.05, help="strength of the noise level") parser.add_argument( "--noise_ramp", type=float, default=0.75, help="duration of the noise level decay", ) parser.add_argument( "--step", type=int, default=1000, help="optimize iterations") parser.add_argument( "--noise_regularize", type=float, default=1e5, help="weight of the noise regularization", ) parser.add_argument("--randomise_noise", type=int, default=1) parser.add_argument( "--img_mse_weight", type=float, default=0, help="weight of the mse loss") parser.add_argument( "files", metavar="FILES", nargs="+", help="path to image files to be projected") parser.add_argument("--output_dir", type=str, required=True) parser.add_argument( "--w_plus", action="store_true", help="allow to use distinct latent codes to each layers", ) parser.add_argument( "--postfix", default='', type=str, help='postfix for filenames') parser.add_argument( "--latent_type", required=True, type=str, help='z or w, not case sensitive') parser.add_argument( "--w_path", default='', type=str, help='path to w latent code') parser.add_argument('--w_mse_weight', default=0, type=float) parser.add_argument('--w_loss_type', default='mse', type=str) args = parser.parse_args() # latent space type args.latent_type = args.latent_type.lower() if args.latent_type == 'z': args.input_is_latent = False elif args.latent_type == 'w': args.input_is_latent = True else: assert False, "Unrecognized args.latent_type" n_mean_latent = 10000 resize = min(args.size, 256) transform = transforms.Compose([ transforms.Resize(resize), transforms.CenterCrop(resize), transforms.ToTensor(), transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]), ]) imgs = [] for imgfile in args.files: img = transform(Image.open(imgfile).convert("RGB")) imgs.append(img) imgs = torch.stack(imgs, 
0).to(device) if args.w_mse_weight: assert args.latent_type == 'z' w_latent_code = np.load(args.w_path) w_latent_code = torch.tensor(w_latent_code).to(device) # g_ema = Generator(args.size, 512, 8) # ziqi modified g_ema = Generator(args.size, 512, 8, 1) g_ema.load_state_dict(torch.load(args.ckpt)["g_ema"], strict=False) g_ema.eval() g_ema = g_ema.to(device) with torch.no_grad(): noise_sample = torch.randn(n_mean_latent, 512, device=device) latent_out = g_ema.style(noise_sample) latent_mean = latent_out.mean(0) latent_std = ((latent_out - latent_mean).pow(2).sum() / n_mean_latent)**0.5 percept = lpips.PerceptualLoss( model="net-lin", net="vgg", use_gpu=device.startswith("cuda")) if args.latent_type == 'w': latent_in = latent_mean.detach().clone().unsqueeze(0).repeat( imgs.shape[0], 1) elif args.latent_type == 'z': latent_in = noise_sample.mean(0).detach().clone().unsqueeze(0).repeat( imgs.shape[0], 1) if args.w_plus: latent_in = latent_in.unsqueeze(1).repeat(1, g_ema.n_latent, 1) latent_in.requires_grad = True if args.randomise_noise: print('Noise term will be optimized together.') noises_single = g_ema.make_noise() noises = [] for noise in noises_single: noises.append(noise.repeat(imgs.shape[0], 1, 1, 1).normal_()) for noise in noises: noise.requires_grad = True optimizer = optim.Adam( [latent_in] + noises + [g_ema.parameters()], lr=args.lr) else: optim_params = [] for v in g_ema.parameters(): if v.requires_grad: optim_params.append(v) optimizer = optim.Adam([{ 'params': [latent_in] }, { 'params': optim_params, 'lr': 1e-4 }], lr=args.lr) pbar = tqdm(range(args.step)) latent_path = [] for i in pbar: t = i / args.step lr = get_lr(t, args.lr) optimizer.param_groups[0]["lr"] = lr noise_strength = latent_std * args.noise * max( 0, 1 - t / args.noise_ramp)**2 if args.latent_type == 'z': latent_w = g_ema.style(latent_in) latent_n = latent_noise(latent_w, noise_strength.item()) else: latent_n = latent_noise(latent_in, noise_strength.item()) if args.randomise_noise: 
img_gen, _ = g_ema([latent_n], input_is_latent=True, noise=noises) else: img_gen, _ = g_ema([latent_n], input_is_latent=True, randomize_noise=False) batch, channel, height, width = img_gen.shape if height > 256: factor = height // 256 img_gen = img_gen.reshape(batch, channel, height // factor, factor, width // factor, factor) img_gen = img_gen.mean([3, 5]) p_loss = percept(img_gen, imgs).sum() mse_loss = F.mse_loss(img_gen, imgs) if args.randomise_noise: n_loss = noise_regularize(noises) else: n_loss = 0 loss = p_loss + args.noise_regularize * n_loss + args.img_mse_weight * mse_loss if args.w_mse_weight > 0: # this loss is only applicable to z space assert args.latent_type == 'z' if args.w_loss_type == 'mse': w_mse_loss = F.mse_loss(latent_w, w_latent_code) elif args.w_loss_type == 'l1': w_mse_loss = F.l1_loss(latent_w, w_latent_code) loss += args.w_mse_weight * w_mse_loss else: w_mse_loss = 0 optimizer.zero_grad() loss.backward() optimizer.step() if args.randomise_noise: noise_normalize_(noises) if (i + 1) % 100 == 0: latent_path.append(latent_in.detach().clone()) pbar.set_description(( f"total: {loss:.4f}; perceptual: {p_loss:.4f}; noise regularize: {n_loss:.4f};" f" mse: {mse_loss:.4f}; w_mse_loss: {w_mse_loss:.4f}; lr: {lr:.4f}" )) if args.randomise_noise: img_gen, _ = g_ema([latent_path[-1]], input_is_latent=args.input_is_latent, noise=noises) else: img_gen, _ = g_ema([latent_path[-1]], input_is_latent=args.input_is_latent, randomize_noise=False) filename = os.path.splitext(os.path.basename(args.files[0]))[0] + ".pt" img_ar = make_image(img_gen) result_file = {} for i, input_name in enumerate(args.files): result_file[input_name] = {"img": img_gen[i], "latent": latent_in[i]} if args.randomise_noise: noise_single = [] for noise in noises: noise_single.append(noise[i:i + 1]) result_file[input_name]["noise"] = noise_single img_name = os.path.splitext( os.path.basename(input_name) )[0] + '_' + args.postfix + '-' + args.latent_type + "-project.png" pil_img = 
Image.fromarray(img_ar[i]) # save image if not os.path.isdir(os.path.join(args.output_dir, 'recovered_image')): os.makedirs( os.path.join(args.output_dir, 'recovered_image'), exist_ok=False) pil_img.save( os.path.join(args.output_dir, 'recovered_image', img_name)) latent_code = latent_in[i].cpu() latent_code = latent_code.detach().numpy() latent_code = np.expand_dims(latent_code, axis=0) print('latent_code:', len(latent_code), len(latent_code[0])) # save latent code if not os.path.isdir(os.path.join(args.output_dir, 'latent_codes')): os.makedirs( os.path.join(args.output_dir, 'latent_codes'), exist_ok=False) np.save( f'{args.output_dir}/latent_codes/{img_name}_{args.latent_type}.npz.npy', latent_code) if not os.path.isdir(os.path.join(args.output_dir, 'checkpoint')): os.makedirs( os.path.join(args.output_dir, 'checkpoint'), exist_ok=False) torch.save( { "g_ema": g_ema.state_dict(), }, f"{os.path.join(args.output_dir, 'checkpoint')}/{img_name}_{args.latent_type}.pt", ) # save info if not os.path.isdir(os.path.join(args.output_dir, 'pt')): os.makedirs(os.path.join(args.output_dir, 'pt'), exist_ok=False) torch.save( result_file, os.path.join( args.output_dir, os.path.join(args.output_dir, 'pt', filename + '_' + args.latent_type))) ================================================ FILE: models/archs/stylegan2/lpips/__init__.py ================================================ from __future__ import absolute_import, division, print_function import numpy as np import torch from models.archs.stylegan2.lpips import dist_model from skimage.measure import compare_ssim class PerceptualLoss(torch.nn.Module): def __init__( self, model='net-lin', net='alex', colorspace='rgb', spatial=False, use_gpu=True, gpu_ids=[ 0 ]): # VGG using our perceptually-learned weights (LPIPS metric) # def __init__(self, model='net', net='vgg', use_gpu=True): # "default" way of using VGG as a perceptual loss super(PerceptualLoss, self).__init__() self.use_gpu = use_gpu self.spatial = spatial 
self.gpu_ids = gpu_ids self.model = dist_model.DistModel() self.model.initialize( model=model, net=net, use_gpu=use_gpu, colorspace=colorspace, spatial=self.spatial, gpu_ids=gpu_ids) def forward(self, pred, target, normalize=False): """ Pred and target are Variables. If normalize is True, assumes the images are between [0,1] and then scales them between [-1,+1] If normalize is False, assumes the images are already between [-1,+1] Inputs pred and target are Nx3xHxW Output pytorch Variable N long """ if normalize: target = 2 * target - 1 pred = 2 * pred - 1 return self.model.forward(target, pred) def normalize_tensor(in_feat, eps=1e-10): norm_factor = torch.sqrt(torch.sum(in_feat**2, dim=1, keepdim=True)) return in_feat / (norm_factor + eps) def l2(p0, p1, range=255.): return .5 * np.mean((p0 / range - p1 / range)**2) def psnr(p0, p1, peak=255.): return 10 * np.log10(peak**2 / np.mean((1. * p0 - 1. * p1)**2)) def dssim(p0, p1, range=255.): return (1 - compare_ssim(p0, p1, data_range=range, multichannel=True)) / 2. def rgb2lab(in_img, mean_cent=False): from skimage import color img_lab = color.rgb2lab(in_img) if (mean_cent): img_lab[:, :, 0] = img_lab[:, :, 0] - 50 return img_lab def tensor2np(tensor_obj): # change dimension of a tensor object into a numpy array return tensor_obj[0].cpu().float().numpy().transpose((1, 2, 0)) def np2tensor(np_obj): # change dimenion of np array into tensor array return torch.Tensor(np_obj[:, :, :, np.newaxis].transpose((3, 2, 0, 1))) def tensor2tensorlab(image_tensor, to_norm=True, mc_only=False): # image tensor to lab tensor from skimage import color img = tensor2im(image_tensor) img_lab = color.rgb2lab(img) if (mc_only): img_lab[:, :, 0] = img_lab[:, :, 0] - 50 if (to_norm and not mc_only): img_lab[:, :, 0] = img_lab[:, :, 0] - 50 img_lab = img_lab / 100. 
return np2tensor(img_lab) def tensorlab2tensor(lab_tensor, return_inbnd=False): import warnings from skimage import color warnings.filterwarnings("ignore") lab = tensor2np(lab_tensor) * 100. lab[:, :, 0] = lab[:, :, 0] + 50 rgb_back = 255. * np.clip(color.lab2rgb(lab.astype('float')), 0, 1) if (return_inbnd): # convert back to lab, see if we match lab_back = color.rgb2lab(rgb_back.astype('uint8')) mask = 1. * np.isclose(lab_back, lab, atol=2.) mask = np2tensor(np.prod(mask, axis=2)[:, :, np.newaxis]) return (im2tensor(rgb_back), mask) else: return im2tensor(rgb_back) def rgb2lab(input): from skimage import color return color.rgb2lab(input / 255.) def tensor2im(image_tensor, imtype=np.uint8, cent=1., factor=255. / 2.): image_numpy = image_tensor[0].cpu().float().numpy() image_numpy = (np.transpose(image_numpy, (1, 2, 0)) + cent) * factor return image_numpy.astype(imtype) def im2tensor(image, imtype=np.uint8, cent=1., factor=255. / 2.): return torch.Tensor((image / factor - cent)[:, :, :, np.newaxis].transpose( (3, 2, 0, 1))) def tensor2vec(vector_tensor): return vector_tensor.data.cpu().numpy()[:, :, 0, 0] def voc_ap(rec, prec, use_07_metric=False): """ ap = voc_ap(rec, prec, [use_07_metric]) Compute VOC AP given precision and recall. If use_07_metric is true, uses the VOC 07 11 point method (default:False). """ if use_07_metric: # 11 point metric ap = 0. for t in np.arange(0., 1.1, 0.1): if np.sum(rec >= t) == 0: p = 0 else: p = np.max(prec[rec >= t]) ap = ap + p / 11. 
else: # correct AP calculation # first append sentinel values at the end mrec = np.concatenate(([0.], rec, [1.])) mpre = np.concatenate(([0.], prec, [0.])) # compute the precision envelope for i in range(mpre.size - 1, 0, -1): mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) # to calculate area under PR curve, look for points # where X axis (recall) changes value i = np.where(mrec[1:] != mrec[:-1])[0] # and sum (\Delta recall) * prec ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) return ap def tensor2im(image_tensor, imtype=np.uint8, cent=1., factor=255. / 2.): # def tensor2im(image_tensor, imtype=np.uint8, cent=1., factor=1.): image_numpy = image_tensor[0].cpu().float().numpy() image_numpy = (np.transpose(image_numpy, (1, 2, 0)) + cent) * factor return image_numpy.astype(imtype) def im2tensor(image, imtype=np.uint8, cent=1., factor=255. / 2.): # def im2tensor(image, imtype=np.uint8, cent=1., factor=1.): return torch.Tensor((image / factor - cent)[:, :, :, np.newaxis].transpose( (3, 2, 0, 1))) ================================================ FILE: models/archs/stylegan2/lpips/base_model.py ================================================ import os import numpy as np import torch class BaseModel(): def __init__(self): pass def name(self): return 'BaseModel' def initialize(self, use_gpu=True, gpu_ids=[0]): self.use_gpu = use_gpu self.gpu_ids = gpu_ids def forward(self): pass def get_image_paths(self): pass def optimize_parameters(self): pass def get_current_visuals(self): return self.input def get_current_errors(self): return {} def save(self, label): pass # helper saving function that can be used by subclasses def save_network(self, network, path, network_label, epoch_label): save_filename = '%s_net_%s.pth' % (epoch_label, network_label) save_path = os.path.join(path, save_filename) torch.save(network.state_dict(), save_path) # helper loading function that can be used by subclasses def load_network(self, network, network_label, epoch_label): save_filename = 
'%s_net_%s.pth' % (epoch_label, network_label) save_path = os.path.join(self.save_dir, save_filename) print('Loading network from %s' % save_path) network.load_state_dict(torch.load(save_path)) def update_learning_rate(): pass def get_image_paths(self): return self.image_paths def save_done(self, flag=False): np.save(os.path.join(self.save_dir, 'done_flag'), flag) np.savetxt( os.path.join(self.save_dir, 'done_flag'), [ flag, ], fmt='%i') ================================================ FILE: models/archs/stylegan2/lpips/dist_model.py ================================================ from __future__ import absolute_import import os from collections import OrderedDict import models.archs.stylegan2.lpips as util import numpy as np import torch from scipy.ndimage import zoom from torch.autograd import Variable from tqdm import tqdm from . import networks_basic as networks from .base_model import BaseModel class DistModel(BaseModel): def name(self): return self.model_name def initialize(self, model='net-lin', net='alex', colorspace='Lab', pnet_rand=False, pnet_tune=False, model_path=None, use_gpu=True, printNet=False, spatial=False, is_train=False, lr=.0001, beta1=0.5, version='0.1', gpu_ids=[0]): ''' INPUTS model - ['net-lin'] for linearly calibrated network ['net'] for off-the-shelf network ['L2'] for L2 distance in Lab colorspace ['SSIM'] for ssim in RGB colorspace net - ['squeeze','alex','vgg'] model_path - if None, will look in weights/[NET_NAME].pth colorspace - ['Lab','RGB'] colorspace to use for L2 and SSIM use_gpu - bool - whether or not to use a GPU printNet - bool - whether or not to print network architecture out spatial - bool - whether to output an array containing varying distances across spatial dimensions spatial_shape - if given, output spatial shape. if None then spatial shape is determined automatically via spatial_factor (see below). 
spatial_factor - if given, specifies upsampling factor relative to the largest spatial extent of a convolutional layer. if None then resized to size of input images. spatial_order - spline order of filter for upsampling in spatial mode, by default 1 (bilinear). is_train - bool - [True] for training mode lr - float - initial learning rate beta1 - float - initial momentum term for adam version - 0.1 for latest, 0.0 was original (with a bug) gpu_ids - int array - [0] by default, gpus to use ''' BaseModel.initialize(self, use_gpu=use_gpu, gpu_ids=gpu_ids) self.model = model self.net = net self.is_train = is_train self.spatial = spatial self.gpu_ids = gpu_ids self.model_name = '%s [%s]' % (model, net) if (self.model == 'net-lin'): # pretrained net + linear layer self.net = networks.PNetLin( pnet_rand=pnet_rand, pnet_tune=pnet_tune, pnet_type=net, use_dropout=True, spatial=spatial, version=version, lpips=True) kw = {} if not use_gpu: kw['map_location'] = 'cpu' if (model_path is None): import inspect model_path = os.path.abspath( os.path.join( inspect.getfile(self.initialize), '..', 'weights/v%s/%s.pth' % (version, net))) if (not is_train): print('Loading model from: %s' % model_path) self.net.load_state_dict( torch.load(model_path, **kw), strict=False) elif (self.model == 'net'): # pretrained network self.net = networks.PNetLin( pnet_rand=pnet_rand, pnet_type=net, lpips=False) elif (self.model in ['L2', 'l2']): self.net = networks.L2( use_gpu=use_gpu, colorspace=colorspace ) # not really a network, only for testing self.model_name = 'L2' elif (self.model in ['DSSIM', 'dssim', 'SSIM', 'ssim']): self.net = networks.DSSIM(use_gpu=use_gpu, colorspace=colorspace) self.model_name = 'SSIM' else: raise ValueError("Model [%s] not recognized." 
% self.model) self.parameters = list(self.net.parameters()) if self.is_train: # training mode # extra network on top to go from distances (d0,d1) => predicted human judgment (h*) self.rankLoss = networks.BCERankingLoss() self.parameters += list(self.rankLoss.net.parameters()) self.lr = lr self.old_lr = lr self.optimizer_net = torch.optim.Adam( self.parameters, lr=lr, betas=(beta1, 0.999)) else: # test mode self.net.eval() if (use_gpu): self.net.to(gpu_ids[0]) self.net = torch.nn.DataParallel(self.net, device_ids=gpu_ids) if (self.is_train): self.rankLoss = self.rankLoss.to( device=gpu_ids[0]) # just put this on GPU0 if (printNet): print('---------- Networks initialized -------------') networks.print_network(self.net) print('-----------------------------------------------') def forward(self, in0, in1, retPerLayer=False): ''' Function computes the distance between image patches in0 and in1 INPUTS in0, in1 - torch.Tensor object of shape Nx3xXxY - image patch scaled to [-1,1] OUTPUT computed distances between in0 and in1 ''' return self.net.forward(in0, in1, retPerLayer=retPerLayer) # ***** TRAINING FUNCTIONS ***** def optimize_parameters(self): self.forward_train() self.optimizer_net.zero_grad() self.backward_train() self.optimizer_net.step() self.clamp_weights() def clamp_weights(self): for module in self.net.modules(): if (hasattr(module, 'weight') and module.kernel_size == (1, 1)): module.weight.data = torch.clamp(module.weight.data, min=0) def set_input(self, data): self.input_ref = data['ref'] self.input_p0 = data['p0'] self.input_p1 = data['p1'] self.input_judge = data['judge'] if (self.use_gpu): self.input_ref = self.input_ref.to(device=self.gpu_ids[0]) self.input_p0 = self.input_p0.to(device=self.gpu_ids[0]) self.input_p1 = self.input_p1.to(device=self.gpu_ids[0]) self.input_judge = self.input_judge.to(device=self.gpu_ids[0]) self.var_ref = Variable(self.input_ref, requires_grad=True) self.var_p0 = Variable(self.input_p0, requires_grad=True) self.var_p1 = 
Variable(self.input_p1, requires_grad=True) def forward_train(self): # run forward pass # print(self.net.module.scaling_layer.shift) # print(torch.norm(self.net.module.net.slice1[0].weight).item(), torch.norm(self.net.module.lin0.model[1].weight).item()) self.d0 = self.forward(self.var_ref, self.var_p0) self.d1 = self.forward(self.var_ref, self.var_p1) self.acc_r = self.compute_accuracy(self.d0, self.d1, self.input_judge) self.var_judge = Variable(1. * self.input_judge).view(self.d0.size()) self.loss_total = self.rankLoss.forward(self.d0, self.d1, self.var_judge * 2. - 1.) return self.loss_total def backward_train(self): torch.mean(self.loss_total).backward() def compute_accuracy(self, d0, d1, judge): ''' d0, d1 are Variables, judge is a Tensor ''' d1_lt_d0 = (d1 < d0).cpu().data.numpy().flatten() judge_per = judge.cpu().numpy().flatten() return d1_lt_d0 * judge_per + (1 - d1_lt_d0) * (1 - judge_per) def get_current_errors(self): retDict = OrderedDict([('loss_total', self.loss_total.data.cpu().numpy()), ('acc_r', self.acc_r)]) for key in retDict.keys(): retDict[key] = np.mean(retDict[key]) return retDict def get_current_visuals(self): zoom_factor = 256 / self.var_ref.data.size()[2] ref_img = util.tensor2im(self.var_ref.data) p0_img = util.tensor2im(self.var_p0.data) p1_img = util.tensor2im(self.var_p1.data) ref_img_vis = zoom(ref_img, [zoom_factor, zoom_factor, 1], order=0) p0_img_vis = zoom(p0_img, [zoom_factor, zoom_factor, 1], order=0) p1_img_vis = zoom(p1_img, [zoom_factor, zoom_factor, 1], order=0) return OrderedDict([('ref', ref_img_vis), ('p0', p0_img_vis), ('p1', p1_img_vis)]) def save(self, path, label): if (self.use_gpu): self.save_network(self.net.module, path, '', label) else: self.save_network(self.net, path, '', label) self.save_network(self.rankLoss.net, path, 'rank', label) def update_learning_rate(self, nepoch_decay): lrd = self.lr / nepoch_decay lr = self.old_lr - lrd for param_group in self.optimizer_net.param_groups: param_group['lr'] = lr 
print('update lr [%s] decay: %f -> %f' % (type, self.old_lr, lr)) self.old_lr = lr def score_2afc_dataset(data_loader, func, name=''): ''' Function computes Two Alternative Forced Choice (2AFC) score using distance function 'func' in dataset 'data_loader' INPUTS data_loader - CustomDatasetDataLoader object - contains a TwoAFCDataset inside func - callable distance function - calling d=func(in0,in1) should take 2 pytorch tensors with shape Nx3xXxY, and return numpy array of length N OUTPUTS [0] - 2AFC score in [0,1], fraction of time func agrees with human evaluators [1] - dictionary with following elements d0s,d1s - N arrays containing distances between reference patch to perturbed patches gts - N array in [0,1], preferred patch selected by human evaluators (closer to "0" for left patch p0, "1" for right patch p1, "0.6" means 60pct people preferred right patch, 40pct preferred left) scores - N array in [0,1], corresponding to what percentage function agreed with humans CONSTS N - number of test triplets in data_loader ''' d0s = [] d1s = [] gts = [] for data in tqdm(data_loader.load_data(), desc=name): d0s += func(data['ref'], data['p0']).data.cpu().numpy().flatten().tolist() d1s += func(data['ref'], data['p1']).data.cpu().numpy().flatten().tolist() gts += data['judge'].cpu().numpy().flatten().tolist() d0s = np.array(d0s) d1s = np.array(d1s) gts = np.array(gts) scores = (d0s < d1s) * (1. 
def score_jnd_dataset(data_loader, func, name=''):
    ''' Computes a JND (just-noticeable-difference) score for distance
    function `func` over dataset `data_loader`.

    INPUTS
        data_loader - CustomDatasetDataLoader wrapping a JNDDataset
        func - callable d = func(in0, in1): takes two Nx3xXxY torch tensors,
            returns a length-N tensor of distances
    OUTPUTS
        [0] - JND score in [0,1]: mAP (area under the precision-recall curve)
        [1] - dict with:
            ds - N distances between the two patches shown to evaluators
            sames - N fractions of people who judged the patches identical
    '''
    distances, same_fracs = [], []
    for batch in tqdm(data_loader.load_data(), desc=name):
        distances += func(batch['p0'], batch['p1']).data.cpu().numpy().tolist()
        same_fracs += batch['same'].cpu().numpy().flatten().tolist()

    sames = np.array(same_fracs)
    ds = np.array(distances)

    # Sort by ascending distance, then sweep a threshold along the sorted
    # list to accumulate the precision/recall curve.
    order = np.argsort(ds)
    sames_sorted = sames[order]

    TPs = np.cumsum(sames_sorted)
    FPs = np.cumsum(1 - sames_sorted)
    FNs = np.sum(sames_sorted) - TPs

    precs = TPs / (TPs + FPs)
    recs = TPs / (TPs + FNs)
    score = util.voc_ap(recs, precs)

    return (score, dict(ds=ds, sames=sames))
# Learned perceptual metric
class PNetLin(nn.Module):
    """LPIPS distance network.

    Feeds both inputs through a fixed CNN backbone, takes squared
    differences of (normalized) intermediate features, and reduces them
    to a scalar (or spatial map) per image pair. With ``lpips=True`` the
    per-channel differences are re-weighted by learned 1x1 convs.
    """

    def __init__(self,
                 pnet_type='vgg',
                 pnet_rand=False,
                 pnet_tune=False,
                 use_dropout=True,
                 spatial=False,
                 version='0.1',
                 lpips=True):
        # pnet_type: backbone selector ('vgg'/'vgg16', 'alex', 'squeeze').
        # pnet_rand: if True, use randomly initialized backbone weights.
        # pnet_tune: if True, keep backbone weights trainable.
        # spatial: return upsampled per-pixel maps instead of averages.
        # version: '0.1' applies ScalingLayer to inputs; '0.0' does not.
        # lpips: learn linear per-channel weights; otherwise plain channel sum.
        super(PNetLin, self).__init__()
        self.pnet_type = pnet_type
        self.pnet_tune = pnet_tune
        self.pnet_rand = pnet_rand
        self.spatial = spatial
        self.lpips = lpips
        self.version = version
        self.scaling_layer = ScalingLayer()
        # Channel counts of the backbone's returned feature slices.
        if (self.pnet_type in ['vgg', 'vgg16']):
            net_type = pn.vgg16
            self.chns = [64, 128, 256, 512, 512]
        elif (self.pnet_type == 'alex'):
            net_type = pn.alexnet
            self.chns = [64, 192, 384, 256, 256]
        elif (self.pnet_type == 'squeeze'):
            net_type = pn.squeezenet
            self.chns = [64, 128, 256, 384, 384, 512, 512]
        self.L = len(self.chns)
        self.net = net_type(
            pretrained=not self.pnet_rand, requires_grad=self.pnet_tune)
        if (lpips):
            # One learned 1x1-conv weighting per feature level.
            self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
            self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)
            self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)
            self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout)
            self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout)
            self.lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4]
            if (self.pnet_type == 'squeeze'):
                # 7 layers for squeezenet
                self.lin5 = NetLinLayer(self.chns[5], use_dropout=use_dropout)
                self.lin6 = NetLinLayer(self.chns[6], use_dropout=use_dropout)
                self.lins += [self.lin5, self.lin6]

    def forward(self, in0, in1, retPerLayer=False):
        """Return the perceptual distance between batches in0 and in1.

        If retPerLayer is True, also return the per-layer contributions.
        """
        # v0.0 - original release had a bug, where input was not scaled
        in0_input, in1_input = (
            self.scaling_layer(in0),
            self.scaling_layer(in1)) if self.version == '0.1' else (in0, in1)
        outs0, outs1 = self.net.forward(in0_input), self.net.forward(in1_input)
        feats0, feats1, diffs = {}, {}, {}
        for kk in range(self.L):
            # util.normalize_tensor — presumably unit-normalizes each
            # feature vector along channels (TODO confirm in lpips/__init__).
            feats0[kk], feats1[kk] = util.normalize_tensor(
                outs0[kk]), util.normalize_tensor(outs1[kk])
            diffs[kk] = (feats0[kk] - feats1[kk])**2
        if (self.lpips):
            if (self.spatial):
                # Per-pixel distance map, upsampled to the input resolution.
                res = [
                    upsample(
                        self.lins[kk].model(diffs[kk]), out_H=in0.shape[2])
                    for kk in range(self.L)
                ]
            else:
                res = [
                    spatial_average(
                        self.lins[kk].model(diffs[kk]), keepdim=True)
                    for kk in range(self.L)
                ]
        else:
            # No learned weights: just sum the squared diffs over channels.
            if (self.spatial):
                res = [
                    upsample(
                        diffs[kk].sum(dim=1, keepdim=True),
                        out_H=in0.shape[2]) for kk in range(self.L)
                ]
            else:
                res = [
                    spatial_average(
                        diffs[kk].sum(dim=1, keepdim=True), keepdim=True)
                    for kk in range(self.L)
                ]
        # Total distance is the sum over feature levels.
        val = res[0]
        for l in range(1, self.L):
            val += res[l]
        if (retPerLayer):
            return (val, res)
        else:
            return val
class BCERankingLoss(nn.Module):
    """Binary cross-entropy loss on a learned ranking logit.

    Given two distances d0, d1 and a human judgement `judge` in [-1, 1],
    Dist2LogitLayer predicts which patch is closer; the judgement is
    mapped to a [0, 1] target and compared with BCE.
    """

    def __init__(self, chn_mid=32):
        super(BCERankingLoss, self).__init__()
        self.net = Dist2LogitLayer(chn_mid=chn_mid)
        self.loss = torch.nn.BCELoss()

    def forward(self, d0, d1, judge):
        # Map judge from [-1, 1] into a BCE target in [0, 1].
        target = (judge + 1.) / 2.
        # Keep the raw prediction on the module for callers that inspect it.
        self.logit = self.net.forward(d0, d1)
        return self.loss(self.logit, target)
def print_network(net):
    """Print a module's structure followed by its total parameter count."""
    total = sum(param.numel() for param in net.parameters())
    print('Network', net)
    print('Total number of parameters: %d' % total)
class alexnet(torch.nn.Module):
    """AlexNet backbone split into 5 sequential slices.

    ``forward`` returns a namedtuple of intermediate activations, one per
    slice — used as perceptual features by the LPIPS metric.

    Args:
        requires_grad: if False (default), freeze all weights.
        pretrained: load torchvision's ImageNet-pretrained weights
            (triggers a download on first use).
    """

    def __init__(self, requires_grad=False, pretrained=True):
        super(alexnet, self).__init__()
        alexnet_pretrained_features = tv.alexnet(
            pretrained=pretrained).features
        self.slice1 = torch.nn.Sequential()
        self.slice2 = torch.nn.Sequential()
        self.slice3 = torch.nn.Sequential()
        self.slice4 = torch.nn.Sequential()
        self.slice5 = torch.nn.Sequential()
        self.N_slices = 5
        # Partition torchvision's `features` stack at fixed indices;
        # boundaries presumably fall after ReLU layers (follows the
        # torchvision alexnet.features layout — TODO confirm).
        for x in range(2):
            self.slice1.add_module(str(x), alexnet_pretrained_features[x])
        for x in range(2, 5):
            self.slice2.add_module(str(x), alexnet_pretrained_features[x])
        for x in range(5, 8):
            self.slice3.add_module(str(x), alexnet_pretrained_features[x])
        for x in range(8, 10):
            self.slice4.add_module(str(x), alexnet_pretrained_features[x])
        for x in range(10, 12):
            self.slice5.add_module(str(x), alexnet_pretrained_features[x])
        if not requires_grad:
            # Freeze the backbone so it acts as a fixed feature extractor.
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, X):
        # Run the slices in order, recording each intermediate activation.
        h = self.slice1(X)
        h_relu1 = h
        h = self.slice2(h)
        h_relu2 = h
        h = self.slice3(h)
        h_relu3 = h
        h = self.slice4(h)
        h_relu4 = h
        h = self.slice5(h)
        h_relu5 = h
        alexnet_outputs = namedtuple(
            "AlexnetOutputs", ['relu1', 'relu2', 'relu3', 'relu4', 'relu5'])
        out = alexnet_outputs(h_relu1, h_relu2, h_relu3, h_relu4, h_relu5)
        return out
class resnet(torch.nn.Module):
    """ResNet backbone exposing 5 intermediate feature maps.

    Args:
        requires_grad: kept for interface parity with the other backbones;
            NOTE(review): unlike alexnet/vgg16 above, this class never
            freezes its parameters based on this flag.
        pretrained: load torchvision's ImageNet-pretrained weights.
        num: depth selector (18/34/50/101/152); any other value leaves
            ``self.net`` unset and the following attribute accesses raise.
    """

    def __init__(self, requires_grad=False, pretrained=True, num=18):
        super(resnet, self).__init__()
        if (num == 18):
            self.net = tv.resnet18(pretrained=pretrained)
        elif (num == 34):
            self.net = tv.resnet34(pretrained=pretrained)
        elif (num == 50):
            self.net = tv.resnet50(pretrained=pretrained)
        elif (num == 101):
            self.net = tv.resnet101(pretrained=pretrained)
        elif (num == 152):
            self.net = tv.resnet152(pretrained=pretrained)
        self.N_slices = 5
        # Alias the stem and residual stages so forward can tap each one.
        self.conv1 = self.net.conv1
        self.bn1 = self.net.bn1
        self.relu = self.net.relu
        self.maxpool = self.net.maxpool
        self.layer1 = self.net.layer1
        self.layer2 = self.net.layer2
        self.layer3 = self.net.layer3
        self.layer4 = self.net.layer4

    def forward(self, X):
        # Stem: conv -> bn -> relu, tapped before the maxpool.
        h = self.conv1(X)
        h = self.bn1(h)
        h = self.relu(h)
        h_relu1 = h
        h = self.maxpool(h)
        # Tap the output of each residual stage.
        h = self.layer1(h)
        h_conv2 = h
        h = self.layer2(h)
        h_conv3 = h
        h = self.layer3(h)
        h_conv4 = h
        h = self.layer4(h)
        h_conv5 = h
        outputs = namedtuple("Outputs",
                             ['relu1', 'conv2', 'conv3', 'conv4', 'conv5'])
        out = outputs(h_relu1, h_conv2, h_conv3, h_conv4, h_conv5)
        return out
def make_kernel(k):
    """Build a normalized 2-D FIR kernel from a 1-D or 2-D tap list.

    A 1-D input is expanded to 2-D as the outer product of the taps with
    themselves (separable filter); the result is scaled so that its
    entries sum to 1.
    """
    kernel = torch.tensor(k, dtype=torch.float32)
    if kernel.ndim == 1:
        # Separable filter: outer product of the taps with themselves.
        kernel = kernel[None, :] * kernel[:, None]
    return kernel / kernel.sum()
class EqualLinear(nn.Module):
    """Linear layer with equalized learning rate (StyleGAN2).

    Weights are stored at unit scale and multiplied by
    ``(1 / sqrt(in_dim)) * lr_mul`` at run time so every layer sees the
    same effective learning rate regardless of fan-in.

    Args:
        in_dim / out_dim: feature dimensions.
        bias: whether to learn an additive bias.
        bias_init: initial bias value.
        lr_mul: learning-rate multiplier (weights divided by it at init,
            multiplied back in the forward pass).
        activation: if truthy (e.g. "fused_lrelu"), apply
            ``fused_leaky_relu`` after the matmul, folding the bias into
            the activation.
    """

    def __init__(self,
                 in_dim,
                 out_dim,
                 bias=True,
                 bias_init=0,
                 lr_mul=1,
                 activation=None):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(out_dim, in_dim).div_(lr_mul))
        if bias:
            self.bias = nn.Parameter(torch.zeros(out_dim).fill_(bias_init))
        else:
            self.bias = None
        self.activation = activation
        self.scale = (1 / math.sqrt(in_dim)) * lr_mul
        self.lr_mul = lr_mul

    def forward(self, input):
        # BUG FIX: with bias=False the original computed
        # `self.bias * self.lr_mul` on None and raised TypeError; scale
        # the bias only when it exists. (In the activation path a None
        # bias is forwarded as-is — assumes fused_leaky_relu accepts it.)
        bias = self.bias * self.lr_mul if self.bias is not None else None
        if self.activation:
            out = F.linear(input, self.weight * self.scale)
            out = fused_leaky_relu(out, bias)
        else:
            out = F.linear(input, self.weight * self.scale, bias=bias)
        return out

    def __repr__(self):
        return (
            f"{self.__class__.__name__}({self.weight.shape[1]}, {self.weight.shape[0]})"
        )
    def forward(self, input, style):
        """Apply style-modulated convolution to `input`.

        Args:
            input: (batch, in_channel, height, width) feature tensor.
            style: per-sample style vector, mapped by ``self.modulation``
                (an EqualLinear) to one scale per input channel.
        Returns:
            (batch, out_channel, H', W') tensor; H'/W' doubled when
            ``self.upsample``, halved when ``self.downsample``.
        """
        batch, in_channel, height, width = input.shape
        # Per-sample, per-input-channel modulation factors.
        style = self.modulation(style).view(batch, 1, in_channel, 1, 1)
        weight = self.scale * self.weight * style
        if self.demodulate:
            # Normalize each output filter to unit norm (demodulation),
            # eps 1e-8 guards against division by zero.
            demod = torch.rsqrt(weight.pow(2).sum([2, 3, 4]) + 1e-8)
            weight = weight * demod.view(batch, self.out_channel, 1, 1, 1)
        # Fold the batch into the filter dimension so a single grouped
        # conv (groups=batch) applies a different kernel per sample.
        weight = weight.view(batch * self.out_channel, in_channel,
                             self.kernel_size, self.kernel_size)
        if self.upsample:
            input = input.view(1, batch * in_channel, height, width)
            # conv_transpose2d expects (in, out, kH, kW) per group, so
            # swap the channel axes before flattening.
            weight = weight.view(batch, self.out_channel, in_channel,
                                 self.kernel_size, self.kernel_size)
            weight = weight.transpose(1, 2).reshape(batch * in_channel,
                                                    self.out_channel,
                                                    self.kernel_size,
                                                    self.kernel_size)
            out = F.conv_transpose2d(
                input, weight, padding=0, stride=2, groups=batch)
            _, _, height, width = out.shape
            out = out.view(batch, self.out_channel, height, width)
            # Anti-aliasing blur after upsampling.
            out = self.blur(out)
        elif self.downsample:
            # Blur before strided conv to avoid aliasing.
            input = self.blur(input)
            _, _, height, width = input.shape
            input = input.view(1, batch * in_channel, height, width)
            out = F.conv2d(input, weight, padding=0, stride=2, groups=batch)
            _, _, height, width = out.shape
            out = out.view(batch, self.out_channel, height, width)
        else:
            input = input.view(1, batch * in_channel, height, width)
            out = F.conv2d(input, weight, padding=self.padding, groups=batch)
            _, _, height, width = out.shape
            out = out.view(batch, self.out_channel, height, width)
        return out
class ConstantInput(nn.Module):
    """Learned constant tensor used as the generator's first input.

    Stores a single (1, channel, size, size) parameter and tiles it along
    the batch dimension to match the incoming latent batch.
    """

    def __init__(self, channel, size=4):
        super().__init__()
        self.input = nn.Parameter(torch.randn(1, channel, size, size))

    def forward(self, input):
        # Only the batch size of `input` matters; its values are ignored.
        n = input.shape[0]
        return self.input.repeat(n, 1, 1, 1)
512, 16: 512, 32: 512, 64: 256 * channel_multiplier, 128: 128 * channel_multiplier, 256: 64 * channel_multiplier, 512: 32 * channel_multiplier, 1024: 16 * channel_multiplier, } self.input = ConstantInput(self.channels[4]) self.conv1 = StyledConv( self.channels[4], self.channels[4], 3, style_dim, blur_kernel=blur_kernel) self.to_rgb1 = ToRGB(self.channels[4], style_dim, upsample=False) self.log_size = int(math.log(size, 2)) self.num_layers = (self.log_size - 2) * 2 + 1 self.convs = nn.ModuleList() self.upsamples = nn.ModuleList() self.to_rgbs = nn.ModuleList() self.noises = nn.Module() in_channel = self.channels[4] for layer_idx in range(self.num_layers): res = (layer_idx + 5) // 2 shape = [1, 1, 2**res, 2**res] self.noises.register_buffer(f"noise_{layer_idx}", torch.randn(*shape)) for i in range(3, self.log_size + 1): out_channel = self.channels[2**i] self.convs.append( StyledConv( in_channel, out_channel, 3, style_dim, upsample=True, blur_kernel=blur_kernel, )) self.convs.append( StyledConv( out_channel, out_channel, 3, style_dim, blur_kernel=blur_kernel)) self.to_rgbs.append(ToRGB(out_channel, style_dim)) in_channel = out_channel self.n_latent = self.log_size * 2 - 2 def make_noise(self): device = self.input.input.device noises = [torch.randn(1, 1, 2**2, 2**2, device=device)] for i in range(3, self.log_size + 1): for _ in range(2): noises.append(torch.randn(1, 1, 2**i, 2**i, device=device)) return noises def mean_latent(self, n_latent): latent_in = torch.randn( n_latent, self.style_dim, device=self.input.input.device) latent = self.style_forward(latent_in).mean(0, keepdim=True) return latent def get_latent(self, input): out = input for i, layer in enumerate(self.style): out = layer(out) return out def style_forward(self, input, skip_norm=False): out = input for i, layer in enumerate(self.style): if i == 0 and skip_norm: continue out = layer(out) return out def forward( self, styles, return_latents=False, inject_index=None, truncation=1, truncation_latent=None, 
class ConvLayer(nn.Sequential):
    """Conv block: optional anti-alias Blur + EqualConv2d + optional
    FusedLeakyReLU, assembled as an nn.Sequential.

    When ``activate`` is True the conv's own bias is disabled
    (``bias and not activate``) because FusedLeakyReLU carries the bias.
    """

    def __init__(
        self,
        in_channel,
        out_channel,
        kernel_size,
        downsample=False,
        blur_kernel=[1, 3, 3, 1],  # NOTE(review): mutable default; read-only here.
        bias=True,
        activate=True,
    ):
        layers = []
        if downsample:
            # Blur before the stride-2 conv to avoid aliasing; padding is
            # folded into the blur, so the conv itself uses padding=0.
            factor = 2
            p = (len(blur_kernel) - factor) + (kernel_size - 1)
            pad0 = (p + 1) // 2
            pad1 = p // 2
            layers.append(Blur(blur_kernel, pad=(pad0, pad1)))
            stride = 2
            # NOTE(review): attribute set before super().__init__() — works
            # because nn.Module.__setattr__ falls through for plain ints.
            self.padding = 0
        else:
            stride = 1
            self.padding = kernel_size // 2
        layers.append(
            EqualConv2d(
                in_channel,
                out_channel,
                kernel_size,
                padding=self.padding,
                stride=stride,
                bias=bias and not activate,
            ))
        if activate:
            layers.append(FusedLeakyReLU(out_channel, bias=bias))
        super().__init__(*layers)
class Discriminator(nn.Module):
    """StyleGAN2 discriminator: ResBlock pyramid from `size` down to 4x4,
    a minibatch-stddev channel, then a final conv + linear head.

    Args:
        size: input resolution (power of two, 4..1024).
        channel_multiplier: width multiplier for resolutions >= 64.
        blur_kernel: FIR taps used by the downsampling ResBlocks.
    """

    def __init__(self, size, channel_multiplier=2, blur_kernel=[1, 3, 3, 1]):
        super().__init__()
        # Channel width per resolution (mirrors the Generator's table).
        channels = {
            4: 512,
            8: 512,
            16: 512,
            32: 512,
            64: 256 * channel_multiplier,
            128: 128 * channel_multiplier,
            256: 64 * channel_multiplier,
            512: 32 * channel_multiplier,
            1024: 16 * channel_multiplier,
        }
        # fromRGB at full resolution, then halve until 4x4.
        convs = [ConvLayer(3, channels[size], 1)]
        log_size = int(math.log(size, 2))
        in_channel = channels[size]
        for i in range(log_size, 2, -1):
            out_channel = channels[2**(i - 1)]
            convs.append(ResBlock(in_channel, out_channel, blur_kernel))
            in_channel = out_channel
        self.convs = nn.Sequential(*convs)
        # Minibatch-stddev settings: statistics over groups of up to 4
        # samples, one feature bucket.
        self.stddev_group = 4
        self.stddev_feat = 1
        # +1 input channel for the appended stddev map.
        self.final_conv = ConvLayer(in_channel + 1, channels[4], 3)
        self.final_linear = nn.Sequential(
            EqualLinear(
                channels[4] * 4 * 4, channels[4], activation="fused_lrelu"),
            EqualLinear(channels[4], 1),
        )

    def forward(self, input):
        """Return one realness score per input image, shape (batch, 1)."""
        out = self.convs(input)
        batch, channel, height, width = out.shape
        # Minibatch standard deviation: per-group feature stddev is
        # averaged into a single value and broadcast as an extra channel.
        group = min(batch, self.stddev_group)
        stddev = out.view(group, -1, self.stddev_feat,
                          channel // self.stddev_feat, height, width)
        stddev = torch.sqrt(stddev.var(0, unbiased=False) + 1e-8)
        stddev = stddev.mean([2, 3, 4], keepdims=True).squeeze(2)
        stddev = stddev.repeat(group, 1, height, width)
        out = torch.cat([out, stddev], 1)
        out = self.final_conv(out)
        out = out.view(batch, -1)
        out = self.final_linear(out)
        return out
import functional as F  # NOTE(review): completes `from torch.nn` on the previous line

from distributed import reduce_sum
from op import upfirdn2d


class AdaptiveAugment:
    """Adapts the augmentation probability from the sign of D's real predictions
    (ADA-style): raise p when D is too confident, lower it otherwise."""

    def __init__(self, ada_aug_target, ada_aug_len, update_every, device):
        self.ada_aug_target = ada_aug_target
        self.ada_aug_len = ada_aug_len
        self.update_every = update_every

        # Buffer of (sum of prediction signs, number of predictions seen).
        self.ada_aug_buf = torch.tensor([0.0, 0.0], device=device)
        self.r_t_stat = 0
        self.ada_aug_p = 0

    @torch.no_grad()
    def tune(self, real_pred):
        # Accumulate sign statistics across workers via reduce_sum.
        ada_aug_data = torch.tensor(
            (torch.sign(real_pred).sum().item(), real_pred.shape[0]),
            device=real_pred.device,
        )
        self.ada_aug_buf += reduce_sum(ada_aug_data)

        if self.ada_aug_buf[1] > self.update_every - 1:
            pred_signs, n_pred = self.ada_aug_buf.tolist()

            self.r_t_stat = pred_signs / n_pred

            if self.r_t_stat > self.ada_aug_target:
                sign = 1
            else:
                sign = -1

            # Step p toward the target rate, clamped to [0, 1], then reset buffer.
            self.ada_aug_p += sign * n_pred / self.ada_aug_len
            self.ada_aug_p = min(1, max(0, self.ada_aug_p))
            self.ada_aug_buf.mul_(0)

        return self.ada_aug_p


# 12 filter taps (presumably the sym6 wavelet) used as the default
# antialiasing kernel by random_apply_affine below.
SYM6 = (
    0.015404109327027373,
    0.0034907120842174702,
    -0.11799011114819057,
    -0.048311742585633,
    0.4910559419267466,
    0.787641141030194,
    0.3379294217276218,
    -0.07263752278646252,
    -0.021060292512300564,
    0.04472490177066578,
    0.0017677118642428036,
    -0.007800708325034148,
)


def translate_mat(t_x, t_y):
    # Batch of 3x3 homogeneous 2D translation matrices.
    batch = t_x.shape[0]

    mat = torch.eye(3).unsqueeze(0).repeat(batch, 1, 1)
    translate = torch.stack((t_x, t_y), 1)
    mat[:, :2, 2] = translate

    return mat


def rotate_mat(theta):
    # Batch of 3x3 homogeneous 2D rotation matrices.
    batch = theta.shape[0]

    mat = torch.eye(3).unsqueeze(0).repeat(batch, 1, 1)
    sin_t = torch.sin(theta)
    cos_t = torch.cos(theta)
    rot = torch.stack((cos_t, -sin_t, sin_t, cos_t), 1).view(batch, 2, 2)
    mat[:, :2, :2] = rot

    return mat


def scale_mat(s_x, s_y):
    # Batch of 3x3 homogeneous 2D scaling matrices.
    batch = s_x.shape[0]

    mat = torch.eye(3).unsqueeze(0).repeat(batch, 1, 1)
    mat[:, 0, 0] = s_x
    mat[:, 1, 1] = s_y

    return mat


def translate3d_mat(t_x, t_y, t_z):
    # Batch of 4x4 homogeneous 3D translations (used for color-space shifts).
    batch = t_x.shape[0]

    mat = torch.eye(4).unsqueeze(0).repeat(batch, 1, 1)
    translate = torch.stack((t_x, t_y, t_z), 1)
    mat[:, :3, 3] = translate

    return mat


def rotate3d_mat(axis, theta):
    batch =
theta.shape[0]  # NOTE(review): completes `batch = theta.shape[0]` of rotate3d_mat above
# Rotation about `axis` by `theta` via Rodrigues' formula, embedded in 4x4.
u_x, u_y, u_z = axis

eye = torch.eye(3).unsqueeze(0)
cross = torch.tensor([(0, -u_z, u_y), (u_z, 0, -u_x),
                      (-u_y, u_x, 0)]).unsqueeze(0)
outer = torch.tensor(axis)
outer = (outer.unsqueeze(1) * outer).unsqueeze(0)

sin_t = torch.sin(theta).view(-1, 1, 1)
cos_t = torch.cos(theta).view(-1, 1, 1)

rot = cos_t * eye + sin_t * cross + (1 - cos_t) * outer

eye_4 = torch.eye(4).unsqueeze(0).repeat(batch, 1, 1)
eye_4[:, :3, :3] = rot

return eye_4


def scale3d_mat(s_x, s_y, s_z):
    # Batch of 4x4 homogeneous 3D scalings (used for contrast).
    batch = s_x.shape[0]

    mat = torch.eye(4).unsqueeze(0).repeat(batch, 1, 1)
    mat[:, 0, 0] = s_x
    mat[:, 1, 1] = s_y
    mat[:, 2, 2] = s_z

    return mat


def luma_flip_mat(axis, i):
    # Reflection about `axis` (outer-product form), scaled by flip indicator i.
    batch = i.shape[0]

    eye = torch.eye(4).unsqueeze(0).repeat(batch, 1, 1)
    axis = torch.tensor(axis + (0,))
    flip = 2 * torch.ger(axis, axis) * i.view(-1, 1, 1)

    return eye - flip


def saturation_mat(axis, i):
    # Interpolates between projection onto `axis` (i=0) and identity (i=1).
    batch = i.shape[0]

    eye = torch.eye(4).unsqueeze(0).repeat(batch, 1, 1)
    axis = torch.tensor(axis + (0,))
    axis = torch.ger(axis, axis)
    saturate = axis + (eye - axis) * i.view(-1, 1, 1)

    return saturate


def lognormal_sample(size, mean=0, std=1):
    return torch.empty(size).log_normal_(mean=mean, std=std)


def category_sample(size, categories):
    # Uniformly sample `size` values from the given category tuple.
    category = torch.tensor(categories)
    sample = torch.randint(high=len(categories), size=(size,))

    return category[sample]


def uniform_sample(size, low, high):
    return torch.empty(size).uniform_(low, high)


def normal_sample(size, mean=0, std=1):
    return torch.empty(size).normal_(mean, std)


def bernoulli_sample(size, p):
    return torch.empty(size).bernoulli_(p)


def random_mat_apply(p, transform, prev, eye):
    # Per-sample: with probability p left-multiply `transform` onto `prev`,
    # otherwise apply the identity `eye`.
    size = transform.shape[0]
    select = bernoulli_sample(size, p).view(size, 1, 1)
    select_transform = select * transform + (1 - select) * eye

    return select_transform @ prev


def sample_affine(p, size, height, width):
    # Compose a random per-sample 2D affine (3x3 homogeneous) from a fixed
    # sequence of elementary transforms, each applied with probability p.
    G = torch.eye(3).unsqueeze(0).repeat(size, 1, 1)
    eye = G

    # flip
    param = category_sample(size, (0, 1))
    Gc = scale_mat(1 - 2.0 * param, torch.ones(size))
    G = random_mat_apply(p, Gc, G, eye)
    #
print('flip', G, scale_mat(1 - 2.0 * param, torch.ones(size)), sep='\n') # 90 rotate param = category_sample(size, (0, 3)) Gc = rotate_mat(-math.pi / 2 * param) G = random_mat_apply(p, Gc, G, eye) # print('90 rotate', G, rotate_mat(-math.pi / 2 * param), sep='\n') # integer translate param = uniform_sample(size, -0.125, 0.125) param_height = torch.round(param * height) / height param_width = torch.round(param * width) / width Gc = translate_mat(param_width, param_height) G = random_mat_apply(p, Gc, G, eye) # print('integer translate', G, translate_mat(param_width, param_height), sep='\n') # isotropic scale param = lognormal_sample(size, std=0.2 * math.log(2)) Gc = scale_mat(param, param) G = random_mat_apply(p, Gc, G, eye) # print('isotropic scale', G, scale_mat(param, param), sep='\n') p_rot = 1 - math.sqrt(1 - p) # pre-rotate param = uniform_sample(size, -math.pi, math.pi) Gc = rotate_mat(-param) G = random_mat_apply(p_rot, Gc, G, eye) # print('pre-rotate', G, rotate_mat(-param), sep='\n') # anisotropic scale param = lognormal_sample(size, std=0.2 * math.log(2)) Gc = scale_mat(param, 1 / param) G = random_mat_apply(p, Gc, G, eye) # print('anisotropic scale', G, scale_mat(param, 1 / param), sep='\n') # post-rotate param = uniform_sample(size, -math.pi, math.pi) Gc = rotate_mat(-param) G = random_mat_apply(p_rot, Gc, G, eye) # print('post-rotate', G, rotate_mat(-param), sep='\n') # fractional translate param = normal_sample(size, std=0.125) Gc = translate_mat(param, param) G = random_mat_apply(p, Gc, G, eye) # print('fractional translate', G, translate_mat(param, param), sep='\n') return G def sample_color(p, size): C = torch.eye(4).unsqueeze(0).repeat(size, 1, 1) eye = C axis_val = 1 / math.sqrt(3) axis = (axis_val, axis_val, axis_val) # brightness param = normal_sample(size, std=0.2) Cc = translate3d_mat(param, param, param) C = random_mat_apply(p, Cc, C, eye) # contrast param = lognormal_sample(size, std=0.5 * math.log(2)) Cc = scale3d_mat(param, param, param) C 
= random_mat_apply(p, Cc, C, eye)  # NOTE(review): completes `C = ...` from the previous line

# luma flip
param = category_sample(size, (0, 1))
Cc = luma_flip_mat(axis, param)
C = random_mat_apply(p, Cc, C, eye)

# hue rotation
param = uniform_sample(size, -math.pi, math.pi)
Cc = rotate3d_mat(axis, param)
C = random_mat_apply(p, Cc, C, eye)

# saturation
param = lognormal_sample(size, std=1 * math.log(2))
Cc = saturation_mat(axis, param)
C = random_mat_apply(p, Cc, C, eye)

return C


def make_grid(shape, x0, x1, y0, y1, device):
    # Homogeneous sampling grid of shape (n, h, w, 3): x spans [x0, x1] along
    # width, y spans [y0, y1] along height, last coordinate fixed to 1.
    n, c, h, w = shape
    grid = torch.empty(n, h, w, 3, device=device)
    grid[:, :, :, 0] = torch.linspace(x0, x1, w, device=device)
    grid[:, :, :, 1] = torch.linspace(y0, y1, h, device=device).unsqueeze(-1)
    grid[:, :, :, 2] = 1

    return grid


def affine_grid(grid, mat):
    # Apply a batch of 2x3 affines to the homogeneous grid -> (n, h, w, 2).
    n, h, w, _ = grid.shape
    return (grid.view(n, h * w, 3) @ mat.transpose(1, 2)).view(n, h, w, 2)


def get_padding(G, height, width):
    # Per-side padding so the transformed image corners stay inside the canvas:
    # map the four unit-square corners through G and measure the overshoot.
    extreme = (
        G[:, :2, :]
        @ torch.tensor([(-1.0, -1, 1), (-1, 1, 1), (1, -1, 1), (1, 1, 1)]).t()
    )

    size = torch.tensor((width, height))
    pad_low = (
        ((extreme.min(-1).values + 1) * size)
        .clamp(max=0)
        .abs()
        .ceil()
        .max(0)
        .values.to(torch.int64)
        .tolist()
    )
    pad_high = (
        (extreme.max(-1).values * size - size)
        .clamp(min=0)
        .ceil()
        .max(0)
        .values.to(torch.int64)
        .tolist()
    )

    return pad_low[0], pad_high[0], pad_low[1], pad_high[1]


def try_sample_affine_and_pad(img, p, pad_k, G=None):
    # Sample affines until reflect-padding succeeds (F.pad(mode="reflect")
    # raises RuntimeError when the requested pad exceeds the input size).
    batch, _, height, width = img.shape

    G_try = G

    while True:
        if G is None:
            G_try = sample_affine(p, batch, height, width)

        pad_x1, pad_x2, pad_y1, pad_y2 = get_padding(
            torch.inverse(G_try), height, width
        )

        try:
            img_pad = F.pad(
                img,
                (pad_x1 + pad_k, pad_x2 + pad_k, pad_y1 + pad_k, pad_y2 + pad_k),
                mode="reflect",
            )

        except RuntimeError:
            continue

        break

    return img_pad, G_try, (pad_x1, pad_x2, pad_y1, pad_y2)


def random_apply_affine(img, p, G=None, antialiasing_kernel=SYM6):
    # Geometric augmentation: pad, 2x upsample with the antialiasing filter,
    # warp by the sampled affine, then filter and 2x downsample.
    kernel = antialiasing_kernel
    len_k = len(kernel)
    pad_k = (len_k + 1) // 2

    kernel = torch.as_tensor(kernel)
    kernel = torch.ger(kernel, kernel).to(img)  # separable taps -> 2D kernel
    kernel_flip =
torch.flip(kernel, (0, 1))  # NOTE(review): completes `kernel_flip = ...` from the previous line

img_pad, G, (pad_x1, pad_x2, pad_y1, pad_y2) = try_sample_affine_and_pad(
    img, p, pad_k, G
)

p_ux1 = pad_x1
p_ux2 = pad_x2 + 1
p_uy1 = pad_y1
p_uy2 = pad_y2 + 1

# Valid sizes after cropping the filter's border from the padded image.
w_p = img_pad.shape[3] - len_k + 1
h_p = img_pad.shape[2] - len_k + 1

h_o = img.shape[2]
w_o = img.shape[3]

# Upsample 2x with the flipped antialiasing kernel before warping.
img_2x = upfirdn2d(img_pad, kernel_flip, up=2)

grid = make_grid(
    img_2x.shape,
    -2 * p_ux1 / w_o - 1,
    2 * (w_p - p_ux1) / w_o - 1,
    -2 * p_uy1 / h_o - 1,
    2 * (h_p - p_uy1) / h_o - 1,
    device=img_2x.device,
).to(img_2x)
grid = affine_grid(grid, torch.inverse(G)[:, :2, :].to(img_2x))
# Rescale warped coordinates from original-image space to padded space.
grid = grid * torch.tensor(
    [w_o / w_p, h_o / h_p], device=grid.device
) + torch.tensor(
    [(w_o + 2 * p_ux1) / w_p - 1, (h_o + 2 * p_uy1) / h_p - 1], device=grid.device
)

img_affine = F.grid_sample(
    img_2x, grid, mode="bilinear", align_corners=False, padding_mode="zeros"
)

# Filter and downsample back to the original resolution.
img_down = upfirdn2d(img_affine, kernel, down=2)

# Crop the padding; a 0 end index would make the slice empty, so substitute
# the full extent in that case.
end_y = -pad_y2 - 1
if end_y == 0:
    end_y = img_down.shape[2]

end_x = -pad_x2 - 1
if end_x == 0:
    end_x = img_down.shape[3]

img = img_down[:, :, pad_y1:end_y, pad_x1:end_x]

return img, G


def apply_color(img, mat):
    # Apply a 4x4 homogeneous color transform per sample: RGB @ M^T + t.
    batch = img.shape[0]
    img = img.permute(0, 2, 3, 1)
    mat_mul = mat[:, :3, :3].transpose(1, 2).view(batch, 1, 3, 3)
    mat_add = mat[:, :3, 3].view(batch, 1, 1, 3)
    img = img @ mat_mul + mat_add
    img = img.permute(0, 3, 1, 2)

    return img


def random_apply_color(img, p, C=None):
    if C is None:
        C = sample_color(p, img.shape[0])

    img = apply_color(img, C.to(img))

    return img, C


def augment(img, p, transform_matrix=(None, None)):
    # Full pipeline: geometric then color augmentation; returns the matrices
    # so the same transform can be replayed on another batch.
    img, G = random_apply_affine(img, p, transform_matrix[0])
    img, C = random_apply_color(img, p, transform_matrix[1])

    return img, (G, C)


================================================
FILE: models/archs/stylegan2/op/__init__.py
================================================
from .fused_act import FusedLeakyReLU, fused_leaky_relu
from .upfirdn2d import upfirdn2d


================================================
FILE: models/archs/stylegan2/op/fused_act.py
================================================
import os

import torch
from torch import nn
from torch.nn import functional as F
from torch.autograd import Function
from torch.utils.cpp_extension import load

module_path = os.path.dirname(__file__)
# JIT-compile the fused bias+activation CUDA extension at import time.
fused = load(
    "fused",
    sources=[
        os.path.join(module_path, "fused_bias_act.cpp"),
        os.path.join(module_path, "fused_bias_act_kernel.cu"),
    ],
)


class FusedLeakyReLUFunctionBackward(Function):
    """Backward pass of the fused leaky-ReLU, itself differentiable."""

    @staticmethod
    def forward(ctx, grad_output, out, bias, negative_slope, scale):
        ctx.save_for_backward(out)
        ctx.negative_slope = negative_slope
        ctx.scale = scale

        empty = grad_output.new_empty(0)

        # act=3 (leaky relu), grad=1: first-derivative mode of the kernel,
        # gated by the saved forward output `out`.
        grad_input = fused.fused_bias_act(
            grad_output, empty, out, 3, 1, negative_slope, scale
        )

        # Bias gradient sums over batch and all spatial dims, keeping channels.
        dim = [0]

        if grad_input.ndim > 2:
            dim += list(range(2, grad_input.ndim))

        if bias:
            grad_bias = grad_input.sum(dim).detach()

        else:
            grad_bias = empty

        return grad_input, grad_bias

    @staticmethod
    def backward(ctx, gradgrad_input, gradgrad_bias):
        out, = ctx.saved_tensors
        gradgrad_out = fused.fused_bias_act(
            gradgrad_input, gradgrad_bias, out, 3, 1, ctx.negative_slope, ctx.scale
        )

        return gradgrad_out, None, None, None, None


class FusedLeakyReLUFunction(Function):
    """Forward: fused (input + bias) -> leaky ReLU -> * scale on CUDA."""

    @staticmethod
    def forward(ctx, input, bias, negative_slope, scale):
        empty = input.new_empty(0)

        ctx.bias = bias is not None

        if bias is None:
            bias = empty

        # act=3 (leaky relu), grad=0: forward mode.
        out = fused.fused_bias_act(input, bias, empty, 3, 0, negative_slope, scale)
        ctx.save_for_backward(out)
        ctx.negative_slope = negative_slope
        ctx.scale = scale

        return out

    @staticmethod
    def backward(ctx, grad_output):
        out, = ctx.saved_tensors

        grad_input, grad_bias = FusedLeakyReLUFunctionBackward.apply(
            grad_output, out, ctx.bias, ctx.negative_slope, ctx.scale
        )

        if not ctx.bias:
            grad_bias = None

        return grad_input, grad_bias, None, None


class FusedLeakyReLU(nn.Module):
    # Module wrapper holding the learnable per-channel bias.
    def __init__(self, channel, bias=True, negative_slope=0.2, scale=2 ** 0.5):
        super().__init__()

        if bias:
            self.bias = nn.Parameter(torch.zeros(channel))

        else:
            self.bias = None

        self.negative_slope =
negative_slope self.scale = scale def forward(self, input): return fused_leaky_relu(input, self.bias, self.negative_slope, self.scale) def fused_leaky_relu(input, bias=None, negative_slope=0.2, scale=2 ** 0.5): if input.device.type == "cpu": if bias is not None: rest_dim = [1] * (input.ndim - bias.ndim - 1) return ( F.leaky_relu( input + bias.view(1, bias.shape[0], *rest_dim), negative_slope=0.2 ) * scale ) else: return F.leaky_relu(input, negative_slope=0.2) * scale else: return FusedLeakyReLUFunction.apply(input, bias, negative_slope, scale) ================================================ FILE: models/archs/stylegan2/op/fused_bias_act.cpp ================================================ #include torch::Tensor fused_bias_act_op(const torch::Tensor& input, const torch::Tensor& bias, const torch::Tensor& refer, int act, int grad, float alpha, float scale); #define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) torch::Tensor fused_bias_act(const torch::Tensor& input, const torch::Tensor& bias, const torch::Tensor& refer, int act, int grad, float alpha, float scale) { CHECK_CUDA(input); CHECK_CUDA(bias); return fused_bias_act_op(input, bias, refer, act, grad, alpha, scale); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("fused_bias_act", &fused_bias_act, "fused bias act (CUDA)"); } ================================================ FILE: models/archs/stylegan2/op/fused_bias_act_kernel.cu ================================================ // Copyright (c) 2019, NVIDIA Corporation. All rights reserved. // // This work is made available under the Nvidia Source Code License-NC. 
// To view a copy of this license, visit
// https://nvlabs.github.io/stylegan2/license.html

// NOTE(review): the seven #include directives below lost their <...> targets
// during text extraction; restore them from the upstream file before building.
#include
#include
#include
#include
#include
#include
#include

// NOTE(review): "template" lost its <typename scalar_t> parameter list in
// extraction; scalar_t is the element type chosen by AT_DISPATCH below.
template
static __global__ void fused_bias_act_kernel(scalar_t* out, const scalar_t* p_x, const scalar_t* p_b, const scalar_t* p_ref,
    int act, int grad, scalar_t alpha, scalar_t scale, int loop_x, int size_x, int step_b, int size_b, int use_bias, int use_ref) {
    int xi = blockIdx.x * loop_x * blockDim.x + threadIdx.x;

    scalar_t zero = 0.0;

    for (int loop_idx = 0; loop_idx < loop_x && xi < size_x; loop_idx++, xi += blockDim.x) {
        scalar_t x = p_x[xi];

        if (use_bias) {
            x += p_b[(xi / step_b) % size_b];
        }

        scalar_t ref = use_ref ? p_ref[xi] : zero;

        scalar_t y;

        // act*10+grad selects the op: 1x = linear, 3x = leaky ReLU;
        // grad 0/1/2 = forward / first / second derivative.
        switch (act * 10 + grad) {
            default:
            case 10: y = x; break;
            case 11: y = x; break;
            case 12: y = 0.0; break;

            case 30: y = (x > 0.0) ? x : x * alpha; break;
            case 31: y = (ref > 0.0) ? x : x * alpha; break;
            case 32: y = 0.0; break;
        }

        out[xi] = y * scale;
    }
}

torch::Tensor fused_bias_act_op(const torch::Tensor& input, const torch::Tensor& bias, const torch::Tensor& refer,
    int act, int grad, float alpha, float scale) {
    int curDevice = -1;
    cudaGetDevice(&curDevice);
    cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice);

    auto x = input.contiguous();
    auto b = bias.contiguous();
    auto ref = refer.contiguous();

    int use_bias = b.numel() ? 1 : 0;
    int use_ref = ref.numel() ? 1 : 0;

    int size_x = x.numel();
    int size_b = b.numel();
    int step_b = 1;

    // Bias broadcasts over dim 1: stride of the channel index in flat layout.
    for (int i = 1 + 1; i < x.dim(); i++) {
        step_b *= x.size(i);
    }

    int loop_x = 4;
    int block_size = 4 * 32;
    int grid_size = (size_x - 1) / (loop_x * block_size) + 1;

    auto y = torch::empty_like(x);

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "fused_bias_act_kernel", [&] {
        // NOTE(review): the launch configuration inside <<< >>> was stripped
        // to "<<>>" by text extraction; restore from upstream before building.
        fused_bias_act_kernel<<>>(
            y.data_ptr(),
            x.data_ptr(),
            b.data_ptr(),
            ref.data_ptr(),
            act,
            grad,
            alpha,
            scale,
            loop_x,
            size_x,
            step_b,
            size_b,
            use_bias,
            use_ref
        );
    });

    return y;
}

================================================
FILE: models/archs/stylegan2/op/upfirdn2d.cpp
================================================
// NOTE(review): #include target stripped by extraction (upstream: <torch/extension.h>).
#include

torch::Tensor upfirdn2d_op(const torch::Tensor& input, const torch::Tensor& kernel,
                           int up_x, int up_y, int down_x, int down_y,
                           int pad_x0, int pad_x1, int pad_y0, int pad_y1);

#define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

// Python-facing wrapper: validates devices, then dispatches to the CUDA op.
torch::Tensor upfirdn2d(const torch::Tensor& input, const torch::Tensor& kernel,
                        int up_x, int up_y, int down_x, int down_y,
                        int pad_x0, int pad_x1, int pad_y0, int pad_y1) {
    CHECK_CUDA(input);
    CHECK_CUDA(kernel);

    return upfirdn2d_op(input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("upfirdn2d", &upfirdn2d, "upfirdn2d (CUDA)");
}

================================================
FILE: models/archs/stylegan2/op/upfirdn2d.py
================================================
import os

import torch
from torch.nn import functional as F
from torch.autograd import Function
from torch.utils.cpp_extension import load

module_path = os.path.dirname(__file__)
# JIT-compile the upfirdn2d CUDA extension at import time.
upfirdn2d_op = load(
    "upfirdn2d",
    sources=[
        os.path.join(module_path, "upfirdn2d.cpp"),
        os.path.join(module_path, "upfirdn2d_kernel.cu"),
    ],
)


class UpFirDn2dBackward(Function):
@staticmethod
def forward(
    ctx, grad_output, kernel, grad_kernel, up, down, pad, g_pad, in_size, out_size
):
    # The gradient of upfirdn2d is another upfirdn2d with up/down swapped,
    # the flipped kernel, and the precomputed gradient padding `g_pad`.
    up_x, up_y = up
    down_x, down_y = down
    g_pad_x0, g_pad_x1, g_pad_y0, g_pad_y1 = g_pad

    grad_output = grad_output.reshape(-1, out_size[0], out_size[1], 1)

    grad_input = upfirdn2d_op.upfirdn2d(
        grad_output,
        grad_kernel,
        down_x,
        down_y,
        up_x,
        up_y,
        g_pad_x0,
        g_pad_x1,
        g_pad_y0,
        g_pad_y1,
    )
    grad_input = grad_input.view(in_size[0], in_size[1], in_size[2], in_size[3])

    ctx.save_for_backward(kernel)

    pad_x0, pad_x1, pad_y0, pad_y1 = pad

    # Stash forward parameters for the double-backward pass.
    ctx.up_x = up_x
    ctx.up_y = up_y
    ctx.down_x = down_x
    ctx.down_y = down_y
    ctx.pad_x0 = pad_x0
    ctx.pad_x1 = pad_x1
    ctx.pad_y0 = pad_y0
    ctx.pad_y1 = pad_y1
    ctx.in_size = in_size
    ctx.out_size = out_size

    return grad_input

@staticmethod
def backward(ctx, gradgrad_input):
    # Double backward: re-run the original forward upfirdn2d on the incoming
    # gradient of the gradient.
    kernel, = ctx.saved_tensors

    gradgrad_input = gradgrad_input.reshape(-1, ctx.in_size[2], ctx.in_size[3], 1)

    gradgrad_out = upfirdn2d_op.upfirdn2d(
        gradgrad_input,
        kernel,
        ctx.up_x,
        ctx.up_y,
        ctx.down_x,
        ctx.down_y,
        ctx.pad_x0,
        ctx.pad_x1,
        ctx.pad_y0,
        ctx.pad_y1,
    )
    # gradgrad_out = gradgrad_out.view(ctx.in_size[0], ctx.out_size[0], ctx.out_size[1], ctx.in_size[3])
    gradgrad_out = gradgrad_out.view(
        ctx.in_size[0], ctx.in_size[1], ctx.out_size[0], ctx.out_size[1]
    )

    return gradgrad_out, None, None, None, None, None, None, None, None


class UpFirDn2d(Function):
    """Autograd wrapper for the CUDA upsample-FIR-downsample op."""

    @staticmethod
    def forward(ctx, input, kernel, up, down, pad):
        up_x, up_y = up
        down_x, down_y = down
        pad_x0, pad_x1, pad_y0, pad_y1 = pad

        kernel_h, kernel_w = kernel.shape
        batch, channel, in_h, in_w = input.shape
        ctx.in_size = input.shape

        # Kernel layout: fold batch*channel into the major dim, minor dim = 1.
        input = input.reshape(-1, in_h, in_w, 1)

        ctx.save_for_backward(kernel, torch.flip(kernel, [0, 1]))

        out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h) // down_y + 1
        out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w) // down_x + 1
        ctx.out_size = (out_h, out_w)

        ctx.up = (up_x, up_y)
        ctx.down = (down_x, down_y)
        ctx.pad = (pad_x0, pad_x1, pad_y0, pad_y1)

        # Padding for the backward pass (adjoint of the forward resampling).
        g_pad_x0 = kernel_w - pad_x0 - 1
        g_pad_y0 =
kernel_h - pad_y0 - 1  # NOTE(review): completes `g_pad_y0 = ...` from the previous line
g_pad_x1 = in_w * up_x - out_w * down_x + pad_x0 - up_x + 1
g_pad_y1 = in_h * up_y - out_h * down_y + pad_y0 - up_y + 1

ctx.g_pad = (g_pad_x0, g_pad_x1, g_pad_y0, g_pad_y1)

out = upfirdn2d_op.upfirdn2d(
    input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1
)
# out = out.view(major, out_h, out_w, minor)
out = out.view(-1, channel, out_h, out_w)

return out

@staticmethod
def backward(ctx, grad_output):
    kernel, grad_kernel = ctx.saved_tensors

    grad_input = UpFirDn2dBackward.apply(
        grad_output,
        kernel,
        grad_kernel,
        ctx.up,
        ctx.down,
        ctx.pad,
        ctx.g_pad,
        ctx.in_size,
        ctx.out_size,
    )

    return grad_input, None, None, None, None


def upfirdn2d(input, kernel, up=1, down=1, pad=(0, 0)):
    # Public entry point: pure-PyTorch reference on CPU, compiled op on CUDA.
    if input.device.type == "cpu":
        out = upfirdn2d_native(
            input, kernel, up, up, down, down, pad[0], pad[1], pad[0], pad[1]
        )

    else:
        out = UpFirDn2d.apply(
            input, kernel, (up, up), (down, down), (pad[0], pad[1], pad[0], pad[1])
        )

    return out


def upfirdn2d_native(
    input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1
):
    # Reference implementation: upsample by zero-stuffing, pad (or crop for
    # negative pads), FIR-filter with the flipped kernel, then downsample.
    _, channel, in_h, in_w = input.shape
    input = input.reshape(-1, in_h, in_w, 1)

    _, in_h, in_w, minor = input.shape
    kernel_h, kernel_w = kernel.shape

    # Zero-stuff between samples to upsample by (up_x, up_y).
    out = input.view(-1, in_h, 1, in_w, 1, minor)
    out = F.pad(out, [0, 0, 0, up_x - 1, 0, 0, 0, up_y - 1])
    out = out.view(-1, in_h * up_y, in_w * up_x, minor)

    # Positive pads pad with zeros; negative pads crop instead.
    out = F.pad(
        out, [0, 0, max(pad_x0, 0), max(pad_x1, 0), max(pad_y0, 0), max(pad_y1, 0)]
    )
    out = out[
        :,
        max(-pad_y0, 0) : out.shape[1] - max(-pad_y1, 0),
        max(-pad_x0, 0) : out.shape[2] - max(-pad_x1, 0),
        :,
    ]

    out = out.permute(0, 3, 1, 2)
    out = out.reshape(
        [-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1]
    )
    # conv2d performs correlation, so flip the kernel for true convolution.
    w = torch.flip(kernel, [0, 1]).view(1, 1, kernel_h, kernel_w)
    out = F.conv2d(out, w)
    out = out.reshape(
        -1,
        minor,
        in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1,
        in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1,
    )
    out = out.permute(0, 2, 3, 1)
    out = out[:, ::down_y, ::down_x, :]

    out_h = (in_h * up_y + pad_y0 +
pad_y1 - kernel_h) // down_y + 1  # NOTE(review): completes `out_h = ...` from the previous line
out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w) // down_x + 1

return out.view(-1, channel, out_h, out_w)


================================================
FILE: models/archs/stylegan2/op/upfirdn2d_kernel.cu
================================================
// Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
//
// This work is made available under the Nvidia Source Code License-NC.
// To view a copy of this license, visit
// https://nvlabs.github.io/stylegan2/license.html

// NOTE(review): the #include targets, all template parameter lists, and the
// "<<< >>>" kernel launch configurations in this file were stripped by text
// extraction ("template" / "<<>>" below); restore them from the upstream
// file before building.
#include
#include
#include
#include
#include
#include
#include

static __host__ __device__ __forceinline__ int floor_div(int a, int b) {
    // Division rounding toward negative infinity (C's / truncates toward 0).
    int c = a / b;

    if (c * b > a) {
        c--;
    }

    return c;
}

struct UpFirDn2DKernelParams {
    int up_x;
    int up_y;
    int down_x;
    int down_y;

    int pad_x0;
    int pad_x1;
    int pad_y0;
    int pad_y1;

    int major_dim;
    int in_h;
    int in_w;
    int minor_dim;
    int kernel_h;
    int kernel_w;
    int out_h;
    int out_w;
    int loop_major;
    int loop_x;
};

// Generic fallback kernel: each thread computes output pixels directly from
// global memory (no shared-memory tiling); handles arbitrary parameters.
template
__global__ void upfirdn2d_kernel_large(scalar_t *out, const scalar_t *input,
                                       const scalar_t *kernel,
                                       const UpFirDn2DKernelParams p) {
    int minor_idx = blockIdx.x * blockDim.x + threadIdx.x;
    int out_y = minor_idx / p.minor_dim;
    minor_idx -= out_y * p.minor_dim;
    int out_x_base = blockIdx.y * p.loop_x * blockDim.y + threadIdx.y;
    int major_idx_base = blockIdx.z * p.loop_major;

    if (out_x_base >= p.out_w || out_y >= p.out_h ||
        major_idx_base >= p.major_dim) {
        return;
    }

    int mid_y = out_y * p.down_y + p.up_y - 1 - p.pad_y0;
    int in_y = min(max(floor_div(mid_y, p.up_y), 0), p.in_h);
    int h = min(max(floor_div(mid_y + p.kernel_h, p.up_y), 0), p.in_h) - in_y;
    int kernel_y = mid_y + p.kernel_h - (in_y + 1) * p.up_y;

    for (int loop_major = 0, major_idx = major_idx_base;
         loop_major < p.loop_major && major_idx < p.major_dim;
         loop_major++, major_idx++) {
        for (int loop_x = 0, out_x = out_x_base;
             loop_x < p.loop_x && out_x < p.out_w;
             loop_x++, out_x += blockDim.y) {
            int mid_x = out_x * p.down_x + p.up_x - 1 - p.pad_x0;
            int in_x = min(max(floor_div(mid_x, p.up_x), 0), p.in_w);
            int w = min(max(floor_div(mid_x + p.kernel_w, p.up_x), 0), p.in_w) - in_x;
            int kernel_x = mid_x + p.kernel_w - (in_x + 1) * p.up_x;

            const scalar_t *x_p =
                &input[((major_idx * p.in_h + in_y) * p.in_w + in_x) * p.minor_dim + minor_idx];
            const scalar_t *k_p = &kernel[kernel_y * p.kernel_w + kernel_x];
            int x_px = p.minor_dim;
            int k_px = -p.up_x;
            int x_py = p.in_w * p.minor_dim;
            int k_py = -p.up_y * p.kernel_w;

            scalar_t v = 0.0f;

            // Accumulate the 2D FIR tap products over the valid h x w window.
            for (int y = 0; y < h; y++) {
                for (int x = 0; x < w; x++) {
                    // NOTE(review): static_cast target type lost in extraction.
                    v += static_cast(*x_p) * static_cast(*k_p);
                    x_p += x_px;
                    k_p += k_px;
                }

                x_p += x_py - w * x_px;
                k_p += k_py - w * k_px;
            }

            out[((major_idx * p.out_h + out_y) * p.out_w + out_x) * p.minor_dim + minor_idx] = v;
        }
    }
}

// Tiled kernel: stages the kernel and an input tile in shared memory.
// NOTE(review): the stripped template parameters included compile-time
// up/down factors, kernel size, and tile dimensions (up_x, up_y, down_x,
// down_y, kernel_h, kernel_w, tile_out_h, tile_out_w).
template
__global__ void upfirdn2d_kernel(scalar_t *out, const scalar_t *input,
                                 const scalar_t *kernel,
                                 const UpFirDn2DKernelParams p) {
    const int tile_in_h = ((tile_out_h - 1) * down_y + kernel_h - 1) / up_y + 1;
    const int tile_in_w = ((tile_out_w - 1) * down_x + kernel_w - 1) / up_x + 1;

    __shared__ volatile float sk[kernel_h][kernel_w];
    __shared__ volatile float sx[tile_in_h][tile_in_w];

    int minor_idx = blockIdx.x;
    int tile_out_y = minor_idx / p.minor_dim;
    minor_idx -= tile_out_y * p.minor_dim;
    tile_out_y *= tile_out_h;
    int tile_out_x_base = blockIdx.y * p.loop_x * tile_out_w;
    int major_idx_base = blockIdx.z * p.loop_major;

    if (tile_out_x_base >= p.out_w | tile_out_y >= p.out_h |
        major_idx_base >= p.major_dim) {
        return;
    }

    // Cooperatively load the (flipped) kernel into shared memory.
    for (int tap_idx = threadIdx.x; tap_idx < kernel_h * kernel_w;
         tap_idx += blockDim.x) {
        int ky = tap_idx / kernel_w;
        int kx = tap_idx - ky * kernel_w;
        scalar_t v = 0.0;

        if (kx < p.kernel_w & ky < p.kernel_h) {
            v = kernel[(p.kernel_h - 1 - ky) * p.kernel_w + (p.kernel_w - 1 - kx)];
        }

        sk[ky][kx] = v;
    }

    for (int loop_major = 0, major_idx = major_idx_base;
         loop_major < p.loop_major & major_idx < p.major_dim;
         loop_major++, major_idx++) {
        for (int loop_x = 0, tile_out_x = tile_out_x_base;
             loop_x < p.loop_x & tile_out_x < p.out_w;
             loop_x++, tile_out_x += tile_out_w) {
            int tile_mid_x = tile_out_x * down_x + up_x - 1 - p.pad_x0;
            int tile_mid_y = tile_out_y * down_y + up_y - 1 - p.pad_y0;
            int tile_in_x = floor_div(tile_mid_x, up_x);
            int tile_in_y = floor_div(tile_mid_y, up_y);

            __syncthreads();

            // Load the input tile (zero outside the image bounds).
            for (int in_idx = threadIdx.x; in_idx < tile_in_h * tile_in_w;
                 in_idx += blockDim.x) {
                int rel_in_y = in_idx / tile_in_w;
                int rel_in_x = in_idx - rel_in_y * tile_in_w;
                int in_x = rel_in_x + tile_in_x;
                int in_y = rel_in_y + tile_in_y;

                scalar_t v = 0.0;

                if (in_x >= 0 & in_y >= 0 & in_x < p.in_w & in_y < p.in_h) {
                    v = input[((major_idx * p.in_h + in_y) * p.in_w + in_x) * p.minor_dim + minor_idx];
                }

                sx[rel_in_y][rel_in_x] = v;
            }

            __syncthreads();

            // Each thread computes output pixels from the shared tile.
            for (int out_idx = threadIdx.x; out_idx < tile_out_h * tile_out_w;
                 out_idx += blockDim.x) {
                int rel_out_y = out_idx / tile_out_w;
                int rel_out_x = out_idx - rel_out_y * tile_out_w;
                int out_x = rel_out_x + tile_out_x;
                int out_y = rel_out_y + tile_out_y;

                int mid_x = tile_mid_x + rel_out_x * down_x;
                int mid_y = tile_mid_y + rel_out_y * down_y;
                int in_x = floor_div(mid_x, up_x);
                int in_y = floor_div(mid_y, up_y);
                int rel_in_x = in_x - tile_in_x;
                int rel_in_y = in_y - tile_in_y;
                int kernel_x = (in_x + 1) * up_x - mid_x - 1;
                int kernel_y = (in_y + 1) * up_y - mid_y - 1;

                scalar_t v = 0.0;

#pragma unroll
                for (int y = 0; y < kernel_h / up_y; y++)
#pragma unroll
                    for (int x = 0; x < kernel_w / up_x; x++)
                        v += sx[rel_in_y + y][rel_in_x + x] *
                             sk[kernel_y + y * up_y][kernel_x + x * up_x];

                if (out_x < p.out_w & out_y < p.out_h) {
                    out[((major_idx * p.out_h + out_y) * p.out_w + out_x) * p.minor_dim + minor_idx] = v;
                }
            }
        }
    }
}

torch::Tensor upfirdn2d_op(const torch::Tensor &input,
                           const torch::Tensor &kernel, int up_x, int up_y,
                           int down_x, int down_y, int pad_x0, int pad_x1,
                           int pad_y0, int pad_y1) {
    int curDevice = -1;
    cudaGetDevice(&curDevice);
    cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice);

    UpFirDn2DKernelParams p;

    auto x = input.contiguous();
    auto k = kernel.contiguous();

    p.major_dim = x.size(0);
    p.in_h = x.size(1);
    p.in_w = x.size(2);
    p.minor_dim = x.size(3);
    p.kernel_h = k.size(0);
    p.kernel_w = k.size(1);
    p.up_x = up_x;
    p.up_y = up_y;
    p.down_x = down_x;
    p.down_y = down_y;
    p.pad_x0 = pad_x0;
    p.pad_x1 = pad_x1;
    p.pad_y0 = pad_y0;
    p.pad_y1 = pad_y1;

    p.out_h = (p.in_h * p.up_y + p.pad_y0 + p.pad_y1 - p.kernel_h + p.down_y) / p.down_y;
    p.out_w = (p.in_w * p.up_x + p.pad_x0 + p.pad_x1 - p.kernel_w + p.down_x) / p.down_x;

    auto out = at::empty({p.major_dim, p.out_h, p.out_w, p.minor_dim}, x.options());

    // Pick a specialized tiled kernel for the common up/down x kernel-size
    // combinations; everything else falls back to upfirdn2d_kernel_large.
    int mode = -1;

    int tile_out_h = -1;
    int tile_out_w = -1;

    if (p.up_x == 1 && p.up_y == 1 && p.down_x == 1 && p.down_y == 1 &&
        p.kernel_h <= 4 && p.kernel_w <= 4) {
        mode = 1;
        tile_out_h = 16;
        tile_out_w = 64;
    }

    if (p.up_x == 1 && p.up_y == 1 && p.down_x == 1 && p.down_y == 1 &&
        p.kernel_h <= 3 && p.kernel_w <= 3) {
        mode = 2;
        tile_out_h = 16;
        tile_out_w = 64;
    }

    if (p.up_x == 2 && p.up_y == 2 && p.down_x == 1 && p.down_y == 1 &&
        p.kernel_h <= 4 && p.kernel_w <= 4) {
        mode = 3;
        tile_out_h = 16;
        tile_out_w = 64;
    }

    if (p.up_x == 2 && p.up_y == 2 && p.down_x == 1 && p.down_y == 1 &&
        p.kernel_h <= 2 && p.kernel_w <= 2) {
        mode = 4;
        tile_out_h = 16;
        tile_out_w = 64;
    }

    if (p.up_x == 1 && p.up_y == 1 && p.down_x == 2 && p.down_y == 2 &&
        p.kernel_h <= 4 && p.kernel_w <= 4) {
        mode = 5;
        tile_out_h = 8;
        tile_out_w = 32;
    }

    if (p.up_x == 1 && p.up_y == 1 && p.down_x == 2 && p.down_y == 2 &&
        p.kernel_h <= 2 && p.kernel_w <= 2) {
        mode = 6;
        tile_out_h = 8;
        tile_out_w = 32;
    }

    dim3 block_size;
    dim3 grid_size;

    if (tile_out_h > 0 && tile_out_w > 0) {
        p.loop_major = (p.major_dim - 1) / 16384 + 1;
        p.loop_x = 1;
        block_size = dim3(32 * 8, 1, 1);
        grid_size = dim3(((p.out_h - 1) / tile_out_h + 1) * p.minor_dim,
                         (p.out_w - 1) / (p.loop_x * tile_out_w) + 1,
                         (p.major_dim - 1) / p.loop_major + 1);
    } else {
        p.loop_major = (p.major_dim - 1) / 16384 + 1;
        p.loop_x = 4;
        block_size = dim3(4, 32, 1);
        grid_size = dim3((p.out_h * p.minor_dim - 1) / block_size.x + 1,
                         (p.out_w - 1) / (p.loop_x * block_size.y) + 1,
                         (p.major_dim - 1) / p.loop_major + 1);
    }

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "upfirdn2d_cuda", [&] {
        // NOTE(review): template argument lists and launch configurations
        // ("<<>>") below were stripped by text extraction.
        switch (mode) {
        case 1:
            upfirdn2d_kernel
                <<>>(out.data_ptr(), x.data_ptr(), k.data_ptr(), p);

            break;

        case 2:
            upfirdn2d_kernel
                <<>>(out.data_ptr(), x.data_ptr(), k.data_ptr(), p);

            break;

        case 3:
            upfirdn2d_kernel
                <<>>(out.data_ptr(), x.data_ptr(), k.data_ptr(), p);

            break;

        case 4:
            upfirdn2d_kernel
                <<>>(out.data_ptr(), x.data_ptr(), k.data_ptr(), p);

            break;

        case 5:
            upfirdn2d_kernel
                <<>>(out.data_ptr(), x.data_ptr(), k.data_ptr(), p);

            break;

        case 6:
            upfirdn2d_kernel
                <<>>(out.data_ptr(), x.data_ptr(), k.data_ptr(), p);

            break;

        default:
            upfirdn2d_kernel_large<<>>(
                out.data_ptr(), x.data_ptr(), k.data_ptr(), p);
        }
    });

    return out;
}

================================================
FILE: models/archs/stylegan2/ppl.py
================================================
import argparse

import torch
from torch.nn import functional as F
import numpy as np
from tqdm import tqdm

import lpips
from model import Generator


def normalize(x):
    # L2-normalize along the last dimension.
    return x / torch.sqrt(x.pow(2).sum(-1, keepdim=True))


def slerp(a, b, t):
    # Spherical linear interpolation between a and b (both normalized first).
    a = normalize(a)
    b = normalize(b)
    d = (a * b).sum(-1, keepdim=True)
    p = t * torch.acos(d)
    c = normalize(b - d * a)
    d = a * torch.cos(p) + c * torch.sin(p)

    return normalize(d)


def lerp(a, b, t):
    return a + (b - a) * t


if __name__ == "__main__":
    device = "cuda"

    parser = argparse.ArgumentParser(description="Perceptual Path Length calculator")

    parser.add_argument(
        "--space", choices=["z", "w"], help="space that PPL calculated with"
    )
    parser.add_argument(
        "--batch", type=int, default=64, help="batch size for the models"
    )
    parser.add_argument(
        "--n_sample",
        type=int,
        default=5000,
        help="number of the samples for calculating PPL",
    )
    parser.add_argument(
        "--size", type=int, default=256, help="output image sizes of the generator"
    )
    parser.add_argument(
        "--eps", type=float, default=1e-4, help="epsilon for
numerical stability" ) parser.add_argument( "--crop", action="store_true", help="apply center crop to the images" ) parser.add_argument( "--sampling", default="end", choices=["end", "full"], help="set endpoint sampling method", ) parser.add_argument( "ckpt", metavar="CHECKPOINT", help="path to the model checkpoints" ) args = parser.parse_args() latent_dim = 512 ckpt = torch.load(args.ckpt) g = Generator(args.size, latent_dim, 8).to(device) g.load_state_dict(ckpt["g_ema"]) g.eval() percept = lpips.PerceptualLoss( model="net-lin", net="vgg", use_gpu=device.startswith("cuda") ) distances = [] n_batch = args.n_sample // args.batch resid = args.n_sample - (n_batch * args.batch) batch_sizes = [args.batch] * n_batch + [resid] with torch.no_grad(): for batch in tqdm(batch_sizes): noise = g.make_noise() inputs = torch.randn([batch * 2, latent_dim], device=device) if args.sampling == "full": lerp_t = torch.rand(batch, device=device) else: lerp_t = torch.zeros(batch, device=device) if args.space == "w": latent = g.get_latent(inputs) latent_t0, latent_t1 = latent[::2], latent[1::2] latent_e0 = lerp(latent_t0, latent_t1, lerp_t[:, None]) latent_e1 = lerp(latent_t0, latent_t1, lerp_t[:, None] + args.eps) latent_e = torch.stack([latent_e0, latent_e1], 1).view(*latent.shape) image, _ = g([latent_e], input_is_latent=True, noise=noise) if args.crop: c = image.shape[2] // 8 image = image[:, :, c * 3 : c * 7, c * 2 : c * 6] factor = image.shape[2] // 256 if factor > 1: image = F.interpolate( image, size=(256, 256), mode="bilinear", align_corners=False ) dist = percept(image[::2], image[1::2]).view(image.shape[0] // 2) / ( args.eps ** 2 ) distances.append(dist.to("cpu").numpy()) distances = np.concatenate(distances, 0) lo = np.percentile(distances, 1, interpolation="lower") hi = np.percentile(distances, 99, interpolation="higher") filtered_dist = np.extract( np.logical_and(lo <= distances, distances <= hi), distances ) print("ppl:", filtered_dist.mean()) 
def data_sampler(dataset, shuffle, distributed):
    """Pick the torch sampler matching the (shuffle, distributed) setup."""
    if distributed:
        return data.distributed.DistributedSampler(dataset, shuffle=shuffle)
    return data.RandomSampler(dataset) if shuffle \
        else data.SequentialSampler(dataset)


def requires_grad(model, flag=True):
    """Enable or disable gradient tracking for every parameter of `model`."""
    for param in model.parameters():
        param.requires_grad = flag


def accumulate(model1, model2, decay=0.999):
    """EMA update: model1 <- decay * model1 + (1 - decay) * model2."""
    ema_params = dict(model1.named_parameters())
    src_params = dict(model2.named_parameters())
    for name, ema_param in ema_params.items():
        ema_param.data.mul_(decay).add_(src_params[name].data, alpha=1 - decay)


def sample_data(loader):
    """Yield batches from `loader` forever (restarting it at each epoch)."""
    while True:
        yield from loader


def d_logistic_loss(real_pred, fake_pred):
    """Non-saturating logistic discriminator loss."""
    loss_real = F.softplus(-real_pred).mean()
    loss_fake = F.softplus(fake_pred).mean()
    return loss_real + loss_fake


def d_r1_loss(real_pred, real_img):
    """R1 penalty: mean squared gradient norm of D at the real images."""
    grad_real, = autograd.grad(
        outputs=real_pred.sum(), inputs=real_img, create_graph=True)
    per_sample = grad_real.pow(2).reshape(grad_real.shape[0], -1).sum(1)
    return per_sample.mean()


def g_nonsaturating_loss(fake_pred):
    """Non-saturating generator loss: softplus(-D(G(z)))."""
    return F.softplus(-fake_pred).mean()


def g_path_regularize(fake_img, latents, mean_path_length, decay=0.01):
    """StyleGAN2 path-length regularizer.

    Returns a (penalty, updated running mean, per-sample path lengths)
    triple; the running mean is detached from the graph.
    """
    img_h, img_w = fake_img.shape[2], fake_img.shape[3]
    jitter = torch.randn_like(fake_img) / math.sqrt(img_h * img_w)
    grad, = autograd.grad(
        outputs=(fake_img * jitter).sum(), inputs=latents, create_graph=True)
    path_lengths = torch.sqrt(grad.pow(2).sum(2).mean(1))

    new_mean = mean_path_length + decay * (
        path_lengths.mean() - mean_path_length)
    penalty = (path_lengths - new_mean).pow(2).mean()
    return penalty, new_mean.detach(), path_lengths


def make_noise(batch, latent_dim, n_noise, device):
    """One latent tensor when n_noise == 1, else a tuple of n_noise tensors."""
    if n_noise == 1:
        return torch.randn(batch, latent_dim, device=device)
    return torch.randn(n_noise, batch, latent_dim, device=device).unbind(0)


def mixing_noise(batch, latent_dim, prob, device):
    """With probability `prob` return two latents (style mixing), else one."""
    use_mixing = prob > 0 and random.random() < prob
    if use_mixing:
        return make_noise(batch, latent_dim, 2, device)
    return [make_noise(batch, latent_dim, 1, device)]


def set_grad_none(model, targets):
    """Clear .grad on every named parameter whose name appears in `targets`."""
    for name, param in model.named_parameters():
        if name in targets:
            param.grad = None
requires_grad(discriminator, True) noise = mixing_noise(args.batch, args.latent, args.mixing, device) fake_img, _ = generator(noise) if args.augment: real_img_aug, _ = augment(real_img, ada_aug_p) fake_img, _ = augment(fake_img, ada_aug_p) else: real_img_aug = real_img fake_pred = discriminator(fake_img) real_pred = discriminator(real_img_aug) d_loss = d_logistic_loss(real_pred, fake_pred) loss_dict["d"] = d_loss loss_dict["real_score"] = real_pred.mean() loss_dict["fake_score"] = fake_pred.mean() discriminator.zero_grad() d_loss.backward() d_optim.step() if args.augment and args.augment_p == 0: ada_aug_p = ada_augment.tune(real_pred) r_t_stat = ada_augment.r_t_stat d_regularize = i % args.d_reg_every == 0 if d_regularize: real_img.requires_grad = True real_pred = discriminator(real_img) r1_loss = d_r1_loss(real_pred, real_img) discriminator.zero_grad() (args.r1 / 2 * r1_loss * args.d_reg_every + 0 * real_pred[0]).backward() d_optim.step() loss_dict["r1"] = r1_loss requires_grad(generator, True) requires_grad(discriminator, False) noise = mixing_noise(args.batch, args.latent, args.mixing, device) fake_img, _ = generator(noise) if args.augment: fake_img, _ = augment(fake_img, ada_aug_p) fake_pred = discriminator(fake_img) g_loss = g_nonsaturating_loss(fake_pred) loss_dict["g"] = g_loss generator.zero_grad() g_loss.backward() g_optim.step() g_regularize = i % args.g_reg_every == 0 if g_regularize: path_batch_size = max(1, args.batch // args.path_batch_shrink) noise = mixing_noise(path_batch_size, args.latent, args.mixing, device) fake_img, latents = generator(noise, return_latents=True) path_loss, mean_path_length, path_lengths = g_path_regularize( fake_img, latents, mean_path_length ) generator.zero_grad() weighted_path_loss = args.path_regularize * args.g_reg_every * path_loss if args.path_batch_shrink: weighted_path_loss += 0 * fake_img[0, 0, 0, 0] weighted_path_loss.backward() g_optim.step() mean_path_length_avg = ( reduce_sum(mean_path_length).item() / 
get_world_size() ) loss_dict["path"] = path_loss loss_dict["path_length"] = path_lengths.mean() accumulate(g_ema, g_module, accum) loss_reduced = reduce_loss_dict(loss_dict) d_loss_val = loss_reduced["d"].mean().item() g_loss_val = loss_reduced["g"].mean().item() r1_val = loss_reduced["r1"].mean().item() path_loss_val = loss_reduced["path"].mean().item() real_score_val = loss_reduced["real_score"].mean().item() fake_score_val = loss_reduced["fake_score"].mean().item() path_length_val = loss_reduced["path_length"].mean().item() if get_rank() == 0: pbar.set_description( ( f"d: {d_loss_val:.4f}; g: {g_loss_val:.4f}; r1: {r1_val:.4f}; " f"path: {path_loss_val:.4f}; mean path: {mean_path_length_avg:.4f}; " f"augment: {ada_aug_p:.4f}" ) ) if wandb and args.wandb: wandb.log( { "Generator": g_loss_val, "Discriminator": d_loss_val, "Augment": ada_aug_p, "Rt": r_t_stat, "R1": r1_val, "Path Length Regularization": path_loss_val, "Mean Path Length": mean_path_length, "Real Score": real_score_val, "Fake Score": fake_score_val, "Path Length": path_length_val, } ) if i % 100 == 0: with torch.no_grad(): g_ema.eval() sample, _ = g_ema([sample_z]) utils.save_image( sample, f"sample/{str(i).zfill(6)}.png", nrow=int(args.n_sample ** 0.5), normalize=True, range=(-1, 1), ) if i % 10000 == 0: torch.save( { "g": g_module.state_dict(), "d": d_module.state_dict(), "g_ema": g_ema.state_dict(), "g_optim": g_optim.state_dict(), "d_optim": d_optim.state_dict(), "args": args, "ada_aug_p": ada_aug_p, }, f"checkpoint/{str(i).zfill(6)}.pt", ) if __name__ == "__main__": device = "cuda" parser = argparse.ArgumentParser(description="StyleGAN2 trainer") parser.add_argument("path", type=str, help="path to the lmdb dataset") parser.add_argument( "--iter", type=int, default=800000, help="total training iterations" ) parser.add_argument( "--batch", type=int, default=16, help="batch sizes for each gpus" ) parser.add_argument( "--n_sample", type=int, default=64, help="number of the samples generated during 
training", ) parser.add_argument( "--size", type=int, default=256, help="image sizes for the model" ) parser.add_argument( "--r1", type=float, default=10, help="weight of the r1 regularization" ) parser.add_argument( "--path_regularize", type=float, default=2, help="weight of the path length regularization", ) parser.add_argument( "--path_batch_shrink", type=int, default=2, help="batch size reducing factor for the path length regularization (reduce memory consumption)", ) parser.add_argument( "--d_reg_every", type=int, default=16, help="interval of the applying r1 regularization", ) parser.add_argument( "--g_reg_every", type=int, default=4, help="interval of the applying path length regularization", ) parser.add_argument( "--mixing", type=float, default=0.9, help="probability of latent code mixing" ) parser.add_argument( "--ckpt", type=str, default=None, help="path to the checkpoints to resume training", ) parser.add_argument("--lr", type=float, default=0.002, help="learning rate") parser.add_argument( "--channel_multiplier", type=int, default=2, help="channel multiplier factor for the model. config-f = 2, else = 1", ) parser.add_argument( "--wandb", action="store_true", help="use weights and biases logging" ) parser.add_argument( "--local_rank", type=int, default=0, help="local rank for distributed training" ) parser.add_argument( "--augment", action="store_true", help="apply non leaking augmentation" ) parser.add_argument( "--augment_p", type=float, default=0, help="probability of applying augmentation. 
0 = use adaptive augmentation", ) parser.add_argument( "--ada_target", type=float, default=0.6, help="target augmentation probability for adaptive augmentation", ) parser.add_argument( "--ada_length", type=int, default=500 * 1000, help="target duraing to reach augmentation probability for adaptive augmentation", ) parser.add_argument( "--ada_every", type=int, default=256, help="probability update interval of the adaptive augmentation", ) args = parser.parse_args() n_gpu = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 args.distributed = n_gpu > 1 if args.distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method="env://") synchronize() args.latent = 512 args.n_mlp = 8 args.start_iter = 0 generator = Generator( args.size, args.latent, args.n_mlp, channel_multiplier=args.channel_multiplier ).to(device) discriminator = Discriminator( args.size, channel_multiplier=args.channel_multiplier ).to(device) g_ema = Generator( args.size, args.latent, args.n_mlp, channel_multiplier=args.channel_multiplier ).to(device) g_ema.eval() accumulate(g_ema, generator, 0) g_reg_ratio = args.g_reg_every / (args.g_reg_every + 1) d_reg_ratio = args.d_reg_every / (args.d_reg_every + 1) g_optim = optim.Adam( generator.parameters(), lr=args.lr * g_reg_ratio, betas=(0 ** g_reg_ratio, 0.99 ** g_reg_ratio), ) d_optim = optim.Adam( discriminator.parameters(), lr=args.lr * d_reg_ratio, betas=(0 ** d_reg_ratio, 0.99 ** d_reg_ratio), ) if args.ckpt is not None: print("load model:", args.ckpt) ckpt = torch.load(args.ckpt, map_location=lambda storage, loc: storage) try: ckpt_name = os.path.basename(args.ckpt) args.start_iter = int(os.path.splitext(ckpt_name)[0]) except ValueError: pass generator.load_state_dict(ckpt["g"]) discriminator.load_state_dict(ckpt["d"]) g_ema.load_state_dict(ckpt["g_ema"]) g_optim.load_state_dict(ckpt["g_optim"]) d_optim.load_state_dict(ckpt["d_optim"]) if args.distributed: generator = 
nn.parallel.DistributedDataParallel( generator, device_ids=[args.local_rank], output_device=args.local_rank, broadcast_buffers=False, ) discriminator = nn.parallel.DistributedDataParallel( discriminator, device_ids=[args.local_rank], output_device=args.local_rank, broadcast_buffers=False, ) transform = transforms.Compose( [ transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True), ] ) dataset = MultiResolutionDataset(args.path, transform, args.size) loader = data.DataLoader( dataset, batch_size=args.batch, sampler=data_sampler(dataset, shuffle=True, distributed=args.distributed), drop_last=True, ) if get_rank() == 0 and wandb is not None and args.wandb: wandb.init(project="stylegan 2") train(args, loader, generator, discriminator, g_optim, d_optim, g_ema, device) ================================================ FILE: models/base_model.py ================================================ import logging import math from collections import OrderedDict import cv2 import matplotlib.image as mpimg import matplotlib.pyplot as plt import torch import torch.nn as nn from models.archs.attribute_predictor_arch import resnet50 from models.archs.field_function_arch import FieldFunction from models.archs.stylegan2.model import Generator from models.losses.arcface_loss import ArcFaceLoss from models.losses.discriminator_loss import DiscriminatorLoss from models.utils import (postprocess, predictor_to_label, save_image, transform_image) logger = logging.getLogger('base') class BaseModel(): """Base model. 
""" def __init__(self, opt): self.opt = opt self.device = torch.device('cuda') self.is_train = opt['is_train'] self.target_attr_idx = opt['attr_dict'][opt['attribute']] # define stylegan generator self.stylegan_gen = Generator( size=opt['img_res'], style_dim=opt['latent_dim'], n_mlp=opt['n_mlp'], channel_multiplier=opt['channel_multiplier']).to(self.device) self.truncation = 1.0 self.truncation_latent = None self.randomize_noise = False if opt['latent_space'] == 'z': self.input_is_latent = False self.latent_code_is_w_space = False else: self.input_is_latent = True self.latent_code_is_w_space = True self.transform_z_to_w = opt.get('transform_z_to_w', True) if opt['img_res'] == 128: self.w_space_channel_num = 12 logger.info( f'Loading stylegan model from: {opt["generator_ckpt"]}') checkpoint = torch.load(opt['generator_ckpt']) self.stylegan_gen.load_state_dict(checkpoint["g_ema"], strict=True) self.img_resize = False elif opt['img_res'] == 1024: self.w_space_channel_num = 18 logger.info( f'Loading stylegan model from: {opt["generator_ckpt"]}') checkpoint = torch.load(opt['generator_ckpt']) self.stylegan_gen.load_state_dict(checkpoint, strict=True) self.img_resize = True # define attribute predictor self.predictor = resnet50(attr_file=opt['attr_file']) self.predictor = self.predictor.to(self.device) logger.info(f'Loading model from: {opt["predictor_ckpt"]}') checkpoint = torch.load(opt['predictor_ckpt']) self.predictor.load_state_dict(checkpoint['state_dict'], strict=True) self.predictor.eval() # define field function self.field_function = FieldFunction( num_layer=opt['num_layer'], latent_dim=512, hidden_dim=opt['hidden_dim'], leaky_relu_neg_slope=opt['leaky_relu_neg_slope']) self.field_function = self.field_function.to(self.device) self.fix_layers = False if self.is_train: self.init_training_settings() self.log_dict = OrderedDict() def init_training_settings(self): # set up optimizers self.optimizer = torch.optim.Adam( self.field_function.parameters(), 
self.opt['lr'], weight_decay=self.opt['weight_decay']) # define loss functions # predictor loss self.criterion_predictor = nn.CrossEntropyLoss(reduction='mean') # arcface loss if self.opt['arcface_weight'] > 0: self.criterion_arcface = ArcFaceLoss( self.opt['pretrained_arcface'], self.opt['arcface_loss_type']) else: self.criterion_arcface = None # discriminator loss if self.opt['arcface_weight'] > 0: self.criterion_disc = DiscriminatorLoss( self.opt['discriminator_ckpt'], self.opt['img_res']) else: self.criterion_disc = None def feed_data(self, data): self.original_latent_code = data[0].to(self.device) self.original_label = data[1].to(self.device) self.gt_label = self.original_label.clone() self.gt_label[:, self.target_attr_idx] = \ self.gt_label[:, self.target_attr_idx] + 1 def optimize_parameters(self): self.field_function.train() if self.latent_code_is_w_space and self.transform_z_to_w: # translate original z space latent code to w space with torch.no_grad(): original_latent_code = self.stylegan_gen.get_latent( self.original_latent_code) else: original_latent_code = self.original_latent_code # modify latent code via field function edited_dict = self.modify_latent_code(original_latent_code) edited_image = self.synthesize_image(edited_dict['edited_latent_code']) predictor_output = self.predictor( transform_image(edited_image, self.img_resize)) # compute loss function loss_total = 0 assert self.opt['num_attr'] == len(predictor_output) loss_list = [] # iterate over each attribute for attr_idx in range(self.opt['num_attr']): loss_attr = self.criterion_predictor(predictor_output[attr_idx], self.gt_label[:, attr_idx]) if attr_idx == self.target_attr_idx: loss_attr = loss_attr * self.opt['edited_attribute_weight'] loss_list.append(loss_attr) predictor_loss = sum(loss_list) / len(loss_list) self.log_dict['predictor_loss'] = predictor_loss loss_total += predictor_loss if self.criterion_arcface is not None: original_image = self.synthesize_image(original_latent_code) 
arcface_loss = self.criterion_arcface(original_image, edited_image, self.img_resize) loss_total += self.opt['arcface_weight'] * arcface_loss self.log_dict['arcface_loss'] = arcface_loss if self.opt['disc_weight'] > 0: disc_loss = self.criterion_disc(edited_image) loss_total += disc_loss * self.opt['disc_weight'] self.log_dict['disc_loss'] = disc_loss self.optimizer.zero_grad() loss_total.backward() self.optimizer.step() self.log_dict['loss_total'] = loss_total def get_current_log(self): return self.log_dict def update_learning_rate(self, epoch): """Update learning rate. Args: current_iter (int): Current iteration. warmup_iter (int): Warmup iter numbers. -1 for no warmup. Default: -1. """ lr = self.optimizer.param_groups[0]['lr'] if self.opt['lr_decay'] == 'step': lr = self.opt['lr'] * ( self.opt['gamma']**(epoch // self.opt['step'])) elif self.opt['lr_decay'] == 'cos': lr = self.opt['lr'] * ( 1 + math.cos(math.pi * epoch / self.opt['num_epochs'])) / 2 elif self.opt['lr_decay'] == 'linear': lr = self.opt['lr'] * (1 - epoch / self.opt['num_epochs']) elif self.opt['lr_decay'] == 'linear2exp': if epoch < self.opt['turning_point'] + 1: # learning rate decay as 95% # at the turning point (1 / 95% = 1.0526) lr = self.opt['lr'] * ( 1 - epoch / int(self.opt['turning_point'] * 1.0526)) else: lr *= self.opt['gamma'] elif self.opt['lr_decay'] == 'schedule': if epoch in self.opt['schedule']: lr *= self.opt['gamma'] else: raise ValueError('Unknown lr mode {}'.format(self.opt['lr_decay'])) # set learning rate for param_group in self.optimizer.param_groups: param_group['lr'] = lr return lr def save_network(self, net, save_path): """Save networks. Args: net (nn.Module): Network to be saved. net_label (str): Network label. current_iter (int): Current iter number. 
""" state_dict = net.state_dict() torch.save(state_dict, save_path) def load_network(self, pretrained_field): checkpoint = torch.load(pretrained_field) self.field_function.load_state_dict(checkpoint, strict=True) self.field_function.eval() def synthesize_image(self, sample_latent_code): synthesized_img, _ = self.stylegan_gen( [sample_latent_code], truncation=self.truncation, input_is_latent=self.input_is_latent, truncation_latent=self.truncation_latent, randomize_noise=self.randomize_noise) return synthesized_img def synthesize_and_predict(self, sample_latent_code): synthesized_img = self.synthesize_image(sample_latent_code) current_predictor_output = self.predictor( transform_image(synthesized_img, self.img_resize)) predicted_label, predicted_score = predictor_to_label( current_predictor_output) return synthesized_img, predicted_label, predicted_score def inference(self, batch_idx, epoch, save_dir): self.field_function.eval() assert self.original_latent_code.size()[0] == 1 if self.latent_code_is_w_space and self.transform_z_to_w: # translate original z space latent code to w space with torch.no_grad(): original_latent_code = self.stylegan_gen.get_latent( self.original_latent_code) else: original_latent_code = self.original_latent_code with torch.no_grad(): original_image = self.synthesize_image(original_latent_code) original_image = postprocess(original_image.cpu().detach().numpy()) # field function mapping with torch.no_grad(): return_dict = self.modify_latent_code(original_latent_code) with torch.no_grad(): edited_image, edited_label, _ = self.synthesize_and_predict( return_dict['edited_latent_code']) edited_image = postprocess(edited_image.cpu().detach().numpy()) concat_images = cv2.hconcat([original_image[0], edited_image[0]]) save_image( concat_images, f'{save_dir}/{batch_idx:03d}_epoch_{epoch:03d}_{self.opt["exp_name"]}_original_{self.original_label[0][self.target_attr_idx]}_edited_{edited_label[self.target_attr_idx]}.png', # noqa need_post_process=False) 
self.field_function.train() def continuous_editing(self, latent_codes, save_dir, editing_logger): total_num = latent_codes.shape[0] for sample_id in range(total_num): sample_latent_code = torch.from_numpy( latent_codes[sample_id:sample_id + 1]).to( torch.device('cuda')) if self.latent_code_is_w_space and self.transform_z_to_w: # translate original z space latent code to w space with torch.no_grad(): sample_latent_code = self.stylegan_gen.get_latent( sample_latent_code) # synthesize with torch.no_grad(): original_image, start_label, start_score = \ self.synthesize_and_predict(sample_latent_code) target_attr_label = int(start_label[self.target_attr_idx]) target_score = start_score[self.target_attr_idx] save_name = f'{sample_id:03d}_num_edits_0_class_{target_attr_label}.png' # noqa save_image(original_image, f'{save_dir}/{save_name}') editing_logger.info(f'{save_name}: {start_label}, {start_score}') # skip images with low confidence if target_score < self.opt['confidence_thresh']: editing_logger.info( f'Sample {sample_id:03d} is not confident, skip.') continue # skip images that are already the max_cls_num if target_attr_label == self.opt['max_cls_num']: editing_logger.info( f'Sample {sample_id:03d} is already the max_cls_num, skip.' 
) continue num_trials = 0 num_edits = 0 current_stage_scores_list = [] current_stage_labels_list = [] current_stage_images_list = [] current_stage_target_scores_list = [] previous_target_attr_label = target_attr_label if self.fix_layers: edited_latent_code = sample_latent_code.unsqueeze(1).repeat( 1, self.w_space_channel_num, 1) while target_attr_label < self.opt['max_cls_num']: num_trials += 1 with torch.no_grad(): # modify sampled latent code if self.fix_layers: # for fix layers, the input to the field_function is w # space, but the input to the stylegan is w plus space edited_dict = self.modify_latent_code( sample_latent_code, edited_latent_code) sample_latent_code = sample_latent_code + edited_dict[ 'field'] edited_latent_code = edited_dict['edited_latent_code'] else: # for other modes, the input to the field function and # stylegan are same (both w space or z space) edited_dict = self.modify_latent_code( sample_latent_code) sample_latent_code = edited_dict['edited_latent_code'] edited_image, edited_label, edited_score = \ self.synthesize_and_predict(edited_dict['edited_latent_code']) # noqa target_attr_label = edited_label[self.target_attr_idx] target_attr_score = edited_score[self.target_attr_idx] if target_attr_label != previous_target_attr_label: num_edits += 1 if num_edits > 0: if target_attr_label == previous_target_attr_label: current_stage_images_list.append(edited_image) current_stage_labels_list.append(edited_label) current_stage_scores_list.append(edited_score) current_stage_target_scores_list.append( target_attr_score) else: if num_edits > 1: # save images for previous stage max_value = max(current_stage_target_scores_list) max_index = current_stage_target_scores_list.index( max_value) saved_image = current_stage_images_list[max_index] saved_label = current_stage_labels_list[max_index] saved_score = current_stage_scores_list[max_index] save_name = f'{sample_id:03d}_num_edits_{num_edits-1}_class_{previous_target_attr_label}.png' # noqa 
                        save_image(saved_image, f'{save_dir}/{save_name}')
                        editing_logger.info(
                            f'{save_name}: {saved_label}, {saved_score}')
                        # start a fresh stage buffer
                        current_stage_images_list = []
                        current_stage_labels_list = []
                        current_stage_scores_list = []
                        current_stage_target_scores_list = []
                        num_trials = 0
                    current_stage_images_list.append(edited_image)
                    current_stage_labels_list.append(edited_label)
                    current_stage_scores_list.append(edited_score)
                    current_stage_target_scores_list.append(
                        target_attr_score)
                previous_target_attr_label = target_attr_label
                if self.opt['print_every']:
                    save_name = f'{sample_id:03d}_num_edits_{num_edits}_num_trials_{num_trials}_class_{target_attr_label}.png'  # noqa
                    # NOTE(review): `saved_image` is a variable holding an
                    # image tensor, not the `save_image` helper -- calling it
                    # raises TypeError; presumably a typo for save_image(...).
                    saved_image(edited_image, f'{save_dir}/{save_name}')
                    editing_logger.info(
                        f'{save_name}: {edited_label}, {edited_score}')
                if num_trials > self.opt['max_trials_num']:
                    editing_logger.info('Maximum edits num reached.')
                    break
            if num_edits > 0:
                # save images for previous stage: keep the sample with the
                # highest target-attribute confidence in the last stage
                max_value = max(current_stage_target_scores_list)
                max_index = current_stage_target_scores_list.index(max_value)
                saved_image = current_stage_images_list[max_index]
                saved_label = current_stage_labels_list[max_index]
                saved_score = current_stage_scores_list[max_index]
                save_name = f'{sample_id:03d}_num_edits_{num_edits}_class_{previous_target_attr_label}.png'  # noqa
                save_image(saved_image, f'{save_dir}/{save_name}')
                editing_logger.info(
                    f'{save_name}: {saved_label}, {saved_score}')
            editing_logger.info(f'{sample_id:03d}: Finish editing.')

    def continuous_editing_with_target(self,
                                       latent_codes,
                                       target_cls,
                                       save_dir,
                                       editing_logger,
                                       edited_latent_code,
                                       prefix,
                                       print_intermediate_result=False,
                                       display_img=False):
        """Edit each latent code step by step until the target attribute
        reaches class `target_cls`, saving the best image per class stage.

        Args:
            latent_codes: numpy array of latent codes, shape [N, ...].
            target_cls (int): target class index for the target attribute.
            save_dir (str): directory where edited images are written.
            editing_logger: logger or None (logging is skipped when None).
            edited_latent_code: previously edited w-plus code, or None to
                start from `latent_codes`.
            prefix (str): filename prefix for saved images.
            print_intermediate_result (bool): also save per-stage images.
            display_img (bool): show saved images via matplotlib.

        Returns:
            (saved_latent_code, saved_editing_latent_code, saved_label,
            exception_mode) where exception_mode is one of 'normal',
            'already_at_target_class', 'max_edit_num_reached'.
        """
        total_num = latent_codes.shape[0]
        for sample_id in range(total_num):
            sample_latent_code = torch.from_numpy(
                latent_codes[sample_id:sample_id + 1]).to(
                    torch.device('cuda'))
            start_latent_codes = sample_latent_code
            start_edited_latent_code = edited_latent_code
            exception_mode = 'normal'
            # synthesize
            if edited_latent_code is None:
                if self.latent_code_is_w_space and self.transform_z_to_w:
                    # translate original z space latent code to w space
                    with torch.no_grad():
                        sample_latent_code = self.stylegan_gen.get_latent(
                            sample_latent_code)
                with torch.no_grad():
                    original_image, start_label, start_score = \
                        self.synthesize_and_predict(sample_latent_code)
            else:
                with torch.no_grad():
                    original_image, start_label, start_score = \
                        self.synthesize_and_predict(edited_latent_code)

            target_attr_label = int(start_label[self.target_attr_idx])
            target_score = start_score[self.target_attr_idx]

            # save_name = f'{prefix}_{sample_id:03d}_num_edits_0_class_{target_attr_label}_attr_idx_{self.target_attr_idx}.png'  # noqa ###
            # save_image(original_image, f'{save_dir}/{save_name}')
            # editing_logger.info(f'{save_name}: {start_label}, {start_score}')

            # skip images with low confidence
            if target_score < self.opt['confidence_thresh']:
                if editing_logger:
                    editing_logger.info(
                        f'Sample {sample_id:03d} is not confident, skip.')
                continue

            # skip images that are already the target class num
            if target_attr_label == target_cls:
                if editing_logger:
                    editing_logger.info(
                        f'Sample {sample_id:03d} is already at the target class, skip.'
                    )
                # return the exactly the input image and input latent codes
                saved_label = start_label
                saved_latent_code = start_latent_codes
                saved_editing_latent_code = start_edited_latent_code
                saved_score = start_score
                # save_name = f'{prefix}_{sample_id:03d}_num_edits_1_class_{target_attr_label}_attr_idx_{self.target_attr_idx}.png'  # noqa ###
                # save_image(original_image, f'{save_dir}/{save_name}')
                # editing_logger.info(
                #     f'{save_name}: {saved_label}, {saved_score}')
                exception_mode = 'already_at_target_class'
                continue
            elif target_attr_label < target_cls:
                direction = 'positive'
                alpha = 1
            elif target_attr_label > target_cls:
                direction = 'negative'
                alpha = -1

            num_trials = 0
            num_edits = 0
            current_stage_scores_list = []
            current_stage_labels_list = []
            current_stage_images_list = []
            current_stage_target_scores_list = []
            current_stage_latent_code_list = []
            current_stage_editing_latent_code_list = []
            previous_target_attr_label = target_attr_label

            if self.fix_layers:
                # broadcast the w code to a w-plus code (one copy per layer)
                if edited_latent_code is None:
                    edited_latent_code = sample_latent_code.unsqueeze(
                        1).repeat(1, self.w_space_channel_num, 1)

            # keep stepping while the label has not yet passed the target
            # and stays inside the valid class range
            while ((direction == 'positive')
                   and (target_attr_label <= target_cls)
                   and (target_attr_label < self.opt['max_cls_num'])) or (
                       (direction == 'negative')
                       and (target_attr_label >= target_cls)
                       and (target_attr_label > self.opt['min_cls_num'])):
                num_trials += 1
                with torch.no_grad():
                    # modify sampled latent code
                    if self.fix_layers:
                        # for fix layers, the input to the field_function is w
                        # space, but the input to the stylegan is w plus space
                        edited_dict = self.modify_latent_code_bidirection(
                            sample_latent_code, edited_latent_code, alpha)
                        sample_latent_code = sample_latent_code + alpha * edited_dict[
                            'field']
                        edited_latent_code = edited_dict['edited_latent_code']
                    else:
                        # for other modes, the input to the field function and
                        # stylegan are same (both w space or z space)
                        # NOTE(review): alpha is hard-coded to 1 here, unlike
                        # the fix_layers branch -- verify the negative
                        # direction is handled elsewhere for this mode.
                        edited_dict = self.modify_latent_code_bidirection(
                            latent_code_w=sample_latent_code, alpha=1)
                        sample_latent_code = edited_dict['edited_latent_code']

                    edited_image, edited_label, edited_score = \
                        self.synthesize_and_predict(edited_dict['edited_latent_code'])  # noqa

                target_attr_label = edited_label[self.target_attr_idx]
                target_attr_score = edited_score[self.target_attr_idx]

                # overshoot: the label just passed the target class
                if ((direction == 'positive')
                        and (target_attr_label > target_cls)) or (
                            (direction == 'negative')
                            and (target_attr_label < target_cls)):
                    if num_edits == 0:
                        saved_label = edited_label
                        saved_latent_code = sample_latent_code
                        saved_editing_latent_code = edited_latent_code
                        save_name = f'{prefix}_{sample_id:03d}_num_edits_{num_edits+1}_class_{target_attr_label}_attr_idx_{self.target_attr_idx}.png'  # noqa
                        saved_image = edited_image
                        saved_score = edited_score
                        save_image(saved_image, f'{save_dir}/{save_name}')
                        if display_img:
                            plt.figure()
                            plt.imshow(mpimg.imread(f'{save_dir}/{save_name}'))
                            plt.axis('off')
                            plt.show()
                        if editing_logger:
                            editing_logger.info(
                                f'{save_name}: {saved_label}, {saved_score}')
                    break

                if target_attr_label != previous_target_attr_label:
                    num_edits += 1

                if num_edits > 0:
                    if target_attr_label == previous_target_attr_label:
                        # same class stage: keep buffering candidates
                        current_stage_images_list.append(edited_image)
                        current_stage_labels_list.append(edited_label)
                        current_stage_scores_list.append(edited_score)
                        current_stage_target_scores_list.append(
                            target_attr_score)
                        current_stage_latent_code_list.append(
                            sample_latent_code)
                        current_stage_editing_latent_code_list.append(
                            edited_latent_code)
                    else:
                        if num_edits > 1:
                            # save images for previous stage
                            max_value = max(current_stage_target_scores_list)
                            max_index = current_stage_target_scores_list.index(
                                max_value)
                            saved_image = current_stage_images_list[max_index]
                            saved_label = current_stage_labels_list[max_index]
                            saved_score = current_stage_scores_list[max_index]
                            saved_latent_code = current_stage_latent_code_list[
                                max_index]
                            saved_editing_latent_code = current_stage_editing_latent_code_list[
                                max_index]
                            save_name = f'{prefix}_{sample_id:03d}_num_edits_{num_edits-1}_class_{previous_target_attr_label}_attr_idx_{self.target_attr_idx}.png'  # noqa
                            if print_intermediate_result:
                                save_image(saved_image,
                                           f'{save_dir}/{save_name}')
                            if editing_logger:
                                editing_logger.info(
                                    f'{save_name}: {saved_label}, {saved_score}'
                                )
                        # start a fresh stage buffer with the current sample
                        current_stage_images_list = []
                        current_stage_labels_list = []
                        current_stage_scores_list = []
                        current_stage_target_scores_list = []
                        current_stage_latent_code_list = []
                        current_stage_editing_latent_code_list = []
                        num_trials = 0
                        current_stage_images_list.append(edited_image)
                        current_stage_labels_list.append(edited_label)
                        current_stage_scores_list.append(edited_score)
                        current_stage_target_scores_list.append(
                            target_attr_score)
                        current_stage_latent_code_list.append(
                            sample_latent_code)
                        current_stage_editing_latent_code_list.append(
                            edited_latent_code)
                    previous_target_attr_label = target_attr_label

                if num_trials > self.opt['max_trials_num']:
                    if num_edits == 0:
                        # no successful edit: fall back to the inputs
                        saved_label = start_label
                        saved_latent_code = start_latent_codes
                        saved_editing_latent_code = start_edited_latent_code
                        saved_score = start_score
                        # save_name = f'{prefix}_{sample_id:03d}_num_edits_1_class_{target_attr_label}_attr_idx_{self.target_attr_idx}.png'  # noqa ###
                        # save_image(original_image, f'{save_dir}/{save_name}')
                        # if editing_logger:
                        #     editing_logger.info(
                        #         f'{save_name}: {saved_label}, {saved_score}')
                    exception_mode = 'max_edit_num_reached'
                    break

            if num_edits > 0:
                # save images for previous stage
                max_value = max(current_stage_target_scores_list)
                max_index = current_stage_target_scores_list.index(max_value)
                saved_image = current_stage_images_list[max_index]
                saved_label = current_stage_labels_list[max_index]
                saved_score = current_stage_scores_list[max_index]
                saved_latent_code = current_stage_latent_code_list[max_index]
                saved_editing_latent_code = current_stage_editing_latent_code_list[
                    max_index]
                save_name = f'{prefix}_{sample_id:03d}_num_edits_{num_edits}_class_{previous_target_attr_label}_attr_idx_{self.target_attr_idx}.png'  # noqa
                save_image(saved_image, f'{save_dir}/{save_name}')
                if display_img:
                    plt.figure()
                    plt.imshow(mpimg.imread(f'{save_dir}/{save_name}'))
                    plt.axis('off')
                    plt.show()
                if editing_logger:
                    editing_logger.info(
                        f'{save_name}: {saved_label}, {saved_score}')
        return saved_latent_code, saved_editing_latent_code, saved_label, exception_mode


================================================
FILE: models/field_function_model.py
================================================
import logging

import torch

from models.base_model import BaseModel

logger = logging.getLogger('base')


class FieldFunctionModel(BaseModel):
    """Model wrapping the learned field function that edits latent codes."""

    def __init__(self, opt):
        super(FieldFunctionModel, self).__init__(opt)
        # number of leading w-plus layers to which the field offset is applied
        self.replaced_layers = opt['replaced_layers']
        self.fix_layers = True

    def modify_latent_code(self, latent_code_w, latent_code_w_plus=None):
        """Apply the field function once (positive direction only).

        Args:
            latent_code_w: w-space code fed to the field function.
            latent_code_w_plus: optional w-plus code to edit in place of a
                broadcast copy of `latent_code_w`.

        Returns:
            dict with 'field' (delta in w space) and 'edited_latent_code'
            (w-plus code with the delta added to the first
            `replaced_layers` layers).
        """
        assert self.input_is_latent
        return_dict = {}
        # field function mapping
        field = self.field_function(latent_code_w)
        with torch.no_grad():
            # map the field through the style network; subtract the
            # zero-input response so only the relative offset remains
            offset_w = self.stylegan_gen.style_forward(
                torch.zeros_like(field), skip_norm=True)
            delta_w = self.stylegan_gen.style_forward(
                field, skip_norm=True) - offset_w
        if latent_code_w_plus is None:
            edited_latent_code = latent_code_w.unsqueeze(1).repeat(
                1, self.w_space_channel_num, 1)
        else:
            edited_latent_code = latent_code_w_plus.clone()
        return_dict['field'] = delta_w
        for layer_idx in range(self.replaced_layers):
            edited_latent_code[:, layer_idx, :] += delta_w
        return_dict['edited_latent_code'] = edited_latent_code
        return return_dict

    def modify_latent_code_bidirection(self,
                                       latent_code_w,
                                       latent_code_w_plus=None,
                                       alpha=1):
        """Same as `modify_latent_code`, but the delta is scaled by `alpha`
        (use alpha=-1 for the opposite editing direction)."""
        assert self.input_is_latent
        return_dict = {}
        # field function mapping
        field = self.field_function(latent_code_w)
        with torch.no_grad():
            offset_w = self.stylegan_gen.style_forward(
                torch.zeros_like(field), skip_norm=True)
            delta_w = self.stylegan_gen.style_forward(
                field, skip_norm=True) - offset_w
        if
latent_code_w_plus is None:
            edited_latent_code = latent_code_w.unsqueeze(1).repeat(
                1, self.w_space_channel_num, 1)
        else:
            edited_latent_code = latent_code_w_plus.clone()
        return_dict['field'] = delta_w
        for layer_idx in range(self.replaced_layers):
            # signed, scaled step along the field direction
            edited_latent_code[:, layer_idx, :] += alpha * delta_w
        return_dict['edited_latent_code'] = edited_latent_code
        return return_dict


================================================
FILE: models/losses/__init__.py
================================================


================================================
FILE: models/losses/arcface_loss.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F


def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias=False)


class BasicBlock(nn.Module):
    """Standard residual block: conv-bn-relu, conv-bn, skip, relu."""
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample is not None:
            # match spatial/channel dims of the skip connection
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out


class IRBlock(nn.Module):
    """Improved residual block (bn-first, PReLU, optional SE attention)."""
    expansion = 1

    def __init__(self,
                 inplanes,
                 planes,
                 stride=1,
                 downsample=None,
                 use_se=True):
        super(IRBlock, self).__init__()
        self.bn0 = nn.BatchNorm2d(inplanes)
        self.conv1 = conv3x3(inplanes, inplanes)
        self.bn1 = nn.BatchNorm2d(inplanes)
        self.prelu = nn.PReLU()
        self.conv2 = conv3x3(inplanes, planes, stride)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride
        self.use_se = use_se
        if self.use_se:
            self.se = SEBlock(planes)

    def forward(self, x):
        residual = x
        out = self.bn0(x)
        out = self.conv1(out)
        out = self.bn1(out)
        out = self.prelu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.use_se:
            out = self.se(out)
        if self.downsample is not None:
            residual = self.downsample(x)
        out += residual
        out = self.prelu(out)
        return out


class Bottleneck(nn.Module):
    """Bottleneck residual block (1x1 -> 3x3 -> 1x1, 4x channel expansion)."""
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(
            planes,
            planes,
            kernel_size=3,
            stride=stride,
            padding=1,
            bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(
            planes, planes * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.conv3(out)
        out = self.bn3(out)
        if self.downsample is not None:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out


class SEBlock(nn.Module):
    """Squeeze-and-excitation: channel-wise reweighting via a small MLP."""

    def __init__(self, channel, reduction=16):
        super(SEBlock, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction), nn.PReLU(),
            nn.Linear(channel // reduction, channel), nn.Sigmoid())

    def forward(self, x):
        b, c, _, _ = x.size()
        y = self.avg_pool(x).view(b, c)
        y = self.fc(y).view(b, c, 1, 1)
        return x * y


class ResNetFace(nn.Module):
    """ResNet backbone for face recognition; single-channel (grayscale)
    input, outputs a 512-dim embedding."""

    def __init__(self, block, layers, use_se=True):
        self.inplanes = 64
        self.use_se = use_se
        super(ResNetFace, self).__init__()
        self.conv1 = nn.Conv2d(1, 64, kernel_size=3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.prelu = nn.PReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.bn4 = nn.BatchNorm2d(512)
        self.dropout = nn.Dropout()
        # 512 channels x 8 x 8 spatial -> 512-dim embedding
        # (assumes 128x128 input -- TODO confirm)
        self.fc5 = nn.Linear(512 * 8 * 8, 512)
        self.bn5 = nn.BatchNorm1d(512)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_normal_(m.weight)
            elif isinstance(m, nn.BatchNorm2d) or isinstance(
                    m, nn.BatchNorm1d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                nn.init.constant_(m.bias, 0)

    def _make_layer(self, block, planes, blocks, stride=1):
        # build one residual stage; downsample the skip path when shape changes
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(
                    self.inplanes,
                    planes * block.expansion,
                    kernel_size=1,
                    stride=stride,
                    bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )
        layers = []
        layers.append(
            block(
                self.inplanes, planes, stride, downsample,
                use_se=self.use_se))
        self.inplanes = planes
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, use_se=self.use_se))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.prelu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.bn4(x)
        x = self.dropout(x)
        x = x.view(x.size(0), -1)
        x = self.fc5(x)
        x = self.bn5(x)
        return x


def resnet_face18(use_se=True, **kwargs):
    """ResNetFace-18 variant (IRBlock, [2, 2, 2, 2] stages)."""
    model = ResNetFace(IRBlock, [2, 2, 2, 2], use_se=use_se, **kwargs)
    return model


class ArcFaceLoss(nn.Module):
    """Identity-preservation loss comparing face embeddings of the original
    and edited images with a frozen, pretrained face network."""

    def __init__(self, pretrained_model, loss_type, use_se=False):
        super(ArcFaceLoss, self).__init__()
        self.model = resnet_face18(use_se=use_se)
        # checkpoint was saved from a DataParallel-wrapped model
        self.model = nn.DataParallel(self.model)
        self.model.load_state_dict(torch.load(pretrained_model), strict=True)
        self.model.to(torch.device('cuda'))
        self.model.eval()
        self.loss_type = loss_type
        if self.loss_type == 'l1':
            self.loss_func = nn.L1Loss(reduction='mean')
        elif self.loss_type == 'l2':
            self.loss_func = nn.MSELoss(reduction='mean')
        elif self.loss_type == 'cosine':
            self.loss_func = nn.CosineEmbeddingLoss(reduction='mean')
        else:
            raise NotImplementedError

    def forward(self, original_imgs, edited_imgs, resize=False):
        # the image range should be [-1, 1], and convert image to grayscale
        if resize:
            # need to resize image to [128, 128]
            original_features = self.model(
                F.interpolate(original_imgs, (128, 128),
                              mode='area').mean(dim=1, keepdim=True))
            edited_features = self.model(
                F.interpolate(edited_imgs, (128, 128),
                              mode='area').mean(dim=1, keepdim=True))
        else:
            # the image range should be [-1, 1], and convert image to grayscale
            original_features = self.model(
                original_imgs.mean(dim=1, keepdim=True))
            edited_features = self.model(edited_imgs.mean(dim=1, keepdim=True))
        if self.loss_type == 'l1' or self.loss_type == 'l2':
            loss = self.loss_func(original_features, edited_features)
        elif self.loss_type == 'cosine':
            # target=1: embeddings of the same identity should align
            target = torch.ones(original_features.size(0)).to(
                torch.device('cuda'))
            loss = self.loss_func(original_features, edited_features, target)
        else:
            raise NotImplementedError
        return loss


================================================
FILE: models/losses/discriminator_loss.py
================================================
import torch
import torch.nn as nn
from models.archs.stylegan2.model import Discriminator
from torch.nn import functional as F


class DiscriminatorLoss(nn.Module):
    """Realism loss from a frozen, pretrained StyleGAN2 discriminator."""

    def __init__(self, pretrained_model, img_res):
        super(DiscriminatorLoss, self).__init__()
        if img_res == 128:
            self.discriminator = Discriminator(
                size=img_res, channel_multiplier=1)
            # 128-res checkpoint stores the discriminator under key 'd'
            self.discriminator.load_state_dict(
                torch.load(pretrained_model)['d'], strict=True)
        elif img_res == 1024:
            self.discriminator = Discriminator(
                size=img_res, channel_multiplier=2)
            self.discriminator.load_state_dict(
                torch.load(pretrained_model), strict=True)
        self.discriminator.to(torch.device('cuda'))
        self.discriminator.eval()

    def forward(self, generated_images):
        generated_pred =
self.discriminator(generated_images)
        # non-saturating generator loss: softplus(-D(G(z)))
        loss = F.softplus(-generated_pred).mean()
        return loss


================================================
FILE: models/utils.py
================================================
import random

import cv2
import numpy as np
import torch
import torch.nn.functional as F


def postprocess(images, channel_order='BGR', min_val=-1.0, max_val=1.0):
    """Postprocesses the output images if needed.

    This function assumes the input numpy array is with shape [batch_size,
    channel, height, width]. Here, `channel = 3` for color image and
    `channel = 1` for grayscale image. The return images are with shape
    [batch_size, height, width, channel].

    NOTE: The channel order of the output image follows `channel_order`
    ('BGR' flips the channels; any other value keeps them as-is).

    Args:
        images: The raw output from the generator.
        channel_order: 'BGR' (default) flips RGB -> BGR for cv2.
        min_val / max_val: value range of the input images.

    Returns:
        The postprocessed images with dtype `numpy.uint8` with range
        [0, 255].

    Raises:
        ValueError: If the input `images` are not with type `numpy.ndarray`
            or not with shape [batch_size, channel, height, width].
    """
    if not isinstance(images, np.ndarray):
        raise ValueError('Images should be with type `numpy.ndarray`!')

    images_shape = images.shape
    if len(images_shape) != 4 or images_shape[1] not in [1, 3]:
        raise ValueError(f'Input should be with shape [batch_size, channel, '
                         f'height, width], where channel equals to 1 or 3. '
                         f'But {images_shape} is received!')
    images = (images - min_val) * 255 / (max_val - min_val)
    # +0.5 before clipping rounds to the nearest integer
    images = np.clip(images + 0.5, 0, 255).astype(np.uint8)
    images = images.transpose(0, 2, 3, 1)
    if channel_order == 'BGR':
        images = images[:, :, :, ::-1]
    return images


def transform_image(image, resize=False):
    """Map a [-1, 1] image tensor to ImageNet-normalized range (on CUDA)."""
    # transform image range to [0, 1]
    image = (image + 1) * 255 / 2
    # TODO: int()? quantization?
    image = torch.clamp(image + 0.5, 0, 255)
    image = image / 255.
    if resize:
        image = F.interpolate(image, (128, 128), mode='area')
    # normalize image to imagenet range
    img_mean = torch.Tensor([0.485, 0.456,
                             0.406]).view(1, 3, 1, 1).to(torch.device('cuda'))
    img_std = torch.Tensor([0.229, 0.224,
                            0.225]).view(1, 3, 1, 1).to(torch.device('cuda'))
    image = (image - img_mean) / img_std
    return image


def set_random_seed(seed):
    """Set random seeds."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def output_to_label(output):
    """Convert per-attribute classifier logits to labels and confidences.

    INPUT
        - output: [num_attr, batch_size, num_classes] (only the first batch
          element is used)
    OUTPUT
        - labels: LongTensor [num_attr], argmax class per attribute
        - scores: FloatTensor [num_attr], softmax probability of the
          predicted class per attribute
    """
    scores = []
    labels = []
    for attr_idx in range(len(output)):
        _, label = torch.max(input=output[attr_idx], dim=1)
        label = label.cpu().numpy()[0]
        labels.append(label)
        score_per_attr = output[attr_idx].cpu().numpy()[0]
        # softmax
        score_per_attr = (np.exp(score_per_attr) /
                          np.sum(np.exp(score_per_attr)))[label]
        scores.append(score_per_attr)
    scores = torch.FloatTensor(scores)
    labels = torch.LongTensor(labels)
    return labels, scores


def predictor_to_label(predictor_output):
    """Same as `output_to_label`, but returns plain Python lists instead of
    tensors."""
    scores = []
    labels = []
    for attr_idx in range(len(predictor_output)):
        _, label = torch.max(input=predictor_output[attr_idx], dim=1)
        label = label.cpu().numpy()[0]
        labels.append(label)
        score_per_attr = predictor_output[attr_idx].cpu().numpy()[0]
        # softmax
        score_per_attr = (np.exp(score_per_attr) /
                          np.sum(np.exp(score_per_attr)))[label]
        scores.append(score_per_attr)
    return labels, scores


def save_image(img, save_path, need_post_process=True):
    """Write an image to disk; postprocess a raw generator tensor first
    unless `need_post_process` is False (then `img` must be cv2-ready)."""
    if need_post_process:
        cv2.imwrite(save_path, postprocess(img.cpu().detach().numpy())[0])
    else:
        cv2.imwrite(save_path, img)


================================================
FILE: quantitative_results.py
================================================
import argparse
import glob
import logging

import cv2
import numpy as np
import torch
import torch.nn as nn
import torchvision.transforms as
transforms
from facenet_pytorch import InceptionResnetV1
from PIL import Image

from models.archs.attribute_predictor_arch import resnet50
from models.utils import output_to_label
from utils.logger import get_root_logger
from utils.options import dict2str

# checkpoint of the evaluation-only attribute predictor
attr_predictor_eval_ckpt = './download/pretrained_models/eval_predictor.pth.tar'


def parse_args():
    """Parses arguments."""
    parser = argparse.ArgumentParser(
        description='Continuous image editing via field function')

    # inference
    parser.add_argument(
        '--attribute',
        type=str,
        required=True,
        help='[Bangs, Eyeglasses, No_Beard, Smiling, Young]')

    # input and output directories
    parser.add_argument(
        '--work_dir',
        required=True,
        type=str,
        metavar='PATH',
        help='path to save checkpoint and log files.')
    # NOTE(review): help text below duplicates --work_dir's; presumably it
    # should describe the directory of edited images to evaluate.
    parser.add_argument(
        '--image_dir',
        required=True,
        type=str,
        metavar='PATH',
        help='path to save checkpoint and log files.')
    parser.add_argument('--image_num', type=int, required=True)
    parser.add_argument('--debug', default=0, type=int)
    return parser.parse_args()


def get_edited_images_list(img_dir, img_idx):
    """Collect the edit sequence for one sample, ordered by edit index.

    Expects exactly one image per edit step named
    `{img_idx:03d}_num_edits_{k}_*.png`, starting at k=0 (the original).
    """
    return_img_list = []
    img_path_list = glob.glob(f'{img_dir}/{img_idx:03d}_*.png')
    start_img_path = glob.glob(f'{img_dir}/{img_idx:03d}_num_edits_0_*.png')
    assert len(start_img_path) == 1
    return_img_list.append(start_img_path[0])
    num_edits = len(img_path_list) - 1
    if num_edits > 0:
        for edit_idx in range(1, num_edits + 1):
            img_path_edit_list = glob.glob(
                f'{img_dir}/{img_idx:03d}_num_edits_{edit_idx}_*.png')
            assert len(img_path_edit_list) == 1
            return_img_list.append(img_path_edit_list[0])
    return return_img_list


def load_face_image(img_path):
    """Load an image as a CUDA tensor normalized to roughly [-1, 1] for the
    face recognition network (RGB, NCHW)."""
    image = cv2.imread(img_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = image.transpose((2, 0, 1))
    image = image[np.newaxis, :, :, :]
    image = image.astype(np.float32, copy=False)
    image -= 127.5
    image /= 128.0
    image = torch.from_numpy(image).to(torch.device('cuda'))
    return image


def load_image_predictor(img_path,
                         transform=transforms.Compose([
                             transforms.ToTensor(),
                             transforms.Normalize(
                                 mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
                         ])):
    """Load an image as an ImageNet-normalized CUDA tensor for the attribute
    predictor. The default `transform` is built once at import time and is
    never mutated, so the shared-default pitfall does not apply here."""
    image = Image.open(img_path).convert('RGB')
    image = transform(image)
    image = image.to(torch.device('cuda')).unsqueeze(0)
    return image


def predictor_score(predictor_output, gt_label, target_attr_idx,
                    criterion_predictor):
    """Average cross-entropy of all attributes EXCEPT the edited one,
    against the source image's labels (lower = better preservation)."""
    num_attr = len(predictor_output)
    loss_avg = 0
    count = 0
    for attr_idx in range(num_attr):
        if attr_idx == target_attr_idx:
            # the edited attribute is supposed to change; skip it
            continue
        loss_attr = criterion_predictor(
            predictor_output[attr_idx],
            gt_label[attr_idx].unsqueeze(0).to(torch.device('cuda')))
        loss_avg += loss_attr
        count += 1
    loss_avg = loss_avg / count
    return loss_avg


def compute_num_metrics(image_dir, image_num, target_attr_idx, logger):
    """Compute identity- and attribute-preservation metrics over a directory
    of edit sequences; returns their dataset averages."""
    # use different face model and predictor model from training phase
    # define face recognition model
    resnet = InceptionResnetV1(pretrained='vggface2').eval().to(
        torch.device('cuda'))

    # define attribute predictor model
    predictor = resnet50(attr_file='./configs/attributes_5.json', )
    predictor = predictor.to(torch.device('cuda'))
    checkpoint = torch.load(attr_predictor_eval_ckpt)
    predictor.load_state_dict(checkpoint['state_dict'], strict=True)
    predictor.eval()

    criterion_predictor = nn.CrossEntropyLoss(reduction='mean')

    face_distance_dataset = 0
    predictor_score_dataset = 0
    count = 0
    for img_idx in range(image_num):
        edit_image_list = get_edited_images_list(image_dir, img_idx)
        num_edits = len(edit_image_list) - 1
        face_distance_img = 0
        predictor_score_img = 0
        if num_edits > 0:
            # face recognition feature
            # NOTE(review): `resnet` calls are not under torch.no_grad(),
            # so autograd graphs are built needlessly -- consider wrapping.
            source_img = load_face_image(edit_image_list[0])
            source_img_feat = resnet(source_img)
            # attribute label for predictor
            source_img_predictor = load_image_predictor(edit_image_list[0])
            with torch.no_grad():
                source_predictor_output = predictor(source_img_predictor)
            source_label, score = output_to_label(source_predictor_output)
            for edit_idx in range(1, num_edits + 1):
                edited_img = load_face_image(edit_image_list[edit_idx])
                edited_img_feat = resnet(edited_img)
                temp_face_dist = torch.norm(source_img_feat -
                                            edited_img_feat).item()
                face_distance_img += temp_face_dist
                # attribute predictor score
                edited_img_predictor = load_image_predictor(
                    edit_image_list[edit_idx])
                with torch.no_grad():
                    edited_predictor_output = predictor(edited_img_predictor)
                temp_predictor_score_img = predictor_score(
                    edited_predictor_output, source_label, target_attr_idx,
                    criterion_predictor)
                predictor_score_img += temp_predictor_score_img
            face_distance_img = face_distance_img / num_edits
            face_distance_dataset += face_distance_img
            predictor_score_img = predictor_score_img / num_edits
            predictor_score_dataset += predictor_score_img
            count += 1
            logger.info(
                f'{img_idx:03d}: Identity Preservation: {face_distance_img: .4f}, Attribute Preservation: {predictor_score_img: .4f}.'
            )
        else:
            logger.info(f'{img_idx:03d}: no available edits.')
    # NOTE(review): if no sample had edits, count is 0 and the divisions
    # below raise ZeroDivisionError -- TODO confirm intended behavior.
    face_distance_dataset = face_distance_dataset / count
    predictor_score_dataset = predictor_score_dataset / count
    logger.info(
        f'Avg: {face_distance_dataset: .4f}, {predictor_score_dataset: .4f}.')
    return face_distance_dataset, predictor_score_dataset


def main():
    """Main function."""
    args = parse_args()
    # map attribute name -> index used by the predictor output
    args.attr_dict = {
        'Bangs': 0,
        'Eyeglasses': 1,
        'No_Beard': 2,
        'Smiling': 3,
        'Young': 4
    }
    logger = get_root_logger(
        logger_name='base',
        log_level=logging.INFO,
        log_file=f'{args.work_dir}/quantitative_results.txt')
    logger.info(dict2str(args.__dict__))
    compute_num_metrics(args.image_dir, args.image_num,
                        args.attr_dict[args.attribute], logger)


if __name__ == '__main__':
    main()


================================================
FILE: train.py
================================================
import argparse
import logging
import os
import os.path as osp
import random
import time

import numpy as np
import torch

from data.latent_code_dataset import LatentCodeDataset
from models import create_model
from utils.logger import MessageLogger, get_root_logger, init_tb_logger
from utils.numerical_metrics import compute_num_metrics
from utils.options import dict2str,
dict_to_nonedict, parse
from utils.util import make_exp_dirs


def main():
    """Train the field function: parse options, build loaders, run the
    epoch loop with periodic validation, editing and checkpointing."""
    # options
    parser = argparse.ArgumentParser()
    parser.add_argument('--opt', type=str, help='Path to option YAML file.')
    args = parser.parse_args()
    opt = parse(args.opt, is_train=True)

    # mkdir and loggers
    make_exp_dirs(opt)
    log_file = osp.join(opt['path']['log'], f"train_{opt['name']}.log")
    logger = get_root_logger(
        logger_name='base', log_level=logging.INFO, log_file=log_file)
    logger.info(dict2str(opt))

    # initialize tensorboard logger
    tb_logger = None
    if opt['use_tb_logger'] and 'debug' not in opt['name']:
        tb_logger = init_tb_logger(log_dir='./tb_logger/' + opt['name'])

    # convert to NoneDict, which returns None for missing keys
    opt = dict_to_nonedict(opt)

    # random seed
    # NOTE(review): `seed` is chosen and logged but never applied (e.g. via
    # models.utils.set_random_seed), so runs are not actually seeded.
    seed = opt['manual_seed']
    if seed is None:
        seed = random.randint(1, 10000)
    logger.info(f'Random seed: {seed}')

    # set up data loader
    # NOTE(review): message is missing a space after 'from' and reads key
    # 'input_latent_dir' while the dataset below uses
    # opt['dataset']['train_latent_dir'] -- likely logs 'None'.
    logger.info(f'Loading data from{opt["input_latent_dir"]}.')
    train_latent_dataset = LatentCodeDataset(
        input_dir=opt['dataset']['train_latent_dir'])
    train_latent_loader = torch.utils.data.DataLoader(
        dataset=train_latent_dataset,
        batch_size=opt['batch_size'],
        shuffle=True,
        num_workers=opt['num_workers'],
        drop_last=True)
    logger.info(f'Number of train set: {len(train_latent_dataset)}.')
    opt['max_iters'] = opt['num_epochs'] * len(
        train_latent_dataset) // opt['batch_size']

    if opt['val_on_train_subset']:
        train_subset_latent_dataset = LatentCodeDataset(
            input_dir=opt['dataset']['train_subset_latent_dir'])
        train_subset_latent_loader = torch.utils.data.DataLoader(
            dataset=train_subset_latent_dataset,
            batch_size=1,
            shuffle=False,
            num_workers=opt['num_workers'])
        logger.info(
            f'Number of train subset: {len(train_subset_latent_dataset)}.')

    if opt['val_on_valset']:
        val_latent_dataset = LatentCodeDataset(
            input_dir=opt['dataset']['val_latent_dir'])
        val_latent_loader = torch.utils.data.DataLoader(
            dataset=val_latent_dataset,
            batch_size=1,
            shuffle=False,
            num_workers=opt['num_workers'])
        logger.info(f'Number of val set: {len(val_latent_dataset)}.')

    # load editing latent code
    editing_latent_codes = np.load(opt['editing_latent_code_path'])
    num_latent_codes = editing_latent_codes.shape[0]

    current_iter = 0
    # lower combined metric = better (see `metrics` below)
    best_metric = 10000
    best_epoch = None
    best_arcface = None
    best_predictor = None

    field_model = create_model(opt)

    data_time, iter_time = 0, 0
    current_iter = 0

    # create message logger (formatted outputs)
    msg_logger = MessageLogger(opt, current_iter, tb_logger)

    for epoch in range(opt['num_epochs']):
        lr = field_model.update_learning_rate(epoch)
        for _, batch_data in enumerate(train_latent_loader):
            data_time = time.time() - data_time
            current_iter += 1
            field_model.feed_data(batch_data)
            field_model.optimize_parameters()
            iter_time = time.time() - iter_time
            if current_iter % opt['print_freq'] == 0:
                log_vars = {'epoch': epoch, 'iter': current_iter}
                log_vars.update({'lrs': [lr]})
                log_vars.update({'time': iter_time, 'data_time': data_time})
                log_vars.update(field_model.get_current_log())
                msg_logger(log_vars)
            data_time = time.time()
            iter_time = time.time()

        if epoch % opt['val_freq'] == 0:
            if opt['val_on_valset']:
                save_dir = f'{opt["path"]["visualization"]}/valset/epoch_{epoch:03d}'  # noqa
                os.makedirs(save_dir, exist_ok=opt['debug'])
                for batch_idx, batch_data in enumerate(val_latent_loader):
                    field_model.feed_data(batch_data)
                    field_model.inference(batch_idx, epoch, save_dir)

            if opt['val_on_train_subset']:
                save_dir = f'{opt["path"]["visualization"]}/trainsubset/epoch_{epoch:03d}'  # noqa
                os.makedirs(save_dir, exist_ok=opt['debug'])
                for batch_idx, batch_data in enumerate(
                        train_subset_latent_loader):
                    field_model.feed_data(batch_data)
                    field_model.inference(batch_idx, epoch, save_dir)

            # run continuous editing and score it
            save_path = f'{opt["path"]["visualization"]}/continuous_editing/epoch_{epoch:03d}'  # noqa
            os.makedirs(save_path, exist_ok=opt['debug'])
            editing_logger = get_root_logger(
                logger_name=f'editing_{epoch:03d}',
                log_level=logging.INFO,
                log_file=f'{save_path}/editing.log')
            field_model.continuous_editing(editing_latent_codes, save_path,
                                           editing_logger)
            arcface_sim, predictor_score = compute_num_metrics(
                save_path, num_latent_codes, opt['pretrained_arcface'],
                opt['attr_file'], opt['predictor_ckpt'],
                opt['attr_dict'][opt['attribute']], editing_logger)
            logger.info(f'Epoch: {epoch}, '
                        f'ArcFace: {arcface_sim: .4f}, '
                        f'Predictor: {predictor_score: .4f}.')
            metrics = 1 - arcface_sim + predictor_score
            if metrics < best_metric:
                best_epoch = epoch
                best_metric = metrics
                best_arcface = arcface_sim
                best_predictor = predictor_score
            logger.info(f'Best epoch: {best_epoch}, '
                        f'ArcFace: {best_arcface: .4f}, '
                        f'Predictor: {best_predictor: .4f}.')

            # save model
            field_model.save_network(
                field_model.field_function,
                f'{opt["path"]["models"]}/ckpt_epoch{epoch}.pth')


if __name__ == '__main__':
    main()


================================================
FILE: utils/__init__.py
================================================


================================================
FILE: utils/crop_img.py
================================================
"""
brief: face alignment with FFHQ method (https://github.com/NVlabs/ffhq-dataset)
author: lzhbrian (https://lzhbrian.me)
date: 2020.1.5
note: code is heavily borrowed from
    https://github.com/NVlabs/ffhq-dataset
    http://dlib.net/face_landmark_detection.py.html

requirements:
    apt install cmake
    conda install Pillow numpy scipy
    pip install dlib
    # download face landmark model from:
    # http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2
"""
import cv2
import dlib
import numpy as np
import PIL
import PIL.Image
import scipy
import scipy.ndimage
from facenet_pytorch import MTCNN
from PIL import Image


def crop_img(img_size, input_img_path, cropped_output_path, device='cuda'):
    """Dispatch to the resolution-specific cropper (128 or 1024)."""
    if img_size == 128:
        return crop_img_128(input_img_path, cropped_output_path, device)
    elif img_size == 1024:
        return crop_img_1024(input_img_path, cropped_output_path)
    else:
        raise NotImplementedError


def crop_img_128(input_img_path, cropped_output_path, device='cuda'):
    """Detect the largest face with MTCNN and save a 128x128 center crop."""
    mtcnn =
MTCNN(select_largest=True, device=device)
    img = Image.open(input_img_path).convert('RGB')
    img = np.uint8(img)
    bboxes, _ = mtcnn.detect(img)
    w0, h0, w1, h1 = bboxes[0]
    # box center and a square half-size 1.1x the average box half-extent
    hc, wc = (h0 + h1) / 2, (w0 + w1) / 2
    crop = int(((h1 - h0) + (w1 - w0)) / 2 / 2 * 1.1)
    # shift the crop center slightly downwards (the -crop/+crop cancel)
    h0 = int(hc - crop + crop + crop * 0.15)
    w0 = int(wc - crop + crop)
    x0, y0, w, h = w0 - crop, h0 - crop, crop * 2, crop * 2
    im = cv2.imread(input_img_path)
    im_pad = cv2.copyMakeBorder(
        im, h, h, w, w,
        cv2.BORDER_REPLICATE)  # allow cropping outside by replicating borders
    im_crop = im_pad[y0 + h:y0 + h * 2, x0 + w:x0 + w * 2]
    im_crop = cv2.resize(im_crop, (128, 128), interpolation=cv2.INTER_AREA)
    cv2.imwrite(cropped_output_path, im_crop)
    return True


# download model from: http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2  # noqa
predictor = dlib.shape_predictor(
    './download/pretrained_models/shape_predictor_68_face_landmarks.dat'  # noqa
)


def get_landmark(filepath):
    """get landmark with dlib

    :return: (success, lm) where lm is np.array shape=(68, 2) of the LAST
        detected face, or (False, None) if no face is found
    """
    detector = dlib.get_frontal_face_detector()
    img = dlib.load_rgb_image(filepath)
    dets = detector(img, 1)

    if len(dets) < 1:
        return False, None

    # print("Number of faces detected: {}".format(len(dets)))
    for k, d in enumerate(dets):
        # print("Detection {}: Left: {} Top: {} Right: {} Bottom: {}".format(
        #     k, d.left(), d.top(), d.right(), d.bottom()))
        # Get the landmarks/parts for the face in box d.
        shape = predictor(img, d)
        # print("Part 0: {}, Part 1: {} ...".format(
        #     shape.part(0), shape.part(1)))

    t = list(shape.parts())
    a = []
    for tt in t:
        a.append([tt.x, tt.y])
    lm = np.array(a)
    # lm is a shape=(68,2) np.array
    return True, lm


def crop_img_1024(input_img_path, cropped_output_path):
    """Align and crop a face to 1024x1024 using the FFHQ recipe.

    :param input_img_path: str, path of the raw image
    :param cropped_output_path: str, where the aligned crop is written
    :return: True on success, False if no face landmarks were found
    """
    success, lm = get_landmark(input_img_path)
    if success is False:
        return False

    lm_eye_left = lm[36:42]  # left-clockwise
    lm_eye_right = lm[42:48]  # left-clockwise
    lm_mouth_outer = lm[48:60]  # left-clockwise

    # Calculate auxiliary vectors.
    eye_left = np.mean(lm_eye_left, axis=0)
    eye_right = np.mean(lm_eye_right, axis=0)
    eye_avg = (eye_left + eye_right) * 0.5
    eye_to_eye = eye_right - eye_left
    mouth_left = lm_mouth_outer[0]
    mouth_right = lm_mouth_outer[6]
    mouth_avg = (mouth_left + mouth_right) * 0.5
    eye_to_mouth = mouth_avg - eye_avg

    # Choose oriented crop rectangle.
    x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1]
    x /= np.hypot(*x)
    x *= max(np.hypot(*eye_to_eye) * 2.0, np.hypot(*eye_to_mouth) * 1.8)
    y = np.flipud(x) * [-1, 1]
    c = eye_avg + eye_to_mouth * 0.1
    quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y])
    qsize = np.hypot(*x) * 2

    # read image
    img = PIL.Image.open(input_img_path)

    output_size = 1024
    transform_size = 4096
    enable_padding = True

    # Shrink.
    shrink = int(np.floor(qsize / output_size * 0.5))
    if shrink > 1:
        rsize = (int(np.rint(float(img.size[0]) / shrink)),
                 int(np.rint(float(img.size[1]) / shrink)))
        img = img.resize(rsize, PIL.Image.ANTIALIAS)
        quad /= shrink
        qsize /= shrink

    # Crop.
    border = max(int(np.rint(qsize * 0.1)), 3)
    crop = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))),
            int(np.ceil(max(quad[:, 0]))), int(np.ceil(max(quad[:, 1]))))
    crop = (max(crop[0] - border, 0), max(crop[1] - border, 0),
            min(crop[2] + border, img.size[0]),
            min(crop[3] + border, img.size[1]))
    if crop[2] - crop[0] < img.size[0] or crop[3] - crop[1] < img.size[1]:
        img = img.crop(crop)
        quad -= crop[0:2]

    # Pad.
    pad = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))),
           int(np.ceil(max(quad[:, 0]))), int(np.ceil(max(quad[:, 1]))))
    pad = (max(-pad[0] + border, 0), max(-pad[1] + border, 0),
           max(pad[2] - img.size[0] + border, 0),
           max(pad[3] - img.size[1] + border, 0))
    if enable_padding and max(pad) > border - 4:
        pad = np.maximum(pad, int(np.rint(qsize * 0.3)))
        img = np.pad(
            np.float32(img), ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)),
            'reflect')
        h, w, _ = img.shape
        y, x, _ = np.ogrid[:h, :w, :1]
        # feather the padded border with a blur + median blend
        mask = np.maximum(
            1.0 -
            np.minimum(np.float32(x) / pad[0],
                       np.float32(w - 1 - x) / pad[2]), 1.0 -
            np.minimum(np.float32(y) / pad[1],
                       np.float32(h - 1 - y) / pad[3]))
        blur = qsize * 0.02
        img += (scipy.ndimage.gaussian_filter(img, [blur, blur, 0]) -
                img) * np.clip(mask * 3.0 + 1.0, 0.0, 1.0)
        img += (np.median(img, axis=(0, 1)) - img) * np.clip(mask, 0.0, 1.0)
        img = PIL.Image.fromarray(
            np.uint8(np.clip(np.rint(img), 0, 255)), 'RGB')
        quad += pad[:2]

    # Transform.
    img = img.transform((transform_size, transform_size), PIL.Image.QUAD,
                        (quad + 0.5).flatten(), PIL.Image.BILINEAR)
    if output_size < transform_size:
        img = img.resize((output_size, output_size), PIL.Image.ANTIALIAS)

    img.save(cropped_output_path)
    return True


================================================
FILE: utils/dialog_edit_utils.py
================================================
import random

import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import torch

from language.generate_feedback import instantiate_feedback
from language.run_encoder import encode_request
from models.utils import save_image
from utils.editing_utils import edit_target_attribute


def dialog_with_real_user(field_model,
                          latent_code,
                          opt,
                          args,
                          dialog_logger,
                          display_img=False):
    # initialize dialog recorder
    state_log = ['start']
    edit_log = []
    system_log = [{"text": None, "system_mode": 'start', "attribute": None}]
    user_log = []
    not_used_attribute = [
        'Bangs', "Eyeglasses", "No_Beard", "Smiling", "Young"
    ]
    text_log = []
def dialog_with_real_user(field_model,
                          latent_code,
                          opt,
                          args,
                          dialog_logger,
                          display_img=False):
    """Run the interactive editing dialog loop with a real user.

    Each round: encode the user's request, advance the dialog FSM, apply
    the requested attribute edit on the latent code, then generate system
    feedback. Loops until the FSM reaches the 'end' state.

    Args:
        field_model: editing model exposing `synthesize_and_predict` and
            used by `edit_target_attribute`.
        latent_code (np.ndarray): initial latent code (moved to CUDA here).
        opt (dict): options; `opt['path']['visualization']` is written to.
        args: argparse-style namespace forwarded to the language modules.
        dialog_logger: logger used for all dialog transcripts.
        display_img (bool): if True, show images via matplotlib.

    Returns:
        dict: all per-round logs (state/edit/system/user/text logs).
    """
    # initialize dialog recorder
    state_log = ['start']
    edit_log = []
    system_log = [{"text": None, "system_mode": 'start', "attribute": None}]
    user_log = []
    # attributes the dialog has not touched yet (candidates for suggestions)
    not_used_attribute = [
        'Bangs', "Eyeglasses", "No_Beard", "Smiling", "Young"
    ]
    text_log = []
    text_image_log = []

    # initialize first round's variables
    round_idx = 0
    edited_latent_code = None
    with torch.no_grad():
        start_image, start_label, start_score = \
            field_model.synthesize_and_predict(torch.from_numpy(latent_code).to(torch.device('cuda')))  # noqa
    save_image(start_image, f'{opt["path"]["visualization"]}/start_image.png')
    if display_img:
        plt.figure()
        plt.imshow(
            mpimg.imread(f'{opt["path"]["visualization"]}/start_image.png'))
        plt.axis('off')
        plt.show()

    # initialize attribtue_dict from the predictor's labels on the start image
    attribute_dict = {
        "Bangs": start_label[0],
        "Eyeglasses": start_label[1],
        "No_Beard": start_label[2],
        "Smiling": start_label[3],
        "Young": start_label[4],
    }
    dialog_logger.info('START IMAGE >>> ' + str(attribute_dict))

    while True:
        dialog_logger.info('\n---------------------------------------- Edit '
                           + str(round_idx) +
                           '----------------------------------------\n')
        # -------------------- TAKE USER INPUT --------------------
        # understand user input
        user_labels = encode_request(
            args,
            system_mode=system_log[-1]['system_mode'],
            dialog_logger=dialog_logger)
        text_image_log.append('USER: ' + user_labels['text'])
        # update not_used_attribute
        if user_labels['attribute'] in not_used_attribute:
            not_used_attribute.remove(user_labels['attribute'])

        # #################### DECIDE STATE ####################
        state = decide_next_state(
            state=state_log[-1],
            system_mode=system_log[-1]['system_mode'],
            user_mode=user_labels['user_mode'])
        if state == 'end':
            # record the final user turn before leaving the loop
            user_log.append(user_labels)
            state_log.append(state)
            text_log.append('USER: ' + user_labels['text'])
            break

        # #################### DECIDE EDIT ####################
        edit_labels = decide_next_edit(
            edit_log=edit_log,
            system_labels=system_log[-1],
            user_labels=user_labels,
            state=state,
            attribute_dict=attribute_dict,
            dialog_logger=dialog_logger)
        text_image_log.append(edit_labels)
        attribute_dict, exception_mode, latent_code, edited_latent_code = edit_target_attribute(  # noqa
            opt, attribute_dict, edit_labels, round_idx, latent_code,
            edited_latent_code, field_model, display_img=display_img)
        if state == 'no_edit':
            dialog_logger.info('NO EDIT >>> ' + str(attribute_dict))
        else:
            dialog_logger.info('UPDATED IMAGE >>> ' + str(attribute_dict))
        # copy() so later rounds do not mutate this snapshot
        text_image_log.append(attribute_dict.copy())

        # #################### DECIDE SYSTEM ####################
        # decide system feedback hard labels
        temp_system_labels = decide_next_feedback(
            system_labels=system_log[-1],
            user_labels=user_labels,
            state=state,
            edit_labels=edit_labels,
            not_used_attribute=not_used_attribute,
            round_idx=round_idx,
            exception_mode=exception_mode)
        # instantiate feedback (turn hard labels into a natural sentence)
        system_labels = instantiate_feedback(
            args,
            system_mode=temp_system_labels['system_mode'],
            attribute=temp_system_labels['attribute'],
            exception_mode=exception_mode)
        dialog_logger.info('SYSTEM FEEDBACK >>> ' + system_labels['text'])
        # update not_used_attribute
        if system_labels['attribute'] in not_used_attribute:
            not_used_attribute.remove(system_labels['attribute'])

        # -------------------- UPDATE LOG --------------------
        state_log.append(state)
        edit_log.append(edit_labels)
        system_log.append(system_labels)
        user_log.append(user_labels)
        text_log.append('USER: ' + user_labels['text'])
        text_log.append('SYSTEM: ' + system_labels['text'])
        text_log.append('')
        text_image_log.append('SYSTEM: ' + system_labels['text'])
        text_image_log.append('')

        round_idx += 1

    dialog_overall_log = {
        'state_log': state_log,
        'edit_log': edit_log,
        'system_log': system_log,
        'user_log': user_log,
        'text_log': text_log,
        'text_image_log': text_image_log
    }
    dialog_logger.info('Dialog successfully ended.')
    return dialog_overall_log
next_state = 'edit' elif user_mode == 'no': next_state = 'no_edit' elif user_mode == 'no_end': next_state = 'end' else: raise ValueError("invalid user_mode") elif system_mode == 'whether_enough': if user_mode == 'yes': next_state = 'no_edit' elif user_mode == 'yes_pureRequest': next_state = 'edit' elif user_mode == 'yes_end': next_state = 'end' elif user_mode == 'no': next_state = 'edit' elif user_mode == 'no_pureRequest': next_state = 'edit' else: raise ValueError("invalid user_mode") elif system_mode == 'whats_next': if user_mode == 'pureRequest': next_state = 'edit' elif user_mode == 'end': next_state = 'end' else: raise ValueError("invalid system_mode") elif state == 'no_edit': if system_mode == 'suggestion': if user_mode == 'yes': next_state = 'edit' elif user_mode == 'yes_pureRequest': next_state = 'edit' elif user_mode == 'no_pureRequest': next_state = 'edit' elif user_mode == 'no': next_state = 'no_edit' elif user_mode == 'no_end': next_state = 'end' else: raise ValueError("invalid user_mode") elif system_mode == 'whether_enough': raise ValueError("invalid system_mode") elif system_mode == 'whats_next': if user_mode == 'pureRequest': next_state = 'edit' elif user_mode == 'end': next_state = 'end' else: raise ValueError("invalid system_mode") elif state == 'end': raise ValueError("invalid state") else: raise ValueError("invalid state") return next_state def decide_next_edit(edit_log, system_labels, user_labels, state, attribute_dict, dialog_logger): """ Input: previous edit, system, user, resulting state, attribute labels Output: current edit """ attribute = None score_change_direction = None score_change_value = None target_score = None if len(edit_log) > 0: edit_labels = edit_log[-1] # ---------- decide edit_labels ---------- if len(edit_log) == 0: # now is the first round, so edit according to user request assert 'pureRequest' in user_labels['user_mode'] assert state == 'edit' attribute = user_labels['attribute'] score_change_direction = 
def decide_next_edit(edit_log, system_labels, user_labels, state,
                     attribute_dict, dialog_logger):
    """Derive the edit specification for the current round.

    Combines the previous edit, the system's last feedback and the user's
    reply into a fully-specified edit: which attribute, which direction,
    by how much, and toward which target score (clamped to [0, 5]).

    Returns:
        dict with keys 'attribute', 'score_change_direction',
        'score_change_value' and 'target_score' (all None for no-op rounds).
    """
    attr = None
    direction = None
    change = None
    target = None

    prev_edit = edit_log[-1] if len(edit_log) > 0 else None
    is_request = 'pureRequest' in user_labels['user_mode']

    # ---------- decide the raw edit labels ----------
    if prev_edit is None or is_request:
        # first round, or an explicit request: follow the user verbatim
        # (round 0 must be a request, hence the assertion)
        assert is_request
        assert state == 'edit'
        attr = user_labels['attribute']
        direction = user_labels['score_change_direction']
        change = user_labels['score_change_value']
        target = user_labels['target_score']
        if user_labels['request_mode'] == 'change_indefinite':
            # "make it more/less ..." without an amount -> one step
            assert change is None
            change = 1
    elif (system_labels['system_mode'] == 'whether_enough'
          and user_labels['user_mode'] == 'no'):
        # user is not satisfied yet: continue the previous edit by one step
        assert state == 'edit'
        attr = prev_edit['attribute']
        direction = prev_edit['score_change_direction']
        change = 1
        target = None
    elif (system_labels['system_mode'] == 'suggestion'
          and user_labels['user_mode'] == 'yes'):
        # user accepted the suggestion: pick a direction from the current
        # degree (low scores go up, high scores go down)
        assert state == 'edit'
        attr = system_labels['attribute']
        direction = 'positive' if attribute_dict[attr] <= 2 else 'negative'
        change = 1
        target = None
    else:
        # no edit this round
        assert state in ('no_edit', 'end')

    # --- moderation mechanism for the language encoder: when the user
    # simply agreed to a suggestion, trust the system's attribute ---
    if (system_labels['system_mode'] == 'suggestion'
            and user_labels['user_mode'] == 'yes'):
        attr = system_labels['attribute']

    # ---------- fill in the remaining values ----------
    if attr is None:
        assert direction is None and change is None and target is None
    elif target is not None:
        # target given: derive direction and magnitude from the current score
        assert direction is None and change is None
        current = attribute_dict[attr]
        if target > current:
            direction = 'positive'
        elif target < current:
            direction = 'negative'
        change = abs(target - current)
    elif direction is not None:
        # direction + magnitude given: derive the target score
        assert change is not None
        if direction == 'positive':
            target = attribute_dict[attr] + change
        elif direction == 'negative':
            target = attribute_dict[attr] - change
        else:
            raise ValueError('invalid direction')
        # clamp to the valid score range [0, 5]
        if target > 5:
            target = 5
            change = abs(target - attribute_dict[attr])
        elif target < 0:
            target = 0
            change = abs(target - attribute_dict[attr])

    return {
        'attribute': attr,
        'score_change_direction': direction,
        "score_change_value": change,
        'target_score': target
    }
# higher chance at earlier rounds if system_mode is None: whats_next_random_num = random.uniform(0, 1) whats_next_prob_list = [0.5, 0.4, 0.3, 0.3] if round_idx <= 3: whats_next_prob = whats_next_prob_list[round_idx] else: whats_next_prob = 0.2 if whats_next_random_num < whats_next_prob: system_mode = 'whats_next' feedback_attribute = None # ---------- suggestion ---------- # if a lot of attribute has been edited, don't be suggestion if system_mode is None: suggestion_random_num = random.uniform(0, 1) suggestion_prob = len(not_used_attribute) * 0.2 if suggestion_random_num < suggestion_prob: system_mode = 'suggestion' if len(not_used_attribute) > 0: feedback_attribute = random.choice(not_used_attribute) else: system_mode = None # ---------- whether_enough ---------- # if not chosen to be 'whats_next' or 'suggestion', # then use 'whether_enough' if system_mode is None: system_mode = 'whether_enough' if state == 'no_edit': continue else: feedback_attribute = edit_labels['attribute'] assert feedback_attribute is not None # if state is no_edit, system_mode cannot be whether_enough if not (state == 'no_edit' and system_mode == 'whether_enough'): break next_system_labels = { 'exception_mode': exception_mode, 'system_mode': system_mode, 'attribute': feedback_attribute } return next_system_labels ================================================ FILE: utils/editing_utils.py ================================================ def edit_target_attribute(opt, attribute_dict, edit_labels, round_idx, latent_code, edited_latent_code, field_model, editing_logger=None, print_intermediate_result=False, display_img=False): """ Input: current attribute labels, how to edit Output: updated attribute labels """ edit_attr_name = edit_labels['attribute'] if edit_attr_name is None: # dialog_logger.info('No edit in the current round') exception_mode = 'normal' return attribute_dict, exception_mode, latent_code, edited_latent_code # define network field_model.target_attr_idx = 
int(opt['attr_to_idx'][edit_attr_name]) field_model.load_network(opt['pretrained_field'][edit_attr_name]) latent_code, edited_latent_code, saved_label, exception_mode = \ field_model.continuous_editing_with_target( latent_codes=latent_code, target_cls=edit_labels['target_score'], save_dir=opt['path']['visualization'], editing_logger=editing_logger, edited_latent_code=edited_latent_code, prefix=f'edit_order_{str(round_idx)}', print_intermediate_result=print_intermediate_result, display_img=display_img) latent_code = latent_code.cpu().numpy() # update attribtue_dict for idx, (attr, old_label) in enumerate(list(attribute_dict.items())): new_label = int(saved_label[idx]) if field_model.target_attr_idx != idx and new_label != old_label: pass attribute_dict[attr] = new_label return attribute_dict, exception_mode, latent_code, edited_latent_code ================================================ FILE: utils/inversion_utils.py ================================================ import math import models.archs.stylegan2.lpips as lpips import numpy as np import torch from PIL import Image from torch import optim from torch.nn import functional as F from torchvision import transforms from tqdm import tqdm from utils.crop_img import crop_img def noise_regularize(noises): loss = 0 for noise in noises: size = noise.shape[2] while True: loss = ( loss + (noise * torch.roll(noise, shifts=1, dims=3)).mean().pow(2) + (noise * torch.roll(noise, shifts=1, dims=2)).mean().pow(2)) if size <= 8: break noise = noise.reshape([-1, 1, size // 2, 2, size // 2, 2]) noise = noise.mean([3, 5]) size //= 2 return loss def noise_normalize_(noises): for noise in noises: mean = noise.mean() std = noise.std() noise.data.add_(-mean).div_(std) def get_lr(t, initial_lr, rampdown=0.25, rampup=0.05): lr_ramp = min(1, (1 - t) / rampdown) lr_ramp = 0.5 - 0.5 * math.cos(lr_ramp * math.pi) lr_ramp = lr_ramp * min(1, t / rampup) return initial_lr * lr_ramp def latent_noise(latent, strength): noise = 
def latent_noise(latent, strength):
    """Return `latent` perturbed by Gaussian noise scaled by `strength`."""
    noise = torch.randn_like(latent) * strength
    return latent + noise


def make_image(tensor):
    """Convert a (B, C, H, W) image tensor in [-1, 1] to uint8 HWC numpy.

    Clamps, rescales to [0, 255], permutes to (B, H, W, C) and moves to CPU.
    """
    return (tensor.detach().clamp_(min=-1, max=1).add(1).div_(2).mul(255).type(
        torch.uint8).permute(0, 2, 3, 1).to("cpu").numpy())


def inversion(opt, field_model):
    """Project a real image into the StyleGAN latent space (GAN inversion).

    Optimizes a latent code (initialized at the mean W) and, jointly, any
    trainable generator parameters, to minimize LPIPS + weighted MSE between
    the generated and target image. Follows the StyleGAN2 projector recipe.

    Args:
        opt (dict): options; uses `opt['inversion']` (device, img_path,
            crop_img, lr, lr_gen, step, noise, noise_ramp, img_mse_weight)
            and `opt['img_res']`.
        field_model: model exposing `stylegan_gen` (generator with
            `style_forward`) — runs on CUDA.

    Returns:
        np.ndarray: the optimized latent code, shape (1, latent_dim).
    """
    inv_opt = opt['inversion']
    device = inv_opt['device']
    img_size = opt['img_res']

    # inversion: load the (optionally face-cropped) target image
    transform = transforms.Compose([
        transforms.Resize(img_size),
        transforms.CenterCrop(img_size),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
    ])
    if inv_opt['crop_img']:
        cropped_output_path = f'{opt["path"]["visualization"]}/cropped.png'
        crop_img(img_size, inv_opt['img_path'], cropped_output_path, device)
        img = transform(Image.open(cropped_output_path).convert("RGB"))
    else:
        img = transform(Image.open(inv_opt['img_path']).convert("RGB"))
    img = img.unsqueeze(0).to(torch.device('cuda'))

    # downsample to <=256 px for the perceptual loss
    batch, channel, height, width = img.shape
    if height > 256:
        factor = height // 256
        img = img.reshape(batch, channel, height // factor, factor,
                          width // factor, factor)
        img = img.mean([3, 5])

    # estimate the mean and spread of W from random style vectors
    n_mean_latent = 10000
    with torch.no_grad():
        noise_sample = torch.randn(n_mean_latent, 512, device=device)
        latent_out = field_model.stylegan_gen.style_forward(noise_sample)
        latent_mean = latent_out.mean(0)
        latent_std = ((latent_out - latent_mean).pow(2).sum() /
                      n_mean_latent)**0.5

    percept = lpips.PerceptualLoss(
        model="net-lin", net="vgg", use_gpu=device.startswith("cuda"))

    # start from the mean latent; optimize it together with any trainable
    # generator parameters (at a separate learning rate)
    latent_in = latent_mean.detach().clone().unsqueeze(0).repeat(
        img.shape[0], 1)
    latent_in.requires_grad = True
    optim_params = []
    for v in field_model.stylegan_gen.parameters():
        if v.requires_grad:
            optim_params.append(v)
    optimizer = optim.Adam([{
        'params': [latent_in]
    }, {
        'params': optim_params,
        'lr': inv_opt['lr_gen']
    }], lr=inv_opt['lr'])

    pbar = tqdm(range(inv_opt['step']))
    latent_path = []
    for i in pbar:
        t = i / inv_opt['step']
        lr = get_lr(t, inv_opt['lr'])
        optimizer.param_groups[0]["lr"] = lr
        # annealed latent-space noise injection for better exploration
        noise_strength = latent_std * inv_opt['noise'] * max(
            0, 1 - t / inv_opt['noise_ramp'])**2
        latent_n = latent_noise(latent_in, noise_strength.item())

        img_gen, _ = field_model.stylegan_gen([latent_n],
                                              input_is_latent=True,
                                              randomize_noise=False)
        batch, channel, height, width = img_gen.shape
        if height > 256:
            factor = height // 256
            img_gen = img_gen.reshape(batch, channel, height // factor,
                                      factor, width // factor, factor)
            img_gen = img_gen.mean([3, 5])

        p_loss = percept(img_gen, img).sum()
        mse_loss = F.mse_loss(img_gen, img)
        loss = p_loss + inv_opt['img_mse_weight'] * mse_loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i + 1) % 100 == 0:
            latent_path.append(latent_in.detach().clone())

        pbar.set_description((f"total: {loss:.4f}; perceptual: {p_loss:.4f};"
                              f" mse: {mse_loss:.4f}; lr: {lr:.4f}"))

    latent_code = latent_in[0].cpu().detach().numpy()
    latent_code = np.expand_dims(latent_code, axis=0)
    return latent_code


class MessageLogger():
    """Message logger for printing.

    Args:
        opt (dict): Config. It contains the following keys:
            name (str): Exp name.
            print_freq (int): logger interval.
            max_iters (int): total iters.
            use_tb_logger (bool): Use tensorboard logger.
        start_iter (int): Start iter. Default: 1.
        tb_logger (obj:`tb_logger`): Tensorboard logger. Default: None.
    """

    def __init__(self, opt, start_iter=1, tb_logger=None):
        self.exp_name = opt['name']
        self.interval = opt['print_freq']
        self.start_iter = start_iter
        self.max_iters = opt['max_iters']
        self.use_tb_logger = opt['use_tb_logger']
        self.tb_logger = tb_logger
        self.start_time = time.time()
        self.logger = get_root_logger()

    def __call__(self, log_vars):
        """Format and emit one logging message.

        Args:
            log_vars (dict): It contains the following keys:
                epoch (int): Epoch number.
                iter (int): Current iter.
                lrs (list): List for learning rates.
                time (float): Iter time.
                data_time (float): Data time for each iter.
                Any remaining entries are treated as loss values.
        """
        # epoch, iter, learning rates
        epoch = log_vars.pop('epoch')
        current_iter = log_vars.pop('iter')
        lrs = log_vars.pop('lrs')
        message = (f'[{self.exp_name[:5]}..][epoch:{epoch:3d}, '
                   f'iter:{current_iter:8,d}, lr:(')
        for v in lrs:
            message += f'{v:.3e},'
        message += ')] '

        # time and estimated time
        if 'time' in log_vars.keys():
            iter_time = log_vars.pop('time')
            data_time = log_vars.pop('data_time')
            total_time = time.time() - self.start_time
            # average seconds per iter since this logger was created
            time_sec_avg = total_time / (current_iter - self.start_iter + 1)
            eta_sec = time_sec_avg * (self.max_iters - current_iter - 1)
            eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
            message += f'[eta: {eta_str}, '
            message += f'time: {iter_time:.3f}, data_time: {data_time:.3f}] '

        # other items, especially losses
        for k, v in log_vars.items():
            message += f'{k}: {v:.4e} '
            # tensorboard logger (skipped for debug experiments)
            if self.use_tb_logger and 'debug' not in self.exp_name:
                self.tb_logger.add_scalar(k, v, current_iter)
        self.logger.info(message)


def init_tb_logger(log_dir):
    """Create a tensorboard SummaryWriter writing into `log_dir`."""
    from torch.utils.tensorboard import SummaryWriter
    tb_logger = SummaryWriter(log_dir=log_dir)
    return tb_logger
""" logger = logging.getLogger(logger_name) # if the logger has been initialized, just return it if logger.hasHandlers(): return logger format_str = '%(asctime)s.%(msecs)03d - %(levelname)s: %(message)s' logging.basicConfig(format=format_str, level=log_level) if log_file is not None: file_handler = logging.FileHandler(log_file, 'w') file_handler.setFormatter(logging.Formatter(format_str)) file_handler.setLevel(log_level) logger.addHandler(file_handler) return logger ================================================ FILE: utils/numerical_metrics.py ================================================ import argparse import glob import cv2 import numpy as np import torch import torch.nn as nn import torch.nn.functional as F import torchvision.transforms as transforms from models.archs.attribute_predictor_arch import resnet50 from models.losses.arcface_loss import resnet_face18 from models.utils import output_to_label from PIL import Image def parse_args(): """Parses arguments.""" parser = argparse.ArgumentParser( description='Continuous image editing via field function') # inference parser.add_argument( '--attribute', type=str, required=True, help='[Bangs, Eyeglasses, No_Beard, Smiling, Young]') parser.add_argument('--confidence_thresh', type=float, default=0.8) # input and output directories parser.add_argument( '--work_dir', required=True, type=str, metavar='PATH', help='path to save checkpoint and log files.') parser.add_argument( '--image_dir', required=True, type=str, metavar='PATH', help='path to save checkpoint and log files.') parser.add_argument('--image_num', type=int, required=True) parser.add_argument('--debug', default=0, type=int) # predictor args parser.add_argument( '--attr_file', required=True, type=str, help='directory to attribute metadata') parser.add_argument( '--predictor_ckpt', required=True, type=str, help='The pretrained network weights for testing') parser.add_argument('--num_attr', type=int, default=5) # arcface loss args parser.add_argument( 
def get_edited_images_list(img_dir, img_idx):
    """Collect the start image and its edited versions for one identity.

    Files are expected to be named `{idx:03d}_num_edits_{k}_*.png`; index 0
    is the unedited start image. Asserts exactly one match per edit step.

    Returns:
        list[str]: [start_path, edit_1_path, ..., edit_n_path].
    """
    return_img_list = []
    img_path_list = glob.glob(f'{img_dir}/{img_idx:03d}_*.png')
    start_img_path = glob.glob(f'{img_dir}/{img_idx:03d}_num_edits_0_*.png')
    assert len(start_img_path) == 1
    return_img_list.append(start_img_path[0])
    num_edits = len(img_path_list) - 1
    if num_edits > 0:
        for edit_idx in range(1, num_edits + 1):
            img_path_edit_list = glob.glob(
                f'{img_dir}/{img_idx:03d}_num_edits_{edit_idx}_*.png')
            assert len(img_path_edit_list) == 1
            return_img_list.append(img_path_edit_list[0])
    return return_img_list


def load_image_predictor(img_path, transform=None):
    """Load an RGB image and normalize it for the attribute predictor.

    FIX: the default transform used to be constructed at import time as a
    default argument (`transform=transforms.Compose([...])`), which does
    torchvision work at module load and shares one object across all calls.
    It is now built lazily per call; passing a transform explicitly behaves
    exactly as before.

    Args:
        img_path (str): path to the image.
        transform: optional torchvision transform; defaults to ToTensor().

    Returns:
        torch.Tensor: (1, 3, H, W) CUDA tensor, resized to <=128 px and
        normalized with ImageNet mean/std.
    """
    if transform is None:
        transform = transforms.Compose([transforms.ToTensor()])
    image = Image.open(img_path).convert('RGB')
    image = transform(image)
    image = image.to(torch.device('cuda')).unsqueeze(0)
    if image.size()[-1] > 128:
        image = F.interpolate(image, (128, 128), mode='area')
    # ImageNet normalization expected by the resnet50 predictor
    img_mean = torch.Tensor([0.485, 0.456,
                             0.406]).view(1, 3, 1, 1).to(torch.device('cuda'))
    img_std = torch.Tensor([0.229, 0.224,
                            0.225]).view(1, 3, 1, 1).to(torch.device('cuda'))
    image = (image - img_mean) / img_std
    return image


def load_image_arcface(img_path):
    """Load a grayscale image scaled to [-1, 1] for the ArcFace model.

    Returns:
        torch.Tensor | None: (1, 1, H, W) CUDA tensor resized to <=128 px,
        or None when the file cannot be read.
    """
    image = cv2.imread(img_path, 0)  # 0 -> read as grayscale
    if image is None:
        return None
    image = image[:, :, np.newaxis]
    image = image.transpose((2, 0, 1))
    image = image[:, np.newaxis, :, :]
    image = image.astype(np.float32, copy=False)
    # map [0, 255] -> [-1, 1]
    image -= 127.5
    image /= 127.5
    image = torch.from_numpy(image).to(torch.device('cuda'))
    if image.size()[-1] > 128:
        image = F.interpolate(image, (128, 128), mode='area')
    return image


def cosin_metric(x1, x2):
    """Cosine similarity between two feature vectors (numpy arrays)."""
    return np.dot(x1, x2) / (np.linalg.norm(x1) * np.linalg.norm(x2))


def predictor_score(predictor_output, gt_label, target_attr_idx,
                    criterion_predictor):
    """Average cross-entropy over all attributes except the edited one.

    Measures how well the non-target attributes are preserved after an
    edit: lower is better.

    Args:
        predictor_output: per-attribute logits from the predictor.
        gt_label: per-attribute labels of the source (unedited) image.
        target_attr_idx (int): index of the attribute being edited (skipped).
        criterion_predictor: loss, e.g. nn.CrossEntropyLoss.
    """
    num_attr = len(predictor_output)
    loss_avg = 0
    count = 0
    for attr_idx in range(num_attr):
        if attr_idx == target_attr_idx:
            continue
        loss_attr = criterion_predictor(
            predictor_output[attr_idx],
            gt_label[attr_idx].unsqueeze(0).to(torch.device('cuda')))
        loss_avg += loss_attr
        count += 1
    loss_avg = loss_avg / count
    return loss_avg
# ---------------------------------------------------------------------------
# Orphaned fragment: tail of editing_quantitative.py from the extraction.
# The enclosing function begins before this chunk, so it cannot be safely
# reconstructed; preserved verbatim as comments.
#
#         predictor(edited_img_predictor)
#         temp_predictor_score_img = predictor_score(
#             edited_predictor_output, source_label, target_attr_idx,
#             criterion_predictor)
#         predictor_score_img += temp_predictor_score_img
#     arcface_sim_img = arcface_sim_img / num_edits
#     predictor_score_img = predictor_score_img / num_edits
#     arcface_sim_dataset += arcface_sim_img
#     predictor_score_dataset += predictor_score_img
#     count += 1
#     logger.info(
#         f'{img_idx:03d}: Arcface: {arcface_sim_img: .4f}, Predictor: {predictor_score_img: .4f}.'  # noqa
#     )
# else:
#     logger.info(f'{img_idx:03d}: no available edits.')
# arcface_sim_dataset = arcface_sim_dataset / count
# predictor_score_dataset = predictor_score_dataset / count
# logger.info(
#     f'Avg: {arcface_sim_dataset: .4f}, {predictor_score_dataset: .4f}.')
# return arcface_sim_dataset, predictor_score_dataset
# ---------------------------------------------------------------------------

# ================ FILE: utils/options.py ================
import os
import os.path as osp
from collections import OrderedDict


def ordered_yaml():
    """Support OrderedDict for yaml.

    Returns:
        yaml Loader and Dumper.
    """
    # PyYAML is imported lazily so that the pure-dict helpers in this module
    # (dict2str, dict_to_nonedict, parse_args_from_opt, ...) can be imported
    # and used without PyYAML installed.
    import yaml
    try:
        # prefer the fast C implementations when libyaml is available
        from yaml import CDumper as Dumper
        from yaml import CLoader as Loader
    except ImportError:
        from yaml import Dumper, Loader

    _mapping_tag = yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG

    def dict_representer(dumper, data):
        return dumper.represent_dict(data.items())

    def dict_constructor(loader, node):
        return OrderedDict(loader.construct_pairs(node))

    Dumper.add_representer(OrderedDict, dict_representer)
    Loader.add_constructor(_mapping_tag, dict_constructor)
    return Loader, Dumper


def parse(opt_path, is_train=True):
    """Parse option file.

    Args:
        opt_path (str): Option file path.
        is_train (bool): Indicate whether in training or not. Default: True.

    Returns:
        (dict): Options.
    """
    import yaml  # deferred; see the note in ordered_yaml()

    with open(opt_path, mode='r') as f:
        Loader, _ = ordered_yaml()
        opt = yaml.load(f, Loader=Loader)

    # optionally restrict visible GPUs to the configured ids
    gpu_list = ','.join(str(x) for x in opt['gpu_ids'])
    if opt.get('set_CUDA_VISIBLE_DEVICES'):
        os.environ['CUDA_VISIBLE_DEVICES'] = gpu_list

    opt['is_train'] = is_train

    # datasets
    if opt['is_train']:
        input_latent_dir = opt['input_latent_dir']
        opt['dataset'] = {}
        opt['dataset']['train_latent_dir'] = f'{input_latent_dir}/train'
        if opt['val_on_train_subset']:
            opt['dataset'][
                'train_subset_latent_dir'] = f'{input_latent_dir}/train_subset'
        if opt['val_on_valset']:
            opt['dataset']['val_latent_dir'] = f'{input_latent_dir}/val'

    # paths (root is two levels above this file, i.e. the repo root)
    opt['path'] = {}
    opt['path']['root'] = osp.abspath(
        osp.join(__file__, osp.pardir, osp.pardir))
    if is_train:
        experiments_root = osp.join(opt['path']['root'], 'experiments',
                                    opt['name'])
        opt['path']['experiments_root'] = experiments_root
        opt['path']['models'] = osp.join(experiments_root, 'models')
        opt['path']['log'] = experiments_root
        opt['path']['visualization'] = osp.join(experiments_root,
                                                'visualization')

        # change some options for debug mode: validate/log/save every
        # iteration and train on the small subset
        if 'debug' in opt['name']:
            opt['debug'] = True
            opt['val_freq'] = 1
            opt['print_freq'] = 1
            opt['save_checkpoint_freq'] = 1
            opt['dataset'][
                'train_latent_dir'] = f'{input_latent_dir}/train_subset'
            if opt['val_on_train_subset']:
                opt['dataset'][
                    'train_subset_latent_dir'] = f'{input_latent_dir}/train_subset'  # noqa
            if opt['val_on_valset']:
                opt['dataset'][
                    'val_latent_dir'] = f'{input_latent_dir}/train_subset'
    else:  # test
        results_root = osp.join(opt['path']['root'], 'results', opt['name'])
        opt['path']['results_root'] = results_root
        opt['path']['log'] = results_root
        opt['path']['visualization'] = osp.join(results_root, 'visualization')

    # some basics for editing task
    opt['attr_list'] = ['Bangs', 'Eyeglasses', 'No_Beard', 'Smiling', 'Young']
    opt['attr_dict'] = {
        'Bangs': 0,
        'Eyeglasses': 1,
        'No_Beard': 2,
        'Smiling': 3,
        'Young': 4
    }

    if 'has_dialog' in opt:
        # BUGFIX: the original used `results_root` here, which is only bound
        # in the test branch above and raised NameError for training runs
        # that set `has_dialog`.  `opt['path']['log']` equals `results_root`
        # in test mode and `experiments_root` in train mode.
        opt['path']['dialog'] = osp.join(opt['path']['log'], 'dialog')

    return opt


def dict2str(opt, indent_level=1):
    """dict to string for printing options.

    Args:
        opt (dict): Option dict (may be nested).
        indent_level (int): Indent level. Default: 1.

    Return:
        (str): Option string for printing.
    """
    msg = ''
    for k, v in opt.items():
        if isinstance(v, dict):
            # nested dicts are rendered as "key:[ ... ]" with deeper indent
            msg += ' ' * (indent_level * 2) + k + ':[\n'
            msg += dict2str(v, indent_level + 1)
            msg += ' ' * (indent_level * 2) + ']\n'
        else:
            msg += ' ' * (indent_level * 2) + k + ': ' + str(v) + '\n'
    return msg


class NoneDict(dict):
    """None dict. It will return none if key is not in the dict."""

    def __missing__(self, key):
        return None


def dict_to_nonedict(opt):
    """Convert to NoneDict, which returns None for missing keys.

    Args:
        opt (dict): Option dict.

    Returns:
        (dict): NoneDict for options (applied recursively to nested
            dicts and lists).
    """
    if isinstance(opt, dict):
        new_opt = dict()
        for key, sub_opt in opt.items():
            new_opt[key] = dict_to_nonedict(sub_opt)
        return NoneDict(**new_opt)
    elif isinstance(opt, list):
        return [dict_to_nonedict(sub_opt) for sub_opt in opt]
    else:
        return opt


def parse_args_from_opt(args, opt):
    """Copy options from `opt` onto `args` as attributes.

    Previous code for dialog and language uses `args` to pass arguments
    among different scripts, so every top-level option and every
    `language_encoder` sub-option is set as an attribute on `args`.

    Args:
        args: Namespace-like object to receive attributes.
        opt (dict): Parsed option dict; must contain the keys
            'language_encoder' and 'pretrained_language_encoder'.

    Returns:
        args with the attributes set.
    """
    for key, value in opt.items():
        setattr(args, key, value)
    for key, value in opt['language_encoder'].items():
        setattr(args, key, value)
    args.pretrained_checkpoint = opt['pretrained_language_encoder']
    return args


def parse_opt_wrt_resolution(opt):
    """Select resolution-specific options into their generic keys.

    Copies the `*_1024` or `*_128` variants of the checkpoint/architecture
    options into the un-suffixed keys, depending on `opt['img_res']`.
    Unknown resolutions leave `opt` untouched (same as the original
    if/elif chain).

    Args:
        opt (dict): Option dict with 'img_res' and the suffixed keys.

    Returns:
        (dict): The same dict, updated in place.
    """
    res = opt['img_res']
    if res in (1024, 128):
        for key in ('channel_multiplier', 'pretrained_field',
                    'predictor_ckpt', 'generator_ckpt', 'replaced_layers'):
            opt[key] = opt[f'{key}_{res}']
    return opt
# ---------------------------------------------------------------------------
# Orphaned fragment: tail of `parse_opt_wrt_resolution` (utils/options.py)
# from the extraction; its `def` line lies outside this span, so the
# statements are preserved here as comments.
#
#         opt['predictor_ckpt'] = opt['predictor_ckpt_128']
#         opt['generator_ckpt'] = opt['generator_ckpt_128']
#         opt['replaced_layers'] = opt['replaced_layers_128']
#     return opt
# ---------------------------------------------------------------------------

# ================ FILE: utils/util.py ================
import logging
import os
import random
import sys
import time
from shutil import get_terminal_size

import numpy as np
import torch

# module-level logger shared across the project ('base' namespace)
logger = logging.getLogger('base')


def make_exp_dirs(opt):
    """Make dirs for experiments.

    In training mode the experiment and model dirs are created; re-using an
    existing dir is only allowed for debug runs.  In test mode the results
    root must not already exist, so a previous run is never silently
    overwritten.
    """
    path_opt = opt['path'].copy()
    if opt['is_train']:
        # debug runs may be re-launched into the same folder
        overwrite = 'debug' in opt['name']
        os.makedirs(path_opt.pop('experiments_root'), exist_ok=overwrite)
        os.makedirs(path_opt.pop('models'), exist_ok=overwrite)
    else:
        # intentionally raises FileExistsError if the folder already exists
        os.makedirs(path_opt.pop('results_root'))


def set_random_seed(seed):
    """Set random seeds for python, numpy and torch (CPU + all CUDA devices)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


class ProgressBar(object):
    """A progress bar which can print the progress.

    Modified from:
    https://github.com/hellock/cvbase/blob/master/cvbase/progress.py
    """

    def __init__(self, task_num=0, bar_width=50, start=True):
        """
        Args:
            task_num (int): Total number of tasks; 0 means unknown, in which
                case only the completed count and rate are printed.
            bar_width (int): Desired bar width; clamped to the terminal width.
            start (bool): Print the initial bar (and set the start time)
                immediately. Default: True.
        """
        self.task_num = task_num
        max_bar_width = self._get_max_bar_width()
        self.bar_width = (
            bar_width if bar_width <= max_bar_width else max_bar_width)
        self.completed = 0
        if start:
            self.start()

    def _get_max_bar_width(self):
        """Return the widest bar that fits the current terminal (>= 10)."""
        terminal_width, _ = get_terminal_size()
        # leave room for the counters/ETA text printed after the bar
        max_bar_width = min(int(terminal_width * 0.6), terminal_width - 50)
        if max_bar_width < 10:
            print(f'terminal width is too small ({terminal_width}), '
                  'please consider widen the terminal for better '
                  'progressbar visualization')
            max_bar_width = 10
        return max_bar_width

    def start(self):
        """Print the initial (empty) bar and record the start time."""
        if self.task_num > 0:
            sys.stdout.write(f"[{' ' * self.bar_width}] 0/{self.task_num}, "
                             f'elapsed: 0s, ETA:\nStart...\n')
        else:
            sys.stdout.write('completed: 0, elapsed: 0s')
        sys.stdout.flush()
        self.start_time = time.time()

    def update(self, msg='In progress...'):
        """Advance the bar by one completed task and redraw it.

        Args:
            msg (str): Status message shown under the bar.
                Default: 'In progress...'.
        """
        self.completed += 1
        elapsed = time.time() - self.start_time
        # BUGFIX: guard against elapsed == 0 (coarse clocks can return the
        # same timestamp as start()), which made the original raise
        # ZeroDivisionError on the first update.
        fps = self.completed / elapsed if elapsed > 0 else 0.0
        if self.task_num > 0:
            percentage = self.completed / float(self.task_num)
            eta = int(elapsed * (1 - percentage) / percentage + 0.5)
            mark_width = int(self.bar_width * percentage)
            bar_chars = '>' * mark_width + '-' * (self.bar_width - mark_width)
            sys.stdout.write('\033[2F')  # cursor up 2 lines
            sys.stdout.write(
                '\033[J'
            )  # clean the output (remove extra chars since last display)
            sys.stdout.write(
                f'[{bar_chars}] {self.completed}/{self.task_num}, '
                f'{fps:.1f} task/s, elapsed: {int(elapsed + 0.5)}s, '
                f'ETA: {eta:5}s\n{msg}\n')
        else:
            sys.stdout.write(
                f'completed: {self.completed}, elapsed: {int(elapsed + 0.5)}s, '
                f'{fps:.1f} tasks/s')
        sys.stdout.flush()