Repository: yumingj/Talk-to-Edit
Branch: main
Commit: 72c45e109006
Files: 120
Total size: 516.1 KB
Directory structure:
gitextract_av2x8cy4/
├── .gitignore
├── README.md
├── configs/
│ ├── attributes_5.json
│ ├── editing/
│ │ ├── editing_with_dialog.yml
│ │ └── editing_wo_dialog.yml
│ └── train/
│ ├── field_1024_bangs.yml
│ ├── field_1024_beard.yml
│ ├── field_1024_eyeglasses.yml
│ ├── field_1024_smiling.yml
│ ├── field_1024_young.yml
│ ├── field_128_bangs.yml
│ ├── field_128_beard.yml
│ ├── field_128_eyeglasses.yml
│ ├── field_128_smiling.yml
│ └── field_128_young.yml
├── data/
│ ├── __init__.py
│ └── latent_code_dataset.py
├── editing_quantitative.py
├── editing_with_dialog.py
├── editing_wo_dialog.py
├── environment.yml
├── language/
│ ├── accuracy.py
│ ├── build_vocab.py
│ ├── dataset.py
│ ├── generate_feedback.py
│ ├── generate_training_request.py
│ ├── language_utils.py
│ ├── lstm.py
│ ├── preprocess_request.py
│ ├── run_encoder.py
│ ├── templates/
│ │ ├── attr_wise_caption_templates.json
│ │ ├── feedback.json
│ │ ├── gender.json
│ │ ├── metadata_fsm.json
│ │ ├── overall_caption_templates.json
│ │ ├── pool.json
│ │ ├── system_mode.json
│ │ ├── user_fsm.json
│ │ ├── user_old_templates.json
│ │ └── vocab.json
│ ├── train_encoder.py
│ └── utils/
│ ├── __init__.py
│ ├── eval.py
│ ├── logger.py
│ ├── lr_schedule.py
│ ├── misc.py
│ ├── numerical.py
│ ├── progress/
│ │ ├── .gitignore
│ │ ├── LICENSE
│ │ ├── MANIFEST.in
│ │ ├── README.rst
│ │ ├── progress/
│ │ │ ├── __init__.py
│ │ │ ├── bar.py
│ │ │ ├── counter.py
│ │ │ ├── helpers.py
│ │ │ └── spinner.py
│ │ ├── setup.py
│ │ └── test_progress.py
│ ├── setup_logger.py
│ └── visualize.py
├── models/
│ ├── __init__.py
│ ├── archs/
│ │ ├── __init__.py
│ │ ├── attribute_predictor_arch.py
│ │ ├── field_function_arch.py
│ │ └── stylegan2/
│ │ ├── .gitignore
│ │ ├── LICENSE
│ │ ├── LICENSE-FID
│ │ ├── LICENSE-LPIPS
│ │ ├── LICENSE-NVIDIA
│ │ ├── __init__.py
│ │ ├── apply_factor.py
│ │ ├── calc_inception.py
│ │ ├── checkpoint/
│ │ │ └── .gitignore
│ │ ├── convert_weight.py
│ │ ├── dataset.py
│ │ ├── distributed.py
│ │ ├── fid.py
│ │ ├── generate.py
│ │ ├── inception.py
│ │ ├── inversion.py
│ │ ├── lpips/
│ │ │ ├── __init__.py
│ │ │ ├── base_model.py
│ │ │ ├── dist_model.py
│ │ │ ├── networks_basic.py
│ │ │ ├── pretrained_networks.py
│ │ │ └── weights/
│ │ │ ├── v0.0/
│ │ │ │ ├── alex.pth
│ │ │ │ ├── squeeze.pth
│ │ │ │ └── vgg.pth
│ │ │ └── v0.1/
│ │ │ ├── alex.pth
│ │ │ ├── squeeze.pth
│ │ │ └── vgg.pth
│ │ ├── model.py
│ │ ├── non_leaking.py
│ │ ├── op/
│ │ │ ├── __init__.py
│ │ │ ├── fused_act.py
│ │ │ ├── fused_bias_act.cpp
│ │ │ ├── fused_bias_act_kernel.cu
│ │ │ ├── upfirdn2d.cpp
│ │ │ ├── upfirdn2d.py
│ │ │ └── upfirdn2d_kernel.cu
│ │ ├── ppl.py
│ │ ├── sample/
│ │ │ └── .gitignore
│ │ └── train.py
│ ├── base_model.py
│ ├── field_function_model.py
│ ├── losses/
│ │ ├── __init__.py
│ │ ├── arcface_loss.py
│ │ └── discriminator_loss.py
│ └── utils.py
├── quantitative_results.py
├── train.py
└── utils/
├── __init__.py
├── crop_img.py
├── dialog_edit_utils.py
├── editing_utils.py
├── inversion_utils.py
├── logger.py
├── numerical_metrics.py
├── options.py
└── util.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
experiments/
results/
tb_logger/
*.pyc
.vscode/
download
download/*
*.sh
================================================
FILE: README.md
================================================
# Talk-to-Edit (ICCV2021)


This repository contains the implementation of the following paper:
> **Talk-to-Edit: Fine-Grained Facial Editing via Dialog**
> Yuming Jiang∗, Ziqi Huang∗, Xingang Pan, Chen Change Loy, Ziwei Liu
> IEEE International Conference on Computer Vision (**ICCV**), 2021
[[Paper](https://arxiv.org/abs/2109.04425)]
[[Project Page](https://www.mmlab-ntu.com/project/talkedit/)]
[[CelebA-Dialog Dataset](https://github.com/ziqihuangg/CelebA-Dialog)]
[[Poster](https://drive.google.com/file/d/1KaojezBNqDrkwcT0yOkvAgqW1grwUDed/view?usp=sharing)]
[[Video](https://www.youtube.com/watch?v=ZKMkQhkMXPI)]
You can try our colab demo here. Enjoy!
1. Editing with dialog:
1. Editing without dialog:
## Overview

## Dependencies and Installation
1. Clone Repo
```bash
git clone git@github.com:yumingj/Talk-to-Edit.git
```
1. Create Conda Environment and Install Dependencies
```bash
conda env create -f environment.yml
conda activate talk_edit
```
- Python >= 3.7
- PyTorch >= 1.6
- CUDA 10.1
- GCC 5.4.0
## Get Started
## Editing
We provide scripts for editing using our pretrained models.
1. First, download the pretrained models from this [link](https://drive.google.com/drive/folders/1W9dvjz8bUolEIG524o8ZvM62uEWKJ5do?usp=sharing) and put them under `./download/pretrained_models` as follows:
```
./download/pretrained_models
├── 1024_field
│ ├── Bangs.pth
│ ├── Eyeglasses.pth
│ ├── No_Beard.pth
│ ├── Smiling.pth
│ └── Young.pth
├── 128_field
│ ├── Bangs.pth
│ ├── Eyeglasses.pth
│ ├── No_Beard.pth
│ ├── Smiling.pth
│ └── Young.pth
├── arcface_resnet18_110.pth
├── language_encoder.pth.tar
├── predictor_1024.pth.tar
├── predictor_128.pth.tar
├── stylegan2_1024.pth
├── stylegan2_128.pt
├── StyleGAN2_FFHQ1024_discriminator.pth
└── eval_predictor.pth.tar
```
1. You can try pure image editing without dialog instructions:
```bash
python editing_wo_dialog.py \
--opt ./configs/editing/editing_wo_dialog.yml \
--attr 'Bangs' \
--target_val 5
```
The editing results will be saved in `./results`.
You can change `attr` to one of the following attributes: `Bangs`, `Eyeglasses`, `Beard`, `Smiling`, and `Young(i.e. Age)`. And the `target_val` can be `[0, 1, 2, 3, 4, 5]`.
1. You can also try dialog-based editing, where you talk to the system through the command prompt:
```bash
python editing_with_dialog.py --opt ./configs/editing/editing_with_dialog.yml
```
The editing results will be saved in `./results`.
**How to talk to the system:**
* Our system is able to edit five facial attributes: `Bangs`, `Eyeglasses`, `Beard`, `Smiling`, and `Young(i.e. Age)`.
* When prompted with `"Enter your request (Press enter when you finish):"`, you can enter an editing request about one of the five attributes. For example, you can say `"Make the bangs longer."`
* To respond to the system's feedback, just talk as if you were talking to a real person. For example, if the system asks `"Is the length of the bangs just right?"` after one round of editing, You can say things like `"Yes."` / `"No."` / `"Yes, and I also want her to smile more happily."`.
* To end the conversation, just tell the system things like `"That's all"` / `"Nothing else, thank you."`
1. By default, the above editing would be performed on the teaser image. You may change the image to be edited in two ways: 1) change `line 11: latent_code_index` to other values ranging from `0` to `99`; 2) set `line 10: latent_code_path` to `~`, so that an image would be randomly generated.
1. If you want to try editing on real images, you may download the real images from this [link](https://drive.google.com/drive/folders/1BunrwvlwCBZJnb9QqeUp_uIXMxeXXJrY?usp=sharing) and put them under `./download/real_images`. You could also provide other real images at your choice. You need to change `line 12: img_path` in `editing_with_dialog.yml` or `editing_wo_dialog.yml` according to the path to the real image and set `line 11: is_real_image` as `True`.
1. You can switch the default image size to `128 x 128` by setting `line 3: img_res` to `128` in config files.
## Train the Semantic Field
1. To train the Semantic Field, a number of sampled latent codes should be prepared and then we use the attribute predictor to predict the facial attributes for their corresponding images. The attribute predictor is trained using fine-grained annotations in [CelebA-Dialog](https://github.com/ziqihuangg/CelebA-Dialog) dataset. Here, we provide the latent codes we used. You can download the train data from this [link](https://drive.google.com/drive/folders/1CYBpLIwts3ZVFiFAPb4TTnqYH3NBR63p?usp=sharing) and put them under `./download/train_data` as follows:
```
./download/train_data
├── 1024
│ ├── Bangs
│ ├── Eyeglasses
│ ├── No_Beard
│ ├── Smiling
│ └── Young
└── 128
├── Bangs
├── Eyeglasses
├── No_Beard
├── Smiling
└── Young
```
1. We will also use some editing latent codes to monitor the training phase. You can download the editing latent code from this [link](https://drive.google.com/drive/folders/1G-0srCePEXcPq9HY38Il_4FTVHX_rOa-?usp=sharing) and put them under `./download/editing_data` as follows:
```
./download/editing_data
├── 1024
│ ├── Bangs.npz.npy
│ ├── Eyeglasses.npz.npy
│ ├── No_Beard.npz.npy
│ ├── Smiling.npz.npy
│ └── Young.npz.npy
└── 128
├── Bangs.npz.npy
├── Eyeglasses.npz.npy
├── No_Beard.npz.npy
├── Smiling.npz.npy
└── Young.npz.npy
```
1. All logging files in the training process, *e.g.*, log message, checkpoints, and snapshots, will be saved to `./experiments` and `./tb_logger` directory.
1. There are 10 configuration files under `./configs/train`, named in the format of `field_<img_res>_<attribute>.yml` (e.g. `field_128_bangs.yml`).
Choose the corresponding configuration file for the attribute and resolution you want.
1. For example, to train the semantic field which edits the attribute `Bangs` in `128x128` image resolution, simply run:
```bash
python train.py --opt ./configs/train/field_128_bangs.yml
```
## Quantitative Results
We provide codes for quantitative results shown in Table 1. Here we use `Bangs` in `128x128` resolution as an example.
1. Use the trained semantic field to edit images.
```bash
python editing_quantitative.py \
--opt ./configs/train/field_128_bangs.yml \
--pretrained_path ./download/pretrained_models/128_field/Bangs.pth
```
2. Evaluate the edited images using quantitative metrics. Change `image_num` for different attribute accordingly: `Bangs: 148`, `Eyeglasses: 82`, `Beard: 129`, `Smiling: 140`, `Young: 61`.
```bash
python quantitative_results.py \
--attribute Bangs \
--work_dir ./results/field_128_bangs \
--image_dir ./results/field_128_bangs/visualization \
--image_num 148
```
## Qualitative Results

## CelebA-Dialog Dataset

Our [**CelebA-Dialog Dataset**](https://github.com/ziqihuangg/CelebA-Dialog) is available for [Download](https://drive.google.com/drive/folders/18nejI_hrwNzWyoF6SW8bL27EYnM4STAs?usp=sharing).
**CelebA-Dialog** is a large-scale visual-language face dataset with the following features:
- Facial images are annotated with rich **fine-grained labels**, which classify one attribute into multiple degrees according to its semantic meaning.
- Accompanied with each image, there are **captions** describing the attributes and a **user request** sample.

The dataset can be employed as the training and test sets for the following computer vision tasks: fine-grained facial attribute recognition, fine-grained facial manipulation, text-based facial generation and manipulation, face image captioning, and broader natural language based facial recognition and manipulation tasks.
## Citation
If you find our repo useful for your research, please consider citing our paper:
```bibtex
@inproceedings{jiang2021talk,
title={Talk-to-Edit: Fine-Grained Facial Editing via Dialog},
author={Jiang, Yuming and Huang, Ziqi and Pan, Xingang and Loy, Chen Change and Liu, Ziwei},
booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
pages={13799--13808},
year={2021}
}
@article{jiang2023talk,
title={Talk-to-edit: Fine-grained 2d and 3d facial editing via dialog},
author={Jiang, Yuming and Huang, Ziqi and Wu, Tianxing and Pan, Xingang and Loy, Chen Change and Liu, Ziwei},
journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
year={2023},
publisher={IEEE}
}
```
## Contact
If you have any question, please feel free to contact us via `yuming002@ntu.edu.sg` or `hu0007qi@ntu.edu.sg`.
## Acknowledgement
The codebase is maintained by [Yuming Jiang](https://yumingj.github.io/) and [Ziqi Huang](https://ziqihuangg.github.io/).
Part of the code is borrowed from [stylegan2-pytorch](https://github.com/rosinality/stylegan2-pytorch), [IEP](https://github.com/facebookresearch/clevr-iep) and [face-attribute-prediction](https://github.com/d-li14/face-attribute-prediction).
================================================
FILE: configs/attributes_5.json
================================================
{
"attr_info":{
"6": {
"name": "Bangs",
"value":[0, 1, 2, 3, 4, 5],
"idx_scale": 1,
"idx_bias": 0
},
"16": {
"name": "Eyeglasses",
"value":[0, 1, 2, 3, 4, 5],
"idx_scale": 1,
"idx_bias": 0
},
"25": {
"name": "No_Beard",
"value":[0, 1, 2, 3, 4, 5],
"idx_scale": -1,
"idx_bias": 5
},
"32": {
"name": "Smiling",
"value":[0, 1, 2, 3, 4, 5],
"idx_scale": 1,
"idx_bias": 0
},
"40": {
"name": "Young",
"value":[0, 1, 2, 3, 4, 5],
"idx_scale": -1,
"idx_bias": 5
}
},
"newIdx_to_attrIdx":{
"0": "6",
"1": "16",
"2": "25",
"3": "32",
"4": "40"
},
"newIdx_to_attrName":{
"0": "Bangs",
"1": "Eyeglasses",
"2": "No_Beard",
"3": "Smiling",
"4": "Young"
},
"attrName_to_newIdx":{
"Bangs": "0",
"Eyeglasses": "1",
"No_Beard": "2",
"Smiling": "3",
"Young": "4"
},
"attrIdx_to_newIdx":{
"6": 0,
"16": 1,
"25": 2,
"32": 3,
"40": 4
}
}
================================================
FILE: configs/editing/editing_with_dialog.yml
================================================
name: dialog_editing
img_res: 1024 # 128
# latent code
latent_code_path: ./download/editing_data/teaser_latent_code.npz.npy
latent_code_index: 38
# inversion
inversion:
is_real_image: False # False
img_path: ./download/real_images/annehathaway.png
crop_img: True
device: cuda
img_mse_weight: 1.0
step: 600
noise: 0.05
noise_ramp: 0.75
lr: 0.1
lr_gen: !!float 1e-4
use_tb_logger: true
set_CUDA_VISIBLE_DEVICES: ~
gpu_ids: [3]
attribute: Eyeglasses
model_type: FieldFunctionModel
fix_layers: true
replaced_layers_128: 8
replaced_layers_1024: 10
manual_seed: 2021
# editing configs
confidence_thresh: 0
max_cls_num: 5
min_cls_num: 0
max_trials_num: 100
print_every: False
transform_z_to_w: False
# field_function configs
num_layer: 8
hidden_dim: 512
leaky_relu_neg_slope: 0.2
# predictor
attr_file: ./configs/attributes_5.json
baseline: classification
use_sigmoid: True
gt_remapping_file: ~
predictor_ckpt_128: ./download/pretrained_models/predictor_128.pth.tar
predictor_ckpt_1024: ./download/pretrained_models/predictor_1024.pth.tar
# stylegan configs
latent_dim: 512
n_mlp: 8
channel_multiplier_128: 1
channel_multiplier_1024: 2
generator_ckpt_128: ./download/pretrained_models/stylegan2_128.pt
generator_ckpt_1024: ./download/pretrained_models/stylegan2_1024.pth
latent_space: w
# ---------- Dialog Editing -----------
has_dialog: True
device_name: gpu
# pretrained field
pretrained_field_128:
Bangs: ./download/pretrained_models/128_field/Bangs.pth
Eyeglasses: ./download/pretrained_models/128_field/Eyeglasses.pth
No_Beard: ./download/pretrained_models/128_field/No_Beard.pth
Smiling: ./download/pretrained_models/128_field/Smiling.pth
Young: ./download/pretrained_models/128_field/Young.pth
pretrained_field_1024:
Bangs: ./download/pretrained_models/1024_field/Bangs.pth
Eyeglasses: ./download/pretrained_models/1024_field/Eyeglasses.pth
No_Beard: ./download/pretrained_models/1024_field/No_Beard.pth
Smiling: ./download/pretrained_models/1024_field/Smiling.pth
Young: ./download/pretrained_models/1024_field/Young.pth
attr_to_idx:
Bangs: 0
Eyeglasses: 1
No_Beard: 2
Smiling: 3
Young: 4
# language template files set up
feedback_templates_file: ./language/templates/feedback.json
metadata_file: ./language/templates/metadata_fsm.json
pool_file: ./language/templates/pool.json
system_mode_file: ./language/templates/system_mode.json
input_vocab_file: ./language/templates/vocab.json
# dialog setting
postfix_prob: 0.3
whether_enough_general_prob: 0.2
allow_unknown: 1
verbose: 0
# pretrained language encoder
pretrained_language_encoder: ./download/pretrained_models/language_encoder.pth.tar
language_encoder:
word_embedding_dim: 300
text_embed_size: 1024
linear_hidden_size: 256
linear_dropout_rate: 0
================================================
FILE: configs/editing/editing_wo_dialog.yml
================================================
name: editing_wo_dialog
img_res: 1024 # 128
# latent code
latent_code_path: ./download/editing_data/teaser_latent_code.npz.npy
latent_code_index: 38
# inversion
inversion:
is_real_image: False # False
img_path: ./download/real_images/annehathaway.png
crop_img: True
device: cuda
img_mse_weight: 1.0
step: 600
noise: 0.05
noise_ramp: 0.75
lr: 0.1
lr_gen: !!float 1e-4
use_tb_logger: true
set_CUDA_VISIBLE_DEVICES: ~
gpu_ids: [3]
attribute: Eyeglasses
model_type: FieldFunctionModel
fix_layers: true
replaced_layers_128: 8
replaced_layers_1024: 10
manual_seed: 2021
# editing configs
confidence_thresh: 0
max_cls_num: 5
min_cls_num: 0
max_trials_num: 100
print_every: False
transform_z_to_w: False
# field_function configs
num_layer: 8
hidden_dim: 512
leaky_relu_neg_slope: 0.2
# predictor
attr_file: ./configs/attributes_5.json
baseline: classification
use_sigmoid: True
gt_remapping_file: ~
predictor_ckpt_128: ./download/pretrained_models/predictor_128.pth.tar
predictor_ckpt_1024: ./download/pretrained_models/predictor_1024.pth.tar
# stylegan configs
latent_dim: 512
n_mlp: 8
channel_multiplier_128: 1
channel_multiplier_1024: 2
generator_ckpt_128: ./download/pretrained_models/stylegan2_128.pt
generator_ckpt_1024: ./download/pretrained_models/stylegan2_1024.pth
latent_space: w
# ---------- Dialog Editing -----------
has_dialog: False
device_name: gpu
# pretrained field
pretrained_field_128:
Bangs: ./download/pretrained_models/128_field/Bangs.pth
Eyeglasses: ./download/pretrained_models/128_field/Eyeglasses.pth
No_Beard: ./download/pretrained_models/128_field/No_Beard.pth
Smiling: ./download/pretrained_models/128_field/Smiling.pth
Young: ./download/pretrained_models/128_field/Young.pth
pretrained_field_1024:
Bangs: ./download/pretrained_models/1024_field/Bangs.pth
Eyeglasses: ./download/pretrained_models/1024_field/Eyeglasses.pth
No_Beard: ./download/pretrained_models/1024_field/No_Beard.pth
Smiling: ./download/pretrained_models/1024_field/Smiling.pth
Young: ./download/pretrained_models/1024_field/Young.pth
attr_to_idx:
Bangs: 0
Eyeglasses: 1
No_Beard: 2
Smiling: 3
Young: 4
================================================
FILE: configs/train/field_1024_bangs.yml
================================================
name: field_1024_bangs
use_tb_logger: true
set_CUDA_VISIBLE_DEVICES: ~
gpu_ids: [3]
attribute: Bangs
model_type: FieldFunctionModel
fix_layers: true
replaced_layers: 10
# dataset configs
batch_size: 8
num_workers: 8
input_latent_dir: ./download/train_data/1024/Bangs
editing_latent_code_path: ./download/editing_data/1024/Bangs.npz.npy
num_attr: 5
val_on_train_subset: true
val_on_valset: true
# training configs
val_freq: 1
print_freq: 100
weight_decay: 0
manual_seed: 2021
num_epochs: 500
lr: !!float 1e-4
lr_decay: step
gamma: 0.1
step: 100
# editing configs
confidence_thresh: 0.8
max_cls_num: 5
max_trials_num: 100
print_every: False
# field_function configs
num_layer: 8
hidden_dim: 512
leaky_relu_neg_slope: 0.2
# loss configs
# predictor loss
edited_attribute_weight: 1.0
attr_file: ./configs/attributes_5.json
predictor_ckpt: ./download/pretrained_models/predictor_1024.pth.tar
# arcface loss
pretrained_arcface: ./download/pretrained_models/arcface_resnet18_110.pth
arcface_weight: 5.0
arcface_loss_type: l1
# disciminator loss
disc_weight: 1.0
discriminator_ckpt: ./download/pretrained_models/StyleGAN2_FFHQ1024_discriminator.pth
# stylegan configs
img_res: 1024
latent_dim: 512
n_mlp: 8
channel_multiplier: 2
generator_ckpt: ./download/pretrained_models/stylegan2_1024.pth
latent_space: w
================================================
FILE: configs/train/field_1024_beard.yml
================================================
name: field_1024_beard
use_tb_logger: true
set_CUDA_VISIBLE_DEVICES: ~
gpu_ids: [3]
attribute: No_Beard
model_type: FieldFunctionModel
fix_layers: true
replaced_layers: 10
# dataset configs
batch_size: 8
num_workers: 8
input_latent_dir: ./download/train_data/1024/No_Beard
editing_latent_code_path: ./download/editing_data/1024/No_Beard.npz.npy
num_attr: 5
val_on_train_subset: true
val_on_valset: true
# training configs
val_freq: 1
print_freq: 100
weight_decay: 0
manual_seed: 2021
num_epochs: 30
lr: !!float 1e-4
lr_decay: step
gamma: 0.1
step: 100
# editing configs
confidence_thresh: 0.8
max_cls_num: 5
max_trials_num: 100
print_every: False
# field_function configs
num_layer: 8
hidden_dim: 512
leaky_relu_neg_slope: 0.2
# loss configs
# predictor loss
edited_attribute_weight: 1.0
attr_file: ./configs/attributes_5.json
predictor_ckpt: ./download/pretrained_models/predictor_1024.pth.tar
# arcface loss
pretrained_arcface: ./download/pretrained_models/arcface_resnet18_110.pth
arcface_weight: 10.0
arcface_loss_type: l1
# disciminator loss
disc_weight: 1.0
discriminator_ckpt: ./download/pretrained_models/StyleGAN2_FFHQ1024_discriminator.pth
# stylegan configs
img_res: 1024
latent_dim: 512
n_mlp: 8
channel_multiplier: 2
generator_ckpt: ./download/pretrained_models/stylegan2_1024.pth
latent_space: w
================================================
FILE: configs/train/field_1024_eyeglasses.yml
================================================
name: field_1024_eyeglasses
use_tb_logger: true
set_CUDA_VISIBLE_DEVICES: ~
gpu_ids: [3]
attribute: Eyeglasses
model_type: FieldFunctionModel
fix_layers: true
replaced_layers: 10
# dataset configs
batch_size: 8
num_workers: 8
input_latent_dir: ./download/train_data/1024/Eyeglasses
editing_latent_code_path: ./download/editing_data/1024/Eyeglasses.npz.npy
num_attr: 5
val_on_train_subset: true
val_on_valset: true
# training configs
val_freq: 1
print_freq: 100
weight_decay: 0
manual_seed: 2021
num_epochs: 30
lr: !!float 1e-4
lr_decay: step
gamma: 0.1
step: 100
# editing configs
confidence_thresh: 0.8
max_cls_num: 5
max_trials_num: 100
print_every: False
# field_function configs
num_layer: 8
hidden_dim: 512
leaky_relu_neg_slope: 0.2
# loss configs
# predictor loss
edited_attribute_weight: 1.0
attr_file: ./configs/attributes_5.json
predictor_ckpt: ./download/pretrained_models/predictor_1024.pth.tar
# arcface loss
pretrained_arcface: ./download/pretrained_models/arcface_resnet18_110.pth
arcface_weight: 10.0
arcface_loss_type: l1
# disciminator loss
disc_weight: 1.0
discriminator_ckpt: ./download/pretrained_models/StyleGAN2_FFHQ1024_discriminator.pth
# stylegan configs
img_res: 1024
latent_dim: 512
n_mlp: 8
channel_multiplier: 2
generator_ckpt: ./download/pretrained_models/stylegan2_1024.pth
latent_space: w
================================================
FILE: configs/train/field_1024_smiling.yml
================================================
name: field_1024_smiling
use_tb_logger: true
set_CUDA_VISIBLE_DEVICES: ~
gpu_ids: [3]
attribute: Smiling
model_type: FieldFunctionModel
fix_layers: true
replaced_layers: 10
# dataset configs
batch_size: 8
num_workers: 8
input_latent_dir: ./download/train_data/1024/Smiling
editing_latent_code_path: ./download/editing_data/1024/Smiling.npz.npy
num_attr: 5
val_on_train_subset: true
val_on_valset: true
# training configs
val_freq: 1
print_freq: 100
weight_decay: 0
manual_seed: 2021
num_epochs: 30
lr: !!float 1e-4
lr_decay: step
gamma: 0.1
step: 100
# editing configs
confidence_thresh: 0.8
max_cls_num: 5
max_trials_num: 100
print_every: False
# field_function configs
num_layer: 8
hidden_dim: 512
leaky_relu_neg_slope: 0.2
# loss configs
# predictor loss
edited_attribute_weight: 1.0
attr_file: ./configs/attributes_5.json
predictor_ckpt: ./download/pretrained_models/predictor_1024.pth.tar
# arcface loss
pretrained_arcface: ./download/pretrained_models/arcface_resnet18_110.pth
arcface_weight: 5.0
arcface_loss_type: l1
# disciminator loss
disc_weight: 1.0
discriminator_ckpt: ./download/pretrained_models/StyleGAN2_FFHQ1024_discriminator.pth
# stylegan configs
img_res: 1024
latent_dim: 512
n_mlp: 8
channel_multiplier: 2
generator_ckpt: ./download/pretrained_models/stylegan2_1024.pth
latent_space: w
================================================
FILE: configs/train/field_1024_young.yml
================================================
name: field_1024_young
use_tb_logger: true
set_CUDA_VISIBLE_DEVICES: ~
gpu_ids: [3]
attribute: Young
model_type: FieldFunctionModel
fix_layers: true
replaced_layers: 10
# dataset configs
batch_size: 8
num_workers: 8
input_latent_dir: ./download/train_data/1024/Young
editing_latent_code_path: ./download/editing_data/1024/Young.npz.npy
num_attr: 5
val_on_train_subset: true
val_on_valset: true
# training configs
val_freq: 1
print_freq: 100
weight_decay: 0
manual_seed: 2021
num_epochs: 30
lr: !!float 1e-4
lr_decay: step
gamma: 0.1
step: 100
# editing configs
confidence_thresh: 0.8
max_cls_num: 5
max_trials_num: 100
print_every: False
# field_function configs
num_layer: 8
hidden_dim: 512
leaky_relu_neg_slope: 0.2
# loss configs
# predictor loss
edited_attribute_weight: 1.0
attr_file: ./configs/attributes_5.json
predictor_ckpt: ./download/pretrained_models/predictor_1024.pth.tar
# arcface loss
pretrained_arcface: ./download/pretrained_models/arcface_resnet18_110.pth
arcface_weight: 10.0
arcface_loss_type: l1
# disciminator loss
disc_weight: 1.0
discriminator_ckpt: ./download/pretrained_models/StyleGAN2_FFHQ1024_discriminator.pth
# stylegan configs
img_res: 1024
latent_dim: 512
n_mlp: 8
channel_multiplier: 2
generator_ckpt: ./download/pretrained_models/stylegan2_1024.pth
latent_space: w
================================================
FILE: configs/train/field_128_bangs.yml
================================================
name: field_128_bangs
use_tb_logger: true
set_CUDA_VISIBLE_DEVICES: ~
gpu_ids: [3]
attribute: Bangs
model_type: FieldFunctionModel
fix_layers: true
replaced_layers: 8
# dataset configs
batch_size: 32
num_workers: 8
input_latent_dir: ./download/train_data/128/Bangs
editing_latent_code_path: ./download/editing_data/128/Bangs.npz.npy
num_attr: 5
val_on_train_subset: true
val_on_valset: true
# training configs
val_freq: 1
print_freq: 100
weight_decay: 0
manual_seed: 2021
num_epochs: 30
lr: !!float 1e-4
lr_decay: step
gamma: 0.1
step: 100
# editing configs
confidence_thresh: 0.8
max_cls_num: 5
max_trials_num: 100
print_every: False
# field_function configs
num_layer: 8
hidden_dim: 512
leaky_relu_neg_slope: 0.2
# loss configs
# predictor loss
edited_attribute_weight: 1.0
attr_file: ./configs/attributes_5.json
predictor_ckpt: ./download/pretrained_models/predictor_128.pth.tar
# arcface loss
pretrained_arcface: ./download/pretrained_models/arcface_resnet18_110.pth
arcface_weight: 5.0
arcface_loss_type: l1
# disciminator loss
disc_weight: 1.0
discriminator_ckpt: ./download/pretrained_models/stylegan2_128.pt
# stylegan configs
img_res: 128
latent_dim: 512
n_mlp: 8
channel_multiplier: 1
generator_ckpt: ./download/pretrained_models/stylegan2_128.pt
latent_space: w
================================================
FILE: configs/train/field_128_beard.yml
================================================
name: field_128_beard
use_tb_logger: true
set_CUDA_VISIBLE_DEVICES: ~
gpu_ids: [3]
attribute: No_Beard
model_type: FieldFunctionModel
fix_layers: true
replaced_layers: 8
# dataset configs
batch_size: 32
num_workers: 8
input_latent_dir: ./download/train_data/128/No_Beard
editing_latent_code_path: ./download/editing_data/128/No_Beard.npz.npy
num_attr: 5
val_on_train_subset: true
val_on_valset: true
# training configs
val_freq: 1
print_freq: 100
weight_decay: 0
manual_seed: 2021
num_epochs: 30
lr: !!float 1e-4
lr_decay: step
gamma: 0.1
step: 100
# editing configs
confidence_thresh: 0.8
max_cls_num: 5
max_trials_num: 100
print_every: False
# field_function configs
num_layer: 8
hidden_dim: 512
leaky_relu_neg_slope: 0.2
# loss configs
# predictor loss
edited_attribute_weight: 1.0
attr_file: ./configs/attributes_5.json
predictor_ckpt: ./download/pretrained_models/predictor_128.pth.tar
# arcface loss
pretrained_arcface: ./download/pretrained_models/arcface_resnet18_110.pth
arcface_weight: 5.0
arcface_loss_type: l1
# disciminator loss
disc_weight: 1.0
discriminator_ckpt: ./download/pretrained_models/stylegan2_128.pt
# stylegan configs
img_res: 128
latent_dim: 512
n_mlp: 8
channel_multiplier: 1
generator_ckpt: ./download/pretrained_models/stylegan2_128.pt
latent_space: w
================================================
FILE: configs/train/field_128_eyeglasses.yml
================================================
name: field_128_eyeglasses
use_tb_logger: true
set_CUDA_VISIBLE_DEVICES: ~
gpu_ids: [3]
attribute: Eyeglasses
model_type: FieldFunctionModel
fix_layers: true
replaced_layers: 8
# dataset configs
batch_size: 32
num_workers: 8
input_latent_dir: ./download/train_data/128/Eyeglasses
editing_latent_code_path: ./download/editing_data/128/Eyeglasses.npz.npy
num_attr: 5
val_on_train_subset: true
val_on_valset: true
# training configs
val_freq: 1
print_freq: 100
weight_decay: 0
manual_seed: 2021
num_epochs: 30
lr: !!float 1e-4
lr_decay: step
gamma: 0.1
step: 100
# editing configs
confidence_thresh: 0.8
max_cls_num: 5
max_trials_num: 100
print_every: False
# field_function configs
num_layer: 8
hidden_dim: 512
leaky_relu_neg_slope: 0.2
# loss configs
# predictor loss
edited_attribute_weight: 1.0
attr_file: ./configs/attributes_5.json
predictor_ckpt: ./download/pretrained_models/predictor_128.pth.tar
# arcface loss
pretrained_arcface: ./download/pretrained_models/arcface_resnet18_110.pth
arcface_weight: 5.0
arcface_loss_type: l1
# disciminator loss
disc_weight: 1.0
discriminator_ckpt: ./download/pretrained_models/stylegan2_128.pt
# stylegan configs
img_res: 128
latent_dim: 512
n_mlp: 8
channel_multiplier: 1
generator_ckpt: ./download/pretrained_models/stylegan2_128.pt
latent_space: w
================================================
FILE: configs/train/field_128_smiling.yml
================================================
name: field_128_smiling
use_tb_logger: true
set_CUDA_VISIBLE_DEVICES: ~
gpu_ids: [3]
attribute: Smiling
model_type: FieldFunctionModel
fix_layers: true
replaced_layers: 8
# dataset configs
batch_size: 32
num_workers: 8
input_latent_dir: ./download/train_data/128/Smiling
editing_latent_code_path: ./download/editing_data/128/Smiling.npz.npy
num_attr: 5
val_on_train_subset: true
val_on_valset: true
# training configs
val_freq: 1
print_freq: 100
weight_decay: 0
manual_seed: 2021
num_epochs: 30
lr: !!float 1e-4
lr_decay: step
gamma: 0.1
step: 100
# editing configs
confidence_thresh: 0.8
max_cls_num: 5
max_trials_num: 100
print_every: False
# field_function configs
num_layer: 8
hidden_dim: 512
leaky_relu_neg_slope: 0.2
# loss configs
# predictor loss
edited_attribute_weight: 1.0
attr_file: ./configs/attributes_5.json
predictor_ckpt: ./download/pretrained_models/predictor_128.pth.tar
# arcface loss
pretrained_arcface: ./download/pretrained_models/arcface_resnet18_110.pth
arcface_weight: 5.0
arcface_loss_type: l1
# disciminator loss
disc_weight: 1.0
discriminator_ckpt: ./download/pretrained_models/stylegan2_128.pt
# stylegan configs
img_res: 128
latent_dim: 512
n_mlp: 8
channel_multiplier: 1
generator_ckpt: ./download/pretrained_models/stylegan2_128.pt
latent_space: w
================================================
FILE: configs/train/field_128_young.yml
================================================
name: field_128_young
use_tb_logger: true
set_CUDA_VISIBLE_DEVICES: ~
gpu_ids: [3]
attribute: Young
model_type: FieldFunctionModel
fix_layers: true
replaced_layers: 8
# dataset configs
batch_size: 32
num_workers: 8
input_latent_dir: ./download/train_data/128/Young
editing_latent_code_path: ./download/editing_data/128/Young.npz.npy
num_attr: 5
val_on_train_subset: true
val_on_valset: true
# training configs
val_freq: 1
print_freq: 100
weight_decay: 0
manual_seed: 2021
num_epochs: 30
lr: !!float 1e-4
lr_decay: step
gamma: 0.1
step: 100
# editing configs
confidence_thresh: 0.5
max_cls_num: 5
max_trials_num: 100
print_every: False
# field_function configs
num_layer: 8
hidden_dim: 512
leaky_relu_neg_slope: 0.2
# loss configs
# predictor loss
edited_attribute_weight: 1.0
attr_file: ./configs/attributes_5.json
predictor_ckpt: ./download/pretrained_models/predictor_128.pth.tar
# arcface loss
pretrained_arcface: ./download/pretrained_models/arcface_resnet18_110.pth
arcface_weight: 5.0
arcface_loss_type: l1
# disciminator loss
disc_weight: 1.0
discriminator_ckpt: ./download/pretrained_models/stylegan2_128.pt
# stylegan configs
img_res: 128
latent_dim: 512
n_mlp: 8
channel_multiplier: 1
generator_ckpt: ./download/pretrained_models/stylegan2_128.pt
latent_space: w
================================================
FILE: data/__init__.py
================================================
================================================
FILE: data/latent_code_dataset.py
================================================
"""
Dataset for field function
"""
import os
import os.path
import random
import numpy as np
import torch
import torch.utils.data as data
class LatentCodeDataset(data.Dataset):
    """Dataset of pre-computed latent codes for training the field function.

    Loads latent codes together with their predicted attribute classes and
    prediction scores from three .npy files inside ``input_dir``.
    """

    def __init__(self, input_dir, subset_samples=None):
        """
        Args:
            input_dir (str): directory containing
                'selected_latent_code.npy', 'selected_pred_class.npy'
                and 'selected_pred_scores.npy'.
            subset_samples (int | None): if given and smaller than the
                dataset size, randomly keep only this many samples.
        """
        assert os.path.exists(input_dir)
        self.latent_codes = torch.FloatTensor(
            np.load(os.path.join(
                input_dir, 'selected_latent_code.npy')).astype(float))
        self.labels = torch.LongTensor(
            np.load(os.path.join(
                input_dir, 'selected_pred_class.npy')).astype(int))
        self.scores = torch.FloatTensor(
            np.load(os.path.join(
                input_dir, 'selected_pred_scores.npy')).astype(float))
        # select a subset from the train set
        if subset_samples is not None and len(
                self.latent_codes) > subset_samples:
            selected_idx = random.sample(
                range(len(self.latent_codes)), subset_samples)
            # bug fix: index the tensors directly instead of building
            # Python lists, so the attributes keep a consistent tensor
            # type whether or not a subset is taken
            self.latent_codes = self.latent_codes[selected_idx]
            self.labels = self.labels[selected_idx]
            self.scores = self.scores[selected_idx]
        assert len(self.latent_codes) == len(self.labels)
        assert len(self.labels) == len(self.scores)

    def __getitem__(self, index):
        """Return the (latent_code, label, score) triplet at ``index``."""
        return (self.latent_codes[index], self.labels[index],
                self.scores[index])

    def __len__(self):
        return len(self.latent_codes)
================================================
FILE: editing_quantitative.py
================================================
import argparse
import logging
import os
import numpy as np
from models import create_model
from utils.logger import get_root_logger
from utils.numerical_metrics import compute_num_metrics
from utils.options import dict2str, dict_to_nonedict, parse
from utils.util import make_exp_dirs
def main():
    """Quantitatively evaluate a pretrained field function on editing.

    Loads the editing latent codes, runs continuous editing with the
    pretrained field model, and computes the numerical metrics.
    """
    # ---------- parse CLI options ----------
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--opt', type=str, help='Path to option YAML file.')
    arg_parser.add_argument(
        '--pretrained_path', type=str, help='Path to pretrained field model')
    cli_args = arg_parser.parse_args()
    opt = parse(cli_args.opt, is_train=False)

    # ---------- prepare experiment directories ----------
    make_exp_dirs(opt)
    # convert to NoneDict, which returns None for missing keys
    opt = dict_to_nonedict(opt)

    # ---------- load the latent codes to be edited ----------
    editing_latent_codes = np.load(opt['editing_latent_code_path'])
    num_latent_codes = editing_latent_codes.shape[0]

    save_path = f'{opt["path"]["visualization"]}'
    os.makedirs(save_path)

    # ---------- set up logging ----------
    logger = get_root_logger(
        logger_name='editing',
        log_level=logging.INFO,
        log_file=f'{save_path}/editing.log')
    logger.info(dict2str(opt))

    # ---------- run continuous editing and compute metrics ----------
    model = create_model(opt)
    model.load_network(cli_args.pretrained_path)
    model.continuous_editing(editing_latent_codes, save_path, logger)
    _, _ = compute_num_metrics(save_path, num_latent_codes,
                               opt['pretrained_arcface'], opt['attr_file'],
                               opt['predictor_ckpt'],
                               opt['attr_dict'][opt['attribute']], logger)


if __name__ == '__main__':
    main()
================================================
FILE: editing_with_dialog.py
================================================
import argparse
import json
import logging
import os.path
import numpy as np
import torch
from models import create_model
from utils.dialog_edit_utils import dialog_with_real_user
from utils.inversion_utils import inversion
from utils.logger import get_root_logger
from utils.options import (dict2str, dict_to_nonedict, parse,
parse_args_from_opt, parse_opt_wrt_resolution)
from utils.util import make_exp_dirs
def parse_args():
    """Parse command-line arguments.

    Returns:
        argparse.Namespace with a single ``opt`` attribute holding the
        path to the option YAML file (None when not provided).
    """
    arg_parser = argparse.ArgumentParser(description='')
    arg_parser.add_argument(
        '--opt', default=None, type=str, help='Path to option YAML file.')
    return arg_parser.parse_args()
def main():
    """Run interactive dialog-based facial editing.

    Loads options and language templates, creates the field model,
    obtains a starting latent code (via GAN inversion, random sampling,
    or a saved file), then hands control to the dialog loop with the
    real user and logs the dialog history.
    """
    # ---------- Set up -----------
    args = parse_args()
    opt = parse(args.opt, is_train=False)
    opt = parse_opt_wrt_resolution(opt)
    # merge option values onto the args namespace (project helper)
    args = parse_args_from_opt(args, opt)
    make_exp_dirs(opt)
    # convert to NoneDict, which returns None for missing keys
    opt = dict_to_nonedict(opt)
    # set up logger
    save_log_path = f'{opt["path"]["log"]}'
    dialog_logger = get_root_logger(
        logger_name='dialog',
        log_level=logging.INFO,
        log_file=f'{save_log_path}/dialog.log')
    dialog_logger.info(dict2str(opt))
    save_image_path = f'{opt["path"]["visualization"]}'
    # no exist_ok: raises if the directory already exists
    os.makedirs(save_image_path)
    # ---------- Load files -----------
    dialog_logger.info('loading template files')
    with open(opt['feedback_templates_file'], 'r') as f:
        args.feedback_templates = json.load(f)
    args.feedback_replacement = args.feedback_templates['replacement']
    with open(opt['pool_file'], 'r') as f:
        pool = json.load(f)
    args.synonyms_dict = pool["synonyms"]
    # ---------- create model ----------
    field_model = create_model(opt)
    # ---------- load latent code ----------
    if opt['inversion']['is_real_image']:
        # GAN-invert a real photograph into a latent code
        latent_code = inversion(opt, field_model)
    else:
        if opt['latent_code_path'] is None:
            # sample a random z and map it with the generator's
            # get_latent(); the result is saved for reproducibility
            latent_code = torch.randn(1, 512, device=torch.device('cuda'))
            with torch.no_grad():
                latent_code = field_model.stylegan_gen.get_latent(latent_code)
            latent_code = latent_code.cpu().numpy()
            np.save(f'{opt["path"]["visualization"]}/latent_code.npz.npy',
                    latent_code)
        else:
            # load a stored latent code selected by its zero-padded index
            i = opt['latent_code_index']
            latent_code = np.load(
                opt['latent_code_path'],
                allow_pickle=True).item()[f"{str(i).zfill(7)}.png"]
            latent_code = torch.from_numpy(latent_code).to(
                torch.device('cuda'))
            with torch.no_grad():
                latent_code = field_model.stylegan_gen.get_latent(latent_code)
            latent_code = latent_code.cpu().numpy()
            np.save(f'{opt["path"]["visualization"]}/latent_code.npz.npy', latent_code)
    # ---------- Perform dialog-based editing with user -----------
    dialog_overall_log = dialog_with_real_user(field_model, latent_code, opt,
                                               args, dialog_logger)
    # ---------- Log the dialog history -----------
    for (key, value) in dialog_overall_log.items():
        dialog_logger.info(f'{key}: {value}')
    dialog_logger.info('successfully end.')


if __name__ == '__main__':
    main()
================================================
FILE: editing_wo_dialog.py
================================================
import argparse
import logging
import os
import numpy as np
import torch
from models import create_model
from models.utils import save_image
from utils.editing_utils import edit_target_attribute
from utils.inversion_utils import inversion
from utils.logger import get_root_logger
from utils.options import (dict2str, dict_to_nonedict, parse,
parse_opt_wrt_resolution)
from utils.util import make_exp_dirs
def parse_args():
    """Parse command-line arguments for no-dialog editing.

    Returns:
        argparse.Namespace with ``opt`` (option YAML path), ``attr``
        (attribute name) and ``target_val`` (target attribute value).
    """
    cli = argparse.ArgumentParser(description='')
    cli.add_argument('--opt', type=str, help='Path to option YAML file.')
    cli.add_argument('--attr', type=str, help='Attribute to be edited.')
    cli.add_argument(
        '--target_val', type=int, help='Target Attribute Value.')
    return cli.parse_args()
def main():
    """Edit a single facial attribute without any dialog.

    Obtains a starting latent code (via inversion, random sampling, or a
    saved file), synthesizes the starting image, then edits the attribute
    given by --attr towards --target_val using the field model.
    """
    # ---------- Set up -----------
    args = parse_args()
    opt = parse(args.opt, is_train=False)
    opt = parse_opt_wrt_resolution(opt)
    # args = parse_args_from_opt(args, opt)
    make_exp_dirs(opt)
    # convert to NoneDict, which returns None for missing keys
    opt = dict_to_nonedict(opt)
    # set up logger
    save_log_path = f'{opt["path"]["log"]}'
    editing_logger = get_root_logger(
        logger_name='editing',
        log_level=logging.INFO,
        log_file=f'{save_log_path}/editing.log')
    editing_logger.info(dict2str(opt))
    save_image_path = f'{opt["path"]["visualization"]}'
    # no exist_ok: raises if the directory already exists
    os.makedirs(save_image_path)
    # ---------- create model ----------
    field_model = create_model(opt)
    # ---------- load latent code ----------
    if opt['inversion']['is_real_image']:
        # GAN-invert a real photograph into a latent code
        latent_code = inversion(opt, field_model)
    else:
        if opt['latent_code_path'] is None:
            # sample a random z and map it with the generator's
            # get_latent(); save it for reproducibility
            latent_code = torch.randn(1, 512, device=torch.device('cuda'))
            with torch.no_grad():
                latent_code = field_model.stylegan_gen.get_latent(latent_code)
            latent_code = latent_code.cpu().numpy()
            np.save(f'{opt["path"]["visualization"]}/latent_code.npz.npy',
                    latent_code)
        else:
            # load a stored latent code selected by its zero-padded index
            i = opt['latent_code_index']
            latent_code = np.load(
                opt['latent_code_path'],
                allow_pickle=True).item()[f"{str(i).zfill(7)}.png"]
            latent_code = torch.from_numpy(latent_code).to(
                torch.device('cuda'))
            with torch.no_grad():
                latent_code = field_model.stylegan_gen.get_latent(latent_code)
            latent_code = latent_code.cpu().numpy()
    # ---------- synthesize images ----------
    with torch.no_grad():
        start_image, start_label, start_score = \
            field_model.synthesize_and_predict(torch.from_numpy(latent_code).to(torch.device('cuda')))  # noqa
    save_image(start_image, f'{opt["path"]["visualization"]}/start_image.png')
    # initialize attribute_dict from the predicted labels; index order of
    # start_label corresponds to the five attributes listed below
    attribute_dict = {
        "Bangs": start_label[0],
        "Eyeglasses": start_label[1],
        "No_Beard": start_label[2],
        "Smiling": start_label[3],
        "Young": start_label[4],
    }
    edit_label = {'attribute': args.attr, 'target_score': args.target_val}
    edited_latent_code = None
    print_intermediate_result = True
    round_idx = 0
    # perform one editing round towards the requested target score
    attribute_dict, exception_mode, latent_code, edited_latent_code = edit_target_attribute(
        opt, attribute_dict, edit_label, round_idx, latent_code,
        edited_latent_code, field_model, editing_logger,
        print_intermediate_result)
    # report the two non-normal outcomes to the user
    if exception_mode != 'normal':
        if exception_mode == 'already_at_target_class':
            editing_logger.info("This attribute is already at the degree that you want. Let's try a different attribute degree or another attribute.")
        elif exception_mode == 'max_edit_num_reached':
            editing_logger.info("Sorry, we are unable to edit this attribute. Perhaps we can try something else.")


if __name__ == '__main__':
    main()
================================================
FILE: environment.yml
================================================
name: talk_edit
channels:
- pytorch
- conda-forge
- anaconda
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- absl-py=0.11.0=pyhd3eb1b0_1
- aiohttp=3.7.3=py37h27cfd23_1
- async-timeout=3.0.1=py37h06a4308_0
- attrs=20.3.0=pyhd3eb1b0_0
- backcall=0.2.0=py_0
- blas=1.0=mkl
- blinker=1.4=py37h06a4308_0
- blosc=1.21.0=h8c45485_0
- brotli=1.0.9=he6710b0_2
- brotlipy=0.7.0=py37h27cfd23_1003
- brunsli=0.1=h2531618_0
- bzip2=1.0.8=h7b6447c_0
- c-ares=1.17.1=h27cfd23_0
- ca-certificates=2021.7.5=h06a4308_1
- cachetools=4.2.1=pyhd3eb1b0_0
- certifi=2021.5.30=py37h06a4308_0
- cffi=1.14.4=py37h261ae71_0
- chardet=3.0.4=py37h06a4308_1003
- charls=2.1.0=he6710b0_2
- click=7.1.2=pyhd3eb1b0_0
- cloudpickle=1.6.0=py_0
- cryptography=2.9.2=py37h1ba5d50_0
- cudatoolkit=10.1.243=h6bb024c_0
- cycler=0.10.0=py_2
- cytoolz=0.11.0=py37h7b6447c_0
- dask-core=2021.3.0=pyhd3eb1b0_0
- decorator=4.4.2=pyhd3eb1b0_0
- freetype=2.10.4=h5ab3b9f_0
- giflib=5.1.4=h14c3975_1
- google-auth=1.24.0=pyhd3eb1b0_0
- google-auth-oauthlib=0.4.2=pyhd3eb1b0_2
- grpcio=1.31.0=py37hf8bcb03_0
- icu=67.1=he1b5a44_0
- idna=2.10=pyhd3eb1b0_0
- imagecodecs=2021.1.11=py37h581e88b_1
- imageio=2.9.0=py_0
- intel-openmp=2020.2=254
- ipython=7.18.1=py37h5ca1d4c_0
- ipython_genutils=0.2.0=py37_0
- jedi=0.18.0=py37h06a4308_1
- joblib=1.0.0=pyhd3eb1b0_0
- jpeg=9b=h024ee3a_2
- jxrlib=1.1=h7b6447c_2
- kiwisolver=1.3.1=py37hc928c03_0
- lcms2=2.11=h396b838_0
- ld_impl_linux-64=2.33.1=h53a641e_7
- lerc=2.2.1=h2531618_0
- libaec=1.0.4=he6710b0_1
- libdeflate=1.7=h27cfd23_5
- libedit=3.1.20191231=h14c3975_1
- libffi=3.3=he6710b0_2
- libgcc-ng=9.1.0=hdf63c60_0
- libgfortran-ng=7.3.0=hdf63c60_0
- libpng=1.6.37=hbc83047_0
- libprotobuf=3.13.0.1=h8b12597_0
- libstdcxx-ng=9.1.0=hdf63c60_0
- libtiff=4.1.0=h2733197_1
- libwebp=1.0.1=h8e7db2f_0
- libzopfli=1.0.3=he6710b0_0
- lz4-c=1.9.3=h2531618_0
- markdown=3.3.3=py37h06a4308_0
- matplotlib=3.2.2=1
- matplotlib-base=3.2.2=py37h1d35a4c_1
- mkl=2020.2=256
- mkl-service=2.3.0=py37he8ac12f_0
- mkl_fft=1.2.0=py37h23d657b_0
- mkl_random=1.1.1=py37h0573a6f_0
- multidict=4.7.6=py37h7b6447c_1
- ncurses=6.2=he6710b0_1
- networkx=2.5=py_0
- ninja=1.10.2=py37hff7bd54_0
- numpy=1.19.2=py37h54aff64_0
- numpy-base=1.19.2=py37hfa32c7d_0
- oauthlib=3.1.0=py_0
- olefile=0.46=py37_0
- openjpeg=2.3.0=h05c96fa_1
- openssl=1.1.1k=h27cfd23_0
- parso=0.8.0=py_0
- pexpect=4.8.0=py37_1
- pickleshare=0.7.5=py37_1001
- pillow=8.2.0=py37he98fc37_0
- pip=20.3.3=py37h06a4308_0
- prompt-toolkit=3.0.8=py_0
- protobuf=3.13.0.1=py37he6710b0_1
- ptyprocess=0.6.0=py37_0
- pyasn1=0.4.8=py_0
- pyasn1-modules=0.2.8=py_0
- pycparser=2.20=py_2
- pygments=2.7.1=py_0
- pyjwt=2.0.1=py37h06a4308_0
- pyopenssl=20.0.1=pyhd3eb1b0_1
- pyparsing=2.4.7=pyh9f0ad1d_0
- pysocks=1.7.1=py37_1
- python=3.7.9=h7579374_0
- python-dateutil=2.8.1=py_0
- python_abi=3.7=1_cp37m
- pytorch=1.6.0=py3.7_cuda10.1.243_cudnn7.6.3_0
- pywavelets=1.1.1=py37h7b6447c_2
- pyyaml=5.4.1=py37h27cfd23_1
- readline=8.0=h7b6447c_0
- requests=2.25.1=pyhd3eb1b0_0
- requests-oauthlib=1.3.0=py_0
- rsa=4.7=pyhd3eb1b0_1
- scikit-image=0.17.2=py37hdf5156a_0
- scikit-learn=0.23.2=py37h0573a6f_0
- scipy=1.6.2=py37h91f5cce_0
- setuptools=52.0.0=py37h06a4308_0
- six=1.15.0=py37h06a4308_0
- snappy=1.1.8=he6710b0_0
- sqlite=3.33.0=h62c20be_0
- tensorboard=2.3.0=pyh4dce500_0
- tensorboard-plugin-wit=1.6.0=py_0
- tensorboardx=2.1=py_0
- threadpoolctl=2.1.0=pyh5ca1d4c_0
- tifffile=2021.3.5=pyhd3eb1b0_1
- tk=8.6.10=hbc83047_0
- toolz=0.11.1=pyhd3eb1b0_0
- torchvision=0.7.0=py37_cu101
- tornado=6.1=py37h4abf009_0
- tqdm=4.55.1=pyhd3eb1b0_0
- traitlets=5.0.5=py_0
- typing-extensions=3.7.4.3=hd3eb1b0_0
- typing_extensions=3.7.4.3=pyh06a4308_0
- urllib3=1.26.3=pyhd3eb1b0_0
- wcwidth=0.2.5=py_0
- werkzeug=1.0.1=pyhd3eb1b0_0
- wheel=0.36.2=pyhd3eb1b0_0
- xz=5.2.5=h7b6447c_0
- yaml=0.2.5=h7b6447c_0
- yarl=1.5.1=py37h7b6447c_0
- zfp=0.5.5=h2531618_4
- zipp=3.4.0=pyhd3eb1b0_0
- zlib=1.2.11=h7b6447c_3
- zstd=1.4.5=h9ceee32_0
- pip:
- cmake==3.21.2
- dlib==19.22.1
- facenet-pytorch==2.5.2
- flake8==3.8.4
- future==0.18.2
- importlib-metadata==3.4.0
- isort==5.7.0
- lpips==0.1.4
- mccabe==0.6.1
- opencv-python==4.5.1.48
- pycodestyle==2.6.0
- pyflakes==2.2.0
- yapf==0.30.0
================================================
FILE: language/accuracy.py
================================================
import torch
def head_accuracy(output, target, unlabeled_value=999):
    """
    Computes the precision@1 for a specific attribute over a batch.
    output: batch_size * num_cls (for a specific attribute)
    target: batch_size * 1 (for a specific attribute)
    unlabeled_value: target entries equal to this value are ignored;
        pass None to treat every sample as labeled.
    return: dict with
        'acc': 100 * num_correct / num_labeled for this batch
        'labeled_count': number of labeled samples in the batch
    """
    with torch.no_grad():
        batch_size = target.size(0)
        # _ = the largest score, pred = cls_idx with the largest score
        _, pred = output.topk(1, 1, True, True)
        pred = pred.reshape(-1)
        return_dict = {}
        if unlabeled_value is not None:
            # count correct predictions among labeled samples only
            correct_count = torch.sum(
                (target != unlabeled_value) * (pred == target))
            labeled_count = torch.sum(target != unlabeled_value)
            if labeled_count:
                labeled_acc = float(correct_count) / float(labeled_count) * 100
            else:
                # avoid division by zero when nothing is labeled
                labeled_acc = 0
            return_dict['acc'] = labeled_acc
            return_dict['labeled_count'] = labeled_count
        else:
            # bug fix: 'acc' was referenced without being defined (its
            # defining line was commented out), raising NameError whenever
            # unlabeled_value is None
            return_dict['acc'] = float(
                torch.sum(pred == target)) / float(batch_size) * 100
            return_dict['labeled_count'] = batch_size
        return return_dict
================================================
FILE: language/build_vocab.py
================================================
import argparse
import json
import os
import sys
sys.path.append('.')
from language_utils import * # noqa
"""
Build vocabulary from all instantiated templates
"""
def parse_args():
    """Parse command-line arguments for vocabulary building.

    Returns:
        argparse.Namespace with ``input_data_path`` and ``output_dir``,
        both required.
    """
    cli = argparse.ArgumentParser(description='Build vocabulary')
    cli.add_argument(
        '--input_data_path',
        required=True,
        type=str,
        help='path to the input data file')
    cli.add_argument(
        '--output_dir',
        required=True,
        type=str,
        help='folder to save the output vocabulary file')
    return cli.parse_args()
def main():
    """Build a vocabulary file from a JSON list of text samples.

    Reads args.input_data_path (a JSON array of objects with a 'text'
    field), builds a token-to-index mapping, and writes vocab.json into
    args.output_dir.
    """
    args = parse_args()
    # prepare output directory
    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir, exist_ok=True)
    # load text data
    print("Loading text data from", args.input_data_path)
    with open(args.input_data_path, 'r') as f:
        input_data = json.load(f)
    # gather a list of text
    print("Building vocabulary from", len(input_data), "text data samples")
    text_list = []
    for idx, data_sample in enumerate(input_data):
        if idx % 10000 == 0:
            print('loaded', idx, '/', len(input_data))
        text = data_sample['text']
        text_list.append(text)
    # build vocabulary (build_vocab comes from the star import of
    # language_utils)
    text_token_to_idx = build_vocab(text_list=text_list)  # noqa
    vocab = {
        'text_token_to_idx': text_token_to_idx,
    }
    # save vocabulary
    print("Saving vocabulary file to",
          os.path.join(args.output_dir, 'vocab.json'))
    with open(os.path.join(args.output_dir, 'vocab.json'), 'w') as f:
        json.dump(vocab, f, indent=4)


if __name__ == '__main__':
    main()
================================================
FILE: language/dataset.py
================================================
import os.path
import numpy as np
from torch.utils.data import Dataset
class EncoderDataset(Dataset):
    """Dataset of preprocessed (text, system_mode, labels) triplets.

    The three parallel arrays are loaded from .npy files inside
    ``preprocessed_dir`` and indexed together.
    """

    def __init__(self, preprocessed_dir):
        # encoded text tokens
        self.text = np.load(os.path.join(preprocessed_dir, 'text.npy'))
        # system mode of each sample
        self.system_mode = np.load(
            os.path.join(preprocessed_dir, 'system_mode.npy'))
        # annotation labels
        self.labels = np.load(os.path.join(preprocessed_dir, 'labels.npy'))

    def __getitem__(self, index):
        """Return the (text, system_mode, labels) triplet at ``index``."""
        return (self.text[index], self.system_mode[index],
                self.labels[index])

    def __len__(self):
        return len(self.text)
def main():
    """ Testing the Dataset"""
    # NOTE(review): the preprocessed_dir below is an empty placeholder —
    # fill in a real directory before running this smoke test.
    encoderdataset = EncoderDataset(
        preprocessed_dir=  # noqa
        ''  # noqa
    )
    print('len(encoderdataset):', len(encoderdataset))
    print('encoderdataset[0]:', encoderdataset[0])


if __name__ == '__main__':
    main()
================================================
FILE: language/generate_feedback.py
================================================
import argparse
import json
import os.path
import random
import numpy as np
from .language_utils import proper_capitalize
def parse_args():
    """Parse command-line arguments for feedback generation.

    Returns:
        argparse.Namespace with template paths, the number of feedback
        samples to generate, the (required) output location, and the
        probability of using general whether_enough templates.
    """
    cli = argparse.ArgumentParser(description='')
    # (flag, add_argument kwargs) pairs, registered in the original order
    arg_specs = [
        ('--feedback_templates_file',
         dict(default='./templates/feedback.json', type=str,
              help='directory to the request templates file')),
        ('--pool_file',
         dict(default='./templates/pool.json', type=str,
              help='directory to the word pool file')),
        ('--num_feedback',
         dict(default=100, type=int,
              help='number of feedback data to generate')),
        ('--output_file_dir',
         dict(required=True, type=str,
              help='folder to save the output request file')),
        ('--output_file_name',
         dict(required=True, type=str,
              help='name of the output request file')),
        ('--whether_enough_general_prob',
         dict(default=0.2, type=float,
              help='probability of using general templates in whether_enough mode')),
    ]
    for flag, kwargs in arg_specs:
        cli.add_argument(flag, **kwargs)
    return cli.parse_args()
def main():
    """Generate a dataset of system feedback sentences.

    For each sample, randomly draws a system mode (and an attribute where
    relevant), instantiates a feedback sentence from the templates, and
    saves both a JSON annotation file and a plain-text file.
    """
    args = parse_args()
    if not os.path.isdir(args.output_file_dir):
        os.makedirs(args.output_file_dir, exist_ok=True)
    # load template files
    print('loading template files')
    with open(args.feedback_templates_file, 'r') as f:
        args.feedback_templates = json.load(f)
    args.feedback_replacement = args.feedback_templates['replacement']
    with open(args.pool_file, 'r') as f:
        pool = json.load(f)
    args.synonyms_dict = pool["synonyms"]
    system_mode_list = ['whats_next', 'whether_enough', 'suggestion']
    attribute_list = ['Bangs', "Eyeglasses", "No_Beard", "Smiling", "Young"]
    feedback_list = []
    output_txt = []
    # instantiate feedback
    for index in range(args.num_feedback):
        if index % 1000 == 0:
            print('generated', index, '/', args.num_feedback, 'feedback')
        # initialize feedback parameters
        attribute = None
        # randomly choose the feedback parameters
        system_mode = random.choice(system_mode_list)
        # only these two modes talk about a specific attribute
        if system_mode == 'whether_enough' or system_mode == 'suggestion':
            attribute = random.choice(attribute_list)
        feedback = instantiate_feedback(
            args, system_mode=system_mode, attribute=attribute)
        feedback['index'] = index
        feedback_list.append(feedback)
        output_txt.append(feedback['text'])
    # save feedback dataset
    with open(os.path.join(args.output_file_dir, args.output_file_name),
              'w') as f:
        json.dump(feedback_list, f, indent=4)
    np.savetxt(
        os.path.join(args.output_file_dir, "feedback.txt"),
        output_txt,
        fmt='%s',
        delimiter='\t')
    print('successfully saved.')
def instantiate_feedback(args,
                         system_mode=None,
                         attribute=None,
                         exception_mode='normal'):
    """
    Given the feedback mode (i.e. system_mode) and the attribute (if any),
    return a feedback.

    Args:
        args: namespace carrying feedback_templates, feedback_replacement,
            synonyms_dict and whether_enough_general_prob.
        system_mode (str): 'whats_next', 'whether_enough' or 'suggestion'.
        attribute (str | None): attribute the feedback talks about; None
            for 'whats_next'.
        exception_mode (str): 'normal', or an exception key whose template
            pool is used directly.

    Returns:
        dict with keys 'text', 'system_mode' and 'attribute'.
    """
    if exception_mode != 'normal':
        # exception feedback comes straight from its own template pool
        candidate_templates = args.feedback_templates[exception_mode]
        template = random.choice(candidate_templates)
        attribute = attribute
    else:
        # ---------- STEP 1: 1st part of feedback: 'ok' template ----------
        # instantiate the feedback prefix like "ok"
        ok_distribution_prob = random.uniform(0, 1)
        ok_template = ''
        if ok_distribution_prob < 0.7:
            ok_templates = args.feedback_templates['ok']
            for idx, templates in enumerate(ok_templates):
                # with prob in (0.3, 0.7), skip the first two template slots
                if 0.3 < ok_distribution_prob < 0.7 and (idx == 0 or idx == 1):
                    continue
                ok_template += random.choice(templates)
                ok_template += ' '
            ok_template = ok_template[0].capitalize() + ok_template[1:]
        # ---------- STEP 2: 2nd part of feedback: content template ----------
        # feedback is trivial like "what's next?"
        if system_mode == 'whats_next':
            candidate_templates = args.feedback_templates['whats_next']
            template = random.choice(candidate_templates)
        # feedback asks whether the editing extent is enough
        elif system_mode == 'whether_enough':
            whether_enough_general_prob = random.uniform(0, 1)
            # fall back to general templates with a configured probability,
            # or when the attribute has no specific templates
            if whether_enough_general_prob < args.whether_enough_general_prob \
                    or args.feedback_templates[
                        'whether_enough'][attribute] == []:
                candidate_templates = args.feedback_templates[
                    'whether_enough']['general']
            else:
                candidate_templates = args.feedback_templates[
                    'whether_enough'][attribute]
            template = random.choice(candidate_templates)
        # feedback provides suggestion on the next edit
        elif system_mode == 'suggestion':
            candidate_templates = args.feedback_templates['suggestion']
            template = random.choice(candidate_templates)
        else:
            raise KeyError('System mode "%s" not recognized' % system_mode)
    # ---------- STEP 3: Postprocess the instantiated template sentence ----------  # noqa
    # replace placeholder words in the template with
    # proper attribute-specific words.
    # this is not applicable to 'whats_next' type of feedback
    if system_mode != 'whats_next':
        for word in args.feedback_replacement:
            new_word_dict = args.feedback_replacement[word]
            new_word = new_word_dict[attribute]
            template = template.replace(word, new_word)
    # to lower case
    template = template.lower()
    # randomly replace words with synonyms
    for word in args.synonyms_dict:
        replacing_word = random.choice(args.synonyms_dict[word])
        template = template.replace(word, replacing_word)
    # capitalize
    template = proper_capitalize(template)
    if exception_mode != 'normal':
        # after given feedback of cannot_edit
        # encode user request by pretending that
        # the system_mode is 'whats_next'
        system_mode = 'whats_next'
    else:
        template = ok_template + template
    # ---------- STEP 4: Return the feedback and its annotations ----------
    feedback = {
        "text": template,
        "system_mode": system_mode,
        "attribute": attribute
    }
    return feedback


if __name__ == '__main__':
    main()
================================================
FILE: language/generate_training_request.py
================================================
import argparse
import json
import os.path
import random
import sys
sys.path.append('.')
from language_utils import proper_capitalize # noqa
def parse_args():
    """Parse command-line arguments for training-request generation.

    Returns:
        argparse.Namespace with the number of requests, four template
        file paths, and the (required) output directory.
    """
    cli = argparse.ArgumentParser(description='')
    cli.add_argument(
        '--num_request',
        default=100,
        type=int,
        help='number of request data to generate')
    # template files: (flag, default, help), registered in original order
    template_args = [
        ('--user_templates_file', './templates/user_fsm.json',
         'directory to the request templates file'),
        ('--pool_file', './templates/pool.json',
         'directory to the word pool file'),
        ('--metadata_file', './templates/metadata_fsm.json',
         'directory to the metadata file'),
        ('--system_mode_file', './templates/system_mode.json',
         'directory to the system_mode file'),
    ]
    for flag, default, help_text in template_args:
        cli.add_argument(flag, type=str, default=default, help=help_text)
    # output
    cli.add_argument(
        '--output_file_dir',
        required=True,
        type=str,
        help='folder to save the output request file')
    return cli.parse_args()
def main():
    """Generate a dataset of user editing requests for encoder training.

    For each sample, randomly draws a system mode, user mode, attribute
    and editing parameters, instantiates a request sentence from the
    templates, and saves all requests to training_request.json.
    """
    args = parse_args()
    if not os.path.isdir(args.output_file_dir):
        os.makedirs(args.output_file_dir, exist_ok=False)
    # load template files
    print('loading template files')
    with open(args.user_templates_file, 'r') as f:
        args.user_templates = json.load(f)
    with open(args.pool_file, 'r') as f:
        pool = json.load(f)
    args.synonyms_dict = pool["synonyms"]
    args.postfix_list = pool["postfix"]
    with open(args.metadata_file, 'r') as f:
        args.metadata = json.load(f)
    with open(args.system_mode_file, 'r') as f:
        args.system_mode_dict = json.load(f)
    # valid system modes are the top-level keys of the system_mode file
    args.system_mode_list = []
    for key, value in args.system_mode_dict.items():
        args.system_mode_list.append(key)
    attribute_list = ['Bangs', "Eyeglasses", "No_Beard", "Smiling", "Young"]
    target_score_list = [0, 1, 2, 3, 4, 5]
    score_change_direction_list = ['positive', 'negative']
    score_change_value_list = [1, 2, 3, 4, 5]
    request_list = []
    # instantiate requests
    for index in range(args.num_request):
        if index % 1000 == 0:
            print('generated', index, '/', args.num_request, 'requests')
        # randomly choose the semantic editing parameters
        system_mode = random.choice(args.system_mode_list)
        user_mode_list = list(args.metadata[system_mode].keys())
        user_mode = random.choice(user_mode_list)
        attribute = random.choice(attribute_list)
        score_change_value = random.choice(score_change_value_list)
        score_change_direction = random.choice(score_change_direction_list)
        target_score = random.choice(target_score_list)
        # instantiate a request according to the
        # chosen semantic editing parameters
        request = instantiate_training_request(
            args,
            attribute=attribute,
            user_mode=user_mode,
            score_change_direction=score_change_direction,
            score_change_value=score_change_value,
            target_score=target_score)
        request['system_mode'] = system_mode
        # assign each system_mode's user_mode: the active mode gets the
        # user_mode, every other mode gets None
        for mode in args.system_mode_list:
            if system_mode == mode:
                request[mode] = request['user_mode']
            else:
                request[mode] = None
        request['index'] = index
        request_list.append(request)
    # save request dataset (the directory was already created above, so
    # this second guard is effectively a no-op)
    if not os.path.isdir(args.output_file_dir):
        os.makedirs(args.output_file_dir, exist_ok=True)
    with open(
            os.path.join(args.output_file_dir, 'training_request.json'),
            'w') as f:
        json.dump(request_list, f, indent=4)
    print('successfully saved.')
def instantiate_training_request(
        args,
        attribute=None,
        user_mode=None,
        score_change_direction=None,
        score_change_value=None,
        target_score=None,
):
    """
    Given semantic editing parameters, instantiate the request
    using the request templates.

    Args:
        args: namespace carrying user_templates and synonyms_dict.
        attribute (str): attribute to edit (cleared when the user_mode
            contains no 'pureRequest' sub-mode).
        user_mode (str): '_'-joined sequence of user sub-modes.
        score_change_direction (str): 'positive' or 'negative'.
        score_change_value (int): magnitude of the requested change.
        target_score (int): requested absolute attribute score.

    Returns:
        dict with the instantiated sentence and its annotations; fields
        irrelevant to the sampled request_mode are set to None.
    """
    request_mode = None
    instantiated_sentence = ''
    # a user_mode is a sequence of sub-modes joined by '_'
    user_sub_mode_list = user_mode.split('_')
    for user_sub_mode_idx, user_sub_mode in enumerate(user_sub_mode_list):
        sub_mode_template = ''
        if user_sub_mode != 'pureRequest':
            # non-request sub-modes concatenate one choice per template slot
            sub_mode_templates = args.user_templates[user_sub_mode]
            for templates in sub_mode_templates:
                sub_mode_template += random.choice(templates)
        else:
            # the actual editing request: randomly choose its phrasing mode
            request_mode = random.choice(
                ['target', 'change_definite', 'change_indefinite'])
            request_templates = args.user_templates['pureRequest']
            attribute_templates = request_templates[attribute]
            # request is the score change direction and value
            if request_mode == 'change_definite':
                assert score_change_direction is not None
                assert score_change_value is not None
                target_score = None
                candidate_templates = attribute_templates['change'][
                    score_change_direction]['definite'][str(
                        score_change_value)]
            # request is the score change direction without value
            elif request_mode == 'change_indefinite':
                assert score_change_direction is not None
                score_change_value = None
                target_score = None
                candidate_templates = attribute_templates['change'][
                    score_change_direction]['indefinite']
            # request is the edit target
            elif request_mode == 'target':
                score_change_direction = None
                score_change_value = None
                assert target_score is not None
                candidate_templates = attribute_templates['target'][str(
                    target_score)]
            else:
                raise KeyError('Request mode "%s" not recognized' %
                               request_mode)
            # randomly choose one request template
            sub_mode_template = random.choice(candidate_templates)
        if user_sub_mode_idx >= 1:
            instantiated_sentence += ' '
        instantiated_sentence += sub_mode_template
    # without a pureRequest the sentence carries no editing semantics,
    # so all editing annotations are cleared
    if 'pureRequest' not in user_sub_mode_list:
        score_change_direction = None
        score_change_value = None
        target_score = None
        attribute = None
    # to lower case
    instantiated_sentence = instantiated_sentence.lower()
    # randomly replace words with synonyms
    for word in args.synonyms_dict:
        new_word = random.choice(args.synonyms_dict[word])
        instantiated_sentence = instantiated_sentence.replace(word, new_word)
    # capitalize
    instantiated_sentence = proper_capitalize(instantiated_sentence)
    request = {
        "text": instantiated_sentence,
        "user_mode": user_mode,
        'request_mode': request_mode,
        "attribute": attribute,
        "score_change_direction": score_change_direction,
        "score_change_value": score_change_value,
        "target_score": target_score,
    }
    return request


if __name__ == '__main__':
    main()
================================================
FILE: language/language_utils.py
================================================
import numpy as np
import torch
# global variables
PUNCTUATION_TO_KEEP = ['?', ';']
PUNCTUATION_TO_REMOVE = ['.', '!', ',']
SPECIAL_TOKENS = {
'': 0,
'': 1,
'': 2,
'': 3,
}
def build_vocab(text_list,
min_token_count=1,
delimiter=' ',
punct_to_keep=None,
punct_to_remove=None,
print_every=10000):
"""
Build token to index mapping from a list of text strings
-- Input: a list of text string
-- Output: a dict which is a mapping from token to index,
"""
token_to_count = {}
# tokenize text and add tokens to token_to_count dict
for text_idx, text in enumerate(text_list):
if text_idx % print_every == 0:
print('tokenized', text_idx, '/', len(text_list))
text_tokens = tokenize(text=text, delimiter=delimiter)
for token in text_tokens:
if token in token_to_count:
token_to_count[token] += 1
else:
token_to_count[token] = 1
token_to_idx = {}
print('Mapping tokens to indices')
# reserve indices for special tokens (must-have tokens)
for token, idx in SPECIAL_TOKENS.items():
token_to_idx[token] = idx
# assign indices to tokens
for token, count in sorted(token_to_count.items()):
if count >= min_token_count:
token_to_idx[token] = len(token_to_idx)
return token_to_idx
def tokenize(text,
delimiter=' ',
add_start_token=False,
add_end_token=False,
punctuation_to_keep=PUNCTUATION_TO_KEEP,
punctuation_to_remove=PUNCTUATION_TO_REMOVE):
"""
Tokenize a text string
-- Input: a text string
-- Output: a list of tokens,
each token is still a string (usually an english word)
"""
# (1) Optionally keep or remove certain punctuation
if punctuation_to_keep is not None:
for punctuation in punctuation_to_keep:
text = text.replace(punctuation, '%s%s' % (delimiter, punctuation))
if punctuation_to_remove is not None:
for punctuation in punctuation_to_remove:
text = text.replace(punctuation, '')
# (2) Split the text string into a list of tokens
text = text.lower()
tokens = text.split(delimiter)
# (3) Optionally add start and end tokens
if add_start_token:
tokens.insert(0, '')
if add_end_token:
tokens.append('')
return tokens
def encode(text_tokens, token_to_idx, allow_unk=False):
text_encoded = []
for token in text_tokens:
if token not in token_to_idx:
if allow_unk:
token = ''
else:
raise KeyError('Token "%s" not in vocab' % token)
text_encoded.append(token_to_idx[token])
return text_encoded
def decode(seq_idx, idx_to_token, delim=None, stop_at_end=True):
tokens = []
for idx in seq_idx:
tokens.append(idx_to_token[idx])
if stop_at_end and tokens[-1] == '':
break
if delim is None:
return tokens
else:
return delim.join(tokens)
def reverse_dict(input_dict):
reversed_dict = {}
for key in input_dict.keys():
val = input_dict[key]
reversed_dict[val] = key
return reversed_dict
def to_long_tensor(dset):
arr = np.asarray(dset, dtype=np.int64)
tensor = torch.LongTensor(arr)
return tensor
def proper_capitalize(text):
    """Re-capitalize a lower-cased sentence.

    Upper-cases the first character, the character two positions after
    every sentence terminator (., !, ? — assuming a following space), and
    promotes the standalone pronoun 'i' to 'I'.
    """
    if len(text) > 0:
        chars = list(text.lower())
        chars[0] = chars[0].capitalize()
        for pos, char in enumerate(chars):
            # char two past a terminator is the start of the next sentence
            if char in ('.', '!', '?') and (pos + 2) < len(chars):
                chars[pos + 2] = chars[pos + 2].capitalize()
        text = ''.join(chars)
        for lead in (' ', ',', '.', '!'):
            text = text.replace(lead + 'i ', lead + 'I ')
    return text
================================================
FILE: language/lstm.py
================================================
"""
LSTM
Input: batch_size x max_text_length (tokenized questions)
Output: batch_size x lstm_hidden_size (question embedding)
Details:
Tokenized text are first word-embedded (300-D), then passed to
2-layer LSTM, where each cell has is 1024-D. For each text,
output the hidden state of the last non-null token.
"""
from __future__ import print_function
import json
import torch
import torch.nn as nn
from torch.autograd import Variable
class Encoder(nn.Module):
    """Shared LSTM sentence encoder followed by one classifier per label.

    The metadata JSON maps each label name to its set of values; for every
    label a small head (fc_block + Linear) is registered under the label's
    name, and all heads consume the same LSTM text embedding.
    """

    def __init__(self,
                 token_to_idx,
                 word_embedding_dim=300,
                 text_embed_size=1024,
                 metadata_file='./templates/metadata_fsm.json',
                 linear_hidden_size=256,
                 linear_dropout_rate=0):
        super(Encoder, self).__init__()
        # One LSTM shared across every classification head.
        self.lstm = LSTM(
            token_to_idx=token_to_idx,
            word_embedding_dim=word_embedding_dim,
            lstm_hidden_size=text_embed_size)
        # One head per metadata entry (heads are NOT shared).
        with open(metadata_file, 'r') as f:
            self.metadata = json.load(f)
        self.classifier_names = []
        for name, values in self.metadata.items():
            self.classifier_names.append(name)
            head = nn.Sequential(
                fc_block(text_embed_size, linear_hidden_size,
                         linear_dropout_rate),
                nn.Linear(linear_hidden_size, len(values)))
            setattr(self, name, head)

    def forward(self, text):
        # [batch, max_text_length] tokens -> [batch, text_embed_size]
        sentence_embedding = self.lstm(text)
        # Run every head on the shared embedding, preserving metadata order.
        return [
            getattr(self, name)(sentence_embedding)
            for name in self.classifier_names
        ]
class LSTM(nn.Module):
    """Word embedding + stacked unidirectional LSTM text encoder.

    Input:  LongTensor [batch, max_text_length] of token indices.
    Output: FloatTensor [batch, lstm_hidden_size] — the hidden state taken
    at the last non-null token of each sequence.
    """

    def __init__(self,
                 token_to_idx,
                 word_embedding_dim=300,
                 lstm_hidden_size=1024,
                 lstm_num_layers=2,
                 lstm_dropout=0):
        super(LSTM, self).__init__()
        # special token indices (all resolved from the same vocab)
        self.token_to_idx = token_to_idx
        self.NULL = token_to_idx['']
        self.START = token_to_idx['']
        self.END = token_to_idx['']
        # token index -> dense word vector
        self.word2vec = nn.Embedding(
            num_embeddings=len(token_to_idx), embedding_dim=word_embedding_dim)
        # stacked LSTM over the embedded sequence
        self.rnn = nn.LSTM(
            input_size=word_embedding_dim,
            hidden_size=lstm_hidden_size,
            num_layers=lstm_num_layers,
            bias=True,
            batch_first=True,
            dropout=lstm_dropout,
            bidirectional=False)

    def forward(self, x):
        batch_size, max_text_length = x.size()
        # Locate the last non-null token of each row; default to the final
        # position when a row carries no trailing null padding at all.
        last_token_pos = torch.LongTensor(batch_size).fill_(
            max_text_length - 1)
        x_cpu = x.data.cpu()
        for row in range(batch_size):
            for col in range(max_text_length - 1):
                if (x_cpu[row, col] != self.NULL
                        ) and x_cpu[row, col + 1] == self.NULL:  # noqa
                    last_token_pos[row] = col
                    break
        last_token_pos = Variable(
            last_token_pos.type_as(x.data).long(), requires_grad=False)
        # compact the weight memory for faster access
        self.rnn.flatten_parameters()
        # all hidden states: [batch, max_text_length, hidden_size]
        hidden_states, (_, _) = self.rnn(self.word2vec(x))
        hidden_size = hidden_states.size(2)
        gather_index = last_token_pos.view(batch_size, 1, 1).expand(
            batch_size, 1, hidden_size)
        # pick the hidden state at the last non-null token:
        # [batch, 1, hidden] -> [batch, hidden]
        return hidden_states.gather(1, gather_index).view(
            batch_size, hidden_size)
class fc_block(nn.Module):
    """Linear -> BatchNorm1d -> (optional Dropout) -> ReLU block."""

    def __init__(self, inplanes, planes, drop_rate=0.15):
        super(fc_block, self).__init__()
        # parameterized layers first so random init draws stay in order
        self.fc = nn.Linear(inplanes, planes)
        self.bn = nn.BatchNorm1d(planes)
        if drop_rate > 0:
            # dropout layer only exists when a positive rate is requested
            self.dropout = nn.Dropout(drop_rate)
        self.relu = nn.ReLU(inplace=True)
        self.drop_rate = drop_rate

    def forward(self, x):
        out = self.bn(self.fc(x))
        if self.drop_rate > 0:
            out = self.dropout(out)
        return self.relu(out)
def main():
    """Smoke test: run LSTM and Encoder on a toy CLEVR-style vocabulary.

    Fix vs. original: the vocabulary dict literal used to end with a stray
    trailing comma, silently turning ``question_token_to_idx`` into a
    1-tuple that then had to be unwrapped with ``[0]`` at every use.  It is
    now a plain dict and is passed directly.  The 'intput size:' print
    label typo is also corrected.
    """
    # ################### LSTM #########################
    # NOTE(review): several special tokens share the key '' below (their
    # original markers look stripped); in a dict literal the last duplicate
    # wins, so '' maps to 0 — presumably the NULL token.  Confirm upstream.
    question_token_to_idx = {
        ".": 4,
        "missing": 34,
        "large": 28,
        "is": 26,
        "cubes": 19,
        "cylinder": 21,
        "what": 54,
        "": 1,
        "green": 24,
        "": 2,
        "object": 35,
        "things": 51,
        "": 3,
        "matte": 31,
        "rubber": 41,
        "tiny": 52,
        "yellow": 55,
        "red": 40,
        "visible": 53,
        "color": 17,
        "size": 44,
        "balls": 11,
        "the": 48,
        "any": 8,
        "blocks": 14,
        "ball": 10,
        "a": 6,
        "it": 27,
        "an": 7,
        "one": 38,
        "purple": 39,
        "how": 25,
        "thing": 50,
        "?": 5,
        "objects": 36,
        "blue": 15,
        "block": 13,
        "small": 45,
        "shiny": 43,
        "material": 30,
        "cylinders": 22,
        "": 0,
        "many": 29,
        "of": 37,
        "cube": 18,
        "metallic": 33,
        "gray": 23,
        "brown": 16,
        "spheres": 47,
        "there": 49,
        "sphere": 46,
        "shape": 42,
        "are": 9,
        "metal": 32,
        "cyan": 20,
        "big": 12
    }
    batch_size = 64
    print('batch size:', batch_size)
    # questions = torch.ones(batch_size, 15, dtype=torch.long)
    questions = torch.randint(0, 10, (batch_size, 15), dtype=torch.long)
    print('input size:', questions.size())
    lstm = LSTM(token_to_idx=question_token_to_idx)
    output = lstm(questions)
    print('output size:', output.size())
    # ################### Language Encoder #########################
    encoder = Encoder(
        token_to_idx=question_token_to_idx,
        metadata_file='./templates/metadata_fsm.json')
    output = encoder(questions)
    print('output length:', len(output))
    for classifier in output:
        print('classifier.size():', classifier.size())


if __name__ == '__main__':
    main()
================================================
FILE: language/preprocess_request.py
================================================
import argparse
import json
import os
import sys
import numpy as np
sys.path.append('.')
from language_utils import * # noqa
"""
Preprocess the text
"""
def parse_args():
    """Build and parse the command-line arguments for preprocessing."""
    parser = argparse.ArgumentParser()
    # required input paths
    parser.add_argument('--input_vocab_path', type=str, required=True,
                        help='path to the input vocabulary file')
    parser.add_argument('--input_data_path', type=str, required=True,
                        help='path to the input data file')
    # template files
    parser.add_argument('--metadata_file', type=str,
                        default='./templates/metadata_fsm.json',
                        help='directory to the metadata file')
    parser.add_argument('--system_mode_file', type=str,
                        default='./templates/system_mode.json',
                        help='directory to the system_mode file')
    # encoding options
    parser.add_argument('--allow_unknown', type=int, default=0,
                        help='whether allow unknown tokens (i.e. words)')
    parser.add_argument('--expand_vocab', type=int, default=0,
                        help='whether expand vocabularies')
    # output location and label conventions
    parser.add_argument('--output_dir', type=str, required=True,
                        help='folder to save the output vocabulary file')
    parser.add_argument('--unlabeled_value', type=int, default=999,
                        help='value to represent unlabeled value')
    return parser.parse_args()
def main():
    """Encode raw request texts, system modes and labels to numpy files.

    Reads the vocabulary, metadata, system-mode table and the raw data
    JSON named on the command line, turns every sample into integer token
    sequences / label vectors, pads the texts to equal length with the
    null ('') token, and saves the three arrays as .npy/.txt pairs in the
    output directory.

    Fixes vs. original: the inner metadata loop reused the name ``idx``
    and shadowed the outer enumerate counter (both counters were unused
    and are removed), and the isdir-check + ``makedirs(exist_ok=False)``
    race is replaced by a single ``makedirs(exist_ok=True)`` call.
    """
    args = parse_args()
    # exist_ok=True avoids the race between an isdir check and creation;
    # it still raises if output_dir exists as a non-directory.
    os.makedirs(args.output_dir, exist_ok=True)
    # load vocabulary
    print("Loading vocab")
    with open(args.input_vocab_path, 'r') as f:
        vocab = json.load(f)
    text_token_to_idx = vocab['text_token_to_idx']
    # load metadata file
    with open(args.metadata_file, 'r') as f:
        metadata = json.load(f)
    # load system_mode file
    with open(args.system_mode_file, 'r') as f:
        system_mode_file = json.load(f)
    # load input data
    with open(args.input_data_path, 'r') as f:
        input_data = json.load(f)
    # initialize lists to store encoded data
    text_encoded_list = []
    system_mode_encoded_list = []
    labels_encoded_list = []
    print('Encoding')
    for data_sample in input_data:
        # encode text
        text_tokens = tokenize(text=data_sample['text'])  # noqa
        text_encoded = encode(  # noqa
            text_tokens=text_tokens,
            token_to_idx=text_token_to_idx,
            allow_unk=args.allow_unknown)
        text_encoded_list.append(text_encoded)
        # encode system_mode
        system_mode_encoded_list.append(
            system_mode_file[data_sample['system_mode']])
        # encode labels — one entry per metadata key, in metadata order
        labels_encoded = []
        for key, val in metadata.items():
            label = data_sample[key]
            if label is None:
                # use args.unlabeled_value to represent missing labels
                labels_encoded.append(args.unlabeled_value)
            else:
                labels_encoded.append(val[str(label)])
        labels_encoded_list.append(labels_encoded)
    # Pad encoded text to equal length with the null ('') token index
    print('Padding tokens')
    max_text_length = max(len(text) for text in text_encoded_list)
    text_encoded_padded_list = []
    for text_encoded in text_encoded_list:
        padding = [text_token_to_idx['']] * (
            max_text_length - len(text_encoded))
        text_encoded_padded_list.append(text_encoded + padding)
    # save processed text
    np.save(
        os.path.join(args.output_dir, 'text.npy'), text_encoded_padded_list)
    np.savetxt(
        os.path.join(args.output_dir, 'text.txt'),
        text_encoded_padded_list,
        fmt='%.0f')
    # save processed system_mode
    np.save(
        os.path.join(args.output_dir, 'system_mode.npy'),
        system_mode_encoded_list)
    np.savetxt(
        os.path.join(args.output_dir, 'system_mode.txt'),
        system_mode_encoded_list,
        fmt='%.0f')
    # save processed labels
    np.save(os.path.join(args.output_dir, 'labels.npy'), labels_encoded_list)
    np.savetxt(
        os.path.join(args.output_dir, 'labels.txt'),
        labels_encoded_list,
        fmt='%.0f')


if __name__ == '__main__':
    main()
================================================
FILE: language/run_encoder.py
================================================
import argparse
import json
import random
import torch
from .language_utils import * # noqa
from .lstm import Encoder
def parse_args():
    """Build and parse the command-line arguments for request encoding."""
    parser = argparse.ArgumentParser()
    # vocabulary / checkpoint / template files
    parser.add_argument('--input_vocab_file', type=str, required=True,
                        help='path to the input vocabulary file')
    parser.add_argument('--allow_unknown', type=int, default=1,
                        help='whether allow unknown tokens (i.e. words)')
    parser.add_argument('--pretrained_checkpoint', type=str, default='',
                        help='The pretrained network weights for testing')
    parser.add_argument('--metadata_file', type=str,
                        default='./templates/metadata_fsm.json',
                        help='path to metadata file.')
    parser.add_argument('--system_mode_file', type=str,
                        default='./templates/system_mode.json',
                        help='path to system_mode file.')
    # runtime options
    parser.add_argument('--device_name', type=str, default='gpu')
    parser.add_argument('--verbose', type=int, default=0)
    # LSTM hyperparameter
    parser.add_argument('--word_embedding_dim', default=300, type=int)
    parser.add_argument('--text_embed_size', default=1024, type=int)
    parser.add_argument('--linear_hidden_size', default=256, type=int)
    parser.add_argument('--linear_dropout_rate', default=0, type=float)
    return parser.parse_args()
def main():
    """CLI entry point: parse arguments and run one encoding round."""
    encode_request(parse_args())
def encode_request(args, system_mode=None, dialog_logger=None):
    """Read one user request, run the LSTM encoder, and decode its labels.

    Loads the vocabulary, tokenizes/encodes the text typed on stdin,
    forwards it through a pretrained ``Encoder``, argmaxes every classifier
    head, and maps the predictions back to label strings via the metadata
    file.  Returns a dict of the labels valid for the predicted user and
    request modes, plus the raw input text under 'text'.

    NOTE(review): when ``dialog_logger`` is None, ``compulsory_output_function``
    is never assigned, yet it is called unconditionally below — looks like a
    NameError waiting to happen in the standalone path; confirm against how
    the repo's dialog scripts call this.
    """
    # set up device from the command-line choice
    if args.device_name == 'cpu':
        args.device = torch.device('cpu')
    elif args.device_name == 'gpu':
        args.device = torch.device('cuda')
    if dialog_logger is None:
        output_function = print
    else:
        # output_function = dialog_logger.info
        def output_function(input):
            # suppress output when called by other scripts
            pass
            return
        compulsory_output_function = dialog_logger.info
    # ---------------- STEP 1: Input the Request ----------------
    # choose system_mode
    with open(args.system_mode_file, 'r') as f:
        system_mode_dict = json.load(f)
    system_mode_list = []
    for (mode, mode_idx) in system_mode_dict.items():
        system_mode_list.append(mode)
    if __name__ == '__main__':
        # standalone run: simulate the dialog state with a random mode
        assert system_mode is None
        system_mode = random.choice(system_mode_list)
        output_function(' PREDEFINED system_mode:', system_mode)
    else:
        # called from the dialog system, which supplies the current state
        assert system_mode is not None
    # input request
    if True:
        compulsory_output_function(
            'Enter your request (Press enter when you finish):')
        input_text = input()
    else:
        # debugging shortcut: fixed request instead of stdin
        input_text = 'make the bangs slightly longer.'
    compulsory_output_function('USER INPUT >>> ' + input_text)
    # ---------------- STEP 2: Preprocess Request ----------------
    # output_function(" The system is trying to understand your request:")
    # output_function(" ########################################")
    # load vocabulary
    with open(args.input_vocab_file, 'r') as f:
        vocab = json.load(f)
    text_token_to_idx = vocab['text_token_to_idx']
    # tokenize the request and map tokens onto vocabulary indices
    text_tokens = tokenize(text=input_text)  # noqa
    text_encoded = encode(  # noqa
        text_tokens=text_tokens,
        token_to_idx=text_token_to_idx,
        allow_unk=args.allow_unknown)
    # wrap as a [1 x text_length] LongTensor on the chosen device
    text_encoded = to_long_tensor([text_encoded]).to(args.device)  # noqa
    # ---------------- STEP 3: Encode Request ----------------
    # prepare encoder
    encoder = Encoder(
        token_to_idx=text_token_to_idx,
        word_embedding_dim=args.word_embedding_dim,
        text_embed_size=args.text_embed_size,
        metadata_file=args.metadata_file,
        linear_hidden_size=args.linear_hidden_size,
        linear_dropout_rate=args.linear_dropout_rate)
    encoder = encoder.to(args.device)
    checkpoint = torch.load(args.pretrained_checkpoint)
    encoder.load_state_dict(checkpoint['state_dict'], True)
    encoder.eval()
    # forward pass
    output = encoder(text_encoded)
    # ---------------- STEP 4: Process Encoder Output ----------------
    # argmax over each classifier head -> one predicted class index per head
    output_labels = []
    for head_idx in range(len(output)):
        _, pred = torch.max(output[head_idx], 1)
        head_label = pred.cpu().numpy()[0]
        output_labels.append(head_label)
    # load metadata file
    with open(args.metadata_file, 'r') as f:
        metadata = json.load(f)
    # find mapping from value to label
    reversed_metadata = {}
    for idx, (key, val) in enumerate(metadata.items()):
        reversed_val = reverse_dict(val)  # noqa
        reversed_metadata[key] = reversed_val
    if args.verbose:
        output_function('reversed_metadata:', reversed_metadata)
    # convert predicted values to a dict of predicted labels
    output_semantic_labels = {}  # from LSTM output
    valid_semantic_labels = {}  # useful information among LSTM output
    for idx, (key, val) in enumerate(reversed_metadata.items()):
        output_semantic_labels[key] = val[output_labels[idx]]
        valid_semantic_labels[key] = None
    if args.verbose:
        output_function('output_semantic_labels:', output_semantic_labels)
    # extract predicted labels
    user_mode = output_semantic_labels[system_mode]
    valid_semantic_labels[system_mode] = user_mode
    request_mode = output_semantic_labels['request_mode']
    attribute = output_semantic_labels['attribute']
    score_change_direction = output_semantic_labels['score_change_direction']
    # numeric labels are decoded as strings; cast them to int when present
    if output_semantic_labels['score_change_value'] is None:
        score_change_value = None
    else:
        score_change_value = int(output_semantic_labels['score_change_value'])
    if output_semantic_labels['target_score'] is None:
        target_score = None
    else:
        target_score = int(output_semantic_labels['target_score'])
    # print to screen
    output_function(' ENCODED user_mode:' + ' ' + user_mode)
    valid_semantic_labels['user_mode'] = user_mode
    if 'pureRequest' in user_mode:
        output_function(' ENCODED request_mode: ' + ' ' + request_mode)
        valid_semantic_labels['request_mode'] = request_mode
        output_function(' ENCODED attribute:' + ' ' + attribute)
        valid_semantic_labels['attribute'] = attribute
        # only output_function labels valid for this request_mode
        if request_mode == 'change_definite':
            output_function(' ENCODED score_change_direction:' + ' ' +
                            (score_change_direction))
            valid_semantic_labels[
                'score_change_direction'] = score_change_direction
            output_function(' ENCODED score_change_value:' + ' ' +
                            str(score_change_value))
            valid_semantic_labels['score_change_value'] = score_change_value
        elif request_mode == 'change_indefinite':
            output_function(' ENCODED score_change_direction:' + ' ' +
                            score_change_direction)
            valid_semantic_labels[
                'score_change_direction'] = score_change_direction
        elif request_mode == 'target':
            output_function(' ENCODED target_score:' + ' ' +
                            str(target_score))
            valid_semantic_labels['target_score'] = target_score
    valid_semantic_labels['text'] = input_text
    if args.verbose:
        output_function('valid_semantic_labels:' + ' ' +
                        str(valid_semantic_labels))
    # output_function(" ########################################")
    return valid_semantic_labels


if __name__ == '__main__':
    main()
================================================
FILE: language/templates/attr_wise_caption_templates.json
================================================
{
"Bangs": {
"0": [
" has no bangs at all.",
" has no bangs at all and forehead is visible.",
" doesn't have any bangs.",
" doesn't have any bangs and forehead is visible.",
" entire forehead is visible.",
" entire forehead is visible without any bangs.",
" shows entire forehead without any bangs."
],
"1": [
" has very short bangs which only covers a tiny portion of forehead."
],
"2": [
" has short bangs that covers a small portion of forehead.",
" has short bangs that only covers a small portion of forehead."
],
"3": [
" has medium bangs that covers half of forehead.",
" has bangs of medium length that covers half of forehead.",
" has bangs of medium length that leaves half of forehead visible."
],
"4": [
" has long bangs that almost covers all of forehead.",
" has long bangs that almost covers This entire forehead."
],
"5": [
" has extremely long bangs that almost covers all of forehead.",
" has extremely long bangs that almost covers This entire forehead."
]
},
"Eyeglasses": {
"0": [
" is not wearing any eyeglasses.",
"There is not any eyeglasses on face."
],
"1": [
" is wearing rimless eyeglasses."
],
"2": [
" is wearing eyeglasses with thin frame.",
" is wearing thin frame eyeglasses."
],
"3": [
" is wearing eyeglasses with thick frame.",
" is wearing thick frame eyeglasses."
],
"4": [
" is wearing sunglasses with thin frame.",
" is wearing thin frame sunglasses."
],
"5": [
" is wearing sunglasses with thick frame.",
" is wearing thick frame sunglasses."
]
},
"No_Beard": {
"0": [
" doesn't have any beard.",
" doesn't have any beard at all."
],
"1": [
" face is covered with short pointed beard.",
" face is covered with his stubble.",
" face has a rough growth of stubble.",
" has a rough growth of stubble.",
"There should be stubble covering cheeks and chin."
],
"2": [
" face is covered with short beard."
],
"3": [
" face is covered with beard of medium length.",
" has beard of medium length."
],
"4": [
" has a big mustache on his face.",
" has a bushy beard."
],
"5": [
" has very long beard.",
" has full beard.",
" has very thick beard.",
" has a very bushy beard."
]
},
"Smiling": {
"0": [
" looks serious with no smile in face."
],
"1": [
" smiles with corners of the mouth turned up.",
" smiles with corners of mouth turned up.",
" turns up the corners of mouth."
],
"2": [
"This corners of mouth curve up and we can see some teeth.",
" smiles broadly and shows some teeth."
],
"3": [
"The entire face of this is beamed with happiness.",
" has a beaming face.",
" is smiling with teeth visible.",
" entire face is beamed with happiness."
],
"4": [
" has a big smile.",
" has a big smile on face.",
" is smiling with mouth slightly open.",
" is smiling with mouth slightly open and teeth visible."
],
"5": [
"This in the image is laughing happily.",
" has a deep rumbling laugh.",
" has a very big smile.",
" has a very big smile on face.",
" is smiling with mouth wide open.",
" is smiling with mouth wide open and teeth visible."
]
},
"Young": {
"0": [
"This is a young kid.",
"This is a young child."
],
"1": [
" is a teenager.",
" looks very young."
],
"2": [
" is a young adult.",
" is in thirties."
],
"3": [
" is in forties.",
" is in middle age."
],
"4": [
" is in sixties.",
" is in fifties.",
" looks like an elderly."
],
"5": [
" is in eighties.",
"This old is in eighties.",
" is in seventies.",
"This old is in seventies.",
" looks very old."
]
}
}
================================================
FILE: language/templates/feedback.json
================================================
{
"replacement": {
"": {
"Bangs": "bangs",
"Eyeglasses": "glasses",
"No_Beard": "beard",
"Smiling": "smile",
"Young": "age"
},
"": {
"Bangs": "them",
"Eyeglasses": "them",
"No_Beard": "it",
"Smiling": "it",
"Young": "it"
},
"": {
"Bangs": "are",
"Eyeglasses": "are",
"No_Beard": "is",
"Smiling": "is",
"Young": "is"
},
"": {
"Bangs": "length",
"Eyeglasses": "style",
"No_Beard": "shape",
"Smiling": "degree",
"Young": "level"
}
},
"suggestion": [
"Do you want to try manipulating the ?",
"Do you want to try manipulating the instead?",
"Do you want to try manipulating the as well?",
"Do you want to try editing the ?",
"Do you want to try editing the instead?",
"Do you want to try editing the as well?",
"What about the ? Do you want to play with ?",
"Do you want to play with the ?",
"What about the ? Do you want to edit ?",
"Do you want to edit the ?",
"What about the ? Do you want to manipulate ?",
"Do you want to manipulate the ?"
],
"whether_enough": {
"general": [
"Is this enough?",
"Is this good enough?",
" the just right now?",
" the what you want now?",
" the of the person just right now?",
" the of the person what you want now?",
" the of proper degree now?",
" the of the ok now?",
" the of the okay now?"
],
"Bangs": [
"Are the bangs in proper shape now?",
"Is the length of the bangs ok now?"
],
"Eyeglasses": [],
"No_Beard": [],
"Smiling": [],
"Young": [
"Is the age of the person ok now?"
]
},
"whats_next": [
"What's next?",
"What else do you want to play with?",
"What else do you want to manipulate?",
"What else do you want to edit?",
"What else do you want to change?",
"What else do you want to try?"
],
"ok": [
[
"Okay",
"Ok",
"Well",
"Okie"
],
[
" ",
", "
],
[
"done.",
"it's done.",
"bingo.",
"finished.",
"that's it.",
"this is it."
]
],
"max_edit_num_reached": [
"It is infeasible to edit this attribute. Let's try another attribute.",
"We cannot edit this attribute. Let's try something else.",
"Oops, it is hard to edit this attribute. Let's try something else.",
"Sorry, we are unable to edit this attribute. Perhaps we can try something else."
],
"already_at_target_class": [
"This attribute is already at the degree that you want. Let's try a different attribute degree or another attribute."
]
}
================================================
FILE: language/templates/gender.json
================================================
{
"male": {
"": [
"person",
"guy",
"gentleman"
],
"": [
"he",
"he",
"this person",
"this guy",
"this gentleman",
"this man"
],
"": [
"his",
"the"
],
"": [
"him"
],
"": [
"boy"
]
},
"female": {
"": [
"person",
"lady",
"female"
],
"": [
"she",
"she",
"this lady",
"this person",
"this female",
"this woman"
],
"": [
"her",
"the"
],
"": [
"her"
],
"": [
"girl"
]
}
}
================================================
FILE: language/templates/metadata_fsm.json
================================================
{
"start": {
"start_pureRequest": 0
},
"suggestion": {
"yes": 0,
"yes_pureRequest": 1,
"no": 2,
"no_pureRequest": 3,
"no_end": 4
},
"whether_enough": {
"yes": 0,
"yes_pureRequest": 1,
"yes_end": 2,
"no": 3,
"no_pureRequest": 4
},
"whats_next": {
"pureRequest": 0,
"end": 1
},
"attribute": {
"Bangs": 0,
"Eyeglasses": 1,
"No_Beard": 2,
"Smiling": 3,
"Young": 4
},
"score_change_direction": {
"negative": 0,
"positive": 1
},
"score_change_value": {
"1": 0,
"2": 1,
"3": 2,
"4": 3,
"5": 4
},
"target_score": {
"0": 0,
"1": 1,
"2": 2,
"3": 3,
"4": 4,
"5": 5
},
"request_mode": {
"change_definite": 0,
"change_indefinite": 1,
"target": 2,
"end": 3
}
}
================================================
FILE: language/templates/overall_caption_templates.json
================================================
{
"attr_order_mapping": {
"Bangs": {
"0": [
"has",
"sentence"
],
"1": [
"has"
],
"2": [
"has"
],
"3": [
"has"
],
"4": [
"has",
"sentence"
]
},
"No_Beard": {
"0": [
"has",
"sentence"
],
"1": [
"has"
],
"2": [
"has"
],
"3": [
"has"
],
"4": [
"has",
"sentence"
]
},
"Eyeglasses": {
"0": [
"has",
"sentence"
],
"1": [
"has"
],
"2": [
"has"
],
"3": [
"has"
],
"4": [
"has",
"sentence"
]
},
"Smiling": {
"0": [
"has",
"sentence"
],
"1": [
"has"
],
"2": [
"has"
],
"3": [
"has"
],
"4": [
"has",
"sentence"
]
},
"Young": {
"0": [
"start"
],
"1": [
"sentence"
],
"2": [
"sentence"
],
"3": [
"sentence"
],
"4": [
"sentence"
]
}
},
"has": {
"Bangs": {
"0": [
"no bangs"
],
"1": [
"very short bangs",
"very short bangs which only covers a tiny portion of forehead"
],
"2": [
"short bangs",
"short bangs that covers a small portion of forehead",
"short bangs that only covers a small portion of forehead"
],
"3": [
"medium bangs",
"medium bangs that covers half of forehead",
"bangs of medium length that covers half of forehead",
"bangs of medium length that leaves half of forehead visible"
],
"4": [
"long bangs",
"long bangs that almost covers all of forehead",
"long bangs that almost covers This entire forehead"
],
"5": [
"extremely long bangs",
"extremely long bangs that almost covers all of forehead",
"extremely long bangs that almost covers This entire forehead"
]
},
"Eyeglasses": {
"0": [
"no eyeglasses"
],
"1": [
"rimless eyeglasses"
],
"2": [
"eyeglasses with thin frame",
"thin frame eyeglasses"
],
"3": [
"eyeglasses with thick frame",
"thick frame eyeglasses"
],
"4": [
"sunglasses with thin frame",
"thin frame sunglasses"
],
"5": [
"sunglasses with thick frame",
"thick frame sunglasses"
]
},
"No_Beard": {
"0": [
"no beard",
"no beard at all"
],
"1": [
"short pointed beard",
"stubble",
"a rough growth of stubble",
"stubble covering cheeks and chin"
],
"2": [
"short beard"
],
"3": [
"beard of medium length"
],
"4": [
"a big mustache on his face",
"a bushy beard"
],
"5": [
"very long beard",
"full beard",
"very thick beard",
"a very bushy beard"
]
},
"Smiling": {
"0": [
"no smile"
],
"1": [
"a very mild smile"
],
"2": [
"a mild smile"
],
"3": [
"a beaming face",
"a smile with teeth visible",
"a face that is beamed with happiness",
"a smile"
],
"4": [
"a big smile",
"a big smile on face",
"a big smile with mouth slightly open",
"a big smile with mouth slightly open and teeth visible"
],
"5": [
"a deep rumbling laugh",
"a very big smile",
"a very big smile on face",
"a very big smile with mouth wide open",
"a very big smile with mouth wide open and teeth visible"
]
}
},
"start": {
"Young": {
"0": [
"This young kid",
"This young child",
"This little "
],
"1": [
"This teenager",
"This young ",
"This young "
],
"2": [
"This young adult",
"This in thirties"
],
"3": [
"This in forties",
"This in middle age",
"This middle-aged "
],
"4": [
"This in sixties",
"This in fifties",
"This elderly "
],
"5": [
"This old ",
"This in eighties",
"This old in eighties",
"This in seventies",
"This old in seventies",
"This very old "
]
}
},
"has_prefix": [
"This has ",
" has "
]
}
================================================
FILE: language/templates/pool.json
================================================
{
"synonyms": {
" can ": [
" can ",
" could ",
" should "
],
"i'm": [
"i'm",
"i am"
],
"it's": [
"it's",
"it is"
],
"bangs": [
"bangs",
"fringe"
],
"slightly": [
"slightly",
"a little bit",
"a tiny little bit",
"a little",
"a bit",
"only a little",
"just a little bit"
],
"somewhat": [
"somewhat",
"relatively",
"to some extent",
"to some degree",
"moderately",
"partially",
"sort of",
"kind of",
"considerably"
],
"very": [
"very",
"extremely"
],
"entire": [
"entire",
"whole",
"full"
],
"child": [
"child",
"schoolchild"
],
"teenager": [
"teenager",
"teen"
],
"beard": [
"beard",
"mustache"
],
"i think": [
"i think",
"i think that",
"i feel",
"i feel that",
"i kind of think",
"i kind of think that",
"i kind of feel",
"i kind of feel that",
"i guess",
"i guess that"
],
"i want": [
"i want",
"i kind of want",
"i would like"
],
"let's try": [
"let's try",
"how about trying",
"what about trying"
],
"but not too much": [
"but not too much",
"just not too much",
"just that not too much",
"just don't go too much"
],
"only": [
"only",
"simply",
"just"
],
"eyeglasses": [
"eyeglasses",
"glasses"
],
"pokerface": [
"pokerface",
"poker face"
],
"what's": [
"what's",
"what is"
],
"how's": [
"how's",
"how is"
],
"do you want to": [
"do you want to",
"would you like to",
"perhaps you would like to",
"perhaps you might want to",
"maybe you would like to",
"maybe you might want to"
],
"want to": [
"want to",
"would like to"
],
"manipulate": [
"manipulate",
"edit"
],
"manipulating": [
"manipulating",
"editing",
"playing with"
]
},
"prefix": [
"Actually,",
"To be honest,",
"Well,",
"Well",
"Emm",
"Emmm",
"Emmmm",
"Emm,",
"Emmm,",
"Emmmm,",
"Hi,",
"Hello,",
"Let me think about it.",
"I'm not too sure but",
"What about this?",
"Can we try this?",
"It looks okay now but",
"It looks better now, but still,",
"It looks nice, but still,",
"Let me have a look. Well,",
"Let me have a look. Well",
"Let me have a look. Emm,",
"Let me have a look. Emmm,",
"Let me have a look. Emmmm,",
"Let me have a look. Emm",
"Let me have a look. Emmm",
"Let me have a look. Emmmm",
"Let me take a look. Well,",
"Let me take a look. Well",
"Let me take a look. Emm,",
"Let me take a look. Emmm,",
"Let me take a look. Emmmm,",
"Let me take a look. Emm",
"Let me take a look. Emmm",
"Let me take a look. Emmmm"
],
"postfix": [
"Thanks!",
"Thank you!",
"Is that possible?",
"and emmm... well let's try this first.",
"I guess it will probably get better this way.",
"I'm not too sure, let's see how it goes first.",
"It would be nicer in that way.",
"It would be nicer in that way, I think.",
"It would be nicer in that way, I guess.",
"I think it would be nicer in that way.",
"I guess it would be nicer in that way.",
"It would be nicer this way.",
"It would be nicer this way, I think.",
"It would be nicer this way, I guess.",
"I think it would be nicer this way.",
"I guess it would be nicer this way.",
"It might be nicer in that way.",
"It might be nicer in that way, I think.",
"It might be nicer in that way, I guess.",
"I think it might be nicer in that way.",
"I guess it might be nicer in that way.",
"It might be nicer this way.",
"It might be nicer this way, I think.",
"It might be nicer this way, I guess.",
"I think it might be nicer this way.",
"I guess it might be nicer this way.",
"It would look better in that way.",
"It would look better in that way, I think.",
"It would look better in that way, I guess.",
"I think it would look better in that way.",
"I guess it would look better in that way.",
"It would look better this way.",
"It would look better this way, I think.",
"It would look better this way, I guess.",
"I think it would look better this way.",
"I guess it would look better this way.",
"It might look better in that way.",
"It might look better in that way, I think.",
"It might look better in that way, I guess.",
"I think it might look better in that way.",
"I guess it might look better in that way.",
"It might look better this way.",
"It might look better this way, I think.",
"It might look better this way, I guess.",
"I think it might look better this way.",
"I guess it might look better this way."
]
}
================================================
FILE: language/templates/system_mode.json
================================================
{
"start": 0,
"suggestion": 1,
"whether_enough": 2,
"whats_next": 3
}
================================================
FILE: language/templates/user_fsm.json
================================================
{
"start": [
[
"Hi.",
"Hello."
]
],
"pureRequest": {
"Bangs": {
"target": {
"0": [
"No bangs.",
"Remove all the bangs.",
"Cut off all the bangs.",
"I don't want the bangs at all.",
"I don't want any bangs.",
"I don't want any bangs visible.",
"The bangs doesn't look good, let's remove it.",
"The bangs covers the forehead, but I want the entire forehead visible."
],
"1": [
"Add very short bangs.",
"I want very short bangs.",
"Add very short bangs that leaves most of the forehead uncovered.",
"I want very short bangs that leaves most of the forehead uncovered."
],
"2": [
"Add short bangs.",
"Let's try short bangs.",
"Add short bangs that covers only a small portion of the forehead.",
"Let's try short bangs that covers only a small portion of the forehead."
],
"3": [
"Add medium bangs.",
"Add bangs of medium length.",
"Let's try bangs of medium length.",
"Let's try bangs that leaves half of the forehead visible."
],
"4": [
"Add long bangs.",
"Let's try long bangs.",
"Add long bangs but don't cover the entire forehead.",
"Let's try long bangs but don't cover the entire forehead."
],
"5": [
"Add extremely long bangs.",
"Let's try extremely long bangs.",
"Add extremely long bangs that covers the entire forehead.",
"Let's try extremely long bangs that covers the entire forehead.",
"Indeed, the bangs can be much longer. Let's cover the eyebrows."
]
},
"change": {
"positive": {
"definite": {
"1": [
"The bangs can be slightly longer.",
"Make the bangs slightly longer."
],
"2": [
"The bangs can be somewhat longer, but not too much.",
"Make the bangs somewhat longer, but not too much."
],
"3": [
"Make the bangs longer, but not too much."
],
"4": [
"The bangs can be longer.",
"Make the bangs longer."
],
"5": [
"The bangs can be much longer.",
"Make the bangs much longer."
]
},
"indefinite": [
"Longer bangs.",
"Add bangs.",
"The bangs can be longer.",
"Let's add some bangs.",
"Maybe the bangs can be longer.",
"Let's try adding longer bangs.",
"What about adding longer bangs?",
"Emm, I think the bangs can be longer.",
"Let's make the bangs longer.",
"Hi, I want to see how my friend looks like with some bangs."
]
},
"negative": {
"definite": {
"1": [
"The bangs can be slightly shorter.",
"Make the bangs slightly shorter."
],
"2": [
"The bangs can be somewhat shorter, but not too much.",
"Make the bangs somewhat shorter, but not too much."
],
"3": [
"The bangs can be shorter.",
"Make the bangs shorter."
],
"4": [
"The bangs can be much shorter.",
"Make the bangs much shorter."
],
"5": [
"Remove all the bangs.",
"I don't want the bangs at all.",
"I don't want any bangs at all."
]
},
"indefinite": [
"Less bangs",
"Remove bangs.",
"Remove the bangs.",
"Let's cut off the bangs.",
"Let's cut the bangs short.",
"Let's cut the bangs off.",
"I don't like the bangs, let's remove it.",
"I don't like the bangs, let's cut it off.",
"The bangs is too long, let's remove it.",
"The bangs is too long, let's cut it off."
]
}
}
},
"Eyeglasses": {
"target": {
"0": [
"No eyeglass",
"No eyeglasses please.",
"No eyeglasses.",
"Remove eyeglasses.",
"Remove the eyeglasses.",
"I don't want to see the eyeglasses.",
"I think there shouldn't be any eyeglasses."
],
"1": [
"The eyeglasses should be rimless.",
"Let's try rimless eyeglasses."
],
"2": [
"The eyeglasses should have thin frame.",
"Let's try thin frame eyeglasses."
],
"3": [
"The eyeglasses should have thick frame.",
"Let's try thick frame eyeglasses."
],
"4": [
"Let's try thin frame sunglasses.",
"It should be sunglasses with thin frame."
],
"5": [
"Let's try thick frame sunglasses.",
"It should be sunglasses with thick frame."
]
},
"change": {
"positive": {
"definite": {
"1": [
"Make the eyeglasses slightly more obvious.",
"The eyeglasses can be slightly more obvious."
],
"2": [
"Make the eyeglasses somewhat more obvious.",
"The eyeglasses can be somewhat more obvious."
],
"3": [
"Make the eyeglasses more obvious.",
"The eyeglasses can be more obvious."
],
"4": [
"Let's try eyeglasses with thicker frame and darker color."
],
"5": [
"Let's try thick frame sunglasses.",
"It should be sunglasses with thick frame."
]
},
"indefinite": [
"Add glasses",
"Use eyeglasses",
"Try eyeglasses.",
"Add eyeglasses.",
"Add eyeglasses to the face.",
"Add eyeglasses please.",
"Let's add eyeglasses.",
"The eyeglasses can be more obvious.",
"The eyeglasses are not obvious enough.",
"I can't see the eyeglasses clearly, let's make them more obvious.",
"The eyeglasses frame can be thicker.",
"The glass color can be darker."
]
},
"negative": {
"definite": {
"1": [
"Make the eyeglasses slightly less obvious.",
"The eyeglasses can be slightly less obvious."
],
"2": [
"Make the eyeglasses somewhat less obvious.",
"The eyeglasses can be somewhat less obvious."
],
"3": [
"Make the eyeglasses less obvious.",
"The eyeglasses can be less obvious."
],
"4": [
"The eyeglasses are too obvious, let's make it much less obvious.",
"The eyeglasses are too obvious, let's try make it much less obvious."
],
"5": [
"Remove eyeglasses.",
"Remove the eyeglasses.",
"I don't like the eyeglasses.",
"I don't want to see the eyeglasses.",
"There shouldn't be any eyeglasses."
]
},
"indefinite": [
"Remove eyeglasses.",
"No eyeglasses.",
"The eyeglasses can be less obvious.",
"The eyeglasses are too obvious.",
"Let's make the eyeglasses less obvious.",
"The eyeglasses frame can be thinner.",
"The glass color can be lighter."
]
}
}
},
"No_Beard": {
"target": {
"0": [
"Let's see what he looks like without his beard.",
"Let's shave the beard off.",
"No beard"
],
"1": [
"His face should be covered with short pointed beard.",
"His face should be covered with the stubble.",
"His face has a rough growth of stubble.",
"There should be stubble covering his cheeks and chin."
],
"2": [
"His face should be covered with short beard.",
"Let's add short beard to his face.",
"Let's try short beard on his face."
],
"3": [
"His face should be covered with beard of medium length.",
"Let's add medium-length beard to his face.",
"Let's try medium-length beard on his face."
],
"4": [
"Let's try a big mustache on his face.",
"He should have a bushy beard."
],
"5": [
"Let's add very long beard.",
"Let's add a full beard.",
"He should have very thick beard.",
"He should have a very bushy beard."
]
},
"change": {
"positive": {
"definite": {
"1": [
"The beard can be slightly longer.",
"Make the beard slightly longer.",
"Slightly add more beard."
],
"2": [
"The beard can be somewhat longer, but not too much.",
"Make the beard somewhat longer, but not too much."
],
"3": [
"The beard can be longer.",
"Make the beard longer."
],
"4": [
"The beard can be much longer.",
"Make the beard much longer."
],
"5": [
"Let's add very long beard.",
"Let's add a full beard.",
"He should have very thick beard",
"He has a very bushy beard."
]
},
"indefinite": [
"Add beard.",
"Add some beard.",
"Longer beard.",
"Let's add more beard.",
"I want some more beard on the face."
]
},
"negative": {
"definite": {
"1": [
"The beard can be slightly shorter.",
"Make the beard slightly shorter.",
"Slightly remove some beard."
],
"2": [
"The beard can be somewhat shorter, but not too much.",
"Make the beard somewhat shorter, but not too much."
],
"3": [
"The beard can be shorter.",
"Make the beard shorter."
],
"4": [
"The beard can be much shorter.",
"Make the beard much shorter."
],
"5": [
"Let's see what he looks like without his beard.",
"Let's shave the beard off."
]
},
"indefinite": [
"Less beard.",
"Remove beard.",
"Remove the beard.",
"The beard should be gone.",
"Let's try to remove the beard.",
"I don't like the beard.",
"Let's try shorter beard."
]
}
}
},
"Smiling": {
"target": {
"0": [
"I think the person shouldn't be smiling.",
"I don't like the smile.",
"I don't want the smile.",
"No smile.",
"Remove the smile."
],
"1": [
"Turn up the corners of the mouth.",
"The corners of the mouth should curve up."
],
"2": [
"The corners of the mouth should curve up and show some teeth.",
"Smile broadly and show some teeth."
],
"3": [
"I want a beaming face.",
"I want the face to be smiling with teeth visible.",
"The entire face should be beamed with happiness."
],
"4": [
"It can be a big smile.",
"I want a big smile on the face.",
"I want the face to be smiling with the mouth slightly open.",
"I want the face to be smiling with the mouth slightly open. We should be able to see the teeth.",
"I want the face to be smiling with the mouth slightly open so that we can see the teeth."
],
"5": [
"I want a deep rumbling laugh.",
"It can be laughing happily.",
"It can be a very big smile.",
"I want a very big smile on the face.",
"I want the face to be smiling with the mouth wide open.",
"I want the face to be smiling with the mouth wide open. We should be able to see the teeth."
]
},
"change": {
"positive": {
"definite": {
"1": [
"Smile slightly more.",
"The smile can be slightly bigger.",
"Make the smile slightly bigger.",
"The person can look slightly happier.",
"The person can smile slightly more happily."
],
"2": [
"The smile can be somewhat bigger, but not too much.",
"Make the smile somewhat bigger, but not too much.",
"The person can look somewhat happier.",
"The person can smile somewhat more happily."
],
"3": [
"Smile more.",
"The smile can be bigger.",
"Make the smile bigger.",
"The person can be happier.",
"The person can smile more happily."
],
"4": [
"The smile can be much bigger.",
"Make the smile much bigger.",
"The person can be a lot happier.",
"The person can smile a lot more happily."
],
"5": [
"I want a deep rumbling laugh.",
"It can be laughing happily.",
"It can be a very big smile.",
"I want a very big smile on the face.",
"I want the face to be smiling with the mouth wide open.",
"I want the face to be smiling with the mouth wide open. We should be able to see the teeth.",
"The person can smile very happily."
]
},
"indefinite": [
"Look not so serious.",
"Look less serious.",
"Too serious, be happier.",
"Add smile.",
"Add some smiling please.",
"The smile is not big enough.",
"I want a bigger smile.",
"I want the face to smile more.",
"I want to change the pokerface face to a smiling face.",
"The person can smile more happily.",
"Can look happier."
]
},
"negative": {
"definite": {
"1": [
"I want the smile to be slightly less obvious.",
"The smile can be slightly less obvious.",
"The person can smile slightly less happily."
],
"2": [
"I want the smile to be less obvious.",
"The smile can be less obvious.",
"The person can smile somewhat less happily."
],
"3": [
"I want the smile to be much less obvious.",
"The smile can be much less obvious.",
"The person can smile less happily."
],
"4": [
"I want to make the smile almost vanish.",
"The person can smile a lot less happily."
],
"5": [
"I want the smile to vanish.",
"I don't like the smile, let's remove it."
]
},
"indefinite": [
"Not serious enough.",
"More serious.",
"No smiling.",
"No smile.",
"Remove smiling.",
"Remove the smiling.",
"Remove smile.",
"Remove the smile.",
"Smile less happily.",
"Don't be so happy.",
"The smile is too much.",
"Can we have a gentler smile? This smile is too big.",
"I want to change the smiling face to a pokerface."
]
}
}
},
"Young": {
"target": {
"0": [
"Let's make the face a child one.",
"Let's make the face very young."
],
"1": [
"Let's make the face a teenager one.",
"Let's make the face relatively young.",
"The person should be in the twenties."
],
"2": [
"Let's make the face a young one.",
"It should be a young adult.",
"The person should be in the thirties."
],
"3": [
"Let's make the face a middle age one.",
"The person should be in the forties."
],
"4": [
"Let's make the face slightly older than middle age.",
"Let's make the face the one of a senior.",
"Let's make the face the one of an elderly.",
"The person should be in the sixties.",
"The person should be in the fifties."
],
"5": [
"Let's make the face a very old one.",
"The person should be in the seventies.",
"The person should be in the eighties."
]
},
"change": {
"positive": {
"definite": {
"1": [
"The face can be slightly older.",
"Make the face slightly older."
],
"2": [
"Somewhat older",
"The face can be somewhat older, just not too much.",
"Make the face somewhat older, but not too much."
],
"3": [
"Make the face older, but not too much.",
"Make the face older, but not too much."
],
"4": [
"The face can be older.",
"Make the face older."
],
"5": [
"The face can be much older.",
"Make the face much older.",
"Let's make the face a very old one."
]
},
"indefinite": [
"Older.",
"Make it older.",
"The face can be older.",
"This face is too young, let's make it older.",
"Let's make the face older.",
"What about making the face look older?"
]
},
"negative": {
"definite": {
"1": [
"The face can be slightly younger.",
"Make the face slightly younger."
],
"2": [
"Somewhat younger.",
"The face can be somewhat younger, but not too much.",
"Make the face somewhat younger, but not too much."
],
"3": [
"The face can be younger.",
"Make the face younger.",
"Younger face."
],
"4": [
"Much younger.",
"The face can be much younger.",
"Make the face much younger."
],
"5": [
"Let's make the face a child one."
]
},
"indefinite": [
"Younger face.",
"Younger.",
"Look younger",
"Make it younger.",
"Be younger.",
"Less old.",
"The face can be younger.",
"This face is too old, let's make it younger.",
"Let's make the face younger.",
"What about making it younger?",
"Can you make the person look younger?"
]
}
}
}
},
"yes": [
[
"Yes",
"Yep",
"Yeep",
"Yep sure",
"Yes sure",
"Sure",
"Ok"
],
[
"."
]
],
"no": [
[
"No",
"Nope"
],
[
"."
]
],
"end": [
[
"End.",
"Nothing.",
"Nothing else.",
"Nothing else for now.",
"It's all good now.",
"I don't want any further edits.",
"Actually it's all good now.",
"No need for further edits.",
"I don't need any further edits.",
"That's all.",
"This is it.",
"That is it.",
"That is all.",
"No."
],
[
" Thanks!",
" Thank you!",
" Thanks a lot!",
""
]
]
}
================================================
FILE: language/templates/user_old_templates.json
================================================
{
"start": [
[
"Hi.",
"Hello."
],
[
" "
]
],
"requests": {
"Bangs": {
"target": {
"0": [
"No bangs.",
"Remove all the bangs.",
"Cut off all the bangs.",
"I don't want the bangs at all.",
"I don't want any bangs.",
"I don't want any bangs visible.",
"The bangs doesn't look good, let's remove it.",
"The bangs covers the forehead, but I want the entire forehead visible."
],
"1": [
"Add very short bangs.",
"I want very short bangs.",
"Add very short bangs that leaves most of the forehead uncovered.",
"I want very short bangs that leaves most of the forehead uncovered."
],
"2": [
"Add short bangs.",
"Let's try short bangs.",
"Add short bangs that covers only a small portion of the forehead.",
"Let's try short bangs that covers only a small portion of the forehead."
],
"3": [
"Add medium bangs.",
"Add bangs of medium length.",
"Let's try bangs of medium length.",
"Let's try bangs that leaves half of the forehead visible."
],
"4": [
"Add long bangs.",
"Let's try long bangs.",
"Add long bangs but don't cover the entire forehead.",
"Let's try long bangs but don't cover the entire forehead."
],
"5": [
"Add extremely long bangs.",
"Let's try extremely long bangs.",
"Add extremely long bangs that covers the entire forehead.",
"Let's try extremely long bangs that covers the entire forehead.",
"Indeed, the bangs can be much longer. Let's cover the eyebrows."
]
},
"change": {
"positive": {
"definite": {
"1": [
"The bangs can be slightly longer.",
"Make the bangs slightly longer."
],
"2": [
"The bangs can be somewhat longer, but not too much.",
"Make the bangs somewhat longer, but not too much."
],
"3": [
"Make the bangs longer, but not too much."
],
"4": [
"The bangs can be longer.",
"Make the bangs longer."
],
"5": [
"The bangs can be much longer.",
"Make the bangs much longer."
]
},
"indefinite": [
"The bangs can be longer.",
"Let's add some bangs.",
"Maybe the bangs can be longer.",
"Let's try adding longer bangs.",
"What about adding longer bangs?",
"Emm, I think the bangs can be longer.",
"Let's make the bangs longer.",
"Hi, I want to see how my friend looks like with some bangs."
]
},
"negative": {
"definite": {
"1": [
"The bangs can be slightly shorter.",
"Make the bangs slightly shorter."
],
"2": [
"The bangs can be somewhat shorter, but not too much.",
"Make the bangs somewhat shorter, but not too much."
],
"3": [
"The bangs can be shorter.",
"Make the bangs shorter."
],
"4": [
"The bangs can be much shorter.",
"Make the bangs much shorter."
],
"5": [
"Remove all the bangs.",
"I don't want the bangs at all.",
"I don't want any bangs at all."
]
},
"indefinite": [
"Remove bangs.",
"Remove the bangs.",
"Let's cut off the bangs.",
"Let's cut the bangs short.",
"Let's cut the bangs off.",
"I don't like the bangs, let's remove it.",
"I don't like the bangs, let's cut it off.",
"The bangs is too long, let's remove it.",
"The bangs is too long, let's cut it off."
]
}
}
},
"Eyeglasses": {
"target": {
"0": [
"No eyeglasses please.",
"No eyeglasses.",
"Remove eyeglasses.",
"Remove the eyeglasses.",
"I don't want to see the eyeglasses.",
"I think there shouldn't be any eyeglasses."
],
"1": [
"The eyeglasses should be rimless.",
"Let's try rimless eyeglasses."
],
"2": [
"The eyeglasses should have thin frame.",
"Let's try thin frame eyeglasses."
],
"3": [
"The eyeglasses should have thick frame.",
"Let's try thick frame eyeglasses."
],
"4": [
"Let's try thin frame sunglasses.",
"It should be sunglasses with thin frame."
],
"5": [
"Let's try thick frame sunglasses.",
"It should be sunglasses with thick frame."
]
},
"change": {
"positive": {
"definite": {
"1": [
"Make the eyeglasses slightly more obvious.",
"The eyeglasses can be slightly more obvious."
],
"2": [
"Make the eyeglasses somewhat more obvious.",
"The eyeglasses can be somewhat more obvious."
],
"3": [
"Make the eyeglasses more obvious.",
"The eyeglasses can be more obvious."
],
"4": [
"Let's try eyeglasses with thicker frame and darker color."
],
"5": [
"Let's try thick frame sunglasses.",
"It should be sunglasses with thick frame."
]
},
"indefinite": [
"Try eyeglasses.",
"Add eyeglasses.",
"Add eyeglasses to the face.",
"Add eyeglasses please.",
"Let's add eyeglasses.",
"The eyeglasses can be more obvious.",
"The eyeglasses are not obvious enough.",
"I can't see the eyeglasses clearly, let's make them more obvious.",
"The eyeglasses frame can be thicker.",
"The glass color can be darker."
]
},
"negative": {
"definite": {
"1": [
"Make the eyeglasses slightly less obvious.",
"The eyeglasses can be slightly less obvious."
],
"2": [
"Make the eyeglasses somewhat less obvious.",
"The eyeglasses can be somewhat less obvious."
],
"3": [
"Make the eyeglasses less obvious.",
"The eyeglasses can be less obvious."
],
"4": [
"The eyeglasses are too obvious, let's make it much less obvious.",
"The eyeglasses are too obvious, let's try make it much less obvious."
],
"5": [
"Remove eyeglasses.",
"Remove the eyeglasses.",
"I don't like the eyeglasses.",
"I don't want to see the eyeglasses.",
"There shouldn't be any eyeglasses."
]
},
"indefinite": [
"The eyeglasses can be less obvious.",
"The eyeglasses are too obvious.",
"Let's make the eyeglasses less obvious.",
"The eyeglasses frame can be thinner.",
"The glass color can be lighter."
]
}
}
},
"No_Beard": {
"target": {
"0": [
"Let's see what he looks like without his beard.",
"Let's shave the beard off."
],
"1": [
"His face should be covered with short pointed beard.",
"His face should be covered with the stubble.",
"His face has a rough growth of stubble.",
"There should be stubble covering his cheeks and chin."
],
"2": [
"His face should be covered with short beard.",
"Let's add short beard to his face.",
"Let's try short beard on his face."
],
"3": [
"His face should be covered with beard of medium length.",
"Let's add medium-length beard to his face.",
"Let's try medium-length beard on his face."
],
"4": [
"Let's try a big mustache on his face.",
"He should have a bushy beard."
],
"5": [
"Let's add very long beard.",
"Let's add a full beard.",
"He should have very thick beard.",
"He should have a very bushy beard."
]
},
"change": {
"positive": {
"definite": {
"1": [
"The beard can be slightly longer.",
"Make the beard slightly longer.",
"Slightly add more beard."
],
"2": [
"The beard can be somewhat longer, but not too much.",
"Make the beard somewhat longer, but not too much."
],
"3": [
"The beard can be longer.",
"Make the beard longer."
],
"4": [
"The beard can be much longer.",
"Make the beard much longer."
],
"5": [
"Let's add very long beard.",
"Let's add a full beard.",
"He should have very thick beard",
"He has a very bushy beard."
]
},
"indefinite": [
"Add beard.",
"Add some beard.",
"Longer beard.",
"Let's add more beard.",
"I want some more beard on the face."
]
},
"negative": {
"definite": {
"1": [
"The beard can be slightly shorter.",
"Make the beard slightly shorter.",
"Slightly remove some beard."
],
"2": [
"The beard can be somewhat shorter, but not too much.",
"Make the beard somewhat shorter, but not too much."
],
"3": [
"The beard can be shorter.",
"Make the beard shorter."
],
"4": [
"The beard can be much shorter.",
"Make the beard much shorter."
],
"5": [
"Let's see what he looks like without his beard.",
"Let's shave the beard off."
]
},
"indefinite": [
"Remove beard.",
"Remove the beard.",
"The beard should be gone.",
"Let's try to remove the beard.",
"I don't like the beard.",
"Let's try shorter beard"
]
}
}
},
"Smiling": {
"target": {
"0": [
"I think the person shouldn't be smiling.",
"I don't like the smile.",
"I don't want the smile"
],
"1": [
"Turn up the corners of the mouth",
"The corners of the mouth curve up."
],
"2": [
"The corners of the mouth curve up and show some teeth.",
"Smile broadly and show some teeth."
],
"3": [
"I want a beaming face.",
"I want the face to be smiling with teeth visible.",
"The entire face should be beamed with happiness."
],
"4": [
"It can be a big smile.",
"I want a big smile on the face.",
"I want the face to be smiling with the mouth slightly open.",
"I want the face to be smiling with the mouth slightly open. We should be able to see the teeth.",
"I want the face to be smiling with the mouth slightly open so that we can see the teeth."
],
"5": [
"I want a deep rumbling laugh.",
"It can be laughing happily.",
"It can be a very big smile.",
"I want a very big smile on the face.",
"I want the face to be smiling with the mouth wide open.",
"I want the face to be smiling with the mouth wide open. We should be able to see the teeth."
]
},
"change": {
"positive": {
"definite": {
"1": [
"Smile slightly more.",
"The smile can be slightly bigger.",
"Make the smile slightly bigger.",
"The person can look slightly happier.",
"The person can smile slightly more happily."
],
"2": [
"The smile can be somewhat bigger, but not too much.",
"Make the smile somewhat bigger, but not too much.",
"The person can look somewhat happier.",
"The person can smile somewhat more happily."
],
"3": [
"Smile more.",
"The smile can be bigger.",
"Make the smile bigger.",
"The person can be happier.",
"The person can smile more happily."
],
"4": [
"The smile can be much bigger.",
"Make the smile much bigger.",
"The person can be a lot happier.",
"The person can smile a lot more happily."
],
"5": [
"I want a deep rumbling laugh.",
"It can be laughing happily.",
"It can be a very big smile.",
"I want a very big smile on the face.",
"I want the face to be smiling with the mouth wide open.",
"I want the face to be smiling with the mouth wide open. We should be able to see the teeth.",
"The person can smile very happily."
]
},
"indefinite": [
"Add some smiling please.",
"The smile is not big enough.",
"I want a bigger smile.",
"I want the face to smile more.",
"I want to change the pokerface face to a smiling face.",
"The person can smile more happily.",
"Can look happier."
]
},
"negative": {
"definite": {
"1": [
"I want the smile to be slightly less obvious.",
"The smile can be slightly less obvious.",
"The person can smile slightly less happily."
],
"2": [
"I want the smile to be less obvious.",
"The smile can be less obvious.",
"The person can smile somewhat less happily."
],
"3": [
"I want the smile to be much less obvious.",
"The smile can be much less obvious.",
"The person can smile less happily."
],
"4": [
"I want to make the smile almost vanish.",
"The person can smile a lot less happily."
],
"5": [
"I want the smile to vanish.",
"I don't like the smile, let's remove it."
]
},
"indefinite": [
"No smiling.",
"No smile.",
"Remove smiling.",
"Remove the smiling.",
"Remove smile.",
"Remove the smile.",
"Smile less happily.",
"Don't be so happy.",
"The smile is too much.",
"Can we have a gentler smile? This smile is too big.",
"I want to change the smiling face to a pokerface."
]
}
}
},
"Young": {
"target": {
"0": [
"Let's make the face a child one.",
"Let's make the face very young."
],
"1": [
"Let's make the face a teenager one.",
"Let's make the face relatively young.",
"The person should be in the twenties."
],
"2": [
"Let's make the face a young one.",
"It should be a young adult.",
"The person should be in the thirties."
],
"3": [
"Let's make the face a middle age one.",
"The person should be in the forties."
],
"4": [
"Let's make the face slightly older than middle age.",
"Let's make the face the one of a senior.",
"Let's make the face the one of an elderly.",
"The person should be in the sixties.",
"The person should be in the fifties."
],
"5": [
"Let's make the face a very old one.",
"The person should be in the seventies.",
"The person should be in the eighties."
]
},
"change": {
"positive": {
"definite": {
"1": [
"The face can be slightly older.",
"Make the face slightly older."
],
"2": [
"Somewhat older",
"The face can be somewhat older, just not too much.",
"Make the face somewhat older, but not too much."
],
"3": [
"Make the face older, but not too much.",
"Make the face older, but not too much."
],
"4": [
"The face can be older.",
"Make the face older."
],
"5": [
"The face can be much older.",
"Make the face much older.",
"Let's make the face a very old one."
]
},
"indefinite": [
"Older.",
"Make it older.",
"The face can be older.",
"This face is too young, let's make it older.",
"Let's make the face older.",
"What about making the face look older?"
]
},
"negative": {
"definite": {
"1": [
"The face can be slightly younger.",
"Make the face slightly younger."
],
"2": [
"Somewhat younger.",
"The face can be somewhat younger, but not too much.",
"Make the face somewhat younger, but not too much."
],
"3": [
"The face can be younger.",
"Make the face younger.",
"Younger face."
],
"4": [
"Much younger.",
"The face can be much younger.",
"Make the face much younger."
],
"5": [
"Let's make the face a child one."
]
},
"indefinite": [
"Younger face.",
"Younger.",
"Make it younger.",
"Be younger.",
"Less old.",
"The face can be younger.",
"This face is too old, let's make it younger.",
"Let's make the face younger.",
"What about making it younger?"
]
}
}
}
},
"yes_enough": [
[
"Emmm, yep",
"Emmm, yes",
"Emmm, yeep",
"Yes",
"Yep",
"Yeep",
"Yep sure"
],
[
", ",
". ",
"! "
],
[
"That's good enough now.",
"That's nice.",
"That's perfect.",
"This is great."
],
[
" "
]
],
"no_enough": [
[
"Actually,",
"To be honest,",
"Well,",
"Well",
"Emm",
"Emmm",
"Emmmm",
"Emm,",
"Emmm,",
"Emmmm,",
"I'm not too sure but",
"It looks okay now but",
"It looks better now, but still,",
"It looks nice, but still,",
"Let me have a look. Well,",
"Let me have a look. Well",
"Let me have a look. Emm,",
"Let me have a look. Emmm,",
"Let me have a look. Emmmm,",
"Let me have a look. Emm",
"Let me have a look. Emmm",
"Let me have a look. Emmmm",
"Let me take a look. Well,",
"Let me take a look. Well",
"Let me take a look. Emm,",
"Let me take a look. Emmm,",
"Let me take a look. Emmmm,",
"Let me take a look. Emm",
"Let me take a look. Emmm",
"Let me take a look. Emmmm"
],
[
" "
]
],
"yes_suggestion": [
[
"Emmm, yep",
"Emmm, yes",
"Emmm, yeep",
"Yes",
"Yep",
"Yeep",
"Yep sure",
"Yes sure"
],
[
",",
".",
"!"
],
[
" "
]
],
"no_suggestion": [
[
"Well,",
"Well",
"Emm,",
"Emmm",
"Emmmm",
"Emm,",
"Emmm,",
"Emmmm,",
"I'm not too sure so",
"It looks okay now so",
"It looks nice, so,",
"Let me have a look. Well,",
"Let me have a look. Well",
"Let me have a look. Emm,",
"Let me have a look. Emmm,",
"Let me have a look. Emmmm,",
"Let me have a look. Emm",
"Let me have a look. Emmm",
"Let me have a look. Emmmm",
"Let me take a look. Well,",
"Let me take a look. Well",
"Let me take a look. Emm,",
"Let me take a look. Emmm,",
"Let me take a look. Emmmm,",
"Let me take a look. Emm",
"Let me take a look. Emmm",
"Let me take a look. Emmmm"
],
[
" "
],
[
"Not really.",
"Not really actually.",
"No actually."
],
[
" "
]
],
"end": [
[
"Nothing else.",
"Nothing else for now.",
"It's all good now.",
"I don't want any further edits.",
"Actually it's all good now.",
"No need for further edits.",
"I don't need any further edits.",
"That's all.",
"This is it.",
"That is it.",
"That is all.",
"No."
],
[
" "
],
[
"Thanks!",
"Thank you!",
"Thanks a lot!"
]
]
}
================================================
FILE: language/templates/vocab.json
================================================
{
"text_token_to_idx": {
"<NULL>": 0,
"<START>": 1,
"<END>": 2,
"<UNK>": 3,
"?": 4,
"a": 5,
"able": 6,
"about": 7,
"actually": 8,
"add": 9,
"adding": 10,
"adult": 11,
"age": 12,
"all": 13,
"almost": 14,
"an": 15,
"and": 16,
"any": 17,
"are": 18,
"at": 19,
"bangs": 20,
"be": 21,
"beamed": 22,
"beaming": 23,
"beard": 24,
"big": 25,
"bigger": 26,
"bit": 27,
"broadly": 28,
"bushy": 29,
"but": 30,
"can": 31,
"can't": 32,
"change": 33,
"cheeks": 34,
"child": 35,
"chin": 36,
"clearly": 37,
"color": 38,
"considerably": 39,
"corners": 40,
"could": 41,
"cover": 42,
"covered": 43,
"covering": 44,
"covers": 45,
"curve": 46,
"cut": 47,
"darker": 48,
"deep": 49,
"degree": 50,
"doesn't": 51,
"don't": 52,
"edits": 53,
"eighties": 54,
"elderly": 55,
"else": 56,
"emm": 57,
"end": 58,
"enough": 59,
"entire": 60,
"extent": 61,
"extremely": 62,
"eyebrows": 63,
"eyeglass": 64,
"eyeglasses": 65,
"face": 66,
"feel": 67,
"fifties": 68,
"for": 69,
"forehead": 70,
"forties": 71,
"frame": 72,
"friend": 73,
"fringe": 74,
"full": 75,
"further": 76,
"gentler": 77,
"glass": 78,
"glasses": 79,
"go": 80,
"gone": 81,
"good": 82,
"growth": 83,
"guess": 84,
"half": 85,
"happier": 86,
"happily": 87,
"happiness": 88,
"happy": 89,
"has": 90,
"have": 91,
"he": 92,
"hello": 93,
"hi": 94,
"his": 95,
"how": 96,
"i": 97,
"in": 98,
"indeed": 99,
"is": 100,
"it": 101,
"it's": 102,
"just": 103,
"kind": 104,
"laugh": 105,
"laughing": 106,
"leaves": 107,
"length": 108,
"less": 109,
"let's": 110,
"lighter": 111,
"like": 112,
"little": 113,
"long": 114,
"longer": 115,
"look": 116,
"looks": 117,
"lot": 118,
"make": 119,
"making": 120,
"maybe": 121,
"medium": 122,
"medium-length": 123,
"middle": 124,
"moderately": 125,
"more": 126,
"most": 127,
"mouth": 128,
"much": 129,
"mustache": 130,
"my": 131,
"need": 132,
"no": 133,
"nope": 134,
"not": 135,
"nothing": 136,
"now": 137,
"obvious": 138,
"of": 139,
"off": 140,
"ok": 141,
"old": 142,
"older": 143,
"on": 144,
"one": 145,
"only": 146,
"open": 147,
"partially": 148,
"person": 149,
"please": 150,
"pointed": 151,
"poker": 152,
"pokerface": 153,
"portion": 154,
"relatively": 155,
"remove": 156,
"rimless": 157,
"rough": 158,
"rumbling": 159,
"schoolchild": 160,
"see": 161,
"senior": 162,
"serious": 163,
"seventies": 164,
"shave": 165,
"short": 166,
"shorter": 167,
"should": 168,
"shouldn't": 169,
"show": 170,
"simply": 171,
"sixties": 172,
"slightly": 173,
"small": 174,
"smile": 175,
"smiling": 176,
"so": 177,
"some": 178,
"somewhat": 179,
"sort": 180,
"stubble": 181,
"sunglasses": 182,
"sure": 183,
"teen": 184,
"teenager": 185,
"teeth": 186,
"than": 187,
"thank": 188,
"thanks": 189,
"that": 190,
"that's": 191,
"the": 192,
"them": 193,
"there": 194,
"thick": 195,
"thicker": 196,
"thin": 197,
"think": 198,
"thinner": 199,
"thirties": 200,
"this": 201,
"tiny": 202,
"to": 203,
"too": 204,
"try": 205,
"trying": 206,
"turn": 207,
"twenties": 208,
"uncovered": 209,
"up": 210,
"use": 211,
"vanish": 212,
"very": 213,
"visible": 214,
"want": 215,
"we": 216,
"what": 217,
"whole": 218,
"wide": 219,
"with": 220,
"without": 221,
"would": 222,
"yeep": 223,
"yep": 224,
"yes": 225,
"you": 226,
"young": 227,
"younger": 228
}
}
================================================
FILE: language/train_encoder.py
================================================
import argparse
import json
import sys
import time
import torch
import torch.nn as nn
import torch.utils.data
sys.path.append('.')
from accuracy import head_accuracy # noqa
from dataset import EncoderDataset # noqa
from lstm import Encoder # noqa
from utils import AverageMeter, dict2str, save_checkpoint # noqa
from utils.setup_logger import setup_logger # noqa
def parse_args():
    """Build and parse the command-line options for encoder training."""
    arg_parser = argparse.ArgumentParser(
        description='Train the language encoder')
    # mode
    arg_parser.add_argument('--debug', type=int, default=0)
    # training
    arg_parser.add_argument('--batch_size', type=int, default=2048)
    arg_parser.add_argument('--val_batch', type=int, default=1024)
    # learning rate scheme
    arg_parser.add_argument('--num_epochs', type=int, default=20)
    arg_parser.add_argument('--lr', type=float, default=1e-3)
    arg_parser.add_argument('--weight_decay', type=float, default=0)
    # LSTM hyperparameter
    arg_parser.add_argument('--word_embedding_dim', type=int, default=300)
    arg_parser.add_argument('--text_embed_size', type=int, default=1024)
    arg_parser.add_argument('--linear_hidden_size', type=int, default=256)
    arg_parser.add_argument('--linear_dropout_rate', type=float, default=0)
    # input directories
    arg_parser.add_argument(
        '--vocab_file', required=True, type=str, help='path to vocab file.')
    arg_parser.add_argument(
        '--metadata_file',
        default='./templates/metadata_fsm.json',
        type=str,
        help='path to metadata file.')
    arg_parser.add_argument(
        '--train_set_dir', required=True, type=str, help='path to train data.')
    arg_parser.add_argument(
        '--val_set_dir', required=True, type=str, help='path to val data.')
    # output directories
    arg_parser.add_argument(
        '--work_dir',
        required=True,
        type=str,
        help='path to save checkpoint and log files.')
    # misc
    arg_parser.add_argument(
        '--unlabeled_value',
        default=999,
        type=int,
        help='value to represent unlabeled value')
    arg_parser.add_argument('--num_workers', default=8, type=int)
    return arg_parser.parse_args()
best_val_acc, best_epoch, current_iters = 0, 0, 0
def main():
    """Train the multi-head language encoder end to end.

    Pipeline: parse CLI args -> set up logging -> load attribute
    metadata and vocabulary -> build the LSTM encoder -> build
    train/val dataloaders -> loop over epochs, checkpointing every
    epoch and tracking the best overall validation accuracy.
    """
    # ################### Set Up #######################
    global args, best_val_acc, best_epoch
    args = parse_args()
    logger = setup_logger(
        args.work_dir, logger_name='train.txt', debug=args.debug)
    # NOTE(review): assumes CUDA is available — there is no CPU fallback.
    args.device = torch.device('cuda')
    logger.info('Saving arguments.')
    logger.info(dict2str(args.__dict__))
    # ################### Metadata #######################
    with open(args.metadata_file, 'r') as f:
        args.metadata = json.load(f)
    # One classification head per attribute in the metadata file.
    args.num_head = len(args.metadata.items())
    logger.info(f'args.num_head: {args.num_head}, ')
    logger.info(f'args.metadata: {args.metadata}.')
    # ################### Language Encoder #######################
    # load vocab file
    with open(args.vocab_file, 'r') as f:
        vocab = json.load(f)
    text_token_to_idx = vocab['text_token_to_idx']
    encoder = Encoder(
        token_to_idx=text_token_to_idx,
        word_embedding_dim=args.word_embedding_dim,
        text_embed_size=args.text_embed_size,
        metadata_file=args.metadata_file,
        linear_hidden_size=args.linear_hidden_size,
        linear_dropout_rate=args.linear_dropout_rate)
    encoder = encoder.to(args.device)
    # ################### DataLoader #######################
    logger.info('Preparing train_dataset')
    train_dataset = EncoderDataset(preprocessed_dir=args.train_set_dir)
    logger.info('Preparing train_loader')
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        pin_memory=False,
        sampler=None)
    logger.info('Preparing val_dataset')
    val_dataset = EncoderDataset(preprocessed_dir=args.val_set_dir)
    logger.info('Preparing val_loader')
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.val_batch,
        shuffle=False,
        num_workers=args.num_workers,
        pin_memory=False)
    logger.info(f'Number of train text: {len(train_dataset)}, '
                f'Number of val text: {len(val_dataset)}.')
    data_loader = {
        'train': train_loader,
        'val': val_loader,
    }
    # ################### Optimizer #######################
    optimizer = torch.optim.Adam(
        encoder.parameters(), args.lr, weight_decay=args.weight_decay)
    # ################### Loss Function #######################
    # Samples labeled with args.unlabeled_value are ignored by the loss.
    criterion = nn.CrossEntropyLoss(
        reduction='mean', ignore_index=args.unlabeled_value)
    # ################### Epochs #######################
    for epoch in range(args.num_epochs):
        logger.info(
            '----------- Training: Epoch '
            f'({epoch + 1} / {args.num_epochs}), LR: {args.lr:.4f}. ---------'
        )
        train_per_head_acc_avg, train_overall_acc = train(
            args,
            'train',
            encoder,
            data_loader['train'],
            criterion,
            optimizer,
            logger,
        )
        logger.info(
            'Train accuracy '
            f'({epoch + 1} / {args.num_epochs}), '
            f'{[str(round(i, 2))+"%" for i in train_per_head_acc_avg]}')
        # Validation pass reuses train() with phase='val' (no grad updates).
        val_per_head_acc_avg, val_overall_acc = train(
            args,
            'val',
            encoder,
            data_loader['val'],
            criterion,
            optimizer,
            logger,
        )
        logger.info('Validation accuracy '
                    f'({epoch + 1} / {args.num_epochs}), '
                    f'{[str(round(i, 2))+"%" for i in val_per_head_acc_avg]}')
        # whether this epoch has the highest val acc so far
        is_best = val_overall_acc > best_val_acc
        if is_best:
            best_epoch = epoch + 1
            best_val_acc = val_overall_acc
        logger.info(
            f'Best Epoch: {best_epoch}, best acc: {best_val_acc: .4f}.')
        # A checkpoint is written every epoch; the best one is duplicated
        # as model_best.pth.tar by save_checkpoint when is_best is True.
        save_checkpoint(
            args, {
                'epoch': epoch + 1,
                'best_epoch_so_far': best_epoch,
                'state_dict': encoder.state_dict(),
                'best_val_acc': best_val_acc,
                'optimizer': optimizer.state_dict(),
            },
            is_best,
            checkpoint=args.work_dir)
    logger.info('successful')
def train(args, phase, encoder, data_loader, criterion, optimizer, logger):
    """Run a single epoch over `data_loader`.

    Args:
        args: parsed options; reads `device`, `metadata`, `num_head`
            and `unlabeled_value`.
        phase (str): 'train' enables gradient updates; any other value
            runs evaluation under ``torch.no_grad()``.
        encoder: the multi-head language encoder.
        data_loader: yields (text, system_mode, labels) batches.
        criterion: per-head loss (CrossEntropyLoss ignoring unlabeled).
        optimizer: used only when phase == 'train'.
        logger: progress logger.

    Returns:
        tuple: (per_head_acc_avg, overall_acc) — a list with the average
        accuracy (%) of each head, and their unweighted mean.
    """
    if phase == 'train':
        encoder.train()
    else:
        encoder.eval()
    # record time
    batch_time = AverageMeter()
    data_time = AverageMeter()
    end = time.time()
    # record accuracy
    per_head_acc_list = [AverageMeter() for _ in range(args.num_head)]
    for batch_idx, batch_data in enumerate(data_loader):
        data_time.update(time.time() - end)
        text, system_mode, labels = batch_data
        text = text.to(args.device)
        # NOTE(review): system_mode is moved to device but not used below.
        system_mode = system_mode.to(args.device)
        labels = labels.to(args.device)
        if phase == 'train':
            output = encoder(text)
        else:
            with torch.no_grad():
                output = encoder(text)
        loss_list = []
        # Labels: one loss/accuracy per attribute head; unlabeled samples
        # are excluded from both (ignore_index / unlabeled_value).
        for head_idx, (key, val) in enumerate(args.metadata.items()):
            loss = criterion(output[head_idx], labels[:, head_idx])
            loss_list.append(loss)
            acc_dict = head_accuracy(
                output=output[head_idx],
                target=labels[:, head_idx],
                unlabeled_value=args.unlabeled_value)
            acc = acc_dict['acc']
            labeled_count = int(acc_dict['labeled_count'])
            if labeled_count > 0:
                # Weight the running accuracy by the number of labeled
                # samples in this batch.
                per_head_acc_list[head_idx].update(acc, labeled_count)
        # Total loss is the mean over heads.
        loss_avg = sum(loss_list) / len(loss_list)
        if phase == 'train':
            optimizer.zero_grad()
            loss_avg.backward()
            optimizer.step()
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        logger.info(
            f'Batch: {batch_idx+1}, '
            f'Data time: {data_time.avg:.3f}s, Batch time: {batch_time.avg:.3f}s, '  # noqa
            f'loss: {loss_avg:.4f}.')
    # Aggregate: unweighted mean of the per-head running averages.
    overall_acc = 0
    per_head_acc_avg = []
    for head_idx in range(args.num_head):
        per_head_acc_avg.append(per_head_acc_list[head_idx].avg)
        overall_acc += per_head_acc_list[head_idx].avg
    overall_acc = overall_acc / args.num_head
    return per_head_acc_avg, overall_acc
# Script entry point.
if __name__ == '__main__':
    main()
================================================
FILE: language/utils/__init__.py
================================================
"""Useful utils
"""
# progress bar
import os
import sys
from .eval import * # noqa
from .logger import * # noqa
from .lr_schedule import * # noqa
from .misc import * # noqa
from .numerical import * # noqa
from .visualize import * # noqa
sys.path.append(os.path.join(os.path.dirname(__file__), "progress"))
from progress.bar import Bar as Bar # noqa
================================================
FILE: language/utils/eval.py
================================================
from __future__ import absolute_import, print_function
import torch
__all__ = ['classification_accuracy', 'regression_accuracy']
def classification_accuracy(output,
                            target,
                            class_wise=False,
                            num_cls=6,
                            excluded_cls_idx=None):
    """Compute top-1 classification accuracy for one attribute head.

    Args:
        output (Tensor): batch_size x num_cls class scores.
        target (Tensor): batch_size ground-truth class indices.
        class_wise (bool): if True, additionally report per-class accuracy.
        num_cls (int): number of classes scanned for class-wise stats
            (the actual number of classes may be smaller).
        excluded_cls_idx (int | None): label value marking unlabeled
            samples; those are excluded from 'labeled_acc'.

    Returns:
        dict: keys 'acc', 'labeled_acc', 'labeled_count' and, when
        `class_wise`, 'acc_class_wise' and 'per_class_count'. Counts are
        plain ints in every branch (previously one branch returned a
        tensor and the other an int).
    """
    with torch.no_grad():
        batch_size = target.size(0)
        # Predicted class = index of the largest score per row.
        _, pred = output.topk(1, 1, True, True)
        pred = pred.reshape(-1)
        acc = float(torch.sum(pred == target)) / float(batch_size) * 100
        return_dict = {'acc': acc}
        if excluded_cls_idx is not None:
            labeled_mask = target != excluded_cls_idx
            # `&` (logical and of boolean masks) instead of `*`.
            correct_count = int(torch.sum((pred == target) & labeled_mask))
            labeled_count = int(torch.sum(labeled_mask))
            if labeled_count:
                labeled_acc = float(correct_count) / float(labeled_count) * 100
            else:
                # No labeled sample in this batch: report 0 rather than NaN.
                labeled_acc = 0
            return_dict['labeled_acc'] = labeled_acc
            return_dict['labeled_count'] = labeled_count
        else:
            return_dict['labeled_acc'] = acc
            return_dict['labeled_count'] = batch_size
        if class_wise:
            acc_class_wise = []
            per_class_count = []
            for i in range(num_cls):
                total_sample_cls_i = int(torch.sum(target == i))
                if total_sample_cls_i:
                    correct_samples_cls_i = int(
                        torch.sum((pred == i) & (target == i)))
                    acc_class_wise.append(
                        float(correct_samples_cls_i) /
                        float(total_sample_cls_i) * 100)
                else:
                    # Absent classes get 0% rather than a division by zero.
                    acc_class_wise.append(0)
                per_class_count.append(total_sample_cls_i)
            return_dict['acc_class_wise'] = acc_class_wise
            return_dict['per_class_count'] = per_class_count
        return return_dict
def regression_accuracy(output,
                        target,
                        margin=0.2,
                        uni_neg=True,
                        class_wise=False,
                        num_cls=6,
                        excluded_cls_idx=None,
                        max_cls_value=5):
    """Compute regression accuracy with a tolerance margin.

    A prediction is correct when it lies within `margin` of the
    ground-truth score. With `uni_neg`, predictions near the extremes
    (<= margin for class 0, >= max_cls_value - margin for the top class)
    are first snapped to the extreme class.

    Args:
        output (Tensor): predicted scores (any shape; flattened).
        target (Tensor): ground-truth integer scores, same length.
        margin (float): tolerance around the ground-truth score.
        uni_neg (bool): snap borderline extreme predictions.
        class_wise (bool): if True, also report per-class accuracy.
        num_cls (int): number of classes scanned for class-wise stats.
        excluded_cls_idx (int | None): label value marking unlabeled
            samples, excluded from 'labeled_acc'.
        max_cls_value (int): value of the top class (for uni_neg).

    Returns:
        dict: keys 'acc', 'labeled_acc', 'labeled_count' and, when
        `class_wise`, 'acc_class_wise' and 'per_class_count'.
    """
    output = output.clone().reshape(-1)
    if uni_neg:
        # Snap borderline predictions onto the extreme classes so they
        # count as exact matches below.
        output[(output <= 0 + margin) & (target == 0)] = 0
        output[(output >= max_cls_value - margin) &
               (target == max_cls_value)] = max_cls_value
    distance = torch.absolute(target - output)
    distance = distance - margin
    predicted_class = torch.zeros_like(target)
    # Within margin: adopt the ground-truth class (counts as correct).
    predicted_class[distance <= 0] = target[distance <= 0]
    # Outside margin: assign an invalid class so it never matches.
    predicted_class[distance > 0] = -1
    acc = float(torch.sum(predicted_class == target)) / float(
        target.size(0)) * 100
    return_dict = {'acc': acc}
    if excluded_cls_idx is not None:
        labeled_mask = target != excluded_cls_idx
        correct_count = int(
            torch.sum((predicted_class == target) & labeled_mask))
        labeled_count = int(torch.sum(labeled_mask))
        if labeled_count:
            labeled_acc = float(correct_count) / float(labeled_count) * 100
        else:
            # No labeled sample: report 0 rather than NaN.
            labeled_acc = 0
        return_dict['labeled_acc'] = labeled_acc
        return_dict['labeled_count'] = labeled_count
    else:
        # (A dead `labeled_acc = acc` local was removed here.)
        return_dict['labeled_acc'] = acc
        return_dict['labeled_count'] = target.size(0)
    if class_wise:
        acc_class_wise = []
        per_class_count = []
        for i in range(num_cls):
            total_sample_cls_i = int(torch.sum(target == i))
            if total_sample_cls_i:
                correct_samples_cls_i = int(
                    torch.sum((predicted_class == i) & (target == i)))
                acc_class_wise.append(
                    float(correct_samples_cls_i) / float(total_sample_cls_i) *
                    100)
            else:
                acc_class_wise.append(0)
            per_class_count.append(total_sample_cls_i)
        return_dict['acc_class_wise'] = acc_class_wise
        return_dict['per_class_count'] = per_class_count
    return return_dict
def main():
    """Smoke-test regression_accuracy on a small hand-crafted batch.

    Bug fix: regression_accuracy returns a dict, so results must be
    read out by key — the previous tuple-unpacking of the dict raised
    a ValueError at runtime.
    """
    l1 = [
        0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 2, 2, 2, 1.7, 0, 3, 3, 2.79, 3.3, 0, 4,
        2, 5, 3, 0, 6, 6, 4.78, 6, 0
    ]
    l2 = [
        0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4,
        4, 5, 5, 5, 5, 5
    ]
    output = torch.FloatTensor(l1)
    target = torch.LongTensor(l2)
    result = regression_accuracy(output, target, margin=0.2)
    print('acc:', result['acc'])
    print()
    result = regression_accuracy(output, target, margin=0.2, class_wise=True)
    print('acc:', result['acc'])
    print('acc_class_wise_list:', result['acc_class_wise'])
    print('per_class_count: ', result['per_class_count'])


if __name__ == '__main__':
    main()
================================================
FILE: language/utils/logger.py
================================================
from __future__ import absolute_import
import datetime
import logging
import time
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
import numpy as np
# from mmcv.runner import get_dist_info, master_only
__all__ = [
'Logger', 'LoggerMonitor', 'savefig', 'MessageLogger', 'init_tb_logger',
'get_root_logger', 'dict2str'
]
def savefig(fname, dpi=None):
    """Write the current matplotlib figure to `fname` (default 150 dpi)."""
    plt.savefig(fname, dpi=dpi if dpi is not None else 150)
def plot_overlap(logger, names=None):
    """Plot each named metric series from `logger` onto the current axes.

    Returns the legend labels, one per plotted series.
    """
    if names is None:
        names = logger.names
    numbers = logger.numbers
    for name in names:
        series = np.asarray(numbers[name])
        plt.plot(np.arange(len(series)), series)
    return [logger.title + '(' + name + ')' for name in names]
class Logger(object):
    '''Save training process to log file with simple plot function.

    Writes tab-separated metric rows to `fpath`; with `resume=True`,
    previously written rows are read back into `self.numbers` and the
    file is reopened for appending.
    '''

    def __init__(self, fpath, title=None, resume=False):
        self.file = None
        self.resume = resume
        self.title = '' if title is None else title
        if fpath is not None:
            if resume:
                # Re-read the existing log: the first line holds the
                # metric names, remaining lines hold one value per metric.
                self.file = open(fpath, 'r')
                name = self.file.readline()
                self.names = name.rstrip().split('\t')
                self.numbers = {}
                for _, name in enumerate(self.names):
                    self.numbers[name] = []
                for numbers in self.file:
                    numbers = numbers.rstrip().split('\t')
                    for i in range(0, len(numbers)):
                        # Values are kept as the strings read from disk.
                        self.numbers[self.names[i]].append(numbers[i])
                self.file.close()
                self.file = open(fpath, 'a')
            else:
                self.file = open(fpath, 'w')

    def set_names(self, names):
        """Write the header row and initialize per-metric storage."""
        if self.resume:
            pass
        # initialize numbers as empty list
        self.numbers = {}
        self.names = names
        for _, name in enumerate(self.names):
            self.file.write(name)
            self.file.write('\t')
            self.numbers[name] = []
        self.file.write('\n')
        self.file.flush()

    def append(self, numbers):
        """Append one row of values; length must match the header."""
        assert len(self.names) == len(numbers), 'Numbers do not match names'
        for index, num in enumerate(numbers):
            if type(num) == int:
                self.file.write(str(num))
            elif type(num) == float:
                # Floats are written with 6 decimal places.
                self.file.write("{0:.6f}".format(num))
            else:  # str
                self.file.write(str(num))
            self.file.write('\t')
            self.numbers[self.names[index]].append(num)
        self.file.write('\n')
        self.file.flush()

    def plot(self, out_file, names=None):
        """Plot the recorded series (all names by default) to `out_file`."""
        names = self.names if names is None else names
        numbers = self.numbers
        fig, ax = plt.subplots(1, 1)
        for _, name in enumerate(names):
            x = np.arange(len(numbers[name]))
            ax.plot(x, numbers[name])
            # whether add data labels to each point in the plot
            # (disabled; flip the constant below to enable)
            if False:
                for i in range(len(x)):
                    y = numbers[name][i]
                    if type(y) == int or type(y) == float:
                        text = round(y, 2)
                    else:
                        text = y
                    ax.text(x[i], y, text)
        ax.legend([self.title + '(' + name + ')' for name in names])
        # this locator puts ticks at regular intervals
        loc = plticker.MultipleLocator(
            base=1.0
        )
        ax.xaxis.set_major_locator(loc)
        ax.grid(True)
        plt.savefig(out_file)
        plt.close()

    def close(self):
        if self.file is not None:
            self.file.close()

    def get_numbers(self):
        """Return a {name: list of recorded values} snapshot."""
        stats = {}
        for name in self.names:
            stats[name] = self.numbers[name]
        return stats
class LoggerMonitor(object):
    '''Load and visualize multiple logs.'''

    def __init__(self, paths):
        '''`paths` is a dictionary with {name: filepath} pairs.'''
        self.loggers = []
        for title, path in paths.items():
            # resume=True re-reads each log's recorded values from disk.
            logger = Logger(path, title=title, resume=True)
            self.loggers.append(logger)

    def plot(self, names=None):
        """Overlay the selected series from every loaded log."""
        plt.figure()
        plt.subplot(121)
        legend_text = []
        for logger in self.loggers:
            legend_text += plot_overlap(logger, names)
        plt.legend(
            legend_text, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        plt.grid(True)
class MessageLogger():
    """Message logger for printing.

    Args:
        opt (dict): Config. It contains the following keys:
            name (str): Exp name.
            logger (dict): Contains 'print_freq' (str) for logger interval.
            train (dict): Contains 'niter' (int) for total iters.
            use_tb_logger (bool): Use tensorboard logger.
        start_iter (int): Start iter. Default: 1.
        tb_logger (obj:`tb_logger`): Tensorboard logger. Default: None.
    """

    def __init__(self, opt, start_iter=1, tb_logger=None):
        self.exp_name = opt['name']
        self.interval = opt['logger']['print_freq']
        self.start_iter = start_iter
        self.max_iters = opt['train']['niter']
        self.use_tb_logger = opt['use_tb_logger']
        self.tb_logger = tb_logger
        self.start_time = time.time()
        self.logger = get_root_logger()

    # @master_only
    def __call__(self, log_vars):
        """Format logging message.

        Args:
            log_vars (dict): It contains the following keys:
                epoch (int): Epoch number.
                iter (int): Current iter.
                lrs (list): List for learning rates.
                time (float): Iter time.
                data_time (float): Data time for each iter.
                Any remaining entries are treated as scalar metrics
                (e.g. losses) and appended to the message.
        """
        # epoch, iter, learning rates
        epoch = log_vars.pop('epoch')
        current_iter = log_vars.pop('iter')
        lrs = log_vars.pop('lrs')
        message = (f'[{self.exp_name[:5]}..][epoch:{epoch:3d}, '
                   f'iter:{current_iter:8,d}, lr:(')
        for v in lrs:
            message += f'{v:.3e},'
        message += ')] '
        # time and estimated time
        if 'time' in log_vars.keys():
            iter_time = log_vars.pop('time')
            data_time = log_vars.pop('data_time')
            total_time = time.time() - self.start_time
            # Average seconds per iter since this logger was constructed.
            time_sec_avg = total_time / (current_iter - self.start_iter + 1)
            eta_sec = time_sec_avg * (self.max_iters - current_iter - 1)
            eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
            message += f'[eta: {eta_str}, '
            message += f'time: {iter_time:.3f}, data_time: {data_time:.3f}] '
        # other items, especially losses
        for k, v in log_vars.items():
            message += f'{k}: {v:.4e} '
            # tensorboard logger (skipped for 'debug' experiments)
            if self.use_tb_logger and 'debug' not in self.exp_name:
                self.tb_logger.add_scalar(k, v, current_iter)
        self.logger.info(message)
# @master_only
def init_tb_logger(log_dir):
    """Create a TensorBoard SummaryWriter writing to `log_dir`."""
    # Imported lazily so tensorboard is only required when actually used.
    from torch.utils.tensorboard import SummaryWriter
    tb_logger = SummaryWriter(log_dir=log_dir)
    return tb_logger
def get_root_logger(logger_name='base', log_level=logging.INFO, log_file=None):
    """Get the root logger.

    The logger will be initialized if it has not been initialized. By
    default a StreamHandler will be added. If `log_file` is specified, a
    FileHandler will also be added.

    Args:
        logger_name (str): root logger name. Default: base.
        log_level (int): The root logger level. Note that only the process
            of rank 0 is affected, while other processes will set the level
            to "Error" and be silent most of the time.
        log_file (str | None): The log filename. If specified, a
            FileHandler will be added to the root logger.

    Returns:
        logging.Logger: The root logger.
    """
    logger = logging.getLogger(logger_name)
    # Already configured — reuse as-is.
    if logger.hasHandlers():
        return logger
    fmt = '%(asctime)s.%(msecs)03d - %(levelname)s: %(message)s'
    logging.basicConfig(format=fmt, level=log_level)
    if log_file is None:
        return logger
    handler = logging.FileHandler(log_file, 'w')
    handler.setFormatter(logging.Formatter(fmt))
    handler.setLevel(log_level)
    logger.addHandler(handler)
    return logger
def dict2str(opt, indent_level=1):
    """Render an options dict as an indented, printable string.

    Args:
        opt (dict): Option dict; nested dicts are rendered recursively
            inside `key:[ ... ]` brackets.
        indent_level (int): Indent level (two spaces each). Default: 1.

    Return:
        (str): Option string for printing.
    """
    pad = ' ' * (indent_level * 2)
    pieces = []
    for k, v in opt.items():
        if isinstance(v, dict):
            pieces.append(pad + k + ':[\n')
            pieces.append(dict2str(v, indent_level + 1))
            pieces.append(pad + ']\n')
        else:
            pieces.append(pad + k + ': ' + str(v) + '\n')
    return ''.join(pieces)
================================================
FILE: language/utils/lr_schedule.py
================================================
import math
__all__ = ['adjust_learning_rate']
def adjust_learning_rate(args, optimizer, epoch):
    """
    Sets the learning rate to the initial LR decayed following a schedule.

    (Fix: this text used to sit after the first statement, where it was a
    no-op string expression rather than the function's docstring.)

    Args:
        args: namespace with `lr_decay` (one of 'step', 'cos', 'linear',
            'linear2exp', 'schedule'), `lr`, and the mode-specific fields
            `gamma`, `step`, `epochs`, `turning_point`, `schedule`.
        optimizer: optimizer whose param_groups get the new rate.
        epoch (int): current epoch index.

    Returns:
        float: the learning rate that was applied.

    Raises:
        ValueError: if `args.lr_decay` is not a known mode.
    """
    # Current rate; 'linear2exp' and 'schedule' decay multiplicatively
    # from it instead of recomputing from args.lr.
    lr = optimizer.param_groups[0]['lr']
    if args.lr_decay == 'step':
        lr = args.lr * (args.gamma**(epoch // args.step))
    elif args.lr_decay == 'cos':
        lr = args.lr * (1 + math.cos(math.pi * epoch / args.epochs)) / 2
    elif args.lr_decay == 'linear':
        lr = args.lr * (1 - epoch / args.epochs)
    elif args.lr_decay == 'linear2exp':
        if epoch < args.turning_point + 1:
            # learning rate decay as 95%
            # at the turning point (1 / 95% = 1.0526)
            lr = args.lr * (1 - epoch / int(args.turning_point * 1.0526))
        else:
            lr *= args.gamma
    elif args.lr_decay == 'schedule':
        if epoch in args.schedule:
            lr *= args.gamma
    else:
        raise ValueError('Unknown lr mode {}'.format(args.lr_decay))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr
================================================
FILE: language/utils/misc.py
================================================
'''Some helper functions for PyTorch, including:
- get_mean_and_std: calculate the mean and std value of dataset.
- msr_init: net parameter initialization.
- progress_bar: progress bar mimic xlua.progress.
'''
import errno
import os
import torch
import torch.nn as nn
import torch.nn.init as init
__all__ = [
'get_mean_and_std', 'init_params', 'mkdir_p', 'save_checkpoint',
'AverageMeter'
]
def get_mean_and_std(dataset):
    '''Compute the per-channel mean and std of an image dataset.

    Iterates one image at a time, averaging each image's per-channel
    mean/std over the dataset (fix: removed the redundant
    `dataloader = trainloader = ...` double assignment).

    Args:
        dataset: a torch Dataset yielding (image, target) pairs with
            images shaped (3, H, W).

    Returns:
        tuple: (mean, std) — two length-3 float tensors.
    '''
    dataloader = torch.utils.data.DataLoader(
        dataset, batch_size=1, shuffle=True, num_workers=2)
    mean = torch.zeros(3)
    std = torch.zeros(3)
    print('==> Computing mean and std..')
    for inputs, targets in dataloader:
        for i in range(3):
            # batch_size=1, so each term is one image's channel statistic.
            mean[i] += inputs[:, i, :, :].mean()
            std[i] += inputs[:, i, :, :].std()
    mean.div_(len(dataset))
    std.div_(len(dataset))
    return mean, std
def init_params(net):
    '''Init layer parameters.

    Conv2d: Kaiming-normal weights; BatchNorm2d: weight 1 / bias 0;
    Linear: small normal weights. All biases are zeroed.

    Fixes:
    - `if m.bias:` raised a RuntimeError on multi-element bias tensors
      (truthiness of a tensor is ambiguous); use `is not None`.
    - Use the in-place `_` variants (`kaiming_normal_`, `constant_`,
      `normal_`); the old names are deprecated aliases.
    '''
    for m in net.modules():
        if isinstance(m, nn.Conv2d):
            init.kaiming_normal_(m.weight, mode='fan_out')
            if m.bias is not None:
                init.constant_(m.bias, 0)
        elif isinstance(m, nn.BatchNorm2d):
            init.constant_(m.weight, 1)
            init.constant_(m.bias, 0)
        elif isinstance(m, nn.Linear):
            init.normal_(m.weight, std=1e-3)
            if m.bias is not None:
                init.constant_(m.bias, 0)
def mkdir_p(path):
    '''Create `path` (with parents), ignoring "already exists" for dirs.

    Mirrors `mkdir -p`: succeeds silently when the directory already
    exists; any other OSError (including `path` existing as a file)
    is re-raised.
    '''
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if not (exc.errno == errno.EEXIST and os.path.isdir(path)):
            raise
def save_checkpoint(args,
                    state,
                    is_best,
                    checkpoint='checkpoint',
                    filename='checkpoint.pth.tar'):
    """Persist a training `state` dict under <checkpoint>/checkpoints/.

    Saves a per-epoch file 'checkpoint_<epoch>.pth.tar' and, when
    `is_best`, an additional copy as 'model_best.pth.tar'.

    Fix: the directory is now created under `checkpoint` — where files
    are actually written — instead of `args.work_dir`; the old code
    crashed whenever the two differed.

    Args:
        args: kept for interface compatibility (no longer read).
        state (dict): must contain an integer 'epoch' entry.
        is_best (bool): whether this epoch has the best val accuracy.
        checkpoint (str): root directory for the 'checkpoints' subdir.
        filename (str): used only if per-epoch saving were disabled.
    """
    epoch = str(state['epoch']).zfill(2)
    save_every_epoch = True
    ckpt_dir = os.path.join(checkpoint, 'checkpoints')
    os.makedirs(ckpt_dir, exist_ok=True)
    if save_every_epoch:
        filename = 'checkpoint_' + epoch + '.pth.tar'
    torch.save(state, os.path.join(ckpt_dir, filename))
    if is_best:
        torch.save(state, os.path.join(ckpt_dir, 'model_best.pth.tar'))
class AverageMeter(object):
    """Maintain a weighted running average of a scalar metric.

    Imported from
    https://github.com/pytorch/examples/blob/master/imagenet/main.py#L247-L262
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all accumulated statistics."""
        self.val = 0
        self.avg = 0    # running average = running sum / running count
        self.sum = 0    # running sum
        self.count = 0  # running count

    def update(self, val, n=1):
        """Fold in `val`, weighted by `n` observations (e.g. batch size)."""
        self.sum = self.sum + val * n
        self.count = self.count + n
        # Average over everything observed since the last reset().
        self.avg = self.sum / self.count
================================================
FILE: language/utils/numerical.py
================================================
import json
import numpy as np
__all__ = ['get_weight', 'transpose_and_format']
def get_weight(args):
    """
    read the attribute class distribution file stats.txt and return the counts

    Args:
        args: namespace with `stats_file` (path), `num_attr` (int) and
            `gt_remapping` (falsy, or per-attribute class-index remap).

    Returns:
        dict: {'value_weights': [one normalized inverse-frequency weight
        vector (numpy array) per attribute]}.
    """
    # read counts from stats file
    stats_f = open(args.stats_file, "r")
    # each list [] in the count_list is for one attribute
    # each value in [] is the number of training samples
    # for that attribute value
    count_list = []
    for i in range(args.num_attr):
        count_list.append([])
    for row_idx, row in enumerate(stats_f):
        # row 0 is attr names, row 1 is unlabeled statistics
        if row_idx == 0 or row_idx == 1:
            continue
        # [:-1] because the last value is the new line character
        row = row.split(' ')[:-1]
        for new_idx_in_row, attr_val in enumerate(row):
            # column 0 is the class-value label, not a count — skip it
            if new_idx_in_row == 0:
                continue
            new_idx = new_idx_in_row - 1
            count_list[new_idx].append((int(attr_val)))  # **0.5)
    # weight for gt_remapping case
    count_list = np.array(count_list)
    num_attr = count_list.shape[0]
    num_cls = count_list.shape[1]
    if args.gt_remapping:
        # Merge counts of classes that the ground-truth remapping folds
        # together before computing weights.
        remap_count_list = np.zeros((num_attr, num_cls))
        for attr_idx in range(num_attr):
            for cls_idx in range(num_cls):
                new_cls_idx = int(args.gt_remapping[attr_idx][cls_idx])
                remap_count_list[attr_idx][new_cls_idx] += count_list[
                    attr_idx][cls_idx]
        count_list = remap_count_list
    # For each attribute, among classes, weight Inversion and Normalization
    value_weights = []
    for attr_idx in range(num_attr):
        weight_l = np.zeros(num_cls)
        for cls_idx in range(num_cls):
            # Inverse frequency; classes with zero samples get weight 0.
            weight_l[cls_idx] = (1 / count_list[attr_idx][cls_idx]
                                 ) if count_list[attr_idx][cls_idx] else 0
        # normalize weight_l so that the weights sum to 1
        normalized_weight_l = np.zeros(num_cls)
        for cls_idx in range(num_cls):
            normalized_weight_l[cls_idx] = weight_l[cls_idx] / sum(weight_l)
        value_weights.append(normalized_weight_l)
    # Among attributes, weight Inversion and Normalization
    # count_sum_list = []
    # for a_list in count_list:
    #     count_sum_list.append(sum(a_list))
    # count_sum = sum(count_sum_list)
    # attribute_weights = []
    # for i in range(len(count_sum_list)):
    #     attribute_weight = count_sum / count_sum_list[i]
    #     attribute_weights.append(attribute_weight)
    # # normalize attribute_weights so that their average value is 1
    # normalized_attribute_weights = []
    # for i in range(len(attribute_weights)):
    #     normalized_attribute_weights.append(attribute_weights[i] /
    #                                         sum(attribute_weights) *
    #                                         len(attribute_weights))
    weights = {'value_weights': value_weights}
    return weights
def transpose_and_format(args, input):
    """
    Transpose per-attribute statistics into a per-class-value table.

    input = [
        [#, #, #, #, #, #],
        [#, #, #, #, #, #],
        [#, #, #, #, #, #]
    ]
    where outer loop is attribute
    inner loop is class labels
    new_f:
    attr_val Bangs Smiling Young
    0          #     #       #
    1          #     #       #
    2          #     #       #
    3          #     #       #
    4          #     #       #
    5          #     #       #
    """
    # attr_file maps attribute keys to the display names in the header.
    with open(args.attr_file, 'r') as f:
        attr_f = json.load(f)
    attr_info = attr_f['attr_info']
    attr_list = ['attr_val']
    for key, val in attr_info.items():
        attr_list.append(val["name"])
    # new_f stores the output
    new_f = []
    # first line is the header
    new_f.append(attr_list)
    for i in range(len(input[0])):
        row = []
        row.append(i)
        for j in range(args.num_attr):
            # .item(): values are assumed to be 0-d tensors / numpy
            # scalars — TODO confirm against callers.
            row.append(round(input[j][i].item(), 2))
            # row.append(round(input[j][i], 2))
        new_f.append(row)
    return new_f
================================================
FILE: language/utils/progress/.gitignore
================================================
================================================
FILE: language/utils/progress/LICENSE
================================================
# Copyright (c) 2012 Giorgos Verigakis
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
================================================
FILE: language/utils/progress/MANIFEST.in
================================================
include README.rst LICENSE
================================================
FILE: language/utils/progress/README.rst
================================================
Easy progress reporting for Python
==================================
|pypi|
|demo|
.. |pypi| image:: https://img.shields.io/pypi/v/progress.svg
.. |demo| image:: https://raw.github.com/verigak/progress/master/demo.gif
:alt: Demo
Bars
----
There are 7 progress bars to choose from:
- ``Bar``
- ``ChargingBar``
- ``FillingSquaresBar``
- ``FillingCirclesBar``
- ``IncrementalBar``
- ``PixelBar``
- ``ShadyBar``
To use them, just call ``next`` to advance and ``finish`` to finish:
.. code-block:: python
from progress.bar import Bar
bar = Bar('Processing', max=20)
for i in range(20):
# Do some work
bar.next()
bar.finish()
The result will be a bar like the following: ::
Processing |############# | 42/100
To simplify the common case where the work is done in an iterator, you can
use the ``iter`` method:
.. code-block:: python
for i in Bar('Processing').iter(it):
# Do some work
Progress bars are very customizable, you can change their width, their fill
character, their suffix and more:
.. code-block:: python
bar = Bar('Loading', fill='@', suffix='%(percent)d%%')
This will produce a bar like the following: ::
Loading |@@@@@@@@@@@@@ | 42%
You can use a number of template arguments in ``message`` and ``suffix``:
========== ================================
Name Value
========== ================================
index current value
max maximum value
remaining max - index
progress index / max
percent progress * 100
avg simple moving average time per item (in seconds)
elapsed elapsed time in seconds
elapsed_td elapsed as a timedelta (useful for printing as a string)
eta avg * remaining
eta_td eta as a timedelta (useful for printing as a string)
========== ================================
Instead of passing all configuration options on instantiation, you can create
your custom subclass:
.. code-block:: python
class FancyBar(Bar):
message = 'Loading'
fill = '*'
suffix = '%(percent).1f%% - %(eta)ds'
You can also override any of the arguments or create your own:
.. code-block:: python
class SlowBar(Bar):
suffix = '%(remaining_hours)d hours remaining'
@property
def remaining_hours(self):
return self.eta // 3600
Spinners
========
For actions with an unknown number of steps you can use a spinner:
.. code-block:: python
from progress.spinner import Spinner
spinner = Spinner('Loading ')
while state != 'FINISHED':
# Do some work
spinner.next()
There are 5 predefined spinners:
- ``Spinner``
- ``PieSpinner``
- ``MoonSpinner``
- ``LineSpinner``
- ``PixelSpinner``
Other
=====
There are a number of other classes available too, please check the source or
subclass one of them to create your own.
License
=======
progress is licensed under ISC
================================================
FILE: language/utils/progress/progress/__init__.py
================================================
# Copyright (c) 2012 Giorgos Verigakis
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
from __future__ import division
from collections import deque
from datetime import timedelta
from math import ceil
from sys import stderr
from time import time
__version__ = '1.3'
class Infinite(object):
    """Base progress tracker for tasks with an unknown number of steps.

    Keeps a step `index`, the start timestamp, and a simple-moving-average
    of seconds per step over the last `sma_window` updates.
    """
    file = stderr
    sma_window = 10  # Simple Moving Average window

    def __init__(self, *args, **kwargs):
        self.index = 0
        self.start_ts = time()
        self.avg = 0
        self._ts = self.start_ts
        self._xput = deque(maxlen=self.sma_window)
        # Any keyword argument becomes an attribute (message, width, ...).
        for key, val in kwargs.items():
            setattr(self, key, val)

    def __getitem__(self, key):
        # Underscore-prefixed attributes are private; unknown keys -> None.
        return None if key.startswith('_') else getattr(self, key, None)

    @property
    def elapsed(self):
        """Whole seconds since construction."""
        return int(time() - self.start_ts)

    @property
    def elapsed_td(self):
        return timedelta(seconds=self.elapsed)

    def update_avg(self, n, dt):
        """Fold `dt` seconds for `n` steps into the moving average."""
        if n > 0:
            self._xput.append(dt / n)
            self.avg = sum(self._xput) / len(self._xput)

    def update(self):
        pass

    def start(self):
        pass

    def finish(self):
        pass

    def next(self, n=1):
        """Advance by `n` steps and refresh timing statistics."""
        now = time()
        delta = now - self._ts
        self.update_avg(n, delta)
        self._ts = now
        self.index += n
        self.update()

    def iter(self, it):
        """Yield from `it`, advancing once per item; finish() at the end."""
        try:
            for item in it:
                yield item
                self.next()
        finally:
            self.finish()
class Progress(Infinite):
def __init__(self, *args, **kwargs):
super(Progress, self).__init__(*args, **kwargs)
self.max = kwargs.get('max', 100)
@property
def eta(self):
return int(ceil(self.avg * self.remaining))
@property
def eta_td(self):
return timedelta(seconds=self.eta)
@property
def percent(self):
return self.progress * 100
@property
def progress(self):
return min(1, self.index / self.max)
@property
def remaining(self):
return max(self.max - self.index, 0)
def start(self):
self.update()
def goto(self, index):
incr = index - self.index
self.next(incr)
def iter(self, it):
try:
self.max = len(it)
except TypeError:
pass
try:
for x in it:
yield x
self.next()
finally:
self.finish()
================================================
FILE: language/utils/progress/progress/bar.py
================================================
# -*- coding: utf-8 -*-
# Copyright (c) 2012 Giorgos Verigakis
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
from __future__ import unicode_literals
from . import Progress
from .helpers import WritelnMixin
class Bar(WritelnMixin, Progress):
    """Classic progress bar: ``message |#####     | suffix``."""

    width = 32
    message = ''
    suffix = '%(index)d/%(max)d'
    bar_prefix = ' |'
    bar_suffix = '| '
    empty_fill = ' '
    fill = '#'
    hide_cursor = True

    def update(self):
        # Filled cell count is proportional to overall progress.
        done = int(self.width * self.progress)
        todo = self.width - done
        pieces = [
            self.message % self,
            self.bar_prefix,
            self.fill * done,
            self.empty_fill * todo,
            self.bar_suffix,
            self.suffix % self,
        ]
        self.writeln(''.join(pieces))
class ChargingBar(Bar):
    # Bar variant without pipe delimiters; shows a percentage suffix.
    suffix = '%(percent)d%%'
    bar_prefix = ' '
    bar_suffix = ' '
    empty_fill = '∙'
    fill = '█'


class FillingSquaresBar(ChargingBar):
    # ChargingBar drawn with square glyphs.
    empty_fill = '▢'
    fill = '▣'


class FillingCirclesBar(ChargingBar):
    # ChargingBar drawn with circle glyphs.
    empty_fill = '◯'
    fill = '◉'
class IncrementalBar(Bar):
    """Bar whose boundary cell renders sub-character progress via `phases`."""

    phases = (' ', '▏', '▎', '▍', '▌', '▋', '▊', '▉', '█')

    def update(self):
        nphases = len(self.phases)
        exact = self.width * self.progress
        nfull = int(exact)  # Number of full chars
        # The fractional remainder selects the partial glyph for the
        # boundary cell.
        phase = int((exact - nfull) * nphases)  # Phase of last char
        current = self.phases[phase] if phase > 0 else ''
        nempty = self.width - nfull  # Number of empty chars
        pieces = [
            self.message % self,
            self.bar_prefix,
            self.phases[-1] * nfull,
            current,
            self.empty_fill * max(0, nempty - len(current)),
            self.bar_suffix,
            self.suffix % self,
        ]
        self.writeln(''.join(pieces))
class PixelBar(IncrementalBar):
    # IncrementalBar drawn with braille-dot glyphs.
    phases = ('⡀', '⡄', '⡆', '⡇', '⣇', '⣧', '⣷', '⣿')


class ShadyBar(IncrementalBar):
    # IncrementalBar drawn with shading glyphs.
    phases = (' ', '░', '▒', '▓', '█')
================================================
FILE: language/utils/progress/progress/counter.py
================================================
# -*- coding: utf-8 -*-
# Copyright (c) 2012 Giorgos Verigakis
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
from __future__ import unicode_literals
from . import Infinite, Progress
from .helpers import WriteMixin
class Counter(WriteMixin, Infinite):
    # Renders the running index as a plain number.
    message = ''
    hide_cursor = True

    def update(self):
        self.write(str(self.index))


class Countdown(WriteMixin, Progress):
    # Renders the number of steps still remaining.
    hide_cursor = True

    def update(self):
        self.write(str(self.remaining))


class Stack(WriteMixin, Progress):
    # Renders progress as a single vertical-fill block glyph.
    phases = (' ', '▁', '▂', '▃', '▄', '▅', '▆', '▇', '█')
    hide_cursor = True

    def update(self):
        nphases = len(self.phases)
        # Map progress in [0, 1] to a phase index, clamped to the last glyph.
        i = min(nphases - 1, int(self.progress * nphases))
        self.write(self.phases[i])


class Pie(Stack):
    # Same idea as Stack but with pie-chart glyphs.
    phases = ('○', '◔', '◑', '◕', '●')
================================================
FILE: language/utils/progress/progress/helpers.py
================================================
# Copyright (c) 2012 Giorgos Verigakis
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
from __future__ import print_function
HIDE_CURSOR = '\x1b[?25l'
SHOW_CURSOR = '\x1b[?25h'
class WriteMixin(object):
    """Mixin that repaints a short status string in place on a TTY."""

    hide_cursor = False

    def __init__(self, message=None, **kwargs):
        super(WriteMixin, self).__init__(**kwargs)
        self._width = 0  # widest string written so far, for back-padding

        if message:
            self.message = message

        if self.file.isatty():
            if self.hide_cursor:
                print(HIDE_CURSOR, end='', file=self.file)
            print(self.message, end='', file=self.file)
            self.file.flush()

    def write(self, s):
        """Overwrite the previously written text with `s` (TTY only)."""
        if not self.file.isatty():
            return
        # Back up over the old text; pad so a shorter string fully erases
        # a longer one.
        backspaces = '\b' * self._width
        print(backspaces + s.ljust(self._width), end='', file=self.file)
        self._width = max(self._width, len(s))
        self.file.flush()

    def finish(self):
        # Restore the cursor if we hid it at construction time.
        if self.file.isatty() and self.hide_cursor:
            print(SHOW_CURSOR, end='', file=self.file)
class WritelnMixin(object):
    """Mixin that redraws a whole line (clear + rewrite) on a TTY."""

    hide_cursor = False

    def __init__(self, message=None, **kwargs):
        super(WritelnMixin, self).__init__(**kwargs)
        if message:
            self.message = message

        if self.file.isatty() and self.hide_cursor:
            print(HIDE_CURSOR, end='', file=self.file)

    def clearln(self):
        """Erase the current line (carriage return + ANSI erase-to-EOL)."""
        if not self.file.isatty():
            return
        print('\r\x1b[K', end='', file=self.file)

    def writeln(self, line):
        """Replace the current line with `line` (TTY only)."""
        if not self.file.isatty():
            return
        self.clearln()
        print(line, end='', file=self.file)
        self.file.flush()

    def finish(self):
        # Leave the last rendered line visible and restore the cursor.
        if self.file.isatty():
            print(file=self.file)
            if self.hide_cursor:
                print(SHOW_CURSOR, end='', file=self.file)
from signal import signal, SIGINT
from sys import exit
class SigIntMixin(object):
    """Registers a signal handler that calls finish on SIGINT"""

    def __init__(self, *args, **kwargs):
        super(SigIntMixin, self).__init__(*args, **kwargs)
        # Installed at construction; replaces any previously set SIGINT handler.
        signal(SIGINT, self._sigint_handler)

    def _sigint_handler(self, signum, frame):
        # Clean up the progress display, then exit with status 0.
        self.finish()
        exit(0)
================================================
FILE: language/utils/progress/progress/spinner.py
================================================
# -*- coding: utf-8 -*-
# Copyright (c) 2012 Giorgos Verigakis
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
from __future__ import unicode_literals
from . import Infinite
from .helpers import WriteMixin
class Spinner(WriteMixin, Infinite):
message = ''
phases = ('-', '\\', '|', '/')
hide_cursor = True
def update(self):
i = self.index % len(self.phases)
self.write(self.phases[i])
class PieSpinner(Spinner):
phases = ['◷', '◶', '◵', '◴']
class MoonSpinner(Spinner):
phases = ['◑', '◒', '◐', '◓']
class LineSpinner(Spinner):
phases = ['⎺', '⎻', '⎼', '⎽', '⎼', '⎻']
class PixelSpinner(Spinner):
phases = ['⣾','⣷', '⣯', '⣟', '⡿', '⢿', '⣻', '⣽']
================================================
FILE: language/utils/progress/setup.py
================================================
#!/usr/bin/env python
from setuptools import setup
import progress
# Package metadata for the vendored `progress` library (version is read
# from progress/__init__.py via the `import progress` above).
setup(
    name='progress',
    version=progress.__version__,
    description='Easy to use progress bars',
    long_description=open('README.rst').read(),
    author='Giorgos Verigakis',
    author_email='verigak@gmail.com',
    url='http://github.com/verigak/progress/',
    license='ISC',
    packages=['progress'],
    classifiers=[
        'Environment :: Console',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: ISC License (ISCL)',
        'Programming Language :: Python :: 2.6',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3.3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
    ]
)
================================================
FILE: language/utils/progress/test_progress.py
================================================
#!/usr/bin/env python
from __future__ import print_function
import random
import time
from progress.bar import (Bar, ChargingBar, FillingSquaresBar,
FillingCirclesBar, IncrementalBar, PixelBar,
ShadyBar)
from progress.spinner import (Spinner, PieSpinner, MoonSpinner, LineSpinner,
PixelSpinner)
from progress.counter import Counter, Countdown, Stack, Pie
def sleep():
    # Sleep ~10ms with +/-10% jitter so the bars advance at a realistic pace.
    t = 0.01
    t += t * random.uniform(-0.1, 0.1)  # Add some variance
    time.sleep(t)


# Exercise the plain bar styles with an index/elapsed/ETA suffix.
for bar_cls in (Bar, ChargingBar, FillingSquaresBar, FillingCirclesBar):
    suffix = '%(index)d/%(max)d [%(elapsed)d / %(eta)d / %(eta_td)s]'
    bar = bar_cls(bar_cls.__name__, suffix=suffix)
    for i in bar.iter(range(200)):
        sleep()

# Sub-character bars with a percentage suffix.
for bar_cls in (IncrementalBar, PixelBar, ShadyBar):
    suffix = '%(percent)d%% [%(elapsed_td)s / %(eta)d / %(eta_td)s]'
    bar = bar_cls(bar_cls.__name__, suffix=suffix)
    for i in bar.iter(range(200)):
        sleep()

# Spinners have no known total; just tick them 100 times each.
for spin in (Spinner, PieSpinner, MoonSpinner, LineSpinner, PixelSpinner):
    for i in spin(spin.__name__ + ' ').iter(range(100)):
        sleep()
    print()

# Single-character counters.
for singleton in (Counter, Countdown, Stack, Pie):
    for i in singleton(singleton.__name__ + ' ').iter(range(100)):
        sleep()
    print()

# goto() must handle arbitrary jumps, forward and backward.
bar = IncrementalBar('Random', suffix='%(index)d')
for i in range(100):
    bar.goto(random.randint(0, 100))
    sleep()
bar.finish()
================================================
FILE: language/utils/setup_logger.py
================================================
# python3.7
"""Utility functions for logging."""
import logging
import os
import sys
__all__ = ['setup_logger']
def setup_logger(work_dir=None,
                 logfile_name='log.txt',
                 logger_name='logger',
                 debug=0):
    """Sets up logger from target work directory.

    The function sets up a logger with `DEBUG` log level. Two handlers will
    be added to the logger automatically. One is the `sys.stdout` stream, with
    `INFO` log level, which will print important messages on the screen. The
    other is used to save all messages to file `$WORK_DIR/$LOGFILE_NAME`.
    Messages will be added time stamp and log level before logged.

    NOTE: If `work_dir` or `logfile_name` is empty, the file stream will be
    skipped.

    Args:
        work_dir: The work directory. All intermediate files will be saved here.
            (default: None)
        logfile_name: Name of the file to save log message. (default: `log.txt`)
        logger_name: Unique name for the logger. (default: `logger`)
        debug: If non-zero, an already-existing work directory is tolerated
            instead of aborting. (default: 0)

    Returns:
        A `logging.Logger` object.

    Raises:
        SystemExit: If the work directory has already existed, or the logger
            with specified name `logger_name` has already existed.
    """
    logger = logging.getLogger(logger_name)
    if logger.hasHandlers():  # Already existed
        raise SystemExit(
            f'Logger name `{logger_name}` has already been set up!\n'
            f'Please use another name, or otherwise the messages '
            f'may be mixed between these two loggers.')

    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter("[%(asctime)s][%(levelname)s] %(message)s")

    # Print log message with `INFO` level or above onto the screen.
    sh = logging.StreamHandler(stream=sys.stdout)
    sh.setLevel(logging.INFO)
    sh.setFormatter(formatter)
    logger.addHandler(sh)

    if not work_dir or not logfile_name:
        return logger

    if os.path.exists(work_dir) and debug == 0:
        raise SystemExit(f'Work directory `{work_dir}` has already existed!\n'
                         f'Please specify another one.')
    # `exist_ok=debug`: in debug mode an existing directory is accepted.
    os.makedirs(work_dir, exist_ok=debug)

    # Save log message with all levels in log file.
    fh = logging.FileHandler(os.path.join(work_dir, logfile_name))
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    return logger
================================================
FILE: language/utils/visualize.py
================================================
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import numpy as np
from .misc import *
__all__ = ['make_image', 'show_batch', 'show_mask', 'show_mask_single']
# functions to show an image
def make_image(img, mean=(0,0,0), std=(1,1,1)):
    """Undo per-channel normalization (in place) and return the CHW tensor
    as an HWC numpy array suitable for plt.imshow."""
    for c in range(3):
        img[c] = img[c] * std[c] + mean[c]  # unnormalize
    return np.transpose(img.numpy(), (1, 2, 0))
def gauss(x, a, b, c):
    """Elementwise scaled Gaussian: a * exp(-(x - b)^2 / (2 * c^2))."""
    return a * torch.exp(-torch.pow(x - b, 2) / (2 * c * c))
def colorize(x):
    ''' Converts a one-channel grayscale image to a color heatmap image '''
    if x.dim() == 2:
        # BUG FIX: `torch.unsqueeze` takes no `out` keyword in current
        # PyTorch, so the old call `torch.unsqueeze(x, 0, out=x)` raised a
        # TypeError on 2-D input. Rebinding to the returned view is the
        # equivalent, supported form.
        x = torch.unsqueeze(x, 0)
    if x.dim() == 3:
        # Map the single channel through three Gaussian bumps (R, G, B).
        cl = torch.zeros([3, x.size(1), x.size(2)])
        cl[0] = gauss(x,.5,.6,.2) + gauss(x,1,.8,.3)
        cl[1] = gauss(x,1,.5,.3)
        cl[2] = gauss(x,1,.2,.3)
        cl[cl.gt(1)] = 1  # clamp overshoot from the summed red channel
    elif x.dim() == 4:
        # Batched input: N x 1 x H x W -> N x 3 x H x W.
        cl = torch.zeros([x.size(0), 3, x.size(2), x.size(3)])
        cl[:,0,:,:] = gauss(x,.5,.6,.2) + gauss(x,1,.8,.3)
        cl[:,1,:,:] = gauss(x,1,.5,.3)
        cl[:,2,:,:] = gauss(x,1,.2,.3)
    return cl
def show_batch(images, Mean=(2, 2, 2), Std=(0.5,0.5,0.5)):
    # Display a grid of images after undoing per-channel normalization.
    # NOTE(review): defaults Mean=(2,2,2)/Std=(0.5,0.5,0.5) look unusual for
    # unnormalization — confirm against the training transform.
    images = make_image(torchvision.utils.make_grid(images), Mean, Std)
    plt.imshow(images)
    plt.show()
def show_mask_single(images, mask, Mean=(2, 2, 2), Std=(0.5,0.5,0.5)):
    # Plot a batch of images (top subplot) and the same batch blended with a
    # single upsampled attention mask (bottom subplot).
    # NOTE(review): `upsampling` comes from the star import of `.misc` —
    # presumably a spatial resize; verify its interpolation mode there.
    im_size = images.size(2)

    # save for adding mask
    im_data = images.clone()
    for i in range(0, 3):
        im_data[:,i,:,:] = im_data[:,i,:,:] * Std[i] + Mean[i] # unnormalize

    images = make_image(torchvision.utils.make_grid(images), Mean, Std)
    plt.subplot(2, 1, 1)
    plt.imshow(images)
    plt.axis('off')

    # for b in range(mask.size(0)):
    #     mask[b] = (mask[b] - mask[b].min())/(mask[b].max() - mask[b].min())
    mask_size = mask.size(2)
    # print('Max %f Min %f' % (mask.max(), mask.min()))
    # Upsample the mask to the image resolution before blending.
    mask = (upsampling(mask, scale_factor=im_size/mask_size))
    # mask = colorize(upsampling(mask, scale_factor=im_size/mask_size))
    # for c in range(3):
    #     mask[:,c,:,:] = (mask[:,c,:,:] - Mean[c])/Std[c]
    # print(mask.size())
    # Blend: 70% mask overlay on 30% unnormalized image.
    mask = make_image(torchvision.utils.make_grid(0.3*im_data+0.7*mask.expand_as(im_data)))
    # mask = make_image(torchvision.utils.make_grid(0.3*im_data+0.7*mask), Mean, Std)
    plt.subplot(2, 1, 2)
    plt.imshow(mask)
    plt.axis('off')
def show_mask(images, masklist, Mean=(2, 2, 2), Std=(0.5,0.5,0.5)):
    # Like show_mask_single, but stacks one subplot per mask in `masklist`
    # below the unnormalized image grid.
    im_size = images.size(2)

    # save for adding mask
    im_data = images.clone()
    for i in range(0, 3):
        im_data[:,i,:,:] = im_data[:,i,:,:] * Std[i] + Mean[i] # unnormalize

    images = make_image(torchvision.utils.make_grid(images), Mean, Std)
    plt.subplot(1+len(masklist), 1, 1)
    plt.imshow(images)
    plt.axis('off')

    for i in range(len(masklist)):
        # Detach each mask to CPU before resizing/blending.
        mask = masklist[i].data.cpu()
        # for b in range(mask.size(0)):
        #     mask[b] = (mask[b] - mask[b].min())/(mask[b].max() - mask[b].min())
        mask_size = mask.size(2)
        # print('Max %f Min %f' % (mask.max(), mask.min()))
        # Upsample to image resolution (`upsampling` is star-imported
        # from .misc).
        mask = (upsampling(mask, scale_factor=im_size/mask_size))
        # mask = colorize(upsampling(mask, scale_factor=im_size/mask_size))
        # for c in range(3):
        #     mask[:,c,:,:] = (mask[:,c,:,:] - Mean[c])/Std[c]
        # print(mask.size())
        # Blend: 70% mask overlay on 30% unnormalized image.
        mask = make_image(torchvision.utils.make_grid(0.3*im_data+0.7*mask.expand_as(im_data)))
        # mask = make_image(torchvision.utils.make_grid(0.3*im_data+0.7*mask), Mean, Std)
        plt.subplot(1+len(masklist), 1, i+2)
        plt.imshow(mask)
        plt.axis('off')
# x = torch.zeros(1, 3, 3)
# out = colorize(x)
# out_im = make_image(out)
# plt.imshow(out_im)
# plt.show()
================================================
FILE: models/__init__.py
================================================
import glob
import importlib
import logging
import os.path as osp
# automatically scan and import model modules
# scan all the files under the 'models' folder and collect files ending with
# '_model.py'
model_folder = osp.dirname(osp.abspath(__file__))
model_filenames = [
    osp.splitext(osp.basename(v))[0]
    for v in glob.glob(f'{model_folder}/*_model.py')
]
# import all the model modules
# NOTE: modules are imported so that `create_model` can look up model
# classes in them by name at runtime.
_model_modules = [
    importlib.import_module(f'models.{file_name}')
    for file_name in model_filenames
]
def create_model(opt):
    """Create model.

    Args:
        opt (dict): Configuration. It contains:
            model_type (str): Model type.

    Returns:
        The instantiated model object.

    Raises:
        ValueError: If no scanned module defines a class named `model_type`.
    """
    model_type = opt['model_type']

    # dynamically instantiation
    # Pre-initialize so that an empty `_model_modules` list raises the
    # intended ValueError below instead of a NameError (the loop body would
    # otherwise never bind `model_cls`).
    model_cls = None
    for module in _model_modules:
        model_cls = getattr(module, model_type, None)
        if model_cls is not None:
            break
    if model_cls is None:
        raise ValueError(f'Model {model_type} is not found.')

    model = model_cls(opt)

    logger = logging.getLogger('base')
    logger.info(f'Model [{model.__class__.__name__}] is created.')
    return model
================================================
FILE: models/archs/__init__.py
================================================
================================================
FILE: models/archs/attribute_predictor_arch.py
================================================
import json
import torch.nn as nn
import torch.utils.model_zoo as model_zoo
__all__ = ['ResNet', 'resnet50']
# Download URLs for the ImageNet-pretrained torchvision ResNet checkpoints.
model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    conv = nn.Conv2d(
        in_planes, out_planes, kernel_size=3, stride=stride,
        padding=1, bias=False)
    return conv
def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    conv = nn.Conv2d(
        in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
    return conv
class BasicBlock(nn.Module):
    """Standard ResNet basic block: two 3x3 convs with an identity shortcut."""

    expansion = 1  # output channels = planes * expansion

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # The (optional) projection shortcut matches the residual's shape.
        shortcut = x if self.downsample is None else self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        out += shortcut
        return self.relu(out)
class Bottleneck(nn.Module):
    """ResNet bottleneck block: 1x1 reduce -> 3x3 -> 1x1 expand (x4)."""

    expansion = 4  # output channels = planes * expansion

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = conv1x1(inplanes, planes)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = conv3x3(planes, planes, stride)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = conv1x1(planes, planes * self.expansion)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # The (optional) projection shortcut matches the residual's shape.
        shortcut = x if self.downsample is None else self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        out += shortcut
        return self.relu(out)
class fc_block(nn.Module):
    """Linear -> BatchNorm1d -> (optional Dropout) -> ReLU."""

    def __init__(self, inplanes, planes, drop_rate=0.15):
        super(fc_block, self).__init__()
        self.fc = nn.Linear(inplanes, planes)
        self.bn = nn.BatchNorm1d(planes)
        if drop_rate > 0:
            # Dropout is only instantiated when it will actually be applied.
            self.dropout = nn.Dropout(drop_rate)
        self.relu = nn.ReLU(inplace=True)
        self.drop_rate = drop_rate

    def forward(self, x):
        out = self.bn(self.fc(x))
        if self.drop_rate > 0:
            out = self.dropout(out)
        return self.relu(out)
class ResNet(nn.Module):
    """ResNet backbone with one small classifier head per attribute.

    The heads are built from `attr_file`, a JSON file whose `attr_info`
    entry maps each attribute key to a dict with at least a `name` field
    and a `value` list (one entry per class of that attribute).
    """

    def __init__(self,
                 block,
                 layers,
                 attr_file,
                 zero_init_residual=False,
                 dropout_rate=0):
        # block: BasicBlock or Bottleneck; layers: number of blocks per
        # stage, e.g. [3, 4, 6, 3] for ResNet-50.
        super(ResNet, self).__init__()
        self.inplanes = 64
        self.conv1 = nn.Conv2d(
            3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.stem = fc_block(512 * block.expansion, 512, dropout_rate)

        # construct classifier heads according to the number of values of
        # each attribute
        self.attr_file = attr_file
        with open(self.attr_file, 'r') as f:
            attr_f = json.load(f)
        self.attr_info = attr_f['attr_info']
        for idx, (key, val) in enumerate(self.attr_info.items()):
            num_val = int(len(val["value"]))
            # Head attribute name, e.g. 'classifier00Bangs'; the same name
            # is reconstructed in forward() to fetch the head.
            setattr(
                self, 'classifier' + str(key).zfill(2) + val["name"],
                nn.Sequential(
                    fc_block(512, 256, dropout_rate), nn.Linear(256, num_val)))

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(
                    m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual
        # block behaves like an identity.
        # This improves the model by 0.2~0.3% according
        # to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1):
        # Stack `blocks` residual blocks; only the first may downsample.
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            # Projection shortcut to match stride/channel changes.
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        # Shared backbone.
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.stem(x)

        # One logits tensor per attribute, in attr_info iteration order.
        y = []
        for idx, (key, val) in enumerate(self.attr_info.items()):
            classifier = getattr(
                self, 'classifier' + str(key).zfill(2) + val["name"])
            y.append(classifier(x))

        return y
def resnet50(pretrained=True, **kwargs):
    """Constructs a ResNet-50 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: Forwarded to `ResNet` (must include `attr_file`).
    """
    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if pretrained:
        # Only matching backbone weights are loaded; the attribute heads
        # keep their random initialization.
        init_pretrained_weights(model, model_urls['resnet50'])
    return model
def init_pretrained_weights(model, model_url):
    """
    Initialize model with pretrained weights.

    Layers that don't match with pretrained layers in name or size are kept
    unchanged.
    """
    # Downloads (and caches) the checkpoint through torch's model zoo.
    pretrain_dict = model_zoo.load_url(model_url)
    model_dict = model.state_dict()
    # Keep only entries whose name AND shape match the target model.
    pretrain_dict = {
        k: v
        for k, v in pretrain_dict.items()
        if k in model_dict and model_dict[k].size() == v.size()
    }
    model_dict.update(pretrain_dict)
    model.load_state_dict(model_dict)
    print(
        "Initialized model with pretrained weights from {}".format(model_url))
================================================
FILE: models/archs/field_function_arch.py
================================================
import torch
import torch.nn as nn
class FieldFunction(nn.Module):
    """MLP of LinearLayers mapping a latent code back into latent space.

    Layout: one LeakyReLU layer latent_dim -> hidden_dim, (num_layer - 2)
    LeakyReLU layers hidden_dim -> hidden_dim, and a final linear layer
    hidden_dim -> latent_dim with no activation.
    """

    def __init__(
        self,
        num_layer=4,
        latent_dim=512,
        hidden_dim=512,
        leaky_relu_neg_slope=0.2,
    ):
        super(FieldFunction, self).__init__()
        blocks = [
            # first layer
            LinearLayer(
                in_dim=latent_dim,
                out_dim=hidden_dim,
                activation=True,
                negative_slope=leaky_relu_neg_slope)
        ]
        # hidden layers
        for _ in range(num_layer - 2):
            blocks.append(
                LinearLayer(
                    in_dim=hidden_dim,
                    out_dim=hidden_dim,
                    activation=True,
                    negative_slope=leaky_relu_neg_slope))
        # final layer (no activation)
        blocks.append(
            LinearLayer(in_dim=hidden_dim, out_dim=latent_dim,
                        activation=False))
        self.field = nn.Sequential(*blocks)

    def forward(self, x):
        return self.field(x)
class LinearLayer(nn.Module):
    """Fully-connected layer with an optional LeakyReLU activation."""

    def __init__(
        self,
        in_dim=512,
        out_dim=512,
        activation=True,
        negative_slope=0.2,
    ):
        super(LinearLayer, self).__init__()
        # Attribute names (`Linear`, `leaky_relu`) are kept as-is so that
        # existing checkpoints' state-dict keys still match.
        self.Linear = nn.Linear(
            in_features=in_dim, out_features=out_dim, bias=True)
        self.activation = activation
        if activation:
            self.leaky_relu = nn.LeakyReLU(
                negative_slope=negative_slope, inplace=False)

    def forward(self, x):
        out = self.Linear(x)
        return self.leaky_relu(out) if self.activation else out
class Normalization(nn.Module):
    """Normalizes NCHW image batches with fixed ImageNet per-channel
    mean/std, broadcast over batch and spatial dimensions."""

    def __init__(self, ):
        super(Normalization, self).__init__()
        # Register the statistics as non-persistent buffers so they follow
        # the module across `.to(device)` calls. The previous code pinned
        # them to 'cuda' at construction time (crashing on CPU-only hosts)
        # and left a debug print in the constructor. `persistent=False`
        # keeps them out of the state dict so old checkpoints still load.
        mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
        std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
        self.register_buffer('mean', mean, persistent=False)
        self.register_buffer('std', std, persistent=False)

    def forward(self, x):
        """Return (x - mean) / std."""
        x = torch.sub(x, self.mean)
        x = torch.div(x, self.std)
        return x
================================================
FILE: models/archs/stylegan2/.gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
wandb/
*.lmdb/
*.pkl
================================================
FILE: models/archs/stylegan2/LICENSE
================================================
MIT License
Copyright (c) 2019 Kim Seonghyeon
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: models/archs/stylegan2/LICENSE-FID
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: models/archs/stylegan2/LICENSE-LPIPS
================================================
Copyright (c) 2018, Richard Zhang, Phillip Isola, Alexei A. Efros, Eli Shechtman, Oliver Wang
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
================================================
FILE: models/archs/stylegan2/LICENSE-NVIDIA
================================================
Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
Nvidia Source Code License-NC
=======================================================================
1. Definitions
"Licensor" means any person or entity that distributes its Work.
"Software" means the original work of authorship made available under
this License.
"Work" means the Software and any additions to or derivative works of
the Software that are made available under this License.
"Nvidia Processors" means any central processing unit (CPU), graphics
processing unit (GPU), field-programmable gate array (FPGA),
application-specific integrated circuit (ASIC) or any combination
thereof designed, made, sold, or provided by Nvidia or its affiliates.
The terms "reproduce," "reproduction," "derivative works," and
"distribution" have the meaning as provided under U.S. copyright law;
provided, however, that for the purposes of this License, derivative
works shall not include works that remain separable from, or merely
link (or bind by name) to the interfaces of, the Work.
Works, including the Software, are "made available" under this License
by including in or with the Work either (a) a copyright notice
referencing the applicability of this License to the Work, or (b) a
copy of this License.
2. License Grants
2.1 Copyright Grant. Subject to the terms and conditions of this
License, each Licensor grants to you a perpetual, worldwide,
non-exclusive, royalty-free, copyright license to reproduce,
prepare derivative works of, publicly display, publicly perform,
sublicense and distribute its Work and any resulting derivative
works in any form.
3. Limitations
3.1 Redistribution. You may reproduce or distribute the Work only
if (a) you do so under this License, (b) you include a complete
copy of this License with your distribution, and (c) you retain
without modification any copyright, patent, trademark, or
attribution notices that are present in the Work.
3.2 Derivative Works. You may specify that additional or different
terms apply to the use, reproduction, and distribution of your
derivative works of the Work ("Your Terms") only if (a) Your Terms
provide that the use limitation in Section 3.3 applies to your
derivative works, and (b) you identify the specific derivative
works that are subject to Your Terms. Notwithstanding Your Terms,
this License (including the redistribution requirements in Section
3.1) will continue to apply to the Work itself.
3.3 Use Limitation. The Work and any derivative works thereof only
may be used or intended for use non-commercially. The Work or
derivative works thereof may be used or intended for use by Nvidia
or its affiliates commercially or non-commercially. As used herein,
"non-commercially" means for research or evaluation purposes only.
3.4 Patent Claims. If you bring or threaten to bring a patent claim
against any Licensor (including any claim, cross-claim or
counterclaim in a lawsuit) to enforce any patents that you allege
are infringed by any Work, then your rights under this License from
such Licensor (including the grants in Sections 2.1 and 2.2) will
terminate immediately.
3.5 Trademarks. This License does not grant any rights to use any
Licensor's or its affiliates' names, logos, or trademarks, except
as necessary to reproduce the notices described in this License.
3.6 Termination. If you violate any term of this License, then your
rights under this License (including the grants in Sections 2.1 and
2.2) will terminate immediately.
4. Disclaimer of Warranty.
THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
THIS LICENSE.
5. Limitation of Liability.
EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
THE POSSIBILITY OF SUCH DAMAGES.
=======================================================================
================================================
FILE: models/archs/stylegan2/__init__.py
================================================
================================================
FILE: models/archs/stylegan2/apply_factor.py
================================================
import argparse
import torch
from torchvision import utils
from model import Generator
if __name__ == "__main__":
    # Inference only: sampling from the generator needs no gradients.
    torch.set_grad_enabled(False)

    parser = argparse.ArgumentParser(description="Apply closed form factorization")
    parser.add_argument(
        "-i", "--index", type=int, default=0, help="index of eigenvector"
    )
    parser.add_argument(
        "-d",
        "--degree",
        type=float,
        default=5,
        help="scalar factors for moving latent vectors along eigenvector",
    )
    parser.add_argument(
        "--channel_multiplier",
        type=int,
        default=2,
        help='channel multiplier factor. config-f = 2, else = 1',
    )
    parser.add_argument("--ckpt", type=str, required=True, help="stylegan2 checkpoints")
    parser.add_argument(
        "--size", type=int, default=256, help="output image size of the generator"
    )
    parser.add_argument(
        "-n", "--n_sample", type=int, default=7, help="number of samples created"
    )
    parser.add_argument(
        "--truncation", type=float, default=0.7, help="truncation factor"
    )
    parser.add_argument(
        "--device", type=str, default="cuda", help="device to run the model"
    )
    parser.add_argument(
        "--out_prefix",
        type=str,
        default="factor",
        help="filename prefix to result samples",
    )
    parser.add_argument(
        "factor",
        type=str,
        help="name of the closed form factorization result factor file",
    )

    args = parser.parse_args()

    # Eigenvectors produced by closed-form factorization of the style weights.
    eigvec = torch.load(args.factor)["eigvec"].to(args.device)
    ckpt = torch.load(args.ckpt)
    g = Generator(args.size, 512, 8, channel_multiplier=args.channel_multiplier).to(args.device)
    # strict=False: the checkpoint may omit buffers such as noise tensors.
    g.load_state_dict(ckpt["g_ema"], strict=False)

    trunc = g.mean_latent(4096)

    latent = torch.randn(args.n_sample, 512, device=args.device)
    latent = g.get_latent(latent)

    # Perturbation along the chosen eigenvector, scaled by --degree.
    direction = args.degree * eigvec[:, args.index].unsqueeze(0)

    # Render the unperturbed latents plus both perturbation directions.
    img, _ = g(
        [latent],
        truncation=args.truncation,
        truncation_latent=trunc,
        input_is_latent=True,
    )
    img1, _ = g(
        [latent + direction],
        truncation=args.truncation,
        truncation_latent=trunc,
        input_is_latent=True,
    )
    img2, _ = g(
        [latent - direction],
        truncation=args.truncation,
        truncation_latent=trunc,
        input_is_latent=True,
    )

    # Fix: save_image() returns None; the previous revision assigned its
    # result to an unused `grid` variable. Called purely for the side effect.
    utils.save_image(
        torch.cat([img1, img, img2], 0),
        f"{args.out_prefix}_index-{args.index}_degree-{args.degree}.png",
        normalize=True,
        range=(-1, 1),
        nrow=args.n_sample,
    )
================================================
FILE: models/archs/stylegan2/calc_inception.py
================================================
import argparse
import pickle
import os
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.models import inception_v3, Inception3
import numpy as np
from tqdm import tqdm
from inception import InceptionV3
from dataset import MultiResolutionDataset
class Inception3Feature(Inception3):
    """torchvision ``Inception3`` whose forward pass returns the 2048-d
    pooled feature vector (pre-logits) instead of class scores.

    NOTE(review): ``load_patched_inception_v3`` below constructs
    ``InceptionV3`` instead, so this class appears unused — confirm before
    relying on it.
    """

    def forward(self, x):
        # Inception v3 operates on 299x299 inputs; resize anything else.
        if x.shape[2] != 299 or x.shape[3] != 299:
            x = F.interpolate(x, size=(299, 299), mode="bilinear", align_corners=True)

        # The trailing size comments are the original author's per-stage
        # annotations; they have not been re-verified here.
        x = self.Conv2d_1a_3x3(x)  # 299 x 299 x 3
        x = self.Conv2d_2a_3x3(x)  # 149 x 149 x 32
        x = self.Conv2d_2b_3x3(x)  # 147 x 147 x 32
        x = F.max_pool2d(x, kernel_size=3, stride=2)  # 147 x 147 x 64

        x = self.Conv2d_3b_1x1(x)  # 73 x 73 x 64
        x = self.Conv2d_4a_3x3(x)  # 73 x 73 x 80
        x = F.max_pool2d(x, kernel_size=3, stride=2)  # 71 x 71 x 192

        x = self.Mixed_5b(x)  # 35 x 35 x 192
        x = self.Mixed_5c(x)  # 35 x 35 x 256
        x = self.Mixed_5d(x)  # 35 x 35 x 288

        x = self.Mixed_6a(x)  # 35 x 35 x 288
        x = self.Mixed_6b(x)  # 17 x 17 x 768
        x = self.Mixed_6c(x)  # 17 x 17 x 768
        x = self.Mixed_6d(x)  # 17 x 17 x 768
        x = self.Mixed_6e(x)  # 17 x 17 x 768

        x = self.Mixed_7a(x)  # 17 x 17 x 768
        x = self.Mixed_7b(x)  # 8 x 8 x 1280
        x = self.Mixed_7c(x)  # 8 x 8 x 2048

        x = F.avg_pool2d(x, kernel_size=8)  # 8 x 8 x 2048

        # Flatten (N, 2048, 1, 1) -> (N, 2048).
        return x.view(x.shape[0], x.shape[1])  # 1 x 1 x 2048
def load_patched_inception_v3():
    """Build the InceptionV3 feature extractor used for FID statistics.

    ``normalize_input=False`` means the caller is responsible for feeding
    appropriately scaled images; ``[3]`` selects the output block index.
    """
    # An earlier revision instantiated torchvision's inception_v3 and
    # copied its weights into Inception3Feature:
    # inception = inception_v3(pretrained=True)
    # inception_feat = Inception3Feature()
    # inception_feat.load_state_dict(inception.state_dict())
    return InceptionV3([3], normalize_input=False)
@torch.no_grad()
def extract_features(loader, inception, device):
    """Run every batch from ``loader`` through ``inception`` and return
    all flattened features concatenated on the CPU as one (N, D) tensor."""
    collected = []
    for batch in tqdm(loader):
        batch = batch.to(device)
        # inception(...) returns a list; take the first output and flatten.
        feats = inception(batch)[0].view(batch.shape[0], -1)
        collected.append(feats.to("cpu"))
    return torch.cat(collected, 0)
if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    parser = argparse.ArgumentParser(
        description="Calculate Inception v3 features for datasets"
    )
    parser.add_argument(
        "--size",
        type=int,
        default=256,
        help="image sizes used for embedding calculation",
    )
    parser.add_argument(
        "--batch", default=64, type=int, help="batch size for inception networks"
    )
    parser.add_argument(
        "--n_sample",
        type=int,
        default=50000,
        help="number of samples used for embedding calculation",
    )
    parser.add_argument(
        "--flip", action="store_true", help="apply random flipping to real images"
    )
    # Fix: help text previously read "datset".
    parser.add_argument("path", metavar="PATH", help="path to dataset lmdb file")

    args = parser.parse_args()

    inception = load_patched_inception_v3()
    inception = nn.DataParallel(inception).eval().to(device)

    # ToTensor gives [0, 1]; Normalize(0.5, 0.5) maps that to [-1, 1].
    transform = transforms.Compose(
        [
            transforms.RandomHorizontalFlip(p=0.5 if args.flip else 0),
            transforms.ToTensor(),
            transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
        ]
    )

    dset = MultiResolutionDataset(args.path, transform=transform, resolution=args.size)
    loader = DataLoader(dset, batch_size=args.batch, num_workers=4)

    features = extract_features(loader, inception, device).numpy()
    # Keep at most n_sample feature vectors for the statistics.
    features = features[: args.n_sample]

    print(f"extracted {features.shape[0]} features")

    mean = np.mean(features, 0)
    cov = np.cov(features, rowvar=False)

    # Cache the dataset's mean/covariance for later FID computation.
    name = os.path.splitext(os.path.basename(args.path))[0]

    with open(f"inception_{name}.pkl", "wb") as f:
        pickle.dump({"mean": mean, "cov": cov, "size": args.size, "path": args.path}, f)
================================================
FILE: models/archs/stylegan2/checkpoint/.gitignore
================================================
*.pt
================================================
FILE: models/archs/stylegan2/convert_weight.py
================================================
import argparse
import math
import os
import pickle
import sys
import numpy as np
import torch
from torchvision import utils
from model import Discriminator, Generator
def convert_modconv(vars, source_name, target_name, flip=False):
    """Convert one TF modulated-conv layer's variables into PyTorch
    state-dict entries keyed under ``target_name``.

    ``flip=True`` additionally flips the conv kernel along its two spatial
    dimensions (used for the upsampling convs).
    """
    def _np(name):
        # Evaluate a TF variable to a numpy array.
        return vars[source_name + "/" + name].value().eval()

    converted = {
        f"{target_name}.conv.weight":
            torch.from_numpy(
                np.expand_dims(_np("weight").transpose((3, 2, 0, 1)), 0)),
        f"{target_name}.conv.modulation.weight":
            torch.from_numpy(_np("mod_weight").transpose((1, 0))),
        # TF stores the modulation bias shifted by -1 relative to torch.
        f"{target_name}.conv.modulation.bias":
            torch.from_numpy(_np("mod_bias") + 1),
        f"{target_name}.noise.weight":
            torch.from_numpy(np.array([_np("noise_strength")])),
        f"{target_name}.activate.bias":
            torch.from_numpy(_np("bias")),
    }

    if flip:
        weight_key = f"{target_name}.conv.weight"
        converted[weight_key] = torch.flip(converted[weight_key], [3, 4])

    return converted
def convert_conv(vars, source_name, target_name, bias=True, start=0):
    """Convert a plain TF conv layer into PyTorch state-dict entries.

    The weight lands at ``{target_name}.{start}.weight`` and (optionally)
    the bias at ``{target_name}.{start + 1}.bias`` — indices into the
    target ``Sequential``.
    """
    kernel = vars[source_name + "/weight"].value().eval()
    # TF layout (kh, kw, in, out) -> torch layout (out, in, kh, kw).
    result = {
        f"{target_name}.{start}.weight":
            torch.from_numpy(kernel.transpose((3, 2, 0, 1)))
    }
    if bias:
        bias_val = vars[source_name + "/bias"].value().eval()
        result[f"{target_name}.{start + 1}.bias"] = torch.from_numpy(bias_val)
    return result
def convert_torgb(vars, source_name, target_name):
    """Convert a TF ToRGB layer into PyTorch state-dict entries keyed
    under ``target_name``."""
    def _np(name):
        # Evaluate a TF variable to a numpy array.
        return vars[source_name + "/" + name].value().eval()

    entries = {
        "conv.weight": np.expand_dims(_np("weight").transpose((3, 2, 0, 1)), 0),
        "conv.modulation.weight": _np("mod_weight").transpose((1, 0)),
        # TF stores the modulation bias shifted by -1 relative to torch.
        "conv.modulation.bias": _np("mod_bias") + 1,
        # Broadcastable over (N, 3, H, W) RGB output.
        "bias": _np("bias").reshape((1, 3, 1, 1)),
    }
    return {
        target_name + "." + key: torch.from_numpy(value)
        for key, value in entries.items()
    }
def convert_dense(vars, source_name, target_name):
    """Convert a TF dense layer into PyTorch ``weight``/``bias`` entries
    keyed under ``target_name``."""
    weight = vars[source_name + "/weight"].value().eval()
    bias = vars[source_name + "/bias"].value().eval()
    return {
        # TF layout (in, out) -> torch layout (out, in).
        target_name + ".weight": torch.from_numpy(weight.transpose((1, 0))),
        target_name + ".bias": torch.from_numpy(bias),
    }
def update(state_dict, new):
    """Copy every entry of ``new`` into ``state_dict``, validating that
    each key already exists there with a matching tensor shape.

    Raises:
        KeyError: if a key of ``new`` is absent from ``state_dict``.
        ValueError: if the shapes disagree.
    """
    for key, tensor in new.items():
        if key not in state_dict:
            raise KeyError(key + " is not found")

        if tensor.shape != state_dict[key].shape:
            raise ValueError(
                f"Shape mismatch: {tensor.shape} vs {state_dict[key].shape}")

        state_dict[key] = tensor
def discriminator_fill_statedict(statedict, vars, size):
    """Populate a PyTorch Discriminator state dict from TF variables.

    Walks the resolution pyramid from ``size`` down to 4x4, converting the
    FromRGB stem, each block's two convs and skip branch, then the final
    conv/dense head.

    Args:
        statedict: target ``Discriminator.state_dict()`` (modified in place).
        vars: TF variable collection indexed by names like "64x64/Conv0".
        size: input resolution (power of two).

    Returns:
        The same ``statedict``, updated.
    """
    log_size = int(math.log(size, 2))

    update(statedict, convert_conv(vars, f"{size}x{size}/FromRGB", "convs.0"))

    # convs.0 is the FromRGB stem; residual blocks start at convs.1.
    conv_i = 1

    for i in range(log_size - 2, 0, -1):
        # Resolution handled by this block (highest first).
        reso = 4 * 2**i
        update(
            statedict,
            convert_conv(vars, f"{reso}x{reso}/Conv0",
                         f"convs.{conv_i}.conv1"),
        )
        # start=1 targets index 1 of the destination Sequential
        # (presumably index 0 is a blur/upfirdn op — TODO confirm).
        update(
            statedict,
            convert_conv(
                vars,
                f"{reso}x{reso}/Conv1_down",
                f"convs.{conv_i}.conv2",
                start=1),
        )
        update(
            statedict,
            convert_conv(
                vars,
                f"{reso}x{reso}/Skip",
                f"convs.{conv_i}.skip",
                start=1,
                bias=False),
        )
        conv_i += 1

    # Final 4x4 conv followed by the two dense layers of the head.
    update(statedict, convert_conv(vars, f"4x4/Conv", "final_conv"))
    update(statedict, convert_dense(vars, f"4x4/Dense0", "final_linear.0"))
    update(statedict, convert_dense(vars, f"Output", "final_linear.1"))

    return statedict
def fill_statedict(state_dict, vars, size, n_mlp):
    """Populate a PyTorch Generator state dict from TF StyleGAN2 variables.

    Converts the mapping network, the learned constant input, every ToRGB,
    every modulated conv (upsampling convs get a kernel flip inside
    ``convert_modconv``), and the per-layer noise buffers.

    Args:
        state_dict: target ``Generator.state_dict()`` (modified in place).
        vars: TF variable collection ("G_mapping/...", "G_synthesis/...").
        size: output resolution (power of two).
        n_mlp: number of Dense layers in the mapping network.

    Returns:
        The same ``state_dict``, updated.
    """
    log_size = int(math.log(size, 2))

    # Mapping network: TF Dense{i} -> torch style.{i + 1}
    # (style.0 is presumably a weight-less normalization — TODO confirm).
    for i in range(n_mlp):
        update(state_dict,
               convert_dense(vars, f"G_mapping/Dense{i}", f"style.{i + 1}"))

    # Learned 4x4 constant input.
    update(
        state_dict,
        {
            "input.input":
            torch.from_numpy(
                vars["G_synthesis/4x4/Const/const"].value().eval())
        },
    )

    # ToRGB layers: the 4x4 one, then one per upsampled resolution.
    update(state_dict, convert_torgb(vars, "G_synthesis/4x4/ToRGB", "to_rgb1"))

    for i in range(log_size - 2):
        reso = 4 * 2**(i + 1)
        update(
            state_dict,
            convert_torgb(vars, f"G_synthesis/{reso}x{reso}/ToRGB",
                          f"to_rgbs.{i}"),
        )

    # First modulated conv at 4x4, then a (Conv0_up, Conv1) pair per level.
    update(state_dict, convert_modconv(vars, "G_synthesis/4x4/Conv", "conv1"))

    conv_i = 0

    for i in range(log_size - 2):
        reso = 4 * 2**(i + 1)
        update(
            state_dict,
            convert_modconv(
                vars,
                f"G_synthesis/{reso}x{reso}/Conv0_up",
                f"convs.{conv_i}",
                # flip=True: spatial kernel flip for the upsampling conv
                # (transposed-conv orientation difference — TODO confirm).
                flip=True,
            ),
        )
        update(
            state_dict,
            convert_modconv(vars, f"G_synthesis/{reso}x{reso}/Conv1",
                            f"convs.{conv_i + 1}"),
        )
        conv_i += 2

    # Fixed noise buffers: one for conv1 plus two per upsampled level.
    for i in range(0, (log_size - 2) * 2 + 1):
        update(
            state_dict,
            {
                f"noises.noise_{i}":
                torch.from_numpy(vars[f"G_synthesis/noise{i}"].value().eval())
            },
        )

    return state_dict
if __name__ == "__main__":
    device = "cuda"

    parser = argparse.ArgumentParser(
        description="Tensorflow to pytorch model checkpoint converter")
    parser.add_argument(
        "--repo",
        type=str,
        required=True,
        help="path to the official StyleGAN2 repository with dnnlib/ folder",
    )
    parser.add_argument(
        "--gen", action="store_true", help="convert the generator weights")
    parser.add_argument(
        "--disc",
        action="store_true",
        help="convert the discriminator weights")
    parser.add_argument(
        "--channel_multiplier",
        type=int,
        default=2,
        help="channel multiplier factor. config-f = 2, else = 1",
    )
    parser.add_argument(
        "path", metavar="PATH", help="path to the tensorflow weights")

    args = parser.parse_args()

    # The official repo provides dnnlib; make it importable.
    sys.path.append(args.repo)

    import dnnlib
    from dnnlib import tflib

    tflib.init_tf()

    with open(args.path, "rb") as f:
        generator, discriminator, g_ema = pickle.load(f)

    # Output resolution of the TF generator.
    size = g_ema.output_shape[2]
    # Fix: a stray `raie NotImplementedError` (leftover debug code and a
    # SyntaxError) previously followed this line; it has been removed.
    print(size)

    # Recover n_mlp by counting the mapping network's Dense layers.
    n_mlp = 0
    mapping_layers_names = g_ema.__getstate__(
    )['components']['mapping'].list_layers()
    for layer in mapping_layers_names:
        if layer[0].startswith('Dense'):
            n_mlp += 1

    g = Generator(size, 512, n_mlp, channel_multiplier=args.channel_multiplier)
    state_dict = g.state_dict()
    state_dict = fill_statedict(state_dict, g_ema.vars, size, n_mlp)

    g.load_state_dict(state_dict)

    latent_avg = torch.from_numpy(g_ema.vars["dlatent_avg"].value().eval())

    ckpt = {"g_ema": state_dict, "latent_avg": latent_avg}

    if args.gen:
        g_train = Generator(
            size, 512, n_mlp, channel_multiplier=args.channel_multiplier)
        g_train_state = g_train.state_dict()
        # Fix: fill_statedict takes (state_dict, vars, size, n_mlp);
        # the n_mlp argument was previously missing, so --gen always failed.
        g_train_state = fill_statedict(g_train_state, generator.vars, size,
                                       n_mlp)
        ckpt["g"] = g_train_state

    if args.disc:
        disc = Discriminator(size, channel_multiplier=args.channel_multiplier)
        d_state = disc.state_dict()
        d_state = discriminator_fill_statedict(d_state, discriminator.vars,
                                               size)
        ckpt["d"] = d_state

    name = os.path.splitext(os.path.basename(args.path))[0]
    torch.save(ckpt, name + ".pt")

    # Sanity check: run identical latents through both implementations and
    # save a side-by-side comparison (TF row, torch row, difference row).
    batch_size = {256: 16, 512: 9, 1024: 4}
    n_sample = batch_size.get(size, 25)

    g = g.to(device)

    z = np.random.RandomState(0).randn(n_sample, 512).astype("float32")

    with torch.no_grad():
        img_pt, _ = g(
            [torch.from_numpy(z).to(device)],
            truncation=0.5,
            truncation_latent=latent_avg.to(device),
            randomize_noise=False,
        )

    Gs_kwargs = dnnlib.EasyDict()
    Gs_kwargs.randomize_noise = False
    img_tf = g_ema.run(z, None, **Gs_kwargs)
    img_tf = torch.from_numpy(img_tf).to(device)

    # Compare in [0, 1] space; large max abs diff indicates a bad conversion.
    img_diff = ((img_pt + 1) / 2).clamp(0.0, 1.0) - (
        (img_tf.to(device) + 1) / 2).clamp(0.0, 1.0)

    img_concat = torch.cat((img_tf, img_pt, img_diff), dim=0)

    print(img_diff.abs().max())

    utils.save_image(
        img_concat,
        name + ".png",
        nrow=n_sample,
        normalize=True,
        range=(-1, 1))
================================================
FILE: models/archs/stylegan2/dataset.py
================================================
from io import BytesIO
import lmdb
from PIL import Image
from torch.utils.data import Dataset
class MultiResolutionDataset(Dataset):
    """Dataset of single-resolution images stored in an lmdb file.

    Records are keyed ``'<resolution>-<zero-padded index>'`` and the total
    count is stored under the key ``'length'``.
    """

    def __init__(self, path, transform, resolution=256):
        """Open the lmdb environment read-only and cache its length."""
        self.env = lmdb.open(
            path,
            max_readers=32,
            readonly=True,
            lock=False,
            readahead=False,
            meminit=False,
        )
        if not self.env:
            raise IOError('Cannot open lmdb dataset', path)

        with self.env.begin(write=False) as txn:
            self.length = int(
                txn.get('length'.encode('utf-8')).decode('utf-8'))

        self.resolution = resolution
        self.transform = transform

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        """Decode record ``index`` at ``self.resolution`` and transform it."""
        key = f'{self.resolution}-{str(index).zfill(5)}'.encode('utf-8')
        with self.env.begin(write=False) as txn:
            img_bytes = txn.get(key)

        img = Image.open(BytesIO(img_bytes))
        return self.transform(img)
================================================
FILE: models/archs/stylegan2/distributed.py
================================================
import math
import pickle
import torch
from torch import distributed as dist
from torch.utils.data.sampler import Sampler
def get_rank():
    """Rank of the current process; 0 when torch.distributed is unavailable
    or has not been initialized."""
    if dist.is_available() and dist.is_initialized():
        return dist.get_rank()
    return 0
def synchronize():
    """Barrier across all workers; a no-op for single-process runs or when
    torch.distributed is unavailable/uninitialized."""
    if not (dist.is_available() and dist.is_initialized()):
        return
    if dist.get_world_size() > 1:
        dist.barrier()
def get_world_size():
    """Number of distributed workers; 1 when not running distributed."""
    if dist.is_available() and dist.is_initialized():
        return dist.get_world_size()
    return 1
def reduce_sum(tensor):
    """Sum `tensor` over all workers.

    Returns the input unchanged when torch.distributed is not in use;
    otherwise returns a summed clone (the input is never mutated).
    """
    if not (dist.is_available() and dist.is_initialized()):
        return tensor

    summed = tensor.clone()
    dist.all_reduce(summed, op=dist.ReduceOp.SUM)
    return summed
def gather_grad(params):
    """Average the gradients of `params` across all workers, in place.

    A no-op when running single-process.
    """
    world_size = get_world_size()
    if world_size == 1:
        return

    for param in params:
        if param.grad is None:
            continue
        dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
        param.grad.data.div_(world_size)
def all_gather(data):
    """Gather arbitrary picklable `data` from every worker.

    Returns a list with one entry per worker. For a single process this is
    just ``[data]``. Payloads are pickled, padded to the largest size so
    ``dist.all_gather`` (which requires equal-sized tensors) can be used,
    then trimmed and unpickled on receipt.
    """
    world_size = get_world_size()

    if world_size == 1:
        return [data]

    # Serialize the object into a CUDA byte tensor.
    buffer = pickle.dumps(data)
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to('cuda')

    # Exchange payload sizes so every rank knows the maximum to pad to.
    local_size = torch.IntTensor([tensor.numel()]).to('cuda')
    size_list = [torch.IntTensor([0]).to('cuda') for _ in range(world_size)]
    dist.all_gather(size_list, local_size)
    size_list = [int(size.item()) for size in size_list]
    max_size = max(size_list)

    tensor_list = []

    for _ in size_list:
        tensor_list.append(torch.ByteTensor(size=(max_size,)).to('cuda'))

    if local_size != max_size:
        # NOTE(review): the padding tensor is uninitialized memory; this is
        # harmless because receivers slice back to the true size below.
        padding = torch.ByteTensor(size=(max_size - local_size,)).to('cuda')
        tensor = torch.cat((tensor, padding), 0)

    dist.all_gather(tensor_list, tensor)

    data_list = []

    # Strip each rank's padding and unpickle its payload.
    for size, tensor in zip(size_list, tensor_list):
        buffer = tensor.cpu().numpy().tobytes()[:size]
        data_list.append(pickle.loads(buffer))

    return data_list
def reduce_loss_dict(loss_dict):
    """Reduce a dict of scalar loss tensors across workers.

    Uses ``dist.reduce`` to rank 0, so only rank 0 receives the averaged
    values; other ranks get the reduced-but-unnormalized tensors. The input
    dict is returned unchanged for single-process runs.
    """
    world_size = get_world_size()

    if world_size < 2:
        return loss_dict

    with torch.no_grad():
        keys = sorted(loss_dict.keys())
        stacked = torch.stack([loss_dict[k] for k in keys], 0)
        dist.reduce(stacked, dst=0)

        if dist.get_rank() == 0:
            stacked /= world_size

        return dict(zip(keys, stacked))
================================================
FILE: models/archs/stylegan2/fid.py
================================================
import argparse
import pickle
import torch
from torch import nn
import numpy as np
from scipy import linalg
from tqdm import tqdm
from model import Generator
from calc_inception import load_patched_inception_v3
@torch.no_grad()
def extract_feature_from_samples(
    generator, inception, truncation, truncation_latent, batch_size, n_sample, device
):
    """Sample `n_sample` images from `generator` and return their Inception
    features as one ``(n_sample, feat_dim)`` CPU tensor.

    Args:
        generator: callable mapping ``([latent], truncation=..., truncation_latent=...)``
            to ``(image_batch, _)``.
        inception: feature extractor; ``inception(img)[0]`` is the feature map.
        truncation / truncation_latent: truncation-trick parameters forwarded
            to the generator.
        batch_size: number of latents per forward pass.
        n_sample: total number of samples to draw.
        device: device latents are sampled on.
    """
    n_batch = n_sample // batch_size
    resid = n_sample - (n_batch * batch_size)
    batch_sizes = [batch_size] * n_batch
    # Only append the remainder batch when it is non-empty (the original
    # unconditionally appended it, yielding a useless size-0 batch whenever
    # batch_size divides n_sample).
    if resid > 0:
        batch_sizes.append(resid)
    features = []

    for batch in tqdm(batch_sizes):
        latent = torch.randn(batch, 512, device=device)
        # Bug fix: the original called the module-level `g` instead of the
        # `generator` argument, silently ignoring the parameter.
        img, _ = generator(
            [latent], truncation=truncation, truncation_latent=truncation_latent
        )
        feat = inception(img)[0].view(img.shape[0], -1)
        features.append(feat.to("cpu"))

    features = torch.cat(features, 0)

    return features
def calc_fid(sample_mean, sample_cov, real_mean, real_cov, eps=1e-6):
    """Frechet Inception Distance between two Gaussians given by their
    means and covariance matrices.

    Falls back to an eps-regularized matrix square root when the product of
    the covariances is singular, and strips a numerically negligible
    imaginary component from the result.
    """
    cov_sqrt, _ = linalg.sqrtm(sample_cov @ real_cov, disp=False)

    if not np.isfinite(cov_sqrt).all():
        print("product of cov matrices is singular")
        offset = np.eye(sample_cov.shape[0]) * eps
        cov_sqrt = linalg.sqrtm((sample_cov + offset) @ (real_cov + offset))

    if np.iscomplexobj(cov_sqrt):
        if not np.allclose(np.diagonal(cov_sqrt).imag, 0, atol=1e-3):
            m = np.max(np.abs(cov_sqrt.imag))
            raise ValueError(f"Imaginary component {m}")
        cov_sqrt = cov_sqrt.real

    mean_diff = sample_mean - real_mean
    mean_norm = mean_diff @ mean_diff
    trace = np.trace(sample_cov) + np.trace(real_cov) - 2 * np.trace(cov_sqrt)

    return mean_norm + trace
if __name__ == "__main__":
    # Entry point: sample images from a checkpointed generator, extract
    # Inception features, and compute FID against precomputed real-set stats.
    device = "cuda"

    parser = argparse.ArgumentParser(description="Calculate FID scores")

    parser.add_argument("--truncation", type=float, default=1, help="truncation factor")
    parser.add_argument(
        "--truncation_mean",
        type=int,
        default=4096,
        help="number of samples to calculate mean for truncation",
    )
    parser.add_argument(
        "--batch", type=int, default=64, help="batch size for the generator"
    )
    parser.add_argument(
        "--n_sample",
        type=int,
        default=50000,
        help="number of the samples for calculating FID",
    )
    parser.add_argument(
        "--size", type=int, default=256, help="image sizes for generator"
    )
    parser.add_argument(
        "--inception",
        type=str,
        default=None,
        required=True,
        help="path to precomputed inception embedding",
    )
    parser.add_argument(
        "ckpt", metavar="CHECKPOINT", help="path to generator checkpoint"
    )

    args = parser.parse_args()

    ckpt = torch.load(args.ckpt)

    g = Generator(args.size, 512, 8).to(device)
    g.load_state_dict(ckpt["g_ema"])
    g = nn.DataParallel(g)
    g.eval()

    if args.truncation < 1:
        with torch.no_grad():
            # Bug fix: `g` is wrapped in nn.DataParallel, which does not
            # forward custom methods such as `mean_latent`, so the original
            # `g.mean_latent(...)` raised AttributeError. Call the wrapped
            # module instead.
            mean_latent = g.module.mean_latent(args.truncation_mean)
    else:
        mean_latent = None

    inception = nn.DataParallel(load_patched_inception_v3()).to(device)
    inception.eval()

    features = extract_feature_from_samples(
        g, inception, args.truncation, mean_latent, args.batch, args.n_sample, device
    ).numpy()
    print(f"extracted {features.shape[0]} features")

    sample_mean = np.mean(features, 0)
    sample_cov = np.cov(features, rowvar=False)

    # Real-image statistics precomputed by calc_inception.py.
    with open(args.inception, "rb") as f:
        embeds = pickle.load(f)
        real_mean = embeds["mean"]
        real_cov = embeds["cov"]

    fid = calc_fid(sample_mean, sample_cov, real_mean, real_cov)

    print("fid:", fid)
================================================
FILE: models/archs/stylegan2/generate.py
================================================
import argparse
import os
import sys
import numpy as np
import torch
from torchvision import utils
from tqdm import tqdm
sys.path.append('..')
from stylegan2_pytorch.model import Generator
def generate(args, g_ema, device, mean_latent):
    """Sample `args.pics` images from `g_ema`, saving each as a PNG together
    with its z-space and w-space latent codes.

    Images are written to `args.synthetic_image_dir`; the latent dictionaries
    (filename -> numpy code) are dumped via np.save at the end.
    """
    if not os.path.exists(args.synthetic_image_dir):
        os.makedirs(args.synthetic_image_dir)

    latent_code = {}
    w_space_code = {}

    g_ema.eval()
    with torch.no_grad():
        for i in tqdm(range(args.pics)):
            name = f"{str(i).zfill(7)}.png"
            sample_z = torch.randn(args.sample, args.latent, device=device)

            sample, w_space = g_ema([sample_z],
                                    truncation=args.truncation,
                                    truncation_latent=mean_latent,
                                    return_latents=True,
                                    randomize_noise=False)

            utils.save_image(
                sample,
                os.path.join(args.synthetic_image_dir, name),
                nrow=1,
                normalize=True,
                range=(-1, 1),
            )

            latent_code[name] = sample_z.cpu().numpy()
            w_space_code[name] = w_space.cpu().numpy()

    # save latent code
    np.save(f'{args.synthetic_image_dir}/latent_code.npz', latent_code)
    np.save(f'{args.synthetic_image_dir}/w_space_code.npz', w_space_code)
if __name__ == "__main__":
    # Entry point: build the generator from a checkpoint and dump samples
    # plus latent codes via generate().
    device = "cuda"

    parser = argparse.ArgumentParser(
        description="Generate samples from the generator")

    parser.add_argument(
        "--size",
        type=int,
        default=1024,
        help="output image size of the generator")
    parser.add_argument(
        "--sample",
        type=int,
        default=1,
        help="number of samples to be generated for each image",
    )
    parser.add_argument(
        "--pics",
        type=int,
        default=20,
        help="number of images to be generated")
    parser.add_argument(
        "--truncation", type=float, default=1, help="truncation ratio")
    parser.add_argument(
        "--truncation_mean",
        type=int,
        default=4096,
        help="number of vectors to calculate mean for the truncation",
    )
    parser.add_argument(
        "--ckpt",
        type=str,
        default="stylegan2-ffhq-config-f.pt",
        help="path to the model checkpoint",
    )
    parser.add_argument(
        "--channel_multiplier",
        type=int,
        default=2,
        help="channel multiplier of the generator. config-f = 2, else = 1",
    )
    parser.add_argument(
        "--synthetic_image_dir",
        default='',
        # Bug fix: the help string was copy-pasted from --channel_multiplier.
        help="directory where the generated images and latent codes are saved",
    )

    args = parser.parse_args()

    # Fixed StyleGAN2 latent dimensionality and mapping-network depth.
    args.latent = 512
    args.n_mlp = 8

    g_ema = Generator(
        args.size,
        args.latent,
        args.n_mlp,
        channel_multiplier=args.channel_multiplier).to(device)
    checkpoint = torch.load(args.ckpt)

    g_ema.load_state_dict(checkpoint["g_ema"])

    # Truncation trick: the mean latent is only needed when truncation < 1.
    if args.truncation < 1:
        with torch.no_grad():
            mean_latent = g_ema.mean_latent(args.truncation_mean)
    else:
        mean_latent = None

    generate(args, g_ema, device, mean_latent)
================================================
FILE: models/archs/stylegan2/inception.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
try:
from torchvision.models.utils import load_state_dict_from_url
except ImportError:
from torch.utils.model_zoo import load_url as load_state_dict_from_url
# Inception weights ported to Pytorch from
# http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz
FID_WEIGHTS_URL = 'https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth'
class InceptionV3(nn.Module):
    """Pretrained InceptionV3 network returning feature maps"""

    # Index of default block of inception to return,
    # corresponds to output of final average pooling
    DEFAULT_BLOCK_INDEX = 3

    # Maps feature dimensionality to their output blocks indices
    BLOCK_INDEX_BY_DIM = {
        64: 0,  # First max pooling features
        192: 1,  # Second max pooling featurs
        768: 2,  # Pre-aux classifier features
        2048: 3  # Final average pooling features
    }

    # NOTE(review): `output_blocks` uses a mutable default list; it is only
    # read (sorted/max), never mutated, so this is harmless here.
    def __init__(self,
                 output_blocks=[DEFAULT_BLOCK_INDEX],
                 resize_input=True,
                 normalize_input=True,
                 requires_grad=False,
                 use_fid_inception=True):
        """Build pretrained InceptionV3
        Parameters
        ----------
        output_blocks : list of int
            Indices of blocks to return features of. Possible values are:
                - 0: corresponds to output of first max pooling
                - 1: corresponds to output of second max pooling
                - 2: corresponds to output which is fed to aux classifier
                - 3: corresponds to output of final average pooling
        resize_input : bool
            If true, bilinearly resizes input to width and height 299 before
            feeding input to model. As the network without fully connected
            layers is fully convolutional, it should be able to handle inputs
            of arbitrary size, so resizing might not be strictly needed
        normalize_input : bool
            If true, scales the input from range (0, 1) to the range the
            pretrained Inception network expects, namely (-1, 1)
        requires_grad : bool
            If true, parameters of the model require gradients. Possibly useful
            for finetuning the network
        use_fid_inception : bool
            If true, uses the pretrained Inception model used in Tensorflow's
            FID implementation. If false, uses the pretrained Inception model
            available in torchvision. The FID Inception model has different
            weights and a slightly different structure from torchvision's
            Inception model. If you want to compute FID scores, you are
            strongly advised to set this parameter to true to get comparable
            results.
        """
        super(InceptionV3, self).__init__()

        self.resize_input = resize_input
        self.normalize_input = normalize_input
        self.output_blocks = sorted(output_blocks)
        self.last_needed_block = max(output_blocks)

        assert self.last_needed_block <= 3, \
            'Last possible output block index is 3'

        self.blocks = nn.ModuleList()

        if use_fid_inception:
            inception = fid_inception_v3()
        else:
            inception = models.inception_v3(pretrained=True)

        # Block 0: input to maxpool1
        block0 = [
            inception.Conv2d_1a_3x3,
            inception.Conv2d_2a_3x3,
            inception.Conv2d_2b_3x3,
            nn.MaxPool2d(kernel_size=3, stride=2)
        ]
        self.blocks.append(nn.Sequential(*block0))

        # Block 1: maxpool1 to maxpool2
        # Later blocks are only built up to the last one actually requested,
        # so unused tail layers are never instantiated.
        if self.last_needed_block >= 1:
            block1 = [
                inception.Conv2d_3b_1x1,
                inception.Conv2d_4a_3x3,
                nn.MaxPool2d(kernel_size=3, stride=2)
            ]
            self.blocks.append(nn.Sequential(*block1))

        # Block 2: maxpool2 to aux classifier
        if self.last_needed_block >= 2:
            block2 = [
                inception.Mixed_5b,
                inception.Mixed_5c,
                inception.Mixed_5d,
                inception.Mixed_6a,
                inception.Mixed_6b,
                inception.Mixed_6c,
                inception.Mixed_6d,
                inception.Mixed_6e,
            ]
            self.blocks.append(nn.Sequential(*block2))

        # Block 3: aux classifier to final avgpool
        if self.last_needed_block >= 3:
            block3 = [
                inception.Mixed_7a,
                inception.Mixed_7b,
                inception.Mixed_7c,
                nn.AdaptiveAvgPool2d(output_size=(1, 1))
            ]
            self.blocks.append(nn.Sequential(*block3))

        for param in self.parameters():
            param.requires_grad = requires_grad

    def forward(self, inp):
        """Get Inception feature maps
        Parameters
        ----------
        inp : torch.autograd.Variable
            Input tensor of shape Bx3xHxW. Values are expected to be in
            range (0, 1)
        Returns
        -------
        List of torch.autograd.Variable, corresponding to the selected output
        block, sorted ascending by index
        """
        outp = []
        x = inp

        if self.resize_input:
            x = F.interpolate(x,
                              size=(299, 299),
                              mode='bilinear',
                              align_corners=False)

        if self.normalize_input:
            x = 2 * x - 1  # Scale from range (0, 1) to range (-1, 1)

        for idx, block in enumerate(self.blocks):
            x = block(x)
            if idx in self.output_blocks:
                outp.append(x)

            # Stop early once every requested block has been produced.
            if idx == self.last_needed_block:
                break

        return outp
def fid_inception_v3():
    """Build pretrained Inception model for FID computation
    The Inception model for FID computation uses a different set of weights
    and has a slightly different structure than torchvision's Inception.
    This method first constructs torchvision's Inception and then patches the
    necessary parts that are different in the FID Inception model.
    """
    # num_classes=1008 matches the TF-ported checkpoint; weights are loaded
    # afterwards from FID_WEIGHTS_URL, so pretrained=False here.
    # NOTE(review): newer torchvision versions changed inception_v3 kwargs
    # (e.g. init_weights / weights enums) — confirm against the pinned
    # torchvision version.
    inception = models.inception_v3(num_classes=1008,
                                    aux_logits=False,
                                    pretrained=False)
    # Swap in the FID-patched blocks (TF-style pooling behavior).
    inception.Mixed_5b = FIDInceptionA(192, pool_features=32)
    inception.Mixed_5c = FIDInceptionA(256, pool_features=64)
    inception.Mixed_5d = FIDInceptionA(288, pool_features=64)
    inception.Mixed_6b = FIDInceptionC(768, channels_7x7=128)
    inception.Mixed_6c = FIDInceptionC(768, channels_7x7=160)
    inception.Mixed_6d = FIDInceptionC(768, channels_7x7=160)
    inception.Mixed_6e = FIDInceptionC(768, channels_7x7=192)
    inception.Mixed_7b = FIDInceptionE_1(1280)
    inception.Mixed_7c = FIDInceptionE_2(2048)

    state_dict = load_state_dict_from_url(FID_WEIGHTS_URL, progress=True)
    inception.load_state_dict(state_dict)
    return inception
class FIDInceptionA(models.inception.InceptionA):
    """InceptionA block patched for FID computation.

    Identical to torchvision's InceptionA except that the average pooling
    excludes zero padding (count_include_pad=False) to match TensorFlow.
    """

    def __init__(self, in_channels, pool_features):
        super(FIDInceptionA, self).__init__(in_channels, pool_features)

    def forward(self, x):
        branch1x1 = self.branch1x1(x)

        branch5x5 = self.branch5x5_1(x)
        branch5x5 = self.branch5x5_2(branch5x5)

        branch3x3dbl = self.branch3x3dbl_1(x)
        branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
        branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)

        # Patch: Tensorflow's average pool does not use the padded zero's in
        # its average calculation
        branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
                                   count_include_pad=False)
        branch_pool = self.branch_pool(branch_pool)

        outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool]
        return torch.cat(outputs, 1)
class FIDInceptionC(models.inception.InceptionC):
    """InceptionC block patched for FID computation.

    Identical to torchvision's InceptionC except that the average pooling
    excludes zero padding (count_include_pad=False) to match TensorFlow.
    """

    def __init__(self, in_channels, channels_7x7):
        super(FIDInceptionC, self).__init__(in_channels, channels_7x7)

    def forward(self, x):
        branch1x1 = self.branch1x1(x)

        branch7x7 = self.branch7x7_1(x)
        branch7x7 = self.branch7x7_2(branch7x7)
        branch7x7 = self.branch7x7_3(branch7x7)

        branch7x7dbl = self.branch7x7dbl_1(x)
        branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl)
        branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl)
        branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl)
        branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl)

        # Patch: Tensorflow's average pool does not use the padded zero's in
        # its average calculation
        branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
                                   count_include_pad=False)
        branch_pool = self.branch_pool(branch_pool)

        outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool]
        return torch.cat(outputs, 1)
class FIDInceptionE_1(models.inception.InceptionE):
    """First InceptionE block patched for FID computation.

    Identical to torchvision's InceptionE except that the average pooling
    excludes zero padding (count_include_pad=False) to match TensorFlow.
    """

    def __init__(self, in_channels):
        super(FIDInceptionE_1, self).__init__(in_channels)

    def forward(self, x):
        branch1x1 = self.branch1x1(x)

        branch3x3 = self.branch3x3_1(x)
        branch3x3 = [
            self.branch3x3_2a(branch3x3),
            self.branch3x3_2b(branch3x3),
        ]
        branch3x3 = torch.cat(branch3x3, 1)

        branch3x3dbl = self.branch3x3dbl_1(x)
        branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
        branch3x3dbl = [
            self.branch3x3dbl_3a(branch3x3dbl),
            self.branch3x3dbl_3b(branch3x3dbl),
        ]
        branch3x3dbl = torch.cat(branch3x3dbl, 1)

        # Patch: Tensorflow's average pool does not use the padded zero's in
        # its average calculation
        branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
                                   count_include_pad=False)
        branch_pool = self.branch_pool(branch_pool)

        outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
        return torch.cat(outputs, 1)
class FIDInceptionE_2(models.inception.InceptionE):
    """Second InceptionE block patched for FID computation.

    Identical to torchvision's InceptionE except that the pool branch uses
    max pooling instead of average pooling, matching the TF FID checkpoint.
    """

    def __init__(self, in_channels):
        super(FIDInceptionE_2, self).__init__(in_channels)

    def forward(self, x):
        branch1x1 = self.branch1x1(x)

        branch3x3 = self.branch3x3_1(x)
        branch3x3 = [
            self.branch3x3_2a(branch3x3),
            self.branch3x3_2b(branch3x3),
        ]
        branch3x3 = torch.cat(branch3x3, 1)

        branch3x3dbl = self.branch3x3dbl_1(x)
        branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
        branch3x3dbl = [
            self.branch3x3dbl_3a(branch3x3dbl),
            self.branch3x3dbl_3b(branch3x3dbl),
        ]
        branch3x3dbl = torch.cat(branch3x3dbl, 1)

        # Patch: The FID Inception model uses max pooling instead of average
        # pooling. This is likely an error in this specific Inception
        # implementation, as other Inception models use average pooling here
        # (which matches the description in the paper).
        branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1)
        branch_pool = self.branch_pool(branch_pool)

        outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
        return torch.cat(outputs, 1)
================================================
FILE: models/archs/stylegan2/inversion.py
================================================
import argparse
import math
import os
import numpy as np
import torch
from PIL import Image
from torch import optim
from torch.nn import functional as F
from torchvision import transforms
from tqdm import tqdm
import lpips
from model import Generator
def noise_regularize(noises):
    """Penalize spatial autocorrelation of per-layer noise maps.

    For each noise map, repeatedly accumulates the squared means of the
    products with its one-pixel circular shifts along H and W, then 2x
    average-downsamples, until the map is 8x8 or smaller.
    """
    loss = 0

    for noise in noises:
        size = noise.shape[2]

        while True:
            shift_w = torch.roll(noise, shifts=1, dims=3)
            shift_h = torch.roll(noise, shifts=1, dims=2)
            loss = (loss
                    + (noise * shift_w).mean().pow(2)
                    + (noise * shift_h).mean().pow(2))

            if size <= 8:
                break

            noise = noise.reshape([-1, 1, size // 2, 2, size // 2, 2])
            noise = noise.mean([3, 5])
            size //= 2

    return loss
def noise_normalize_(noises):
    """Standardize every noise tensor in place to zero mean, unit std."""
    for noise in noises:
        # Both statistics are taken before any mutation happens.
        mu = noise.mean()
        sigma = noise.std()
        noise.data.sub_(mu).div_(sigma)
def get_lr(t, initial_lr, rampdown=0.25, rampup=0.05):
    """Learning-rate schedule: linear warmup over `rampup`, cosine decay
    over the final `rampdown` fraction. `t` is progress in [0, 1]."""
    down = min(1, (1 - t) / rampdown)
    down = 0.5 - 0.5 * math.cos(down * math.pi)
    up = min(1, t / rampup)

    return initial_lr * down * up
def latent_noise(latent, strength):
    """Return `latent` perturbed by Gaussian noise scaled by `strength`."""
    return latent + torch.randn_like(latent) * strength
def make_image(tensor):
    """Convert a batch of [-1, 1] images (N,C,H,W) to uint8 numpy (N,H,W,C)."""
    img = tensor.detach().clamp_(min=-1, max=1)
    img = img.add(1).div_(2).mul(255).type(torch.uint8)
    return img.permute(0, 2, 3, 1).to("cpu").numpy()
if __name__ == "__main__":
    # Project real images into a StyleGAN2 latent space (z or w) by
    # optimizing the latent code (and optionally the per-layer noise maps)
    # against an LPIPS + MSE reconstruction objective.
    device = "cuda"

    parser = argparse.ArgumentParser(
        description="Image projector to the generator latent spaces")
    parser.add_argument(
        "--ckpt", type=str, required=True, help="path to the model checkpoint")
    parser.add_argument(
        "--size",
        type=int,
        default=256,
        help="output image sizes of the generator")
    parser.add_argument(
        "--lr_rampup",
        type=float,
        default=0.05,
        help="duration of the learning rate warmup",
    )
    parser.add_argument(
        "--lr_rampdown",
        type=float,
        default=0.25,
        help="duration of the learning rate decay",
    )
    parser.add_argument("--lr", type=float, default=0.1, help="learning rate")
    parser.add_argument(
        "--noise",
        type=float,
        default=0.05,
        help="strength of the noise level")
    parser.add_argument(
        "--noise_ramp",
        type=float,
        default=0.75,
        help="duration of the noise level decay",
    )
    parser.add_argument(
        "--step", type=int, default=1000, help="optimize iterations")
    parser.add_argument(
        "--noise_regularize",
        type=float,
        default=1e5,
        help="weight of the noise regularization",
    )
    parser.add_argument("--randomise_noise", type=int, default=1)
    parser.add_argument(
        "--img_mse_weight",
        type=float,
        default=0,
        help="weight of the mse loss")
    parser.add_argument(
        "files",
        metavar="FILES",
        nargs="+",
        help="path to image files to be projected")
    parser.add_argument("--output_dir", type=str, required=True)
    parser.add_argument(
        "--w_plus",
        action="store_true",
        help="allow to use distinct latent codes to each layers",
    )
    parser.add_argument(
        "--postfix", default='', type=str, help='postfix for filenames')
    parser.add_argument(
        "--latent_type",
        required=True,
        type=str,
        help='z or w, not case sensitive')
    parser.add_argument(
        "--w_path", default='', type=str, help='path to w latent code')
    parser.add_argument('--w_mse_weight', default=0, type=float)
    parser.add_argument('--w_loss_type', default='mse', type=str)

    args = parser.parse_args()

    # latent space type: 'z' optimizes before the mapping network,
    # 'w' optimizes the mapped latent directly.
    args.latent_type = args.latent_type.lower()
    if args.latent_type == 'z':
        args.input_is_latent = False
    elif args.latent_type == 'w':
        args.input_is_latent = True
    else:
        assert False, "Unrecognized args.latent_type"

    n_mean_latent = 10000

    # LPIPS is computed at <= 256 resolution.
    resize = min(args.size, 256)

    transform = transforms.Compose([
        transforms.Resize(resize),
        transforms.CenterCrop(resize),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
    ])

    # Load every target image into one (N, 3, H, W) batch.
    imgs = []
    for imgfile in args.files:
        img = transform(Image.open(imgfile).convert("RGB"))
        imgs.append(img)
    imgs = torch.stack(imgs, 0).to(device)

    # Optional regularizer pulling the mapped w towards a reference w code;
    # only meaningful when optimizing in z space.
    if args.w_mse_weight:
        assert args.latent_type == 'z'
        w_latent_code = np.load(args.w_path)
        w_latent_code = torch.tensor(w_latent_code).to(device)

    # g_ema = Generator(args.size, 512, 8) # ziqi modified
    g_ema = Generator(args.size, 512, 8, 1)
    g_ema.load_state_dict(torch.load(args.ckpt)["g_ema"], strict=False)
    g_ema.eval()
    g_ema = g_ema.to(device)

    # Estimate the mean and std of the w distribution from random z samples.
    with torch.no_grad():
        noise_sample = torch.randn(n_mean_latent, 512, device=device)
        latent_out = g_ema.style(noise_sample)

        latent_mean = latent_out.mean(0)
        latent_std = ((latent_out - latent_mean).pow(2).sum() /
                      n_mean_latent)**0.5

    percept = lpips.PerceptualLoss(
        model="net-lin", net="vgg", use_gpu=device.startswith("cuda"))

    # Initialize the latent: mean w, or (for z) the mean of the z samples.
    if args.latent_type == 'w':
        latent_in = latent_mean.detach().clone().unsqueeze(0).repeat(
            imgs.shape[0], 1)
    elif args.latent_type == 'z':
        latent_in = noise_sample.mean(0).detach().clone().unsqueeze(0).repeat(
            imgs.shape[0], 1)

    # w+ space: one latent per generator layer.
    if args.w_plus:
        latent_in = latent_in.unsqueeze(1).repeat(1, g_ema.n_latent, 1)

    latent_in.requires_grad = True

    if args.randomise_noise:
        print('Noise term will be optimized together.')
        noises_single = g_ema.make_noise()
        noises = []
        for noise in noises_single:
            noises.append(noise.repeat(imgs.shape[0], 1, 1, 1).normal_())
        for noise in noises:
            noise.requires_grad = True

        # NOTE(review): `g_ema.parameters()` is placed as a single generator
        # element inside the param list; torch.optim.Adam expects tensors or
        # param-group dicts as list elements — verify this branch runs
        # without error (consider `list(g_ema.parameters())`).
        optimizer = optim.Adam(
            [latent_in] + noises + [g_ema.parameters()], lr=args.lr)
    else:
        # Optimize the latent plus any trainable generator parameters
        # (the latter with a small fixed lr).
        optim_params = []
        for v in g_ema.parameters():
            if v.requires_grad:
                optim_params.append(v)
        optimizer = optim.Adam([{
            'params': [latent_in]
        }, {
            'params': optim_params,
            'lr': 1e-4
        }],
                               lr=args.lr)

    pbar = tqdm(range(args.step))
    latent_path = []

    for i in pbar:
        t = i / args.step
        lr = get_lr(t, args.lr)
        # Only the first param group follows the warmup/decay schedule.
        optimizer.param_groups[0]["lr"] = lr
        # Noise injection strength decays to zero over `noise_ramp`.
        noise_strength = latent_std * args.noise * max(
            0, 1 - t / args.noise_ramp)**2

        if args.latent_type == 'z':
            # Map z through the style network, then perturb in w space.
            latent_w = g_ema.style(latent_in)
            latent_n = latent_noise(latent_w, noise_strength.item())
        else:
            latent_n = latent_noise(latent_in, noise_strength.item())

        if args.randomise_noise:
            img_gen, _ = g_ema([latent_n], input_is_latent=True, noise=noises)
        else:
            img_gen, _ = g_ema([latent_n],
                               input_is_latent=True,
                               randomize_noise=False)

        batch, channel, height, width = img_gen.shape

        # Average-pool generated images down to 256 for the LPIPS/MSE losses.
        if height > 256:
            factor = height // 256

            img_gen = img_gen.reshape(batch, channel, height // factor, factor,
                                      width // factor, factor)
            img_gen = img_gen.mean([3, 5])

        p_loss = percept(img_gen, imgs).sum()
        mse_loss = F.mse_loss(img_gen, imgs)

        if args.randomise_noise:
            n_loss = noise_regularize(noises)
        else:
            n_loss = 0

        loss = p_loss + args.noise_regularize * n_loss + args.img_mse_weight * mse_loss

        if args.w_mse_weight > 0:
            # this loss is only applicable to z space
            # (`latent_w` is only defined on the z path; the assert enforces it)
            assert args.latent_type == 'z'
            if args.w_loss_type == 'mse':
                w_mse_loss = F.mse_loss(latent_w, w_latent_code)
            elif args.w_loss_type == 'l1':
                w_mse_loss = F.l1_loss(latent_w, w_latent_code)
            loss += args.w_mse_weight * w_mse_loss
        else:
            w_mse_loss = 0

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Re-standardize the optimized noise maps after each step.
        if args.randomise_noise:
            noise_normalize_(noises)

        # Snapshot the latent every 100 iterations.
        if (i + 1) % 100 == 0:
            latent_path.append(latent_in.detach().clone())

        pbar.set_description((
            f"total: {loss:.4f}; perceptual: {p_loss:.4f}; noise regularize: {n_loss:.4f};"
            f" mse: {mse_loss:.4f}; w_mse_loss: {w_mse_loss:.4f}; lr: {lr:.4f}"
        ))

    # Final reconstruction from the last snapshotted latent.
    if args.randomise_noise:
        img_gen, _ = g_ema([latent_path[-1]],
                           input_is_latent=args.input_is_latent,
                           noise=noises)
    else:
        img_gen, _ = g_ema([latent_path[-1]],
                           input_is_latent=args.input_is_latent,
                           randomize_noise=False)

    filename = os.path.splitext(os.path.basename(args.files[0]))[0] + ".pt"

    img_ar = make_image(img_gen)

    result_file = {}
    # Per input image: save the reconstruction, the latent code, and a
    # generator checkpoint.
    for i, input_name in enumerate(args.files):
        result_file[input_name] = {"img": img_gen[i], "latent": latent_in[i]}

        if args.randomise_noise:
            noise_single = []
            for noise in noises:
                noise_single.append(noise[i:i + 1])
            result_file[input_name]["noise"] = noise_single

        img_name = os.path.splitext(
            os.path.basename(input_name)
        )[0] + '_' + args.postfix + '-' + args.latent_type + "-project.png"
        pil_img = Image.fromarray(img_ar[i])

        # save image
        if not os.path.isdir(os.path.join(args.output_dir, 'recovered_image')):
            os.makedirs(
                os.path.join(args.output_dir, 'recovered_image'),
                exist_ok=False)
        pil_img.save(
            os.path.join(args.output_dir, 'recovered_image', img_name))

        latent_code = latent_in[i].cpu()
        latent_code = latent_code.detach().numpy()
        latent_code = np.expand_dims(latent_code, axis=0)
        print('latent_code:', len(latent_code), len(latent_code[0]))

        # save latent code
        if not os.path.isdir(os.path.join(args.output_dir, 'latent_codes')):
            os.makedirs(
                os.path.join(args.output_dir, 'latent_codes'), exist_ok=False)
        np.save(
            f'{args.output_dir}/latent_codes/{img_name}_{args.latent_type}.npz.npy',
            latent_code)

        if not os.path.isdir(os.path.join(args.output_dir, 'checkpoint')):
            os.makedirs(
                os.path.join(args.output_dir, 'checkpoint'), exist_ok=False)
        torch.save(
            {
                "g_ema": g_ema.state_dict(),
            },
            f"{os.path.join(args.output_dir, 'checkpoint')}/{img_name}_{args.latent_type}.pt",
        )

    # save info
    if not os.path.isdir(os.path.join(args.output_dir, 'pt')):
        os.makedirs(os.path.join(args.output_dir, 'pt'), exist_ok=False)
    torch.save(
        result_file,
        os.path.join(
            args.output_dir,
            os.path.join(args.output_dir, 'pt',
                         filename + '_' + args.latent_type)))
================================================
FILE: models/archs/stylegan2/lpips/__init__.py
================================================
from __future__ import absolute_import, division, print_function
import numpy as np
import torch
from models.archs.stylegan2.lpips import dist_model
from skimage.measure import compare_ssim
class PerceptualLoss(torch.nn.Module):
    """LPIPS perceptual distance wrapped as an nn.Module.

    Delegates to `dist_model.DistModel`, configured at construction time.
    """

    def __init__(
            self,
            model='net-lin',
            net='alex',
            colorspace='rgb',
            spatial=False,
            use_gpu=True,
            gpu_ids=[
                0
            ]):  # VGG using our perceptually-learned weights (LPIPS metric)
        # def __init__(self, model='net', net='vgg', use_gpu=True): # "default" way of using VGG as a perceptual loss
        super(PerceptualLoss, self).__init__()
        self.use_gpu = use_gpu
        self.spatial = spatial
        self.gpu_ids = gpu_ids
        # The underlying distance model carries the actual networks/weights.
        self.model = dist_model.DistModel()
        self.model.initialize(
            model=model,
            net=net,
            use_gpu=use_gpu,
            colorspace=colorspace,
            spatial=self.spatial,
            gpu_ids=gpu_ids)

    def forward(self, pred, target, normalize=False):
        """
        Pred and target are Variables.
        If normalize is True, assumes the images are between [0,1] and then scales them between [-1,+1]
        If normalize is False, assumes the images are already between [-1,+1]
        Inputs pred and target are Nx3xHxW
        Output pytorch Variable N long
        """
        if normalize:
            target = 2 * target - 1
            pred = 2 * pred - 1

        # Note: the distance model receives (target, pred) in that order.
        return self.model.forward(target, pred)
def normalize_tensor(in_feat, eps=1e-10):
    """L2-normalize `in_feat` along the channel dimension (dim=1)."""
    l2_norm = torch.sum(in_feat ** 2, dim=1, keepdim=True).sqrt()
    return in_feat / (l2_norm + eps)
def l2(p0, p1, range=255.):
    """Half the mean squared error between p0 and p1, both scaled by `range`."""
    scaled_diff = p0 / range - p1 / range
    return .5 * np.mean(scaled_diff ** 2)
def psnr(p0, p1, peak=255.):
    """Peak signal-to-noise ratio (dB) between p0 and p1, with peak value `peak`."""
    mse = np.mean((1. * p0 - 1. * p1) ** 2)
    return 10 * np.log10(peak ** 2 / mse)
def dssim(p0, p1, range=255.):
    """Structural dissimilarity, (1 - SSIM) / 2, in [0, 1].

    NOTE(review): depends on `skimage.measure.compare_ssim`, which was
    removed from scikit-image (replaced by
    skimage.metrics.structural_similarity) — confirm the pinned version.
    """
    return (1 - compare_ssim(p0, p1, data_range=range, multichannel=True)) / 2.
def rgb2lab(in_img, mean_cent=False):
    """Convert an RGB image to CIELAB; optionally mean-centre L by 50.

    NOTE(review): a second `rgb2lab` defined later in this module shadows
    this one (and differs — it divides the input by 255 first), so this
    definition is effectively dead code.
    """
    from skimage import color
    img_lab = color.rgb2lab(in_img)
    if (mean_cent):
        img_lab[:, :, 0] = img_lab[:, :, 0] - 50
    return img_lab
def tensor2np(tensor_obj):
    # change dimension of a tensor object into a numpy array
    # First element of an (N,C,H,W) tensor as an (H,W,C) float array.
    chw = tensor_obj[0].cpu().float().numpy()
    return chw.transpose((1, 2, 0))
def np2tensor(np_obj):
    # change dimenion of np array into tensor array
    # (H,W,C) numpy array -> (1,C,H,W) torch.Tensor.
    expanded = np_obj[:, :, :, np.newaxis]
    return torch.Tensor(expanded.transpose((3, 2, 0, 1)))
def tensor2tensorlab(image_tensor, to_norm=True, mc_only=False):
    # image tensor to lab tensor
    """Convert an RGB image tensor to a LAB tensor.

    Uses the module-level `tensor2im` and `np2tensor` helpers. With
    `mc_only`, only the L channel is mean-centred by 50; with `to_norm`
    (and not `mc_only`), L is centred and the whole LAB array divided
    by 100.
    """
    from skimage import color

    img = tensor2im(image_tensor)
    img_lab = color.rgb2lab(img)
    if (mc_only):
        img_lab[:, :, 0] = img_lab[:, :, 0] - 50
    if (to_norm and not mc_only):
        img_lab[:, :, 0] = img_lab[:, :, 0] - 50
        img_lab = img_lab / 100.

    return np2tensor(img_lab)
def tensorlab2tensor(lab_tensor, return_inbnd=False):
    """Convert a normalized LAB tensor back to an RGB image tensor.

    Inverts `tensor2tensorlab(to_norm=True)`: rescale by 100, un-centre L,
    convert to RGB. With `return_inbnd`, also returns a per-pixel mask of
    values that survive a LAB->RGB->LAB round trip within atol=2.
    """
    import warnings

    from skimage import color
    warnings.filterwarnings("ignore")

    lab = tensor2np(lab_tensor) * 100.
    lab[:, :, 0] = lab[:, :, 0] + 50

    rgb_back = 255. * np.clip(color.lab2rgb(lab.astype('float')), 0, 1)
    if (return_inbnd):
        # convert back to lab, see if we match
        lab_back = color.rgb2lab(rgb_back.astype('uint8'))
        mask = 1. * np.isclose(lab_back, lab, atol=2.)
        mask = np2tensor(np.prod(mask, axis=2)[:, :, np.newaxis])
        return (im2tensor(rgb_back), mask)
    else:
        return im2tensor(rgb_back)
def rgb2lab(input):
    """RGB in [0, 255] to CIELAB.

    NOTE(review): this shadows the earlier `rgb2lab(in_img, mean_cent=...)`
    definition in this module, and the two differ (this one divides by 255).
    """
    from skimage import color
    return color.rgb2lab(input / 255.)
def tensor2im(image_tensor, imtype=np.uint8, cent=1., factor=255. / 2.):
    """First image of an (N,C,H,W) tensor mapped by (x + cent) * factor to an
    (H,W,C) array of `imtype` — i.e. [-1,1] -> [0,255] uint8 by default.

    NOTE(review): an identical `tensor2im` is redefined later in this module;
    this copy is shadowed.
    """
    image_numpy = image_tensor[0].cpu().float().numpy()
    image_numpy = (np.transpose(image_numpy, (1, 2, 0)) + cent) * factor
    return image_numpy.astype(imtype)
def im2tensor(image, imtype=np.uint8, cent=1., factor=255. / 2.):
    """(H,W,C) image mapped by image/factor - cent to a (1,C,H,W) tensor —
    i.e. [0,255] -> [-1,1] by default.

    NOTE(review): an identical `im2tensor` is redefined later in this module;
    this copy is shadowed.
    """
    return torch.Tensor((image / factor - cent)[:, :, :, np.newaxis].transpose(
        (3, 2, 0, 1)))
def tensor2vec(vector_tensor):
    """Squeeze an (N,C,1,1) tensor to an (N,C) numpy array."""
    arr = vector_tensor.data.cpu().numpy()
    return arr[:, :, 0, 0]
def voc_ap(rec, prec, use_07_metric=False):
    """ ap = voc_ap(rec, prec, [use_07_metric])
    Compute VOC AP given precision and recall.
    If use_07_metric is true, uses the
    VOC 07 11 point method (default:False).
    """
    if use_07_metric:
        # 11-point interpolation: average the max precision at recall >= t.
        ap = 0.
        for t in np.arange(0., 1.1, 0.1):
            mask = rec >= t
            p = np.max(prec[mask]) if np.sum(mask) > 0 else 0
            ap += p / 11.
        return ap

    # Exact area under the precision envelope.
    # Sentinels bound the curve at recall 0 and 1.
    mrec = np.concatenate(([0.], rec, [1.]))
    mpre = np.concatenate(([0.], prec, [0.]))

    # Make precision monotonically non-increasing from right to left.
    for i in range(mpre.size - 1, 0, -1):
        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])

    # Integrate only where recall actually changes.
    idx = np.where(mrec[1:] != mrec[:-1])[0]
    return np.sum((mrec[idx + 1] - mrec[idx]) * mpre[idx + 1])
def tensor2im(image_tensor, imtype=np.uint8, cent=1., factor=255. / 2.):
    # def tensor2im(image_tensor, imtype=np.uint8, cent=1., factor=1.):
    """First image of an (N,C,H,W) tensor mapped by (x + cent) * factor to an
    (H,W,C) array of `imtype` — [-1,1] -> [0,255] uint8 by default."""
    chw = image_tensor[0].cpu().float().numpy()
    hwc = np.transpose(chw, (1, 2, 0))
    return ((hwc + cent) * factor).astype(imtype)
def im2tensor(image, imtype=np.uint8, cent=1., factor=255. / 2.):
    """Convert an HxWxC image array to a 1xCxHxW tensor in [-cent, cent].

    NOTE(review): this module defines im2tensor twice; this later
    definition is the one that takes effect at import time.
    """
    shifted = (image / factor - cent)[:, :, :, np.newaxis]
    return torch.Tensor(shifted.transpose((3, 2, 0, 1)))
================================================
FILE: models/archs/stylegan2/lpips/base_model.py
================================================
import os
import numpy as np
import torch
class BaseModel():
    """Minimal base class for LPIPS-style models.

    Holds device configuration and provides save/load helpers; the no-op
    hook methods are meant to be overridden by subclasses (e.g. DistModel).
    """

    def __init__(self):
        pass

    def name(self):
        """Return a human-readable model name."""
        return 'BaseModel'

    def initialize(self, use_gpu=True, gpu_ids=[0]):
        # Mutable default kept for interface compatibility; it is never
        # mutated here.
        self.use_gpu = use_gpu
        self.gpu_ids = gpu_ids

    def forward(self):
        pass

    def optimize_parameters(self):
        pass

    def get_current_visuals(self):
        return self.input

    def get_current_errors(self):
        return {}

    def save(self, label):
        pass

    # helper saving function that can be used by subclasses
    def save_network(self, network, path, network_label, epoch_label):
        save_filename = '%s_net_%s.pth' % (epoch_label, network_label)
        save_path = os.path.join(path, save_filename)
        torch.save(network.state_dict(), save_path)

    # helper loading function that can be used by subclasses
    def load_network(self, network, network_label, epoch_label):
        save_filename = '%s_net_%s.pth' % (epoch_label, network_label)
        save_path = os.path.join(self.save_dir, save_filename)
        print('Loading network from %s' % save_path)
        network.load_state_dict(torch.load(save_path))

    def update_learning_rate(self):
        # Bug fix: `self` was missing from the signature, so calling this on
        # an instance raised TypeError. Subclasses override with extra args.
        pass

    def get_image_paths(self):
        # Bug fix: the class previously defined get_image_paths twice; the
        # no-op first definition was dead (shadowed by this one) and has
        # been removed.
        return self.image_paths

    def save_done(self, flag=False):
        # Writes the flag twice: np.save produces 'done_flag.npy', while
        # np.savetxt writes a plain-text 'done_flag' file.
        np.save(os.path.join(self.save_dir, 'done_flag'), flag)
        np.savetxt(
            os.path.join(self.save_dir, 'done_flag'), [
                flag,
            ], fmt='%i')
================================================
FILE: models/archs/stylegan2/lpips/dist_model.py
================================================
from __future__ import absolute_import
import os
from collections import OrderedDict
import models.archs.stylegan2.lpips as util
import numpy as np
import torch
from scipy.ndimage import zoom
from torch.autograd import Variable
from tqdm import tqdm
from . import networks_basic as networks
from .base_model import BaseModel
class DistModel(BaseModel):
    """Perceptual distance model.

    Wraps one of several distance "networks" behind a common
    forward(in0, in1) interface:
      * 'net-lin' -- pretrained feature net + learned linear calibration (LPIPS)
      * 'net'     -- pretrained feature net, uncalibrated
      * 'L2'/'l2' -- plain L2 distance (RGB or Lab)
      * 'DSSIM'/'SSIM' variants -- structural dissimilarity
    Also provides the training pieces (rank loss, Adam optimizer) used to fit
    the linear calibration against human 2AFC judgments.
    """

    def name(self):
        """Display name, e.g. 'net-lin [alex]' (set in initialize())."""
        return self.model_name

    def initialize(self,
                   model='net-lin',
                   net='alex',
                   colorspace='Lab',
                   pnet_rand=False,
                   pnet_tune=False,
                   model_path=None,
                   use_gpu=True,
                   printNet=False,
                   spatial=False,
                   is_train=False,
                   lr=.0001,
                   beta1=0.5,
                   version='0.1',
                   gpu_ids=[0]):
        '''
        INPUTS
            model - ['net-lin'] for linearly calibrated network
                    ['net'] for off-the-shelf network
                    ['L2'] for L2 distance in Lab colorspace
                    ['SSIM'] for ssim in RGB colorspace
            net - ['squeeze','alex','vgg']
            model_path - if None, will look in weights/[NET_NAME].pth
            colorspace - ['Lab','RGB'] colorspace to use for L2 and SSIM
            use_gpu - bool - whether or not to use a GPU
            printNet - bool - whether or not to print network architecture out
            spatial - bool - whether to output an array containing varying distances across spatial dimensions
            is_train - bool - [True] for training mode
            lr - float - initial learning rate
            beta1 - float - initial momentum term for adam
            version - 0.1 for latest, 0.0 was original (with a bug)
            gpu_ids - int array - [0] by default, gpus to use
        '''
        BaseModel.initialize(self, use_gpu=use_gpu, gpu_ids=gpu_ids)

        self.model = model
        self.net = net
        self.is_train = is_train
        self.spatial = spatial
        self.gpu_ids = gpu_ids
        self.model_name = '%s [%s]' % (model, net)

        if (self.model == 'net-lin'):  # pretrained net + linear layer
            self.net = networks.PNetLin(
                pnet_rand=pnet_rand,
                pnet_tune=pnet_tune,
                pnet_type=net,
                use_dropout=True,
                spatial=spatial,
                version=version,
                lpips=True)
            kw = {}
            if not use_gpu:
                kw['map_location'] = 'cpu'
            if (model_path is None):
                # default to the packaged calibration weights next to this file
                import inspect
                model_path = os.path.abspath(
                    os.path.join(
                        inspect.getfile(self.initialize), '..',
                        'weights/v%s/%s.pth' % (version, net)))
            if (not is_train):
                print('Loading model from: %s' % model_path)
                # strict=False: the checkpoint only stores the linear
                # calibration weights, not the backbone
                self.net.load_state_dict(
                    torch.load(model_path, **kw), strict=False)
        elif (self.model == 'net'):  # pretrained network
            self.net = networks.PNetLin(
                pnet_rand=pnet_rand, pnet_type=net, lpips=False)
        elif (self.model in ['L2', 'l2']):
            self.net = networks.L2(
                use_gpu=use_gpu, colorspace=colorspace
            )  # not really a network, only for testing
            self.model_name = 'L2'
        elif (self.model in ['DSSIM', 'dssim', 'SSIM', 'ssim']):
            self.net = networks.DSSIM(use_gpu=use_gpu, colorspace=colorspace)
            self.model_name = 'SSIM'
        else:
            raise ValueError("Model [%s] not recognized." % self.model)

        self.parameters = list(self.net.parameters())

        if self.is_train:  # training mode
            # extra network on top to go from distances (d0,d1) => predicted human judgment (h*)
            self.rankLoss = networks.BCERankingLoss()
            self.parameters += list(self.rankLoss.net.parameters())
            self.lr = lr
            self.old_lr = lr
            self.optimizer_net = torch.optim.Adam(
                self.parameters, lr=lr, betas=(beta1, 0.999))
        else:  # test mode
            self.net.eval()

        if (use_gpu):
            self.net.to(gpu_ids[0])
            self.net = torch.nn.DataParallel(self.net, device_ids=gpu_ids)
            if (self.is_train):
                self.rankLoss = self.rankLoss.to(
                    device=gpu_ids[0])  # just put this on GPU0

        if (printNet):
            print('---------- Networks initialized -------------')
            networks.print_network(self.net)
            print('-----------------------------------------------')

    def forward(self, in0, in1, retPerLayer=False):
        ''' Function computes the distance between image patches in0 and in1
        INPUTS
            in0, in1 - torch.Tensor object of shape Nx3xXxY - image patch scaled to [-1,1]
        OUTPUT
            computed distances between in0 and in1
        '''
        return self.net.forward(in0, in1, retPerLayer=retPerLayer)

    # ***** TRAINING FUNCTIONS *****
    def optimize_parameters(self):
        """One optimization step: forward, backprop, step, clamp weights."""
        self.forward_train()
        self.optimizer_net.zero_grad()
        self.backward_train()
        self.optimizer_net.step()
        self.clamp_weights()

    def clamp_weights(self):
        # keep the 1x1 linear-calibration conv weights non-negative
        for module in self.net.modules():
            if (hasattr(module, 'weight') and module.kernel_size == (1, 1)):
                module.weight.data = torch.clamp(module.weight.data, min=0)

    def set_input(self, data):
        """Unpack a training batch: reference, two patches, human judgment."""
        self.input_ref = data['ref']
        self.input_p0 = data['p0']
        self.input_p1 = data['p1']
        self.input_judge = data['judge']

        if (self.use_gpu):
            self.input_ref = self.input_ref.to(device=self.gpu_ids[0])
            self.input_p0 = self.input_p0.to(device=self.gpu_ids[0])
            self.input_p1 = self.input_p1.to(device=self.gpu_ids[0])
            self.input_judge = self.input_judge.to(device=self.gpu_ids[0])

        self.var_ref = Variable(self.input_ref, requires_grad=True)
        self.var_p0 = Variable(self.input_p0, requires_grad=True)
        self.var_p1 = Variable(self.input_p1, requires_grad=True)

    def forward_train(self):  # run forward pass
        """Compute both distances, rank accuracy and the ranking loss."""
        self.d0 = self.forward(self.var_ref, self.var_p0)
        self.d1 = self.forward(self.var_ref, self.var_p1)
        self.acc_r = self.compute_accuracy(self.d0, self.d1, self.input_judge)

        # judge in [0,1] -> target in [-1,1] for the ranking loss
        self.var_judge = Variable(1. * self.input_judge).view(self.d0.size())

        self.loss_total = self.rankLoss.forward(self.d0, self.d1,
                                                self.var_judge * 2. - 1.)
        return self.loss_total

    def backward_train(self):
        torch.mean(self.loss_total).backward()

    def compute_accuracy(self, d0, d1, judge):
        ''' d0, d1 are Variables, judge is a Tensor '''
        # fraction of human judgments the distance ordering agrees with
        d1_lt_d0 = (d1 < d0).cpu().data.numpy().flatten()
        judge_per = judge.cpu().numpy().flatten()
        return d1_lt_d0 * judge_per + (1 - d1_lt_d0) * (1 - judge_per)

    def get_current_errors(self):
        """Return scalar training metrics (mean loss and rank accuracy)."""
        retDict = OrderedDict([('loss_total',
                                self.loss_total.data.cpu().numpy()),
                               ('acc_r', self.acc_r)])

        for key in retDict.keys():
            retDict[key] = np.mean(retDict[key])

        return retDict

    def get_current_visuals(self):
        """Reference and both patches, nearest-neighbor zoomed to 256px."""
        zoom_factor = 256 / self.var_ref.data.size()[2]

        ref_img = util.tensor2im(self.var_ref.data)
        p0_img = util.tensor2im(self.var_p0.data)
        p1_img = util.tensor2im(self.var_p1.data)

        ref_img_vis = zoom(ref_img, [zoom_factor, zoom_factor, 1], order=0)
        p0_img_vis = zoom(p0_img, [zoom_factor, zoom_factor, 1], order=0)
        p1_img_vis = zoom(p1_img, [zoom_factor, zoom_factor, 1], order=0)

        return OrderedDict([('ref', ref_img_vis), ('p0', p0_img_vis),
                            ('p1', p1_img_vis)])

    def save(self, path, label):
        """Save the distance net and the ranking-loss net."""
        if (self.use_gpu):
            self.save_network(self.net.module, path, '', label)
        else:
            self.save_network(self.net, path, '', label)
        self.save_network(self.rankLoss.net, path, 'rank', label)

    def update_learning_rate(self, nepoch_decay):
        """Linearly decay the learning rate over nepoch_decay epochs."""
        lrd = self.lr / nepoch_decay
        lr = self.old_lr - lrd

        for param_group in self.optimizer_net.param_groups:
            param_group['lr'] = lr

        # Bug fix: the original interpolated the *builtin* `type` here,
        # printing "<class 'type'>"; use the model name instead.
        print('update lr [%s] decay: %f -> %f' %
              (self.model_name, self.old_lr, lr))
        self.old_lr = lr
def score_2afc_dataset(data_loader, func, name=''):
    ''' Function computes Two Alternative Forced Choice (2AFC) score using
    distance function 'func' in dataset 'data_loader'
    INPUTS
        data_loader - CustomDatasetDataLoader object - contains a TwoAFCDataset inside
        func - callable distance function - calling d=func(in0,in1) should take 2
            pytorch tensors with shape Nx3xXxY, and return numpy array of length N
    OUTPUTS
        [0] - 2AFC score in [0,1], fraction of time func agrees with human evaluators
        [1] - dictionary with following elements
            d0s,d1s - N arrays containing distances between reference patch to perturbed patches
            gts - N array in [0,1], preferred patch selected by human evaluators
            scores - N array in [0,1], corresponding to what percentage function agreed with humans
    CONSTS
        N - number of test triplets in data_loader
    '''
    dists0, dists1, judges = [], [], []

    # accumulate per-batch distances and human judgments
    for batch in tqdm(data_loader.load_data(), desc=name):
        dists0.extend(
            func(batch['ref'], batch['p0']).data.cpu().numpy().flatten().tolist())
        dists1.extend(
            func(batch['ref'], batch['p1']).data.cpu().numpy().flatten().tolist())
        judges.extend(batch['judge'].cpu().numpy().flatten().tolist())

    d0s = np.array(dists0)
    d1s = np.array(dists1)
    gts = np.array(judges)
    # agree with the human majority; ties count half
    scores = (d0s < d1s) * (1. - gts) + (d1s < d0s) * gts + (d1s == d0s) * .5

    return (np.mean(scores), dict(d0s=d0s, d1s=d1s, gts=gts, scores=scores))
def score_jnd_dataset(data_loader, func, name=''):
    ''' Function computes JND score using distance function 'func' in dataset 'data_loader'
    INPUTS
        data_loader - CustomDatasetDataLoader object - contains a JNDDataset inside
        func - callable distance function - calling d=func(in0,in1) should take 2
            pytorch tensors with shape Nx3xXxY, and return pytorch array of length N
    OUTPUTS
        [0] - JND score in [0,1], mAP score (area under precision-recall curve)
        [1] - dictionary with following elements
            ds - N array containing distances between two patches shown to human evaluator
            sames - N array containing fraction of people who thought the two patches were identical
    CONSTS
        N - number of test triplets in data_loader
    '''
    all_ds, all_sames = [], []

    for batch in tqdm(data_loader.load_data(), desc=name):
        all_ds += func(batch['p0'], batch['p1']).data.cpu().numpy().tolist()
        all_sames += batch['same'].cpu().numpy().flatten().tolist()

    ds = np.array(all_ds)
    sames = np.array(all_sames)

    # rank pairs by predicted distance, then sweep a threshold to build a
    # precision/recall curve against the human "same" labels
    order = np.argsort(ds)
    sames_sorted = sames[order]

    TPs = np.cumsum(sames_sorted)
    FPs = np.cumsum(1 - sames_sorted)
    FNs = np.sum(sames_sorted) - TPs

    precs = TPs / (TPs + FPs)
    recs = TPs / (TPs + FNs)
    score = util.voc_ap(recs, precs)

    return (score, dict(ds=ds, sames=sames))
================================================
FILE: models/archs/stylegan2/lpips/networks_basic.py
================================================
from __future__ import absolute_import
import models.archs.stylegan2.lpips as util
import torch
import torch.nn as nn
from torch.autograd import Variable
from . import pretrained_networks as pn
def spatial_average(in_tens, keepdim=True):
    """Average an NCHW tensor over its spatial (H, W) dimensions."""
    spatial_dims = [2, 3]
    return in_tens.mean(spatial_dims, keepdim=keepdim)
def upsample(in_tens, out_H=64):  # assumes scale factor is same for H and W
    """Bilinearly resize an NCHW tensor so that its height becomes out_H."""
    scale = 1. * out_H / in_tens.shape[2]
    resizer = nn.Upsample(
        scale_factor=scale, mode='bilinear', align_corners=False)
    return resizer(in_tens)
# Learned perceptual metric
class PNetLin(nn.Module):
    """Learned perceptual metric (LPIPS).

    Extracts deep features from a fixed backbone at several layers,
    unit-normalizes each feature map per channel, takes squared differences,
    and either weights them with learned 1x1 convs (lpips=True) or simply
    sums over channels, then averages (or upsamples, if spatial=True) into
    a distance per image pair.
    """

    def __init__(self,
                 pnet_type='vgg',
                 pnet_rand=False,
                 pnet_tune=False,
                 use_dropout=True,
                 spatial=False,
                 version='0.1',
                 lpips=True):
        # pnet_type: backbone ('vgg'/'vgg16', 'alex' or 'squeeze')
        # pnet_rand: use a randomly initialized backbone instead of pretrained
        # pnet_tune: allow gradients into the backbone
        # spatial:   return per-pixel distance maps instead of scalar averages
        # version:   '0.1' applies input scaling; '0.0' reproduces the
        #            original (buggy) unscaled behavior
        # lpips:     apply the learned linear (1x1 conv) calibration
        super(PNetLin, self).__init__()

        self.pnet_type = pnet_type
        self.pnet_tune = pnet_tune
        self.pnet_rand = pnet_rand
        self.spatial = spatial
        self.lpips = lpips
        self.version = version
        self.scaling_layer = ScalingLayer()

        # channel counts of the feature maps tapped from each backbone
        if (self.pnet_type in ['vgg', 'vgg16']):
            net_type = pn.vgg16
            self.chns = [64, 128, 256, 512, 512]
        elif (self.pnet_type == 'alex'):
            net_type = pn.alexnet
            self.chns = [64, 192, 384, 256, 256]
        elif (self.pnet_type == 'squeeze'):
            net_type = pn.squeezenet
            self.chns = [64, 128, 256, 384, 384, 512, 512]
        self.L = len(self.chns)

        self.net = net_type(
            pretrained=not self.pnet_rand, requires_grad=self.pnet_tune)

        if (lpips):
            # one learned 1x1 conv per tapped layer; registered individually
            # (self.lin0...) so their parameters are tracked — self.lins is
            # a plain list kept only for iteration in forward()
            self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
            self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)
            self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)
            self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout)
            self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout)
            self.lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4]
            if (self.pnet_type == 'squeeze'):  # 7 layers for squeezenet
                self.lin5 = NetLinLayer(self.chns[5], use_dropout=use_dropout)
                self.lin6 = NetLinLayer(self.chns[6], use_dropout=use_dropout)
                self.lins += [self.lin5, self.lin6]

    def forward(self, in0, in1, retPerLayer=False):
        # v0.0 - original release had a bug, where input was not scaled
        in0_input, in1_input = (
            self.scaling_layer(in0),
            self.scaling_layer(in1)) if self.version == '0.1' else (in0, in1)
        outs0, outs1 = self.net.forward(in0_input), self.net.forward(in1_input)
        feats0, feats1, diffs = {}, {}, {}

        # per-layer unit-normalized features and their squared differences
        for kk in range(self.L):
            feats0[kk], feats1[kk] = util.normalize_tensor(
                outs0[kk]), util.normalize_tensor(outs1[kk])
            diffs[kk] = (feats0[kk] - feats1[kk])**2

        if (self.lpips):
            # weight the channel differences with the learned 1x1 convs
            if (self.spatial):
                res = [
                    upsample(
                        self.lins[kk].model(diffs[kk]), out_H=in0.shape[2])
                    for kk in range(self.L)
                ]
            else:
                res = [
                    spatial_average(
                        self.lins[kk].model(diffs[kk]), keepdim=True)
                    for kk in range(self.L)
                ]
        else:
            # uncalibrated: simply sum differences over channels
            if (self.spatial):
                res = [
                    upsample(
                        diffs[kk].sum(dim=1, keepdim=True), out_H=in0.shape[2])
                    for kk in range(self.L)
                ]
            else:
                res = [
                    spatial_average(
                        diffs[kk].sum(dim=1, keepdim=True), keepdim=True)
                    for kk in range(self.L)
                ]

        # total distance is the sum of per-layer contributions
        val = res[0]
        for l in range(1, self.L):
            val += res[l]

        if (retPerLayer):
            return (val, res)
        else:
            return val
class ScalingLayer(nn.Module):
    """Normalize an RGB tensor with the fixed LPIPS per-channel shift/scale."""

    def __init__(self):
        super(ScalingLayer, self).__init__()
        # constants from the original LPIPS release, stored as buffers so
        # they follow the module across devices but are not trained
        shift = torch.Tensor([-.030, -.088, -.188])[None, :, None, None]
        scale = torch.Tensor([.458, .448, .450])[None, :, None, None]
        self.register_buffer('shift', shift)
        self.register_buffer('scale', scale)

    def forward(self, inp):
        return (inp - self.shift) / self.scale
class NetLinLayer(nn.Module):
    ''' A single linear layer which does a 1x1 conv '''

    def __init__(self, chn_in, chn_out=1, use_dropout=False):
        super(NetLinLayer, self).__init__()
        modules = []
        if use_dropout:
            modules.append(nn.Dropout())
        modules.append(
            nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False))
        self.model = nn.Sequential(*modules)
class Dist2LogitLayer(nn.Module):
    ''' takes 2 distances, puts through fc layers, spits out value between [0,1] (if use_sigmoid is True) '''

    def __init__(self, chn_mid=32, use_sigmoid=True):
        super(Dist2LogitLayer, self).__init__()
        stack = [
            nn.Conv2d(5, chn_mid, 1, stride=1, padding=0, bias=True),
            nn.LeakyReLU(0.2, True),
            nn.Conv2d(chn_mid, chn_mid, 1, stride=1, padding=0, bias=True),
            nn.LeakyReLU(0.2, True),
            nn.Conv2d(chn_mid, 1, 1, stride=1, padding=0, bias=True),
        ]
        if use_sigmoid:
            stack.append(nn.Sigmoid())
        self.model = nn.Sequential(*stack)

    def forward(self, d0, d1, eps=0.1):
        # five input channels: both distances, their difference, and the
        # two (eps-stabilized) ratios
        features = torch.cat(
            (d0, d1, d0 - d1, d0 / (d1 + eps), d1 / (d0 + eps)), dim=1)
        return self.model.forward(features)
class BCERankingLoss(nn.Module):
    """BCE loss on a small logit network that predicts, from the two
    distances, which patch human evaluators preferred."""

    def __init__(self, chn_mid=32):
        super(BCERankingLoss, self).__init__()
        self.net = Dist2LogitLayer(chn_mid=chn_mid)
        self.loss = torch.nn.BCELoss()

    def forward(self, d0, d1, judge):
        # map judge from [-1, 1] to a [0, 1] probability target
        target = (judge + 1.) / 2.
        self.logit = self.net.forward(d0, d1)
        return self.loss(self.logit, target)
# L2, DSSIM metrics
class FakeNet(nn.Module):
    """Base for the non-learned metrics (L2/DSSIM); only stores config flags."""

    def __init__(self, use_gpu=True, colorspace='Lab'):
        super(FakeNet, self).__init__()
        self.use_gpu = use_gpu
        self.colorspace = colorspace
class L2(FakeNet):
    """Mean-squared-error 'network' over RGB tensors or Lab conversions."""

    def forward(self, in0, in1, retPerLayer=None):
        assert (in0.size()[0] == 1)  # currently only supports batchSize 1

        if self.colorspace == 'RGB':
            # mean over channels, then height, then width -> one scalar
            N, C, X, Y = in0.size()
            per_pixel = torch.mean((in0 - in1)**2, dim=1).view(N, 1, X, Y)
            per_row = torch.mean(per_pixel, dim=2).view(N, 1, 1, Y)
            return torch.mean(per_row, dim=3).view(N)
        elif self.colorspace == 'Lab':
            value = util.l2(
                util.tensor2np(util.tensor2tensorlab(in0.data, to_norm=False)),
                util.tensor2np(util.tensor2tensorlab(in1.data, to_norm=False)),
                range=100.).astype('float')
            ret_var = Variable(torch.Tensor((value, )))
            if self.use_gpu:
                ret_var = ret_var.cuda()
            return ret_var
class DSSIM(FakeNet):
    """Structural-dissimilarity 'network' in RGB or Lab space."""

    def forward(self, in0, in1, retPerLayer=None):
        assert (in0.size()[0] == 1)  # currently only supports batchSize 1

        if self.colorspace == 'RGB':
            value = util.dssim(
                1. * util.tensor2im(in0.data),
                1. * util.tensor2im(in1.data),
                range=255.).astype('float')
        elif self.colorspace == 'Lab':
            value = util.dssim(
                util.tensor2np(util.tensor2tensorlab(in0.data, to_norm=False)),
                util.tensor2np(util.tensor2tensorlab(in1.data, to_norm=False)),
                range=100.).astype('float')

        ret_var = Variable(torch.Tensor((value, )))
        if self.use_gpu:
            ret_var = ret_var.cuda()
        return ret_var
def print_network(net):
    """Print a network's structure and its total parameter count."""
    total = sum(param.numel() for param in net.parameters())
    print('Network', net)
    print('Total number of parameters: %d' % total)
================================================
FILE: models/archs/stylegan2/lpips/pretrained_networks.py
================================================
from collections import namedtuple
import torch
from torchvision import models as tv
class squeezenet(torch.nn.Module):
    """SqueezeNet 1.1 feature extractor split into 7 sequential slices;
    forward() returns the activation after each slice."""

    # feature-module indices delimiting each slice
    _BOUNDS = (0, 2, 5, 8, 10, 11, 12, 13)

    def __init__(self, requires_grad=False, pretrained=True):
        super(squeezenet, self).__init__()
        feats = tv.squeezenet1_1(pretrained=pretrained).features
        self.N_slices = 7
        for idx in range(self.N_slices):
            stage = torch.nn.Sequential()
            for pos in range(self._BOUNDS[idx], self._BOUNDS[idx + 1]):
                stage.add_module(str(pos), feats[pos])
            # setattr registers each Sequential as a submodule (slice1..slice7)
            setattr(self, 'slice%d' % (idx + 1), stage)
        if not requires_grad:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, X):
        activations = []
        h = X
        for idx in range(1, self.N_slices + 1):
            h = getattr(self, 'slice%d' % idx)(h)
            activations.append(h)
        fields = ['relu1', 'relu2', 'relu3', 'relu4', 'relu5', 'relu6',
                  'relu7']
        return namedtuple("SqueezeOutputs", fields)(*activations)
class alexnet(torch.nn.Module):
    """AlexNet feature extractor split into 5 sequential slices;
    forward() returns the activation after each slice."""

    # feature-module indices delimiting each slice
    _BOUNDS = (0, 2, 5, 8, 10, 12)

    def __init__(self, requires_grad=False, pretrained=True):
        super(alexnet, self).__init__()
        feats = tv.alexnet(pretrained=pretrained).features
        self.N_slices = 5
        for idx in range(self.N_slices):
            stage = torch.nn.Sequential()
            for pos in range(self._BOUNDS[idx], self._BOUNDS[idx + 1]):
                stage.add_module(str(pos), feats[pos])
            setattr(self, 'slice%d' % (idx + 1), stage)
        if not requires_grad:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, X):
        activations = []
        h = X
        for idx in range(1, self.N_slices + 1):
            h = getattr(self, 'slice%d' % idx)(h)
            activations.append(h)
        fields = ['relu1', 'relu2', 'relu3', 'relu4', 'relu5']
        return namedtuple("AlexnetOutputs", fields)(*activations)
class vgg16(torch.nn.Module):
    """VGG-16 feature extractor split into 5 sequential slices (one per
    relu tap used by LPIPS); forward() returns all 5 activations."""

    # feature-module indices delimiting each slice
    _BOUNDS = (0, 4, 9, 16, 23, 30)

    def __init__(self, requires_grad=False, pretrained=True):
        super(vgg16, self).__init__()
        feats = tv.vgg16(pretrained=pretrained).features
        self.N_slices = 5
        for idx in range(self.N_slices):
            stage = torch.nn.Sequential()
            for pos in range(self._BOUNDS[idx], self._BOUNDS[idx + 1]):
                stage.add_module(str(pos), feats[pos])
            setattr(self, 'slice%d' % (idx + 1), stage)
        if not requires_grad:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, X):
        activations = []
        h = X
        for idx in range(1, self.N_slices + 1):
            h = getattr(self, 'slice%d' % idx)(h)
            activations.append(h)
        fields = ['relu1_2', 'relu2_2', 'relu3_3', 'relu4_3', 'relu5_3']
        return namedtuple("VggOutputs", fields)(*activations)
class resnet(torch.nn.Module):
    """ResNet feature extractor exposing the stem and the four residual
    stages; forward() returns the activation after each.

    Args:
        requires_grad: if False, freeze all parameters (feature-extraction
            mode, matching the sibling squeezenet/alexnet/vgg16 wrappers).
        pretrained: load ImageNet weights.
        num: depth — one of 18, 34, 50, 101, 152.
    """

    def __init__(self, requires_grad=False, pretrained=True, num=18):
        super(resnet, self).__init__()
        constructors = {
            18: tv.resnet18,
            34: tv.resnet34,
            50: tv.resnet50,
            101: tv.resnet101,
            152: tv.resnet152,
        }
        if num not in constructors:
            # previously an unsupported depth silently left self.net unset
            raise ValueError('Unsupported resnet depth: %s' % num)
        self.net = constructors[num](pretrained=pretrained)
        self.N_slices = 5

        # expose the stages used as feature taps
        self.conv1 = self.net.conv1
        self.bn1 = self.net.bn1
        self.relu = self.net.relu
        self.maxpool = self.net.maxpool
        self.layer1 = self.net.layer1
        self.layer2 = self.net.layer2
        self.layer3 = self.net.layer3
        self.layer4 = self.net.layer4

        # Bug fix: requires_grad was previously accepted but ignored — the
        # backbone was never frozen, unlike squeezenet/alexnet/vgg16 above.
        if not requires_grad:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, X):
        """Return activations after the stem ('relu1') and each residual stage."""
        h = self.conv1(X)
        h = self.bn1(h)
        h = self.relu(h)
        h_relu1 = h
        h = self.maxpool(h)
        h = self.layer1(h)
        h_conv2 = h
        h = self.layer2(h)
        h_conv3 = h
        h = self.layer3(h)
        h_conv4 = h
        h = self.layer4(h)
        h_conv5 = h

        outputs = namedtuple("Outputs",
                             ['relu1', 'conv2', 'conv3', 'conv4', 'conv5'])
        out = outputs(h_relu1, h_conv2, h_conv3, h_conv4, h_conv5)

        return out
================================================
FILE: models/archs/stylegan2/model.py
================================================
import functools
import math
import operator
import random
import sys
import torch
from models.archs.stylegan2.op import (FusedLeakyReLU, fused_leaky_relu,
upfirdn2d)
from torch import nn
from torch.autograd import Function
from torch.nn import functional as F
class PixelNorm(nn.Module):
    """Normalize each sample's feature vector to unit RMS over dim 1."""

    def __init__(self):
        super().__init__()

    def forward(self, input):
        inv_norm = torch.rsqrt(
            torch.mean(input**2, dim=1, keepdim=True) + 1e-8)
        return input * inv_norm
def make_kernel(k):
    """Build a normalized 2-D FIR kernel; a 1-D input becomes its outer
    product with itself (separable kernel)."""
    kernel = torch.tensor(k, dtype=torch.float32)
    if kernel.ndim == 1:
        kernel = kernel[None, :] * kernel[:, None]
    return kernel / kernel.sum()
class Upsample(nn.Module):
    """Upsample by `factor` with an FIR blur kernel via upfirdn2d."""

    def __init__(self, kernel, factor=2):
        super().__init__()
        self.factor = factor
        # kernel gain scaled by factor**2 to preserve signal magnitude
        fir = make_kernel(kernel) * (factor**2)
        self.register_buffer("kernel", fir)

        pad_total = fir.shape[0] - factor
        self.pad = ((pad_total + 1) // 2 + factor - 1, pad_total // 2)

    def forward(self, input):
        return upfirdn2d(
            input, self.kernel, up=self.factor, down=1, pad=self.pad)
class Downsample(nn.Module):
    """Downsample by `factor` with an FIR blur kernel via upfirdn2d."""

    def __init__(self, kernel, factor=2):
        super().__init__()
        self.factor = factor
        fir = make_kernel(kernel)
        self.register_buffer("kernel", fir)

        pad_total = fir.shape[0] - factor
        self.pad = ((pad_total + 1) // 2, pad_total // 2)

    def forward(self, input):
        return upfirdn2d(
            input, self.kernel, up=1, down=self.factor, pad=self.pad)
class Blur(nn.Module):
    """FIR blur with explicit padding; the kernel gain is scaled by
    upsample_factor**2 when the blur follows an upsampling conv."""

    def __init__(self, kernel, pad, upsample_factor=1):
        super().__init__()
        fir = make_kernel(kernel)
        if upsample_factor > 1:
            fir = fir * (upsample_factor**2)
        self.register_buffer("kernel", fir)
        self.pad = pad

    def forward(self, input):
        return upfirdn2d(input, self.kernel, pad=self.pad)
class EqualConv2d(nn.Module):
    """Conv2d with equalized learning rate: weights are kept N(0, 1) and
    rescaled by 1/sqrt(fan_in) at every forward pass."""

    def __init__(self,
                 in_channel,
                 out_channel,
                 kernel_size,
                 stride=1,
                 padding=0,
                 bias=True):
        super().__init__()
        self.weight = nn.Parameter(
            torch.randn(out_channel, in_channel, kernel_size, kernel_size))
        # runtime scale implementing the equalized learning rate
        self.scale = 1 / math.sqrt(in_channel * kernel_size**2)

        self.stride = stride
        self.padding = padding
        self.bias = nn.Parameter(torch.zeros(out_channel)) if bias else None

    def forward(self, input):
        return F.conv2d(
            input,
            self.weight * self.scale,
            bias=self.bias,
            stride=self.stride,
            padding=self.padding,
        )

    def __repr__(self):
        w = self.weight.shape
        return (f"{self.__class__.__name__}({w[1]}, {w[0]},"
                f" {w[2]}, stride={self.stride}, padding={self.padding})")
class EqualLinear(nn.Module):
    """Linear layer with equalized learning rate (weight stored divided by
    lr_mul, rescaled at forward time) and optional fused leaky-ReLU."""

    def __init__(self,
                 in_dim,
                 out_dim,
                 bias=True,
                 bias_init=0,
                 lr_mul=1,
                 activation=None):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(out_dim, in_dim).div_(lr_mul))
        self.bias = (nn.Parameter(torch.zeros(out_dim).fill_(bias_init))
                     if bias else None)

        self.activation = activation
        self.scale = (1 / math.sqrt(in_dim)) * lr_mul
        self.lr_mul = lr_mul

    def forward(self, input):
        if self.activation:
            # bias is applied inside the fused activation kernel
            pre = F.linear(input, self.weight * self.scale)
            return fused_leaky_relu(pre, self.bias * self.lr_mul)
        return F.linear(
            input, self.weight * self.scale, bias=self.bias * self.lr_mul)

    def __repr__(self):
        return (f"{self.__class__.__name__}({self.weight.shape[1]},"
                f" {self.weight.shape[0]})")
class ModulatedConv2d(nn.Module):
    """StyleGAN2 modulated convolution.

    A style vector is mapped (via EqualLinear) to per-input-channel gains
    that scale the conv weight per sample ('modulation'); optional weight
    renormalization ('demodulation') replaces instance norm. The upsampling
    and downsampling variants fuse resampling into a (transposed) grouped
    convolution and apply an FIR blur.
    """

    def __init__(
        self,
        in_channel,
        out_channel,
        kernel_size,
        style_dim,
        demodulate=True,
        upsample=False,
        downsample=False,
        blur_kernel=[1, 3, 3, 1],
    ):
        super().__init__()

        self.eps = 1e-8
        self.kernel_size = kernel_size
        self.in_channel = in_channel
        self.out_channel = out_channel
        self.upsample = upsample
        self.downsample = downsample

        if upsample:
            factor = 2
            # blur padding chosen so the output is exactly 2x the input
            p = (len(blur_kernel) - factor) - (kernel_size - 1)
            pad0 = (p + 1) // 2 + factor - 1
            pad1 = p // 2 + 1
            self.blur = Blur(
                blur_kernel, pad=(pad0, pad1), upsample_factor=factor)

        if downsample:
            factor = 2
            p = (len(blur_kernel) - factor) + (kernel_size - 1)
            pad0 = (p + 1) // 2
            pad1 = p // 2
            self.blur = Blur(blur_kernel, pad=(pad0, pad1))

        fan_in = in_channel * kernel_size**2
        # equalized learning-rate scale applied at forward time
        self.scale = 1 / math.sqrt(fan_in)
        self.padding = kernel_size // 2

        # leading batch dim of 1 so the weight broadcasts against the
        # per-sample style modulation in forward()
        self.weight = nn.Parameter(
            torch.randn(1, out_channel, in_channel, kernel_size, kernel_size))

        # affine map: style vector -> per-input-channel gains (bias_init=1
        # so an untrained style leaves channels unscaled)
        self.modulation = EqualLinear(style_dim, in_channel, bias_init=1)

        self.demodulate = demodulate

    def __repr__(self):
        return (
            f"{self.__class__.__name__}({self.in_channel}, {self.out_channel}, {self.kernel_size}, "
            f"upsample={self.upsample}, downsample={self.downsample})")

    def forward(self, input, style):
        batch, in_channel, height, width = input.shape

        # modulate: scale the shared weight per sample by style-derived gains
        style = self.modulation(style).view(batch, 1, in_channel, 1, 1)
        weight = self.scale * self.weight * style

        if self.demodulate:
            # demodulate: renormalize each output filter to unit norm
            demod = torch.rsqrt(weight.pow(2).sum([2, 3, 4]) + 1e-8)
            weight = weight * demod.view(batch, self.out_channel, 1, 1, 1)

        # fold batch into the channel dim and use grouped conv so each
        # sample is convolved with its own modulated weight
        weight = weight.view(batch * self.out_channel, in_channel,
                             self.kernel_size, self.kernel_size)

        if self.upsample:
            input = input.view(1, batch * in_channel, height, width)
            # conv_transpose2d expects (in, out, kH, kW) per group, so
            # swap the channel axes before flattening
            weight = weight.view(batch, self.out_channel, in_channel,
                                 self.kernel_size, self.kernel_size)
            weight = weight.transpose(1, 2).reshape(batch * in_channel,
                                                    self.out_channel,
                                                    self.kernel_size,
                                                    self.kernel_size)
            out = F.conv_transpose2d(
                input, weight, padding=0, stride=2, groups=batch)
            _, _, height, width = out.shape
            out = out.view(batch, self.out_channel, height, width)
            out = self.blur(out)

        elif self.downsample:
            input = self.blur(input)
            _, _, height, width = input.shape
            input = input.view(1, batch * in_channel, height, width)
            out = F.conv2d(input, weight, padding=0, stride=2, groups=batch)
            _, _, height, width = out.shape
            out = out.view(batch, self.out_channel, height, width)

        else:
            input = input.view(1, batch * in_channel, height, width)
            out = F.conv2d(input, weight, padding=self.padding, groups=batch)
            _, _, height, width = out.shape
            out = out.view(batch, self.out_channel, height, width)

        return out
class NoiseInjection(nn.Module):
    """Add per-pixel noise scaled by a single learned weight (init 0)."""

    def __init__(self):
        super().__init__()
        self.weight = nn.Parameter(torch.zeros(1))

    def forward(self, image, noise=None):
        if noise is None:
            # sample fresh noise matching the spatial size of the image
            b, _, h, w = image.shape
            noise = image.new_empty(b, 1, h, w).normal_()
        return self.weight * noise + image
class ConstantInput(nn.Module):
    """Learned constant tensor repeated to the batch size of its input."""

    def __init__(self, channel, size=4):
        super().__init__()
        self.input = nn.Parameter(torch.randn(1, channel, size, size))

    def forward(self, input):
        batch_size = input.shape[0]
        return self.input.repeat(batch_size, 1, 1, 1)
class StyledConv(nn.Module):
    """One StyleGAN2 layer: modulated conv -> noise injection -> fused
    leaky-ReLU activation."""

    def __init__(
        self,
        in_channel,
        out_channel,
        kernel_size,
        style_dim,
        upsample=False,
        blur_kernel=[1, 3, 3, 1],
        demodulate=True,
    ):
        super().__init__()

        self.conv = ModulatedConv2d(
            in_channel,
            out_channel,
            kernel_size,
            style_dim,
            upsample=upsample,
            blur_kernel=blur_kernel,
            demodulate=demodulate,
        )
        self.noise = NoiseInjection()
        self.activate = FusedLeakyReLU(out_channel)

    def forward(self, input, style, noise=None):
        modulated = self.conv(input, style)
        noisy = self.noise(modulated, noise=noise)
        return self.activate(noisy)
class ToRGB(nn.Module):
    """1x1 modulated conv to 3 RGB channels, with an optional skip branch
    that is upsampled and added (progressive-growing style)."""

    def __init__(self,
                 in_channel,
                 style_dim,
                 upsample=True,
                 blur_kernel=[1, 3, 3, 1]):
        super().__init__()

        if upsample:
            self.upsample = Upsample(blur_kernel)

        # no demodulation on the RGB projection
        self.conv = ModulatedConv2d(
            in_channel, 3, 1, style_dim, demodulate=False)
        self.bias = nn.Parameter(torch.zeros(1, 3, 1, 1))

    def forward(self, input, style, skip=None):
        rgb = self.conv(input, style) + self.bias

        if skip is not None:
            rgb = rgb + self.upsample(skip)

        return rgb
class Generator(nn.Module):
    def __init__(
        self,
        size,
        style_dim,
        n_mlp,
        channel_multiplier=2,
        blur_kernel=[1, 3, 3, 1],
        lr_mlp=0.01,
    ):
        """StyleGAN2 generator.

        Args:
            size: output resolution (power of two).
            style_dim: dimensionality of the z / w latent codes.
            n_mlp: number of EqualLinear layers in the style mapping network.
            channel_multiplier: width multiplier for higher resolutions.
            blur_kernel: FIR kernel used by the up/downsampling blurs.
            lr_mlp: learning-rate multiplier for the mapping network.
        """
        super().__init__()

        self.size = size

        self.style_dim = style_dim

        # mapping network: PixelNorm followed by n_mlp equalized linears
        layers = [PixelNorm()]

        for i in range(n_mlp):
            layers.append(
                EqualLinear(
                    style_dim,
                    style_dim,
                    lr_mul=lr_mlp,
                    activation="fused_lrelu"))

        # self.style = nn.Sequential(*layers)
        # kept as a ModuleList so style_forward() can skip the PixelNorm
        self.style = nn.ModuleList(layers)

        # channel count per resolution
        self.channels = {
            4: 512,
            8: 512,
            16: 512,
            32: 512,
            64: 256 * channel_multiplier,
            128: 128 * channel_multiplier,
            256: 64 * channel_multiplier,
            512: 32 * channel_multiplier,
            1024: 16 * channel_multiplier,
        }

        # learned constant 4x4 input, first styled conv and first ToRGB
        self.input = ConstantInput(self.channels[4])
        self.conv1 = StyledConv(
            self.channels[4],
            self.channels[4],
            3,
            style_dim,
            blur_kernel=blur_kernel)
        self.to_rgb1 = ToRGB(self.channels[4], style_dim, upsample=False)

        self.log_size = int(math.log(size, 2))
        self.num_layers = (self.log_size - 2) * 2 + 1

        self.convs = nn.ModuleList()
        self.upsamples = nn.ModuleList()
        self.to_rgbs = nn.ModuleList()
        self.noises = nn.Module()

        in_channel = self.channels[4]

        # pre-register fixed noise buffers (used when randomize_noise=False)
        for layer_idx in range(self.num_layers):
            res = (layer_idx + 5) // 2
            shape = [1, 1, 2**res, 2**res]
            self.noises.register_buffer(f"noise_{layer_idx}",
                                        torch.randn(*shape))

        # per resolution: one upsampling styled conv, one regular styled
        # conv, and a ToRGB output head
        for i in range(3, self.log_size + 1):
            out_channel = self.channels[2**i]

            self.convs.append(
                StyledConv(
                    in_channel,
                    out_channel,
                    3,
                    style_dim,
                    upsample=True,
                    blur_kernel=blur_kernel,
                ))

            self.convs.append(
                StyledConv(
                    out_channel,
                    out_channel,
                    3,
                    style_dim,
                    blur_kernel=blur_kernel))

            self.to_rgbs.append(ToRGB(out_channel, style_dim))

            in_channel = out_channel

        # number of w vectors consumed by one forward pass
        self.n_latent = self.log_size * 2 - 2
def make_noise(self):
device = self.input.input.device
noises = [torch.randn(1, 1, 2**2, 2**2, device=device)]
for i in range(3, self.log_size + 1):
for _ in range(2):
noises.append(torch.randn(1, 1, 2**i, 2**i, device=device))
return noises
def mean_latent(self, n_latent):
latent_in = torch.randn(
n_latent, self.style_dim, device=self.input.input.device)
latent = self.style_forward(latent_in).mean(0, keepdim=True)
return latent
def get_latent(self, input):
out = input
for i, layer in enumerate(self.style):
out = layer(out)
return out
def style_forward(self, input, skip_norm=False):
out = input
for i, layer in enumerate(self.style):
if i == 0 and skip_norm:
continue
out = layer(out)
return out
def forward(
self,
styles,
return_latents=False,
inject_index=None,
truncation=1,
truncation_latent=None,
input_is_latent=False,
noise=None,
randomize_noise=True,
):
if not input_is_latent:
styles = [self.style_forward(s) for s in styles]
if noise is None:
if randomize_noise:
noise = [None] * self.num_layers
else:
noise = [
getattr(self.noises, f"noise_{i}")
for i in range(self.num_layers)
]
if truncation < 1:
style_t = []
for style in styles:
style_t.append(truncation_latent + truncation *
(style - truncation_latent))
styles = style_t
if len(styles) < 2:
inject_index = self.n_latent
if styles[0].ndim < 3:
latent = styles[0].unsqueeze(1).repeat(1, inject_index, 1)
else:
latent = styles[0]
else:
if inject_index is None:
inject_index = random.randint(1, self.n_latent - 1)
latent = styles[0].unsqueeze(1).repeat(1, inject_index, 1)
latent2 = styles[1].unsqueeze(1).repeat(
1, self.n_latent - inject_index, 1)
latent = torch.cat([latent, latent2], 1)
out = self.input(latent)
out = self.conv1(out, latent[:, 0], noise=noise[0])
skip = self.to_rgb1(out, latent[:, 1])
i = 1
for conv1, conv2, noise1, noise2, to_rgb in zip(
self.convs[::2], self.convs[1::2], noise[1::2], noise[2::2],
self.to_rgbs):
out = conv1(out, latent[:, i], noise=noise1)
out = conv2(out, latent[:, i + 1], noise=noise2)
skip = to_rgb(out, latent[:, i + 2], skip)
i += 2
image = skip
if return_latents:
return image, latent
else:
return image, None
class ConvLayer(nn.Sequential):
    """Equalized conv layer with optional blur-downsampling and fused activation.

    When downsampling, an FIR blur precedes a stride-2 conv; otherwise the
    conv is stride 1 with 'same'-style padding.
    """

    def __init__(
        self,
        in_channel,
        out_channel,
        kernel_size,
        downsample=False,
        blur_kernel=[1, 3, 3, 1],
        bias=True,
        activate=True,
    ):
        layers = []

        if downsample:
            factor = 2
            # Padding so the blur + stride-2 conv halves the resolution exactly.
            p = (len(blur_kernel) - factor) + (kernel_size - 1)
            pad0 = (p + 1) // 2
            pad1 = p // 2

            layers.append(Blur(blur_kernel, pad=(pad0, pad1)))

            stride = 2
            # Plain int attribute; setting it before super().__init__ is fine
            # since it is not a Parameter or Module.
            self.padding = 0

        else:
            stride = 1
            self.padding = kernel_size // 2

        layers.append(
            EqualConv2d(
                in_channel,
                out_channel,
                kernel_size,
                padding=self.padding,
                stride=stride,
                # When an activation follows, the FusedLeakyReLU owns the bias.
                bias=bias and not activate,
            ))

        if activate:
            layers.append(FusedLeakyReLU(out_channel, bias=bias))

        super().__init__(*layers)
class ResBlock(nn.Module):
    """Residual block that halves spatial resolution: two 3x3 convs on the
    main path and a bias-free 1x1 downsampling conv on the shortcut."""

    def __init__(self, in_channel, out_channel, blur_kernel=[1, 3, 3, 1]):
        super().__init__()

        self.conv1 = ConvLayer(in_channel, in_channel, 3)
        self.conv2 = ConvLayer(in_channel, out_channel, 3, downsample=True)
        self.skip = ConvLayer(
            in_channel,
            out_channel,
            1,
            downsample=True,
            activate=False,
            bias=False)

    def forward(self, input):
        residual = self.conv2(self.conv1(input))
        shortcut = self.skip(input)
        # Scale the sum so output variance matches the inputs'.
        return (residual + shortcut) / math.sqrt(2)
class Discriminator(nn.Module):
    """StyleGAN2 discriminator: residual downsampling trunk, a minibatch
    standard-deviation layer, and a two-layer equalized-linear head.

    Args:
        size (int): Input resolution, a power of two.
        channel_multiplier (int): Width multiplier for resolutions >= 64.
        blur_kernel (list): FIR taps used by the downsampling blurs.
    """

    def __init__(self, size, channel_multiplier=2, blur_kernel=[1, 3, 3, 1]):
        super().__init__()

        # Feature channel count at each resolution.
        channels = {
            4: 512,
            8: 512,
            16: 512,
            32: 512,
            64: 256 * channel_multiplier,
            128: 128 * channel_multiplier,
            256: 64 * channel_multiplier,
            512: 32 * channel_multiplier,
            1024: 16 * channel_multiplier,
        }

        # 1x1 stem, then one ResBlock per halving from `size` down to 8x8.
        convs = [ConvLayer(3, channels[size], 1)]

        log_size = int(math.log(size, 2))

        in_channel = channels[size]

        for i in range(log_size, 2, -1):
            out_channel = channels[2**(i - 1)]
            convs.append(ResBlock(in_channel, out_channel, blur_kernel))
            in_channel = out_channel

        self.convs = nn.Sequential(*convs)

        # Minibatch-stddev settings: group size and feature-split count.
        self.stddev_group = 4
        self.stddev_feat = 1

        # +1 input channel for the appended stddev map.
        self.final_conv = ConvLayer(in_channel + 1, channels[4], 3)
        self.final_linear = nn.Sequential(
            EqualLinear(
                channels[4] * 4 * 4, channels[4], activation="fused_lrelu"),
            EqualLinear(channels[4], 1),
        )

    def forward(self, input):
        """Return per-sample realism logits of shape (batch, 1)."""
        out = self.convs(input)

        batch, channel, height, width = out.shape
        # Minibatch stddev: append the mean per-group feature stddev as an
        # extra channel so the network can sense low sample diversity.
        group = min(batch, self.stddev_group)
        stddev = out.view(group, -1, self.stddev_feat,
                          channel // self.stddev_feat, height, width)
        stddev = torch.sqrt(stddev.var(0, unbiased=False) + 1e-8)
        stddev = stddev.mean([2, 3, 4], keepdims=True).squeeze(2)
        stddev = stddev.repeat(group, 1, height, width)
        out = torch.cat([out, stddev], 1)

        out = self.final_conv(out)

        # Flatten the 4x4 feature map into the linear head.
        out = out.view(batch, -1)
        out = self.final_linear(out)

        return out
================================================
FILE: models/archs/stylegan2/non_leaking.py
================================================
import math
import torch
from torch.nn import functional as F
from distributed import reduce_sum
from op import upfirdn2d
class AdaptiveAugment:
    """Adaptive discriminator augmentation (ADA) probability controller.

    Accumulates the sign of the discriminator's predictions on real images
    and nudges the augmentation probability `ada_aug_p` so the running
    statistic tracks `ada_aug_target`.

    Args:
        ada_aug_target (float): Target value of the mean prediction sign.
        ada_aug_len (int): Number of samples over which a full 0 -> 1 sweep
            of the probability would occur.
        update_every (int): Adjust the probability after this many
            accumulated predictions.
        device: Device of the accumulation buffer.
    """

    def __init__(self, ada_aug_target, ada_aug_len, update_every, device):
        self.ada_aug_target = ada_aug_target
        self.ada_aug_len = ada_aug_len
        self.update_every = update_every

        # Buffer of [sum of prediction signs, number of predictions].
        self.ada_aug_buf = torch.tensor([0.0, 0.0], device=device)
        self.r_t_stat = 0
        self.ada_aug_p = 0

    @torch.no_grad()
    def tune(self, real_pred):
        """Accumulate signs of `real_pred`; periodically update and return ada_aug_p."""
        ada_aug_data = torch.tensor(
            (torch.sign(real_pred).sum().item(), real_pred.shape[0]),
            device=real_pred.device,
        )
        # Aggregate the statistic across distributed workers.
        self.ada_aug_buf += reduce_sum(ada_aug_data)

        if self.ada_aug_buf[1] > self.update_every - 1:
            pred_signs, n_pred = self.ada_aug_buf.tolist()

            self.r_t_stat = pred_signs / n_pred

            # Above target -> discriminator too confident on reals ->
            # increase augmentation; otherwise decrease it.
            if self.r_t_stat > self.ada_aug_target:
                sign = 1
            else:
                sign = -1

            self.ada_aug_p += sign * n_pred / self.ada_aug_len
            self.ada_aug_p = min(1, max(0, self.ada_aug_p))
            self.ada_aug_buf.mul_(0)

        return self.ada_aug_p
# Antialiasing FIR filter taps used when resampling during affine
# augmentation (12 taps; per the name, presumably a sym6 wavelet low-pass —
# TODO confirm against the ADA reference implementation).
SYM6 = (
    0.015404109327027373,
    0.0034907120842174702,
    -0.11799011114819057,
    -0.048311742585633,
    0.4910559419267466,
    0.787641141030194,
    0.3379294217276218,
    -0.07263752278646252,
    -0.021060292512300564,
    0.04472490177066578,
    0.0017677118642428036,
    -0.007800708325034148,
)
def translate_mat(t_x, t_y):
    """Batched 3x3 homogeneous 2D translation matrices from per-sample offsets."""
    n = t_x.shape[0]
    out = torch.eye(3).unsqueeze(0).repeat(n, 1, 1)
    out[:, :2, 2] = torch.stack((t_x, t_y), 1)
    return out
def rotate_mat(theta):
    """Batched 3x3 homogeneous 2D rotation matrices for angles `theta`."""
    n = theta.shape[0]
    out = torch.eye(3).unsqueeze(0).repeat(n, 1, 1)
    c, s = torch.cos(theta), torch.sin(theta)
    out[:, :2, :2] = torch.stack((c, -s, s, c), 1).view(n, 2, 2)
    return out
def scale_mat(s_x, s_y):
    """Batched 3x3 homogeneous 2D scaling matrices."""
    n = s_x.shape[0]
    out = torch.eye(3).unsqueeze(0).repeat(n, 1, 1)
    out[:, 0, 0] = s_x
    out[:, 1, 1] = s_y
    return out
def translate3d_mat(t_x, t_y, t_z):
    """Batched 4x4 homogeneous 3D translation matrices."""
    n = t_x.shape[0]
    out = torch.eye(4).unsqueeze(0).repeat(n, 1, 1)
    out[:, :3, 3] = torch.stack((t_x, t_y, t_z), 1)
    return out
def rotate3d_mat(axis, theta):
    """Batched 4x4 rotation matrices about a fixed 3D axis (Rodrigues formula)."""
    n = theta.shape[0]
    u_x, u_y, u_z = axis

    ident = torch.eye(3).unsqueeze(0)
    skew = torch.tensor([(0, -u_z, u_y), (u_z, 0, -u_x), (-u_y, u_x, 0)]).unsqueeze(0)
    axis_t = torch.tensor(axis)
    outer = (axis_t.unsqueeze(1) * axis_t).unsqueeze(0)

    s = torch.sin(theta).view(-1, 1, 1)
    c = torch.cos(theta).view(-1, 1, 1)
    rot = c * ident + s * skew + (1 - c) * outer

    out = torch.eye(4).unsqueeze(0).repeat(n, 1, 1)
    out[:, :3, :3] = rot
    return out
def scale3d_mat(s_x, s_y, s_z):
    """Batched 4x4 homogeneous 3D scaling matrices."""
    n = s_x.shape[0]
    out = torch.eye(4).unsqueeze(0).repeat(n, 1, 1)
    out[:, 0, 0] = s_x
    out[:, 1, 1] = s_y
    out[:, 2, 2] = s_z
    return out
def luma_flip_mat(axis, i):
    """Batched 4x4 Householder-style luma flip about `axis`, gated per sample by i."""
    n = i.shape[0]
    base = torch.eye(4).unsqueeze(0).repeat(n, 1, 1)
    a = torch.tensor(axis + (0,))  # lift to homogeneous coordinates
    return base - 2 * torch.ger(a, a) * i.view(-1, 1, 1)
def saturation_mat(axis, i):
    """Batched 4x4 saturation matrices: blend between gray projection and identity by i."""
    n = i.shape[0]
    base = torch.eye(4).unsqueeze(0).repeat(n, 1, 1)
    a = torch.tensor(axis + (0,))  # lift to homogeneous coordinates
    proj = torch.ger(a, a)
    return proj + (base - proj) * i.view(-1, 1, 1)
def lognormal_sample(size, mean=0, std=1):
    """Draw `size` log-normal samples with the given underlying mean/std."""
    out = torch.empty(size)
    return out.log_normal_(mean=mean, std=std)
def category_sample(size, categories):
    """Pick `size` values uniformly at random from the tuple `categories`."""
    choices = torch.tensor(categories)
    idx = torch.randint(high=len(categories), size=(size,))
    return choices[idx]
def uniform_sample(size, low, high):
    """Draw `size` samples uniformly from [low, high)."""
    out = torch.empty(size)
    return out.uniform_(low, high)
def normal_sample(size, mean=0, std=1):
    """Draw `size` Gaussian samples with the given mean and std."""
    out = torch.empty(size)
    return out.normal_(mean, std)
def bernoulli_sample(size, p):
    """Draw `size` Bernoulli(p) samples (0.0 or 1.0)."""
    out = torch.empty(size)
    return out.bernoulli_(p)
def random_mat_apply(p, transform, prev, eye):
    """Per sample, left-multiply `prev` by `transform` with probability p, else by `eye`."""
    n = transform.shape[0]
    mask = bernoulli_sample(n, p).view(n, 1, 1)
    chosen = mask * transform + (1 - mask) * eye
    return chosen @ prev
def sample_affine(p, size, height, width):
    """Sample a batch of random 3x3 affine augmentation matrices.

    Each elementary transform (flip, 90-degree rotation, translation,
    scaling, rotation) is applied independently with probability `p`
    (rotations use `p_rot`), accumulated into G by left-multiplication.
    """
    G = torch.eye(3).unsqueeze(0).repeat(size, 1, 1)
    eye = G

    # flip
    param = category_sample(size, (0, 1))
    Gc = scale_mat(1 - 2.0 * param, torch.ones(size))
    G = random_mat_apply(p, Gc, G, eye)
    # print('flip', G, scale_mat(1 - 2.0 * param, torch.ones(size)), sep='\n')

    # 90 rotate
    param = category_sample(size, (0, 3))
    Gc = rotate_mat(-math.pi / 2 * param)
    G = random_mat_apply(p, Gc, G, eye)
    # print('90 rotate', G, rotate_mat(-math.pi / 2 * param), sep='\n')

    # integer translate (offsets snapped to whole pixels)
    param = uniform_sample(size, -0.125, 0.125)
    param_height = torch.round(param * height) / height
    param_width = torch.round(param * width) / width
    Gc = translate_mat(param_width, param_height)
    G = random_mat_apply(p, Gc, G, eye)
    # print('integer translate', G, translate_mat(param_width, param_height), sep='\n')

    # isotropic scale
    param = lognormal_sample(size, std=0.2 * math.log(2))
    Gc = scale_mat(param, param)
    G = random_mat_apply(p, Gc, G, eye)
    # print('isotropic scale', G, scale_mat(param, param), sep='\n')

    # Rotation happens both before and after the anisotropic scale; the split
    # probability keeps the chance of any rotation at p overall.
    p_rot = 1 - math.sqrt(1 - p)

    # pre-rotate
    param = uniform_sample(size, -math.pi, math.pi)
    Gc = rotate_mat(-param)
    G = random_mat_apply(p_rot, Gc, G, eye)
    # print('pre-rotate', G, rotate_mat(-param), sep='\n')

    # anisotropic scale
    param = lognormal_sample(size, std=0.2 * math.log(2))
    Gc = scale_mat(param, 1 / param)
    G = random_mat_apply(p, Gc, G, eye)
    # print('anisotropic scale', G, scale_mat(param, 1 / param), sep='\n')

    # post-rotate
    param = uniform_sample(size, -math.pi, math.pi)
    Gc = rotate_mat(-param)
    G = random_mat_apply(p_rot, Gc, G, eye)
    # print('post-rotate', G, rotate_mat(-param), sep='\n')

    # fractional translate
    param = normal_sample(size, std=0.125)
    Gc = translate_mat(param, param)
    G = random_mat_apply(p, Gc, G, eye)
    # print('fractional translate', G, translate_mat(param, param), sep='\n')

    return G
def sample_color(p, size):
    """Sample a batch of random 4x4 color augmentation matrices.

    Brightness, contrast, luma flip, hue rotation, and saturation are each
    applied independently with probability `p`, accumulated into C.
    """
    C = torch.eye(4).unsqueeze(0).repeat(size, 1, 1)
    eye = C
    # Unit luma axis used by the flip / hue / saturation transforms.
    axis_val = 1 / math.sqrt(3)
    axis = (axis_val, axis_val, axis_val)

    # brightness
    param = normal_sample(size, std=0.2)
    Cc = translate3d_mat(param, param, param)
    C = random_mat_apply(p, Cc, C, eye)

    # contrast
    param = lognormal_sample(size, std=0.5 * math.log(2))
    Cc = scale3d_mat(param, param, param)
    C = random_mat_apply(p, Cc, C, eye)

    # luma flip
    param = category_sample(size, (0, 1))
    Cc = luma_flip_mat(axis, param)
    C = random_mat_apply(p, Cc, C, eye)

    # hue rotation
    param = uniform_sample(size, -math.pi, math.pi)
    Cc = rotate3d_mat(axis, param)
    C = random_mat_apply(p, Cc, C, eye)

    # saturation
    param = lognormal_sample(size, std=1 * math.log(2))
    Cc = saturation_mat(axis, param)
    C = random_mat_apply(p, Cc, C, eye)

    return C
def make_grid(shape, x0, x1, y0, y1, device):
    """Build an (n, h, w, 3) grid of homogeneous (x, y, 1) sample coordinates."""
    n, _, h, w = shape
    g = torch.empty(n, h, w, 3, device=device)
    g[:, :, :, 0] = torch.linspace(x0, x1, w, device=device)
    g[:, :, :, 1] = torch.linspace(y0, y1, h, device=device).unsqueeze(-1)
    g[:, :, :, 2] = 1
    return g
def affine_grid(grid, mat):
    """Apply batched 2x3 affine matrices to a homogeneous coordinate grid."""
    n, h, w, _ = grid.shape
    flat = grid.view(n, h * w, 3)
    return (flat @ mat.transpose(1, 2)).view(n, h, w, 2)
def get_padding(G, height, width):
    """Padding (x_lo, x_hi, y_lo, y_hi) in pixels needed so the warped
    corners of the [-1, 1] unit square stay inside the canvas."""
    corners = torch.tensor([(-1.0, -1, 1), (-1, 1, 1), (1, -1, 1), (1, 1, 1)]).t()
    extreme = G[:, :2, :] @ corners
    size = torch.tensor((width, height))

    # How far past the low edge (-1) the corners reach, in pixels.
    pad_low = (
        ((extreme.min(-1).values + 1) * size)
        .clamp(max=0)
        .abs()
        .ceil()
        .max(0)
        .values.to(torch.int64)
        .tolist()
    )
    # How far past the high edge (+1) the corners reach, in pixels.
    pad_high = (
        (extreme.max(-1).values * size - size)
        .clamp(min=0)
        .ceil()
        .max(0)
        .values.to(torch.int64)
        .tolist()
    )
    return pad_low[0], pad_high[0], pad_low[1], pad_high[1]
def try_sample_affine_and_pad(img, p, pad_k, G=None):
    """Sample an affine matrix (unless `G` is given) and reflect-pad `img`
    so the inverse warp stays inside the padded canvas.

    Re-samples G whenever the required reflect padding exceeds what
    F.pad(mode='reflect') can deliver (padding >= input size raises).

    Returns (padded image, matrix used, (pad_x1, pad_x2, pad_y1, pad_y2)).
    """
    batch, _, height, width = img.shape

    G_try = G

    # NOTE(review): if G is supplied and F.pad raises, this loop retries
    # with the same G forever — presumably callers only hit the reflect
    # limit when G=None (training); confirm.
    while True:
        if G is None:
            G_try = sample_affine(p, batch, height, width)

        pad_x1, pad_x2, pad_y1, pad_y2 = get_padding(
            torch.inverse(G_try), height, width
        )

        try:
            img_pad = F.pad(
                img,
                (pad_x1 + pad_k, pad_x2 + pad_k, pad_y1 + pad_k, pad_y2 + pad_k),
                mode="reflect",
            )

        except RuntimeError:
            continue

        break

    return img_pad, G_try, (pad_x1, pad_x2, pad_y1, pad_y2)
def random_apply_affine(img, p, G=None, antialiasing_kernel=SYM6):
    """Warp `img` by a random (or supplied) affine matrix with antialiasing.

    Pipeline: reflect-pad, 2x upsample with the FIR kernel, bilinear
    grid_sample warp, FIR-filtered 2x downsample, then crop the padding.

    Returns (augmented image, affine matrix used).
    """
    kernel = antialiasing_kernel
    len_k = len(kernel)
    pad_k = (len_k + 1) // 2

    # Separable taps -> 2D kernel; the flipped copy is used for upsampling.
    kernel = torch.as_tensor(kernel)
    kernel = torch.ger(kernel, kernel).to(img)
    kernel_flip = torch.flip(kernel, (0, 1))

    img_pad, G, (pad_x1, pad_x2, pad_y1, pad_y2) = try_sample_affine_and_pad(
        img, p, pad_k, G
    )

    p_ux1 = pad_x1
    p_ux2 = pad_x2 + 1
    p_uy1 = pad_y1
    p_uy2 = pad_y2 + 1
    # Extent after FIR filtering removes the kernel border.
    w_p = img_pad.shape[3] - len_k + 1
    h_p = img_pad.shape[2] - len_k + 1
    h_o = img.shape[2]
    w_o = img.shape[3]

    img_2x = upfirdn2d(img_pad, kernel_flip, up=2)

    # Sampling grid in original-image normalized coordinates.
    grid = make_grid(
        img_2x.shape,
        -2 * p_ux1 / w_o - 1,
        2 * (w_p - p_ux1) / w_o - 1,
        -2 * p_uy1 / h_o - 1,
        2 * (h_p - p_uy1) / h_o - 1,
        device=img_2x.device,
    ).to(img_2x)
    grid = affine_grid(grid, torch.inverse(G)[:, :2, :].to(img_2x))
    # Rescale from original-image coordinates back into the padded frame.
    grid = grid * torch.tensor(
        [w_o / w_p, h_o / h_p], device=grid.device
    ) + torch.tensor(
        [(w_o + 2 * p_ux1) / w_p - 1, (h_o + 2 * p_uy1) / h_p - 1], device=grid.device
    )

    img_affine = F.grid_sample(
        img_2x, grid, mode="bilinear", align_corners=False, padding_mode="zeros"
    )

    img_down = upfirdn2d(img_affine, kernel, down=2)

    # Crop the padding; a 0 end-index would slice nothing, so substitute the
    # full extent in that case.
    end_y = -pad_y2 - 1
    if end_y == 0:
        end_y = img_down.shape[2]

    end_x = -pad_x2 - 1
    if end_x == 0:
        end_x = img_down.shape[3]

    img = img_down[:, :, pad_y1:end_y, pad_x1:end_x]

    return img, G
def apply_color(img, mat):
    """Apply a batched 4x4 affine color transform to an NCHW image."""
    n = img.shape[0]
    channels_last = img.permute(0, 2, 3, 1)
    linear = mat[:, :3, :3].transpose(1, 2).view(n, 1, 3, 3)
    offset = mat[:, :3, 3].view(n, 1, 1, 3)
    channels_last = channels_last @ linear + offset
    return channels_last.permute(0, 3, 1, 2)
def random_apply_color(img, p, C=None):
    """Apply a given (or freshly sampled) random color transform to `img`."""
    color_mat = C if C is not None else sample_color(p, img.shape[0])
    return apply_color(img, color_mat.to(img)), color_mat
def augment(img, p, transform_matrix=(None, None)):
    """Apply random geometric then color augmentation with probability p.

    Returns the augmented image and the (affine, color) matrices used, so
    the same augmentation can be replayed on another tensor.
    """
    img, affine_m = random_apply_affine(img, p, transform_matrix[0])
    img, color_m = random_apply_color(img, p, transform_matrix[1])
    return img, (affine_m, color_m)
================================================
FILE: models/archs/stylegan2/op/__init__.py
================================================
from .fused_act import FusedLeakyReLU, fused_leaky_relu
from .upfirdn2d import upfirdn2d
================================================
FILE: models/archs/stylegan2/op/fused_act.py
================================================
import os
import torch
from torch import nn
from torch.nn import functional as F
from torch.autograd import Function
from torch.utils.cpp_extension import load
module_path = os.path.dirname(__file__)
# JIT-compile the fused bias+activation CUDA extension at import time
# (torch caches the build after the first compilation).
fused = load(
    "fused",
    sources=[
        os.path.join(module_path, "fused_bias_act.cpp"),
        os.path.join(module_path, "fused_bias_act_kernel.cu"),
    ],
)
class FusedLeakyReLUFunctionBackward(Function):
    """Backward pass of FusedLeakyReLUFunction, implemented as its own
    autograd Function so double backward works through the CUDA op."""

    @staticmethod
    def forward(ctx, grad_output, out, bias, negative_slope, scale):
        ctx.save_for_backward(out)
        ctx.negative_slope = negative_slope
        ctx.scale = scale

        empty = grad_output.new_empty(0)

        # act=3 (leaky ReLU), grad=1: gate grad_output by the sign of the
        # saved forward output `out`.
        grad_input = fused.fused_bias_act(
            grad_output, empty, out, 3, 1, negative_slope, scale
        )

        # Reduce over every dim except the channel dim for the bias gradient.
        dim = [0]

        if grad_input.ndim > 2:
            dim += list(range(2, grad_input.ndim))

        if bias:
            grad_bias = grad_input.sum(dim).detach()

        else:
            grad_bias = empty

        return grad_input, grad_bias

    @staticmethod
    def backward(ctx, gradgrad_input, gradgrad_bias):
        out, = ctx.saved_tensors
        # The same gating applies to the incoming second-order gradients.
        gradgrad_out = fused.fused_bias_act(
            gradgrad_input, gradgrad_bias, out, 3, 1, ctx.negative_slope, ctx.scale
        )

        return gradgrad_out, None, None, None, None
class FusedLeakyReLUFunction(Function):
    """Autograd wrapper around the fused bias + leaky-ReLU CUDA op."""

    @staticmethod
    def forward(ctx, input, bias, negative_slope, scale):
        empty = input.new_empty(0)

        ctx.bias = bias is not None

        if bias is None:
            bias = empty

        # act=3 (leaky ReLU), grad=0 (forward pass).
        out = fused.fused_bias_act(input, bias, empty, 3, 0, negative_slope, scale)
        # The output (not the input) is saved: its sign drives the gradient
        # gating in the backward op.
        ctx.save_for_backward(out)
        ctx.negative_slope = negative_slope
        ctx.scale = scale

        return out

    @staticmethod
    def backward(ctx, grad_output):
        out, = ctx.saved_tensors

        grad_input, grad_bias = FusedLeakyReLUFunctionBackward.apply(
            grad_output, out, ctx.bias, ctx.negative_slope, ctx.scale
        )

        if not ctx.bias:
            grad_bias = None

        return grad_input, grad_bias, None, None
class FusedLeakyReLU(nn.Module):
    """Leaky ReLU with a learnable per-channel bias, fused on CUDA."""

    def __init__(self, channel, bias=True, negative_slope=0.2, scale=2 ** 0.5):
        super().__init__()

        if bias:
            self.bias = nn.Parameter(torch.zeros(channel))
        else:
            # No learnable bias; the functional form skips the add.
            self.bias = None

        self.negative_slope = negative_slope
        self.scale = scale

    def forward(self, input):
        return fused_leaky_relu(input, self.bias, self.negative_slope, self.scale)
def fused_leaky_relu(input, bias=None, negative_slope=0.2, scale=2 ** 0.5):
    """Leaky ReLU with optional channel bias and output scaling.

    Dispatches to the fused CUDA op on GPU tensors; falls back to plain
    torch ops on CPU.  NOTE: the CPU path hard-codes slope 0.2 regardless
    of `negative_slope`, mirroring the original implementation.
    """
    if input.device.type != "cpu":
        return FusedLeakyReLUFunction.apply(input, bias, negative_slope, scale)

    if bias is None:
        return F.leaky_relu(input, negative_slope=0.2) * scale

    # Broadcast the per-channel bias over any trailing spatial dims.
    rest_dim = [1] * (input.ndim - bias.ndim - 1)
    shifted = input + bias.view(1, bias.shape[0], *rest_dim)
    return F.leaky_relu(shifted, negative_slope=0.2) * scale
================================================
FILE: models/archs/stylegan2/op/fused_bias_act.cpp
================================================
#include
torch::Tensor fused_bias_act_op(const torch::Tensor& input, const torch::Tensor& bias, const torch::Tensor& refer,
int act, int grad, float alpha, float scale);
#define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
torch::Tensor fused_bias_act(const torch::Tensor& input, const torch::Tensor& bias, const torch::Tensor& refer,
int act, int grad, float alpha, float scale) {
CHECK_CUDA(input);
CHECK_CUDA(bias);
return fused_bias_act_op(input, bias, refer, act, grad, alpha, scale);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("fused_bias_act", &fused_bias_act, "fused bias act (CUDA)");
}
================================================
FILE: models/archs/stylegan2/op/fused_bias_act_kernel.cu
================================================
// Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
//
// This work is made available under the Nvidia Source Code License-NC.
// To view a copy of this license, visit
// https://nvlabs.github.io/stylegan2/license.html
#include
#include
#include
#include
#include
#include
#include
// Elementwise fused bias + activation kernel; the branch is selected by
// act * 10 + grad (act=1: linear, act=3: leaky ReLU with slope `alpha`).
// Fix: the extraction dropped the angle-bracketed template parameter list.
template <typename scalar_t>
static __global__ void fused_bias_act_kernel(scalar_t* out, const scalar_t* p_x, const scalar_t* p_b, const scalar_t* p_ref,
    int act, int grad, scalar_t alpha, scalar_t scale, int loop_x, int size_x, int step_b, int size_b, int use_bias, int use_ref) {
    int xi = blockIdx.x * loop_x * blockDim.x + threadIdx.x;

    scalar_t zero = 0.0;

    // Each thread processes up to loop_x elements, strided by blockDim.x.
    for (int loop_idx = 0; loop_idx < loop_x && xi < size_x; loop_idx++, xi += blockDim.x) {
        scalar_t x = p_x[xi];

        if (use_bias) {
            x += p_b[(xi / step_b) % size_b];  // bias broadcast over the channel dim
        }

        scalar_t ref = use_ref ? p_ref[xi] : zero;

        scalar_t y;

        switch (act * 10 + grad) {
        default:
        case 10: y = x; break;                               // linear, forward
        case 11: y = x; break;                               // linear, 1st-order grad
        case 12: y = 0.0; break;                             // linear, 2nd-order grad
        case 30: y = (x > 0.0) ? x : x * alpha; break;       // leaky ReLU, forward
        case 31: y = (ref > 0.0) ? x : x * alpha; break;     // grad, gated by saved output
        case 32: y = 0.0; break;                             // leaky ReLU, 2nd-order grad
        }

        out[xi] = y * scale;
    }
}
// Host-side launcher: flattens the tensors, derives the bias broadcast
// stride, and dispatches the kernel on the current CUDA stream.
// Fix: the extraction stripped the <scalar_t> instantiation, the
// <<<grid, block, shmem, stream>>> launch configuration, and the
// data_ptr<scalar_t>() template arguments.
torch::Tensor fused_bias_act_op(const torch::Tensor& input, const torch::Tensor& bias, const torch::Tensor& refer,
    int act, int grad, float alpha, float scale) {
    int curDevice = -1;
    cudaGetDevice(&curDevice);
    cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice);

    auto x = input.contiguous();
    auto b = bias.contiguous();
    auto ref = refer.contiguous();

    int use_bias = b.numel() ? 1 : 0;
    int use_ref = ref.numel() ? 1 : 0;

    int size_x = x.numel();
    int size_b = b.numel();
    int step_b = 1;

    // Bias is broadcast over dim 1: step_b is the stride of one channel slice.
    for (int i = 1 + 1; i < x.dim(); i++) {
        step_b *= x.size(i);
    }

    int loop_x = 4;
    int block_size = 4 * 32;
    int grid_size = (size_x - 1) / (loop_x * block_size) + 1;

    auto y = torch::empty_like(x);

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "fused_bias_act_kernel", [&] {
        fused_bias_act_kernel<scalar_t><<<grid_size, block_size, 0, stream>>>(
            y.data_ptr<scalar_t>(),
            x.data_ptr<scalar_t>(),
            b.data_ptr<scalar_t>(),
            ref.data_ptr<scalar_t>(),
            act,
            grad,
            alpha,
            scale,
            loop_x,
            size_x,
            step_b,
            size_b,
            use_bias,
            use_ref
        );
    });

    return y;
}
================================================
FILE: models/archs/stylegan2/op/upfirdn2d.cpp
================================================
#include
torch::Tensor upfirdn2d_op(const torch::Tensor& input, const torch::Tensor& kernel,
int up_x, int up_y, int down_x, int down_y,
int pad_x0, int pad_x1, int pad_y0, int pad_y1);
#define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
torch::Tensor upfirdn2d(const torch::Tensor& input, const torch::Tensor& kernel,
int up_x, int up_y, int down_x, int down_y,
int pad_x0, int pad_x1, int pad_y0, int pad_y1) {
CHECK_CUDA(input);
CHECK_CUDA(kernel);
return upfirdn2d_op(input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("upfirdn2d", &upfirdn2d, "upfirdn2d (CUDA)");
}
================================================
FILE: models/archs/stylegan2/op/upfirdn2d.py
================================================
import os
import torch
from torch.nn import functional as F
from torch.autograd import Function
from torch.utils.cpp_extension import load
module_path = os.path.dirname(__file__)
# JIT-compile the upfirdn2d CUDA extension at import time (cached by torch
# after the first build).
upfirdn2d_op = load(
    "upfirdn2d",
    sources=[
        os.path.join(module_path, "upfirdn2d.cpp"),
        os.path.join(module_path, "upfirdn2d_kernel.cu"),
    ],
)
class UpFirDn2dBackward(Function):
    """Backward pass of UpFirDn2d, itself autograd-capable for double backward.

    The gradient of upfirdn2d is another upfirdn2d call with up/down swapped,
    the kernel flipped, and adjusted ("gradient") padding.
    """

    @staticmethod
    def forward(
        ctx, grad_output, kernel, grad_kernel, up, down, pad, g_pad, in_size, out_size
    ):

        up_x, up_y = up
        down_x, down_y = down
        g_pad_x0, g_pad_x1, g_pad_y0, g_pad_y1 = g_pad

        grad_output = grad_output.reshape(-1, out_size[0], out_size[1], 1)

        # Transposed op: down/up swapped, pre-flipped kernel, gradient padding.
        grad_input = upfirdn2d_op.upfirdn2d(
            grad_output,
            grad_kernel,
            down_x,
            down_y,
            up_x,
            up_y,
            g_pad_x0,
            g_pad_x1,
            g_pad_y0,
            g_pad_y1,
        )
        grad_input = grad_input.view(in_size[0], in_size[1], in_size[2], in_size[3])

        ctx.save_for_backward(kernel)

        pad_x0, pad_x1, pad_y0, pad_y1 = pad

        # Stash the forward geometry for the double-backward pass.
        ctx.up_x = up_x
        ctx.up_y = up_y
        ctx.down_x = down_x
        ctx.down_y = down_y
        ctx.pad_x0 = pad_x0
        ctx.pad_x1 = pad_x1
        ctx.pad_y0 = pad_y0
        ctx.pad_y1 = pad_y1
        ctx.in_size = in_size
        ctx.out_size = out_size

        return grad_input

    @staticmethod
    def backward(ctx, gradgrad_input):
        kernel, = ctx.saved_tensors

        gradgrad_input = gradgrad_input.reshape(-1, ctx.in_size[2], ctx.in_size[3], 1)

        # Double backward replays the original forward geometry.
        gradgrad_out = upfirdn2d_op.upfirdn2d(
            gradgrad_input,
            kernel,
            ctx.up_x,
            ctx.up_y,
            ctx.down_x,
            ctx.down_y,
            ctx.pad_x0,
            ctx.pad_x1,
            ctx.pad_y0,
            ctx.pad_y1,
        )
        # gradgrad_out = gradgrad_out.view(ctx.in_size[0], ctx.out_size[0], ctx.out_size[1], ctx.in_size[3])
        gradgrad_out = gradgrad_out.view(
            ctx.in_size[0], ctx.in_size[1], ctx.out_size[0], ctx.out_size[1]
        )

        return gradgrad_out, None, None, None, None, None, None, None, None
class UpFirDn2d(Function):
    """Autograd wrapper around the upfirdn2d CUDA op (upsample, FIR filter,
    downsample) for NCHW inputs."""

    @staticmethod
    def forward(ctx, input, kernel, up, down, pad):
        up_x, up_y = up
        down_x, down_y = down
        pad_x0, pad_x1, pad_y0, pad_y1 = pad

        kernel_h, kernel_w = kernel.shape
        batch, channel, in_h, in_w = input.shape
        ctx.in_size = input.shape

        # The CUDA op works on a (major, h, w, minor) layout.
        input = input.reshape(-1, in_h, in_w, 1)

        ctx.save_for_backward(kernel, torch.flip(kernel, [0, 1]))

        out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h) // down_y + 1
        out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w) // down_x + 1
        ctx.out_size = (out_h, out_w)

        ctx.up = (up_x, up_y)
        ctx.down = (down_x, down_y)
        ctx.pad = (pad_x0, pad_x1, pad_y0, pad_y1)

        # Padding of the transposed op used by the gradient.
        g_pad_x0 = kernel_w - pad_x0 - 1
        g_pad_y0 = kernel_h - pad_y0 - 1
        g_pad_x1 = in_w * up_x - out_w * down_x + pad_x0 - up_x + 1
        g_pad_y1 = in_h * up_y - out_h * down_y + pad_y0 - up_y + 1

        ctx.g_pad = (g_pad_x0, g_pad_x1, g_pad_y0, g_pad_y1)

        out = upfirdn2d_op.upfirdn2d(
            input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1
        )
        # out = out.view(major, out_h, out_w, minor)
        out = out.view(-1, channel, out_h, out_w)

        return out

    @staticmethod
    def backward(ctx, grad_output):
        kernel, grad_kernel = ctx.saved_tensors

        grad_input = UpFirDn2dBackward.apply(
            grad_output,
            kernel,
            grad_kernel,
            ctx.up,
            ctx.down,
            ctx.pad,
            ctx.g_pad,
            ctx.in_size,
            ctx.out_size,
        )

        return grad_input, None, None, None, None
def upfirdn2d(input, kernel, up=1, down=1, pad=(0, 0)):
    """Upsample, FIR-filter, and downsample a batched NCHW tensor.

    Dispatches to the CUDA op on GPU tensors and to the pure-PyTorch
    reference implementation on CPU.
    """
    if input.device.type == "cpu":
        return upfirdn2d_native(
            input, kernel, up, up, down, down, pad[0], pad[1], pad[0], pad[1]
        )
    return UpFirDn2d.apply(
        input, kernel, (up, up), (down, down), (pad[0], pad[1], pad[0], pad[1])
    )
def upfirdn2d_native(
    input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1
):
    """Reference upfirdn2d: zero-insert upsample, pad, FIR-convolve, subsample."""
    _, channel, in_h, in_w = input.shape
    flat = input.reshape(-1, in_h, in_w, 1)

    _, in_h, in_w, minor = flat.shape
    kernel_h, kernel_w = kernel.shape

    # Zero-stuff between samples to upsample by (up_y, up_x).
    up = flat.view(-1, in_h, 1, in_w, 1, minor)
    up = F.pad(up, [0, 0, 0, up_x - 1, 0, 0, 0, up_y - 1])
    up = up.view(-1, in_h * up_y, in_w * up_x, minor)

    # Positive padding grows the canvas; negative padding crops it.
    up = F.pad(
        up, [0, 0, max(pad_x0, 0), max(pad_x1, 0), max(pad_y0, 0), max(pad_y1, 0)]
    )
    up = up[
        :,
        max(-pad_y0, 0): up.shape[1] - max(-pad_y1, 0),
        max(-pad_x0, 0): up.shape[2] - max(-pad_x1, 0),
        :,
    ]

    # FIR filtering as a single-channel conv with the flipped kernel.
    conv_in = up.permute(0, 3, 1, 2).reshape(
        [-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1]
    )
    weight = torch.flip(kernel, [0, 1]).view(1, 1, kernel_h, kernel_w)
    filtered = F.conv2d(conv_in, weight)
    filtered = filtered.reshape(
        -1,
        minor,
        in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1,
        in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1,
    )

    # Keep every down-th sample in each direction.
    filtered = filtered.permute(0, 2, 3, 1)
    filtered = filtered[:, ::down_y, ::down_x, :]

    out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h) // down_y + 1
    out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w) // down_x + 1

    return filtered.view(-1, channel, out_h, out_w)
================================================
FILE: models/archs/stylegan2/op/upfirdn2d_kernel.cu
================================================
// Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
//
// This work is made available under the Nvidia Source Code License-NC.
// To view a copy of this license, visit
// https://nvlabs.github.io/stylegan2/license.html
#include
#include
#include
#include
#include
#include
#include
// Integer division rounding toward negative infinity (usable on host and device).
static __host__ __device__ __forceinline__ int floor_div(int a, int b) {
  int q = a / b;  // C++ division truncates toward zero
  if (q * b > a) {
    --q;  // truncation overshot the true quotient; step down once
  }
  return q;
}
// Geometry of one upfirdn2d launch.  The input tensor is viewed as
// (major_dim, in_h, in_w, minor_dim); up_*/down_* are the resampling
// factors and pad_* the (possibly negative) edge padding.
struct UpFirDn2DKernelParams {
  int up_x;
  int up_y;
  int down_x;
  int down_y;
  int pad_x0;
  int pad_x1;
  int pad_y0;
  int pad_y1;

  int major_dim;   // flattened leading dims (batch * channel when called from upfirdn2d.py)
  int in_h;
  int in_w;
  int minor_dim;   // flattened trailing dims
  int kernel_h;
  int kernel_w;
  int out_h;
  int out_w;

  int loop_major;  // majors processed per block
  int loop_x;      // output tiles processed per block along x
};
// Fallback kernel for large/irregular kernels: each thread accumulates its
// output pixels directly from global memory, with no shared-memory tiling.
// Fix: the extraction dropped the angle-bracketed template parameter list
// and the <scalar_t> arguments of the static_casts.
template <typename scalar_t>
__global__ void upfirdn2d_kernel_large(scalar_t *out, const scalar_t *input,
                                       const scalar_t *kernel,
                                       const UpFirDn2DKernelParams p) {
  int minor_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int out_y = minor_idx / p.minor_dim;
  minor_idx -= out_y * p.minor_dim;
  int out_x_base = blockIdx.y * p.loop_x * blockDim.y + threadIdx.y;
  int major_idx_base = blockIdx.z * p.loop_major;

  if (out_x_base >= p.out_w || out_y >= p.out_h ||
      major_idx_base >= p.major_dim) {
    return;
  }

  // Vertical input span and kernel phase for this output row.
  int mid_y = out_y * p.down_y + p.up_y - 1 - p.pad_y0;
  int in_y = min(max(floor_div(mid_y, p.up_y), 0), p.in_h);
  int h = min(max(floor_div(mid_y + p.kernel_h, p.up_y), 0), p.in_h) - in_y;
  int kernel_y = mid_y + p.kernel_h - (in_y + 1) * p.up_y;

  for (int loop_major = 0, major_idx = major_idx_base;
       loop_major < p.loop_major && major_idx < p.major_dim;
       loop_major++, major_idx++) {
    for (int loop_x = 0, out_x = out_x_base;
         loop_x < p.loop_x && out_x < p.out_w; loop_x++, out_x += blockDim.y) {
      // Horizontal input span and kernel phase for this output column.
      int mid_x = out_x * p.down_x + p.up_x - 1 - p.pad_x0;
      int in_x = min(max(floor_div(mid_x, p.up_x), 0), p.in_w);
      int w = min(max(floor_div(mid_x + p.kernel_w, p.up_x), 0), p.in_w) - in_x;
      int kernel_x = mid_x + p.kernel_w - (in_x + 1) * p.up_x;

      const scalar_t *x_p =
          &input[((major_idx * p.in_h + in_y) * p.in_w + in_x) * p.minor_dim +
                 minor_idx];
      const scalar_t *k_p = &kernel[kernel_y * p.kernel_w + kernel_x];
      int x_px = p.minor_dim;
      int k_px = -p.up_x;
      int x_py = p.in_w * p.minor_dim;
      int k_py = -p.up_y * p.kernel_w;

      scalar_t v = 0.0f;

      // Dot product of the overlapping input window with the kernel taps.
      for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
          v += static_cast<scalar_t>(*x_p) * static_cast<scalar_t>(*k_p);
          x_p += x_px;
          k_p += k_px;
        }

        x_p += x_py - w * x_px;
        k_p += k_py - w * k_px;
      }

      out[((major_idx * p.out_h + out_y) * p.out_w + out_x) * p.minor_dim +
          minor_idx] = v;
    }
  }
}
// Tiled upfirdn2d kernel: the resampling factors and tile/kernel sizes are
// template constants so shared-memory tiles can be statically sized and the
// inner loops fully unrolled.
// Fix: the extraction dropped the angle-bracketed template parameter list
// (required — the body sizes __shared__ arrays with these constants).
template <typename scalar_t, int up_x, int up_y, int down_x, int down_y,
          int kernel_h, int kernel_w, int tile_out_h, int tile_out_w>
__global__ void upfirdn2d_kernel(scalar_t *out, const scalar_t *input,
                                 const scalar_t *kernel,
                                 const UpFirDn2DKernelParams p) {
  // Input tile extent required to produce one output tile.
  const int tile_in_h = ((tile_out_h - 1) * down_y + kernel_h - 1) / up_y + 1;
  const int tile_in_w = ((tile_out_w - 1) * down_x + kernel_w - 1) / up_x + 1;

  __shared__ volatile float sk[kernel_h][kernel_w];
  __shared__ volatile float sx[tile_in_h][tile_in_w];

  int minor_idx = blockIdx.x;
  int tile_out_y = minor_idx / p.minor_dim;
  minor_idx -= tile_out_y * p.minor_dim;
  tile_out_y *= tile_out_h;
  int tile_out_x_base = blockIdx.y * p.loop_x * tile_out_w;
  int major_idx_base = blockIdx.z * p.loop_major;

  if (tile_out_x_base >= p.out_w | tile_out_y >= p.out_h |
      major_idx_base >= p.major_dim) {
    return;
  }

  // Stage the (flipped) kernel taps into shared memory once per block.
  for (int tap_idx = threadIdx.x; tap_idx < kernel_h * kernel_w;
       tap_idx += blockDim.x) {
    int ky = tap_idx / kernel_w;
    int kx = tap_idx - ky * kernel_w;
    scalar_t v = 0.0;

    if (kx < p.kernel_w & ky < p.kernel_h) {
      v = kernel[(p.kernel_h - 1 - ky) * p.kernel_w + (p.kernel_w - 1 - kx)];
    }

    sk[ky][kx] = v;
  }

  for (int loop_major = 0, major_idx = major_idx_base;
       loop_major < p.loop_major & major_idx < p.major_dim;
       loop_major++, major_idx++) {
    for (int loop_x = 0, tile_out_x = tile_out_x_base;
         loop_x < p.loop_x & tile_out_x < p.out_w;
         loop_x++, tile_out_x += tile_out_w) {
      int tile_mid_x = tile_out_x * down_x + up_x - 1 - p.pad_x0;
      int tile_mid_y = tile_out_y * down_y + up_y - 1 - p.pad_y0;
      int tile_in_x = floor_div(tile_mid_x, up_x);
      int tile_in_y = floor_div(tile_mid_y, up_y);

      __syncthreads();

      // Stage the input tile (zero-padded at the borders).
      for (int in_idx = threadIdx.x; in_idx < tile_in_h * tile_in_w;
           in_idx += blockDim.x) {
        int rel_in_y = in_idx / tile_in_w;
        int rel_in_x = in_idx - rel_in_y * tile_in_w;
        int in_x = rel_in_x + tile_in_x;
        int in_y = rel_in_y + tile_in_y;

        scalar_t v = 0.0;

        if (in_x >= 0 & in_y >= 0 & in_x < p.in_w & in_y < p.in_h) {
          v = input[((major_idx * p.in_h + in_y) * p.in_w + in_x) *
                        p.minor_dim +
                    minor_idx];
        }

        sx[rel_in_y][rel_in_x] = v;
      }

      __syncthreads();

      // Each thread computes output pixels from the staged tile.
      for (int out_idx = threadIdx.x; out_idx < tile_out_h * tile_out_w;
           out_idx += blockDim.x) {
        int rel_out_y = out_idx / tile_out_w;
        int rel_out_x = out_idx - rel_out_y * tile_out_w;
        int out_x = rel_out_x + tile_out_x;
        int out_y = rel_out_y + tile_out_y;

        int mid_x = tile_mid_x + rel_out_x * down_x;
        int mid_y = tile_mid_y + rel_out_y * down_y;
        int in_x = floor_div(mid_x, up_x);
        int in_y = floor_div(mid_y, up_y);
        int rel_in_x = in_x - tile_in_x;
        int rel_in_y = in_y - tile_in_y;
        int kernel_x = (in_x + 1) * up_x - mid_x - 1;
        int kernel_y = (in_y + 1) * up_y - mid_y - 1;

        scalar_t v = 0.0;

#pragma unroll
        for (int y = 0; y < kernel_h / up_y; y++)
#pragma unroll
          for (int x = 0; x < kernel_w / up_x; x++)
            v += sx[rel_in_y + y][rel_in_x + x] *
                 sk[kernel_y + y * up_y][kernel_x + x * up_x];

        if (out_x < p.out_w & out_y < p.out_h) {
          out[((major_idx * p.out_h + out_y) * p.out_w + out_x) * p.minor_dim +
              minor_idx] = v;
        }
      }
    }
  }
}
torch::Tensor upfirdn2d_op(const torch::Tensor &input,
const torch::Tensor &kernel, int up_x, int up_y,
int down_x, int down_y, int pad_x0, int pad_x1,
int pad_y0, int pad_y1) {
int curDevice = -1;
cudaGetDevice(&curDevice);
cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice);
UpFirDn2DKernelParams p;
auto x = input.contiguous();
auto k = kernel.contiguous();
p.major_dim = x.size(0);
p.in_h = x.size(1);
p.in_w = x.size(2);
p.minor_dim = x.size(3);
p.kernel_h = k.size(0);
p.kernel_w = k.size(1);
p.up_x = up_x;
p.up_y = up_y;
p.down_x = down_x;
p.down_y = down_y;
p.pad_x0 = pad_x0;
p.pad_x1 = pad_x1;
p.pad_y0 = pad_y0;
p.pad_y1 = pad_y1;
p.out_h = (p.in_h * p.up_y + p.pad_y0 + p.pad_y1 - p.kernel_h + p.down_y) /
p.down_y;
p.out_w = (p.in_w * p.up_x + p.pad_x0 + p.pad_x1 - p.kernel_w + p.down_x) /
p.down_x;
auto out =
at::empty({p.major_dim, p.out_h, p.out_w, p.minor_dim}, x.options());
int mode = -1;
int tile_out_h = -1;
int tile_out_w = -1;
if (p.up_x == 1 && p.up_y == 1 && p.down_x == 1 && p.down_y == 1 &&
p.kernel_h <= 4 && p.kernel_w <= 4) {
mode = 1;
tile_out_h = 16;
tile_out_w = 64;
}
if (p.up_x == 1 && p.up_y == 1 && p.down_x == 1 && p.down_y == 1 &&
p.kernel_h <= 3 && p.kernel_w <= 3) {
mode = 2;
tile_out_h = 16;
tile_out_w = 64;
}
if (p.up_x == 2 && p.up_y == 2 && p.down_x == 1 && p.down_y == 1 &&
p.kernel_h <= 4 && p.kernel_w <= 4) {
mode = 3;
tile_out_h = 16;
tile_out_w = 64;
}
if (p.up_x == 2 && p.up_y == 2 && p.down_x == 1 && p.down_y == 1 &&
p.kernel_h <= 2 && p.kernel_w <= 2) {
mode = 4;
tile_out_h = 16;
tile_out_w = 64;
}
if (p.up_x == 1 && p.up_y == 1 && p.down_x == 2 && p.down_y == 2 &&
p.kernel_h <= 4 && p.kernel_w <= 4) {
mode = 5;
tile_out_h = 8;
tile_out_w = 32;
}
if (p.up_x == 1 && p.up_y == 1 && p.down_x == 2 && p.down_y == 2 &&
p.kernel_h <= 2 && p.kernel_w <= 2) {
mode = 6;
tile_out_h = 8;
tile_out_w = 32;
}
dim3 block_size;
dim3 grid_size;
if (tile_out_h > 0 && tile_out_w > 0) {
p.loop_major = (p.major_dim - 1) / 16384 + 1;
p.loop_x = 1;
block_size = dim3(32 * 8, 1, 1);
grid_size = dim3(((p.out_h - 1) / tile_out_h + 1) * p.minor_dim,
(p.out_w - 1) / (p.loop_x * tile_out_w) + 1,
(p.major_dim - 1) / p.loop_major + 1);
} else {
p.loop_major = (p.major_dim - 1) / 16384 + 1;
p.loop_x = 4;
block_size = dim3(4, 32, 1);
grid_size = dim3((p.out_h * p.minor_dim - 1) / block_size.x + 1,
(p.out_w - 1) / (p.loop_x * block_size.y) + 1,
(p.major_dim - 1) / p.loop_major + 1);
}
AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "upfirdn2d_cuda", [&] {
switch (mode) {
case 1:
upfirdn2d_kernel
<<>>(out.data_ptr