Repository: max-andr/square-attack Branch: master Commit: ea95eebb5aca Files: 41 Total size: 168.2 KB Directory structure: gitextract_dxr_6zq8/ ├── .gitignore ├── LICENSE ├── README.md ├── attack.py ├── data.py ├── logit_pairing/ │ └── models.py ├── madry_cifar10/ │ ├── LICENSE │ ├── README.md │ ├── cifar10_input.py │ ├── config.json │ ├── eval.py │ ├── fetch_model.py │ ├── model.py │ ├── model_robustml.py │ ├── pgd_attack.py │ ├── run_attack.py │ └── train.py ├── madry_mnist/ │ ├── LICENSE │ ├── config.json │ ├── eval.py │ ├── fetch_model.py │ ├── model.py │ ├── run_attack.py │ └── train.py ├── metrics/ │ ├── 2019-11-10 15:57:14 model=pt_inception dataset=imagenet n_ex=1000 eps=12.75 p=0.05 n_iter=10000.metrics.npy │ ├── 2019-11-10 15:57:14 model=pt_resnet dataset=imagenet n_ex=1000 eps=12.75 p=0.05 n_iter=10000.metrics.npy │ ├── 2019-11-10 15:57:14 model=pt_vgg dataset=imagenet n_ex=1000 eps=12.75 p=0.05 n_iter=10000.metrics.npy │ ├── square_l2_inceptionv3_queries.npy │ ├── square_l2_resnet50_queries.npy │ └── square_l2_vgg16_queries.npy ├── models.py ├── post_avg/ │ ├── LICENSE.txt │ ├── PADefense.py │ ├── README.md │ ├── attacks.py │ ├── postAveragedModels.py │ ├── resnetSmall.py │ ├── robustml_test_cifar10.py │ ├── robustml_test_imagenet.py │ └── visualHelper.py └── utils.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ .DS_Store # data files MNIST_DATA fast_mnist/ # model files models secret.zip # compiled python files *.pyc .idea/ ================================================ FILE: LICENSE ================================================ Copyright (c) 2019, Maksym Andriushchenko, Francesco Croce, Nicolas Flammarion, Matthias Hein All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ================================================ FILE: README.md ================================================ # Square Attack: a query-efficient black-box adversarial attack via random search **ECCV 2020** **Maksym Andriushchenko\*, Francesco Croce\*, Nicolas Flammarion, Matthias Hein** **EPFL, University of Tübingen** **Paper:** [https://arxiv.org/abs/1912.00049](https://arxiv.org/abs/1912.00049) \* denotes equal contribution ## News + [Jul 2020] The paper is accepted at **ECCV 2020**! 
Please stop by our virtual poster for the latest insights in black-box adversarial attacks (also check out our recent preprint [Sparse-RS paper](https://arxiv.org/abs/2006.12834) where we use random search for sparse attacks). + [Mar 2020] Our attack is now part of [AutoAttack](https://github.com/fra31/auto-attack), an ensemble of attacks used for automatic (i.e., no hyperparameter tuning needed) robustness evaluation. Table 2 in the [AutoAttack paper](https://arxiv.org/abs/2003.01690) shows that at least on 6 models our **black-box** attack outperforms gradient-based methods. Always useful to have a black-box attack to prevent inaccurate robustness claims! + [Mar 2020] We also achieve the best results on [TRADES MNIST benchmark](https://github.com/yaodongyu/TRADES)! + [Jan 2020] The Square Attack achieves the best results on [MadryLab's MNIST challenge](https://github.com/MadryLab/mnist_challenge), outperforming all white-box attacks! In this case we used 50 random restarts of our attack, each with a query limit of 20000. + [Nov 2019] The Square Attack breaks the recently proposed defense from "Bandlimiting Neural Networks Against Adversarial Attacks" ([https://github.com/robust-ml/robust-ml.github.io/issues/15](https://github.com/robust-ml/robust-ml.github.io/issues/15)). ## Abstract We propose the *Square Attack*, a score-based black-box L2- and Linf-adversarial attack that does not rely on local gradient information and thus is not affected by gradient masking. Square Attack is based on a randomized search scheme which selects localized square-shaped updates at random positions so that at each iteration the perturbation is situated approximately at the boundary of the feasible set. Our method is significantly more query efficient and achieves a higher success rate compared to the state-of-the-art methods, especially in the untargeted setting. 
In particular, on ImageNet we improve the average query efficiency in the untargeted setting for various deep networks by a factor of at least 1.8 and up to 3 compared to the recent state-of-the-art Linf-attack of Al-Dujaili & O’Reilly. Moreover, although our attack is *black-box*, it can also outperform gradient-based *white-box* attacks on the standard benchmarks achieving a new state-of-the-art in terms of the success rate. ----- The code of the Square Attack can be found in `square_attack_linf(...)` and `square_attack_l2(...)` in `attack.py`.\ Below we show adversarial examples generated by our method for Linf and L2 perturbations:

## About the paper The general algorithm of the attack is extremely simple and relies on the random search algorithm: we try some update and accept it only if it helps to improve the loss:

The only thing we customize is the sampling distribution P (see the paper for details). The main idea behind the choice of the sampling distributions is that: - We start at the boundary of the feasible set with a good initialization that helps to improve the query efficiency (particularly for the Linf-attack). - Every iteration we stay at the boundary of the feasible set by changing squared-shaped regions of the image. In the paper we also provide convergence analysis of a variant of our attack in the non-convex setting, and justify the main algorithmic choices such as modifying squares and using the same sign of the update. This simple algorithm is sufficient to significantly outperform much more complex approaches in terms of the success rate and query efficiency:

Here are the complete success rate curves with respect to different number of queries. We note that the Square Attack also outperforms the competing approaches in the low-query regime.

The Square Attack also performs very well on adversarially trained models on MNIST achieving results competitive or better than *white-box* attacks despite the fact our attack is *black-box*:

Interestingly, the L2 perturbations for the Linf adversarially trained model are challenging for many attacks, including white-box PGD, and also other black-box attacks. However, the Square Attack is able to much more accurately assess the robustness in this setting:

## Running the code `attack.py` is the main module that implements the Square Attack, see the command line arguments there. The main functions which implement the attack are `square_attack_linf()` and `square_attack_l2()`. In order to run the untargeted Linf Square Attack on ImageNet models from the PyTorch repository you need to specify a correct path to the validation set (see `IMAGENET_PATH` in `data.py`) and then run: - ``` python attack.py --attack=square_linf --model=pt_vgg --n_ex=1000 --eps=12.75 --p=0.05 --n_iter=10000 ``` - ``` python attack.py --attack=square_linf --model=pt_resnet --n_ex=1000 --eps=12.75 --p=0.05 --n_iter=10000 ``` - ``` python attack.py --attack=square_linf --model=pt_inception --n_ex=1000 --eps=12.75 --p=0.05 --n_iter=10000 ``` Note that eps=12.75 is then divided by 255, so in the end it is equal to 0.05. For performing targeted attacks, one should additionally use the flag `--targeted`, use a lower `p`, and specify more iterations `--n_iter=100000` since it usually takes more iterations to achieve a misclassification to some particular, randomly chosen class. 
The rest of the models have to be downloaded first (see the instructions below), and then can be evaluated in the following way: Post-averaging models: - ``` python attack.py --attack=square_linf --model=pt_post_avg_cifar10 --n_ex=1000 --eps=8.0 --p=0.3 --n_iter=20000 ``` - ``` python attack.py --attack=square_linf --model=pt_post_avg_imagenet --n_ex=1000 --eps=8.0 --p=0.3 --n_iter=20000 ``` Clean logit pairing and logit squeezing models: - ``` python attack.py --attack=square_linf --model=clp_mnist --n_ex=1000 --eps=0.3 --p=0.3 --n_iter=20000 ``` - ``` python attack.py --attack=square_linf --model=lsq_mnist --n_ex=1000 --eps=0.3 --p=0.3 --n_iter=20000 ``` - ``` python attack.py --attack=square_linf --model=clp_cifar10 --n_ex=1000 --eps=16.0 --p=0.3 --n_iter=20000 ``` - ``` python attack.py --attack=square_linf --model=lsq_cifar10 --n_ex=1000 --eps=16.0 --p=0.3 --n_iter=20000 ``` Adversarially trained model (with only 1 restart; note that the results in the paper are based on 50 restarts): - ``` python attack.py --attack=square_linf --model=madry_mnist_robust --n_ex=10000 --eps=0.3 --p=0.8 --n_iter=20000 ``` The L2 Square Attack can be run similarly, but please check the recommended hyperparameters in the paper (Section B of the supplement) and make sure that you specify the right value of `eps` taking into account whether the pixels are in [0, 1] or in [0, 255] for a particular dataset and model. For example, for the standard ImageNet models, the correct L2 eps to specify is 1275 since after division by 255 it will become 5.0. ## Saved statistics In the folder `metrics`, we provide saved statistics of the attack on 3 models: Inception-v3, ResNet-50, VGG-16-BN.\ Here are simple examples of how to load the metrics file. 
### Linf attack To print the statistics from the last iteration: ``` metrics = np.load('metrics/2019-11-10 15:57:14 model=pt_resnet dataset=imagenet n_ex=1000 eps=12.75 p=0.05 n_iter=10000.metrics.npy') iteration = np.argmax(metrics[:, -1]) # max time is the last available iteration acc, acc_corr, mean_nq, mean_nq_ae, median_nq_ae, avg_loss, time_total = metrics[iteration] print('[iter {}] acc={:.2%} acc_corr={:.2%} avg#q={:.2f} avg#q_ae={:.2f} med#q_ae={:.2f} (p={}, n_ex={}, eps={}, {:.2f}min)'. format(iteration+1, acc, acc_corr, mean_nq, mean_nq_ae, median_nq_ae, p, n_ex, eps, time_total/60)) ``` Then one can also create different plots based on the data contained in `metrics`. For example, one can use `1 - acc_corr` to plot the success rate of the Square Attack at different numbers of queries. ### L2 attack In this case we provide the number of queries necessary to achieve misclassification (`n_queries[i] = 0` means that the image `i` was initially misclassified, `n_queries[i] = 10001` indicates that the attack could not find an adversarial example for the image `i`). 
To load the metrics and compute the success rate of the Square Attack after `k` queries, you can run: ``` n_queries = np.load('metrics/square_l2_resnet50_queries.npy')['n_queries'] success_rate = float(((n_queries > 0) * (n_queries <= k)).sum()) / (n_queries > 0).sum() ``` ## Models Note that in order to evaluate other models, one has to first download them and move them to the folders specified in `model_path_dict` from `models.py`: - [Clean Logit Pairing on MNIST](https://oc.cs.uni-saarland.de/owncloud/index.php/s/w2yegcfx8mc8kNa) - [Logit Squeezing on MNIST](https://oc.cs.uni-saarland.de/owncloud/index.php/s/a5ZY72BDCPEtb2S) - [Clean Logit Pairing on CIFAR-10](https://oc.cs.uni-saarland.de/owncloud/index.php/s/odcd7FgFdbqq6zL) - [Logit Squeezing on CIFAR-10](https://oc.cs.uni-saarland.de/owncloud/index.php/s/EYnbHDeMbe4mq5M) - MNIST, Madry adversarial training: run `python madry_mnist/fetch_model.py secret` - MNIST, TRADES: download the [models](https://drive.google.com/file/d/1scTd9-YO3-5Ul3q5SJuRrTNX__LYLD_M) and see their [repository](https://github.com/yaodongyu/TRADES) - [Post-averaging defense](https://github.com/YupingLin171/PostAvgDefense/blob/master/trainedModel/resnet110.th): the model can be downloaded directly from the repository For the first 4 models, one has to additionally update the paths in the `checkpoint` file in the following way: ``` model_checkpoint_path: "model.ckpt" all_model_checkpoint_paths: "model.ckpt" ``` ## Requirements - PyTorch 1.0.0 - Tensorflow 1.12.0 ## Contact Do you have a problem or question regarding the code? Please don't hesitate to open an issue or contact [Maksym Andriushchenko](https://github.com/max-andr) or [Francesco Croce](https://github.com/fra31) directly. 
## Citation ``` @article{ACFH2020square, title={Square Attack: a query-efficient black-box adversarial attack via random search}, author={Andriushchenko, Maksym and Croce, Francesco and Flammarion, Nicolas and Hein, Matthias}, conference={ECCV}, year={2020} } ``` ================================================ FILE: attack.py ================================================ import argparse import time import numpy as np import data import models import os import utils from datetime import datetime np.set_printoptions(precision=5, suppress=True) def p_selection(p_init, it, n_iters): """ Piece-wise constant schedule for p (the fraction of pixels changed on every iteration). """ it = int(it / n_iters * 10000) if 10 < it <= 50: p = p_init / 2 elif 50 < it <= 200: p = p_init / 4 elif 200 < it <= 500: p = p_init / 8 elif 500 < it <= 1000: p = p_init / 16 elif 1000 < it <= 2000: p = p_init / 32 elif 2000 < it <= 4000: p = p_init / 64 elif 4000 < it <= 6000: p = p_init / 128 elif 6000 < it <= 8000: p = p_init / 256 elif 8000 < it <= 10000: p = p_init / 512 else: p = p_init return p def pseudo_gaussian_pert_rectangles(x, y): delta = np.zeros([x, y]) x_c, y_c = x // 2 + 1, y // 2 + 1 counter2 = [x_c - 1, y_c - 1] for counter in range(0, max(x_c, y_c)): delta[max(counter2[0], 0):min(counter2[0] + (2 * counter + 1), x), max(0, counter2[1]):min(counter2[1] + (2 * counter + 1), y)] += 1.0 / (counter + 1) ** 2 counter2[0] -= 1 counter2[1] -= 1 delta /= np.sqrt(np.sum(delta ** 2, keepdims=True)) return delta def meta_pseudo_gaussian_pert(s): delta = np.zeros([s, s]) n_subsquares = 2 if n_subsquares == 2: delta[:s // 2] = pseudo_gaussian_pert_rectangles(s // 2, s) delta[s // 2:] = pseudo_gaussian_pert_rectangles(s - s // 2, s) * (-1) delta /= np.sqrt(np.sum(delta ** 2, keepdims=True)) if np.random.rand(1) > 0.5: delta = np.transpose(delta) elif n_subsquares == 4: delta[:s // 2, :s // 2] = pseudo_gaussian_pert_rectangles(s // 2, s // 2) * np.random.choice([-1, 1]) delta[s // 2:, :s 
// 2] = pseudo_gaussian_pert_rectangles(s - s // 2, s // 2) * np.random.choice([-1, 1]) delta[:s // 2, s // 2:] = pseudo_gaussian_pert_rectangles(s // 2, s - s // 2) * np.random.choice([-1, 1]) delta[s // 2:, s // 2:] = pseudo_gaussian_pert_rectangles(s - s // 2, s - s // 2) * np.random.choice([-1, 1]) delta /= np.sqrt(np.sum(delta ** 2, keepdims=True)) return delta def square_attack_l2(model, x, y, corr_classified, eps, n_iters, p_init, metrics_path, targeted, loss_type): """ The L2 square attack """ np.random.seed(0) min_val, max_val = 0, 1 c, h, w = x.shape[1:] n_features = c * h * w n_ex_total = x.shape[0] x, y = x[corr_classified], y[corr_classified] ### initialization delta_init = np.zeros(x.shape) s = h // 5 log.print('Initial square side={} for bumps'.format(s)) sp_init = (h - s * 5) // 2 center_h = sp_init + 0 for counter in range(h // s): center_w = sp_init + 0 for counter2 in range(w // s): delta_init[:, :, center_h:center_h + s, center_w:center_w + s] += meta_pseudo_gaussian_pert(s).reshape( [1, 1, s, s]) * np.random.choice([-1, 1], size=[x.shape[0], c, 1, 1]) center_w += s center_h += s x_best = np.clip(x + delta_init / np.sqrt(np.sum(delta_init ** 2, axis=(1, 2, 3), keepdims=True)) * eps, 0, 1) logits = model.predict(x_best) loss_min = model.loss(y, logits, targeted, loss_type=loss_type) margin_min = model.loss(y, logits, targeted, loss_type='margin_loss') n_queries = np.ones(x.shape[0]) # ones because we have already used 1 query time_start = time.time() s_init = int(np.sqrt(p_init * n_features / c)) metrics = np.zeros([n_iters, 7]) for i_iter in range(n_iters): idx_to_fool = (margin_min > 0.0) x_curr, x_best_curr = x[idx_to_fool], x_best[idx_to_fool] y_curr, margin_min_curr = y[idx_to_fool], margin_min[idx_to_fool] loss_min_curr = loss_min[idx_to_fool] delta_curr = x_best_curr - x_curr p = p_selection(p_init, i_iter, n_iters) s = max(int(round(np.sqrt(p * n_features / c))), 3) if s % 2 == 0: s += 1 s2 = s + 0 ### window_1 center_h = 
np.random.randint(0, h - s) center_w = np.random.randint(0, w - s) new_deltas_mask = np.zeros(x_curr.shape) new_deltas_mask[:, :, center_h:center_h + s, center_w:center_w + s] = 1.0 ### window_2 center_h_2 = np.random.randint(0, h - s2) center_w_2 = np.random.randint(0, w - s2) new_deltas_mask_2 = np.zeros(x_curr.shape) new_deltas_mask_2[:, :, center_h_2:center_h_2 + s2, center_w_2:center_w_2 + s2] = 1.0 norms_window_2 = np.sqrt( np.sum(delta_curr[:, :, center_h_2:center_h_2 + s2, center_w_2:center_w_2 + s2] ** 2, axis=(-2, -1), keepdims=True)) ### compute total norm available curr_norms_window = np.sqrt( np.sum(((x_best_curr - x_curr) * new_deltas_mask) ** 2, axis=(2, 3), keepdims=True)) curr_norms_image = np.sqrt(np.sum((x_best_curr - x_curr) ** 2, axis=(1, 2, 3), keepdims=True)) mask_2 = np.maximum(new_deltas_mask, new_deltas_mask_2) norms_windows = np.sqrt(np.sum((delta_curr * mask_2) ** 2, axis=(2, 3), keepdims=True)) ### create the updates new_deltas = np.ones([x_curr.shape[0], c, s, s]) new_deltas = new_deltas * meta_pseudo_gaussian_pert(s).reshape([1, 1, s, s]) new_deltas *= np.random.choice([-1, 1], size=[x_curr.shape[0], c, 1, 1]) old_deltas = delta_curr[:, :, center_h:center_h + s, center_w:center_w + s] / (1e-10 + curr_norms_window) new_deltas += old_deltas new_deltas = new_deltas / np.sqrt(np.sum(new_deltas ** 2, axis=(2, 3), keepdims=True)) * ( np.maximum(eps ** 2 - curr_norms_image ** 2, 0) / c + norms_windows ** 2) ** 0.5 delta_curr[:, :, center_h_2:center_h_2 + s2, center_w_2:center_w_2 + s2] = 0.0 # set window_2 to 0 delta_curr[:, :, center_h:center_h + s, center_w:center_w + s] = new_deltas + 0 # update window_1 hps_str = 's={}->{}'.format(s_init, s) x_new = x_curr + delta_curr / np.sqrt(np.sum(delta_curr ** 2, axis=(1, 2, 3), keepdims=True)) * eps x_new = np.clip(x_new, min_val, max_val) curr_norms_image = np.sqrt(np.sum((x_new - x_curr) ** 2, axis=(1, 2, 3), keepdims=True)) logits = model.predict(x_new) loss = model.loss(y_curr, logits, 
targeted, loss_type=loss_type) margin = model.loss(y_curr, logits, targeted, loss_type='margin_loss') idx_improved = loss < loss_min_curr loss_min[idx_to_fool] = idx_improved * loss + ~idx_improved * loss_min_curr margin_min[idx_to_fool] = idx_improved * margin + ~idx_improved * margin_min_curr idx_improved = np.reshape(idx_improved, [-1, *[1] * len(x.shape[:-1])]) x_best[idx_to_fool] = idx_improved * x_new + ~idx_improved * x_best_curr n_queries[idx_to_fool] += 1 acc = (margin_min > 0.0).sum() / n_ex_total acc_corr = (margin_min > 0.0).mean() mean_nq, mean_nq_ae, median_nq, median_nq_ae = np.mean(n_queries), np.mean( n_queries[margin_min <= 0]), np.median(n_queries), np.median(n_queries[margin_min <= 0]) time_total = time.time() - time_start log.print( '{}: acc={:.2%} acc_corr={:.2%} avg#q_ae={:.1f} med#q_ae={:.1f} {}, n_ex={}, {:.0f}s, loss={:.3f}, max_pert={:.1f}, impr={:.0f}'. format(i_iter + 1, acc, acc_corr, mean_nq_ae, median_nq_ae, hps_str, x.shape[0], time_total, np.mean(margin_min), np.amax(curr_norms_image), np.sum(idx_improved))) metrics[i_iter] = [acc, acc_corr, mean_nq, mean_nq_ae, median_nq, margin_min.mean(), time_total] if (i_iter <= 500 and i_iter % 500) or (i_iter > 100 and i_iter % 500) or i_iter + 1 == n_iters or acc == 0: np.save(metrics_path, metrics) if acc == 0: curr_norms_image = np.sqrt(np.sum((x_best - x) ** 2, axis=(1, 2, 3), keepdims=True)) print('Maximal norm of the perturbations: {:.5f}'.format(np.amax(curr_norms_image))) break curr_norms_image = np.sqrt(np.sum((x_best - x) ** 2, axis=(1, 2, 3), keepdims=True)) print('Maximal norm of the perturbations: {:.5f}'.format(np.amax(curr_norms_image))) return n_queries, x_best def square_attack_linf(model, x, y, corr_classified, eps, n_iters, p_init, metrics_path, targeted, loss_type): """ The Linf square attack """ np.random.seed(0) # important to leave it here as well min_val, max_val = 0, 1 if x.max() <= 1 else 255 c, h, w = x.shape[1:] n_features = c*h*w n_ex_total = x.shape[0] x, y = 
x[corr_classified], y[corr_classified] # [c, 1, w], i.e. vertical stripes work best for untargeted attacks init_delta = np.random.choice([-eps, eps], size=[x.shape[0], c, 1, w]) x_best = np.clip(x + init_delta, min_val, max_val) logits = model.predict(x_best) loss_min = model.loss(y, logits, targeted, loss_type=loss_type) margin_min = model.loss(y, logits, targeted, loss_type='margin_loss') n_queries = np.ones(x.shape[0]) # ones because we have already used 1 query time_start = time.time() metrics = np.zeros([n_iters, 7]) for i_iter in range(n_iters - 1): idx_to_fool = margin_min > 0 x_curr, x_best_curr, y_curr = x[idx_to_fool], x_best[idx_to_fool], y[idx_to_fool] loss_min_curr, margin_min_curr = loss_min[idx_to_fool], margin_min[idx_to_fool] deltas = x_best_curr - x_curr p = p_selection(p_init, i_iter, n_iters) for i_img in range(x_best_curr.shape[0]): s = int(round(np.sqrt(p * n_features / c))) s = min(max(s, 1), h-1) # at least c x 1 x 1 window is taken and at most c x h-1 x h-1 center_h = np.random.randint(0, h - s) center_w = np.random.randint(0, w - s) x_curr_window = x_curr[i_img, :, center_h:center_h+s, center_w:center_w+s] x_best_curr_window = x_best_curr[i_img, :, center_h:center_h+s, center_w:center_w+s] # prevent trying out a delta if it doesn't change x_curr (e.g. 
an overlapping patch) while np.sum(np.abs(np.clip(x_curr_window + deltas[i_img, :, center_h:center_h+s, center_w:center_w+s], min_val, max_val) - x_best_curr_window) < 10**-7) == c*s*s: deltas[i_img, :, center_h:center_h+s, center_w:center_w+s] = np.random.choice([-eps, eps], size=[c, 1, 1]) x_new = np.clip(x_curr + deltas, min_val, max_val) logits = model.predict(x_new) loss = model.loss(y_curr, logits, targeted, loss_type=loss_type) margin = model.loss(y_curr, logits, targeted, loss_type='margin_loss') idx_improved = loss < loss_min_curr loss_min[idx_to_fool] = idx_improved * loss + ~idx_improved * loss_min_curr margin_min[idx_to_fool] = idx_improved * margin + ~idx_improved * margin_min_curr idx_improved = np.reshape(idx_improved, [-1, *[1]*len(x.shape[:-1])]) x_best[idx_to_fool] = idx_improved * x_new + ~idx_improved * x_best_curr n_queries[idx_to_fool] += 1 acc = (margin_min > 0.0).sum() / n_ex_total acc_corr = (margin_min > 0.0).mean() mean_nq, mean_nq_ae, median_nq_ae = np.mean(n_queries), np.mean(n_queries[margin_min <= 0]), np.median(n_queries[margin_min <= 0]) avg_margin_min = np.mean(margin_min) time_total = time.time() - time_start log.print('{}: acc={:.2%} acc_corr={:.2%} avg#q_ae={:.2f} med#q={:.1f}, avg_margin={:.2f} (n_ex={}, eps={:.3f}, {:.2f}s)'. 
format(i_iter+1, acc, acc_corr, mean_nq_ae, median_nq_ae, avg_margin_min, x.shape[0], eps, time_total)) metrics[i_iter] = [acc, acc_corr, mean_nq, mean_nq_ae, median_nq_ae, margin_min.mean(), time_total] if (i_iter <= 500 and i_iter % 20 == 0) or (i_iter > 100 and i_iter % 50 == 0) or i_iter + 1 == n_iters or acc == 0: np.save(metrics_path, metrics) if acc == 0: break return n_queries, x_best if __name__ == '__main__': parser = argparse.ArgumentParser(description='Define hyperparameters.') parser.add_argument('--model', type=str, default='pt_resnet', choices=models.all_model_names, help='Model name.') parser.add_argument('--attack', type=str, default='square_linf', choices=['square_linf', 'square_l2'], help='Attack.') parser.add_argument('--exp_folder', type=str, default='exps', help='Experiment folder to store all output.') parser.add_argument('--gpu', type=str, default='7', help='GPU number. Multiple GPUs are possible for PT models.') parser.add_argument('--n_ex', type=int, default=10000, help='Number of test ex to test on.') parser.add_argument('--p', type=float, default=0.05, help='Probability of changing a coordinate. Note: check the paper for the best values. ' 'Linf standard: 0.05, L2 standard: 0.1. 
But robust models require higher p.') parser.add_argument('--eps', type=float, default=0.05, help='Radius of the Lp ball.') parser.add_argument('--n_iter', type=int, default=10000) parser.add_argument('--targeted', action='store_true', help='Targeted or untargeted attack.') args = parser.parse_args() args.loss = 'margin_loss' if not args.targeted else 'cross_entropy' os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu dataset = 'mnist' if 'mnist' in args.model else 'cifar10' if 'cifar10' in args.model else 'imagenet' timestamp = str(datetime.now())[:-7] hps_str = '{} model={} dataset={} attack={} n_ex={} eps={} p={} n_iter={}'.format( timestamp, args.model, dataset, args.attack, args.n_ex, args.eps, args.p, args.n_iter) args.eps = args.eps / 255.0 if dataset == 'imagenet' else args.eps # for mnist and cifar10 we leave as it is batch_size = data.bs_dict[dataset] model_type = 'pt' if 'pt_' in args.model else 'tf' n_cls = 1000 if dataset == 'imagenet' else 10 gpu_memory = 0.5 if dataset == 'mnist' and args.n_ex > 1000 else 0.15 if dataset == 'mnist' else 0.99 log_path = '{}/{}.log'.format(args.exp_folder, hps_str) metrics_path = '{}/{}.metrics'.format(args.exp_folder, hps_str) log = utils.Logger(log_path) log.print('All hps: {}'.format(hps_str)) if args.model != 'pt_inception': x_test, y_test = data.datasets_dict[dataset](args.n_ex) else: # exception for inception net on imagenet -- 299x299 images instead of 224x224 x_test, y_test = data.datasets_dict[dataset](args.n_ex, size=299) x_test, y_test = x_test[:args.n_ex], y_test[:args.n_ex] if args.model == 'pt_post_avg_cifar10': x_test /= 255.0 args.eps = args.eps / 255.0 models_class_dict = {'tf': models.ModelTF, 'pt': models.ModelPT} model = models_class_dict[model_type](args.model, batch_size, gpu_memory) logits_clean = model.predict(x_test) corr_classified = logits_clean.argmax(1) == y_test # important to check that the model was restored correctly and the clean accuracy is high log.print('Clean accuracy: 
{:.2%}'.format(np.mean(corr_classified))) square_attack = square_attack_linf if args.attack == 'square_linf' else square_attack_l2 y_target = utils.random_classes_except_current(y_test, n_cls) if args.targeted else y_test y_target_onehot = utils.dense_to_onehot(y_target, n_cls=n_cls) # Note: we count the queries only across correctly classified images n_queries, x_adv = square_attack(model, x_test, y_target_onehot, corr_classified, args.eps, args.n_iter, args.p, metrics_path, args.targeted, args.loss) ================================================ FILE: data.py ================================================ import torch import numpy as np from torchvision import transforms from torchvision.datasets import ImageFolder from torch.utils.data import DataLoader def load_mnist(n_ex): from tensorflow.keras.datasets import mnist as mnist_keras x_test, y_test = mnist_keras.load_data()[1] x_test = x_test.astype(np.float64) / 255.0 x_test = x_test[:, None, :, :] return x_test[:n_ex], y_test[:n_ex] def load_cifar10(n_ex): from madry_cifar10.cifar10_input import CIFAR10Data cifar = CIFAR10Data('madry_cifar10/cifar10_data') x_test, y_test = cifar.eval_data.xs.astype(np.float32), cifar.eval_data.ys x_test = np.transpose(x_test, axes=[0, 3, 1, 2]) return x_test[:n_ex], y_test[:n_ex] def load_imagenet(n_ex, size=224): IMAGENET_SL = size IMAGENET_PATH = "/scratch/maksym/imagenet/val_orig" imagenet = ImageFolder(IMAGENET_PATH, transforms.Compose([ transforms.Resize(IMAGENET_SL), transforms.CenterCrop(IMAGENET_SL), transforms.ToTensor() ])) torch.manual_seed(0) imagenet_loader = DataLoader(imagenet, batch_size=n_ex, shuffle=True, num_workers=1) x_test, y_test = next(iter(imagenet_loader)) return np.array(x_test, dtype=np.float32), np.array(y_test) datasets_dict = {'mnist': load_mnist, 'cifar10': load_cifar10, 'imagenet': load_imagenet, } bs_dict = {'mnist': 10000, 'cifar10': 4096, # 4096 is the maximum that fits 'imagenet': 100, } ================================================ 
FILE: logit_pairing/models.py ================================================
import tensorflow as tf
from collections import OrderedDict

# -------------------------------------------------------------
# Models
# -------------------------------------------------------------

class LeNet:
    # LeNet-style MNIST CNN (TF1 graph mode): conv -> max-pool -> conv ->
    # max-pool -> fc(1024) -> fc(nb_classes).  The whole graph, including
    # the input placeholders, is built eagerly in __init__; the output
    # tensor is exposed as `self.logits`.
    def __init__(self):
        super().__init__()
        self.nb_classes = 10
        # NOTE(review): input_shape declares 3 channels, but W_conv1 is
        # created with filter_in=1 and x_input is reshaped to one channel
        # below, so the conv1 parameter count (which uses input_shape[-1])
        # looks inflated by 3x — confirm whether [28, 28, 1] was intended.
        self.input_shape = [28, 28, 3]
        self.weights_init = 'He'
        self.filters = 32  # 32 is the default here for all our pre-trained models
        self.is_training = False
        self.bn = False
        self.bn_scale = False
        self.bn_bias = False
        self.parameters = 0  # running count of trainable parameters

        # Create variables
        with tf.variable_scope('conv1_vars'):
            self.W_conv1 = create_conv2d_weights(kernel_size=3, filter_in=1, filter_out=self.filters,
                                                 init=self.weights_init)
            self.parameters += 3 * 3 * self.input_shape[-1] * self.filters
            self.b_conv1 = create_biases(size=self.filters)
            self.parameters += self.filters
        with tf.variable_scope('conv2_vars'):
            self.W_conv2 = create_conv2d_weights(kernel_size=3, filter_in=self.filters,
                                                 filter_out=self.filters * 2, init=self.weights_init)
            self.parameters += 3 * 3 * self.filters * (self.filters * 2)
            self.b_conv2 = create_biases(size=self.filters * 2)
            self.parameters += self.filters * 2
        with tf.variable_scope('fc1_vars'):
            # 7x7 spatial size after two 2x2 max-pools on a 28x28 input.
            self.W_fc1 = create_weights(units_in=7 * 7 * self.filters * 2, units_out=1024,
                                        init=self.weights_init)
            self.parameters += (7 * 7 * self.filters * 2) * 1024
            self.b_fc1 = create_biases(size=1024)
            self.parameters += 1024
        with tf.variable_scope('fc2_vars'):
            self.W_fc2 = create_weights(units_in=1024, units_out=self.nb_classes, init=self.weights_init)
            self.parameters += 1024 * self.nb_classes
            self.b_fc2 = create_biases(size=self.nb_classes)
            self.parameters += self.nb_classes

        # Inputs: flattened 784-vector images, integer class labels.
        self.x_input = tf.placeholder(tf.float32, shape=[None, 784])
        self.y_input = tf.placeholder(tf.int64, shape=[None])
        x = tf.reshape(self.x_input, [-1, 28, 28, 1])

        # Forward pass.
        with tf.name_scope('conv-block-1'):
            conv1 = conv_layer(x, self.is_training, self.W_conv1, stride=1, padding='SAME',
                               bn=self.bn, bn_scale=self.bn_scale, bn_bias=self.bn_bias,
                               name='conv1', bias=self.b_conv1)
        with tf.name_scope('max-pool-1'):
            conv1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID')
        with tf.name_scope('conv-block-2'):
            conv2 = conv_layer(conv1, self.is_training, self.W_conv2, stride=1, padding='SAME',
                               bn=self.bn, bn_scale=self.bn_scale, bn_bias=self.bn_bias,
                               name='conv2', bias=self.b_conv2)
        with tf.name_scope('max-pool-2'):
            conv2 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID')
        with tf.name_scope('fc-block'):
            conv2 = tf.layers.flatten(conv2)
            fc1 = fc_layer(conv2, self.is_training, self.W_fc1, bn=self.bn, bn_scale=self.bn_scale,
                           bn_bias=self.bn_bias, name='fc1', non_linearity='relu', bias=self.b_fc1)
            logits = fc_layer(fc1, self.is_training, self.W_fc2, bn=self.bn, bn_scale=self.bn_scale,
                              bn_bias=self.bn_bias, name='fc2', non_linearity='linear', bias=self.b_fc2)

        self.summaries = False
        self.logits = logits


class ResNet20_v2:
    # Pre-activation ResNet (v2 ordering: BN -> ReLU -> conv) for CIFAR-10.
    # __init__ first creates all conv/fc/skip-projection weights, then (in
    # the continuation of this file) wires the forward pass.  Inputs are
    # expected in [0, 255] and are scaled to [0, 1] inside the graph.
    def __init__(self):
        super().__init__()
        self.nb_classes = 10
        self.input_shape = [32, 32, 3]
        self.weights_init = 'He'
        self.filters = 64  # 64 is the default here for all our pre-trained models
        self.is_training = False
        self.bn = True
        self.bn_scale = True
        self.bn_bias = True
        self.parameters = 0  # running count of trainable parameters

        # Create variables.  Stage widths: conv1-7 -> filters,
        # conv8-13 -> filters*2, conv14-19 -> filters*4.
        with tf.variable_scope('conv1_vars'):
            self.W_conv1 = create_conv2d_weights(kernel_size=3, filter_in=self.input_shape[-1],
                                                 filter_out=self.filters, init=self.weights_init)
            self.parameters += 3 * 3 * self.input_shape[-1] * self.filters
        with tf.variable_scope('conv2_vars'):
            self.W_conv2 = create_conv2d_weights(kernel_size=3, filter_in=self.filters,
                                                 filter_out=self.filters, init=self.weights_init)
            self.parameters += 3 * 3 * self.filters * self.filters
        with tf.variable_scope('conv3_vars'):
            self.W_conv3 = create_conv2d_weights(kernel_size=3, filter_in=self.filters,
                                                 filter_out=self.filters, init=self.weights_init)
            self.parameters += 3 * 3 * self.filters * self.filters
        with tf.variable_scope('conv4_vars'):
            self.W_conv4 = create_conv2d_weights(kernel_size=3, filter_in=self.filters,
                                                 filter_out=self.filters, init=self.weights_init)
            self.parameters += 3 * 3 * self.filters * self.filters
        with tf.variable_scope('conv5_vars'):
            self.W_conv5 = create_conv2d_weights(kernel_size=3, filter_in=self.filters,
                                                 filter_out=self.filters, init=self.weights_init)
            self.parameters += 3 * 3 * self.filters * self.filters
        with tf.variable_scope('conv6_vars'):
            self.W_conv6 = create_conv2d_weights(kernel_size=3, filter_in=self.filters,
                                                 filter_out=self.filters, init=self.weights_init)
            self.parameters += 3 * 3 * self.filters * self.filters
        with tf.variable_scope('conv7_vars'):
            self.W_conv7 = create_conv2d_weights(kernel_size=3, filter_in=self.filters,
                                                 filter_out=self.filters, init=self.weights_init)
            self.parameters += 3 * 3 * self.filters * self.filters
        # conv8 doubles the width (stride-2 downsampling happens in the forward pass).
        with tf.variable_scope('conv8_vars'):
            self.W_conv8 = create_conv2d_weights(kernel_size=3, filter_in=self.filters,
                                                 filter_out=self.filters * 2, init=self.weights_init)
            self.parameters += 3 * 3 * self.filters * (self.filters * 2)
        with tf.variable_scope('conv9_vars'):
            self.W_conv9 = create_conv2d_weights(kernel_size=3, filter_in=self.filters * 2,
                                                 filter_out=self.filters * 2, init=self.weights_init)
            self.parameters += 3 * 3 * (self.filters * 2) * (self.filters * 2)
        with tf.variable_scope('conv10_vars'):
            self.W_conv10 = create_conv2d_weights(kernel_size=3, filter_in=self.filters * 2,
                                                  filter_out=self.filters * 2, init=self.weights_init)
            self.parameters += 3 * 3 * (self.filters * 2) * (self.filters * 2)
        with tf.variable_scope('conv11_vars'):
            self.W_conv11 = create_conv2d_weights(kernel_size=3, filter_in=self.filters * 2,
                                                  filter_out=self.filters * 2, init=self.weights_init)
            self.parameters += 3 * 3 * (self.filters * 2) * (self.filters * 2)
        with tf.variable_scope('conv12_vars'):
            self.W_conv12 = create_conv2d_weights(kernel_size=3, filter_in=self.filters * 2,
                                                  filter_out=self.filters * 2, init=self.weights_init)
            self.parameters += 3 * 3 * (self.filters * 2) * (self.filters * 2)
        with tf.variable_scope('conv13_vars'):
            self.W_conv13 = create_conv2d_weights(kernel_size=3, filter_in=self.filters * 2,
                                                  filter_out=self.filters * 2, init=self.weights_init)
            self.parameters += 3 * 3 * (self.filters * 2) * (self.filters * 2)
        # conv14 doubles the width again.
        with tf.variable_scope('conv14_vars'):
            self.W_conv14 = create_conv2d_weights(kernel_size=3, filter_in=self.filters * 2,
                                                  filter_out=self.filters * 4, init=self.weights_init)
            self.parameters += 3 * 3 * (self.filters * 2) * (self.filters * 4)
        with tf.variable_scope('conv15_vars'):
            self.W_conv15 = create_conv2d_weights(kernel_size=3, filter_in=self.filters * 4,
                                                  filter_out=self.filters * 4, init=self.weights_init)
            self.parameters += 3 * 3 * (self.filters * 4) * (self.filters * 4)
        with tf.variable_scope('conv16_vars'):
            self.W_conv16 = create_conv2d_weights(kernel_size=3, filter_in=self.filters * 4,
                                                  filter_out=self.filters * 4, init=self.weights_init)
            self.parameters += 3 * 3 * (self.filters * 4) * (self.filters * 4)
        with tf.variable_scope('conv17_vars'):
            self.W_conv17 = create_conv2d_weights(kernel_size=3, filter_in=self.filters * 4,
                                                  filter_out=self.filters * 4, init=self.weights_init)
            self.parameters += 3 * 3 * (self.filters * 4) * (self.filters * 4)
        with tf.variable_scope('conv18_vars'):
            self.W_conv18 = create_conv2d_weights(kernel_size=3, filter_in=self.filters * 4,
                                                  filter_out=self.filters * 4, init=self.weights_init)
            self.parameters += 3 * 3 * (self.filters * 4) * (self.filters * 4)
        with tf.variable_scope('conv19_vars'):
            self.W_conv19 = create_conv2d_weights(kernel_size=3, filter_in=self.filters * 4,
                                                  filter_out=self.filters * 4, init=self.weights_init)
            self.parameters += 3 * 3 * (self.filters * 4) * (self.filters * 4)
        # Final classifier applied after global average pooling.
        with tf.variable_scope('fc1_vars'):
            self.W_fc1 = create_weights(units_in=self.filters * 4, units_out=self.nb_classes,
                                        init=self.weights_init)
            self.parameters += (self.filters * 4) * self.nb_classes
            self.b_fc1 = create_biases(size=self.nb_classes)
# (continuation of ResNet20_v2.__init__ from the previous chunk)
        self.parameters += self.nb_classes
        # 1x1 projection weights for the three skip connections that cross
        # a change in width and/or spatial resolution.
        with tf.variable_scope('scip1_vars'):
            self.W_scip1 = create_conv2d_weights(kernel_size=1, filter_in=self.filters,
                                                 filter_out=self.filters, init=self.weights_init)
            self.parameters += 1 * 1 * self.filters * self.filters
        with tf.variable_scope('scip2_vars'):
            self.W_scip2 = create_conv2d_weights(kernel_size=1, filter_in=self.filters,
                                                 filter_out=self.filters * 2, init=self.weights_init)
            self.parameters += 1 * 1 * self.filters * (self.filters * 2)
        with tf.variable_scope('scip3_vars'):
            self.W_scip3 = create_conv2d_weights(kernel_size=1, filter_in=self.filters * 2,
                                                 filter_out=self.filters * 4, init=self.weights_init)
            self.parameters += 1 * 1 * (self.filters * 2) * (self.filters * 4)

        # Inputs: raw uint8-range images; scaled to [0, 1] in-graph.
        self.x_input = tf.placeholder(tf.float32, shape=[None, 32, 32, 3])
        self.y_input = tf.placeholder(tf.int64, shape=None)
        x = self.x_input / 255.0

        # Specify forward pass
        with tf.name_scope('input-block'):
            # Stem conv: no BN, linear — pre-activation blocks normalize their own input.
            conv1 = conv_layer(x, self.is_training, self.W_conv1, stride=1, padding='SAME', bn=False,
                               bn_scale=self.bn_scale, bn_bias=self.bn_bias, name='conv1',
                               non_linearity='linear')
        with tf.name_scope('conv-block-1'):
            conv2 = pre_act_conv_layer(conv1, self.is_training, self.W_conv2, stride=1, padding='SAME',
                                       bn=self.bn, bn_scale=self.bn_scale, bn_bias=self.bn_bias, name='conv2')
            conv3 = pre_act_conv_layer(conv2, self.is_training, self.W_conv3, stride=1, padding='SAME',
                                       bn=self.bn, bn_scale=self.bn_scale, bn_bias=self.bn_bias, name='conv3')
            # skip connection (1x1 projection of the stem output)
            conv3 += tf.nn.conv2d(conv1, self.W_scip1, strides=[1, 1, 1, 1], padding='SAME',
                                  name='conv-skip1')
            conv4 = pre_act_conv_layer(conv3, self.is_training, self.W_conv4, stride=1, padding='SAME',
                                       bn=self.bn, bn_scale=self.bn_scale, bn_bias=self.bn_bias, name='conv4')
            conv5 = pre_act_conv_layer(conv4, self.is_training, self.W_conv5, stride=1, padding='SAME',
                                       bn=self.bn, bn_scale=self.bn_scale, bn_bias=self.bn_bias, name='conv5')
            # skip connection
            conv5 += conv3
            conv6 = pre_act_conv_layer(conv5, self.is_training, self.W_conv6, stride=1, padding='SAME',
                                       bn=self.bn, bn_scale=self.bn_scale, bn_bias=self.bn_bias, name='conv6')
            conv7 = pre_act_conv_layer(conv6, self.is_training, self.W_conv7, stride=1, padding='SAME',
                                       bn=self.bn, bn_scale=self.bn_scale, bn_bias=self.bn_bias, name='conv7')
            # skip connection
            conv7 += conv5
        with tf.name_scope('conv-block-2'):
            # Stage transition: stride-2 conv downsamples, width doubles.
            conv8 = pre_act_conv_layer(conv7, self.is_training, self.W_conv8, stride=2, padding='SAME',
                                       bn=self.bn, bn_scale=self.bn_scale, bn_bias=self.bn_bias, name='conv8')
            conv9 = pre_act_conv_layer(conv8, self.is_training, self.W_conv9, stride=1, padding='SAME',
                                       bn=self.bn, bn_scale=self.bn_scale, bn_bias=self.bn_bias, name='conv9')
            # skip connection (strided 1x1 projection matches shape)
            conv9 += tf.nn.conv2d(conv7, self.W_scip2, strides=[1, 2, 2, 1], padding='SAME',
                                  name='conv-skip2')
            conv10 = pre_act_conv_layer(conv9, self.is_training, self.W_conv10, stride=1, padding='SAME',
                                        bn=self.bn, bn_scale=self.bn_scale, bn_bias=self.bn_bias, name='conv10')
            conv11 = pre_act_conv_layer(conv10, self.is_training, self.W_conv11, stride=1, padding='SAME',
                                        bn=self.bn, bn_scale=self.bn_scale, bn_bias=self.bn_bias, name='conv11')
            # skip connection
            conv11 += conv9
            conv12 = pre_act_conv_layer(conv11, self.is_training, self.W_conv12, stride=1, padding='SAME',
                                        bn=self.bn, bn_scale=self.bn_scale, bn_bias=self.bn_bias, name='conv12')
            conv13 = pre_act_conv_layer(conv12, self.is_training, self.W_conv13, stride=1, padding='SAME',
                                        bn=self.bn, bn_scale=self.bn_scale, bn_bias=self.bn_bias, name='conv13')
            # skip connection
            conv13 += conv11
        with tf.name_scope('conv-block-3'):
            conv14 = pre_act_conv_layer(conv13, self.is_training, self.W_conv14, stride=2, padding='SAME',
                                        bn=self.bn, bn_scale=self.bn_scale, bn_bias=self.bn_bias, name='conv14')
            conv15 = pre_act_conv_layer(conv14, self.is_training, self.W_conv15, stride=1, padding='SAME',
                                        bn=self.bn, bn_scale=self.bn_scale, bn_bias=self.bn_bias, name='conv15')
            # skip connection
            conv15 += tf.nn.conv2d(conv13, self.W_scip3, strides=[1, 2, 2, 1], padding='SAME',
                                   name='conv-skip3')
            conv16 = pre_act_conv_layer(conv15, self.is_training, self.W_conv16, stride=1, padding='SAME',
                                        bn=self.bn, bn_scale=self.bn_scale, bn_bias=self.bn_bias, name='conv16')
            conv17 = pre_act_conv_layer(conv16, self.is_training, self.W_conv17, stride=1, padding='SAME',
                                        bn=self.bn, bn_scale=self.bn_scale, bn_bias=self.bn_bias, name='conv17')
            # skip connection
            conv17 += conv15
            conv18 = pre_act_conv_layer(conv17, self.is_training, self.W_conv18, stride=1, padding='SAME',
                                        bn=self.bn, bn_scale=self.bn_scale, bn_bias=self.bn_bias, name='conv18')
            conv19 = pre_act_conv_layer(conv18, self.is_training, self.W_conv19, stride=1, padding='SAME',
                                        bn=self.bn, bn_scale=self.bn_scale, bn_bias=self.bn_bias, name='conv19')
            # skip connection
            conv19 += conv17
            # Final activation before pooling (pre-act blocks leave output un-activated).
            conv19 = nonlinearity(conv19)
        with tf.name_scope('output-block'):
            with tf.name_scope('global-average-pooling'):
                fc1 = tf.reduce_mean(conv19, axis=[1, 2])
            logits = fc_layer(fc1, self.is_training, self.W_fc1, bn=False, bn_scale=self.bn_scale,
                              bn_bias=self.bn_bias, name='fc1', non_linearity='linear', bias=self.b_fc1)

        self.summaries = False
        self.logits = logits


# -------------------------------------------------------------
# Helpers
# -------------------------------------------------------------

def create_weights(units_in, units_out, init='Xavier', seed=None):
    # Dense-layer weight variable [units_in, units_out].
    # NOTE(review): the 'Xavier'/'He' branches pass seed=None explicitly,
    # so the `seed` argument only takes effect in the truncated-normal
    # fallback branch — confirm whether that is intentional.
    if init == 'Xavier':
        initializer = tf.variance_scaling_initializer(scale=1.0, mode='fan_in', distribution='normal',
                                                      seed=None, dtype=tf.float32)
    elif init == 'He':
        initializer = tf.variance_scaling_initializer(scale=2.0, mode='fan_in', distribution='normal',
                                                      seed=None, dtype=tf.float32)
    else:
        initializer = tf.truncated_normal_initializer(mean=0.0, stddev=0.01, seed=seed, dtype=tf.float32)
    weights = tf.get_variable(name='weights', shape=[units_in, units_out], dtype=tf.float32,
                              initializer=initializer)
    return weights


def create_conv2d_weights(kernel_size, filter_in, filter_out, init='Xavier', seed=None):
    # Conv2d kernel variable [k, k, filter_in, filter_out]; same init
    # scheme (and same seed caveat) as create_weights above.
    if init == 'Xavier':
        initializer = tf.variance_scaling_initializer(scale=1.0, mode='fan_in', distribution='normal',
                                                      seed=None, dtype=tf.float32)
    elif init == 'He':
        initializer = tf.variance_scaling_initializer(scale=2.0, mode='fan_in', distribution='normal',
                                                      seed=None, dtype=tf.float32)
    else:
        initializer = tf.truncated_normal_initializer(mean=0.0, stddev=0.01, seed=seed, dtype=tf.float32)
    weights = tf.get_variable(name='weights', shape=[kernel_size, kernel_size, filter_in, filter_out],
                              dtype=tf.float32, initializer=initializer)
    return weights


def create_biases(size):
    # Zero-initialized bias variable of length `size`.
    return tf.get_variable(name='biases', shape=[size], dtype=tf.float32,
                           initializer=tf.zeros_initializer())


def batch_norm(x, is_training, scale, bias, name, reuse):
    # Thin wrapper over tf.contrib.layers.batch_norm with the project's
    # fixed hyperparameters (decay=0.999, eps=0.001, non-fused).
    return tf.contrib.layers.batch_norm(
        x, decay=0.999, center=bias, scale=scale, epsilon=0.001, param_initializers=None,
        updates_collections=tf.GraphKeys.UPDATE_OPS, is_training=is_training, reuse=reuse,
        variables_collections=['batch-norm'], outputs_collections=None, trainable=True,
        batch_weights=None, fused=False, zero_debias_moving_mean=False, scope=name,
        renorm=False, renorm_clipping=None, renorm_decay=0.99
    )


def nonlinearity(x, non_linearity='relu'):
    # Dispatch on activation name.
    # NOTE(review): an unrecognized name falls through and returns None.
    if non_linearity == 'linear':
        return tf.identity(x)
    if non_linearity == 'sigmoid':
        return tf.nn.sigmoid(x)
    if non_linearity == 'tanh':
        return tf.nn.tanh(x)
    if non_linearity == 'relu':
        return tf.nn.relu(x)
    if non_linearity == 'elu':
        return tf.nn.elu(x)
    if non_linearity == 'selu':
        return tf.nn.selu(x)


def conv_layer(inputs, is_training, weights, stride, padding, bn, bn_scale, bn_bias, name,
               non_linearity='relu', bias=None):
    # Post-activation conv: conv (+bias) -> optional BN -> activation.
    if bias is not None:
        inputs = tf.nn.conv2d(inputs, weights, strides=[1, stride, stride, 1], padding=padding) + bias
    else:
        inputs = tf.nn.conv2d(inputs, weights, strides=[1, stride, stride, 1], padding=padding)
    if bn:
        inputs = batch_norm(inputs, is_training=is_training, scale=bn_scale, bias=bn_bias,
                            name='batch-norm-{:s}'.format(name), reuse=tf.AUTO_REUSE)
    activations = nonlinearity(inputs, non_linearity=non_linearity)
    return activations


def pre_act_conv_layer(inputs, is_training, weights, stride, padding, bn, bn_scale, bn_bias, name,
                       non_linearity='relu'):
    # Pre-activation conv (ResNet v2 ordering): optional BN -> activation -> conv.
    if bn:
        inputs = batch_norm(inputs, is_training=is_training, scale=bn_scale, bias=bn_bias,
                            name='batch-norm-{:s}'.format(name), reuse=tf.AUTO_REUSE)
    activations = nonlinearity(inputs, non_linearity=non_linearity)
    outputs = tf.nn.conv2d(activations, weights, strides=[1, stride, stride, 1], padding=padding)
    return outputs


def fc_layer(inputs, is_training, weights, bn, bn_scale, bn_bias, name, non_linearity='relu', bias=None):
    # Fully-connected layer: matmul (+bias) -> optional BN -> activation.
    if bias is not None:
        inputs = tf.matmul(inputs, weights) + bias
    else:
        inputs = tf.matmul(inputs, weights)
    if bn:
        inputs = batch_norm(inputs, is_training=is_training, scale=bn_scale, bias=bn_bias,
                            name='batch-norm-{:s}'.format(name), reuse=tf.AUTO_REUSE)
    activations = nonlinearity(inputs, non_linearity)
    return activations

================================================ FILE: madry_cifar10/LICENSE ================================================ MIT License Copyright (c) 2017 Aleksander Madry, Aleksandar Makelov, Ludwig Schmidt, Dimitris Tsipras, and Adrian Vladu Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: madry_cifar10/README.md ================================================ # CIFAR10 Adversarial Examples Challenge Recently, there has been much progress on adversarial *attacks* against neural networks, such as the [cleverhans](https://github.com/tensorflow/cleverhans) library and the code by [Carlini and Wagner](https://github.com/carlini/nn_robust_attacks). We now complement these advances by proposing an *attack challenge* for the [CIFAR10 dataset](https://www.cs.toronto.edu/~kriz/cifar.html) which follows the format of [our earlier MNIST challenge](https://github.com/MadryLab/mnist_challenge). We have trained a robust network, and the objective is to find a set of adversarial examples on which this network achieves only a low accuracy. To train an adversarially-robust network, we followed the approach from our recent paper: **Towards Deep Learning Models Resistant to Adversarial Attacks**
*Aleksander Madry, Aleksandar Makelov, Ludwig Schmidt, Dimitris Tsipras, Adrian Vladu*
https://arxiv.org/abs/1706.06083. As part of the challenge, we release both the training code and the network architecture, but keep the network weights secret. We invite any researcher to submit attacks against our model (see the detailed instructions below). We will maintain a leaderboard of the best attacks for the next two months and then publish our secret network weights. Analogously to our MNIST challenge, the goal of this challenge is to clarify the state-of-the-art for adversarial robustness on CIFAR10. Moreover, we hope that future work on defense mechanisms will adopt a similar challenge format in order to improve reproducibility and empirical comparisons. **Update 2017-12-10**: We released our secret model. You can download it by running `python fetch_model.py secret`. As of Dec 10 we are no longer accepting black-box challenge submissions. We have set up a leaderboard for white-box attacks on the (now released) secret model. The submission format is the same as before. We plan to continue evaluating submissions and maintaining the leaderboard for the foreseeable future. ## Black-Box Leaderboard (Original Challenge) | Attack | Submitted by | Accuracy | Submission Date | | -------------------------------------- | ------------- | -------- | ---- | | PGD on the cross-entropy loss for the
adversarially trained public network | (initial entry) | **63.39%** | Jul 12, 2017 | | PGD on the [CW](https://github.com/carlini/nn_robust_attacks) loss for the
adversarially trained public network | (initial entry) | 64.38% | Jul 12, 2017 | | FGSM on the [CW](https://github.com/carlini/nn_robust_attacks) loss for the
adversarially trained public network | (initial entry) | 67.25% | Jul 12, 2017 | | FGSM on the [CW](https://github.com/carlini/nn_robust_attacks) loss for the
naturally trained public network | (initial entry) | 85.23% | Jul 12, 2017 | ## White-Box Leaderboard | Attack | Submitted by | Accuracy | Submission Date | | -------------------------------------- | ------------- | -------- | ---- | | [FAB: Fast Adaptive Boundary Attack](https://github.com/fra31/fab-attack) | Francesco Croce | **44.51%** | Jun 7, 2019 | | [Distributionally Adversarial Attack](https://github.com/tianzheng4/Distributionally-Adversarial-Attack) | Tianhang Zheng | 44.71% | Aug 21, 2018 | | 20-step PGD on the cross-entropy loss
with 10 random restarts | Tianhang Zheng | 45.21% | Aug 24, 2018 | | 20-step PGD on the cross-entropy loss | (initial entry) | 47.04% | Dec 10, 2017 | | 20-step PGD on the [CW](https://github.com/carlini/nn_robust_attacks) loss | (initial entry) | 47.76% | Dec 10, 2017 | | FGSM on the [CW](https://github.com/carlini/nn_robust_attacks) loss | (initial entry) | 54.92% | Dec 10, 2017 | | FGSM on the cross-entropy loss | (initial entry) | 55.55% | Dec 10, 2017 | ## Format and Rules The objective of the challenge is to find black-box (transfer) attacks that are effective against our CIFAR10 model. Attacks are allowed to perturb each pixel of the input image by at most `epsilon=8.0` on a `0-255` pixel scale. To ensure that the attacks are indeed black-box, we release our training code and model architecture, but keep the actual network weights secret. We invite any interested researchers to submit attacks against our model. The most successful attacks will be listed in the leaderboard above. As a reference point, we have seeded the leaderboard with the results of some standard attacks. ### The CIFAR10 Model We used the code published in this repository to produce an adversarially robust model for CIFAR10 classification. The model is a residual convolutional neural network consisting of five residual units and a fully connected layer. This architecture is derived from the "w32-10 wide" variant of the [Tensorflow model repository](https://github.com/tensorflow/models/blob/master/resnet/resnet_model.py). The network was trained against an iterative adversary that is allowed to perturb each pixel by at most `epsilon=8.0`. The random seed used for training and the trained network weights will be kept secret. The `sha256()` digest of our model file is: ``` 555be6e892372599380c9da5d5f9802f9cbd098be8a47d24d96937a002305fd4 ``` We will release the corresponding model file on September 15 2017, which is roughly two months after the start of this competition. 
**Edit: We are extending the deadline for submitting attacks to October 15th due to requests.** ### The Attack Model We are interested in adversarial inputs that are derived from the CIFAR10 test set. Each pixel can be perturbed by at most `epsilon=8.0` from its initial value on the `0-255` pixel scale. All pixels can be perturbed independently, so this is an l_infinity attack. ### Submitting an Attack Each attack should consist of a perturbed version of the CIFAR10 test set. Each perturbed image in this test set should follow the above attack model. The adversarial test set should be formatted as a numpy array with one row per example and each row containing a 32x32x3 array of pixels. Hence the overall dimensions are 10,000x32x32x3. Each pixel must be in the [0, 255] range. See the script `pgd_attack.py` for an attack that generates an adversarial test set in this format. In order to submit your attack, save the matrix containing your adversarial examples with `numpy.save` and email the resulting file to cifar10.challenge@gmail.com. We will then run the `run_attack.py` script on your file to verify that the attack is valid and to evaluate the accuracy of our secret model on your examples. After that, we will reply with the predictions of our model on each of your examples and the overall accuracy of our model on your evaluation set. If the attack is valid and outperforms all current attacks in the leaderboard, it will appear at the top of the leaderboard. Novel types of attacks might be included in the leaderboard even if they do not perform best. We strongly encourage you to disclose your attack method. We would be happy to add a link to your code in our leaderboard. ## Overview of the Code The code consists of seven Python scripts and the file `config.json` that contains various parameter settings. ### Running the code - `python train.py`: trains the network, storing checkpoints along the way. 
- `python eval.py`: an infinite evaluation loop, processing each new checkpoint as it is created while logging summaries. It is intended to be run in parallel with the `train.py` script. - `python pgd_attack.py`: applies the attack to the CIFAR10 eval set and stores the resulting adversarial eval set in a `.npy` file. This file is in a valid attack format for our challenge. - `python run_attack.py`: evaluates the model on the examples in the `.npy` file specified in config, while ensuring that the adversarial examples are indeed a valid attack. The script also saves the network predictions in `pred.npy`. - `python fetch_model.py name`: downloads the pre-trained model with the specified name (at the moment `adv_trained` or `natural`), prints the sha256 hash, and places it in the models directory. - `cifar10_input.py` provides utility functions and classes for loading the CIFAR10 dataset. ### Parameters in `config.json` Model configuration: - `model_dir`: contains the path to the directory of the currently trained/evaluated model. Training configuration: - `tf_random_seed`: the seed for the RNG used to initialize the network weights. - `numpy_random_seed`: the seed for the RNG used to pass over the dataset in random order - `max_num_training_steps`: the number of training steps. - `num_output_steps`: the number of training steps between printing progress in standard output. - `num_summary_steps`: the number of training steps between storing tensorboard summaries. - `num_checkpoint_steps`: the number of training steps between storing model checkpoints. - `training_batch_size`: the size of the training batch. Evaluation configuration: - `num_eval_examples`: the number of CIFAR10 examples to evaluate the model on. - `eval_batch_size`: the size of the evaluation batches. - `eval_on_cpu`: forces the `eval.py` script to run on the CPU so it does not compete with `train.py` for GPU resources. 
Adversarial examples configuration: - `epsilon`: the maximum allowed perturbation per pixel. - `k`: the number of PGD iterations used by the adversary. - `a`: the size of the PGD adversary steps. - `random_start`: specifies whether the adversary will start iterating from the natural example or a random perturbation of it. - `loss_func`: the loss function used to run pgd on. `xent` corresponds to the standard cross-entropy loss, `cw` corresponds to the loss function of [Carlini and Wagner](https://arxiv.org/abs/1608.04644). - `store_adv_path`: the file in which adversarial examples are stored. Relevant for the `pgd_attack.py` and `run_attack.py` scripts. ## Example usage After cloning the repository you can either train a new network or evaluate/attack one of our pre-trained networks. #### Training a new network * Start training by running: ``` python train.py ``` * (Optional) Evaluation summaries can be logged by simultaneously running: ``` python eval.py ``` #### Download a pre-trained network * For an adversarially trained network, run ``` python fetch_model.py adv_trained ``` and use the `config.json` file to set `"model_dir": "models/adv_trained"`. * For a naturally trained network, run ``` python fetch_model.py natural ``` and use the `config.json` file to set `"model_dir": "models/naturally_trained"`. #### Test the network * Create an attack file by running ``` python pgd_attack.py ``` * Evaluate the network with ``` python run_attack.py ``` ================================================ FILE: madry_cifar10/cifar10_input.py ================================================ """ Utilities for importing the CIFAR10 dataset. Each image in the dataset is a numpy array of shape (32, 32, 3), with the values being unsigned integers (i.e., in the range 0,1,...,255). 
""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import pickle import sys import tensorflow as tf from tensorflow.examples.tutorials.mnist import input_data version = sys.version_info import numpy as np class CIFAR10Data(object): """ Unpickles the CIFAR10 dataset from a specified folder containing a pickled version following the format of Krizhevsky which can be found [here](https://www.cs.toronto.edu/~kriz/cifar.html). Inputs to constructor ===================== - path: path to the pickled dataset. The training data must be pickled into five files named data_batch_i for i = 1, ..., 5, containing 10,000 examples each, the test data must be pickled into a single file called test_batch containing 10,000 examples, and the 10 class names must be pickled into a file called batches.meta. The pickled examples should be stored as a tuple of two objects: an array of 10,000 32x32x3-shaped arrays, and an array of their 10,000 true labels. """ def __init__(self, path): train_filenames = ['data_batch_{}'.format(ii + 1) for ii in range(5)] eval_filename = 'test_batch' metadata_filename = 'batches.meta' train_images = np.zeros((50000, 32, 32, 3), dtype='uint8') train_labels = np.zeros(50000, dtype='int32') for ii, fname in enumerate(train_filenames): cur_images, cur_labels = self._load_datafile(os.path.join(path, fname)) train_images[ii * 10000 : (ii+1) * 10000, ...] = cur_images train_labels[ii * 10000 : (ii+1) * 10000, ...] 
= cur_labels eval_images, eval_labels = self._load_datafile( os.path.join(path, eval_filename)) with open(os.path.join(path, metadata_filename), 'rb') as fo: if version.major == 3: data_dict = pickle.load(fo, encoding='bytes') else: data_dict = pickle.load(fo) self.label_names = data_dict[b'label_names'] for ii in range(len(self.label_names)): self.label_names[ii] = self.label_names[ii].decode('utf-8') self.train_data = DataSubset(train_images, train_labels) self.eval_data = DataSubset(eval_images, eval_labels) @staticmethod def _load_datafile(filename): with open(filename, 'rb') as fo: if version.major == 3: data_dict = pickle.load(fo, encoding='bytes') else: data_dict = pickle.load(fo) assert data_dict[b'data'].dtype == np.uint8 image_data = data_dict[b'data'] image_data = image_data.reshape((10000, 3, 32, 32)).transpose(0, 2, 3, 1) return image_data, np.array(data_dict[b'labels']) class AugmentedCIFAR10Data(object): """ Data augmentation wrapper over a loaded dataset. Inputs to constructor ===================== - raw_cifar10data: the loaded CIFAR10 dataset, via the CIFAR10Data class - sess: current tensorflow session - model: current model (needed for input tensor) """ def __init__(self, raw_cifar10data, sess, model): assert isinstance(raw_cifar10data, CIFAR10Data) self.image_size = 32 # create augmentation computational graph self.x_input_placeholder = tf.placeholder(tf.float32, shape=[None, 32, 32, 3]) padded = tf.map_fn(lambda img: tf.image.resize_image_with_crop_or_pad( img, self.image_size + 4, self.image_size + 4), self.x_input_placeholder) cropped = tf.map_fn(lambda img: tf.random_crop(img, [self.image_size, self.image_size, 3]), padded) flipped = tf.map_fn(lambda img: tf.image.random_flip_left_right(img), cropped) self.augmented = flipped self.train_data = AugmentedDataSubset(raw_cifar10data.train_data, sess, self.x_input_placeholder, self.augmented) self.eval_data = AugmentedDataSubset(raw_cifar10data.eval_data, sess, self.x_input_placeholder, 
self.augmented) self.label_names = raw_cifar10data.label_names class DataSubset(object): def __init__(self, xs, ys): self.xs = xs self.n = xs.shape[0] self.ys = ys self.batch_start = 0 self.cur_order = np.random.permutation(self.n) def get_next_batch(self, batch_size, multiple_passes=False, reshuffle_after_pass=True): if self.n < batch_size: raise ValueError('Batch size can be at most the dataset size') if not multiple_passes: actual_batch_size = min(batch_size, self.n - self.batch_start) if actual_batch_size <= 0: raise ValueError('Pass through the dataset is complete.') batch_end = self.batch_start + actual_batch_size batch_xs = self.xs[self.cur_order[self.batch_start : batch_end], ...] batch_ys = self.ys[self.cur_order[self.batch_start : batch_end], ...] self.batch_start += actual_batch_size return batch_xs, batch_ys actual_batch_size = min(batch_size, self.n - self.batch_start) if actual_batch_size < batch_size: if reshuffle_after_pass: self.cur_order = np.random.permutation(self.n) self.batch_start = 0 batch_end = self.batch_start + batch_size batch_xs = self.xs[self.cur_order[self.batch_start : batch_end], ...] batch_ys = self.ys[self.cur_order[self.batch_start : batch_end], ...] 
self.batch_start += batch_size return batch_xs, batch_ys class AugmentedDataSubset(object): def __init__(self, raw_datasubset, sess, x_input_placeholder, augmented): self.sess = sess self.raw_datasubset = raw_datasubset self.x_input_placeholder = x_input_placeholder self.augmented = augmented def get_next_batch(self, batch_size, multiple_passes=False, reshuffle_after_pass=True): raw_batch = self.raw_datasubset.get_next_batch(batch_size, multiple_passes, reshuffle_after_pass) images = raw_batch[0].astype(np.float32) return self.sess.run(self.augmented, feed_dict={self.x_input_placeholder: raw_batch[0]}), raw_batch[1] ================================================ FILE: madry_cifar10/config.json ================================================ { "_comment": "===== MODEL CONFIGURATION =====", "model_dir": "models/secret", "_comment": "===== DATASET CONFIGURATION =====", "data_path": "cifar10_data", "_comment": "===== TRAINING CONFIGURATION =====", "tf_random_seed": 451760341, "np_random_seed": 216105420, "max_num_training_steps": 80000, "num_output_steps": 100, "num_summary_steps": 100, "num_checkpoint_steps": 1000, "training_batch_size": 128, "step_size_schedule": [[0, 0.1], [40000, 0.01], [60000, 0.001]], "weight_decay": 0.0002, "momentum": 0.9, "_comment": "===== EVAL CONFIGURATION =====", "num_eval_examples": 100, "eval_batch_size": 100, "eval_on_cpu": false, "_comment": "=====ADVERSARIAL EXAMPLES CONFIGURATION=====", "epsilon": 8.0, "num_steps": 10, "step_size": 2.0, "random_start": true, "loss_func": "xent", "store_adv_path": "attack.npy" } ================================================ FILE: madry_cifar10/eval.py ================================================ """ Infinite evaluation loop going through the checkpoints in the model directory as they appear and evaluating them. Accuracy and average loss are printed and added as tensorboard summaries. 
""" from __future__ import absolute_import from __future__ import division from __future__ import print_function from datetime import datetime import json import math import os import sys import time import tensorflow as tf import cifar10_input from model import Model from pgd_attack import LinfPGDAttack # Global constants with open('config.json') as config_file: config = json.load(config_file) num_eval_examples = config['num_eval_examples'] eval_batch_size = config['eval_batch_size'] eval_on_cpu = config['eval_on_cpu'] data_path = config['data_path'] model_dir = config['model_dir'] # Set upd the data, hyperparameters, and the model cifar = cifar10_input.CIFAR10Data(data_path) if eval_on_cpu: with tf.device("/cpu:0"): model = Model(mode='eval') attack = LinfPGDAttack(model, config['epsilon'], config['num_steps'], config['step_size'], config['random_start'], config['loss_func']) else: model = Model(mode='eval') attack = LinfPGDAttack(model, config['epsilon'], config['num_steps'], config['step_size'], config['random_start'], config['loss_func']) global_step = tf.contrib.framework.get_or_create_global_step() # Setting up the Tensorboard and checkpoint outputs if not os.path.exists(model_dir): os.makedirs(model_dir) eval_dir = os.path.join(model_dir, 'eval') if not os.path.exists(eval_dir): os.makedirs(eval_dir) last_checkpoint_filename = '' already_seen_state = False saver = tf.train.Saver() summary_writer = tf.summary.FileWriter(eval_dir) # A function for evaluating a single checkpoint def evaluate_checkpoint(filename): with tf.Session() as sess: # Restore the checkpoint saver.restore(sess, filename) # Iterate over the samples batch-by-batch num_batches = int(math.ceil(num_eval_examples / eval_batch_size)) total_xent_nat = 0. total_xent_adv = 0. 
total_corr_nat = 0 total_corr_adv = 0 for ibatch in range(num_batches): bstart = ibatch * eval_batch_size bend = min(bstart + eval_batch_size, num_eval_examples) x_batch = cifar.eval_data.xs[bstart:bend, :] y_batch = cifar.eval_data.ys[bstart:bend] dict_nat = {model.x_input: x_batch, model.y_input: y_batch} x_batch_adv = attack.perturb(x_batch, y_batch, sess) dict_adv = {model.x_input: x_batch_adv, model.y_input: y_batch} cur_corr_nat, cur_xent_nat = sess.run( [model.num_correct,model.xent], feed_dict = dict_nat) cur_corr_adv, cur_xent_adv = sess.run( [model.num_correct,model.xent], feed_dict = dict_adv) print(eval_batch_size) print("Correctly classified natural examples: {}".format(cur_corr_nat)) print("Correctly classified adversarial examples: {}".format(cur_corr_adv)) total_xent_nat += cur_xent_nat total_xent_adv += cur_xent_adv total_corr_nat += cur_corr_nat total_corr_adv += cur_corr_adv avg_xent_nat = total_xent_nat / num_eval_examples avg_xent_adv = total_xent_adv / num_eval_examples acc_nat = total_corr_nat / num_eval_examples acc_adv = total_corr_adv / num_eval_examples summary = tf.Summary(value=[ tf.Summary.Value(tag='xent adv eval', simple_value= avg_xent_adv), tf.Summary.Value(tag='xent adv', simple_value= avg_xent_adv), tf.Summary.Value(tag='xent nat', simple_value= avg_xent_nat), tf.Summary.Value(tag='accuracy adv eval', simple_value= acc_adv), tf.Summary.Value(tag='accuracy adv', simple_value= acc_adv), tf.Summary.Value(tag='accuracy nat', simple_value= acc_nat)]) summary_writer.add_summary(summary, global_step.eval(sess)) print('natural: {:.2f}%'.format(100 * acc_nat)) print('adversarial: {:.2f}%'.format(100 * acc_adv)) print('avg nat loss: {:.4f}'.format(avg_xent_nat)) print('avg adv loss: {:.4f}'.format(avg_xent_adv)) # Infinite eval loop while True: cur_checkpoint = tf.train.latest_checkpoint(model_dir) # Case 1: No checkpoint yet if cur_checkpoint is None: if not already_seen_state: print('No checkpoint yet, waiting ...', end='') 
already_seen_state = True else: print('.', end='') sys.stdout.flush() time.sleep(10) # Case 2: Previously unseen checkpoint elif cur_checkpoint != last_checkpoint_filename: print('\nCheckpoint {}, evaluating ... ({})'.format(cur_checkpoint, datetime.now())) sys.stdout.flush() last_checkpoint_filename = cur_checkpoint already_seen_state = False evaluate_checkpoint(cur_checkpoint) # Case 3: Previously evaluated checkpoint else: if not already_seen_state: print('Waiting for the next checkpoint ... ({}) '.format( datetime.now()), end='') already_seen_state = True else: print('.', end='') sys.stdout.flush() time.sleep(10) ================================================ FILE: madry_cifar10/fetch_model.py ================================================ """Downloads a model, computes its SHA256 hash and unzips it at the proper location.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import sys import zipfile import hashlib if len(sys.argv) == 1 or sys.argv[1] not in ['natural', 'adv_trained', 'secret']: print('Usage: python fetch_model.py [natural, adv_trained]') sys.exit(1) if sys.argv[1] == 'natural': url = 'https://www.dropbox.com/s/cgzd5odqoojvxzk/natural.zip?dl=1' elif sys.argv[1] == 'adv_trained': url = 'https://www.dropbox.com/s/g4b6ntrp8zrudbz/adv_trained.zip?dl=1' else: # fetch secret model url = 'https://www.dropbox.com/s/ywc0hg8lr5ba8zd/secret.zip?dl=1' fname = url.split('/')[-1].split('?')[0] # get the name of the file # model download print('Downloading models') if sys.version_info >= (3,): import urllib.request urllib.request.urlretrieve(url, fname) else: import urllib urllib.urlretrieve(url, fname) # computing model hash sha256 = hashlib.sha256() with open(fname, 'rb') as f: data = f.read() sha256.update(data) print('SHA256 hash: {}'.format(sha256.hexdigest())) # extracting model print('Extracting model') with zipfile.ZipFile(fname, 'r') as model_zip: model_zip.extractall() print('Extracted 
model in {}'.format(model_zip.namelist()[0])) ================================================ FILE: madry_cifar10/model.py ================================================ # based on https://github.com/tensorflow/models/tree/master/resnet from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np import tensorflow as tf class Model(object): """ResNet model.""" def __init__(self, mode='eval'): """ResNet constructor. Args: mode: One of 'train' and 'eval'. """ self.mode = mode self._build_model() def add_internal_summaries(self): pass def _stride_arr(self, stride): """Map a stride scalar to the stride array for tf.nn.conv2d.""" return [1, stride, stride, 1] def _build_model(self): assert self.mode == 'train' or self.mode == 'eval' """Build the core model within the graph.""" with tf.variable_scope('input'): self.x_input = tf.placeholder( tf.float32, shape=[None, 32, 32, 3]) self.y_input = tf.placeholder(tf.int64, shape=None) input_standardized = tf.map_fn(lambda img: tf.image.per_image_standardization(img), self.x_input) x = self._conv('init_conv', input_standardized, 3, 3, 16, self._stride_arr(1)) strides = [1, 2, 2] activate_before_residual = [True, False, False] res_func = self._residual # Uncomment the following codes to use w28-10 wide residual network. # It is more memory efficient than very deep residual network and has # comparably good performance. 
  def _residual(self, x, in_filter, out_filter,
                stride, activate_before_residual=False):
    """Residual unit with 2 sub layers.

    Args:
      x: input feature map.
      in_filter: channel count of the input.
      out_filter: channel count of the output.
      stride: 4-element stride array as produced by _stride_arr.
      activate_before_residual: if True, apply BN+ReLU before the branch
        point so the shortcut also sees the activated tensor.
    """
    if activate_before_residual:
      with tf.variable_scope('shared_activation'):
        # BN + leaky ReLU applied before splitting into shortcut/residual.
        x = self._batch_norm('init_bn', x)
        x = self._relu(x, 0.1)
        orig_x = x
    else:
      with tf.variable_scope('residual_only_activation'):
        # Shortcut taken from the raw input; only the residual path is
        # pre-activated.
        orig_x = x
        x = self._batch_norm('init_bn', x)
        x = self._relu(x, 0.1)

    with tf.variable_scope('sub1'):
      x = self._conv('conv1', x, 3, in_filter, out_filter, stride)

    with tf.variable_scope('sub2'):
      x = self._batch_norm('bn2', x)
      x = self._relu(x, 0.1)
      x = self._conv('conv2', x, 3, out_filter, out_filter, [1, 1, 1, 1])

    with tf.variable_scope('sub_add'):
      if in_filter != out_filter:
        # Match the shortcut to the residual path: spatially downsample via
        # avg-pool with the same stride, then zero-pad the channel dim to
        # out_filter (split evenly before/after).
        orig_x = tf.nn.avg_pool(orig_x, stride, stride, 'VALID')
        orig_x = tf.pad(
            orig_x, [[0, 0], [0, 0], [0, 0],
                     [(out_filter-in_filter)//2, (out_filter-in_filter)//2]])
      x += orig_x

    tf.logging.debug('image after unit %s', x.get_shape())
    return x
class LinfPGDAttack:
  """White-box L_inf PGD attack; clips iterates to the [0, 255] pixel range."""

  def __init__(self, model, epsilon, num_steps, step_size, random_start, loss_func):
    """Attack parameter initialization. The attack performs k steps of size a,
    while always staying within epsilon from the initial point.

    Args:
      model: model exposing x_input, y_input, pre_softmax, xent.
      epsilon: L_inf radius (pixel scale).
      num_steps: number of PGD iterations.
      step_size: per-step L_inf step size.
      random_start: if True, start from a uniform random point in the eps-ball.
      loss_func: 'xent' or 'cw'; anything else falls back to cross-entropy
        with a printed warning.
    """
    self.model = model
    self.epsilon = epsilon
    self.num_steps = num_steps
    self.step_size = step_size
    self.rand = random_start

    if loss_func == 'xent':
      loss = model.xent
    elif loss_func == 'cw':
      # CW-style margin loss: push the correct logit below the best wrong
      # logit plus a confidence margin of 50.
      label_mask = tf.one_hot(model.y_input,
                              10,
                              on_value=1.0,
                              off_value=0.0,
                              dtype=tf.float32)
      correct_logit = tf.reduce_sum(label_mask * model.pre_softmax, axis=1)
      # The -1e4 term excludes the correct class from the max.
      wrong_logit = tf.reduce_max((1 - label_mask) * model.pre_softmax
                                  - 1e4 * label_mask, axis=1)
      loss = -tf.nn.relu(correct_logit - wrong_logit + 50)
    else:
      print('Unknown loss function. Defaulting to cross-entropy')
      loss = model.xent

    # Gradient of the attack loss w.r.t. the input image.
    self.grad = tf.gradients(loss, model.x_input)[0]

  def perturb(self, x_nat, y, sess):
    """Given a set of examples (x_nat, y), returns a set of adversarial
    examples within epsilon of x_nat in l_infinity norm."""
    if self.rand:
      # Random start promotes x to float (uniform noise is float64).
      x = x_nat + np.random.uniform(-self.epsilon, self.epsilon, x_nat.shape)
      x = np.clip(x, 0, 255)  # ensure valid pixel range
    else:
      # np.copy preserves x_nat's dtype, which may be an integer type.
      x = np.copy(x_nat)

    for i in range(self.num_steps):
      grad = sess.run(self.grad, feed_dict={self.model.x_input: x,
                                            self.model.y_input: y})

      # casting='unsafe' lets the float step be written back into x even when
      # x kept an integer dtype (non-random-start branch). NOTE(review): in
      # that case the step is truncated toward the int dtype — confirm
      # callers pass float inputs if sub-integer steps matter.
      x = np.add(x, self.step_size * np.sign(grad), out=x, casting='unsafe')

      # Project back into the eps-ball around x_nat, then the valid range.
      x = np.clip(x, x_nat - self.epsilon, x_nat + self.epsilon)
      x = np.clip(x, 0, 255)  # ensure valid pixel range

    return x
def run_attack(checkpoint, x_adv, epsilon):
    """Evaluate the model at `checkpoint` on precomputed adversarial examples.

    Args:
        checkpoint: TF checkpoint path to restore.
        x_adv: adversarial images, expected shape (10000, 32, 32, 3) in the
            same pixel scale as the natural eval images.
        epsilon: maximum allowed L_inf perturbation.

    First validates that x_adv stays within epsilon (L_inf, plus a small
    float tolerance) of the natural CIFAR-10 eval images; if violated,
    prints the violation and returns without evaluating. Otherwise prints
    accuracy and saves predicted labels to pred.npy.

    Uses module-level `data_path`.
    """
    cifar = cifar10_input.CIFAR10Data(data_path)

    model = Model(mode='eval')
    saver = tf.train.Saver()

    num_eval_examples = 10000
    eval_batch_size = 100
    num_batches = int(math.ceil(num_eval_examples / eval_batch_size))

    total_corr = 0

    x_nat = cifar.eval_data.xs
    # Max per-pixel perturbation over the whole set.
    l_inf = np.amax(np.abs(x_nat - x_adv))

    if l_inf > epsilon + 0.0001:
        print('maximum perturbation found: {}'.format(l_inf))
        print('maximum perturbation allowed: {}'.format(epsilon))
        return

    y_pred = []  # label accumulator

    with tf.Session() as sess:
        # Restore the checkpoint
        saver.restore(sess, checkpoint)

        # Iterate over the samples batch-by-batch
        for ibatch in range(num_batches):
            bstart = ibatch * eval_batch_size
            bend = min(bstart + eval_batch_size, num_eval_examples)

            x_batch = x_adv[bstart:bend, :]
            y_batch = cifar.eval_data.ys[bstart:bend]

            dict_adv = {model.x_input: x_batch, model.y_input: y_batch}
            cur_corr, y_pred_batch = sess.run([model.num_correct, model.predictions],
                                              feed_dict=dict_adv)

            total_corr += cur_corr
            y_pred.append(y_pred_batch)

    # True division via `from __future__ import division` at file top.
    accuracy = total_corr / num_eval_examples

    print('Accuracy: {:.2f}%'.format(100.0 * accuracy))
    y_pred = np.concatenate(y_pred, axis=0)
    np.save('pred.npy', y_pred)
    print('Output saved at pred.npy')
Expected [0, 255], found [{}, {}]'.format( np.amin(x_adv), np.amax(x_adv))) else: run_attack(checkpoint, x_adv, config['epsilon']) ================================================ FILE: madry_cifar10/train.py ================================================ """Trains a model, saving checkpoints and tensorboard summaries along the way.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function from datetime import datetime import json import os import shutil from timeit import default_timer as timer import tensorflow as tf import numpy as np from model import Model import cifar10_input from pgd_attack import LinfPGDAttack with open('config.json') as config_file: config = json.load(config_file) # seeding randomness tf.set_random_seed(config['tf_random_seed']) np.random.seed(config['np_random_seed']) # Setting up training parameters max_num_training_steps = config['max_num_training_steps'] num_output_steps = config['num_output_steps'] num_summary_steps = config['num_summary_steps'] num_checkpoint_steps = config['num_checkpoint_steps'] step_size_schedule = config['step_size_schedule'] weight_decay = config['weight_decay'] data_path = config['data_path'] momentum = config['momentum'] batch_size = config['training_batch_size'] # Setting up the data and the model raw_cifar = cifar10_input.CIFAR10Data(data_path) global_step = tf.contrib.framework.get_or_create_global_step() model = Model(mode='train') # Setting up the optimizer boundaries = [int(sss[0]) for sss in step_size_schedule] boundaries = boundaries[1:] values = [sss[1] for sss in step_size_schedule] learning_rate = tf.train.piecewise_constant( tf.cast(global_step, tf.int32), boundaries, values) total_loss = model.mean_xent + weight_decay * model.weight_decay_loss train_step = tf.train.MomentumOptimizer(learning_rate, momentum).minimize( total_loss, global_step=global_step) # Set up adversary attack = LinfPGDAttack(model, config['epsilon'], config['num_steps'], 
config['step_size'], config['random_start'], config['loss_func']) # Setting up the Tensorboard and checkpoint outputs model_dir = config['model_dir'] if not os.path.exists(model_dir): os.makedirs(model_dir) # We add accuracy and xent twice so we can easily make three types of # comparisons in Tensorboard: # - train vs eval (for a single run) # - train of different runs # - eval of different runs saver = tf.train.Saver(max_to_keep=3) tf.summary.scalar('accuracy adv train', model.accuracy) tf.summary.scalar('accuracy adv', model.accuracy) tf.summary.scalar('xent adv train', model.xent / batch_size) tf.summary.scalar('xent adv', model.xent / batch_size) tf.summary.image('images adv train', model.x_input) merged_summaries = tf.summary.merge_all() # keep the configuration file with the model for reproducibility shutil.copy('config.json', model_dir) with tf.Session() as sess: # initialize data augmentation cifar = cifar10_input.AugmentedCIFAR10Data(raw_cifar, sess, model) # Initialize the summary writer, global variables, and our time counter. 
summary_writer = tf.summary.FileWriter(model_dir, sess.graph) sess.run(tf.global_variables_initializer()) training_time = 0.0 # Main training loop for ii in range(max_num_training_steps): x_batch, y_batch = cifar.train_data.get_next_batch(batch_size, multiple_passes=True) # Compute Adversarial Perturbations start = timer() x_batch_adv = attack.perturb(x_batch, y_batch, sess) end = timer() training_time += end - start nat_dict = {model.x_input: x_batch, model.y_input: y_batch} adv_dict = {model.x_input: x_batch_adv, model.y_input: y_batch} # Output to stdout if ii % num_output_steps == 0: nat_acc = sess.run(model.accuracy, feed_dict=nat_dict) adv_acc = sess.run(model.accuracy, feed_dict=adv_dict) print('Step {}: ({})'.format(ii, datetime.now())) print(' training nat accuracy {:.4}%'.format(nat_acc * 100)) print(' training adv accuracy {:.4}%'.format(adv_acc * 100)) if ii != 0: print(' {} examples per second'.format( num_output_steps * batch_size / training_time)) training_time = 0.0 # Tensorboard summaries if ii % num_summary_steps == 0: summary = sess.run(merged_summaries, feed_dict=adv_dict) summary_writer.add_summary(summary, global_step.eval(sess)) # Write a checkpoint if ii % num_checkpoint_steps == 0: saver.save(sess, os.path.join(model_dir, 'checkpoint'), global_step=global_step) # Actual training step start = timer() sess.run(train_step, feed_dict=adv_dict) end = timer() training_time += end - start ================================================ FILE: madry_mnist/LICENSE ================================================ MIT License Copyright (c) 2017 Aleksander Madry, Aleksandar Makelov, Ludwig Schmidt, Dimitris Tsipras, and Adrian Vladu Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the 
Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: madry_mnist/config.json ================================================ { "_comment": "===== MODEL CONFIGURATION =====", "model_dir": "models/secret", "_comment": "===== TRAINING CONFIGURATION =====", "random_seed": 4557077, "max_num_training_steps": 100000, "num_output_steps": 100, "num_summary_steps": 100, "num_checkpoint_steps": 300, "training_batch_size": 50, "_comment": "===== EVAL CONFIGURATION =====", "num_eval_examples": 10000, "eval_on_cpu": false, "_comment": "=====ADVERSARIAL EXAMPLES CONFIGURATION=====", "epsilon": 0.3, "k": 100, "a": 0.01, "random_start": true, "loss_func": "xent", "store_adv_path": "attack.npy" } ================================================ FILE: madry_mnist/eval.py ================================================ """ Infinite evaluation loop going through the checkpoints in the model directory as they appear and evaluating them. Accuracy and average loss are printed and added as tensorboard summaries. 
""" from __future__ import absolute_import from __future__ import division from __future__ import print_function from datetime import datetime import json import math import os import sys import time import tensorflow as tf from tensorflow.examples.tutorials.mnist import input_data from model import Model from attack import LinfPGDAttack # Global constants with open('config.json') as config_file: config = json.load(config_file) num_eval_examples = config['num_eval_examples'] eval_batch_size = config['eval_batch_size'] eval_on_cpu = config['eval_on_cpu'] model_dir = config['model_dir'] # Set upd the data, hyperparameters, and the model mnist = input_data.read_data_sets('MNIST_data', one_hot=False) if eval_on_cpu: with tf.device("/cpu:0"): model = Model() attack = LinfPGDAttack(model, config['epsilon'], config['k'], config['a'], config['random_start'], config['loss_func']) else: model = Model() attack = LinfPGDAttack(model, config['epsilon'], config['k'], config['a'], config['random_start'], config['loss_func']) global_step = tf.contrib.framework.get_or_create_global_step() # Setting up the Tensorboard and checkpoint outputs if not os.path.exists(model_dir): os.makedirs(model_dir) eval_dir = os.path.join(model_dir, 'eval') if not os.path.exists(eval_dir): os.makedirs(eval_dir) last_checkpoint_filename = '' already_seen_state = False saver = tf.train.Saver() summary_writer = tf.summary.FileWriter(eval_dir) # A function for evaluating a single checkpoint def evaluate_checkpoint(filename): with tf.Session() as sess: # Restore the checkpoint saver.restore(sess, filename) # Iterate over the samples batch-by-batch num_batches = int(math.ceil(num_eval_examples / eval_batch_size)) total_xent_nat = 0. total_xent_adv = 0. 
total_corr_nat = 0 total_corr_adv = 0 for ibatch in range(num_batches): bstart = ibatch * eval_batch_size bend = min(bstart + eval_batch_size, num_eval_examples) x_batch = mnist.test.images[bstart:bend, :] y_batch = mnist.test.labels[bstart:bend] dict_nat = {model.x_input: x_batch, model.y_input: y_batch} x_batch_adv = attack.perturb(x_batch, y_batch, sess) dict_adv = {model.x_input: x_batch_adv, model.y_input: y_batch} cur_corr_nat, cur_xent_nat = sess.run( [model.num_correct,model.xent], feed_dict = dict_nat) cur_corr_adv, cur_xent_adv = sess.run( [model.num_correct,model.xent], feed_dict = dict_adv) total_xent_nat += cur_xent_nat total_xent_adv += cur_xent_adv total_corr_nat += cur_corr_nat total_corr_adv += cur_corr_adv avg_xent_nat = total_xent_nat / num_eval_examples avg_xent_adv = total_xent_adv / num_eval_examples acc_nat = total_corr_nat / num_eval_examples acc_adv = total_corr_adv / num_eval_examples summary = tf.Summary(value=[ tf.Summary.Value(tag='xent adv eval', simple_value= avg_xent_adv), tf.Summary.Value(tag='xent adv', simple_value= avg_xent_adv), tf.Summary.Value(tag='xent nat', simple_value= avg_xent_nat), tf.Summary.Value(tag='accuracy adv eval', simple_value= acc_adv), tf.Summary.Value(tag='accuracy adv', simple_value= acc_adv), tf.Summary.Value(tag='accuracy nat', simple_value= acc_nat)]) summary_writer.add_summary(summary, global_step.eval(sess)) print('natural: {:.2f}%'.format(100 * acc_nat)) print('adversarial: {:.2f}%'.format(100 * acc_adv)) print('avg nat loss: {:.4f}'.format(avg_xent_nat)) print('avg adv loss: {:.4f}'.format(avg_xent_adv)) # Infinite eval loop while True: cur_checkpoint = tf.train.latest_checkpoint(model_dir) # Case 1: No checkpoint yet if cur_checkpoint is None: if not already_seen_state: print('No checkpoint yet, waiting ...', end='') already_seen_state = True else: print('.', end='') sys.stdout.flush() time.sleep(10) # Case 2: Previously unseen checkpoint elif cur_checkpoint != last_checkpoint_filename: 
print('\nCheckpoint {}, evaluating ... ({})'.format(cur_checkpoint, datetime.now())) sys.stdout.flush() last_checkpoint_filename = cur_checkpoint already_seen_state = False evaluate_checkpoint(cur_checkpoint) # Case 3: Previously evaluated checkpoint else: if not already_seen_state: print('Waiting for the next checkpoint ... ({}) '.format( datetime.now()), end='') already_seen_state = True else: print('.', end='') sys.stdout.flush() time.sleep(10) ================================================ FILE: madry_mnist/fetch_model.py ================================================ """Downloads a model, computes its SHA256 hash and unzips it at the proper location.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import sys import zipfile import hashlib if len(sys.argv) != 2 or sys.argv[1] not in ['natural', 'adv_trained', 'secret']: print('Usage: python fetch_model.py [natural, adv_trained, secret]') sys.exit(1) if sys.argv[1] == 'natural': url = 'https://github.com/MadryLab/mnist_challenge_models/raw/master/natural.zip' elif sys.argv[1] == 'secret': url = 'https://github.com/MadryLab/mnist_challenge_models/raw/master/secret.zip' else: # fetch adv_trained model url = 'https://github.com/MadryLab/mnist_challenge_models/raw/master/adv_trained.zip' fname = url.split('/')[-1] # get the name of the file # model download print('Downloading models') if sys.version_info >= (3,): import urllib.request urllib.request.urlretrieve(url, fname) else: import urllib urllib.urlretrieve(url, fname) # computing model hash sha256 = hashlib.sha256() with open(fname, 'rb') as f: data = f.read() sha256.update(data) print('SHA256 hash: {}'.format(sha256.hexdigest())) # extracting model print('Extracting model') with zipfile.ZipFile(fname, 'r') as model_zip: model_zip.extractall() print('Extracted model in {}'.format(model_zip.namelist()[0])) ================================================ FILE: madry_mnist/model.py 
class Model(object):
  """Two-conv-layer CNN for MNIST (adapted from the TF MNIST tutorial).

  Builds the whole graph at construction time and exposes:
    x_input: (None, 784) float32 placeholder, flattened images.
    y_input: (None,) int64 label placeholder.
    pre_softmax: class logits.
    xent / xent_per_point: summed / per-example cross-entropy.
    y_pred, num_correct, accuracy: prediction and accuracy ops.
  """

  def __init__(self):
    self.x_input = tf.placeholder(tf.float32, shape = [None, 784])
    self.y_input = tf.placeholder(tf.int64, shape = [None])

    # Reshape flat 784-vectors to NHWC image tensors.
    self.x_image = tf.reshape(self.x_input, [-1, 28, 28, 1])

    # first convolutional layer
    W_conv1 = self._weight_variable([5,5,1,32])
    b_conv1 = self._bias_variable([32])

    h_conv1 = tf.nn.relu(self._conv2d(self.x_image, W_conv1) + b_conv1)
    h_pool1 = self._max_pool_2x2(h_conv1)

    # second convolutional layer
    W_conv2 = self._weight_variable([5,5,32,64])
    b_conv2 = self._bias_variable([64])

    h_conv2 = tf.nn.relu(self._conv2d(h_pool1, W_conv2) + b_conv2)
    h_pool2 = self._max_pool_2x2(h_conv2)

    # first fully connected layer; 7*7*64 = spatial dims after two 2x2
    # poolings (28 -> 14 -> 7) times 64 channels.
    W_fc1 = self._weight_variable([7 * 7 * 64, 1024])
    b_fc1 = self._bias_variable([1024])

    h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

    # output layer
    W_fc2 = self._weight_variable([1024,10])
    b_fc2 = self._bias_variable([10])

    self.pre_softmax = tf.matmul(h_fc1, W_fc2) + b_fc2

    y_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=self.y_input, logits=self.pre_softmax)

    self.xent_per_point = y_xent
    self.xent = tf.reduce_sum(y_xent)

    self.y_pred = tf.argmax(self.pre_softmax, 1)

    correct_prediction = tf.equal(self.y_pred, self.y_input)

    self.num_correct = tf.reduce_sum(tf.cast(correct_prediction, tf.int64))
    self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

  @staticmethod
  def _weight_variable(shape):
    """Truncated-normal (stddev 0.1) weight variable."""
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

  @staticmethod
  def _bias_variable(shape):
    """Constant-0.1 bias variable."""
    initial = tf.constant(0.1, shape = shape)
    return tf.Variable(initial)

  @staticmethod
  def _conv2d(x, W):
    """Stride-1, SAME-padded 2-D convolution."""
    return tf.nn.conv2d(x, W, strides=[1,1,1,1], padding='SAME')

  @staticmethod
  def _max_pool_2x2( x):
    """2x2 max pooling with stride 2 (halves spatial dims)."""
    return tf.nn.max_pool(x,
                          ksize = [1,2,2,1],
                          strides=[1,2,2,1],
                          padding='SAME')
if __name__ == '__main__':
    import json

    # run configuration (model location, adversarial-examples file, epsilon)
    with open('config.json') as config_file:
        config = json.load(config_file)

    model_dir = config['model_dir']
    checkpoint = tf.train.latest_checkpoint(model_dir)
    # candidate adversarial examples produced by an attack
    x_adv = np.load(config['store_adv_path'])

    # sanity-check the submission before evaluating it against the model
    if checkpoint is None:
        print('No checkpoint found')
    elif x_adv.shape != (10000, 784):
        # must cover the full MNIST test set, flattened
        print('Invalid shape: expected (10000,784), found {}'.format(x_adv.shape))
    elif np.amax(x_adv) > 1.0001 or \
         np.amin(x_adv) < -0.0001 or \
         np.isnan(np.amax(x_adv)):
        # pixels must stay in [0, 1] (small tolerance for float error); the
        # isnan check also rejects submissions containing NaNs
        print('Invalid pixel range. Expected [0, 1], found [{}, {}]'.format(
            np.amin(x_adv), np.amax(x_adv)))
    else:
        run_attack(checkpoint, x_adv, config['epsilon'])
# Setting up the Tensorboard and checkpoint outputs
model_dir = config['model_dir']
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

# We add accuracy and xent twice so we can easily make three types of
# comparisons in Tensorboard:
# - train vs eval (for a single run)
# - train of different runs
# - eval of different runs
saver = tf.train.Saver(max_to_keep=3)
tf.summary.scalar('accuracy adv train', model.accuracy)
tf.summary.scalar('accuracy adv', model.accuracy)
tf.summary.scalar('xent adv train', model.xent / batch_size)
tf.summary.scalar('xent adv', model.xent / batch_size)
tf.summary.image('images adv train', model.x_image)
merged_summaries = tf.summary.merge_all()

# keep a copy of the exact config used for this run next to the checkpoints
shutil.copy('config.json', model_dir)

with tf.Session() as sess:
    # Initialize the summary writer, global variables, and our time counter.
    summary_writer = tf.summary.FileWriter(model_dir, sess.graph)
    sess.run(tf.global_variables_initializer())
    training_time = 0.0

    # Main training loop: each step trains on adversarial examples only
    for ii in range(max_num_training_steps):
        x_batch, y_batch = mnist.train.next_batch(batch_size)

        # Compute Adversarial Perturbations (PGD attack on the current model)
        start = timer()
        x_batch_adv = attack.perturb(x_batch, y_batch, sess)
        end = timer()
        training_time += end - start

        nat_dict = {model.x_input: x_batch, model.y_input: y_batch}
        adv_dict = {model.x_input: x_batch_adv, model.y_input: y_batch}

        # Output to stdout
        if ii % num_output_steps == 0:
            nat_acc = sess.run(model.accuracy, feed_dict=nat_dict)
            adv_acc = sess.run(model.accuracy, feed_dict=adv_dict)
            print('Step {}: ({})'.format(ii, datetime.now()))
            print(' training nat accuracy {:.4}%'.format(nat_acc * 100))
            print(' training adv accuracy {:.4}%'.format(adv_acc * 100))
            if ii != 0:
                # throughput over the steps since the last report
                print(' {} examples per second'.format(
                    num_output_steps * batch_size / training_time))
                training_time = 0.0

        # Tensorboard summaries
        if ii % num_summary_steps == 0:
            summary = sess.run(merged_summaries, feed_dict=adv_dict)
            summary_writer.add_summary(summary, global_step.eval(sess))

        # Write a checkpoint
        if ii % num_checkpoint_steps == 0:
            saver.save(sess,
                       os.path.join(model_dir, 'checkpoint'),
                       global_step=global_step)

        # Actual training step (on the adversarial batch)
        start = timer()
        sess.run(train_step, feed_dict=adv_dict)
        end = timer()
        training_time += end - start
class Model:
    """Framework-agnostic base for the TF / PyTorch model wrappers.

    Subclasses implement `predict`; `loss` works on plain numpy arrays and is
    shared by both back-ends.
    """

    def __init__(self, batch_size, gpu_memory):
        self.batch_size = batch_size
        self.gpu_memory = gpu_memory

    def predict(self, x):
        raise NotImplementedError('use ModelTF or ModelPT')

    def loss(self, y, logits, targeted=False, loss_type='margin_loss'):
        """ Implements the margin loss (difference between the correct and 2nd best class). """
        if loss_type == 'margin_loss':
            # logit of the correct class for each example (y is a one-hot mask)
            correct_logit = (logits * y).sum(1, keepdims=True)
            gaps = correct_logit - logits
            gaps[y] = np.inf  # exclude the zero coming from f_correct - f_correct
            margin = gaps.min(1, keepdims=True)
            value = -margin if targeted else margin
        elif loss_type == 'cross_entropy':
            value = -np.log(utils.softmax(logits)[y])
            if not targeted:
                value = -value
        else:
            raise ValueError('Wrong loss.')
        return value.flatten()
class ModelTF(Model):
    """ Wrapper class around TensorFlow models.

    In order to incorporate a new model, one has to ensure that self.model has a
    TF variable `logits`, and that the preprocessing of the inputs is done
    correctly (e.g. subtracting the mean and dividing over the standard
    deviation).
    """

    def __init__(self, model_name, batch_size, gpu_memory):
        super().__init__(batch_size, gpu_memory)
        model_folder = model_path_dict[model_name]
        model_file = tf.train.latest_checkpoint(model_folder)
        self.model = model_class_dict[model_name]()
        self.batch_size = batch_size
        self.model_name = model_name
        self.model_file = model_file
        # some models (e.g. the Madry models) expose `pre_softmax` rather than
        # `logits`; alias it so the rest of the code has a uniform interface
        if 'logits' not in self.model.__dict__:
            self.model.logits = self.model.pre_softmax
        # cap the GPU memory fraction so several runs can share one GPU
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_memory)
        config = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)
        self.sess = tf.Session(config=config)
        tf.train.Saver().restore(self.sess, model_file)

    def predict(self, x):
        # MNIST models define their own input shape; reshape the flat batch to it
        if 'mnist' in self.model_name:
            shape = self.model.x_input.shape[1:].as_list()
            x = np.reshape(x, [-1, *shape])
        elif 'cifar10' in self.model_name:
            # incoming batches are NCHW (PyTorch convention); TF wants NHWC
            x = np.transpose(x, axes=[0, 2, 3, 1])
        # run the model in batches to respect the memory budget
        n_batches = math.ceil(x.shape[0] / self.batch_size)
        logits_list = []
        for i in range(n_batches):
            x_batch = x[i*self.batch_size:(i+1)*self.batch_size]
            logits = self.sess.run(self.model.logits, feed_dict={self.model.x_input: x_batch})
            logits_list.append(logits)
        logits = np.vstack(logits_list)
        return logits
""" def __init__(self, model_name, batch_size, gpu_memory): super().__init__(batch_size, gpu_memory) if model_name in ['pt_vgg', 'pt_resnet', 'pt_inception', 'pt_densenet']: model = model_class_dict[model_name](pretrained=True) self.mean = np.reshape([0.485, 0.456, 0.406], [1, 3, 1, 1]) self.std = np.reshape([0.229, 0.224, 0.225], [1, 3, 1, 1]) model = DataParallel(model.cuda()) else: model = model_class_dict[model_name]() if model_name in ['pt_post_avg_cifar10', 'pt_post_avg_imagenet']: # checkpoint = torch.load(model_path_dict[model_name]) self.mean = np.reshape([0.485, 0.456, 0.406], [1, 3, 1, 1]) self.std = np.reshape([0.229, 0.224, 0.225], [1, 3, 1, 1]) else: model = DataParallel(model).cuda() checkpoint = torch.load(model_path_dict[model_name] + '.pth') self.mean = np.reshape([0.485, 0.456, 0.406], [1, 3, 1, 1]) self.std = np.reshape([0.225, 0.225, 0.225], [1, 3, 1, 1]) model.load_state_dict(checkpoint) model.float() self.mean, self.std = self.mean.astype(np.float32), self.std.astype(np.float32) model.eval() self.model = model def predict(self, x): x = (x - self.mean) / self.std x = x.astype(np.float32) n_batches = math.ceil(x.shape[0] / self.batch_size) logits_list = [] with torch.no_grad(): # otherwise consumes too much memory and leads to a slowdown for i in range(n_batches): x_batch = x[i*self.batch_size:(i+1)*self.batch_size] x_batch_torch = torch.as_tensor(x_batch, device=torch.device('cuda')) logits = self.model(x_batch_torch).cpu().numpy() logits_list.append(logits) logits = np.vstack(logits_list) return logits model_path_dict = {'madry_mnist_robust': 'madry_mnist/models/robust', 'madry_cifar10_robust': 'madry_cifar10/models/robust', 'clp_mnist': 'logit_pairing/models/clp_mnist', 'lsq_mnist': 'logit_pairing/models/lsq_mnist', 'clp_cifar10': 'logit_pairing/models/clp_cifar10', 'lsq_cifar10': 'logit_pairing/models/lsq_cifar10', 'pt_post_avg_cifar10': 'post_avg/trainedModel/resnet110.th' } model_class_dict = {'pt_vgg': torch_models.vgg16_bn, 'pt_resnet': 
torch_models.resnet50, 'pt_inception': torch_models.inception_v3, 'pt_densenet': torch_models.densenet121, 'madry_mnist_robust': madry_model_mnist, 'madry_cifar10_robust': madry_model_cifar10, 'clp_mnist': lp_model_mnist, 'lsq_mnist': lp_model_mnist, 'clp_cifar10': lp_model_cifar10, 'lsq_cifar10': lp_model_cifar10, 'pt_post_avg_cifar10': post_avg_cifar10_resnet, 'pt_post_avg_imagenet': post_avg_imagenet_resnet, } all_model_names = list(model_class_dict.keys()) ================================================ FILE: post_avg/LICENSE.txt ================================================ MIT License Copyright (c) [2019] [Yuping Lin] Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
def checkEntropy(scores):
    """Shannon entropy of a score vector (non-positive entries contribute zero)."""
    scores = scores.squeeze()
    safe = scores.clone()
    safe[safe <= 0] = 1.0  # log(1) == 0, so these entries drop out of the sum
    return -torch.sum(scores * torch.log(safe))


def checkConfidence(scores, K=10):
    """Ratio of the top score to the sum of the top-K scores (1/K..1, higher = more confident)."""
    scores = scores.squeeze()
    ranked, _ = torch.sort(scores, dim=0, descending=True)
    return ranked[0] / torch.sum(ranked[:K])


def integratedForward(model, sps, batchSize, nClasses, device='cpu', voteMethod='avg_softmax'):
    """Forward a 'squad' of neighbor samples and aggregate their outputs.

    Returns (feat, feats): the aggregated (1, nClasses) prediction and the raw
    per-sample outputs (post-softmax for the softmax vote methods).
    """
    n_samples = sps.size(0)
    feats = torch.empty(n_samples, nClasses)
    model = model.to(device)
    with torch.no_grad():
        # forward in chunks of batchSize to bound memory usage
        for start in range(0, n_samples, batchSize):
            cuda.empty_cache()
            stop = min(start + batchSize, n_samples)
            out = model(sps[start:stop, :].to(device)).detach().to('cpu')
            feats[start:stop, :] = out
    if voteMethod == 'avg_feat':
        # plain average of the raw outputs
        feat = torch.mean(feats, dim=0, keepdim=True)
    elif voteMethod == 'most_vote':
        # count how often each class is the argmax
        maxV, _ = torch.max(feats, dim=1, keepdim=True)
        feat = torch.sum(feats == maxV, dim=0, keepdim=True)
    elif voteMethod == 'weighted_feat':
        # average output re-weighted by per-class vote counts
        feat = torch.mean(feats, dim=0, keepdim=True)
        maxV, _ = torch.max(feats, dim=1, keepdim=True)
        feat = feat * torch.sum(feats == maxV, dim=0, keepdim=True).float()
    elif voteMethod == 'avg_softmax':
        feats = nn.functional.softmax(feats, dim=1)
        feat = torch.mean(feats, dim=0, keepdim=True)
    else:
        # default method: avg_softmax
        feats = nn.functional.softmax(feats, dim=1)
        feat = torch.mean(feats, dim=0, keepdim=True)
    return feat, feats
# not updated, deprecated
def integratedForward_cls(model, sps, batchSize, nClasses, device='cpu', count_votes=False):
    """Like integratedForward, but forwards through `model.classifier` only
    (inputs are pre-computed features). Deprecated."""
    N = sps.size(0)
    feats = torch.empty(N, nClasses)
    model = model.to(device)
    with torch.no_grad():
        baseInx = 0
        # forward in chunks of batchSize to bound memory usage
        while baseInx < N:
            cuda.empty_cache()
            endInx = min(baseInx + batchSize, N)
            y = model.classifier(sps[baseInx:endInx, :].to(device)).detach().to('cpu')
            feats[baseInx:endInx, :] = y
            baseInx = endInx
    if count_votes:
        # count per-class argmax votes
        maxV, _ = torch.max(feats, dim=1, keepdim=True)
        feat = torch.sum(feats == maxV, dim=0, keepdim=True)
    else:
        feat = torch.mean(feats, dim=0, keepdim=True)
    return feat, feats


def findNeighbors_random(sp, K, r=[2], direction='both'):
    """Returns neighbors of the single image `sp` along K random unit directions,
    at every radius in `r`. Requires CUDA (tensors are moved to 'cuda').

    NOTE: `r=[2]` is a mutable default argument; callers must not modify it in place.
    """
    # only accept single sample
    if sp.size(0) != 1:
        return None
    # a per-layer list of counts collapses to its total here
    if isinstance(K, list):
        K = sum(K)
    # randomly select directions: unit vectors in flattened image space
    shifts = torch.randn(K, sp.size(1) * sp.size(2) * sp.size(3)).to('cuda')
    shifts = nn.functional.normalize(shifts, p=2, dim=1)
    shifts = shifts.view(K, sp.size(1), sp.size(2), sp.size(3)).contiguous()
    if direction == 'both':
        # add the opposite direction for each sampled one
        shifts = torch.cat([shifts, -shifts], dim=0)
    nbs = []
    for rInx in range(len(r)):
        nbs.append(sp.to('cuda') + r[rInx] * shifts)
    return torch.cat(nbs, dim=0)
selected_list.append((d.clone().detach().to('cpu'), x.grad.clone().detach().to('cpu'))) selected_list = sorted(selected_list, key=lambda x:x[0], reverse=False) selected_list = selected_list[0:K] # generate neighboring samples grad_list = [e[1] / torch.norm(e[1]) for e in selected_list] unit_shifts = torch.cat(grad_list, dim=0) nbs = [] for rInx in range(len(r)): if direction == 'inc': nbs.append(sp.to('cpu') + r[rInx] * unit_shifts) elif direction == 'dec': nbs.append(sp.to('cpu') - r[rInx] * unit_shifts) else: nbs.append(sp.to('cpu') + r[rInx] * unit_shifts) nbs.append(sp.to('cpu') - r[rInx] * unit_shifts) nbs = torch.cat(nbs, dim=0) nbs = nbs.detach() nbs.requires_grad = False return nbs def findNeighbors_lastLy_vgg(model, sp, K, r=[2], direction='both', device='cpu'): # only accept single sample if sp.size(0) != 1: return None # storages for K selected distances and linear mapping selected_list = [] # set model to evaluation mode model = model.to(device) model = model.eval() # place holder for input, and set to require gradient x = sp.clone().to(device) x.requires_grad = True # forward through the feature part y = model(x) y = y.view(y.size(0), -1) for i in range(y.size(1)): # clear previous gradients x.grad = None # compute gradients goal = torch.abs(y[0, i]) if i < y.size(1) - 1: goal.backward(retain_graph=True) # retain graph for future computation else: goal.backward(retain_graph=False) # compute distance d = torch.abs(y[0, i]) / torch.norm(x.grad) # keep K shortest distances selected_list.append((d.clone().detach().to('cpu'), x.grad.clone().detach().to('cpu'))) selected_list = sorted(selected_list, key=lambda x:x[0], reverse=False) selected_list = selected_list[0:K] # generate neighboring samples grad_list = [e[1] / torch.norm(e[1]) for e in selected_list] unit_shifts = torch.cat(grad_list, dim=0) nbs = [] for rInx in range(len(r)): if direction == 'inc': nbs.append(sp.to('cpu') + r[rInx] * unit_shifts) elif direction == 'dec': nbs.append(sp.to('cpu') - 
r[rInx] * unit_shifts) else: nbs.append(sp.to('cpu') + r[rInx] * unit_shifts) nbs.append(sp.to('cpu') - r[rInx] * unit_shifts) nbs = torch.cat(nbs, dim=0) nbs = nbs.detach() nbs.requires_grad = False return nbs def findNeighbors_approx_vgg(model, sp, K, r=[2], direction='both', device='cpu'): # only accept single sample if sp.size(0) != 1: return None # storages for K selected distances and linear mapping selected_list = [] # set model to evaluation mode model = model.to(device) model = model.eval() # place holder for input, and set to require gradient x = sp.clone().to(device) x.requires_grad = True # forward through the feature part y = model.features(x) y = model.avgpool(y) y = y.view(y.size(0), -1) # forward through classifier layer by layer lnLy_inx = 0 for lyInx, module in model.classifier.named_children(): # forward y = module(y) # at each layer activation if isinstance(module, nn.Linear): KInx = min(lnLy_inx, len(K)-1) if K[KInx] > 0: with torch.no_grad(): # compute weight norm w_norm = torch.norm(module.weight, dim=1, keepdim=True) # compute distance d = torch.abs(y) / w_norm.t() _, sortedInx = torch.sort(d, dim=1, descending=False) # for each selected neuron for i in range(K[KInx]): # clear previous gradients x.grad = None # compute gradients goal = torch.abs(y[0, sortedInx[0, i]]) goal.backward(retain_graph=True) # retain graph for future computation # record gradients selected_list.append(x.grad.clone().detach().to('cpu') / torch.norm(x.grad).detach().to('cpu')) # update number of linear layer sampled lnLy_inx = lnLy_inx + 1 # generate neighboring samples unit_shifts = torch.cat(selected_list, dim=0) nbs = [] for rInx in range(len(r)): if direction == 'inc': nbs.append(sp.to('cpu') + r[rInx] * unit_shifts) elif direction == 'dec': nbs.append(sp.to('cpu') - r[rInx] * unit_shifts) else: nbs.append(sp.to('cpu') + r[rInx] * unit_shifts) nbs.append(sp.to('cpu') - r[rInx] * unit_shifts) nbs = torch.cat(nbs, dim=0) nbs = nbs.detach() nbs.requires_grad = False 
return nbs def findNeighbors_randPick_vgg(model, sp, K, r=[2], direction='both', device='cpu'): # only accept single sample if sp.size(0) != 1: return None # storages for K selected distances and linear mapping selected_list = [] # set model to evaluation mode model = model.to(device) model = model.eval() # place holder for input, and set to require gradient x = sp.clone().to(device) x.requires_grad = True # forward through the feature part y = model.features(x) y = model.avgpool(y) y = y.view(y.size(0), -1) # forward through classifier layer by layer lnLy_inx = 0 for lyInx, module in model.classifier.named_children(): # forward y = module(y) # at each layer activation if isinstance(module, nn.Linear): KInx = min(lnLy_inx, len(K)-1) if K[KInx] > 0: # randomly permute indices pickInx = torch.randperm(y.size(1)) # for each selected neuron for i in range(K[KInx]): # clear previous gradients x.grad = None # compute gradients goal = torch.abs(y[0, pickInx[i]]) goal.backward(retain_graph=True) # retain graph for future computation # record gradients selected_list.append(x.grad.clone().detach().to('cpu') / torch.norm(x.grad).detach().to('cpu')) # update number of linear layer sampled lnLy_inx = lnLy_inx + 1 # generate neighboring samples unit_shifts = torch.cat(selected_list, dim=0) nbs = [] for rInx in range(len(r)): if direction == 'inc': nbs.append(sp.to('cpu') + r[rInx] * unit_shifts) elif direction == 'dec': nbs.append(sp.to('cpu') - r[rInx] * unit_shifts) else: nbs.append(sp.to('cpu') + r[rInx] * unit_shifts) nbs.append(sp.to('cpu') - r[rInx] * unit_shifts) nbs = torch.cat(nbs, dim=0) nbs = nbs.detach() nbs.requires_grad = False return nbs # not updated, deprecated def findNeighbors_feats_lastLy_vgg(model, sp, K, r=[2], direction='both', device='cpu', includeOriginal=True): # only accept single sample if sp.size(0) != 1: return None # storages for K selected distances and linear mapping selected_list = [] # set model to evaluation mode model = model.to(device) model 
= model.eval() # forward through the feature part with torch.no_grad(): feat = model.features(sp.to(device)) feat = feat.view(feat.size(0), -1).contiguous().detach() # place holder for feature, and set to require gradient x = feat.clone().detach() x.requires_grad = True # forward through the classifier part y = model.classifier(x) y = y.view(y.size(0), -1) for i in range(y.size(1)): # clear previous gradients x.grad = None # compute gradients goal = torch.abs(y[0, i]) if i < y.size(1) - 1: goal.backward(retain_graph=True) # retain graph for future computation else: goal.backward(retain_graph=False) # compute distance d = torch.abs(y[0, i]) / torch.norm(x.grad) # keep K shortest distances selected_list.append((d.clone().detach().to('cpu'), x.grad.clone().detach().to('cpu'))) selected_list = sorted(selected_list, key=lambda x:x[0], reverse=False) selected_list = selected_list[0:K] # generate neighboring samples grad_list = [e[1] / torch.norm(e[1]) for e in selected_list] unit_shifts = torch.cat(grad_list, dim=0) if includeOriginal: nbs = [feat.to('cpu')] else: nbs = [] for rInx in range(len(r)): if direction == 'inc': nbs.append(feat.to('cpu') + r[rInx] * unit_shifts) elif direction == 'dec': nbs.append(feat.to('cpu') - r[rInx] * unit_shifts) else: nbs.append(feat.to('cpu') + r[rInx] * unit_shifts) nbs.append(feat.to('cpu') - r[rInx] * unit_shifts) nbs = torch.cat(nbs, dim=0) nbs = nbs.detach() nbs.requires_grad = False return nbs # not updated, deprecated def findNeighbors_feats_approx_vgg(model, sp, K, r=[2], direction='both', device='cpu', includeOriginal=True): # only accept single sample if sp.size(0) != 1: return None # storages for K selected distances and linear mapping selected_list = [] # set model to evaluation mode model = model.to(device) model = model.eval() # forward through the feature part with torch.no_grad(): feat = model.features(sp.to(device)) feat = feat.view(feat.size(0), -1).contiguous().detach() # place holder for feature, and set to require 
gradient x = feat.clone().detach() x.requires_grad = True y = x # forward through classifier layer by layer lnLy_inx = 0 for lyInx, module in model.classifier.named_children(): # forward y = module(y) # at each layer activation if isinstance(module, nn.Linear): KInx = min(lnLy_inx, len(K)-1) if K[KInx] > 0: with torch.no_grad(): # compute weight norm w_norm = torch.norm(module.weight, dim=1, keepdim=True) # compute distance d = torch.abs(y) / w_norm.t() _, sortedInx = torch.sort(d, dim=1, descending=False) # for each selected neuron for i in range(K[KInx]): # clear previous gradients x.grad = None # compute gradients goal = torch.abs(y[0, sortedInx[0, i]]) goal.backward(retain_graph=True) # retain graph for future computation # record gradients selected_list.append(x.grad.clone().detach().to('cpu') / torch.norm(x.grad).detach().to('cpu')) # update number of linear layer sampled lnLy_inx = lnLy_inx + 1 # generate neighboring samples unit_shifts = torch.cat(selected_list, dim=0) if includeOriginal: nbs = [feat.to('cpu')] else: nbs = [] for rInx in range(len(r)): if direction == 'inc': nbs.append(feat.to('cpu') + r[rInx] * unit_shifts) elif direction == 'dec': nbs.append(feat.to('cpu') - r[rInx] * unit_shifts) else: nbs.append(feat.to('cpu') + r[rInx] * unit_shifts) nbs.append(feat.to('cpu') - r[rInx] * unit_shifts) nbs = torch.cat(nbs, dim=0) nbs = nbs.detach() nbs.requires_grad = False return nbs def formSquad_vgg(method, model, sp, K, r=[2], direction='both', device='cpu', includeOriginal=True): if method == 'random': nbs = findNeighbors_random(sp, K, r, direction=direction) if includeOriginal: nbs = torch.cat([sp, nbs], dim=0) elif method == 'plain': nbs = findNeighbors_plain_vgg(model, sp, K, r, direction=direction, device=device) if includeOriginal: nbs = torch.cat([sp, nbs], dim=0) elif method == 'lastLy': nbs = findNeighbors_lastLy_vgg(model, sp, K, r, direction=direction, device=device) if includeOriginal: nbs = torch.cat([sp, nbs], dim=0) elif method == 
'approx': nbs = findNeighbors_approx_vgg(model, sp, K, r, direction=direction, device=device) if includeOriginal: nbs = torch.cat([sp, nbs], dim=0) elif method == 'randPick': nbs = findNeighbors_randPick_vgg(model, sp, K, r, direction=direction, device=device) if includeOriginal: nbs = torch.cat([sp, nbs], dim=0) elif method == 'feats_lastLy': nbs = findNeighbors_feats_lastLy_vgg(model, sp, K, r, direction=direction, device=device, includeOriginal=includeOriginal) elif method == 'feats_approx': nbs = findNeighbors_feats_approx_vgg(model, sp, K, r, direction=direction, device=device, includeOriginal=includeOriginal) else: # if invalid method, use default setting. (actually should raise error here) nbs = findNeighbors_random(sp, K, r, direction=direction) if includeOriginal: nbs = torch.cat([sp, nbs], dim=0) return nbs def findNeighbors_approx_resnet(model, sp, K, r=[2], direction='both', device='cpu'): # only accept single sample if sp.size(0) != 1: return None # storages for K selected distances and linear mapping selected_list = [] # set model to evaluation mode model = model.to(device) model = model.eval() # place holder for input, and set to require gradient x = sp.clone().to(device) x.requires_grad = True # forward through the model y = model(x) y = y.view(y.size(0), -1) if K > 0: with torch.no_grad(): # compute weight norm w_norm = torch.norm(model.fc.weight, dim=1, keepdim=True) # compute distance d = torch.abs(y) / w_norm.t() _, sortedInx = torch.sort(d, dim=1, descending=False) # for each selected neuron for i in range(K): # clear previous gradients x.grad = None # compute gradients goal = torch.abs(y[0, sortedInx[0, i]]) goal.backward(retain_graph=True) # retain graph for future computation # record gradients selected_list.append(x.grad.clone().detach().to('cpu') / torch.norm(x.grad).detach().to('cpu')) # generate neighboring samples unit_shifts = torch.cat(selected_list, dim=0) nbs = [] for rInx in range(len(r)): if direction == 'inc': 
def findNeighbors_approx_resnet_small(model, sp, K, r=[2], direction='both', device='cpu'):
    """Approximate closest-boundary neighbors for the small CIFAR-10 ResNet.

    Identical to findNeighbors_approx_resnet except that the final linear layer
    is read from `model.linear` instead of `model.fc`.
    NOTE: `r=[2]` is a mutable default; callers must not modify it in place.
    """
    # only accept single sample
    if sp.size(0) != 1:
        return None
    # storages for K selected distances and linear mapping
    selected_list = []
    # set model to evaluation mode
    model = model.to(device)
    model = model.eval()
    # place holder for input, and set to require gradient
    x = sp.clone().to(device)
    x.requires_grad = True
    # forward through the model
    y = model(x)
    y = y.view(y.size(0), -1)
    if K > 0:
        with torch.no_grad():
            # compute weight norm of each output neuron's weight row
            w_norm = torch.norm(model.linear.weight, dim=1, keepdim=True)
            # first-order estimate of each logit's distance to zero
            d = torch.abs(y) / w_norm.t()
            _, sortedInx = torch.sort(d, dim=1, descending=False)
        # for each selected neuron (the K with smallest estimated distance)
        for i in range(K):
            # clear previous gradients
            x.grad = None
            # compute gradients
            goal = torch.abs(y[0, sortedInx[0, i]])
            goal.backward(retain_graph=True)  # retain graph for future computation
            # record unit-norm gradient direction
            selected_list.append(x.grad.clone().detach().to('cpu') / torch.norm(x.grad).detach().to('cpu'))
    # generate neighboring samples at every radius in r
    unit_shifts = torch.cat(selected_list, dim=0)
    nbs = []
    for rInx in range(len(r)):
        if direction == 'inc':
            nbs.append(sp.to('cpu') + r[rInx] * unit_shifts)
        elif direction == 'dec':
            nbs.append(sp.to('cpu') - r[rInx] * unit_shifts)
        else:
            nbs.append(sp.to('cpu') + r[rInx] * unit_shifts)
            nbs.append(sp.to('cpu') - r[rInx] * unit_shifts)
    nbs = torch.cat(nbs, dim=0)
    nbs = nbs.detach()
    nbs.requires_grad = False
    return nbs


def formSquad_resnet(method, model, sp, K, r=[2], direction='both', device='cpu', includeOriginal=True):
    """Builds the 'squad' of neighbor samples around the single image `sp` for a
    ResNet model, using the requested sampling method; optionally prepends the
    original sample."""
    if method == 'random':
        nbs = findNeighbors_random(sp, K, r, direction=direction)
        if includeOriginal:
            nbs = torch.cat([sp, nbs], dim=0)
    elif method == 'approx':
        nbs = findNeighbors_approx_resnet(model, sp, K, r, direction=direction, device=device)
        if includeOriginal:
            nbs = torch.cat([sp, nbs], dim=0)
    elif method == 'approx_cifar10':
        nbs = findNeighbors_approx_resnet_small(model, sp, K, r, direction=direction, device=device)
        if includeOriginal:
            nbs = torch.cat([sp, nbs], dim=0)
    else:
        # if invalid method, use default setting. (actually should raise error here)
        nbs = findNeighbors_random(sp, K, r, direction=direction)
        if includeOriginal:
            nbs = torch.cat([sp, nbs], dim=0)
    return nbs
class NullAttack(robustml.attack.Attack):
    """No-op attack: returns the input unchanged (clean-accuracy baseline)."""

    def run(self, x, y, target):
        return x


class FoolboxAttackWrapper(robustml.attack.Attack):
    """Adapts a foolbox attack object to the robustml Attack interface."""

    def __init__(self, attack):
        self._attacker = attack

    def run(self, x, y, target):
        # model requires image in (C, H, W), but robustml provides (H, W, C)
        # transpose x to accommodate pytorch's axis arrangement convention
        x = np.transpose(x, (2, 0, 1))
        if target is not None:
            # targeted attack: build an Adversarial object with a TargetClass criterion
            adv_criterion = crt.TargetClass(target)
            adv_obj = adversarial.Adversarial(self._attacker._default_model, adv_criterion, x, y,
                                              distance=self._attacker._default_distance)
            adv_x = self._attacker(adv_obj)
        else:
            adv_x = self._attacker(x, y)
        # transpose back to data provider's convention
        return np.transpose(adv_x, (1, 2, 0))


def fgsmAttack(victim_model):
    # victim_model should be model wrapped with foolbox model
    attacker = attacks.GradientSignAttack(victim_model, crt.Misclassification())
    return FoolboxAttackWrapper(attacker)


def pgdAttack(victim_model):
    # victim_model should be model wrapped with foolbox model
    attacker = attacks.RandomStartProjectedGradientDescentAttack(victim_model, crt.Misclassification(),
                                                                 distance=distances.Linfinity)
    return FoolboxAttackWrapper(attacker)


def dfAttack(victim_model):
    # victim_model should be model wrapped with foolbox model
    attacker = attacks.DeepFoolAttack(victim_model, crt.Misclassification())
    return FoolboxAttackWrapper(attacker)


def cwAttack(victim_model):
    # victim_model should be model wrapped with foolbox model
    attacker = attacks.CarliniWagnerL2Attack(victim_model, crt.Misclassification())
    return FoolboxAttackWrapper(attacker)
crt.Misclassification()) return FoolboxAttackWrapper(attacker) ================================================ FILE: post_avg/postAveragedModels.py ================================================ # -*- coding: utf-8 -*- import robustml import numpy as np from collections import OrderedDict import post_avg.PADefense as padef import post_avg.resnetSmall as rnsmall import torch import torchvision.models as mdl import torchvision.transforms as transforms class PostAveragedResNet152(robustml.model.Model): def __init__(self, K, R, eps, device='cuda'): self._model = mdl.resnet152(pretrained=True).to(device) self._dataset = robustml.dataset.ImageNet((224, 224, 3)) self._threat_model = robustml.threat_model.Linf(epsilon=eps) self._K = K self._r = [R/3, 2*R/3, R] self._sample_method = 'random' self._vote_method = 'avg_softmax' self._device = device @property def model(self): return self._model @property def dataset(self): return self._dataset @property def threat_model(self): return self._threat_model def classify(self, x): x = x.unsqueeze(0) # gather neighbor samples x_squad = padef.formSquad_resnet(self._sample_method, self._model, x, self._K, self._r, device=self._device) # forward with a batch of neighbors logits, _ = padef.integratedForward(self._model, x_squad, batchSize=100, nClasses=1000, device=self._device, voteMethod=self._vote_method) return torch.as_tensor(logits) def __call__(self, x): logits_list = [] for img in x: logits = self.classify(img) logits_list.append(logits) return torch.cat(logits_list, dim=0) def _preprocess(self, image): # normalization used by pre-trained model normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) return normalize(image) def to(self, device): self._model = self._model.to(device) self._device = device def eval(self): self._model = self._model.eval() def pa_resnet152_config1(): return PostAveragedResNet152(K=15, R=30, eps=8/255) class PostAveragedResNet110(robustml.model.Model): def 
__init__(self, K, R, eps, device='cuda'): # load model state dict checkpoint = torch.load('post_avg/trainedModel/resnet110.th') paramDict = OrderedDict() for k, v in checkpoint['state_dict'].items(): # remove 'module.' prefix introduced by DataParallel, if any if k.startswith('module.'): paramDict[k[7:]] = v self._model = rnsmall.resnet110() self._model.load_state_dict(paramDict) self._model = self._model.to(device) self._dataset = robustml.dataset.CIFAR10() self._threat_model = robustml.threat_model.Linf(epsilon=eps) self._K = K self._r = [R/3, 2*R/3, R] self._sample_method = 'random' self._vote_method = 'avg_softmax' self._device = device @property def model(self): return self._model @property def dataset(self): return self._dataset @property def threat_model(self): return self._threat_model def classify(self, x): x = x.unsqueeze(0) # gather neighbor samples x_squad = padef.formSquad_resnet(self._sample_method, self._model, x, self._K, self._r, device=self._device) # forward with a batch of neighbors logits, _ = padef.integratedForward(self._model, x_squad, batchSize=1000, nClasses=10, device=self._device, voteMethod=self._vote_method) return torch.as_tensor(logits) def __call__(self, x): logits_list = [] for img in x: logits = self.classify(img) logits_list.append(logits) return torch.cat(logits_list, dim=0) def _preprocess(self, image): # normalization used by pre-trained model normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) return normalize(image) def to(self, device): self._model = self._model.to(device) self._device = device def eval(self): self._model = self._model.eval() def pa_resnet110_config1(): return PostAveragedResNet110(K=15, R=6, eps=8/255) ================================================ FILE: post_avg/resnetSmall.py ================================================ # -*- coding: utf-8 -*- ''' Properly implemented ResNet-s for CIFAR10 as described in paper [1]. 
The implementation and structure of this file is hugely influenced by [2] which is implemented for ImageNet and doesn't have option A for identity. Moreover, most of the implementations on the web is copy-paste from torchvision's resnet and has wrong number of params. Proper ResNet-s for CIFAR10 (for fair comparision and etc.) has following number of layers and parameters: name | layers | params ResNet20 | 20 | 0.27M ResNet32 | 32 | 0.46M ResNet44 | 44 | 0.66M ResNet56 | 56 | 0.85M ResNet110 | 110 | 1.7M ResNet1202| 1202 | 19.4m which this implementation indeed has. Reference: [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun Deep Residual Learning for Image Recognition. arXiv:1512.03385 [2] https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py If you use this implementation in you work, please don't forget to mention the author, Yerlan Idelbayev. ''' import torch import torch.nn as nn import torch.nn.functional as F import torch.nn.init as init from torch.autograd import Variable __all__ = ['ResNet', 'resnet20', 'resnet32', 'resnet44', 'resnet56', 'resnet110', 'resnet1202'] def _weights_init(m): classname = m.__class__.__name__ # print(classname) if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d): init.kaiming_normal(m.weight) class LambdaLayer(nn.Module): def __init__(self, lambd): super(LambdaLayer, self).__init__() self.lambd = lambd def forward(self, x): return self.lambd(x) class BasicBlock(nn.Module): expansion = 1 def __init__(self, in_planes, planes, stride=1, option='A'): super(BasicBlock, self).__init__() self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) self.bn1 = nn.BatchNorm2d(planes) self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) self.bn2 = nn.BatchNorm2d(planes) self.shortcut = nn.Sequential() if stride != 1 or in_planes != planes: if option == 'A': """ For CIFAR10 ResNet paper uses option A. 
""" self.shortcut = LambdaLayer(lambda x: F.pad(x[:, :, ::2, ::2], (0, 0, 0, 0, planes//4, planes//4), "constant", 0)) elif option == 'B': self.shortcut = nn.Sequential( nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False), nn.BatchNorm2d(self.expansion * planes) ) def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = self.bn2(self.conv2(out)) out += self.shortcut(x) out = F.relu(out) return out class ResNet(nn.Module): def __init__(self, block, num_blocks, num_classes=10): super(ResNet, self).__init__() self.in_planes = 16 self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False) self.bn1 = nn.BatchNorm2d(16) self.layer1 = self._make_layer(block, 16, num_blocks[0], stride=1) self.layer2 = self._make_layer(block, 32, num_blocks[1], stride=2) self.layer3 = self._make_layer(block, 64, num_blocks[2], stride=2) self.linear = nn.Linear(64, num_classes) self.apply(_weights_init) def _make_layer(self, block, planes, num_blocks, stride): strides = [stride] + [1]*(num_blocks-1) layers = [] for stride in strides: layers.append(block(self.in_planes, planes, stride)) self.in_planes = planes * block.expansion return nn.Sequential(*layers) def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = self.layer1(out) out = self.layer2(out) out = self.layer3(out) out = F.avg_pool2d(out, out.size()[3]) out = out.view(out.size(0), -1) out = self.linear(out) return out def resnet20(): return ResNet(BasicBlock, [3, 3, 3]) def resnet32(): return ResNet(BasicBlock, [5, 5, 5]) def resnet44(): return ResNet(BasicBlock, [7, 7, 7]) def resnet56(): return ResNet(BasicBlock, [9, 9, 9]) def resnet110(): return ResNet(BasicBlock, [18, 18, 18]) def resnet1202(): return ResNet(BasicBlock, [200, 200, 200]) def test(net): import numpy as np total_params = 0 for x in filter(lambda p: p.requires_grad, net.parameters()): total_params += np.prod(x.data.numpy().shape) print("Total number of params", total_params) print("Total 
layers", len(list(filter(lambda p: p.requires_grad and len(p.data.size())>1, net.parameters())))) if __name__ == "__main__": for net_name in __all__: if net_name.startswith('resnet'): print(net_name) test(globals()[net_name]()) print() ================================================ FILE: post_avg/robustml_test_cifar10.py ================================================ # -*- coding: utf-8 -*- import torch import argparse import robustml import numpy as np from foolbox.models import PyTorchModel from robustml_portal import attacks as atk from robustml_portal import postAveragedModels as pamdl # argument parsing parser = argparse.ArgumentParser(description="robustml evaluation on CIFAR-10") parser.add_argument("datasetPath", help="path to the 'test_batch' file") parser.add_argument("--start", type=int, default=0, help="inclusive starting index for data. default: 0") parser.add_argument("--end", type=int, help="exclusive ending index for data. default: dataset size") parser.add_argument("--attack", choices=["pgd", "fgsm", "df", "cw", "none"], default="pgd", help="attack method to be used. default: pgd") parser.add_argument("--device", help="compuation device to be used. 
'cpu' or 'cuda:'")
args = parser.parse_args()

# resolve the computation device: honor --device when given, otherwise
# prefer CUDA if available and fall back to CPU
if args.device is None:
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
else:
    device = torch.device(args.device)

# setup test model
model = pamdl.pa_resnet110_config1()
model.to(device)
model.eval()

# setup attacker
# the foolbox wrapper takes the raw (un-averaged) inner model; pixel range is
# (0, 1) and preprocessing applies the per-channel normalization used above
nClasses = 10
victim_model = PyTorchModel(model.model, (0,1), nClasses, device=device,
                            preprocessing=(np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)),
                                           np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))))
# dispatch on the requested attack; "none" evaluates clean accuracy only
if args.attack == "pgd":
    attack = atk.pgdAttack(victim_model)
elif args.attack == "fgsm":
    attack = atk.fgsmAttack(victim_model)
elif args.attack == "df":
    attack = atk.dfAttack(victim_model)
elif args.attack == "cw":
    attack = atk.cwAttack(victim_model)
else:
    attack = atk.NullAttack()

# setup data provider
prov = robustml.provider.CIFAR10(args.datasetPath)

# evaluate performance
if args.end is None:
    args.end = len(prov)
atk_success_rate = robustml.evaluate.evaluate(model, attack, prov, start=args.start, end=args.end)
print('Overall attack success rate: %.4f' % atk_success_rate)
================================================
FILE: post_avg/robustml_test_imagenet.py
================================================
# -*- coding: utf-8 -*-

import torch
import argparse
import robustml
import numpy as np
from foolbox.models import PyTorchModel

from robustml_portal import attacks as atk
from robustml_portal import postAveragedModels as pamdl

# argument parsing
parser = argparse.ArgumentParser(description="robustml evaluation on ImageNet")
parser.add_argument("datasetPath", help="directory containing 'val.txt' and 'val/' folder")
parser.add_argument("--start", type=int, default=0, help="inclusive starting index for data. default: 0")
parser.add_argument("--end", type=int, help="exclusive ending index for data. default: dataset size")
parser.add_argument("--attack", choices=["pgd", "fgsm", "df", "cw", "none"], default="pgd", help="attack method to be used.
default: pgd") parser.add_argument("--device", help="compuation device to be used. 'cpu' or 'cuda:'") args = parser.parse_args() if args.device is None: if torch.cuda.is_available(): device = torch.device("cuda") else: device = torch.device("cpu") else: device = torch.device(args.device) # setup test model model = pamdl.pa_resnet152_config1() model.to(device) model.eval() # setup attacker nClasses = 1000 victim_model = PyTorchModel(model.model, (0,1), nClasses, device=device, preprocessing=(np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)), np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)))) if args.attack == "pgd": attack = atk.pgdAttack(victim_model) elif args.attack == "fgsm": attack = atk.fgsmAttack(victim_model) elif args.attack == "df": attack = atk.dfAttack(victim_model) elif args.attack == "cw": attack = atk.cwAttack(victim_model) else: attack = atk.NullAttack() # setup data provider prov = robustml.provider.ImageNet(args.datasetPath, (224, 224, 3)) # evaluate performance if args.end is None: args.end = len(prov) atk_success_rate = robustml.evaluate.evaluate(model, attack, prov, start=args.start, end=args.end) print('Overall attack success rate: %.4f' % atk_success_rate) ================================================ FILE: post_avg/visualHelper.py ================================================ # -*- coding: utf-8 -*- import torch import torch.nn as nn import numpy as np import matplotlib; matplotlib.use('agg') import matplotlib.pyplot as plt def plotPredStats(feats, lb, K=10, image=None, noiseImage=None, savePath=None): # score by averaging scores = torch.mean(feats, dim=0) # sort and select the top K scores hScores, hCates = torch.sort(scores, dim=0, descending=True) hScores = hScores[:K].numpy() hCates = hCates[:K].numpy() # get individual preditions _, preds = torch.max(feats, dim=1) # count votes preds_count = {lb: 0} for i in range(feats.size(0)): if preds[i].item() in preds_count: preds_count[preds[i].item()] = preds_count[preds[i].item()] + 1 
else: preds_count[preds[i].item()] = 1 candidates = sorted(preds_count.keys()) votes = [preds_count[x] for x in candidates] # generate figure fig = plt.figure() if image is None and noiseImage is None: ax1, ax2, ax3 = fig.subplots(3, 1) else: axes = fig.subplots(2, 2) ax1 = axes[0, 0] ax2 = axes[1, 0] ax3 = axes[0, 1] ax4 = axes[1, 1] # chart 1, votes distribution inx1 = list(range(len(candidates))) clr1 = [] for i in inx1: if candidates[i] == lb: clr1.append('Red') else: clr1.append('SkyBlue') rects1 = ax1.bar(inx1, votes, color=clr1) for rect in rects1: h = rect.get_height() ax1.text(rect.get_x() + 0.5 * rect.get_width(), 1.01 * h, '{}'.format(h), ha='center', va='bottom') ax1.set_ylim(top=1.1 * ax1.get_ylim()[1]) ax1.set_xticks(inx1) ax1.set_xticklabels([str(x) for x in candidates], rotation=30) ax1.set_ylabel('votes') ax1.set_title('Votes Distribution') # chart 2, top prediction scores inx2 = list(range(len(hCates))) clr2 = [] for i in inx2: if hCates[i] == lb: clr2.append('Red') else: clr2.append('SkyBlue') rects2 = ax2.bar(inx2, hScores, color=clr2) for rect in rects2: h = rect.get_height() ax2.text(rect.get_x() + 0.5 * rect.get_width(), 1.01 * h, '{:.2f}'.format(h), ha='center', va='bottom') ax2.set_ylim(top=1.1 * ax2.get_ylim()[1]) ax2.set_xticks(inx2) ax2.set_xticklabels([str(x) for x in hCates], rotation=30) ax2.set_ylabel('score') ax2.set_xlabel('Top Predictions') # axis 3, the noise image if noiseImage is not None: ax3.imshow(noiseImage) ax3.set_xlabel('Noise Image') ax3.set_axis_off() else: # if noise image is not given, show prediction event plot clr3 = [] for i in range(preds.size(0)): if preds[i] == lb: clr3.append('Red') else: clr3.append('Green') ax3.eventplot(preds.unsqueeze(1).numpy(), orientation='vertical', colors=clr3) ax3.set_yticks(candidates) ax3.set_yticklabels([str(x) for x in candidates]) ax3.set_xlabel('sample index') ax3.set_ylabel('class') # axis 4, the input image if image is not None: ax4.imshow(image) ax4.set_title('Input Image') 
ax4.set_axis_off() # save figure and close if savePath is not None: fig.savefig(savePath) plt.close(fig) def plotPerturbationDistribution(perturbations, savePath=None): # generate figure fig = plt.figure() ax1, ax2, ax3 = fig.subplots(3, 1) # plot scatter chart perts = np.asarray(perturbations) ax1.scatter(perts[:, 0], perts[:, 1], c='SkyBlue') ax1.autoscale(axis='x') ax1.set_ylim((-1, 2)) ax1.set_yticks([0, 1]) ax1.set_yticklabels(['missed', 'defensed']) ax1.set_xlabel('Perturbation distance') ax1.set_title('Perturbations Distribution') # plot bin chart for defensed adversarial samples x = [e[0] for e in perturbations if e[1] == 1] ax2.hist(x, bins=20, color='SkyBlue') ax2.set_xlabel('Perturbation distance') ax2.set_ylabel('Denfensed') # plot bin chart for missed adversarial samples x = [e[0] for e in perturbations if e[1] == 0] ax3.hist(x, bins=20, color='Red') ax3.set_xlabel('Perturbation distance') ax3.set_ylabel('Missed') # save figure and close if savePath is not None: fig.savefig(savePath) plt.close(fig) ================================================ FILE: utils.py ================================================ import numpy as np import os class Logger: def __init__(self, path): self.path = path if path != '': folder = '/'.join(path.split('/')[:-1]) if not os.path.exists(folder): os.makedirs(folder) def print(self, message): print(message) if self.path != '': with open(self.path, 'a') as f: f.write(message + '\n') f.flush() def dense_to_onehot(y_test, n_cls): y_test_onehot = np.zeros([len(y_test), n_cls], dtype=bool) y_test_onehot[np.arange(len(y_test)), y_test] = True return y_test_onehot def random_classes_except_current(y_test, n_cls): y_test_new = np.zeros_like(y_test) for i_img in range(y_test.shape[0]): lst_classes = list(range(n_cls)) lst_classes.remove(y_test[i_img]) y_test_new[i_img] = np.random.choice(lst_classes) return y_test_new def softmax(x): e_x = np.exp(x - np.max(x, axis=1, keepdims=True)) return e_x / e_x.sum(axis=1, keepdims=True)