Repository: wanglouis49/pytorch-adversarial_box Branch: master Commit: bddb5a899a76 Files: 10 Total size: 21.1 KB Directory structure: gitextract_lz9oan5g/ ├── README.md ├── adversarialbox/ │ ├── __init__.py │ ├── attacks.py │ ├── train.py │ └── utils.py ├── mnist_adv_train.py ├── mnist_attack.py ├── mnist_blackbox.py ├── models/ │ └── adv_trained_lenet5.pkl └── models.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: README.md ================================================ # Adversarial Box - Pytorch Adversarial Attack and Training Luyu Wang and Gavin Ding, Borealis AI ## Motivation? [CleverHans](https://github.com/tensorflow/cleverhans) comes in handy for Tensorflow. However, PyTorch does not have the luck at this moment. [Foolbox](https://github.com/bethgelab/foolbox) supports multiple deep learning frameworks, but it lacks many major implementations (e.g., black-box attack, Carlini-Wagner attack, adversarial training). We feel there is a need to write an easy-to-use and versatile library to help our fellow researchers and engineers. **We have a much more updated version called [AdverTorch](https://github.com/BorealisAI/advertorch). You can find most of the popular attacks there. This repo will not be maintained anymore.** ## Usage from adversarialbox.attacks import FGSMAttack adversary = FGSMAttack(model, epsilon=0.1) X_adv = adversary.perturb(X_i, y_i) ## Examples 1. MNIST with FGSM ([code](https://github.com/wanglouis49/pytorch-adversarial_box/blob/master/mnist_attack.py)) 2. Adversarial Training on MNIST ([code](https://github.com/wanglouis49/pytorch-adversarial_box/blob/master/mnist_adv_train.py)) 3. MNIST using a black-box attack ([code](https://github.com/wanglouis49/pytorch-adversarial_box/blob/master/mnist_blackbox.py)) ## List of supported attacks 1. FGSM 2. PGD 3. Black-box ================================================ FILE: adversarialbox/__init__.py ================================================ ================================================ FILE: adversarialbox/attacks.py ================================================ import copy import numpy as np from collections import Iterable from scipy.stats import truncnorm import torch import torch.nn as nn from adversarialbox.utils import to_var # --- White-box attacks --- class FGSMAttack(object): def __init__(self, model=None, epsilon=None): """ One step fast gradient sign method """ self.model = model self.epsilon = epsilon self.loss_fn = nn.CrossEntropyLoss() def perturb(self, X_nat, y, epsilons=None): """ Given examples (X_nat, y), returns their adversarial counterparts with an attack length of epsilon. """ # Providing epsilons in batch if epsilons is not None: self.epsilon = epsilons X = np.copy(X_nat) X_var = to_var(torch.from_numpy(X), requires_grad=True) y_var = to_var(torch.LongTensor(y)) scores = self.model(X_var) loss = self.loss_fn(scores, y_var) loss.backward() grad_sign = X_var.grad.data.cpu().sign().numpy() X += self.epsilon * grad_sign X = np.clip(X, 0, 1) return X class LinfPGDAttack(object): def __init__(self, model=None, epsilon=0.3, k=40, a=0.01, random_start=True): """ Attack parameter initialization. The attack performs k steps of size a, while always staying within epsilon from the initial point. https://github.com/MadryLab/mnist_challenge/blob/master/pgd_attack.py """ self.model = model self.epsilon = epsilon self.k = k self.a = a self.rand = random_start self.loss_fn = nn.CrossEntropyLoss() def perturb(self, X_nat, y): """ Given examples (X_nat, y), returns adversarial examples within epsilon of X_nat in l_infinity norm. """ if self.rand: X = X_nat + np.random.uniform(-self.epsilon, self.epsilon, X_nat.shape).astype('float32') else: X = np.copy(X_nat) for i in range(self.k): X_var = to_var(torch.from_numpy(X), requires_grad=True) y_var = to_var(torch.LongTensor(y)) scores = self.model(X_var) loss = self.loss_fn(scores, y_var) loss.backward() grad = X_var.grad.data.cpu().numpy() X += self.a * np.sign(grad) X = np.clip(X, X_nat - self.epsilon, X_nat + self.epsilon) X = np.clip(X, 0, 1) # ensure valid pixel range return X # --- Black-box attacks --- def jacobian(model, x, nb_classes=10): """ This function will return a list of PyTorch gradients """ list_derivatives = [] x_var = to_var(torch.from_numpy(x), requires_grad=True) # derivatives for each class for class_ind in range(nb_classes): score = model(x_var)[:, class_ind] score.backward() list_derivatives.append(x_var.grad.data.cpu().numpy()) x_var.grad.data.zero_() return list_derivatives def jacobian_augmentation(model, X_sub_prev, Y_sub, lmbda=0.1): """ Create new numpy array for adversary training data with twice as many components on the first dimension. """ X_sub = np.vstack([X_sub_prev, X_sub_prev]) # For each input in the previous' substitute training iteration for ind, x in enumerate(X_sub_prev): grads = jacobian(model, x) # Select gradient corresponding to the label predicted by the oracle grad = grads[Y_sub[ind]] # Compute sign matrix grad_val = np.sign(grad) # Create new synthetic point in adversary substitute training set X_sub[len(X_sub_prev)+ind] = X_sub[ind] + lmbda * grad_val #??? # Return augmented training data (needs to be labeled afterwards) return X_sub ================================================ FILE: adversarialbox/train.py ================================================ """ Adversarial training """ import copy import numpy as np from collections import Iterable from scipy.stats import truncnorm import torch import torch.nn as nn from adversarialbox.attacks import FGSMAttack, LinfPGDAttack from adversarialbox.utils import truncated_normal def adv_train(X, y, model, criterion, adversary): """ Adversarial training. Returns pertubed mini batch. """ # If adversarial training, need a snapshot of # the model at each batch to compute grad, so # as not to mess up with the optimization step model_cp = copy.deepcopy(model) for p in model_cp.parameters(): p.requires_grad = False model_cp.eval() adversary.model = model_cp X_adv = adversary.perturb(X.numpy(), y) return torch.from_numpy(X_adv) def FGSM_train_rnd(X, y, model, criterion, fgsm_adversary, epsilon_max=0.3): """ FGSM with epsilon sampled from a truncated normal distribution. Returns pertubed mini batch. Kurakin et al, ADVERSARIAL MACHINE LEARNING AT SCALE, 2016 """ # If adversarial training, need a snapshot of # the model at each batch to compute grad, so # as not to mess up with the optimization step model_cp = copy.deepcopy(model) for p in model_cp.parameters(): p.requires_grad = False model_cp.eval() fgsm_adversary.model = model_cp # truncated Gaussian m = X.size()[0] # mini-batch size mean, std = 0., epsilon_max/2 epsilons = np.abs(truncated_normal(mean, std, m))[:, np.newaxis, \ np.newaxis, np.newaxis] X_adv = fgsm_adversary.perturb(X.numpy(), y, epsilons) return torch.from_numpy(X_adv) ================================================ FILE: adversarialbox/utils.py ================================================ import numpy as np import torch from torch.autograd import Variable import torch.nn as nn from torch.utils.data import sampler def truncated_normal(mean=0.0, stddev=1.0, m=1): ''' The generated values follow a normal distribution with specified mean and standard deviation, except that values whose magnitude is more than 2 standard deviations from the mean are dropped and re-picked. Returns a vector of length m ''' samples = [] for i in range(m): while True: sample = np.random.normal(mean, stddev) if np.abs(sample) <= 2 * stddev: break samples.append(sample) assert len(samples) == m, "something wrong" if m == 1: return samples[0] else: return np.array(samples) # --- PyTorch helpers --- def to_var(x, requires_grad=False, volatile=False): """ Varialbe type that automatically choose cpu or cuda """ if torch.cuda.is_available(): x = x.cuda() return Variable(x, requires_grad=requires_grad, volatile=volatile) def pred_batch(x, model): """ batch prediction helper """ y_pred = np.argmax(model(to_var(x)).data.cpu().numpy(), axis=1) return torch.from_numpy(y_pred) def test(model, loader, blackbox=False, hold_out_size=None): """ Check model accuracy on model based on loader (train or test) """ model.eval() num_correct, num_samples = 0, len(loader.dataset) if blackbox: num_samples -= hold_out_size for x, y in loader: x_var = to_var(x, volatile=True) scores = model(x_var) _, preds = scores.data.cpu().max(1) num_correct += (preds == y).sum() acc = float(num_correct)/float(num_samples) print('Got %d/%d correct (%.2f%%) on the clean data' % (num_correct, num_samples, 100 * acc)) return acc def attack_over_test_data(model, adversary, param, loader_test, oracle=None): """ Given target model computes accuracy on perturbed data """ total_correct = 0 total_samples = len(loader_test.dataset) # For black-box if oracle is not None: total_samples -= param['hold_out_size'] for t, (X, y) in enumerate(loader_test): y_pred = pred_batch(X, model) X_adv = adversary.perturb(X.numpy(), y_pred) X_adv = torch.from_numpy(X_adv) if oracle is not None: y_pred_adv = pred_batch(X_adv, oracle) else: y_pred_adv = pred_batch(X_adv, model) total_correct += (y_pred_adv.numpy() == y.numpy()).sum() acc = total_correct/total_samples print('Got %d/%d correct (%.2f%%) on the perturbed data' % (total_correct, total_samples, 100 * acc)) return acc def batch_indices(batch_nb, data_length, batch_size): """ This helper function computes a batch start and end index :param batch_nb: the batch number :param data_length: the total length of the data being parsed by batches :param batch_size: the number of inputs in each batch :return: pair of (start, end) indices """ # Batch start and end index start = int(batch_nb * batch_size) end = int((batch_nb + 1) * batch_size) # When there are not enough inputs left, we reuse some to complete the # batch if end > data_length: shift = end - data_length start -= shift end -= shift return start, end ================================================ FILE: mnist_adv_train.py ================================================ """ Adversarially train LeNet-5 """ import torch import torch.nn as nn import torchvision.datasets as datasets import torchvision.transforms as transforms from torch.autograd import Variable import torch.nn.functional as F from adversarialbox.attacks import FGSMAttack, LinfPGDAttack from adversarialbox.train import adv_train, FGSM_train_rnd from adversarialbox.utils import to_var, pred_batch, test from models import LeNet5 # Hyper-parameters param = { 'batch_size': 128, 'test_batch_size': 100, 'num_epochs': 15, 'delay': 10, 'learning_rate': 1e-3, 'weight_decay': 5e-4, } # Data loaders train_dataset = datasets.MNIST(root='../data/',train=True, download=True, transform=transforms.ToTensor()) loader_train = torch.utils.data.DataLoader(train_dataset, batch_size=param['batch_size'], shuffle=True) test_dataset = datasets.MNIST(root='../data/', train=False, download=True, transform=transforms.ToTensor()) loader_test = torch.utils.data.DataLoader(test_dataset, batch_size=param['test_batch_size'], shuffle=True) # Setup the model net = LeNet5() if torch.cuda.is_available(): print('CUDA ensabled.') net.cuda() net.train() # Adversarial training setup #adversary = FGSMAttack(epsilon=0.3) adversary = LinfPGDAttack() # Train the model criterion = nn.CrossEntropyLoss() optimizer = torch.optim.RMSprop(net.parameters(), lr=param['learning_rate'], weight_decay=param['weight_decay']) for epoch in range(param['num_epochs']): print('Starting epoch %d / %d' % (epoch + 1, param['num_epochs'])) for t, (x, y) in enumerate(loader_train): x_var, y_var = to_var(x), to_var(y.long()) loss = criterion(net(x_var), y_var) # adversarial training if epoch+1 > param['delay']: # use predicted label to prevent label leaking y_pred = pred_batch(x, net) x_adv = adv_train(x, y_pred, net, criterion, adversary) x_adv_var = to_var(x_adv) loss_adv = criterion(net(x_adv_var), y_var) loss = (loss + loss_adv) / 2 if (t + 1) % 100 == 0: print('t = %d, loss = %.8f' % (t + 1, loss.data[0])) optimizer.zero_grad() loss.backward() optimizer.step() test(net, loader_test) torch.save(net.state_dict(), 'models/adv_trained_lenet5.pkl') ================================================ FILE: mnist_attack.py ================================================ """ Adversarial attacks on LeNet5 """ from time import time import torch import torch.nn as nn import torchvision.datasets as datasets import torchvision.transforms as transforms from torch.autograd import Variable import torch.nn.functional as F from adversarialbox.attacks import FGSMAttack, LinfPGDAttack from adversarialbox.utils import to_var, pred_batch, test, \ attack_over_test_data from models import LeNet5 # Hyper-parameters param = { 'test_batch_size': 100, 'epsilon': 0.3, } # Data loaders test_dataset = datasets.MNIST(root='../data/', train=False, download=True, transform=transforms.ToTensor()) loader_test = torch.utils.data.DataLoader(test_dataset, batch_size=param['test_batch_size'], shuffle=False) # Setup model to be attacked net = LeNet5() net.load_state_dict(torch.load('models/adv_trained_lenet5.pkl')) if torch.cuda.is_available(): print('CUDA ensabled.') net.cuda() for p in net.parameters(): p.requires_grad = False net.eval() test(net, loader_test) # Adversarial attack adversary = FGSMAttack(net, param['epsilon']) # adversary = LinfPGDAttack(net, random_start=False) t0 = time() attack_over_test_data(net, adversary, param, loader_test) print('{}s eclipsed.'.format(time()-t0)) ================================================ FILE: mnist_blackbox.py ================================================ """ PyTorch Implementation of Papernot's Black-Box Attack arXiv:1602.02697 """ import pickle import numpy as np import pandas as pd import torch import torch.nn as nn import torchvision.datasets as datasets import torchvision.transforms as transforms from torch.autograd import Variable import torch.nn.functional as F from torch.utils.data.sampler import SubsetRandomSampler from adversarialbox.attacks import FGSMAttack, LinfPGDAttack, \ jacobian_augmentation from adversarialbox.utils import to_var, pred_batch, test, \ attack_over_test_data, batch_indices from models import LeNet5, SubstituteModel def MNIST_bbox_sub(param, loader_hold_out, loader_test): """ Train a substitute model using Jacobian data augmentation arXiv:1602.02697 """ # Setup the substitute net = SubstituteModel() if torch.cuda.is_available(): print('CUDA ensabled for the substitute.') net.cuda() net.train() # Setup the oracle oracle = LeNet5() if torch.cuda.is_available(): print('CUDA ensabled for the oracle.') oracle.cuda() oracle.load_state_dict(torch.load(param['oracle_name']+'.pkl')) oracle.eval() # Setup training criterion = nn.CrossEntropyLoss() # Careful optimization is crucial to train a well-representative # substitute. In Tensorflow Adam has some problem: # (https://github.com/tensorflow/cleverhans/issues/183) # But it works fine here in PyTorch (you may try other optimization # methods optimizer = torch.optim.Adam(net.parameters(), lr=param['learning_rate']) # Data held out for initial training data_iter = iter(loader_hold_out) X_sub, y_sub = data_iter.next() X_sub, y_sub = X_sub.numpy(), y_sub.numpy() # Train the substitute and augment dataset alternatively for rho in range(param['data_aug']): print("Substitute training epoch #"+str(rho)) print("Training data: "+str(len(X_sub))) rng = np.random.RandomState() # model training for epoch in range(param['nb_epochs']): print('Starting epoch %d / %d' % (epoch + 1, param['nb_epochs'])) # Compute number of batches nb_batches = int(np.ceil(float(len(X_sub)) / param['test_batch_size'])) assert nb_batches * param['test_batch_size'] >= len(X_sub) # Indices to shuffle training set index_shuf = list(range(len(X_sub))) rng.shuffle(index_shuf) for batch in range(nb_batches): # Compute batch start and end indices start, end = batch_indices(batch, len(X_sub), param['test_batch_size']) x = X_sub[index_shuf[start:end]] y = y_sub[index_shuf[start:end]] scores = net(to_var(torch.from_numpy(x))) loss = criterion(scores, to_var(torch.from_numpy(y).long())) optimizer.zero_grad() loss.backward() optimizer.step() print('loss = %.8f' % (loss.data[0])) test(net, loader_test, blackbox=True, hold_out_size=param['hold_out_size']) # If we are not at last substitute training iteration, augment dataset if rho < param['data_aug'] - 1: print("Augmenting substitute training data.") # Perform the Jacobian augmentation X_sub = jacobian_augmentation(net, X_sub, y_sub) print("Labeling substitute training data.") # Label the newly generated synthetic points using the black-box scores = oracle(to_var(torch.from_numpy(X_sub))) # Note here that we take the argmax because the adversary # only has access to the label (not the probabilities) output # by the black-box model y_sub = np.argmax(scores.data.cpu().numpy(), axis=1) torch.save(net.state_dict(), param['oracle_name']+'_sub.pkl') if __name__ == "__main__": # Hyper-parameters param = { 'hold_out_size': 150, 'test_batch_size': 128, 'nb_epochs': 10, 'learning_rate': 0.001, 'data_aug': 6, 'oracle_name': 'models/adv_trained_lenet5', 'epsilon': 0.3, } # Data loaders # We need to hold out 150 data points from the test data # This is a bit tricky in PyTorch # We adopt the way from: # https://github.com/pytorch/pytorch/issues/1106 hold_out_data = datasets.MNIST(root='../data/', train=True, download=True, transform=transforms.ToTensor()) test_dataset = datasets.MNIST(root='../data/', train=False, download=True, transform=transforms.ToTensor()) indices = list(range(test_dataset.test_data.size(0))) split = param['hold_out_size'] rng = np.random.RandomState() rng.shuffle(indices) hold_out_idx, test_idx = indices[:split], indices[split:] hold_out_sampler = SubsetRandomSampler(hold_out_idx) test_sampler = SubsetRandomSampler(test_idx) loader_hold_out = torch.utils.data.DataLoader(hold_out_data, batch_size=param['hold_out_size'], sampler=hold_out_sampler, shuffle=False) loader_test = torch.utils.data.DataLoader(test_dataset, batch_size=param['test_batch_size'], sampler=test_sampler, shuffle=False) # Train the substitute MNIST_bbox_sub(param, loader_hold_out, loader_test) # Setup models net = SubstituteModel() oracle = LeNet5() net.load_state_dict(torch.load(param['oracle_name']+'_sub.pkl')) oracle.load_state_dict(torch.load(param['oracle_name']+'.pkl')) if torch.cuda.is_available(): net.cuda() oracle.cuda() print('CUDA ensabled.') for p in net.parameters(): p.requires_grad = False net.eval() oracle.eval() # Setup adversarial attacks adversary = FGSMAttack(net, param['epsilon']) print('For the substitute model:') test(net, loader_test, blackbox=True, hold_out_size=param['hold_out_size']) # Setup oracle print('For the oracle'+param['oracle_name']) print('agaist blackbox FGSM attacks using gradients from the substitute:') attack_over_test_data(net, adversary, param, loader_test, oracle) ================================================ FILE: models.py ================================================ import torch import torch.nn as nn class LeNet5(nn.Module): def __init__(self): super(LeNet5, self).__init__() self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1, stride=1) self.relu1 = nn.ReLU(inplace=True) self.maxpool1 = nn.MaxPool2d(2) self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1, stride=1) self.relu2 = nn.ReLU(inplace=True) self.maxpool2 = nn.MaxPool2d(2) self.linear1 = nn.Linear(7*7*64, 200) self.relu3 = nn.ReLU(inplace=True) self.linear2 = nn.Linear(200, 10) def forward(self, x): out = self.maxpool1(self.relu1(self.conv1(x))) out = self.maxpool2(self.relu2(self.conv2(out))) out = out.view(out.size(0), -1) out = self.relu3(self.linear1(out)) out = self.linear2(out) return out class SubstituteModel(nn.Module): def __init__(self): super(SubstituteModel, self).__init__() self.linear1 = nn.Linear(28*28, 200) self.relu1 = nn.ReLU(inplace=True) self.linear2 = nn.Linear(200, 200) self.relu2 = nn.ReLU(inplace=True) self.linear3 = nn.Linear(200, 10) def forward(self, x): out = x.view(x.size(0), -1) out = self.relu1(self.linear1(out)) out = self.relu2(self.linear2(out)) out = self.linear3(out) return out