Repository: wanglouis49/pytorch-adversarial_box
Branch: master
Commit: bddb5a899a76
Files: 10
Total size: 21.1 KB

Directory structure:
gitextract_lz9oan5g/

├── README.md
├── adversarialbox/
│   ├── __init__.py
│   ├── attacks.py
│   ├── train.py
│   └── utils.py
├── mnist_adv_train.py
├── mnist_attack.py
├── mnist_blackbox.py
├── models/
│   └── adv_trained_lenet5.pkl
└── models.py

================================================
FILE CONTENTS
================================================

================================================
FILE: README.md
================================================
# Adversarial Box - Pytorch Adversarial Attack and Training

Luyu Wang and Gavin Ding, Borealis AI

## Motivation?
[CleverHans](https://github.com/tensorflow/cleverhans) comes in handy for Tensorflow. However, PyTorch does not have the luck at this moment. [Foolbox](https://github.com/bethgelab/foolbox) supports multiple deep learning frameworks, but it lacks many major implementations (e.g., black-box attack, Carlini-Wagner attack, adversarial training). We feel there is a need to write an easy-to-use and versatile library to help our fellow researchers and engineers.

**We have a much more updated version called [AdverTorch](https://github.com/BorealisAI/advertorch). You can find most of the popular attacks there. This repo will not be maintained anymore.**

## Usage
    from adversarialbox.attacks import FGSMAttack
    adversary = FGSMAttack(model, epsilon=0.1)
    X_adv = adversary.perturb(X_i, y_i)

## Examples
1. MNIST with FGSM ([code](https://github.com/wanglouis49/pytorch-adversarial_box/blob/master/mnist_attack.py))
2. Adversarial Training on MNIST ([code](https://github.com/wanglouis49/pytorch-adversarial_box/blob/master/mnist_adv_train.py))
3. MNIST using a black-box attack ([code](https://github.com/wanglouis49/pytorch-adversarial_box/blob/master/mnist_blackbox.py))

## List of supported attacks
1. FGSM
2. PGD
3. Black-box


================================================
FILE: adversarialbox/__init__.py
================================================


================================================
FILE: adversarialbox/attacks.py
================================================
import copy
import numpy as np
from collections import Iterable
from scipy.stats import truncnorm

import torch
import torch.nn as nn

from adversarialbox.utils import to_var

# --- White-box attacks ---

class FGSMAttack(object):
    def __init__(self, model=None, epsilon=None):
        """
        One step fast gradient sign method
        """
        self.model = model
        self.epsilon = epsilon
        self.loss_fn = nn.CrossEntropyLoss()

    def perturb(self, X_nat, y, epsilons=None):
        """
        Given examples (X_nat, y), returns their adversarial
        counterparts with an attack length of epsilon.
        """
        # Providing epsilons in batch
        if epsilons is not None:
            self.epsilon = epsilons

        X = np.copy(X_nat)

        X_var = to_var(torch.from_numpy(X), requires_grad=True)
        y_var = to_var(torch.LongTensor(y))

        scores = self.model(X_var)
        loss = self.loss_fn(scores, y_var)
        loss.backward()
        grad_sign = X_var.grad.data.cpu().sign().numpy()

        X += self.epsilon * grad_sign
        X = np.clip(X, 0, 1)

        return X


class LinfPGDAttack(object):
    def __init__(self, model=None, epsilon=0.3, k=40, a=0.01, 
        random_start=True):
        """
        Attack parameter initialization. The attack performs k steps of
        size a, while always staying within epsilon from the initial
        point.
        https://github.com/MadryLab/mnist_challenge/blob/master/pgd_attack.py
        """
        self.model = model
        self.epsilon = epsilon
        self.k = k
        self.a = a
        self.rand = random_start
        self.loss_fn = nn.CrossEntropyLoss()

    def perturb(self, X_nat, y):
        """
        Given examples (X_nat, y), returns adversarial
        examples within epsilon of X_nat in l_infinity norm.
        """
        if self.rand:
            X = X_nat + np.random.uniform(-self.epsilon, self.epsilon,
                X_nat.shape).astype('float32')
        else:
            X = np.copy(X_nat)

        for i in range(self.k):
            X_var = to_var(torch.from_numpy(X), requires_grad=True)
            y_var = to_var(torch.LongTensor(y))

            scores = self.model(X_var)
            loss = self.loss_fn(scores, y_var)
            loss.backward()
            grad = X_var.grad.data.cpu().numpy()

            X += self.a * np.sign(grad)

            X = np.clip(X, X_nat - self.epsilon, X_nat + self.epsilon)
            X = np.clip(X, 0, 1) # ensure valid pixel range

        return X


# --- Black-box attacks ---

def jacobian(model, x, nb_classes=10):
    """
    This function will return a list of PyTorch gradients
    """
    list_derivatives = []
    x_var = to_var(torch.from_numpy(x), requires_grad=True)

    # derivatives for each class
    for class_ind in range(nb_classes):
        score = model(x_var)[:, class_ind]
        score.backward()
        list_derivatives.append(x_var.grad.data.cpu().numpy())
        x_var.grad.data.zero_()

    return list_derivatives


def jacobian_augmentation(model, X_sub_prev, Y_sub, lmbda=0.1):
    """
    Create new numpy array for adversary training data
    with twice as many components on the first dimension.
    """
    X_sub = np.vstack([X_sub_prev, X_sub_prev])

    # For each input in the previous' substitute training iteration
    for ind, x in enumerate(X_sub_prev):
        grads = jacobian(model, x)
        # Select gradient corresponding to the label predicted by the oracle
        grad = grads[Y_sub[ind]]

        # Compute sign matrix
        grad_val = np.sign(grad)

        # Create new synthetic point in adversary substitute training set
        X_sub[len(X_sub_prev)+ind] = X_sub[ind] + lmbda * grad_val #???

    # Return augmented training data (needs to be labeled afterwards)
    return X_sub


================================================
FILE: adversarialbox/train.py
================================================
"""
Adversarial training
"""

import copy
import numpy as np
from collections import Iterable
from scipy.stats import truncnorm

import torch
import torch.nn as nn

from adversarialbox.attacks import FGSMAttack, LinfPGDAttack
from adversarialbox.utils import truncated_normal


def adv_train(X, y, model, criterion, adversary):
    """
    Adversarial training. Returns pertubed mini batch.
    """

    # If adversarial training, need a snapshot of 
    # the model at each batch to compute grad, so 
    # as not to mess up with the optimization step
    model_cp = copy.deepcopy(model)
    for p in model_cp.parameters():
        p.requires_grad = False
    model_cp.eval()
    
    adversary.model = model_cp

    X_adv = adversary.perturb(X.numpy(), y)

    return torch.from_numpy(X_adv)


def FGSM_train_rnd(X, y, model, criterion, fgsm_adversary, epsilon_max=0.3):
    """
    FGSM with epsilon sampled from a truncated normal distribution.
    Returns pertubed mini batch.
    Kurakin et al, ADVERSARIAL MACHINE LEARNING AT SCALE, 2016
    """

    # If adversarial training, need a snapshot of 
    # the model at each batch to compute grad, so 
    # as not to mess up with the optimization step
    model_cp = copy.deepcopy(model)
    for p in model_cp.parameters():
        p.requires_grad = False
    model_cp.eval()
    
    fgsm_adversary.model = model_cp

    # truncated Gaussian
    m = X.size()[0] # mini-batch size
    mean, std = 0., epsilon_max/2
    epsilons = np.abs(truncated_normal(mean, std, m))[:, np.newaxis, \
        np.newaxis, np.newaxis]

    X_adv = fgsm_adversary.perturb(X.numpy(), y, epsilons)

    return torch.from_numpy(X_adv)


================================================
FILE: adversarialbox/utils.py
================================================
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn as nn
from torch.utils.data import sampler


def truncated_normal(mean=0.0, stddev=1.0, m=1):
    '''
    The generated values follow a normal distribution with specified 
    mean and standard deviation, except that values whose magnitude is 
    more than 2 standard deviations from the mean are dropped and 
    re-picked. Returns a vector of length m
    '''
    samples = []
    for i in range(m):
        while True:
            sample = np.random.normal(mean, stddev)
            if np.abs(sample) <= 2 * stddev:
                break
        samples.append(sample)
    assert len(samples) == m, "something wrong"
    if m == 1:
        return samples[0]
    else:
        return np.array(samples)


# --- PyTorch helpers ---

def to_var(x, requires_grad=False, volatile=False):
    """
    Varialbe type that automatically choose cpu or cuda
    """
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x, requires_grad=requires_grad, volatile=volatile)


def pred_batch(x, model):
    """
    batch prediction helper
    """
    y_pred = np.argmax(model(to_var(x)).data.cpu().numpy(), axis=1)
    return torch.from_numpy(y_pred)


def test(model, loader, blackbox=False, hold_out_size=None):
    """
    Check model accuracy on model based on loader (train or test)
    """
    model.eval()

    num_correct, num_samples = 0, len(loader.dataset)

    if blackbox:
        num_samples -= hold_out_size

    for x, y in loader:
        x_var = to_var(x, volatile=True)
        scores = model(x_var)
        _, preds = scores.data.cpu().max(1)
        num_correct += (preds == y).sum()

    acc = float(num_correct)/float(num_samples)
    print('Got %d/%d correct (%.2f%%) on the clean data' 
        % (num_correct, num_samples, 100 * acc))

    return acc


def attack_over_test_data(model, adversary, param, loader_test, oracle=None):
    """
    Given target model computes accuracy on perturbed data
    """
    total_correct = 0
    total_samples = len(loader_test.dataset)

    # For black-box
    if oracle is not None:
        total_samples -= param['hold_out_size']

    for t, (X, y) in enumerate(loader_test):
        y_pred = pred_batch(X, model)
        X_adv = adversary.perturb(X.numpy(), y_pred)
        X_adv = torch.from_numpy(X_adv)

        if oracle is not None:
            y_pred_adv = pred_batch(X_adv, oracle)
        else:
            y_pred_adv = pred_batch(X_adv, model)
        
        total_correct += (y_pred_adv.numpy() == y.numpy()).sum()

    acc = total_correct/total_samples

    print('Got %d/%d correct (%.2f%%) on the perturbed data' 
        % (total_correct, total_samples, 100 * acc))

    return acc


def batch_indices(batch_nb, data_length, batch_size):
    """
    This helper function computes a batch start and end index
    :param batch_nb: the batch number
    :param data_length: the total length of the data being parsed by batches
    :param batch_size: the number of inputs in each batch
    :return: pair of (start, end) indices
    """
    # Batch start and end index
    start = int(batch_nb * batch_size)
    end = int((batch_nb + 1) * batch_size)

    # When there are not enough inputs left, we reuse some to complete the
    # batch
    if end > data_length:
        shift = end - data_length
        start -= shift
        end -= shift

    return start, end


================================================
FILE: mnist_adv_train.py
================================================
"""
Adversarially train LeNet-5
"""

import torch
import torch.nn as nn
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.autograd import Variable
import torch.nn.functional as F

from adversarialbox.attacks import FGSMAttack, LinfPGDAttack
from adversarialbox.train import adv_train, FGSM_train_rnd
from adversarialbox.utils import to_var, pred_batch, test

from models import LeNet5


# Hyper-parameters
param = {
    'batch_size': 128,
    'test_batch_size': 100,
    'num_epochs': 15,
    'delay': 10,
    'learning_rate': 1e-3,
    'weight_decay': 5e-4,
}


# Data loaders
train_dataset = datasets.MNIST(root='../data/',train=True, download=True, 
    transform=transforms.ToTensor())
loader_train = torch.utils.data.DataLoader(train_dataset, 
    batch_size=param['batch_size'], shuffle=True)

test_dataset = datasets.MNIST(root='../data/', train=False, download=True, 
    transform=transforms.ToTensor())
loader_test = torch.utils.data.DataLoader(test_dataset, 
    batch_size=param['test_batch_size'], shuffle=True)


# Setup the model
net = LeNet5()

if torch.cuda.is_available():
    print('CUDA ensabled.')
    net.cuda()
net.train()

# Adversarial training setup
#adversary = FGSMAttack(epsilon=0.3)
adversary = LinfPGDAttack()

# Train the model
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(net.parameters(), lr=param['learning_rate'],
    weight_decay=param['weight_decay'])

for epoch in range(param['num_epochs']):

    print('Starting epoch %d / %d' % (epoch + 1, param['num_epochs']))

    for t, (x, y) in enumerate(loader_train):

        x_var, y_var = to_var(x), to_var(y.long())
        loss = criterion(net(x_var), y_var)

        # adversarial training
        if epoch+1 > param['delay']:
            # use predicted label to prevent label leaking
            y_pred = pred_batch(x, net)
            x_adv = adv_train(x, y_pred, net, criterion, adversary)
            x_adv_var = to_var(x_adv)
            loss_adv = criterion(net(x_adv_var), y_var)
            loss = (loss + loss_adv) / 2

        if (t + 1) % 100 == 0:
            print('t = %d, loss = %.8f' % (t + 1, loss.data[0]))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


test(net, loader_test)

torch.save(net.state_dict(), 'models/adv_trained_lenet5.pkl')


================================================
FILE: mnist_attack.py
================================================
"""
Adversarial attacks on LeNet5
"""
from time import time
import torch
import torch.nn as nn
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.autograd import Variable
import torch.nn.functional as F

from adversarialbox.attacks import FGSMAttack, LinfPGDAttack
from adversarialbox.utils import to_var, pred_batch, test, \
    attack_over_test_data

from models import LeNet5


# Hyper-parameters
param = {
    'test_batch_size': 100,
    'epsilon': 0.3,
}


# Data loaders
test_dataset = datasets.MNIST(root='../data/', train=False, download=True,
    transform=transforms.ToTensor())
loader_test = torch.utils.data.DataLoader(test_dataset, 
    batch_size=param['test_batch_size'], shuffle=False)


# Setup model to be attacked
net = LeNet5()
net.load_state_dict(torch.load('models/adv_trained_lenet5.pkl'))

if torch.cuda.is_available():
    print('CUDA ensabled.')
    net.cuda()

for p in net.parameters():
    p.requires_grad = False
net.eval()

test(net, loader_test)


# Adversarial attack
adversary = FGSMAttack(net, param['epsilon'])
# adversary = LinfPGDAttack(net, random_start=False)


t0 = time()
attack_over_test_data(net, adversary, param, loader_test)
print('{}s eclipsed.'.format(time()-t0))


================================================
FILE: mnist_blackbox.py
================================================
"""
PyTorch Implementation of Papernot's Black-Box Attack
arXiv:1602.02697
"""

import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.autograd import Variable
import torch.nn.functional as F
from torch.utils.data.sampler import SubsetRandomSampler

from adversarialbox.attacks import FGSMAttack, LinfPGDAttack, \
    jacobian_augmentation
from adversarialbox.utils import to_var, pred_batch, test, \
    attack_over_test_data, batch_indices

from models import LeNet5, SubstituteModel


def MNIST_bbox_sub(param, loader_hold_out, loader_test):
    """
    Train a substitute model using Jacobian data augmentation
    arXiv:1602.02697
    """

    # Setup the substitute
    net = SubstituteModel()

    if torch.cuda.is_available():
        print('CUDA ensabled for the substitute.')
        net.cuda()
    net.train()

    # Setup the oracle
    oracle = LeNet5()

    if torch.cuda.is_available():
        print('CUDA ensabled for the oracle.')
        oracle.cuda()
    oracle.load_state_dict(torch.load(param['oracle_name']+'.pkl'))
    oracle.eval()


    # Setup training
    criterion = nn.CrossEntropyLoss()
    # Careful optimization is crucial to train a well-representative 
    # substitute. In Tensorflow Adam has some problem:
    # (https://github.com/tensorflow/cleverhans/issues/183)
    # But it works fine here in PyTorch (you may try other optimization
    # methods
    optimizer = torch.optim.Adam(net.parameters(), lr=param['learning_rate'])

    # Data held out for initial training
    data_iter = iter(loader_hold_out)
    X_sub, y_sub = data_iter.next()
    X_sub, y_sub = X_sub.numpy(), y_sub.numpy()

    # Train the substitute and augment dataset alternatively
    for rho in range(param['data_aug']):
        print("Substitute training epoch #"+str(rho))
        print("Training data: "+str(len(X_sub)))

        rng = np.random.RandomState()

        # model training
        for epoch in range(param['nb_epochs']):

            print('Starting epoch %d / %d' % (epoch + 1, param['nb_epochs']))

            # Compute number of batches
            nb_batches = int(np.ceil(float(len(X_sub)) / 
                param['test_batch_size']))
            assert nb_batches * param['test_batch_size'] >= len(X_sub)

            # Indices to shuffle training set
            index_shuf = list(range(len(X_sub)))
            rng.shuffle(index_shuf)

            for batch in range(nb_batches):

                # Compute batch start and end indices
                start, end = batch_indices(batch, len(X_sub), 
                    param['test_batch_size'])

                x = X_sub[index_shuf[start:end]]
                y = y_sub[index_shuf[start:end]]

                scores = net(to_var(torch.from_numpy(x)))
                loss = criterion(scores, to_var(torch.from_numpy(y).long()))

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            print('loss = %.8f' % (loss.data[0]))
        test(net, loader_test, blackbox=True, hold_out_size=param['hold_out_size'])

        # If we are not at last substitute training iteration, augment dataset
        if rho < param['data_aug'] - 1:
            print("Augmenting substitute training data.")
            # Perform the Jacobian augmentation
            X_sub = jacobian_augmentation(net, X_sub, y_sub)

            print("Labeling substitute training data.")
            # Label the newly generated synthetic points using the black-box
            scores = oracle(to_var(torch.from_numpy(X_sub)))
            # Note here that we take the argmax because the adversary
            # only has access to the label (not the probabilities) output
            # by the black-box model
            y_sub = np.argmax(scores.data.cpu().numpy(), axis=1)


    torch.save(net.state_dict(), param['oracle_name']+'_sub.pkl')


if __name__ == "__main__":

    # Hyper-parameters
    param = {
        'hold_out_size': 150,
        'test_batch_size': 128,
        'nb_epochs': 10,
        'learning_rate': 0.001,
        'data_aug': 6,
        'oracle_name': 'models/adv_trained_lenet5',
        'epsilon': 0.3,
    }

    # Data loaders
    # We need to hold out 150 data points from the test data
    # This is a bit tricky in PyTorch
    # We adopt the way from:
    # https://github.com/pytorch/pytorch/issues/1106
    hold_out_data = datasets.MNIST(root='../data/', train=True,
        download=True, transform=transforms.ToTensor())
    test_dataset = datasets.MNIST(root='../data/', train=False, 
        download=True, transform=transforms.ToTensor())

    indices = list(range(test_dataset.test_data.size(0)))
    split = param['hold_out_size']
    rng = np.random.RandomState()
    rng.shuffle(indices)

    hold_out_idx, test_idx = indices[:split], indices[split:]

    hold_out_sampler = SubsetRandomSampler(hold_out_idx)
    test_sampler = SubsetRandomSampler(test_idx)

    loader_hold_out = torch.utils.data.DataLoader(hold_out_data, 
        batch_size=param['hold_out_size'], sampler=hold_out_sampler,
        shuffle=False)
    loader_test = torch.utils.data.DataLoader(test_dataset, 
        batch_size=param['test_batch_size'], sampler=test_sampler,
        shuffle=False)


    # Train the substitute
    MNIST_bbox_sub(param, loader_hold_out, loader_test)


    # Setup models
    net = SubstituteModel()
    oracle = LeNet5()

    net.load_state_dict(torch.load(param['oracle_name']+'_sub.pkl'))
    oracle.load_state_dict(torch.load(param['oracle_name']+'.pkl'))

    if torch.cuda.is_available():
        net.cuda()
        oracle.cuda()
        print('CUDA ensabled.')

    for p in net.parameters():
        p.requires_grad = False

    net.eval()
    oracle.eval()
    

    # Setup adversarial attacks
    adversary = FGSMAttack(net, param['epsilon'])

    print('For the substitute model:')
    test(net, loader_test, blackbox=True, hold_out_size=param['hold_out_size'])

    # Setup oracle
    print('For the oracle'+param['oracle_name'])
    print('agaist blackbox FGSM attacks using gradients from the substitute:')
    attack_over_test_data(net, adversary, param, loader_test, oracle)


================================================
FILE: models.py
================================================
import torch
import torch.nn as nn


class LeNet5(nn.Module):
    def __init__(self):
        super(LeNet5, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1, stride=1)
        self.relu1 = nn.ReLU(inplace=True)
        self.maxpool1 = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1, stride=1)
        self.relu2 = nn.ReLU(inplace=True)
        self.maxpool2 = nn.MaxPool2d(2)
        self.linear1 = nn.Linear(7*7*64, 200)
        self.relu3 = nn.ReLU(inplace=True)
        self.linear2 = nn.Linear(200, 10)

    def forward(self, x):
        out = self.maxpool1(self.relu1(self.conv1(x)))
        out = self.maxpool2(self.relu2(self.conv2(out)))
        out = out.view(out.size(0), -1)
        out = self.relu3(self.linear1(out))
        out = self.linear2(out)
        return out


class SubstituteModel(nn.Module):

    def __init__(self):
        super(SubstituteModel, self).__init__()
        self.linear1 = nn.Linear(28*28, 200)
        self.relu1 = nn.ReLU(inplace=True)
        self.linear2 = nn.Linear(200, 200)
        self.relu2 = nn.ReLU(inplace=True)
        self.linear3 = nn.Linear(200, 10)

    def forward(self, x):
        out = x.view(x.size(0), -1)
        out = self.relu1(self.linear1(out))
        out = self.relu2(self.linear2(out))
        out = self.linear3(out)
        return out