master ed2a608ccdbb cached
57 files
394.5 KB
115.9k tokens
523 symbols
1 requests
Download .txt
Showing preview only (414K chars total). Download the full file or copy to clipboard to get everything.
Repository: Yonghongwei/Gradient-Centralization
Branch: master
Commit: ed2a608ccdbb
Files: 57
Total size: 394.5 KB

Directory structure:
gitextract_l162dn_3/

├── GC_code/
│   ├── CIFAR100/
│   │   ├── algorithm/
│   │   │   ├── Adagrad.py
│   │   │   ├── Adam.py
│   │   │   └── SGD.py
│   │   ├── main.py
│   │   ├── models/
│   │   │   ├── __init__.py
│   │   │   ├── densenet.py
│   │   │   ├── dpn.py
│   │   │   ├── googlenet.py
│   │   │   ├── lenet.py
│   │   │   ├── mobilenet.py
│   │   │   ├── mobilenetv2.py
│   │   │   ├── pnasnet.py
│   │   │   ├── preact_resnet.py
│   │   │   ├── resnet.py
│   │   │   ├── resnext.py
│   │   │   ├── senet.py
│   │   │   ├── shufflenet.py
│   │   │   └── vgg.py
│   │   └── os_run.py
│   ├── Fine-grained_classification/
│   │   ├── SGD.py
│   │   ├── main.py
│   │   └── os_run.py
│   ├── ImageNet/
│   │   ├── SGD.py
│   │   ├── main.py
│   │   ├── myresnet.py
│   │   ├── myresnetgn.py
│   │   └── os_run.py
│   └── Mini_ImageNet/
│       ├── SGD.py
│       ├── main.py
│       ├── os_run.py
│       └── resnet_ws.py
├── README.md
└── algorithm-GC/
    ├── README.md
    ├── algorithm/
    │   ├── Adam.py
    │   ├── Centralization.py
    │   ├── Lookahead.py
    │   ├── RAdam.py
    │   ├── Ranger.py
    │   └── SGD.py
    └── cifar/
        ├── main.py
        ├── models/
        │   ├── __init__.py
        │   ├── densenet.py
        │   ├── dpn.py
        │   ├── googlenet.py
        │   ├── lenet.py
        │   ├── mobilenet.py
        │   ├── mobilenetv2.py
        │   ├── pnasnet.py
        │   ├── preact_resnet.py
        │   ├── resnet.py
        │   ├── resnext.py
        │   ├── senet.py
        │   ├── shufflenet.py
        │   └── vgg.py
        ├── nohup.out
        ├── os_run.py
        └── os_run2.py

================================================
FILE CONTENTS
================================================

================================================
FILE: GC_code/CIFAR100/algorithm/Adagrad.py
================================================
import torch
from torch.optim.optimizer import Optimizer


class Adagrad_GCC(Optimizer):
    """Adagrad with Gradient Centralization (GC) applied to conv layers.

    Identical to standard Adagrad (`Adaptive Subgradient Methods for Online
    Learning and Stochastic Optimization`_) except that the gradient of every
    parameter with more than three dimensions (i.e. a convolution kernel) is
    centered to zero mean over all axes except the first (output-channel)
    axis before the accumulator update.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-2)
        lr_decay (float, optional): learning rate decay (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        initial_accumulator_value (float, optional): starting value of the
            per-parameter squared-gradient accumulator (default: 0)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-10)

    .. _Adaptive Subgradient Methods for Online Learning and Stochastic
        Optimization: http://jmlr.org/papers/v12/duchi11a.html
    """

    def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0, initial_accumulator_value=0, eps=1e-10):
        # Validate hyper-parameters up front so misconfigurations fail loudly.
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= lr_decay:
            raise ValueError("Invalid lr_decay value: {}".format(lr_decay))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        if not 0.0 <= initial_accumulator_value:
            raise ValueError("Invalid initial_accumulator_value value: {}".format(initial_accumulator_value))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))

        defaults = dict(lr=lr, lr_decay=lr_decay, eps=eps, weight_decay=weight_decay,
                        initial_accumulator_value=initial_accumulator_value)
        super(Adagrad_GCC, self).__init__(params, defaults)

        # Eagerly create per-parameter state so share_memory() can be called
        # before the first step (matches torch.optim.Adagrad).
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = 0
                state['sum'] = torch.full_like(p.data, initial_accumulator_value)

    def share_memory(self):
        """Move the accumulator state into shared memory (Hogwild training)."""
        for group in self.param_groups:
            for p in group['params']:
                self.state[p]['sum'].share_memory_()

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The loss returned by ``closure``, or ``None``.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue

                grad = p.grad.data
                state = self.state[p]

                state['step'] += 1

                if group['weight_decay'] != 0:
                    if p.grad.data.is_sparse:
                        raise RuntimeError("weight_decay option is not compatible with sparse gradients")
                    # Keyword `alpha` form: the positional Number-first overload
                    # was deprecated and removed from modern PyTorch.
                    grad = grad.add(p.data, alpha=group['weight_decay'])

                # GC operation for Conv layers: subtract the per-filter mean
                # (mean over every axis except dim 0).  NOTE: when
                # weight_decay == 0 this centers p.grad in place.
                if grad.dim() > 3:
                    grad.add_(-grad.mean(dim=tuple(range(1, grad.dim())), keepdim=True))

                # Decayed learning rate, as in torch.optim.Adagrad.
                clr = group['lr'] / (1 + (state['step'] - 1) * group['lr_decay'])

                if grad.is_sparse:
                    grad = grad.coalesce()  # the update is non-linear so indices must be unique
                    grad_indices = grad._indices()
                    grad_values = grad._values()
                    size = grad.size()

                    def make_sparse(values):
                        # Rebuild a sparse tensor with the original indices but
                        # new values; degenerate shapes fall back to an empty one.
                        constructor = grad.new
                        if grad_indices.dim() == 0 or values.dim() == 0:
                            return constructor().resize_as_(grad)
                        return constructor(grad_indices, values, size)
                    state['sum'].add_(make_sparse(grad_values.pow(2)))
                    std = state['sum'].sparse_mask(grad)
                    std_values = std._values().sqrt_().add_(group['eps'])
                    p.data.add_(make_sparse(grad_values / std_values), alpha=-clr)
                else:
                    # sum += grad^2 ; p -= clr * grad / (sqrt(sum) + eps)
                    state['sum'].addcmul_(grad, grad, value=1)
                    std = state['sum'].sqrt().add_(group['eps'])
                    p.data.addcdiv_(grad, std, value=-clr)

        return loss
    
class Adagrad_GC(Optimizer):
    """Adagrad with Gradient Centralization (GC) on conv AND FC layers.

    Identical to standard Adagrad (`Adaptive Subgradient Methods for Online
    Learning and Stochastic Optimization`_) except that the gradient of every
    parameter with more than one dimension (convolution kernels and
    fully-connected weight matrices) is centered to zero mean over all axes
    except the first before the accumulator update.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-2)
        lr_decay (float, optional): learning rate decay (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        initial_accumulator_value (float, optional): starting value of the
            per-parameter squared-gradient accumulator (default: 0)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-10)

    .. _Adaptive Subgradient Methods for Online Learning and Stochastic
        Optimization: http://jmlr.org/papers/v12/duchi11a.html
    """

    def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0, initial_accumulator_value=0, eps=1e-10):
        # Validate hyper-parameters up front so misconfigurations fail loudly.
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= lr_decay:
            raise ValueError("Invalid lr_decay value: {}".format(lr_decay))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        if not 0.0 <= initial_accumulator_value:
            raise ValueError("Invalid initial_accumulator_value value: {}".format(initial_accumulator_value))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))

        defaults = dict(lr=lr, lr_decay=lr_decay, eps=eps, weight_decay=weight_decay,
                        initial_accumulator_value=initial_accumulator_value)
        super(Adagrad_GC, self).__init__(params, defaults)

        # Eagerly create per-parameter state so share_memory() can be called
        # before the first step (matches torch.optim.Adagrad).
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = 0
                state['sum'] = torch.full_like(p.data, initial_accumulator_value)

    def share_memory(self):
        """Move the accumulator state into shared memory (Hogwild training)."""
        for group in self.param_groups:
            for p in group['params']:
                self.state[p]['sum'].share_memory_()

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The loss returned by ``closure``, or ``None``.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue

                grad = p.grad.data
                state = self.state[p]

                state['step'] += 1

                if group['weight_decay'] != 0:
                    if p.grad.data.is_sparse:
                        raise RuntimeError("weight_decay option is not compatible with sparse gradients")
                    # Keyword `alpha` form: the positional Number-first overload
                    # was deprecated and removed from modern PyTorch.
                    grad = grad.add(p.data, alpha=group['weight_decay'])

                # GC operation for Conv and FC layers: subtract the per-row /
                # per-filter mean (mean over every axis except dim 0).
                # NOTE: when weight_decay == 0 this centers p.grad in place.
                if grad.dim() > 1:
                    grad.add_(-grad.mean(dim=tuple(range(1, grad.dim())), keepdim=True))

                # Decayed learning rate, as in torch.optim.Adagrad.
                clr = group['lr'] / (1 + (state['step'] - 1) * group['lr_decay'])

                if grad.is_sparse:
                    grad = grad.coalesce()  # the update is non-linear so indices must be unique
                    grad_indices = grad._indices()
                    grad_values = grad._values()
                    size = grad.size()

                    def make_sparse(values):
                        # Rebuild a sparse tensor with the original indices but
                        # new values; degenerate shapes fall back to an empty one.
                        constructor = grad.new
                        if grad_indices.dim() == 0 or values.dim() == 0:
                            return constructor().resize_as_(grad)
                        return constructor(grad_indices, values, size)
                    state['sum'].add_(make_sparse(grad_values.pow(2)))
                    std = state['sum'].sparse_mask(grad)
                    std_values = std._values().sqrt_().add_(group['eps'])
                    p.data.add_(make_sparse(grad_values / std_values), alpha=-clr)
                else:
                    # sum += grad^2 ; p -= clr * grad / (sqrt(sum) + eps)
                    state['sum'].addcmul_(grad, grad, value=1)
                    std = state['sum'].sqrt().add_(group['eps'])
                    p.data.addcdiv_(grad, std, value=-clr)

        return loss
    


================================================
FILE: GC_code/CIFAR100/algorithm/Adam.py
================================================
import math
import torch
from torch.optim.optimizer import Optimizer

class Adam_GCC(Optimizer):
    """Adam with Gradient Centralization (GC) applied to conv layers.

    Standard Adam (`Adam: A Method for Stochastic Optimization`,
    https://arxiv.org/abs/1412.6980) except that the gradient of every
    parameter with more than three dimensions (a convolution kernel) is
    centered to zero mean over all axes except the first before the
    moment updates.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant from
            `On the Convergence of Adam and Beyond` (default: False)
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        # Validate hyper-parameters up front so misconfigurations fail loudly.
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(Adam_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Adam_GCC, self).__setstate__(state)
        # Checkpoints saved before `amsgrad` existed lack the key; default off.
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The loss returned by ``closure``, or ``None``.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization on the first step for this parameter.
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                if group['weight_decay'] != 0:
                    # Keyword `alpha` form: the positional Number-first overload
                    # was deprecated and removed from modern PyTorch.
                    grad.add_(p.data, alpha=group['weight_decay'])

                # GC operation for Conv layers: subtract the per-filter mean
                # (mean over every axis except dim 0), in place on p.grad.
                if grad.dim() > 3:
                    grad.add_(-grad.mean(dim=tuple(range(1, grad.dim())), keepdim=True))

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                step_size = group['lr'] / bias_correction1

                p.data.addcdiv_(exp_avg, denom, value=-step_size)

        return loss

class Adam_GCC2(Optimizer):
    """Adam variant that centralizes the UPDATE (not the gradient) of conv layers.

    Standard Adam, but for parameters with more than three dimensions the
    computed update step is centered to zero mean over all axes except the
    first before being applied.  The raw gradient and the Adam moments are
    left untouched.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant
            (default: False)
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        # Validate hyper-parameters up front so misconfigurations fail loudly.
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(Adam_GCC2, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Adam_GCC2, self).__setstate__(state)
        # Checkpoints saved before `amsgrad` existed lack the key; default off.
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The loss returned by ``closure``, or ``None``.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization on the first step for this parameter.
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                if group['weight_decay'] != 0:
                    # Keyword `alpha` form: the positional Number-first overload
                    # was deprecated and removed from modern PyTorch.
                    grad.add_(p.data, alpha=group['weight_decay'])

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                step_size = group['lr'] / bias_correction1

                # GC operation for Conv layers, applied to the update itself:
                # center the step to zero mean over all axes except dim 0.
                if grad.dim() > 3:
                    delta = step_size * exp_avg / denom  # fresh tensor; no clone needed
                    delta.add_(-delta.mean(dim=tuple(range(1, grad.dim())), keepdim=True))
                    p.data.add_(-delta)
                else:
                    p.data.addcdiv_(exp_avg, denom, value=-step_size)
        return loss

class Adam_GC(Optimizer):
    """Adam with Gradient Centralization (GC) on conv AND FC layers.

    It has been proposed in `Adam: A Method for Stochastic Optimization`_.
    The gradient of every parameter with more than one dimension (conv
    kernels and fully-connected weight matrices) is centered to zero mean
    over all axes except the first before the moment updates.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)

    .. _Adam\\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        # Validate hyper-parameters up front so misconfigurations fail loudly.
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(Adam_GC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Adam_GC, self).__setstate__(state)
        # Checkpoints saved before `amsgrad` existed lack the key; default off.
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The loss returned by ``closure``, or ``None``.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization on the first step for this parameter.
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                if group['weight_decay'] != 0:
                    # Keyword `alpha` form: the positional Number-first overload
                    # was deprecated and removed from modern PyTorch.
                    grad.add_(p.data, alpha=group['weight_decay'])

                # GC operation for Conv layers and FC layers: subtract the
                # per-row / per-filter mean (mean over every axis except dim 0),
                # in place on p.grad.
                if grad.dim() > 1:
                    grad.add_(-grad.mean(dim=tuple(range(1, grad.dim())), keepdim=True))

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                step_size = group['lr'] / bias_correction1

                p.data.addcdiv_(exp_avg, denom, value=-step_size)

        return loss


class Adam_GC2(Optimizer):
    """Adam variant that centralizes the UPDATE of conv and FC layers.

    Standard Adam, but for parameters with more than one dimension the
    computed update step is centered to zero mean over all axes except the
    first before being applied.  The raw gradient and the Adam moments are
    left untouched.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant
            (default: False)
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        # Validate hyper-parameters up front so misconfigurations fail loudly.
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(Adam_GC2, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Adam_GC2, self).__setstate__(state)
        # Checkpoints saved before `amsgrad` existed lack the key; default off.
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The loss returned by ``closure``, or ``None``.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization on the first step for this parameter.
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                if group['weight_decay'] != 0:
                    # Keyword `alpha` form: the positional Number-first overload
                    # was deprecated and removed from modern PyTorch.
                    grad.add_(p.data, alpha=group['weight_decay'])

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                step_size = group['lr'] / bias_correction1

                # GC operation for Conv and FC layers, applied to the update
                # itself: center the step to zero mean over all axes except dim 0.
                if grad.dim() > 1:
                    delta = step_size * exp_avg / denom  # fresh tensor; no clone needed
                    delta.add_(-delta.mean(dim=tuple(range(1, grad.dim())), keepdim=True))
                    p.data.add_(-delta)
                else:
                    p.data.addcdiv_(exp_avg, denom, value=-step_size)
        return loss

class AdamW(Optimizer):
    """Implements AdamW: Adam with decoupled weight decay.

    Adam is from `Adam: A Method for Stochastic Optimization`_.  Unlike the
    plain Adam classes in this file, weight decay here is NOT added to the
    gradient; it is applied directly to the parameter inside the update
    (decoupled decay, `Decoupled Weight Decay Regularization`).  The bias
    correction for the second moment is folded into ``step_size`` instead of
    into ``denom``.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): decoupled weight decay (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_

    .. _Adam\\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        # Validate hyper-parameters up front so misconfigurations fail loudly.
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(AdamW, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(AdamW, self).__setstate__(state)
        # Checkpoints saved before `amsgrad` existed lack the key; default off.
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The loss returned by ``closure``, or ``None``.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization on the first step for this parameter.
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1

                # Deliberately no L2 term in the gradient: AdamW decouples
                # weight decay from the adaptive update (applied below).

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                # sqrt(bc2) folded into step_size since denom is uncorrected.
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                # p <- p - step_size * (weight_decay * p + exp_avg / denom)
                p.data.add_(torch.mul(p.data, group['weight_decay']).addcdiv_(exp_avg, denom),
                            alpha=-step_size)

        return loss



class AdamW_GCC(Optimizer):
    """AdamW (Adam with decoupled weight decay) with Gradient Centralization
    applied to convolutional layers.

    For every gradient with more than 3 dimensions (Conv weights), the mean
    over all dims except dim 0 is subtracted before the moment updates, so
    each output filter's gradient has zero mean.  The weight decay is
    decoupled: it is added to the update step instead of to the gradient.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): decoupled weight decay (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`

    References:
        - Adam: https://arxiv.org/abs/1412.6980
        - AdamW: https://arxiv.org/abs/1711.05101
        - AMSGrad: https://openreview.net/forum?id=ryQu7f-RZ
        - Gradient Centralization: https://arxiv.org/abs/2004.01461
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(AdamW_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(AdamW_GCC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                # GC operation for Conv layers: center each output filter's
                # gradient (all dims except dim 0) to zero mean, in place.
                if grad.dim() > 3:
                    grad.add_(-grad.mean(dim=tuple(range(1, grad.dim())), keepdim=True))

                state['step'] += 1

                # Decay the first and second moment running average coefficient.
                # Keyword alpha/value forms replace the positional scalar
                # overloads removed in recent PyTorch releases.
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                # Decoupled weight decay (AdamW): the decay term joins the
                # Adam update instead of being folded into the gradient.
                p.data.add_(torch.mul(p.data, group['weight_decay']).addcdiv_(exp_avg, denom),
                            alpha=-step_size)

        return loss
 
class AdamW_GC(Optimizer):
    """AdamW (Adam with decoupled weight decay) with Gradient Centralization
    applied to both convolutional and fully-connected layers.

    For every gradient with more than 1 dimension (Conv and FC weights), the
    mean over all dims except dim 0 is subtracted before the moment updates,
    so each output unit's gradient has zero mean.  The weight decay is
    decoupled: it is added to the update step instead of to the gradient.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): decoupled weight decay (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`

    References:
        - Adam: https://arxiv.org/abs/1412.6980
        - AdamW: https://arxiv.org/abs/1711.05101
        - AMSGrad: https://openreview.net/forum?id=ryQu7f-RZ
        - Gradient Centralization: https://arxiv.org/abs/2004.01461
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(AdamW_GC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(AdamW_GC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                # GC operation for Conv and FC layers: center each output
                # unit's gradient (all dims except dim 0) to zero mean.
                if grad.dim() > 1:
                    grad.add_(-grad.mean(dim=tuple(range(1, grad.dim())), keepdim=True))

                state['step'] += 1

                # Decay the first and second moment running average coefficient.
                # Keyword alpha/value forms replace the positional scalar
                # overloads removed in recent PyTorch releases.
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                # Decoupled weight decay (AdamW): the decay term joins the
                # Adam update instead of being folded into the gradient.
                p.data.add_(torch.mul(p.data, group['weight_decay']).addcdiv_(exp_avg, denom),
                            alpha=-step_size)

        return loss

class AdamW_GCC2(Optimizer):
    """AdamW (Adam with decoupled weight decay) where Gradient Centralization
    is applied to the *update step* of convolutional layers.

    Unlike AdamW_GCC, the raw gradient is left untouched; instead, for every
    parameter with more than 3 dimensions (Conv weights) the computed AdamW
    update is centered to zero mean per output filter before being applied.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): decoupled weight decay (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`

    References:
        - Adam: https://arxiv.org/abs/1412.6980
        - AdamW: https://arxiv.org/abs/1711.05101
        - AMSGrad: https://openreview.net/forum?id=ryQu7f-RZ
        - Gradient Centralization: https://arxiv.org/abs/2004.01461
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(AdamW_GCC2, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(AdamW_GCC2, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1

                # Decay the first and second moment running average coefficient.
                # Keyword alpha/value forms replace the positional scalar
                # overloads removed in recent PyTorch releases.
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                # GC operation on the update step for Conv layers: center the
                # AdamW update (decoupled decay + Adam step) of each output
                # filter to zero mean before applying it.  step_size * (...)
                # already yields a fresh tensor, so no clone is needed.
                if grad.dim() > 3:
                    delta = step_size * torch.mul(p.data, group['weight_decay']).addcdiv_(exp_avg, denom)
                    delta.add_(-delta.mean(dim=tuple(range(1, grad.dim())), keepdim=True))
                    p.data.add_(-delta)
                else:
                    p.data.add_(torch.mul(p.data, group['weight_decay']).addcdiv_(exp_avg, denom),
                                alpha=-step_size)

        return loss

class AdamW_GC2(Optimizer):
    """AdamW (Adam with decoupled weight decay) where Gradient Centralization
    is applied to the *update step* of convolutional and fully-connected layers.

    Unlike AdamW_GC, the raw gradient is left untouched; instead, for every
    parameter with more than 1 dimension (Conv and FC weights) the computed
    AdamW update is centered to zero mean per output unit before being applied.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): decoupled weight decay (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`

    References:
        - Adam: https://arxiv.org/abs/1412.6980
        - AdamW: https://arxiv.org/abs/1711.05101
        - AMSGrad: https://openreview.net/forum?id=ryQu7f-RZ
        - Gradient Centralization: https://arxiv.org/abs/2004.01461
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(AdamW_GC2, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(AdamW_GC2, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1

                # Decay the first and second moment running average coefficient.
                # Keyword alpha/value forms replace the positional scalar
                # overloads removed in recent PyTorch releases.
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                # GC operation on the update step for Conv and FC layers:
                # center the AdamW update (decoupled decay + Adam step) of
                # each output unit to zero mean before applying it.
                # step_size * (...) already yields a fresh tensor, so no
                # clone is needed.
                if grad.dim() > 1:
                    delta = step_size * torch.mul(p.data, group['weight_decay']).addcdiv_(exp_avg, denom)
                    delta.add_(-delta.mean(dim=tuple(range(1, grad.dim())), keepdim=True))
                    p.data.add_(-delta)
                else:
                    p.data.add_(torch.mul(p.data, group['weight_decay']).addcdiv_(exp_avg, denom),
                                alpha=-step_size)

        return loss


================================================
FILE: GC_code/CIFAR100/algorithm/SGD.py
================================================
import torch
from torch.optim.optimizer import Optimizer, required



class SGD_GCC(Optimizer):
    """SGD with momentum where Gradient Centralization is applied to
    convolutional layers.

    For every gradient with more than 3 dimensions (Conv weights), the mean
    over all dims except dim 0 is subtracted (after the L2 weight-decay term
    is added) so each output filter's gradient has zero mean before the
    momentum update.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float): learning rate
        momentum (float, optional): momentum factor (default: 0)
        dampening (float, optional): dampening for momentum (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        nesterov (bool, optional): enables Nesterov momentum (default: False)

    Reference:
        Gradient Centralization: https://arxiv.org/abs/2004.01461
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGD_GCC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # Coupled (L2) weight decay, folded into the gradient.
                # Keyword alpha form replaces the positional scalar overload
                # removed in recent PyTorch releases.
                if weight_decay != 0:
                    d_p.add_(p.data, alpha=weight_decay)

                # GC operation for Conv layers: center each output filter's
                # gradient (all dims except dim 0) to zero mean, in place.
                if d_p.dim() > 3:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, d_p.dim())), keepdim=True))

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: buffer starts as a copy of the gradient.
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])

        return loss

class SGD_GC(Optimizer):
    """SGD with momentum where Gradient Centralization is applied to both
    convolutional and fully-connected layers.

    For every gradient with more than 1 dimension (Conv and FC weights), the
    mean over all dims except dim 0 is subtracted (after the L2 weight-decay
    term is added) so each output unit's gradient has zero mean before the
    momentum update.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float): learning rate
        momentum (float, optional): momentum factor (default: 0)
        dampening (float, optional): dampening for momentum (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        nesterov (bool, optional): enables Nesterov momentum (default: False)

    Reference:
        Gradient Centralization: https://arxiv.org/abs/2004.01461
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGD_GC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # Coupled (L2) weight decay, folded into the gradient.
                # Keyword alpha form replaces the positional scalar overload
                # removed in recent PyTorch releases.
                if weight_decay != 0:
                    d_p.add_(p.data, alpha=weight_decay)

                # GC operation for Conv and FC layers: center each output
                # unit's gradient (all dims except dim 0) to zero mean.
                if d_p.dim() > 1:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, d_p.dim())), keepdim=True))

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: buffer starts as a copy of the gradient.
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])

        return loss


class SGDW(Optimizer):
    """SGD with momentum and decoupled weight decay (SGDW).

    Unlike plain SGD's L2 penalty, the decay term is NOT folded into the
    gradient (so it never enters the momentum buffer); instead the pre-step
    weights are decayed directly after the gradient step.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float): learning rate
        momentum (float, optional): momentum factor (default: 0)
        dampening (float, optional): dampening for momentum (default: 0)
        weight_decay (float, optional): decoupled weight decay (default: 0)
        nesterov (bool, optional): enables Nesterov momentum (default: False)

    Reference:
        Decoupled Weight Decay Regularization: https://arxiv.org/abs/1711.05101
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGDW, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # Snapshot the pre-step weights; the decoupled decay below is
                # applied to these, not to the already-updated weights.
                old = torch.clone(p.data).detach()

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: buffer ends up equal to the raw gradient
                        # (zeros * momentum + d_p == d_p; no dampening applied).
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        # Keyword alpha form replaces the positional scalar
                        # overload removed in recent PyTorch releases.
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])

                # Decoupled weight decay on the pre-step weights.
                if weight_decay != 0:
                    p.data.add_(old, alpha=-weight_decay * group['lr'])

        return loss



class SGDW_GCC(Optimizer):
    """SGDW (SGD with decoupled weight decay) with Gradient Centralization
    applied to convolutional layers.

    For every gradient with more than 3 dimensions (Conv weights), the mean
    over all dims except dim 0 is subtracted so each output filter's gradient
    has zero mean before the momentum update.  The weight decay is decoupled:
    the pre-step weights are decayed directly after the gradient step.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float): learning rate
        momentum (float, optional): momentum factor (default: 0)
        dampening (float, optional): dampening for momentum (default: 0)
        weight_decay (float, optional): decoupled weight decay (default: 0)
        nesterov (bool, optional): enables Nesterov momentum (default: False)

    References:
        - Decoupled Weight Decay Regularization: https://arxiv.org/abs/1711.05101
        - Gradient Centralization: https://arxiv.org/abs/2004.01461
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGDW_GCC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # Snapshot the pre-step weights; the decoupled decay below is
                # applied to these, not to the already-updated weights.
                old = torch.clone(p.data).detach()

                # GC operation for Conv layers: center each output filter's
                # gradient (all dims except dim 0) to zero mean, in place.
                if d_p.dim() > 3:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, d_p.dim())), keepdim=True))

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: buffer ends up equal to the raw gradient
                        # (zeros * momentum + d_p == d_p; no dampening applied).
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        # Keyword alpha form replaces the positional scalar
                        # overload removed in recent PyTorch releases.
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])

                # Decoupled weight decay on the pre-step weights.
                if weight_decay != 0:
                    p.data.add_(old, alpha=-weight_decay * group['lr'])

        return loss

    
class SGDW_GC(Optimizer):
    r"""SGDW with Gradient Centralization (GC) for Conv and FC layers.

    SGD with momentum where:
      * gradients of any parameter with more than one dimension (conv
        kernels and fully-connected weight matrices) are centralized by
        subtracting their per-output-channel mean before the update, and
      * weight decay is decoupled (SGDW): it is applied directly to the
        pre-update weights rather than added to the gradient.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts
            defining parameter groups.
        lr (float): learning rate.
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): decoupled weight decay (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW_GC, self).__init__(params, defaults)

    def __setstate__(self, state):
        # Ensure 'nesterov' exists in groups restored from old checkpoints.
        super(SGDW_GC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # Snapshot pre-update weights for the decoupled decay below.
                old = torch.clone(p.data).detach()

                # GC operation for Conv and FC layers: subtract, per output
                # channel, the gradient mean over all remaining dimensions.
                if d_p.dim() > 1:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, d_p.dim())), keepdim=True))

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: buffer starts at zero, so it equals d_p.
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        # The deprecated add_(Number, Tensor) overload was
                        # removed from recent PyTorch; use the alpha keyword.
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])

                # Decoupled weight decay (SGDW), scaled by the learning rate.
                if weight_decay != 0:
                    p.data.add_(old, alpha=-weight_decay * group['lr'])

        return loss


================================================
FILE: GC_code/CIFAR100/main.py
================================================
'''Train CIFAR100 with PyTorch.'''
from __future__ import print_function

import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn


import torch.optim as optim
import torch.nn.functional as F

import torchvision
import torchvision.transforms as transforms


from torch.optim import lr_scheduler
import os
import argparse
from torchvision import datasets, models
from models import *
#from utils import progress_bar
import numpy as np

#import optimizers with GC
from algorithm.SGD import *
from algorithm.Adam import *
from algorithm.Adagrad import *


# Command-line configuration for a single CIFAR-100 training run.
parser = argparse.ArgumentParser(description='PyTorch CIFAR100 Training')
parser.add_argument('--lr', default=0.1, type=float, help='learning rate')
parser.add_argument('--resume', '-r', action='store_true', help='resume from checkpoint')
parser.add_argument('--bs', default=128, type=int, help='batchsize')
parser.add_argument('--wd', default=0.0005, type=float, help='weight decay')
parser.add_argument('--alg', default='sgd', type=str, help='algorithm')
parser.add_argument('--epochs', default=200, type=int, help='epochs')
parser.add_argument('--path', default='logout/result', type=str, help='path')
parser.add_argument('--model', default='r50', type=str, help='model')


args = parser.parse_args()
# Restrict the run to the first GPU.
os.environ["CUDA_VISIBLE_DEVICES"]="0"


epochs=args.epochs
device = 'cuda' if torch.cuda.is_available() else 'cpu'
best_acc = 0  # best test accuracy
start_epoch = 0  # start from epoch 0 or last checkpoint epoch



# Data
print('==> Preparing data..')
# Standard CIFAR augmentation (pad-and-crop + horizontal flip) with
# per-channel CIFAR-100 mean/std normalization; test set is only normalized.
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.507, 0.487, 0.441), (0.267, 0.256, 0.276)),
  ])
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.507, 0.487, 0.441), (0.267, 0.256, 0.276)),
  ])
# NOTE(review): the dataset root is a hard-coded absolute path — adjust per machine.
trainset = torchvision.datasets.CIFAR100(root='/home/yonghw/data/cifar100/', train=True, download=True, transform=transform_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.bs, shuffle=True, num_workers=4,drop_last=True)
testset = torchvision.datasets.CIFAR100(root='/home/yonghw/data/cifar100/', train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=128, shuffle=False, num_workers=4)




# Model
print('==> Building model..')

Num_classes = 100

# Select the backbone from --model.
# NOTE(review): an unrecognized --model value leaves `net` undefined and the
# script fails with a NameError at net.cuda() below.
if args.model=='r18':
    net = ResNet18(Num_classes=Num_classes)
if args.model=='r34':
    net = ResNet34(Num_classes=Num_classes)
if args.model=='r50':
    net = ResNet50(Num_classes=Num_classes)
if args.model=='r101':
    net = ResNet101(Num_classes=Num_classes)
if args.model=='v11':
    net = VGG('VGG11',Num_classes=Num_classes)
if args.model=='rx29':
    net = ResNeXt29_4x64d(Num_classes=Num_classes)
if args.model=='d121':
    net = DenseNet121(Num_classes=Num_classes)

if device == 'cuda':
    net = net.cuda()
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True


if args.resume:
    # Load checkpoint.
    print('==> Resuming from checkpoint..')
    assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!'
    checkpoint = torch.load('./checkpoint/ckpt.t7')
    net.load_state_dict(checkpoint['net'])
    best_acc = checkpoint['acc']
    start_epoch = checkpoint['epoch']
    
criterion = nn.CrossEntropyLoss()

#optimizer
# Select the optimizer from --alg. "GC" suffixes centralize gradients of both
# Conv and FC weights, "GCC" only Conv weights; "W" variants use decoupled
# weight decay. The base --lr is SGD-oriented and rescaled per family
# (Adam: lr*0.01, Adagrad: lr*0.1).
WD=args.wd
print('==> choose optimizer..')
if args.alg=='sgd':
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD)
if args.alg=='sgdGC':
    optimizer = SGD_GC(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD)
if args.alg=='sgdGCC':
    optimizer = SGD_GCC(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD)

if args.alg=='adam':
    optimizer = optim.Adam(net.parameters(), lr=args.lr*0.01, weight_decay = WD)
if args.alg=='adamGC':
    optimizer = Adam_GC(net.parameters(), lr=args.lr*0.01, weight_decay = WD)
if args.alg=='adamGCC':
    optimizer = Adam_GCC(net.parameters(), lr=args.lr*0.01, weight_decay = WD)
if args.alg=='adamGC2':
    optimizer = Adam_GC2(net.parameters(), lr=args.lr*0.01, weight_decay = WD)
if args.alg=='adamGCC2':
    optimizer = Adam_GCC2(net.parameters(), lr=args.lr*0.01, weight_decay = WD) 


if args.alg=='adagrad':
    optimizer = optim.Adagrad(net.parameters(), lr=args.lr*0.1,weight_decay = WD)
if args.alg=='adagradGC':
    optimizer = Adagrad_GC(net.parameters(), lr=args.lr*0.1,weight_decay = WD)
if args.alg=='adagradGCC':
    optimizer = Adagrad_GCC(net.parameters(), lr=args.lr*0.1,weight_decay = WD)
if args.alg=='adagradGC2':
    optimizer = Adagrad_GC2(net.parameters(), lr=args.lr*0.1,weight_decay = WD)
if args.alg=='adagradGCC2':
    optimizer = Adagrad_GCC2(net.parameters(), lr=args.lr*0.1,weight_decay = WD)


if args.alg=='sgdW':
    optimizer = SGDW(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD)
if args.alg=='sgdWGC':
    optimizer = SGDW_GC(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD)
if args.alg=='sgdWGCC':
    optimizer = SGDW_GCC(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD)

# NOTE(review): the adamWGC* branches construct Adam_GC* optimizers, not
# AdamW-based ones — confirm this aliasing is intentional.
if args.alg=='adamW':
    optimizer = AdamW(net.parameters(), lr=args.lr*0.01, weight_decay = WD)
if args.alg=='adamWGC':
    optimizer = Adam_GC(net.parameters(), lr=args.lr*0.01, weight_decay = WD)
if args.alg=='adamWGCC':
    optimizer = Adam_GCC(net.parameters(), lr=args.lr*0.01, weight_decay = WD)
if args.alg=='adamWGC2':
    optimizer = Adam_GC2(net.parameters(), lr=args.lr*0.01, weight_decay = WD)
if args.alg=='adamWGCC2':
    optimizer = Adam_GCC2(net.parameters(), lr=args.lr*0.01, weight_decay = WD)


# Decay the learning rate by 10x every 60 epochs.
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=60, gamma=0.1)

# Training
def train(epoch,net,optimizer):
    """Train `net` for one epoch over `trainloader`; return accuracy (percent)."""
    print('\nEpoch: %d' % epoch)
    net.train()
    running_loss, n_correct, n_seen = 0, 0, 0
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        logits = net(inputs)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        preds = logits.max(1)[1]
        n_seen += targets.size(0)
        n_correct += preds.eq(targets).sum().item()
    # Report mean batch loss and the fraction of correct predictions.
    print('Training: Loss: {:.4f} | Acc: {:.4f}'.format(running_loss/(batch_idx+1),n_correct/n_seen))
    return 100.*n_correct/n_seen
    
# Testing
def test(epoch,net):
    """Evaluate `net` on the test set; checkpoint when the accuracy beats the
    best seen so far. Returns the accuracy in percent."""
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
      for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

            #progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                #% (test_loss/(batch_idx+1), 100.*correct/total, correct, total))
    print('Testing:Loss: {:.4f} | Acc: {:.4f}'.format(test_loss/(batch_idx+1),correct/total) )

    # Save checkpoint.
    acc = 100.*correct/total
    if acc > best_acc:
        print('Saving..')
        state = {
            'net': net.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        if not os.path.isdir('checkpoint'):
            os.mkdir('checkpoint')
        torch.save(state, './checkpoint/ckpt.t7')
        best_acc = acc
    return acc


# Main loop: one training pass, LR-schedule step, then evaluation per epoch.
for epoch in range(start_epoch, start_epoch+epochs):
    train_acc=train(epoch,net,optimizer)
    exp_lr_scheduler.step()
    val_acc=test(epoch,net)



================================================
FILE: GC_code/CIFAR100/models/__init__.py
================================================
from .vgg import *
from .dpn import *
from .lenet import *
from .senet import *
from .pnasnet import *
from .densenet import *
from .googlenet import *
from .shufflenet import *
from .resnet import *
from .resnext import *
from .preact_resnet import *
from .mobilenet import *
from .mobilenetv2 import *


================================================
FILE: GC_code/CIFAR100/models/densenet.py
================================================
'''DenseNet in PyTorch.'''
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


class Bottleneck(nn.Module):
    """DenseNet bottleneck layer: BN-ReLU-1x1conv then BN-ReLU-3x3conv,
    with the new growth_rate feature maps concatenated before the input."""
    def __init__(self, in_planes, growth_rate):
        super(Bottleneck, self).__init__()
        inner = 4*growth_rate  # bottleneck width
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, inner, kernel_size=1, bias=False)
        self.bn2 = nn.BatchNorm2d(inner)
        self.conv2 = nn.Conv2d(inner, growth_rate, kernel_size=3, padding=1, bias=False)

    def forward(self, x):
        h = self.conv1(F.relu(self.bn1(x)))
        h = self.conv2(F.relu(self.bn2(h)))
        return torch.cat([h, x], 1)


class Transition(nn.Module):
    """DenseNet transition: BN-ReLU-1x1conv to change the channel count,
    followed by 2x2 average pooling to halve the spatial resolution."""
    def __init__(self, in_planes, out_planes):
        super(Transition, self).__init__()
        self.bn = nn.BatchNorm2d(in_planes)
        self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=False)

    def forward(self, x):
        h = self.conv(F.relu(self.bn(x)))
        return F.avg_pool2d(h, 2)


class DenseNet(nn.Module):
    """DenseNet-BC: four dense stages separated by compressing transitions.

    Args:
        block: layer class used inside dense stages (e.g. Bottleneck).
        nblocks (sequence of 4 ints): layers per dense stage.
        growth_rate (int): channels added by every dense layer.
        reduction (float): channel compression factor at each transition.
        num_classes (int): classifier output size.
    """
    def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10):
        super(DenseNet, self).__init__()
        self.growth_rate = growth_rate

        num_planes = 2*growth_rate
        self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, padding=1, bias=False)

        # Each stage grows channels by nblocks[i]*growth_rate; the following
        # transition compresses them by `reduction` and halves the resolution.
        self.dense1 = self._make_dense_layers(block, num_planes, nblocks[0])
        num_planes += nblocks[0]*growth_rate
        out_planes = int(math.floor(num_planes*reduction))
        self.trans1 = Transition(num_planes, out_planes)
        num_planes = out_planes

        self.dense2 = self._make_dense_layers(block, num_planes, nblocks[1])
        num_planes += nblocks[1]*growth_rate
        out_planes = int(math.floor(num_planes*reduction))
        self.trans2 = Transition(num_planes, out_planes)
        num_planes = out_planes

        self.dense3 = self._make_dense_layers(block, num_planes, nblocks[2])
        num_planes += nblocks[2]*growth_rate
        out_planes = int(math.floor(num_planes*reduction))
        self.trans3 = Transition(num_planes, out_planes)
        num_planes = out_planes

        # Final dense stage has no transition after it.
        self.dense4 = self._make_dense_layers(block, num_planes, nblocks[3])
        num_planes += nblocks[3]*growth_rate

        self.bn = nn.BatchNorm2d(num_planes)
        self.linear = nn.Linear(num_planes, num_classes)

    def _make_dense_layers(self, block, in_planes, nblock):
        # Chain nblock layers; each one consumes the concatenation of all
        # previous features in the stage, so the input width keeps growing.
        layers = []
        for i in range(nblock):
            layers.append(block(in_planes, self.growth_rate))
            in_planes += self.growth_rate
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv1(x)
        out = self.trans1(self.dense1(out))
        out = self.trans2(self.dense2(out))
        out = self.trans3(self.dense3(out))
        out = self.dense4(out)
        # Assumes 32x32 input: three 2x downsamples leave a 4x4 feature map.
        out = F.avg_pool2d(F.relu(self.bn(out)), 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out

# Standard DenseNet-BC configurations: (layers per dense stage, growth rate).
def DenseNet121(Num_classes=10):
    return DenseNet(Bottleneck, [6,12,24,16], growth_rate=32, num_classes=Num_classes)

def DenseNet169(Num_classes=10):
    return DenseNet(Bottleneck, [6,12,32,32], growth_rate=32, num_classes=Num_classes)

def DenseNet201(Num_classes=10):
    return DenseNet(Bottleneck, [6,12,48,32], growth_rate=32, num_classes=Num_classes)

def DenseNet161(Num_classes=10):
    return DenseNet(Bottleneck, [6,12,36,24], growth_rate=48, num_classes=Num_classes)

def densenet_cifar(Num_classes=10):
    # Small growth-rate variant for quick CIFAR experiments.
    return DenseNet(Bottleneck, [6,12,24,16], growth_rate=12, num_classes=Num_classes)

def test():
    # Smoke test: forward a single random CIFAR-sized image.
    net = densenet_cifar()
    x = torch.randn(1,3,32,32)
    y = net(x)
    print(y)

# test()


================================================
FILE: GC_code/CIFAR100/models/dpn.py
================================================
'''Dual Path Networks in PyTorch.'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class Bottleneck(nn.Module):
    """DPN bottleneck: 1x1 -> grouped 3x3 -> 1x1 convolutions.

    The output is split into a residual part (first `out_planes` channels,
    added to the shortcut) and a dense part (`dense_depth` channels,
    concatenated), which gives the "dual path" behavior.
    """
    def __init__(self, last_planes, in_planes, out_planes, dense_depth, stride, first_layer):
        super(Bottleneck, self).__init__()
        self.out_planes = out_planes
        self.dense_depth = dense_depth

        self.conv1 = nn.Conv2d(last_planes, in_planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(in_planes)
        # Grouped 3x3 conv (cardinality 32), ResNeXt-style.
        self.conv2 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=32, bias=False)
        self.bn2 = nn.BatchNorm2d(in_planes)
        self.conv3 = nn.Conv2d(in_planes, out_planes+dense_depth, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes+dense_depth)

        # Projection shortcut only on the first block of a stage, where the
        # channel count and/or spatial size change.
        self.shortcut = nn.Sequential()
        if first_layer:
            self.shortcut = nn.Sequential(
                nn.Conv2d(last_planes, out_planes+dense_depth, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes+dense_depth)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        x = self.shortcut(x)
        d = self.out_planes
        # Residual path: add the first d channels; dense path: concatenate
        # both leftovers so the dense features accumulate across blocks.
        out = torch.cat([x[:,:d,:,:]+out[:,:d,:,:], x[:,d:,:,:], out[:,d:,:,:]], 1)
        out = F.relu(out)
        return out


class DPN(nn.Module):
    """Dual Path Network for 32x32 inputs.

    Args:
        cfg (dict): architecture spec with keys 'in_planes', 'out_planes',
            'num_blocks' and 'dense_depth' (one entry per stage).
        num_classes (int, optional): classifier output size. Defaults to 10,
            matching the previously hard-coded head width.
    """
    def __init__(self, cfg, num_classes=10):
        super(DPN, self).__init__()
        in_planes, out_planes = cfg['in_planes'], cfg['out_planes']
        num_blocks, dense_depth = cfg['num_blocks'], cfg['dense_depth']

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.last_planes = 64
        self.layer1 = self._make_layer(in_planes[0], out_planes[0], num_blocks[0], dense_depth[0], stride=1)
        self.layer2 = self._make_layer(in_planes[1], out_planes[1], num_blocks[1], dense_depth[1], stride=2)
        self.layer3 = self._make_layer(in_planes[2], out_planes[2], num_blocks[2], dense_depth[2], stride=2)
        self.layer4 = self._make_layer(in_planes[3], out_planes[3], num_blocks[3], dense_depth[3], stride=2)
        # Head width: residual channels plus all accumulated dense channels.
        # Was hard-coded to 10 outputs; now parameterized (default unchanged).
        self.linear = nn.Linear(out_planes[3]+(num_blocks[3]+1)*dense_depth[3], num_classes)

    def _make_layer(self, in_planes, out_planes, num_blocks, dense_depth, stride):
        # Only the first block of a stage downsamples / projects the shortcut.
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for i,stride in enumerate(strides):
            layers.append(Bottleneck(self.last_planes, in_planes, out_planes, dense_depth, stride, i==0))
            self.last_planes = out_planes + (i+2) * dense_depth
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def DPN26():
    # Shallow DPN configuration (2 blocks per stage).
    cfg = {
        'in_planes': (96,192,384,768),
        'out_planes': (256,512,1024,2048),
        'num_blocks': (2,2,2,2),
        'dense_depth': (16,32,24,128)
    }
    return DPN(cfg)

def DPN92():
    # Deeper DPN-92 configuration.
    cfg = {
        'in_planes': (96,192,384,768),
        'out_planes': (256,512,1024,2048),
        'num_blocks': (3,4,20,3),
        'dense_depth': (16,32,24,128)
    }
    return DPN(cfg)


def test():
    # Smoke test: forward a single random CIFAR-sized image.
    net = DPN92()
    x = torch.randn(1,3,32,32)
    y = net(x)
    print(y)

# test()


================================================
FILE: GC_code/CIFAR100/models/googlenet.py
================================================
'''GoogLeNet with PyTorch.'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class Inception(nn.Module):
    """Inception module: four parallel branches (1x1, 1x1->3x3, 1x1->5x5
    approximated by two 3x3s, pool->1x1) concatenated along channels.

    The output channel count is n1x1 + n3x3 + n5x5 + pool_planes.
    """
    def __init__(self, in_planes, n1x1, n3x3red, n3x3, n5x5red, n5x5, pool_planes):
        super(Inception, self).__init__()
        # 1x1 conv branch
        self.b1 = nn.Sequential(
            nn.Conv2d(in_planes, n1x1, kernel_size=1),
            nn.BatchNorm2d(n1x1),
            nn.ReLU(True),
        )

        # 1x1 conv -> 3x3 conv branch
        self.b2 = nn.Sequential(
            nn.Conv2d(in_planes, n3x3red, kernel_size=1),
            nn.BatchNorm2d(n3x3red),
            nn.ReLU(True),
            nn.Conv2d(n3x3red, n3x3, kernel_size=3, padding=1),
            nn.BatchNorm2d(n3x3),
            nn.ReLU(True),
        )

        # 1x1 conv -> 5x5 conv branch (5x5 factored into two 3x3 convs)
        self.b3 = nn.Sequential(
            nn.Conv2d(in_planes, n5x5red, kernel_size=1),
            nn.BatchNorm2d(n5x5red),
            nn.ReLU(True),
            nn.Conv2d(n5x5red, n5x5, kernel_size=3, padding=1),
            nn.BatchNorm2d(n5x5),
            nn.ReLU(True),
            nn.Conv2d(n5x5, n5x5, kernel_size=3, padding=1),
            nn.BatchNorm2d(n5x5),
            nn.ReLU(True),
        )

        # 3x3 pool -> 1x1 conv branch
        self.b4 = nn.Sequential(
            nn.MaxPool2d(3, stride=1, padding=1),
            nn.Conv2d(in_planes, pool_planes, kernel_size=1),
            nn.BatchNorm2d(pool_planes),
            nn.ReLU(True),
        )

    def forward(self, x):
        y1 = self.b1(x)
        y2 = self.b2(x)
        y3 = self.b3(x)
        y4 = self.b4(x)
        # All branches preserve spatial size, so channel concat is valid.
        return torch.cat([y1,y2,y3,y4], 1)


class GoogLeNet(nn.Module):
    """GoogLeNet (Inception v1) adapted to 32x32 inputs.

    Args:
        num_classes (int, optional): classifier output size. Defaults to 10,
            matching the previously hard-coded head width.
    """
    def __init__(self, num_classes=10):
        super(GoogLeNet, self).__init__()
        self.pre_layers = nn.Sequential(
            nn.Conv2d(3, 192, kernel_size=3, padding=1),
            nn.BatchNorm2d(192),
            nn.ReLU(True),
        )

        # Inception stages; positional args are the per-branch channel widths
        # (in, 1x1, 3x3red, 3x3, 5x5red, 5x5, pool).
        self.a3 = Inception(192,  64,  96, 128, 16, 32, 32)
        self.b3 = Inception(256, 128, 128, 192, 32, 96, 64)

        self.maxpool = nn.MaxPool2d(3, stride=2, padding=1)

        self.a4 = Inception(480, 192,  96, 208, 16,  48,  64)
        self.b4 = Inception(512, 160, 112, 224, 24,  64,  64)
        self.c4 = Inception(512, 128, 128, 256, 24,  64,  64)
        self.d4 = Inception(512, 112, 144, 288, 32,  64,  64)
        self.e4 = Inception(528, 256, 160, 320, 32, 128, 128)

        self.a5 = Inception(832, 256, 160, 320, 32, 128, 128)
        self.b5 = Inception(832, 384, 192, 384, 48, 128, 128)

        self.avgpool = nn.AvgPool2d(8, stride=1)
        # Was hard-coded to 10 outputs; now parameterized (default unchanged).
        self.linear = nn.Linear(1024, num_classes)

    def forward(self, x):
        out = self.pre_layers(x)
        out = self.a3(out)
        out = self.b3(out)
        out = self.maxpool(out)
        out = self.a4(out)
        out = self.b4(out)
        out = self.c4(out)
        out = self.d4(out)
        out = self.e4(out)
        out = self.maxpool(out)
        out = self.a5(out)
        out = self.b5(out)
        out = self.avgpool(out)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def test():
    # Smoke test: forward a single random CIFAR-sized image.
    net = GoogLeNet()
    x = torch.randn(1,3,32,32)
    y = net(x)
    print(y.size())

# test()


================================================
FILE: GC_code/CIFAR100/models/lenet.py
================================================
'''LeNet in PyTorch.'''
import torch.nn as nn
import torch.nn.functional as F

class LeNet(nn.Module):
    """Classic LeNet-5-style CNN for 3x32x32 inputs.

    Args:
        num_classes (int, optional): output layer size. Defaults to 10,
            matching the previously hard-coded head width.
    """
    def __init__(self, num_classes=10):
        super(LeNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1   = nn.Linear(16*5*5, 120)
        self.fc2   = nn.Linear(120, 84)
        self.fc3   = nn.Linear(84, num_classes)

    def forward(self, x):
        # Spatial flow for 32x32 input: conv5->28, pool->14, conv5->10, pool->5.
        out = F.relu(self.conv1(x))
        out = F.max_pool2d(out, 2)
        out = F.relu(self.conv2(out))
        out = F.max_pool2d(out, 2)
        out = out.view(out.size(0), -1)
        out = F.relu(self.fc1(out))
        out = F.relu(self.fc2(out))
        out = self.fc3(out)
        return out


================================================
FILE: GC_code/CIFAR100/models/mobilenet.py
================================================
'''MobileNet in PyTorch.

See the paper "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications"
for more details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class Block(nn.Module):
    '''Depthwise conv + Pointwise conv'''
    def __init__(self, in_planes, out_planes, stride=1):
        super(Block, self).__init__()
        # Depthwise 3x3 (one filter per input channel) followed by a 1x1
        # pointwise projection; each is paired with BN + ReLU in forward().
        self.conv1 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=in_planes, bias=False)
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv2 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        h = F.relu(self.bn1(self.conv1(x)))
        return F.relu(self.bn2(self.conv2(h)))


class MobileNet(nn.Module):
    """MobileNet v1 for 32x32 inputs, built from depthwise-separable Blocks."""
    # (128,2) means conv planes=128, conv stride=2, by default conv stride=1
    cfg = [64, (128,2), 128, (256,2), 256, (512,2), 512, 512, 512, 512, 512, (1024,2), 1024]

    def __init__(self, num_classes=10):
        super(MobileNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.layers = self._make_layers(in_planes=32)
        self.linear = nn.Linear(1024, num_classes)

    def _make_layers(self, in_planes):
        # Expand cfg into Blocks; tuple entries carry (out_planes, stride).
        layers = []
        for x in self.cfg:
            out_planes = x if isinstance(x, int) else x[0]
            stride = 1 if isinstance(x, int) else x[1]
            layers.append(Block(in_planes, out_planes, stride))
            in_planes = out_planes
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layers(out)
        # Assumes 32x32 input: the four stride-2 blocks leave a 2x2 map.
        out = F.avg_pool2d(out, 2)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def test():
    # Smoke test: check the output logits shape.
    net = MobileNet()
    x = torch.randn(1,3,32,32)
    y = net(x)
    print(y.size())

# test()


================================================
FILE: GC_code/CIFAR100/models/mobilenetv2.py
================================================
'''MobileNetV2 in PyTorch.

See the paper "Inverted Residuals and Linear Bottlenecks:
Mobile Networks for Classification, Detection and Segmentation" for more details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class Block(nn.Module):
    '''expand + depthwise + pointwise'''
    def __init__(self, in_planes, out_planes, expansion, stride):
        super(Block, self).__init__()
        self.stride = stride

        hidden = expansion * in_planes
        # 1x1 expansion, depthwise 3x3, then 1x1 linear projection.
        self.conv1 = nn.Conv2d(in_planes, hidden, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn1 = nn.BatchNorm2d(hidden)
        self.conv2 = nn.Conv2d(hidden, hidden, kernel_size=3, stride=stride, padding=1, groups=hidden, bias=False)
        self.bn2 = nn.BatchNorm2d(hidden)
        self.conv3 = nn.Conv2d(hidden, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        # 1x1 projection so the residual add type-checks when the channel
        # count changes; identity otherwise. Only used when stride == 1.
        self.shortcut = nn.Sequential()
        if stride == 1 and in_planes != out_planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False),
                nn.BatchNorm2d(out_planes),
            )

    def forward(self, x):
        h = F.relu(self.bn1(self.conv1(x)))
        h = F.relu(self.bn2(self.conv2(h)))
        h = self.bn3(self.conv3(h))
        if self.stride == 1:
            h = h + self.shortcut(x)
        return h


class MobileNetV2(nn.Module):
    """MobileNetV2 (inverted residuals) adapted to 32x32 inputs."""
    # (expansion, out_planes, num_blocks, stride)
    cfg = [(1,  16, 1, 1),
           (6,  24, 2, 1),  # NOTE: change stride 2 -> 1 for CIFAR10
           (6,  32, 3, 2),
           (6,  64, 4, 2),
           (6,  96, 3, 1),
           (6, 160, 3, 2),
           (6, 320, 1, 1)]

    def __init__(self, num_classes=10):
        super(MobileNetV2, self).__init__()
        # NOTE: change conv1 stride 2 -> 1 for CIFAR10
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.layers = self._make_layers(in_planes=32)
        self.conv2 = nn.Conv2d(320, 1280, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(1280)
        self.linear = nn.Linear(1280, num_classes)

    def _make_layers(self, in_planes):
        # Expand each cfg row into num_blocks inverted-residual Blocks; only
        # the first block of a row uses the row's stride.
        layers = []
        for expansion, out_planes, num_blocks, stride in self.cfg:
            strides = [stride] + [1]*(num_blocks-1)
            for stride in strides:
                layers.append(Block(in_planes, out_planes, expansion, stride))
                in_planes = out_planes
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layers(out)
        out = F.relu(self.bn2(self.conv2(out)))
        # NOTE: change pooling kernel_size 7 -> 4 for CIFAR10
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def test():
    # Smoke test: check the output logits shape for a batch of two.
    net = MobileNetV2()
    x = torch.randn(2,3,32,32)
    y = net(x)
    print(y.size())

# test()


================================================
FILE: GC_code/CIFAR100/models/pnasnet.py
================================================
'''PNASNet in PyTorch.

Paper: Progressive Neural Architecture Search
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class SepConv(nn.Module):
    '''Separable Convolution.'''
    def __init__(self, in_planes, out_planes, kernel_size, stride):
        super(SepConv, self).__init__()
        pad = (kernel_size - 1) // 2  # "same" padding for odd kernels
        # Grouped conv with groups=in_planes (depthwise-style); out_planes
        # must be a multiple of in_planes for the grouping to be valid.
        self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size, stride,
                               padding=pad, bias=False, groups=in_planes)
        self.bn1 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        return self.bn1(self.conv1(x))


class CellA(nn.Module):
    """PNASNet cell A: a separable 7x7 conv branch and a max-pool branch,
    summed elementwise and passed through ReLU.

    NOTE(review): with stride=1 the pooled branch keeps in_planes channels,
    so the elementwise sum requires out_planes == in_planes in that case.
    """
    def __init__(self, in_planes, out_planes, stride=1):
        super(CellA, self).__init__()
        self.stride = stride
        self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride)
        if stride==2:
            # 1x1 conv to match channels of the pooled branch when downsampling.
            self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
            self.bn1 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        y1 = self.sep_conv1(x)
        y2 = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1)
        if self.stride==2:
            y2 = self.bn1(self.conv1(y2))
        return F.relu(y1+y2)

class CellB(nn.Module):
    """PNASNet cell B: two summed branches (sep7x7+sep3x3 and pool+sep5x5)
    concatenated and reduced back to out_planes by a 1x1 conv.
    """
    def __init__(self, in_planes, out_planes, stride=1):
        super(CellB, self).__init__()
        self.stride = stride
        # Left branch
        self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride)
        self.sep_conv2 = SepConv(in_planes, out_planes, kernel_size=3, stride=stride)
        # Right branch
        self.sep_conv3 = SepConv(in_planes, out_planes, kernel_size=5, stride=stride)
        if stride==2:
            # 1x1 conv to match channels of the pooled branch when downsampling.
            self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
            self.bn1 = nn.BatchNorm2d(out_planes)
        # Reduce channels
        self.conv2 = nn.Conv2d(2*out_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        # Left branch
        y1 = self.sep_conv1(x)
        y2 = self.sep_conv2(x)
        # Right branch
        y3 = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1)
        if self.stride==2:
            y3 = self.bn1(self.conv1(y3))
        y4 = self.sep_conv3(x)
        # Concat & reduce channels
        b1 = F.relu(y1+y2)
        b2 = F.relu(y3+y4)
        y = torch.cat([b1,b2], 1)
        return F.relu(self.bn2(self.conv2(y)))

class PNASNet(nn.Module):
    """Progressive-NAS network for 32x32 inputs (10 classes).

    Stacks `num_cells` stride-1 cells per stage with a single stride-2
    reduction cell between stages, doubling channels at each reduction.

    Args:
        cell_type: cell class taking (in_planes, out_planes, stride).
        num_cells: number of stride-1 cells per stage.
        num_planes: channel width of the first stage.
    """

    def __init__(self, cell_type, num_cells, num_planes):
        super(PNASNet, self).__init__()
        self.in_planes = num_planes
        self.cell_type = cell_type

        self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(num_planes)

        # Bug fix: `num_cells` was ignored (hard-coded to 6 below), so the
        # constructor argument had no effect.  Pass it through instead;
        # existing callers all pass 6, so behavior is unchanged for them.
        self.layer1 = self._make_layer(num_planes, num_cells=num_cells)
        self.layer2 = self._downsample(num_planes*2)
        self.layer3 = self._make_layer(num_planes*2, num_cells=num_cells)
        self.layer4 = self._downsample(num_planes*4)
        self.layer5 = self._make_layer(num_planes*4, num_cells=num_cells)

        self.linear = nn.Linear(num_planes*4, 10)

    def _make_layer(self, planes, num_cells):
        # `num_cells` stride-1 cells; spatial size unchanged.
        layers = []
        for _ in range(num_cells):
            layers.append(self.cell_type(self.in_planes, planes, stride=1))
            self.in_planes = planes
        return nn.Sequential(*layers)

    def _downsample(self, planes):
        # Single stride-2 cell: halves spatial size, changes width.
        layer = self.cell_type(self.in_planes, planes, stride=2)
        self.in_planes = planes
        return layer

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.layer5(out)
        out = F.avg_pool2d(out, 8)  # 32x32 input -> 8x8 feature map here
        out = self.linear(out.view(out.size(0), -1))
        return out


def PNASNetA():
    """PNASNet-A: CellA cells, 6 per stage, 44 base channels."""
    return PNASNet(CellA, num_cells=6, num_planes=44)

def PNASNetB():
    """PNASNet-B: CellB cells, 6 per stage, 32 base channels."""
    return PNASNet(CellB, num_cells=6, num_planes=32)


def test():
    """Smoke test: run a single random 32x32 image through PNASNet-B."""
    model = PNASNetB()
    out = model(torch.randn(1, 3, 32, 32))
    print(out)

# test()


================================================
FILE: GC_code/CIFAR100/models/preact_resnet.py
================================================
'''Pre-activation ResNet in PyTorch.

Reference:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
    Identity Mappings in Deep Residual Networks. arXiv:1603.05027
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class PreActBlock(nn.Module):
    '''Pre-activation BasicBlock: BN+ReLU precede each conv (He et al., arXiv:1603.05027).'''
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(PreActBlock, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)

        # Projection shortcut only when the shape changes; otherwise the
        # attribute is absent and forward() uses the identity path.
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False)
            )

    def forward(self, x):
        pre = F.relu(self.bn1(x))
        # The projection taps the pre-activated tensor; identity uses raw input.
        identity = self.shortcut(pre) if hasattr(self, 'shortcut') else x
        y = self.conv1(pre)
        y = self.conv2(F.relu(self.bn2(y)))
        return y + identity


class PreActBottleneck(nn.Module):
    '''Pre-activation Bottleneck: 1x1 reduce, 3x3, 1x1 expand (x4), BN+ReLU before each conv.'''
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(PreActBottleneck, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False)

        # Projection shortcut only when the shape changes.
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False)
            )

    def forward(self, x):
        pre = F.relu(self.bn1(x))
        identity = self.shortcut(pre) if hasattr(self, 'shortcut') else x
        y = self.conv1(pre)
        y = self.conv2(F.relu(self.bn2(y)))
        y = self.conv3(F.relu(self.bn3(y)))
        return y + identity


class PreActResNet(nn.Module):
    """Pre-activation ResNet for 32x32 inputs; `block` is PreActBlock or PreActBottleneck."""

    def __init__(self, block, num_blocks, num_classes=10):
        super(PreActResNet, self).__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512*block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        # Only the first block of a stage downsamples.
        blocks = []
        for s in [stride] + [1]*(num_blocks-1):
            blocks.append(block(self.in_planes, planes, s))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*blocks)

    def forward(self, x):
        out = self.conv1(x)
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            out = stage(out)
        out = F.avg_pool2d(out, 4)
        flat = out.view(out.size(0), -1)
        return self.linear(flat)


def PreActResNet18():
    """18-layer pre-activation ResNet (basic blocks)."""
    return PreActResNet(PreActBlock, [2, 2, 2, 2])

def PreActResNet34():
    """34-layer pre-activation ResNet (basic blocks)."""
    return PreActResNet(PreActBlock, [3, 4, 6, 3])

def PreActResNet50():
    """50-layer pre-activation ResNet (bottleneck blocks)."""
    return PreActResNet(PreActBottleneck, [3, 4, 6, 3])

def PreActResNet101():
    """101-layer pre-activation ResNet (bottleneck blocks)."""
    return PreActResNet(PreActBottleneck, [3, 4, 23, 3])

def PreActResNet152():
    """152-layer pre-activation ResNet (bottleneck blocks)."""
    return PreActResNet(PreActBottleneck, [3, 8, 36, 3])


def test():
    """Smoke test: forward one random image and print the output shape."""
    model = PreActResNet18()
    out = model(torch.randn(1, 3, 32, 32))
    print(out.size())

# test()


================================================
FILE: GC_code/CIFAR100/models/resnet.py
================================================
'''ResNet in PyTorch.

For Pre-activation ResNet, see 'preact_resnet.py'.

Reference:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
    Deep Residual Learning for Image Recognition. arXiv:1512.03385
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicBlock(nn.Module):
    """ResNet basic block: two 3x3 convs with a (possibly projected) residual shortcut."""
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            # Projection shortcut when spatial size or channel count changes.
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)))
        y = self.bn2(self.conv2(y))
        return F.relu(y + self.shortcut(x))


class Bottleneck(nn.Module):
    """ResNet bottleneck: 1x1 reduce, 3x3, 1x1 expand (x4), with residual shortcut."""
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.expansion*planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            # Projection shortcut when spatial size or channel count changes.
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)))
        y = F.relu(self.bn2(self.conv2(y)))
        y = self.bn3(self.conv3(y))
        return F.relu(y + self.shortcut(x))


class ResNet(nn.Module):
    """ResNet for 32x32 inputs (CIFAR-style stem: one 3x3 conv, no max-pool)."""

    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512*block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        # Only the first block of a stage downsamples.
        blocks = []
        for s in [stride] + [1]*(num_blocks-1):
            blocks.append(block(self.in_planes, planes, s))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*blocks)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            out = stage(out)
        out = F.avg_pool2d(out, 4)
        flat = out.view(out.size(0), -1)
        return self.linear(flat)


def ResNet18(Num_classes=10):
    """18-layer ResNet (basic blocks)."""
    return ResNet(BasicBlock, [2, 2, 2, 2], num_classes=Num_classes)

def ResNet34(Num_classes=10):
    """34-layer ResNet (basic blocks)."""
    return ResNet(BasicBlock, [3, 4, 6, 3], num_classes=Num_classes)

def ResNet50(Num_classes=10):
    """50-layer ResNet (bottleneck blocks)."""
    return ResNet(Bottleneck, [3, 4, 6, 3], num_classes=Num_classes)

def ResNet101(Num_classes=10):
    """101-layer ResNet (bottleneck blocks)."""
    return ResNet(Bottleneck, [3, 4, 23, 3], num_classes=Num_classes)

def ResNet152(Num_classes=10):
    """152-layer ResNet (bottleneck blocks)."""
    return ResNet(Bottleneck, [3, 8, 36, 3], num_classes=Num_classes)


def test():
    """Smoke test: forward one random image and print the output shape."""
    model = ResNet18()
    out = model(torch.randn(1, 3, 32, 32))
    print(out.size())

# test()


================================================
FILE: GC_code/CIFAR100/models/resnext.py
================================================
'''ResNeXt in PyTorch.

See the paper "Aggregated Residual Transformations for Deep Neural Networks" for more details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class Block(nn.Module):
    '''ResNeXt grouped-convolution bottleneck block.'''
    expansion = 2

    def __init__(self, in_planes, cardinality=32, bottleneck_width=4, stride=1):
        super(Block, self).__init__()
        group_width = cardinality * bottleneck_width
        # 1x1 reduce -> 3x3 grouped conv -> 1x1 expand (x2).
        self.conv1 = nn.Conv2d(in_planes, group_width, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(group_width)
        self.conv2 = nn.Conv2d(group_width, group_width, kernel_size=3, stride=stride, padding=1, groups=cardinality, bias=False)
        self.bn2 = nn.BatchNorm2d(group_width)
        self.conv3 = nn.Conv2d(group_width, self.expansion*group_width, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.expansion*group_width)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*group_width:
            # Projection shortcut when spatial size or channel count changes.
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*group_width, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*group_width)
            )

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)))
        y = F.relu(self.bn2(self.conv2(y)))
        y = self.bn3(self.conv3(y))
        return F.relu(y + self.shortcut(x))


class ResNeXt(nn.Module):
    """ResNeXt for 32x32 inputs: three stages of grouped-conv Blocks (no fourth stage)."""

    def __init__(self, num_blocks, cardinality, bottleneck_width, num_classes=10):
        super(ResNeXt, self).__init__()
        self.cardinality = cardinality
        self.bottleneck_width = bottleneck_width
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(num_blocks[0], 1)
        self.layer2 = self._make_layer(num_blocks[1], 2)
        self.layer3 = self._make_layer(num_blocks[2], 2)
        # Final width = expansion * cardinality * (bottleneck_width * 4)
        # = cardinality * bottleneck_width * 8 after the two doublings below.
        self.linear = nn.Linear(cardinality*bottleneck_width*8, num_classes)

    def _make_layer(self, num_blocks, stride):
        # Only the first block of a stage downsamples.
        blocks = []
        for s in [stride] + [1]*(num_blocks-1):
            blocks.append(Block(self.in_planes, self.cardinality, self.bottleneck_width, s))
            self.in_planes = Block.expansion * self.cardinality * self.bottleneck_width
        # Double the bottleneck width for the next stage.
        self.bottleneck_width *= 2
        return nn.Sequential(*blocks)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3):
            out = stage(out)
        out = F.avg_pool2d(out, 8)
        return self.linear(out.view(out.size(0), -1))


def ResNeXt29_2x64d(Num_classes=10):
    """ResNeXt-29 with cardinality 2, bottleneck width 64."""
    return ResNeXt(num_blocks=[3, 3, 3], cardinality=2, bottleneck_width=64, num_classes=Num_classes)

def ResNeXt29_4x64d(Num_classes=10):
    """ResNeXt-29 with cardinality 4, bottleneck width 64."""
    return ResNeXt(num_blocks=[3, 3, 3], cardinality=4, bottleneck_width=64, num_classes=Num_classes)

def ResNeXt29_8x64d(Num_classes=10):
    """ResNeXt-29 with cardinality 8, bottleneck width 64."""
    return ResNeXt(num_blocks=[3, 3, 3], cardinality=8, bottleneck_width=64, num_classes=Num_classes)

def ResNeXt29_32x4d(Num_classes=10):
    """ResNeXt-29 with cardinality 32, bottleneck width 4."""
    return ResNeXt(num_blocks=[3, 3, 3], cardinality=32, bottleneck_width=4, num_classes=Num_classes)

def test_resnext():
    """Smoke test: forward one random image and print the output shape."""
    model = ResNeXt29_2x64d()
    out = model(torch.randn(1, 3, 32, 32))
    print(out.size())

# test_resnext()


================================================
FILE: GC_code/CIFAR100/models/senet.py
================================================
'''SENet in PyTorch.

SENet is the winner of ImageNet-2017. See "Squeeze-and-Excitation Networks" (arXiv:1709.01507) for details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicBlock(nn.Module):
    """SE residual basic block: two 3x3 convs with squeeze-and-excitation channel re-weighting."""

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes)
            )

        # SE layers: 1x1 convs play the role of the FC squeeze/excite pair.
        # NOTE(review): planes//16 assumes planes >= 16, otherwise the bottleneck has 0 channels.
        self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1)  # Use nn.Conv2d instead of nn.Linear
        self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # Squeeze: global average pool to one value per channel.
        w = F.avg_pool2d(out, out.size(2))
        w = F.relu(self.fc1(w))
        # Fix: F.sigmoid is deprecated (and removed in recent torch); use torch.sigmoid.
        w = torch.sigmoid(self.fc2(w))
        # Excitation: per-channel re-weighting via broadcasting.
        out = out * w

        out += self.shortcut(x)
        out = F.relu(out)
        return out


class PreActBlock(nn.Module):
    """Pre-activation basic block with squeeze-and-excitation channel re-weighting."""

    def __init__(self, in_planes, planes, stride=1):
        super(PreActBlock, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)

        # Projection shortcut only when the shape changes; otherwise the
        # attribute is absent and forward() uses the identity.
        if stride != 1 or in_planes != planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False)
            )

        # SE layers: 1x1 convs play the role of the FC squeeze/excite pair.
        # NOTE(review): planes//16 assumes planes >= 16.
        self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1)
        self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1)

    def forward(self, x):
        out = F.relu(self.bn1(x))
        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
        out = self.conv1(out)
        out = self.conv2(F.relu(self.bn2(out)))

        # Squeeze: global average pool to one value per channel.
        w = F.avg_pool2d(out, out.size(2))
        w = F.relu(self.fc1(w))
        # Fix: F.sigmoid is deprecated (and removed in recent torch); use torch.sigmoid.
        w = torch.sigmoid(self.fc2(w))
        # Excitation: per-channel re-weighting via broadcasting.
        out = out * w

        out += shortcut
        return out


class SENet(nn.Module):
    """SENet backbone for 32x32 inputs; `block` supplies the SE residual unit."""

    def __init__(self, block, num_blocks, num_classes=10):
        super(SENet, self).__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block,  64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        # Only the first block of a stage downsamples; no expansion factor here.
        blocks = []
        for s in [stride] + [1]*(num_blocks-1):
            blocks.append(block(self.in_planes, planes, s))
            self.in_planes = planes
        return nn.Sequential(*blocks)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            out = stage(out)
        out = F.avg_pool2d(out, 4)
        return self.linear(out.view(out.size(0), -1))


def SENet18():
    """SENet-18: pre-activation SE blocks, [2, 2, 2, 2] per stage."""
    return SENet(PreActBlock, [2, 2, 2, 2])


def test():
    """Smoke test: forward one random image and print the output shape."""
    model = SENet18()
    out = model(torch.randn(1, 3, 32, 32))
    print(out.size())

# test()


================================================
FILE: GC_code/CIFAR100/models/shufflenet.py
================================================
'''ShuffleNet in PyTorch.

See the paper "ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices" for more details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class ShuffleBlock(nn.Module):
    """Channel shuffle for grouped convolutions (ShuffleNet)."""

    def __init__(self, groups):
        super(ShuffleBlock, self).__init__()
        self.groups = groups

    def forward(self, x):
        '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,W] -> [N,C,H,W]'''
        N, C, H, W = x.size()
        g = self.groups
        # Fix: use integer division.  In Python 3, C / g is a float and
        # Tensor.view() rejects non-integer sizes (this was Python 2 code).
        return x.view(N, g, C // g, H, W).permute(0, 2, 1, 3, 4).contiguous().view(N, C, H, W)


class Bottleneck(nn.Module):
    """ShuffleNet unit: grouped 1x1, channel shuffle, 3x3 depthwise, grouped 1x1.

    stride 1: residual add; stride 2: concat with an avg-pooled shortcut
    (so the conv path only produces the extra channels).
    """

    def __init__(self, in_planes, out_planes, stride, groups):
        super(Bottleneck, self).__init__()
        self.stride = stride

        # Fix: integer division.  In Python 3, out_planes/4 is a float and
        # nn.Conv2d rejects non-integer channel counts (this was Python 2 code).
        mid_planes = out_planes // 4
        # The stem outputs 24 channels, too few to split into groups.
        g = 1 if in_planes == 24 else groups
        self.conv1 = nn.Conv2d(in_planes, mid_planes, kernel_size=1, groups=g, bias=False)
        self.bn1 = nn.BatchNorm2d(mid_planes)
        self.shuffle1 = ShuffleBlock(groups=g)
        # groups == channels: depthwise 3x3.
        self.conv2 = nn.Conv2d(mid_planes, mid_planes, kernel_size=3, stride=stride, padding=1, groups=mid_planes, bias=False)
        self.bn2 = nn.BatchNorm2d(mid_planes)
        self.conv3 = nn.Conv2d(mid_planes, out_planes, kernel_size=1, groups=groups, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        self.shortcut = nn.Sequential()
        if stride == 2:
            self.shortcut = nn.Sequential(nn.AvgPool2d(3, stride=2, padding=1))

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.shuffle1(out)
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        res = self.shortcut(x)
        # Downsampling units concatenate; stride-1 units add.
        out = F.relu(torch.cat([out, res], 1)) if self.stride == 2 else F.relu(out + res)
        return out


class ShuffleNet(nn.Module):
    """ShuffleNet for 32x32 inputs (10 classes); `cfg` gives widths, depths, and group count."""

    def __init__(self, cfg):
        super(ShuffleNet, self).__init__()
        out_planes = cfg['out_planes']
        num_blocks = cfg['num_blocks']
        groups = cfg['groups']

        self.conv1 = nn.Conv2d(3, 24, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(24)
        self.in_planes = 24
        self.layer1 = self._make_layer(out_planes[0], num_blocks[0], groups)
        self.layer2 = self._make_layer(out_planes[1], num_blocks[1], groups)
        self.layer3 = self._make_layer(out_planes[2], num_blocks[2], groups)
        self.linear = nn.Linear(out_planes[2], 10)

    def _make_layer(self, out_planes, num_blocks, groups):
        layers = []
        for i in range(num_blocks):
            if i == 0:
                # First unit downsamples and concatenates its shortcut, so its
                # conv path only produces the extra (out - in) channels.
                layers.append(Bottleneck(self.in_planes, out_planes - self.in_planes, stride=2, groups=groups))
            else:
                layers.append(Bottleneck(self.in_planes, out_planes, stride=1, groups=groups))
            self.in_planes = out_planes
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3):
            out = stage(out)
        out = F.avg_pool2d(out, 4)
        return self.linear(out.view(out.size(0), -1))


def ShuffleNetG2():
    """ShuffleNet with 2 groups."""
    return ShuffleNet({
        'out_planes': [200, 400, 800],
        'num_blocks': [4, 8, 4],
        'groups': 2,
    })

def ShuffleNetG3():
    """ShuffleNet with 3 groups."""
    return ShuffleNet({
        'out_planes': [240, 480, 960],
        'num_blocks': [4, 8, 4],
        'groups': 3,
    })


def test():
    """Smoke test: run a single random 32x32 image through ShuffleNetG2."""
    model = ShuffleNetG2()
    out = model(torch.randn(1, 3, 32, 32))
    print(out)

# test()


================================================
FILE: GC_code/CIFAR100/models/vgg.py
================================================
'''VGG11/13/16/19 in Pytorch.'''
import torch
import torch.nn as nn


# VGG layer configurations: each int is a 3x3 conv's output channel count,
# 'M' inserts a 2x2 max-pool (stride 2).  Five pools reduce 32x32 to 1x1.
cfg = {
    'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}


class VGG(nn.Module):
    """Configurable VGG backbone (BN variant) for 32x32 inputs."""

    def __init__(self, vgg_name, Num_classes=100):
        super(VGG, self).__init__()
        self.features = self._make_layers(cfg[vgg_name])
        self.classifier = nn.Linear(512, Num_classes)

    def forward(self, x):
        feats = self.features(x)
        flat = feats.view(feats.size(0), -1)
        return self.classifier(flat)

    def _make_layers(self, cfg):
        # Build conv-BN-ReLU stacks per the config; 'M' is a 2x2 max-pool.
        layers = []
        in_channels = 3
        for spec in cfg:
            if spec == 'M':
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
            else:
                layers.extend([
                    nn.Conv2d(in_channels, spec, kernel_size=3, padding=1),
                    nn.BatchNorm2d(spec),
                    nn.ReLU(inplace=True),
                ])
                in_channels = spec
        # No-op pool kept for checkpoint/layer-count compatibility.
        layers.append(nn.AvgPool2d(kernel_size=1, stride=1))
        return nn.Sequential(*layers)


def test():
    """Smoke test: forward two random images and print the output shape."""
    model = VGG('VGG11')
    out = model(torch.randn(2, 3, 32, 32))
    print(out.size())

# test()


================================================
FILE: GC_code/CIFAR100/os_run.py
================================================

import os,time

# CIFAR-100 benchmark driver: trains ResNet-50 once with plain SGD and once
# with SGD + Gradient Centralization (alg sgdGC), same hyper-parameters
# (lr 0.1, weight decay 5e-4, 200 epochs).  Each os.system call blocks until
# its run finishes; stdout goes to the logout/ directory.

os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd --epochs 200  --model r50 > logout/r50_lr11_wd45_sgd.log ")

os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC --epochs 200  --model r50 > logout/r50_lr11_wd45_sgdGC.log ")


================================================
FILE: GC_code/Fine-grained_classification/SGD.py
================================================
import torch
from torch.optim.optimizer import Optimizer, required



class SGD_GCC(Optimizer):
    r"""SGD (with momentum/Nesterov) plus Gradient Centralization on conv layers.

    Before the momentum update, gradients of parameters with more than 3
    dimensions (i.e. conv kernels) have their per-filter mean subtracted
    (GCC: Gradient Centralization for Conv layers only).

    Arguments:
        params: iterable of parameters or parameter groups.
        lr (float): learning rate (required).
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): L2 penalty (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGD_GCC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                if weight_decay != 0:
                    # Fix: the positional Tensor.add_(Number, Tensor) overload is
                    # deprecated (removed in newer torch); use the alpha= keyword.
                    d_p.add_(p.data, alpha=weight_decay)

                # GC operation for Conv layers: subtract the mean over all
                # non-output dimensions (per-filter centering).
                if d_p.dim() > 3:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, d_p.dim())), keepdim=True))

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])

        return loss

class SGD_GC(Optimizer):
    r"""SGD (with momentum/Nesterov) plus Gradient Centralization on conv AND FC layers.

    Before the momentum update, gradients of any parameter with more than 1
    dimension have their mean over the non-output dimensions subtracted.

    Arguments:
        params: iterable of parameters or parameter groups.
        lr (float): learning rate (required).
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): L2 penalty (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGD_GC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                if weight_decay != 0:
                    # Fix: the positional Tensor.add_(Number, Tensor) overload is
                    # deprecated (removed in newer torch); use the alpha= keyword.
                    d_p.add_(p.data, alpha=weight_decay)

                # GC operation for Conv AND FC layers (any parameter with
                # ndim > 1): subtract the mean over non-output dimensions.
                if d_p.dim() > 1:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, d_p.dim())), keepdim=True))

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])

        return loss


class SGDW(Optimizer):
    r"""SGD with decoupled weight decay (SGDW, Loshchilov & Hutter).

    Unlike plain SGD, weight decay is NOT added to the gradient (and therefore
    never enters the momentum buffer); the parameter is shrunk directly by
    ``lr * weight_decay * p_old`` after the gradient step, where ``p_old`` is
    the parameter value before the step.

    Arguments:
        params: iterable of parameters or parameter groups.
        lr (float): learning rate (required).
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): decoupled decay factor (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGDW, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # Snapshot before the gradient step; the decoupled decay below
                # shrinks relative to this pre-step value.
                old = torch.clone(p.data).detach()

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        # Fix: the positional Tensor.add_(Number, Tensor) overload is
                        # deprecated (removed in newer torch); use the alpha= keyword.
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])

                # Decoupled weight decay (applied to the pre-step parameters).
                if weight_decay != 0:
                    p.data.add_(old, alpha=-weight_decay * group['lr'])

        return loss



class SGDW_GCC(Optimizer):
    r"""SGD with decoupled weight decay (SGDW) plus Gradient Centralization
    applied to convolutional layers.

    Two deviations from plain SGD:
      * Weight decay is decoupled: it is subtracted from the weights
        directly after the gradient step instead of being folded into the
        gradient/momentum buffer.
      * Gradients of parameters with more than 3 dimensions (conv kernels)
        are centralized: the mean over all non-output-channel dimensions is
        subtracted before the momentum update.

    Arguments:
        params (iterable): parameters to optimize or dicts of param groups.
        lr (float): learning rate.
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): decoupled weight decay (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGDW_GCC, self).__setstate__(state)
        for group in self.param_groups:
            # Checkpoints saved before the 'nesterov' option lack the key.
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # Pre-step weights; used by the decoupled decay below.
                old = torch.clone(p.data).detach()

                # GC operation for Conv layers: remove the mean over all
                # dimensions except the output-channel dimension.
                if d_p.dim() > 3:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, d_p.dim())),
                                       keepdim=True))

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: buffer takes the raw gradient
                        # (dampening is intentionally not applied here).
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        # Keyword `alpha` overload replaces the deprecated
                        # scalar-first form of add_().
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])

                # Decoupled weight decay (SGDW).
                if weight_decay != 0:
                    p.data.add_(old, alpha=-weight_decay * group['lr'])

        return loss


================================================
FILE: GC_code/Fine-grained_classification/main.py
================================================
import argparse
import os
import random
import shutil
import time
import warnings
import sys

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models

from torch.optim import lr_scheduler

from SGD import SGD_GC #import SGD with GC


# All lowercase, callable entries of torchvision.models — valid --arch values.
model_names = sorted(name for name in models.__dict__
    if name.islower() and not name.startswith("__")
    and callable(models.__dict__[name]))

parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')

parser.add_argument('-b', '--batch-size', default=256, type=int,
                    metavar='N',
                    help='mini-batch size (default: 256), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')

parser.add_argument('--lr', '--learning-rate', default=0.1*128/128, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')

parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50',
                    choices=model_names,
                    help='model architecture: ' +
                        ' | '.join(model_names) +
                        ' (default: resnet50)')

parser.add_argument('data', metavar='DIR',
                    help='path to dataset')

parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                    help='number of data loading workers (default: 4)')
parser.add_argument('--epochs', default=100, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')

parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)',
                    dest='weight_decay')
parser.add_argument('-p', '--print-freq', default=100, type=int,
                    metavar='N', help='print frequency (default: 100)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
                    help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')
parser.add_argument('--model', default='r50p', type=str,
                    help='backbone variant: r18 | r18p | r50 | r50p (p = ImageNet-pretrained)')

parser.add_argument('--path', default='test', type=str,
                    help='tag used when saving the trained model')
parser.add_argument('--alg', default='sgd', type=str,
                    help='optimizer: sgd | sgdGC')

parser.add_argument('--dataset', default='cub', type=str,
                    help='dataset: cub | cars | dogs | fgvc')

# Best top-1 validation accuracy observed so far; updated by main_worker().
best_acc1 = 0

def main():
    """Entry point: parse CLI arguments and launch single- or multi-process
    training via main_worker()."""
    args = parser.parse_args()
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

    if args.seed is not None:
        # Deterministic CUDNN trades speed for reproducibility.
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    ngpus_per_node = torch.cuda.device_count()
    if not args.multiprocessing_distributed:
        # Single process drives all visible GPUs (DataParallel inside).
        main_worker(args.gpu, ngpus_per_node, args)
        return

    # One process per GPU: scale world_size accordingly and spawn workers;
    # each spawned process runs main_worker with its GPU index.
    args.world_size = ngpus_per_node * args.world_size
    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))


def main_worker(gpu, ngpus_per_node, args):
    """Build the model, optimizers and data loaders, then run training.

    Runs in one process (optionally one per GPU when spawned by main()).

    Arguments:
        gpu (int or None): GPU index assigned to this worker; None lets
            DataParallel use every visible device.
        ngpus_per_node (int): number of GPUs on this node.
        args: parsed command-line arguments.
    """
    global best_acc1
    args.gpu = gpu

    # Class counts of the supported fine-grained datasets.
    class_num = {'cub': 200, 'cars': 196, 'dogs': 120, 'fgvc': 100}
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    # Create the backbone and replace its classifier head.  The head width
    # is read from the model itself (512 for resnet18, 2048 for resnet50)
    # instead of being hard-coded: the old code passed in_features=2048 to
    # the resnet18 variants, which fails at the first forward pass.
    if args.model == 'r18p':
        model = models.resnet18(pretrained=True)
    elif args.model == 'r18':
        model = models.resnet18()
    elif args.model == 'r50p':
        model = models.resnet50(pretrained=True)
    elif args.model == 'r50':
        model = models.resnet50()
    else:
        raise ValueError("Unknown model: {}".format(args.model))
    model.fc = nn.Linear(in_features=model.fc.in_features,
                         out_features=class_num[args.dataset], bias=True)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model)
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    # Build two parameter groups for every supported backbone (the old code
    # only built them for the r50 variants and raised NameError otherwise):
    # the fresh classifier head trains at the full rate, the backbone at 0.1x.
    new_param_ids = set(map(id, model.module.fc.parameters()))
    base_params = [p for p in model.parameters() if id(p) not in new_param_ids]
    param_groups_base = [{'params': base_params, 'lr_mult': 0.1}]
    param_groups_new = [{'params': model.module.fc.parameters(), 'lr_mult': 1.0}]

    if args.alg == 'sgd':
        optimizer_base = torch.optim.SGD(param_groups_base, args.lr,
                                         momentum=args.momentum, weight_decay=args.weight_decay)
        optimizer_new = torch.optim.SGD(param_groups_new, args.lr,
                                        momentum=args.momentum, weight_decay=args.weight_decay)
    elif args.alg == 'sgdGC':
        optimizer_base = SGD_GC(param_groups_base, args.lr,
                                momentum=args.momentum, weight_decay=args.weight_decay)
        optimizer_new = SGD_GC(param_groups_new, args.lr,
                               momentum=args.momentum, weight_decay=args.weight_decay)
    else:
        raise ValueError("Unknown algorithm: {}".format(args.alg))

    exp_lr_scheduler_new = lr_scheduler.MultiStepLR(optimizer_new, milestones=[50, 80], gamma=0.1)
    exp_lr_scheduler_base = lr_scheduler.MultiStepLR(optimizer_base, milestones=[50, 80], gamma=0.1)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            # Restore optimizer state only when present; the checkpoints
            # written by save_checkpoint() below do not include it.  (The
            # old code unconditionally used an undefined name `optimizer`
            # here, which raised NameError on every resume.)
            if 'optimizer_base' in checkpoint:
                optimizer_base.load_state_dict(checkpoint['optimizer_base'])
            if 'optimizer_new' in checkpoint:
                optimizer_new.load_state_dict(checkpoint['optimizer_new'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
                    transforms.Resize(512),
                    transforms.RandomHorizontalFlip(),
                    transforms.CenterCrop(448),
                    transforms.ToTensor(),
                    normalize,
                ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
                    transforms.Resize(512),
                    transforms.CenterCrop(448),
                    transforms.ToTensor(),
                    normalize,
                ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True, drop_last=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            # Reshuffle the shards so each epoch sees a new ordering per rank.
            train_sampler.set_epoch(epoch)

        # train for one epoch, then decay both learning-rate schedules
        train(train_loader, model, criterion, optimizer_base, optimizer_new, epoch, args)
        exp_lr_scheduler_new.step()
        exp_lr_scheduler_base.step()

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        # Only the rank-0 process (or the single process) writes checkpoints.
        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
            }, is_best)


# train
def train(train_loader, model, criterion, optimizer_base, optimizer_new, epoch, args):
    """Run one training epoch, stepping both optimizers on every batch."""
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    total = 0
    train_loss = 0
    correct = 0

    # switch to train mode (enable dropout / batch-norm updates)
    model.train()
    print('\nEpoch: %d' % epoch)

    tic = time.time()
    for batch_idx, (input, target) in enumerate(train_loader):
        # time spent waiting on the data pipeline
        data_time.update(time.time() - tic)

        input, target = input.to('cuda'), target.to('cuda')

        # forward pass
        output = model(input)
        loss = criterion(output, target)

        # bookkeeping: running loss / accuracy meters
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        batch_count = input.size(0)
        losses.update(loss.item(), batch_count)
        top1.update(acc1[0], batch_count)
        top5.update(acc5[0], batch_count)

        _, predicted = output.max(1)
        correct += predicted.eq(target).sum().item()
        train_loss += loss.item()
        total += target.size(0)

        # backprop once, then update head and backbone groups separately
        optimizer_new.zero_grad()
        optimizer_base.zero_grad()
        loss.backward()
        optimizer_new.step()
        optimizer_base.step()

        # wall-clock time for the whole batch
        batch_time.update(time.time() - tic)
        tic = time.time()
    print('Training: Top1: {top1.avg:.4f}|Top5:{top5.avg:.4f}'.format(top1=top1, top5=top5))

# test
def validate(val_loader, model, criterion, args):
    """Evaluate `model` on `val_loader` and return the average top-1 accuracy."""
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    val_loss = 0
    total = 0
    correct = 0
    # switch to evaluate mode (disable dropout, freeze batch-norm statistics)
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (input, target) in enumerate(val_loader):
            if args.gpu is not None:
                input = input.cuda(args.gpu, non_blocking=True)
            target = target.cuda(args.gpu, non_blocking=True)

            # compute output
            output = model(input)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), input.size(0))
            top1.update(acc1[0], input.size(0))
            top5.update(acc5[0], input.size(0))

            _, predicted = output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()

            # Accumulate the loss exactly once per batch (the old code added
            # it twice, double-counting every batch).
            val_loss += loss.item()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

        print('Testing: Top1: {top1.avg:.4f}|Top5:{top5.avg:.4f}'.format(top1=top1, top5=top5))
    return top1.avg


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Serialize `state` to `filename`; mirror it to 'model_best.pth.tar'
    when this checkpoint is the best one seen so far."""
    torch.save(state, filename)
    if not is_best:
        return
    shutil.copyfile(filename, 'model_best.pth.tar')


class AverageMeter(object):
    """Tracks the most recent value and a running (weighted) average."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics."""
        self.val = 0    # most recent value
        self.avg = 0    # running average = sum / count
        self.sum = 0    # weighted sum of all observed values
        self.count = 0  # total weight observed so far

    def update(self, val, n=1):
        """Record `val` observed with weight `n` (e.g. the batch size)."""
        self.val = val
        self.sum = self.sum + val * n
        self.count = self.count + n
        self.avg = self.sum / self.count


def adjust_learning_rate(optimizer, epoch, args):
    """Step-decay schedule: divide the initial LR by 10 every 30 epochs."""
    decayed = args.lr * (0.1 ** (epoch // 30))
    for group in optimizer.param_groups:
        group['lr'] = decayed


def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k.

    Arguments:
        output (Tensor): model scores of shape (batch, num_classes).
        target (Tensor): ground-truth class indices of shape (batch,).
        topk (tuple of int): which top-k accuracies to compute.

    Returns:
        list of Tensor: one single-element tensor per k, each holding the
        fraction (in [0, 1]) of samples whose target is among the top-k
        predictions.
    """
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        # pred: (maxk, batch) indices of the top-k classes per sample.
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # reshape(-1) instead of view(-1): the slice can be
            # non-contiguous on newer PyTorch versions, where view() raises.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(1.0 / batch_size))
        return res


if __name__ == '__main__':
    # Only run training when invoked as a script, not on import.
    main()


================================================
FILE: GC_code/Fine-grained_classification/os_run.py
================================================

import os,time  # NOTE(review): `time` is currently unused here

# Sequentially launch the fine-grained classification experiments.
# Each command trains an ImageNet-pretrained ResNet-50 ('r50p') with batch
# size 128, once with plain SGD and once with SGD+GC ('sgdGC'), redirecting
# stdout to a log file under logout/.
# NOTE(review): dataset paths are machine-specific — adjust before running,
# and make sure the logout/ directory exists.

# CUB-200-2011 birds (200 classes)
os.system("nohup python -W ignore main.py /home/yonghw/data/data/CUB_200_2011/ --model r50p -b 128 --alg sgd --dataset cub  > logout/Cub_r50p_sgd_b128_g4.log ")
os.system("nohup python -W ignore main.py /home/yonghw/data/data/CUB_200_2011/ --model r50p -b 128  --alg sgdGC --dataset cub > logout/Cub_r50p_sgdGC_b128_g4.log ")

# Stanford Cars (196 classes)
os.system("nohup python -W ignore main.py /home/yonghw/data/data/Car196/ --model r50p -b 128 --alg sgd --dataset cars > logout/Car_r50p_sgd_b128_g4.log ")
os.system("nohup python -W ignore main.py /home/yonghw/data/data/Car196/ --model r50p -b 128 --alg sgdGC --dataset cars> logout/Car_r50p_sgdGC_b128_g4.log ")

# FGVC-Aircraft (100 classes)
os.system("nohup python -W ignore main.py /home/yonghw/data/data/fgvc_aricraft/ --model r50p  -b 128 --alg sgd --dataset fgvc > logout/Ari_r50p_sgd_b128_g4.log ")
os.system("nohup python -W ignore main.py /home/yonghw/data/data/fgvc_aricraft/ --model r50p  -b 128 --alg sgdGC --dataset fgvc > logout/Ari_r50p_sgdGC_b128_g4.log ")

# Stanford Dogs (120 classes)
os.system("nohup python -W ignore main.py /home/yonghw/data/data/StanfordDogs/ --model r50p  -b 128  --alg sgd --dataset dogs > logout/Dog_r50p_sgd_b128_g4.log ")
os.system("nohup python -W ignore main.py /home/yonghw/data/data/StanfordDogs/ --model r50p  -b 128  --alg sgdGC --dataset dogs > logout/Dog_r50p_sgdGC_b128_g4.log ")


================================================
FILE: GC_code/ImageNet/SGD.py
================================================
import torch
from torch.optim.optimizer import Optimizer, required



class SGD_GCC(Optimizer):
    r"""SGD with momentum where Gradient Centralization (GC) is applied to
    convolutional layers only.

    Before the momentum update, the gradient of every parameter with more
    than 3 dimensions (conv kernels) has the mean over all
    non-output-channel dimensions subtracted.  Weight decay is the standard
    L2 form, folded into the gradient.

    Arguments:
        params (iterable): parameters to optimize or dicts of param groups.
        lr (float): learning rate.
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): L2 penalty (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGD_GCC, self).__setstate__(state)
        for group in self.param_groups:
            # Checkpoints saved before the 'nesterov' option lack the key.
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # L2 weight decay folded into the gradient.  The keyword
                # `alpha` overload replaces the deprecated scalar-first form.
                if weight_decay != 0:
                    d_p.add_(p.data, alpha=weight_decay)

                # GC operation for Conv layers: subtract the mean over all
                # dimensions except the output-channel dimension.
                if d_p.dim() > 3:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, d_p.dim())),
                                       keepdim=True))

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: buffer is a copy of the gradient.
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])

        return loss

class SGD_GC(Optimizer):
    r"""SGD with momentum where Gradient Centralization (GC) is applied to
    both convolutional and fully-connected layers.

    Before the momentum update, the gradient of every parameter with more
    than 1 dimension (conv kernels and FC weight matrices) has the mean
    over all non-output dimensions subtracted.  Weight decay is the
    standard L2 form, folded into the gradient.

    Arguments:
        params (iterable): parameters to optimize or dicts of param groups.
        lr (float): learning rate.
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): L2 penalty (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGD_GC, self).__setstate__(state)
        for group in self.param_groups:
            # Checkpoints saved before the 'nesterov' option lack the key.
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # L2 weight decay folded into the gradient.  The keyword
                # `alpha` overload replaces the deprecated scalar-first form.
                if weight_decay != 0:
                    d_p.add_(p.data, alpha=weight_decay)

                # GC operation for Conv layers and FC layers: subtract the
                # mean over all dimensions except the first (output) one.
                if d_p.dim() > 1:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, d_p.dim())),
                                       keepdim=True))

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: buffer is a copy of the gradient.
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])

        return loss


class SGDW(Optimizer):
    r"""SGD with decoupled weight decay (SGDW).

    Unlike standard SGD, the weight-decay term is not folded into the
    gradient/momentum buffer; after the gradient step, the weights are
    shrunk directly in proportion to their pre-step values.

    Arguments:
        params (iterable): parameters to optimize or dicts of param groups.
        lr (float): learning rate.
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): decoupled weight decay (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGDW, self).__setstate__(state)
        for group in self.param_groups:
            # Checkpoints saved before the 'nesterov' option lack the key.
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # Pre-step weights; used by the decoupled decay below.
                old = torch.clone(p.data).detach()

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: buffer takes the raw gradient
                        # (dampening is intentionally not applied here).
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        # Keyword `alpha` overload replaces the deprecated
                        # scalar-first form of add_().
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])

                # Decoupled weight decay (SGDW).
                if weight_decay != 0:
                    p.data.add_(old, alpha=-weight_decay * group['lr'])

        return loss



class SGDW_GCC(Optimizer):
    r"""SGDW (SGD with decoupled weight decay) plus Gradient Centralization
    applied to convolutional layers.

    Two deviations from plain SGD:
      * Weight decay is decoupled: it is subtracted from the weights
        directly after the gradient step instead of being folded into the
        gradient/momentum buffer.
      * Gradients of parameters with more than 3 dimensions (conv kernels)
        are centralized: the mean over all non-output-channel dimensions is
        subtracted before the momentum update.

    Arguments:
        params (iterable): parameters to optimize or dicts of param groups.
        lr (float): learning rate.
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): decoupled weight decay (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGDW_GCC, self).__setstate__(state)
        for group in self.param_groups:
            # Checkpoints saved before the 'nesterov' option lack the key.
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # Pre-step weights; used by the decoupled decay below.
                old = torch.clone(p.data).detach()

                # GC operation for Conv layers: remove the mean over all
                # dimensions except the output-channel dimension.
                if d_p.dim() > 3:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, d_p.dim())),
                                       keepdim=True))

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: buffer takes the raw gradient
                        # (dampening is intentionally not applied here).
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        # Keyword `alpha` overload replaces the deprecated
                        # scalar-first form of add_().
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])

                # Decoupled weight decay (SGDW).
                if weight_decay != 0:
                    p.data.add_(old, alpha=-weight_decay * group['lr'])

        return loss


================================================
FILE: GC_code/ImageNet/main.py
================================================
import argparse
import os
import random
import shutil
import time
import warnings
import sys
#nohup python -W ignore main.py /mnt/v0/ --model r50bn --alg sgd1 -b 256 --gpug 1 --path r50bn_sgd1_b256_g4 > logout/r50bn_sgd1_b256_g4.log
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
#from myresnet_nbn import resnet18_nbn, resnet101_nbn,resnet50_nbn
from myresnet import resnet50, resnet101
from myresnetgn import resnet50gn, resnet101gn


from torch.optim import lr_scheduler


from SGD import SGD_GCC #import SGD with GC for Conv layer


# All lowercase, callable entries in torchvision.models; used only to
# validate the --arch flag below.
model_names = sorted(name for name in models.__dict__
    if name.islower() and not name.startswith("__")
    and callable(models.__dict__[name]))

parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')

parser.add_argument('-b', '--batch-size', default=256, type=int,
                    metavar='N',
                    help='mini-batch size (default: 256), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')

parser.add_argument('--lr', '--learning-rate', default=0.1*128/128, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')

# NOTE(review): --arch is validated and stored in checkpoints, but the network
# actually constructed in main_worker is chosen by --model, not --arch.
parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50',
                    choices=model_names,
                    help='model architecture: ' +
                        ' | '.join(model_names) +
                        ' (default: resnet18)')

parser.add_argument('data', metavar='DIR',
                    help='path to dataset')

parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                    help='number of data loading workers (default: 4)')
parser.add_argument('--epochs', default=100, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')

parser.add_argument('--bgn', default=1, type=int, help='bn group number')

parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)',
                    dest='weight_decay')
parser.add_argument('-p', '--print-freq', default=100, type=int,
                    metavar='N', help='print frequency (default: 10)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
                    help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')
# --model selects the network built in main_worker (r50bn/r50gn/r101bn/r101gn);
# --path names the saved model file; --alg selects the optimizer (sgd/sgdGC).
parser.add_argument('--model', default='r50bn', type=str, help='model')
parser.add_argument('--path', default='test', type=str, help='model')
parser.add_argument('--alg', default='sgd', type=str, help='algorithm')


# Best top-1 validation accuracy observed so far; updated in main_worker.
best_acc1 = 0
# NOTE(review): lists 8 GPUs, but main() restricts CUDA_VISIBLE_DEVICES to
# 4 devices -- this variable appears unused; confirm before relying on it.
device_ids=[0,1,2,3,4,5,6,7]

def main():
    """Parse CLI args, configure seeding/visibility, and launch training.

    Either spawns one process per GPU (when --multiprocessing-distributed
    is set) or runs main_worker directly in this process.
    """
    args = parser.parse_args()
    # Restrict training to the first four GPUs. Must happen before any CUDA
    # call (e.g. torch.cuda.device_count below) to take effect.
    os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3"


    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    # Distributed mode if more than one node, or explicitly requested.
    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)


def main_worker(gpu, ngpus_per_node, args):
    """Build model/optimizer/data loaders and run the train/validate loop.

    Runs once per process: directly from main() in the single-process case,
    or once per GPU under mp.spawn in multiprocessing-distributed mode
    (then `gpu` is the local GPU index).
    """
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
    # create model
    # NOTE(review): if args.model matches none of the four options, `model`
    # is never bound and the code below raises NameError.
    if args.model=='r50bn':
      model = resnet50()
    if args.model=='r50gn':
      model = resnet50gn()

    if args.model=='r101bn':
      model = resnet101()
    if args.model=='r101gn':
      model = resnet101gn()



    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model)
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    
    # choose optimizer
    # NOTE(review): as with --model above, an unrecognized --alg leaves
    # `optimizer` unbound.
    if args.alg=='sgd':
      optimizer =torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum,weight_decay=args.weight_decay)
    if args.alg=='sgdGC':
      optimizer = SGD_GCC(model.parameters(), args.lr, momentum=args.momentum,weight_decay=args.weight_decay)

    # Decay LR by 10x every 30 epochs (stepped once per epoch below).
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    # Standard ImageNet channel statistics.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler,drop_last=True)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True,drop_last=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            # Re-shuffle the sampler's partitioning each epoch.
            train_sampler.set_epoch(epoch)
        #adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)
        exp_lr_scheduler.step()
        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer' : optimizer.state_dict(),
            }, is_best)
        # NOTE(review): saves the whole module every epoch; fails unless the
        # ./result_model/ directory already exists -- confirm it is created.
        torch.save(model.module, './result_model/'+args.path+'.pth')

# train
def train(train_loader, model, criterion, optimizer, epoch, args):
    """Train `model` for one epoch over `train_loader`.

    Moves each batch to CUDA, runs forward/backward/optimizer-step, and
    prints the epoch-level top-1/top-5 training accuracy at the end.

    Args:
        train_loader: iterable of (images, target) batches.
        model: network in train mode (set here).
        criterion: loss function.
        optimizer: optimizer whose step() is called per batch.
        epoch: current epoch index (printed only).
        args: parsed CLI namespace (unused here beyond convention).
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    # switch to train mode
    model.train()
    print('\nEpoch: %d' % epoch)
    end = time.time()
    # Renamed loop variable from `input` (shadowed the builtin) to `images`.
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # NOTE(review): sends every batch to the default CUDA device and
        # ignores args.gpu -- confirm this is intended for the DDP path.
        images, target = images.to('cuda'), target.to('cuda')

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
    # (Removed dead accumulators `total`/`train_loss`/`correct`/`predicted`:
    # their only consumer was a commented-out print.)
    print('Training: Top1: {top1.avg:.4f}|Top5:{top5.avg:.4f}'.format(top1=top1, top5=top5))

# validate
def validate(val_loader, model, criterion, args):
    """Run one evaluation pass over `val_loader`.

    Args:
        val_loader: iterable of (images, target) batches.
        model: network, switched to eval mode here.
        criterion: loss function (recorded in `losses` only).
        args: parsed CLI namespace; args.gpu selects the CUDA device.

    Returns:
        Average top-1 accuracy (as tracked by AverageMeter) over the set.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        # Renamed loop variable from `input` (shadowed the builtin).
        for i, (images, target) in enumerate(val_loader):
            if args.gpu is not None:
                images = images.cuda(args.gpu, non_blocking=True)
            target = target.cuda(args.gpu, non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
        # Fix: the original accumulated `val_loss += loss.item()` twice per
        # batch (double-counting) into a variable that was never read; the
        # dead `val_loss`/`total`/`correct` accumulators are removed.
        print('Testing: Top1: {top1.avg:.4f}|Top5:{top5.avg:.4f}'.format(top1=top1, top5=top5))
    return top1.avg


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Serialize `state` to `filename`; mirror it to 'model_best.pth.tar' when best."""
    torch.save(state, filename)
    if not is_best:
        return
    shutil.copyfile(filename, 'model_best.pth.tar')


class AverageMeter(object):
    """Tracks the most recent value and a running average of a series."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all accumulated statistics back to zero."""
        self.val = 0
        self.count = 0
        self.sum = 0
        self.avg = 0

    def update(self, val, n=1):
        """Record `val` observed `n` times and refresh the running average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def adjust_learning_rate(optimizer, epoch, args):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs."""
    decayed = args.lr * 0.1 ** (epoch // 30)
    for group in optimizer.param_groups:
        group['lr'] = decayed


def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k.

    Args:
        output: (batch, num_classes) tensor of class scores.
        target: (batch,) tensor of ground-truth class indices.
        topk: iterable of k values to evaluate.

    Returns:
        List of 1-element tensors, one per k, each holding the top-k
        accuracy as a fraction in [0, 1].
    """
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # Fix: use reshape(-1) instead of view(-1). `correct[:k]` is a
            # slice of a transposed (non-contiguous) tensor, and view()
            # raises a RuntimeError on non-contiguous inputs when k > 1.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(1.0 / batch_size))
        return res


# Script entry point: only run training when executed directly.
if __name__ == '__main__':
    main()


================================================
FILE: GC_code/ImageNet/myresnet.py
================================================
from __future__ import print_function, division, absolute_import
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo


# Public API of this module (BatchNorm ResNet variants).
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
           'resnet152']


# Download locations of the official torchvision pretrained checkpoints,
# fetched via model_zoo when a constructor is called with pretrained=True.
model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}


def conv3x3(in_planes, out_planes, stride=1):
    """Build a 3x3 convolution with padding 1 (bias enabled)."""
    return nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias=True,
    )


class BasicBlock(nn.Module):
    """Two 3x3-conv residual block used by ResNet-18/34 (expansion 1)."""
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        # First conv may reduce spatial size via `stride`.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # Shortcut: identity, or a projection when shape/stride differ.
        shortcut = x if self.downsample is None else self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        out += shortcut
        return self.relu(out)


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=True)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=True)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=True)
        self.bn3 = nn.BatchNorm2d(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

#from torch.legacy import nn as nnl

class ResNet(nn.Module):
    """ImageNet ResNet assembled from BasicBlock or Bottleneck units.

    Differs from the torchvision reference in two visible ways: every conv
    is created with bias=True, and forward() stashes the raw stem-conv
    output in self.conv1_input as a side effect.
    """

    def __init__(self, block, layers, num_classes=1000):
        # `layers` gives the number of blocks in each of the four stages.
        self.inplanes = 64
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                                bias=True)
        #self.conv1 = nnl.SpatialConvolution(3, 64, 7, 7, 2, 2, 3, 3)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # Stages 2-4 halve the spatial resolution via stride=2.
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        # Fixed 7x7 pool: sized for 224x224 inputs (224 / 32 = 7).
        self.avgpool = nn.AvgPool2d(7)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        # Conv weights: normal(0, sqrt(2/fan_out)); BN starts as identity.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        """Stack `blocks` units; only the first may downsample via `stride`."""
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            # Projection shortcut to match the main path's shape/stride.
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=True),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        # Side effect: keep the pre-BN stem activation for later inspection.
        self.conv1_input = x.clone()
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x


def resnet18(pretrained=False, **kwargs):
    """Constructs a ResNet-18 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
    return net


def resnet34(pretrained=False, **kwargs):
    """Constructs a ResNet-34 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
    return net


def resnet50(pretrained=False, **kwargs):
    """Constructs a ResNet-50 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
    return net


def resnet101(pretrained=False, **kwargs):
    """Constructs a ResNet-101 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
    return net


def resnet152(pretrained=False, **kwargs):
    """Constructs a ResNet-152 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
    return net


def test():
    """Smoke test: forward a random batch through resnet18 and print shapes.

    Fixes two NameErrors in the original: `Variable` was never imported
    (and is deprecated since PyTorch 0.4), and this module only imports
    torch.nn, so `torch` itself was undefined here.
    """
    import torch  # local import: module top level only imports torch.nn

    net = resnet18()
    net.eval()
    x = torch.randn(2, 3, 224, 224)
    y = net(x)
    print(y.size())
    print(net)
#test()


================================================
FILE: GC_code/ImageNet/myresnetgn.py
================================================
from __future__ import print_function, division, absolute_import
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo


__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
           'resnet152']


model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}


def conv3x3(in_planes, out_planes, stride=1):
    """Build a 3x3 convolution with padding 1 (bias enabled)."""
    return nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias=True,
    )


class BasicBlock(nn.Module):
    """Two 3x3-conv residual block with GroupNorm (32 groups), expansion 1."""
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        # First conv may reduce spatial size via `stride`.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.GroupNorm(32,planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.GroupNorm(32,planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # Shortcut: identity, or a projection when shape/stride differ.
        shortcut = x if self.downsample is None else self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        out += shortcut
        return self.relu(out)


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=True)
        self.bn1 = nn.GroupNorm(32,planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=True)
        self.bn2 = nn.GroupNorm(32,planes)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=True)
        self.bn3 = nn.GroupNorm(32,planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

#from torch.legacy import nn as nnl

class ResNet(nn.Module):
    """ImageNet ResNet using GroupNorm (32 groups) instead of BatchNorm.

    As in the BN sibling module: every conv is created with bias=True, and
    forward() stashes the raw stem-conv output in self.conv1_input as a
    side effect.
    """

    def __init__(self, block, layers, num_classes=1000):
        # `layers` gives the number of blocks in each of the four stages.
        self.inplanes = 64
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                                bias=True)
        #self.conv1 = nnl.SpatialConvolution(3, 64, 7, 7, 2, 2, 3, 3)
        self.bn1 = nn.GroupNorm(32,64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # Stages 2-4 halve the spatial resolution via stride=2.
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        # Fixed 7x7 pool: sized for 224x224 inputs (224 / 32 = 7).
        self.avgpool = nn.AvgPool2d(7)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        # Conv weights: normal(0, sqrt(2/fan_out)); GN starts as identity.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.GroupNorm):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        """Stack `blocks` units; only the first may downsample via `stride`."""
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            # Projection shortcut to match the main path's shape/stride.
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=True),
                nn.GroupNorm(32,planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        # Side effect: keep the pre-GN stem activation for later inspection.
        self.conv1_input = x.clone()
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x


def resnet18gn(pretrained=False, **kwargs):
    """Constructs a ResNet-18 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
    return net


def resnet34gn(pretrained=False, **kwargs):
    """Constructs a ResNet-34 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
    return net


def resnet50gn(pretrained=False, **kwargs):
    """Constructs a ResNet-50 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
    return net


def resnet101gn(pretrained=False, **kwargs):
    """Constructs a ResNet-101 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
    return net


def resnet152gn(pretrained=False, **kwargs):
    """Constructs a ResNet-152 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
    return net


def test():
    """Smoke test: forward a random batch through resnet18gn and print shapes.

    Fixes a NameError in the original: this module only imports torch.nn at
    top level, so `torch.randn` was undefined here.
    """
    import torch  # local import: module top level only imports torch.nn

    net = resnet18gn()
    net.eval()
    x = torch.randn(2, 3, 224, 224)
    y = net(x)
    print(y.size())
    print(net)
#test()


================================================
FILE: GC_code/ImageNet/os_run.py
================================================

import os,time


# NOTE(review): each command string starts with '#', so the shell parses it
# as a comment and os.system is effectively a no-op -- presumably these runs
# were deliberately disabled; remove the leading '#' to actually launch them.
os.system("#nohup python -W ignore main.py /ssd/data/yonghw/Imagenet/v0/ --model r50bn --alg sgd -b 256 --path r50bn_sgd_b256_g4 > logout/r50bn_sgd_b256_g4.log &")

os.system("#nohup python -W ignore main.py /ssd/data/yonghw/Imagenet/v0/ --model r50bn --alg sgdGC -b 256 --path r50bn_sgdGC_b256_g4 > logout/r50bn_sgdGC_b256_g4.log &")


================================================
FILE: GC_code/Mini_ImageNet/SGD.py
================================================
import torch
from torch.optim.optimizer import Optimizer, required



class SGD_GCC(Optimizer):
    """SGD (with momentum/Nesterov) plus Gradient Centralization for
    convolutional layers only.

    For every parameter whose gradient has more than 3 dimensions (i.e.
    conv weight tensors), the per-output-channel mean of the gradient is
    subtracted before the update.  Lower-dimensional parameters (FC
    weights, biases, norm-layer affines) receive a plain SGD update.

    Arguments mirror ``torch.optim.SGD``:
        params: iterable of parameters or parameter groups.
        lr (float): learning rate (required).
        momentum (float): momentum factor (default: 0).
        dampening (float): dampening for momentum (default: 0).
        weight_decay (float): L2 penalty folded into the gradient (default: 0).
        nesterov (bool): enables Nesterov momentum (default: False).
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        # Older checkpoints may lack 'nesterov'; default it when restoring.
        super(SGD_GCC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # Coupled L2 weight decay: g <- g + wd * p.
                # NOTE: keyword `alpha=` form replaces the deprecated
                # positional add_(Number, Tensor) overload, which errors
                # on modern PyTorch.
                if weight_decay != 0:
                    d_p.add_(p.data, alpha=weight_decay)

                # GC operation for Conv layers (>3-dim gradients): subtract
                # the mean over all non-output-channel dims, per filter.
                if d_p.dim() > 3:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, d_p.dim())), keepdim=True))

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: buffer starts as a copy of the gradient.
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])

        return loss

class SGD_GC(Optimizer):
    """SGD (with momentum/Nesterov) plus Gradient Centralization for both
    convolutional and fully-connected layers.

    For every parameter whose gradient has more than 1 dimension (conv
    weights AND FC weights), the per-output-channel/row mean of the
    gradient is subtracted before the update.  1-D parameters (biases,
    norm-layer affines) receive a plain SGD update.

    Arguments mirror ``torch.optim.SGD``:
        params: iterable of parameters or parameter groups.
        lr (float): learning rate (required).
        momentum (float): momentum factor (default: 0).
        dampening (float): dampening for momentum (default: 0).
        weight_decay (float): L2 penalty folded into the gradient (default: 0).
        nesterov (bool): enables Nesterov momentum (default: False).
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GC, self).__init__(params, defaults)

    def __setstate__(self, state):
        # Older checkpoints may lack 'nesterov'; default it when restoring.
        super(SGD_GC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # Coupled L2 weight decay: g <- g + wd * p.
                # NOTE: keyword `alpha=` form replaces the deprecated
                # positional add_(Number, Tensor) overload, which errors
                # on modern PyTorch.
                if weight_decay != 0:
                    d_p.add_(p.data, alpha=weight_decay)

                # GC operation for Conv layers and FC layers (>1-dim
                # gradients): subtract the mean over all dims except dim 0.
                if d_p.dim() > 1:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, d_p.dim())), keepdim=True))

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: buffer starts as a copy of the gradient.
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])

        return loss


class SGDW(Optimizer):
    """SGD with decoupled weight decay (SGDW, as in "Decoupled Weight
    Decay Regularization", Loshchilov & Hutter).

    Unlike plain SGD, weight decay is NOT folded into the gradient;
    instead the pre-update parameter value is shrunk directly after the
    gradient step: ``p <- p - lr * d_p - lr * wd * p_old``.

    Arguments mirror ``torch.optim.SGD``:
        params: iterable of parameters or parameter groups.
        lr (float): learning rate (required).
        momentum (float): momentum factor (default: 0).
        dampening (float): dampening for momentum (default: 0).
        weight_decay (float): decoupled decay coefficient (default: 0).
        nesterov (bool): enables Nesterov momentum (default: False).
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW, self).__init__(params, defaults)

    def __setstate__(self, state):
        # Older checkpoints may lack 'nesterov'; default it when restoring.
        super(SGDW, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # Snapshot the parameter before the gradient step so the
                # decoupled decay below uses the pre-update value.
                old = torch.clone(p.data).detach()

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: zero-initialized buffer, so after
                        # mul/add the buffer equals the raw gradient
                        # (no dampening on the very first step).
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        # NOTE: keyword `alpha=` form replaces the deprecated
                        # positional add_(Number, Tensor) overload, which
                        # errors on modern PyTorch.
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])

                # Decoupled weight decay applied to the pre-step weights.
                if weight_decay != 0:
                    p.data.add_(old, alpha=-weight_decay * group['lr'])

        return loss



class SGDW_GCC(Optimizer):
    """SGDW (decoupled weight decay) combined with Gradient Centralization
    for convolutional layers only.

    Gradients of >3-dim parameters (conv weights) have their per-filter
    mean subtracted before the momentum/update step; weight decay is
    applied to the pre-step parameter value, not folded into the gradient:
    ``p <- p - lr * d_p - lr * wd * p_old``.

    Arguments mirror ``torch.optim.SGD``:
        params: iterable of parameters or parameter groups.
        lr (float): learning rate (required).
        momentum (float): momentum factor (default: 0).
        dampening (float): dampening for momentum (default: 0).
        weight_decay (float): decoupled decay coefficient (default: 0).
        nesterov (bool): enables Nesterov momentum (default: False).
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        # Older checkpoints may lack 'nesterov'; default it when restoring.
        super(SGDW_GCC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # Snapshot the parameter before the gradient step so the
                # decoupled decay below uses the pre-update value.
                old = torch.clone(p.data).detach()

                # GC operation for Conv layers (>3-dim gradients): subtract
                # the mean over all non-output-channel dims, per filter.
                if d_p.dim() > 3:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, d_p.dim())), keepdim=True))

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: zero-initialized buffer, so after
                        # mul/add the buffer equals the raw gradient
                        # (no dampening on the very first step).
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        # NOTE: keyword `alpha=` form replaces the deprecated
                        # positional add_(Number, Tensor) overload, which
                        # errors on modern PyTorch.
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])

                # Decoupled weight decay applied to the pre-step weights.
                if weight_decay != 0:
                    p.data.add_(old, alpha=-weight_decay * group['lr'])

        return loss


================================================
FILE: GC_code/Mini_ImageNet/main.py
================================================
import argparse
import os
import random
import shutil
import time
import warnings
import sys
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
from resnet_ws import l_resnet50

import torchvision.models as models
import math
import numpy as np
from torch.optim import lr_scheduler


from SGD import SGD_GC #import SGD with GC

# All lowercase, public, callable attributes of torchvision.models —
# i.e. the model constructor names a user may pick on the command line.
model_names = sorted(
    name
    for name in models.__dict__
    if name.islower()
    and not name.startswith("__")
    and callable(models.__dict__[name])
)

parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')

parser.add_argument('-b', '--batch_size', default=256, type=int,
                    metavar='N',
                    help='mini-batch size (default: 256), this is the total '
                         'batch s
Download .txt
gitextract_l162dn_3/

├── GC_code/
│   ├── CIFAR100/
│   │   ├── algorithm/
│   │   │   ├── Adagrad.py
│   │   │   ├── Adam.py
│   │   │   └── SGD.py
│   │   ├── main.py
│   │   ├── models/
│   │   │   ├── __init__.py
│   │   │   ├── densenet.py
│   │   │   ├── dpn.py
│   │   │   ├── googlenet.py
│   │   │   ├── lenet.py
│   │   │   ├── mobilenet.py
│   │   │   ├── mobilenetv2.py
│   │   │   ├── pnasnet.py
│   │   │   ├── preact_resnet.py
│   │   │   ├── resnet.py
│   │   │   ├── resnext.py
│   │   │   ├── senet.py
│   │   │   ├── shufflenet.py
│   │   │   └── vgg.py
│   │   └── os_run.py
│   ├── Fine-grained_classification/
│   │   ├── SGD.py
│   │   ├── main.py
│   │   └── os_run.py
│   ├── ImageNet/
│   │   ├── SGD.py
│   │   ├── main.py
│   │   ├── myresnet.py
│   │   ├── myresnetgn.py
│   │   └── os_run.py
│   └── Mini_ImageNet/
│       ├── SGD.py
│       ├── main.py
│       ├── os_run.py
│       └── resnet_ws.py
├── README.md
└── algorithm-GC/
    ├── README.md
    ├── algorithm/
    │   ├── Adam.py
    │   ├── Centralization.py
    │   ├── Lookahead.py
    │   ├── RAdam.py
    │   ├── Ranger.py
    │   └── SGD.py
    └── cifar/
        ├── main.py
        ├── models/
        │   ├── __init__.py
        │   ├── densenet.py
        │   ├── dpn.py
        │   ├── googlenet.py
        │   ├── lenet.py
        │   ├── mobilenet.py
        │   ├── mobilenetv2.py
        │   ├── pnasnet.py
        │   ├── preact_resnet.py
        │   ├── resnet.py
        │   ├── resnext.py
        │   ├── senet.py
        │   ├── shufflenet.py
        │   └── vgg.py
        ├── nohup.out
        ├── os_run.py
        └── os_run2.py
Download .txt
SYMBOL INDEX (523 symbols across 46 files)

FILE: GC_code/CIFAR100/algorithm/Adagrad.py
  class Adagrad_GCC (line 5) | class Adagrad_GCC(Optimizer):
    method __init__ (line 24) | def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0, initia...
    method share_memory (line 46) | def share_memory(self):
    method step (line 52) | def step(self, closure=None):
  class Adagrad_GC (line 106) | class Adagrad_GC(Optimizer):
    method __init__ (line 125) | def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0, initia...
    method share_memory (line 147) | def share_memory(self):
    method step (line 153) | def step(self, closure=None):

FILE: GC_code/CIFAR100/algorithm/Adam.py
  class Adam_GCC (line 5) | class Adam_GCC(Optimizer):
    method __init__ (line 6) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
    method __setstate__ (line 20) | def __setstate__(self, state):
    method step (line 25) | def step(self, closure=None):
  class Adam_GCC2 (line 91) | class Adam_GCC2(Optimizer):
    method __init__ (line 92) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
    method __setstate__ (line 106) | def __setstate__(self, state):
    method step (line 111) | def step(self, closure=None):
  class Adam_GC (line 176) | class Adam_GC(Optimizer):
    method __init__ (line 200) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
    method __setstate__ (line 214) | def __setstate__(self, state):
    method step (line 219) | def step(self, closure=None):
  class Adam_GC2 (line 286) | class Adam_GC2(Optimizer):
    method __init__ (line 287) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
    method __setstate__ (line 301) | def __setstate__(self, state):
    method step (line 306) | def step(self, closure=None):
  class AdamW (line 371) | class AdamW(Optimizer):
    method __init__ (line 391) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
    method __setstate__ (line 405) | def __setstate__(self, state):
    method step (line 410) | def step(self, closure=None):
  class AdamW_GCC (line 474) | class AdamW_GCC(Optimizer):
    method __init__ (line 494) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
    method __setstate__ (line 508) | def __setstate__(self, state):
    method step (line 513) | def step(self, closure=None):
  class AdamW_GC (line 579) | class AdamW_GC(Optimizer):
    method __init__ (line 599) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
    method __setstate__ (line 613) | def __setstate__(self, state):
    method step (line 618) | def step(self, closure=None):
  class AdamW_GCC2 (line 684) | class AdamW_GCC2(Optimizer):
    method __init__ (line 704) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
    method __setstate__ (line 718) | def __setstate__(self, state):
    method step (line 723) | def step(self, closure=None):
  class AdamW_GC2 (line 790) | class AdamW_GC2(Optimizer):
    method __init__ (line 810) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
    method __setstate__ (line 824) | def __setstate__(self, state):
    method step (line 829) | def step(self, closure=None):

FILE: GC_code/CIFAR100/algorithm/SGD.py
  class SGD_GCC (line 6) | class SGD_GCC(Optimizer):
    method __init__ (line 8) | def __init__(self, params, lr=required, momentum=0, dampening=0,
    method __setstate__ (line 23) | def __setstate__(self, state):
    method step (line 28) | def step(self, closure=None):
  class SGD_GC (line 73) | class SGD_GC(Optimizer):
    method __init__ (line 75) | def __init__(self, params, lr=required, momentum=0, dampening=0,
    method __setstate__ (line 90) | def __setstate__(self, state):
    method step (line 95) | def step(self, closure=None):
  class SGDW (line 141) | class SGDW(Optimizer):
    method __init__ (line 143) | def __init__(self, params, lr=required, momentum=0, dampening=0,
    method __setstate__ (line 158) | def __setstate__(self, state):
    method step (line 163) | def step(self, closure=None):
  class SGDW_GCC (line 209) | class SGDW_GCC(Optimizer):
    method __init__ (line 210) | def __init__(self, params, lr=required, momentum=0, dampening=0,
    method __setstate__ (line 225) | def __setstate__(self, state):
    method step (line 230) | def step(self, closure=None):
  class SGDW_GC (line 281) | class SGDW_GC(Optimizer):
    method __init__ (line 282) | def __init__(self, params, lr=required, momentum=0, dampening=0,
    method __setstate__ (line 297) | def __setstate__(self, state):
    method step (line 302) | def step(self, closure=None):

FILE: GC_code/CIFAR100/main.py
  function train (line 165) | def train(epoch,net,optimizer):
  function test (line 190) | def test(epoch,net):

FILE: GC_code/CIFAR100/models/densenet.py
  class Bottleneck (line 9) | class Bottleneck(nn.Module):
    method __init__ (line 10) | def __init__(self, in_planes, growth_rate):
    method forward (line 17) | def forward(self, x):
  class Transition (line 24) | class Transition(nn.Module):
    method __init__ (line 25) | def __init__(self, in_planes, out_planes):
    method forward (line 30) | def forward(self, x):
  class DenseNet (line 36) | class DenseNet(nn.Module):
    method __init__ (line 37) | def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_...
    method _make_dense_layers (line 68) | def _make_dense_layers(self, block, in_planes, nblock):
    method forward (line 75) | def forward(self, x):
  function DenseNet121 (line 86) | def DenseNet121(Num_classes=10):
  function DenseNet169 (line 89) | def DenseNet169(Num_classes=10):
  function DenseNet201 (line 92) | def DenseNet201(Num_classes=10):
  function DenseNet161 (line 95) | def DenseNet161(Num_classes=10):
  function densenet_cifar (line 98) | def densenet_cifar(Num_classes=10):
  function test (line 101) | def test():

FILE: GC_code/CIFAR100/models/dpn.py
  class Bottleneck (line 7) | class Bottleneck(nn.Module):
    method __init__ (line 8) | def __init__(self, last_planes, in_planes, out_planes, dense_depth, st...
    method forward (line 27) | def forward(self, x):
  class DPN (line 38) | class DPN(nn.Module):
    method __init__ (line 39) | def __init__(self, cfg):
    method _make_layer (line 53) | def _make_layer(self, in_planes, out_planes, num_blocks, dense_depth, ...
    method forward (line 61) | def forward(self, x):
  function DPN26 (line 73) | def DPN26():
  function DPN92 (line 82) | def DPN92():
  function test (line 92) | def test():

FILE: GC_code/CIFAR100/models/googlenet.py
  class Inception (line 7) | class Inception(nn.Module):
    method __init__ (line 8) | def __init__(self, in_planes, n1x1, n3x3red, n3x3, n5x5red, n5x5, pool...
    method forward (line 48) | def forward(self, x):
  class GoogLeNet (line 56) | class GoogLeNet(nn.Module):
    method __init__ (line 57) | def __init__(self):
    method forward (line 82) | def forward(self, x):
  function test (line 101) | def test():

FILE: GC_code/CIFAR100/models/lenet.py
  class LeNet (line 5) | class LeNet(nn.Module):
    method __init__ (line 6) | def __init__(self):
    method forward (line 14) | def forward(self, x):

FILE: GC_code/CIFAR100/models/mobilenet.py
  class Block (line 11) | class Block(nn.Module):
    method __init__ (line 13) | def __init__(self, in_planes, out_planes, stride=1):
    method forward (line 20) | def forward(self, x):
  class MobileNet (line 26) | class MobileNet(nn.Module):
    method __init__ (line 30) | def __init__(self, num_classes=10):
    method _make_layers (line 37) | def _make_layers(self, in_planes):
    method forward (line 46) | def forward(self, x):
  function test (line 55) | def test():

FILE: GC_code/CIFAR100/models/mobilenetv2.py
  class Block (line 11) | class Block(nn.Module):
    method __init__ (line 13) | def __init__(self, in_planes, out_planes, expansion, stride):
    method forward (line 32) | def forward(self, x):
  class MobileNetV2 (line 40) | class MobileNetV2(nn.Module):
    method __init__ (line 50) | def __init__(self, num_classes=10):
    method _make_layers (line 60) | def _make_layers(self, in_planes):
    method forward (line 69) | def forward(self, x):
  function test (line 80) | def test():

FILE: GC_code/CIFAR100/models/pnasnet.py
  class SepConv (line 10) | class SepConv(nn.Module):
    method __init__ (line 12) | def __init__(self, in_planes, out_planes, kernel_size, stride):
    method forward (line 20) | def forward(self, x):
  class CellA (line 24) | class CellA(nn.Module):
    method __init__ (line 25) | def __init__(self, in_planes, out_planes, stride=1):
    method forward (line 33) | def forward(self, x):
  class CellB (line 40) | class CellB(nn.Module):
    method __init__ (line 41) | def __init__(self, in_planes, out_planes, stride=1):
    method forward (line 56) | def forward(self, x):
  class PNASNet (line 71) | class PNASNet(nn.Module):
    method __init__ (line 72) | def __init__(self, cell_type, num_cells, num_planes):
    method _make_layer (line 88) | def _make_layer(self, planes, num_cells):
    method _downsample (line 95) | def _downsample(self, planes):
    method forward (line 100) | def forward(self, x):
  function PNASNetA (line 112) | def PNASNetA():
  function PNASNetB (line 115) | def PNASNetB():
  function test (line 119) | def test():

FILE: GC_code/CIFAR100/models/preact_resnet.py
  class PreActBlock (line 12) | class PreActBlock(nn.Module):
    method __init__ (line 16) | def __init__(self, in_planes, planes, stride=1):
    method forward (line 28) | def forward(self, x):
  class PreActBottleneck (line 37) | class PreActBottleneck(nn.Module):
    method __init__ (line 41) | def __init__(self, in_planes, planes, stride=1):
    method forward (line 55) | def forward(self, x):
  class PreActResNet (line 65) | class PreActResNet(nn.Module):
    method __init__ (line 66) | def __init__(self, block, num_blocks, num_classes=10):
    method _make_layer (line 77) | def _make_layer(self, block, planes, num_blocks, stride):
    method forward (line 85) | def forward(self, x):
  function PreActResNet18 (line 97) | def PreActResNet18():
  function PreActResNet34 (line 100) | def PreActResNet34():
  function PreActResNet50 (line 103) | def PreActResNet50():
  function PreActResNet101 (line 106) | def PreActResNet101():
  function PreActResNet152 (line 109) | def PreActResNet152():
  function test (line 113) | def test():

FILE: GC_code/CIFAR100/models/resnet.py
  class BasicBlock (line 14) | class BasicBlock(nn.Module):
    method __init__ (line 17) | def __init__(self, in_planes, planes, stride=1):
    method forward (line 31) | def forward(self, x):
  class Bottleneck (line 39) | class Bottleneck(nn.Module):
    method __init__ (line 42) | def __init__(self, in_planes, planes, stride=1):
    method forward (line 58) | def forward(self, x):
  class ResNet (line 67) | class ResNet(nn.Module):
    method __init__ (line 68) | def __init__(self, block, num_blocks, num_classes=10):
    method _make_layer (line 80) | def _make_layer(self, block, planes, num_blocks, stride):
    method forward (line 88) | def forward(self, x):
  function ResNet18 (line 100) | def ResNet18(Num_classes=10):
  function ResNet34 (line 103) | def ResNet34(Num_classes=10):
  function ResNet50 (line 106) | def ResNet50(Num_classes=10):
  function ResNet101 (line 109) | def ResNet101(Num_classes=10):
  function ResNet152 (line 112) | def ResNet152(Num_classes=10):
  function test (line 116) | def test():

FILE: GC_code/CIFAR100/models/resnext.py
  class Block (line 10) | class Block(nn.Module):
    method __init__ (line 14) | def __init__(self, in_planes, cardinality=32, bottleneck_width=4, stri...
    method forward (line 31) | def forward(self, x):
  class ResNeXt (line 40) | class ResNeXt(nn.Module):
    method __init__ (line 41) | def __init__(self, num_blocks, cardinality, bottleneck_width, num_clas...
    method _make_layer (line 55) | def _make_layer(self, num_blocks, stride):
    method forward (line 65) | def forward(self, x):
  function ResNeXt29_2x64d (line 77) | def ResNeXt29_2x64d(Num_classes=10):
  function ResNeXt29_4x64d (line 80) | def ResNeXt29_4x64d(Num_classes=10):
  function ResNeXt29_8x64d (line 83) | def ResNeXt29_8x64d(Num_classes=10):
  function ResNeXt29_32x4d (line 86) | def ResNeXt29_32x4d(Num_classes=10):
  function test_resnext (line 89) | def test_resnext():

FILE: GC_code/CIFAR100/models/senet.py
  class BasicBlock (line 10) | class BasicBlock(nn.Module):
    method __init__ (line 11) | def __init__(self, in_planes, planes, stride=1):
    method forward (line 29) | def forward(self, x):
  class PreActBlock (line 45) | class PreActBlock(nn.Module):
    method __init__ (line 46) | def __init__(self, in_planes, planes, stride=1):
    method forward (line 62) | def forward(self, x):
  class SENet (line 79) | class SENet(nn.Module):
    method __init__ (line 80) | def __init__(self, block, num_blocks, num_classes=10):
    method _make_layer (line 92) | def _make_layer(self, block, planes, num_blocks, stride):
    method forward (line 100) | def forward(self, x):
  function SENet18 (line 112) | def SENet18():
  function test (line 116) | def test():

FILE: GC_code/CIFAR100/models/shufflenet.py
  class ShuffleBlock (line 10) | class ShuffleBlock(nn.Module):
    method __init__ (line 11) | def __init__(self, groups):
    method forward (line 15) | def forward(self, x):
  class Bottleneck (line 22) | class Bottleneck(nn.Module):
    method __init__ (line 23) | def __init__(self, in_planes, out_planes, stride, groups):
    method forward (line 41) | def forward(self, x):
  class ShuffleNet (line 51) | class ShuffleNet(nn.Module):
    method __init__ (line 52) | def __init__(self, cfg):
    method _make_layer (line 66) | def _make_layer(self, out_planes, num_blocks, groups):
    method forward (line 75) | def forward(self, x):
  function ShuffleNetG2 (line 86) | def ShuffleNetG2():
  function ShuffleNetG3 (line 94) | def ShuffleNetG3():
  function test (line 103) | def test():

FILE: GC_code/CIFAR100/models/vgg.py
  class VGG (line 14) | class VGG(nn.Module):
    method __init__ (line 15) | def __init__(self, vgg_name,Num_classes=100):
    method forward (line 20) | def forward(self, x):
    method _make_layers (line 26) | def _make_layers(self, cfg):
  function test (line 41) | def test():

FILE: GC_code/Fine-grained_classification/SGD.py
  class SGD_GCC (line 6) | class SGD_GCC(Optimizer):
    method __init__ (line 8) | def __init__(self, params, lr=required, momentum=0, dampening=0,
    method __setstate__ (line 23) | def __setstate__(self, state):
    method step (line 28) | def step(self, closure=None):
  class SGD_GC (line 73) | class SGD_GC(Optimizer):
    method __init__ (line 75) | def __init__(self, params, lr=required, momentum=0, dampening=0,
    method __setstate__ (line 90) | def __setstate__(self, state):
    method step (line 95) | def step(self, closure=None):
  class SGDW (line 141) | class SGDW(Optimizer):
    method __init__ (line 143) | def __init__(self, params, lr=required, momentum=0, dampening=0,
    method __setstate__ (line 158) | def __setstate__(self, state):
    method step (line 163) | def step(self, closure=None):
  class SGDW_GCC (line 209) | class SGDW_GCC(Optimizer):
    method __init__ (line 210) | def __init__(self, params, lr=required, momentum=0, dampening=0,
    method __setstate__ (line 225) | def __setstate__(self, state):
    method step (line 230) | def step(self, closure=None):

FILE: GC_code/Fine-grained_classification/main.py
  function main (line 97) | def main():
  function main_worker (line 133) | def main_worker(gpu, ngpus_per_node, args):
  function train (line 302) | def train(train_loader, model, criterion, optimizer_base, optimizer_new,...
  function validate (line 355) | def validate(val_loader, model, criterion, args):
  function save_checkpoint (line 399) | def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
  class AverageMeter (line 405) | class AverageMeter(object):
    method __init__ (line 407) | def __init__(self):
    method reset (line 410) | def reset(self):
    method update (line 416) | def update(self, val, n=1):
  function adjust_learning_rate (line 423) | def adjust_learning_rate(optimizer, epoch, args):
  function accuracy (line 430) | def accuracy(output, target, topk=(1,)):

FILE: GC_code/ImageNet/SGD.py
  class SGD_GCC (line 6) | class SGD_GCC(Optimizer):
    method __init__ (line 8) | def __init__(self, params, lr=required, momentum=0, dampening=0,
    method __setstate__ (line 23) | def __setstate__(self, state):
    method step (line 28) | def step(self, closure=None):
  class SGD_GC (line 73) | class SGD_GC(Optimizer):
    method __init__ (line 75) | def __init__(self, params, lr=required, momentum=0, dampening=0,
    method __setstate__ (line 90) | def __setstate__(self, state):
    method step (line 95) | def step(self, closure=None):
  class SGDW (line 141) | class SGDW(Optimizer):
    method __init__ (line 143) | def __init__(self, params, lr=required, momentum=0, dampening=0,
    method __setstate__ (line 158) | def __setstate__(self, state):
    method step (line 163) | def step(self, closure=None):
  class SGDW_GCC (line 209) | class SGDW_GCC(Optimizer):
    method __init__ (line 210) | def __init__(self, params, lr=required, momentum=0, dampening=0,
    method __setstate__ (line 225) | def __setstate__(self, state):
    method step (line 230) | def step(self, closure=None):

FILE: GC_code/ImageNet/main.py
  function main (line 103) | def main():
  function main_worker (line 140) | def main_worker(gpu, ngpus_per_node, args):
  function train (line 289) | def train(train_loader, model, criterion, optimizer, epoch, args):
  function validate (line 338) | def validate(val_loader, model, criterion, args):
  function save_checkpoint (line 380) | def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
  class AverageMeter (line 386) | class AverageMeter(object):
    method __init__ (line 388) | def __init__(self):
    method reset (line 391) | def reset(self):
    method update (line 397) | def update(self, val, n=1):
  function adjust_learning_rate (line 404) | def adjust_learning_rate(optimizer, epoch, args):
  function accuracy (line 411) | def accuracy(output, target, topk=(1,)):

FILE: GC_code/ImageNet/myresnet.py
  function conv3x3 (line 20) | def conv3x3(in_planes, out_planes, stride=1):
  class BasicBlock (line 26) | class BasicBlock(nn.Module):
    method __init__ (line 29) | def __init__(self, inplanes, planes, stride=1, downsample=None):
    method forward (line 39) | def forward(self, x):
  class Bottleneck (line 58) | class Bottleneck(nn.Module):
    method __init__ (line 61) | def __init__(self, inplanes, planes, stride=1, downsample=None):
    method forward (line 74) | def forward(self, x):
  class ResNet (line 98) | class ResNet(nn.Module):
    method __init__ (line 100) | def __init__(self, block, layers, num_classes=1000):
    method _make_layer (line 124) | def _make_layer(self, block, planes, blocks, stride=1):
    method forward (line 141) | def forward(self, x):
  function resnet18 (line 160) | def resnet18(pretrained=False, **kwargs):
  function resnet34 (line 171) | def resnet34(pretrained=False, **kwargs):
  function resnet50 (line 182) | def resnet50(pretrained=False, **kwargs):
  function resnet101 (line 193) | def resnet101(pretrained=False, **kwargs):
  function resnet152 (line 204) | def resnet152(pretrained=False, **kwargs):
  function test (line 215) | def test():

FILE: GC_code/ImageNet/myresnetgn.py
  function conv3x3 (line 20) | def conv3x3(in_planes, out_planes, stride=1):
  class BasicBlock (line 26) | class BasicBlock(nn.Module):
    method __init__ (line 29) | def __init__(self, inplanes, planes, stride=1, downsample=None):
    method forward (line 39) | def forward(self, x):
  class Bottleneck (line 58) | class Bottleneck(nn.Module):
    method __init__ (line 61) | def __init__(self, inplanes, planes, stride=1, downsample=None):
    method forward (line 74) | def forward(self, x):
  class ResNet (line 98) | class ResNet(nn.Module):
    method __init__ (line 100) | def __init__(self, block, layers, num_classes=1000):
    method _make_layer (line 124) | def _make_layer(self, block, planes, blocks, stride=1):
    method forward (line 141) | def forward(self, x):
  function resnet18gn (line 160) | def resnet18gn(pretrained=False, **kwargs):
  function resnet34gn (line 171) | def resnet34gn(pretrained=False, **kwargs):
  function resnet50gn (line 182) | def resnet50gn(pretrained=False, **kwargs):
  function resnet101gn (line 193) | def resnet101gn(pretrained=False, **kwargs):
  function resnet152gn (line 204) | def resnet152gn(pretrained=False, **kwargs):
  function test (line 215) | def test():

FILE: GC_code/Mini_ImageNet/SGD.py
  class SGD_GCC (line 6) | class SGD_GCC(Optimizer):
    method __init__ (line 8) | def __init__(self, params, lr=required, momentum=0, dampening=0,
    method __setstate__ (line 23) | def __setstate__(self, state):
    method step (line 28) | def step(self, closure=None):
  class SGD_GC (line 73) | class SGD_GC(Optimizer):
    method __init__ (line 75) | def __init__(self, params, lr=required, momentum=0, dampening=0,
    method __setstate__ (line 90) | def __setstate__(self, state):
    method step (line 95) | def step(self, closure=None):
  class SGDW (line 141) | class SGDW(Optimizer):
    method __init__ (line 143) | def __init__(self, params, lr=required, momentum=0, dampening=0,
    method __setstate__ (line 158) | def __setstate__(self, state):
    method step (line 163) | def step(self, closure=None):
  class SGDW_GCC (line 209) | class SGDW_GCC(Optimizer):
    method __init__ (line 210) | def __init__(self, params, lr=required, momentum=0, dampening=0,
    method __setstate__ (line 225) | def __setstate__(self, state):
    method step (line 230) | def step(self, closure=None):

FILE: GC_code/Mini_ImageNet/main.py
  function main (line 99) | def main():
  function main_worker (line 135) | def main_worker(gpu, ngpus_per_node, args):
  function train (line 279) | def train(train_loader, model, criterion, optimizer, epoch, args):
  function validate (line 333) | def validate(val_loader, model, criterion, args):
  function save_checkpoint (line 376) | def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
  class AverageMeter (line 382) | class AverageMeter(object):
    method __init__ (line 384) | def __init__(self):
    method reset (line 387) | def reset(self):
    method update (line 393) | def update(self, val, n=1):
  function adjust_learning_rate (line 400) | def adjust_learning_rate(optimizer, epoch, args):
  function accuracy (line 407) | def accuracy(output, target, topk=(1,)):

FILE: GC_code/Mini_ImageNet/resnet_ws.py
  class Conv2d (line 16) | class Conv2d(nn.Conv2d):
    method __init__ (line 18) | def __init__(self, in_channels, out_channels, kernel_size, stride=1,
    method forward (line 23) | def forward(self, x):
  function BatchNorm2d (line 35) | def BatchNorm2d(num_features):
  function conv3x3 (line 41) | def conv3x3(in_planes, out_planes, stride=1):
  function conv1x1 (line 47) | def conv1x1(in_planes, out_planes, stride=1):
  class BasicBlock (line 52) | class BasicBlock(nn.Module):
    method __init__ (line 55) | def __init__(self, inplanes, planes, stride=1, downsample=None):
    method forward (line 65) | def forward(self, x):
  class Bottleneck (line 84) | class Bottleneck(nn.Module):
    method __init__ (line 87) | def __init__(self, inplanes, planes, stride=1, downsample=None):
    method forward (line 99) | def forward(self, x):
  class ResNet (line 122) | class ResNet(nn.Module):
    method __init__ (line 124) | def __init__(self, block, layers, num_classes=1000, zero_init_residual...
    method _make_layer (line 161) | def _make_layer(self, block, planes, blocks, stride=1):
    method forward (line 177) | def forward(self, x):
  function l_resnet18 (line 195) | def l_resnet18(pretrained=False, **kwargs):
  function l_resnet34 (line 204) | def l_resnet34(pretrained=False, **kwargs):
  function l_resnet50 (line 213) | def l_resnet50(pretrained=False, **kwargs):
  function l_resnet101 (line 222) | def l_resnet101(pretrained=False, **kwargs):
  function l_resnet152 (line 231) | def l_resnet152(pretrained=False, **kwargs):

FILE: algorithm-GC/algorithm/Adam.py
  class Adam (line 6) | class Adam(Optimizer):
    method __init__ (line 30) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
    method __setstate__ (line 49) | def __setstate__(self, state):
    method step (line 55) | def step(self, closure=None):
  class AdamW (line 127) | class AdamW(Optimizer):
    method __init__ (line 154) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
    method __setstate__ (line 173) | def __setstate__(self, state):
    method step (line 179) | def step(self, closure=None):

FILE: algorithm-GC/algorithm/Centralization.py
  function centralized_gradient (line 6) | def centralized_gradient(x,use_gc=True,gc_conv_only=False):

FILE: algorithm-GC/algorithm/Lookahead.py
  class Lookahead (line 7) | class Lookahead(Optimizer):
    method __init__ (line 8) | def __init__(self, optimizer, k=5, alpha=0.5):
    method update (line 18) | def update(self, group):
    method update_lookahead (line 28) | def update_lookahead(self):
    method step (line 32) | def step(self, closure=None):
    method state_dict (line 42) | def state_dict(self):
    method load_state_dict (line 56) | def load_state_dict(self, state_dict):
    method add_param_group (line 69) | def add_param_group(self, param_group):

FILE: algorithm-GC/algorithm/RAdam.py
  class RAdam (line 7) | class RAdam(Optimizer):
    method __init__ (line 9) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weig...
    method __setstate__ (line 32) | def __setstate__(self, state):
    method step (line 35) | def step(self, closure=None):
  class PlainRAdam (line 109) | class PlainRAdam(Optimizer):
    method __init__ (line 111) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weig...
    method __setstate__ (line 129) | def __setstate__(self, state):
    method step (line 132) | def step(self, closure=None):

FILE: algorithm-GC/algorithm/Ranger.py
  class Ranger (line 7) | class Ranger(Optimizer):
    method __init__ (line 9) | def __init__(self, params, lr=1e-3,                       # lr
    method __setstate__ (line 61) | def __setstate__(self, state):
    method step (line 65) | def step(self, closure=None):

FILE: algorithm-GC/algorithm/SGD.py
  class SGD (line 6) | class SGD(Optimizer):
    method __init__ (line 56) | def __init__(self, params, lr=required, momentum=0, dampening=0,
    method __setstate__ (line 71) | def __setstate__(self, state):
    method step (line 77) | def step(self, closure=None):

FILE: algorithm-GC/cifar/main.py
  function train (line 218) | def train(epoch,net,optimizer):
  function test (line 243) | def test(epoch,net):

FILE: algorithm-GC/cifar/models/densenet.py
  class Bottleneck (line 9) | class Bottleneck(nn.Module):
    method __init__ (line 10) | def __init__(self, in_planes, growth_rate):
    method forward (line 17) | def forward(self, x):
  class Transition (line 24) | class Transition(nn.Module):
    method __init__ (line 25) | def __init__(self, in_planes, out_planes):
    method forward (line 30) | def forward(self, x):
  class DenseNet (line 36) | class DenseNet(nn.Module):
    method __init__ (line 37) | def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_...
    method _make_dense_layers (line 68) | def _make_dense_layers(self, block, in_planes, nblock):
    method forward (line 75) | def forward(self, x):
  function DenseNet121 (line 86) | def DenseNet121():
  function DenseNet169 (line 89) | def DenseNet169():
  function DenseNet201 (line 92) | def DenseNet201():
  function DenseNet161 (line 95) | def DenseNet161():
  function densenet_cifar (line 98) | def densenet_cifar():
  function test (line 101) | def test():

FILE: algorithm-GC/cifar/models/dpn.py
  class Bottleneck (line 7) | class Bottleneck(nn.Module):
    method __init__ (line 8) | def __init__(self, last_planes, in_planes, out_planes, dense_depth, st...
    method forward (line 27) | def forward(self, x):
  class DPN (line 38) | class DPN(nn.Module):
    method __init__ (line 39) | def __init__(self, cfg):
    method _make_layer (line 53) | def _make_layer(self, in_planes, out_planes, num_blocks, dense_depth, ...
    method forward (line 61) | def forward(self, x):
  function DPN26 (line 73) | def DPN26():
  function DPN92 (line 82) | def DPN92():
  function test (line 92) | def test():

FILE: algorithm-GC/cifar/models/googlenet.py
  class Inception (line 7) | class Inception(nn.Module):
    method __init__ (line 8) | def __init__(self, in_planes, n1x1, n3x3red, n3x3, n5x5red, n5x5, pool...
    method forward (line 48) | def forward(self, x):
  class GoogLeNet (line 56) | class GoogLeNet(nn.Module):
    method __init__ (line 57) | def __init__(self):
    method forward (line 82) | def forward(self, x):
  function test (line 101) | def test():

FILE: algorithm-GC/cifar/models/lenet.py
  class LeNet (line 5) | class LeNet(nn.Module):
    method __init__ (line 6) | def __init__(self):
    method forward (line 14) | def forward(self, x):

FILE: algorithm-GC/cifar/models/mobilenet.py
  class Block (line 11) | class Block(nn.Module):
    method __init__ (line 13) | def __init__(self, in_planes, out_planes, stride=1):
    method forward (line 20) | def forward(self, x):
  class MobileNet (line 26) | class MobileNet(nn.Module):
    method __init__ (line 30) | def __init__(self, num_classes=10):
    method _make_layers (line 37) | def _make_layers(self, in_planes):
    method forward (line 46) | def forward(self, x):
  function test (line 55) | def test():

FILE: algorithm-GC/cifar/models/mobilenetv2.py
  class Block (line 11) | class Block(nn.Module):
    method __init__ (line 13) | def __init__(self, in_planes, out_planes, expansion, stride):
    method forward (line 32) | def forward(self, x):
  class MobileNetV2 (line 40) | class MobileNetV2(nn.Module):
    method __init__ (line 50) | def __init__(self, num_classes=10):
    method _make_layers (line 60) | def _make_layers(self, in_planes):
    method forward (line 69) | def forward(self, x):
  function test (line 80) | def test():

FILE: algorithm-GC/cifar/models/pnasnet.py
  class SepConv (line 10) | class SepConv(nn.Module):
    method __init__ (line 12) | def __init__(self, in_planes, out_planes, kernel_size, stride):
    method forward (line 20) | def forward(self, x):
  class CellA (line 24) | class CellA(nn.Module):
    method __init__ (line 25) | def __init__(self, in_planes, out_planes, stride=1):
    method forward (line 33) | def forward(self, x):
  class CellB (line 40) | class CellB(nn.Module):
    method __init__ (line 41) | def __init__(self, in_planes, out_planes, stride=1):
    method forward (line 56) | def forward(self, x):
  class PNASNet (line 71) | class PNASNet(nn.Module):
    method __init__ (line 72) | def __init__(self, cell_type, num_cells, num_planes):
    method _make_layer (line 88) | def _make_layer(self, planes, num_cells):
    method _downsample (line 95) | def _downsample(self, planes):
    method forward (line 100) | def forward(self, x):
  function PNASNetA (line 112) | def PNASNetA():
  function PNASNetB (line 115) | def PNASNetB():
  function test (line 119) | def test():

FILE: algorithm-GC/cifar/models/preact_resnet.py
  class PreActBlock (line 12) | class PreActBlock(nn.Module):
    method __init__ (line 16) | def __init__(self, in_planes, planes, stride=1):
    method forward (line 28) | def forward(self, x):
  class PreActBottleneck (line 37) | class PreActBottleneck(nn.Module):
    method __init__ (line 41) | def __init__(self, in_planes, planes, stride=1):
    method forward (line 55) | def forward(self, x):
  class PreActResNet (line 65) | class PreActResNet(nn.Module):
    method __init__ (line 66) | def __init__(self, block, num_blocks, num_classes=10):
    method _make_layer (line 77) | def _make_layer(self, block, planes, num_blocks, stride):
    method forward (line 85) | def forward(self, x):
  function PreActResNet18 (line 97) | def PreActResNet18():
  function PreActResNet34 (line 100) | def PreActResNet34():
  function PreActResNet50 (line 103) | def PreActResNet50():
  function PreActResNet101 (line 106) | def PreActResNet101():
  function PreActResNet152 (line 109) | def PreActResNet152():
  function test (line 113) | def test():

FILE: algorithm-GC/cifar/models/resnet.py
  class BasicBlock (line 14) | class BasicBlock(nn.Module):
    method __init__ (line 17) | def __init__(self, in_planes, planes, stride=1):
    method forward (line 31) | def forward(self, x):
  class Bottleneck (line 39) | class Bottleneck(nn.Module):
    method __init__ (line 42) | def __init__(self, in_planes, planes, stride=1):
    method forward (line 58) | def forward(self, x):
  class ResNet (line 67) | class ResNet(nn.Module):
    method __init__ (line 68) | def __init__(self, block, num_blocks, num_classes=10):
    method _make_layer (line 80) | def _make_layer(self, block, planes, num_blocks, stride):
    method forward (line 88) | def forward(self, x):
  function ResNet18 (line 100) | def ResNet18(Num_classes=10):
  function ResNet34 (line 103) | def ResNet34(Num_classes=10):
  function ResNet50 (line 106) | def ResNet50(Num_classes=10):
  function ResNet101 (line 109) | def ResNet101(Num_classes=10):
  function ResNet152 (line 112) | def ResNet152(Num_classes=10):
  function test (line 116) | def test():

FILE: algorithm-GC/cifar/models/resnext.py
  class Block (line 10) | class Block(nn.Module):
    method __init__ (line 14) | def __init__(self, in_planes, cardinality=32, bottleneck_width=4, stri...
    method forward (line 31) | def forward(self, x):
  class ResNeXt (line 40) | class ResNeXt(nn.Module):
    method __init__ (line 41) | def __init__(self, num_blocks, cardinality, bottleneck_width, num_clas...
    method _make_layer (line 55) | def _make_layer(self, num_blocks, stride):
    method forward (line 65) | def forward(self, x):
  function ResNeXt29_2x64d (line 77) | def ResNeXt29_2x64d():
  function ResNeXt29_4x64d (line 80) | def ResNeXt29_4x64d():
  function ResNeXt29_8x64d (line 83) | def ResNeXt29_8x64d():
  function ResNeXt29_32x4d (line 86) | def ResNeXt29_32x4d():
  function test_resnext (line 89) | def test_resnext():

FILE: algorithm-GC/cifar/models/senet.py
  class BasicBlock (line 10) | class BasicBlock(nn.Module):
    method __init__ (line 11) | def __init__(self, in_planes, planes, stride=1):
    method forward (line 29) | def forward(self, x):
  class PreActBlock (line 45) | class PreActBlock(nn.Module):
    method __init__ (line 46) | def __init__(self, in_planes, planes, stride=1):
    method forward (line 62) | def forward(self, x):
  class SENet (line 79) | class SENet(nn.Module):
    method __init__ (line 80) | def __init__(self, block, num_blocks, num_classes=10):
    method _make_layer (line 92) | def _make_layer(self, block, planes, num_blocks, stride):
    method forward (line 100) | def forward(self, x):
  function SENet18 (line 112) | def SENet18():
  function test (line 116) | def test():

FILE: algorithm-GC/cifar/models/shufflenet.py
  class ShuffleBlock (line 10) | class ShuffleBlock(nn.Module):
    method __init__ (line 11) | def __init__(self, groups):
    method forward (line 15) | def forward(self, x):
  class Bottleneck (line 22) | class Bottleneck(nn.Module):
    method __init__ (line 23) | def __init__(self, in_planes, out_planes, stride, groups):
    method forward (line 41) | def forward(self, x):
  class ShuffleNet (line 51) | class ShuffleNet(nn.Module):
    method __init__ (line 52) | def __init__(self, cfg):
    method _make_layer (line 66) | def _make_layer(self, out_planes, num_blocks, groups):
    method forward (line 75) | def forward(self, x):
  function ShuffleNetG2 (line 86) | def ShuffleNetG2():
  function ShuffleNetG3 (line 94) | def ShuffleNetG3():
  function test (line 103) | def test():

FILE: algorithm-GC/cifar/models/vgg.py
  class VGG (line 14) | class VGG(nn.Module):
    method __init__ (line 15) | def __init__(self, vgg_name,Num_classes=100):
    method forward (line 20) | def forward(self, x):
    method _make_layers (line 26) | def _make_layers(self, cfg):
  function test (line 41) | def test():
Condensed preview — 57 files, each showing its path, character count, and a content snippet. Download the .json file, or copy it to the clipboard, to obtain the full structured content (420K chars).
[
  {
    "path": "GC_code/CIFAR100/algorithm/Adagrad.py",
    "chars": 8949,
    "preview": "import torch\r\nfrom torch.optim.optimizer import Optimizer\r\n\r\n\r\nclass Adagrad_GCC(Optimizer):\r\n    \"\"\"Implements Adagrad "
  },
  {
    "path": "GC_code/CIFAR100/algorithm/Adam.py",
    "chars": 40981,
    "preview": "import math\nimport torch\nfrom torch.optim.optimizer import Optimizer\n\nclass Adam_GCC(Optimizer):\n    def __init__(self, "
  },
  {
    "path": "GC_code/CIFAR100/algorithm/SGD.py",
    "chars": 13299,
    "preview": "import torch\nfrom torch.optim.optimizer import Optimizer, required\n\n\n\nclass SGD_GCC(Optimizer):\n\n    def __init__(self, "
  },
  {
    "path": "GC_code/CIFAR100/main.py",
    "chars": 7991,
    "preview": "'''Train CIFAR100 with PyTorch.'''\nfrom __future__ import print_function\n\nimport torch\nimport torch.nn as nn\nimport torc"
  },
  {
    "path": "GC_code/CIFAR100/models/__init__.py",
    "chars": 304,
    "preview": "from .vgg import *\nfrom .dpn import *\nfrom .lenet import *\nfrom .senet import *\nfrom .pnasnet import *\nfrom .densenet im"
  },
  {
    "path": "GC_code/CIFAR100/models/densenet.py",
    "chars": 3737,
    "preview": "'''DenseNet in PyTorch.'''\nimport math\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Bottl"
  },
  {
    "path": "GC_code/CIFAR100/models/dpn.py",
    "chars": 3562,
    "preview": "'''Dual Path Networks in PyTorch.'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Bottlene"
  },
  {
    "path": "GC_code/CIFAR100/models/googlenet.py",
    "chars": 3221,
    "preview": "'''GoogLeNet with PyTorch.'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Inception(nn.Mo"
  },
  {
    "path": "GC_code/CIFAR100/models/lenet.py",
    "chars": 699,
    "preview": "'''LeNet in PyTorch.'''\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nclass LeNet(nn.Module):\n    def __init__("
  },
  {
    "path": "GC_code/CIFAR100/models/mobilenet.py",
    "chars": 2025,
    "preview": "'''MobileNet in PyTorch.\n\nSee the paper \"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applicati"
  },
  {
    "path": "GC_code/CIFAR100/models/mobilenetv2.py",
    "chars": 3092,
    "preview": "'''MobileNetV2 in PyTorch.\n\nSee the paper \"Inverted Residuals and Linear Bottlenecks:\nMobile Networks for Classification"
  },
  {
    "path": "GC_code/CIFAR100/models/pnasnet.py",
    "chars": 4258,
    "preview": "'''PNASNet in PyTorch.\n\nPaper: Progressive Neural Architecture Search\n'''\nimport torch\nimport torch.nn as nn\nimport torc"
  },
  {
    "path": "GC_code/CIFAR100/models/preact_resnet.py",
    "chars": 4078,
    "preview": "'''Pre-activation ResNet in PyTorch.\n\nReference:\n[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun\n    Identity Mapp"
  },
  {
    "path": "GC_code/CIFAR100/models/resnet.py",
    "chars": 4195,
    "preview": "'''ResNet in PyTorch.\n\nFor Pre-activation ResNet, see 'preact_resnet.py'.\n\nReference:\n[1] Kaiming He, Xiangyu Zhang, Sha"
  },
  {
    "path": "GC_code/CIFAR100/models/resnext.py",
    "chars": 3630,
    "preview": "'''ResNeXt in PyTorch.\n\nSee the paper \"Aggregated Residual Transformations for Deep Neural Networks\" for more details.\n'"
  },
  {
    "path": "GC_code/CIFAR100/models/senet.py",
    "chars": 4027,
    "preview": "'''SENet in PyTorch.\n\nSENet is the winner of ImageNet-2017. The paper is not released yet.\n'''\nimport torch\nimport torch"
  },
  {
    "path": "GC_code/CIFAR100/models/shufflenet.py",
    "chars": 3551,
    "preview": "'''ShuffleNet in PyTorch.\n\nSee the paper \"ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Dev"
  },
  {
    "path": "GC_code/CIFAR100/models/vgg.py",
    "chars": 1467,
    "preview": "'''VGG11/13/16/19 in Pytorch.'''\nimport torch\nimport torch.nn as nn\n\n\ncfg = {\n    'VGG11': [64, 'M', 128, 'M', 256, 256,"
  },
  {
    "path": "GC_code/CIFAR100/os_run.py",
    "chars": 296,
    "preview": "\nimport os,time\n\n#cifar100 sgd & sgdGCC\n\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd --epochs 200  "
  },
  {
    "path": "GC_code/Fine-grained_classification/SGD.py",
    "chars": 10518,
    "preview": "import torch\nfrom torch.optim.optimizer import Optimizer, required\n\n\n\nclass SGD_GCC(Optimizer):\n\n    def __init__(self, "
  },
  {
    "path": "GC_code/Fine-grained_classification/main.py",
    "chars": 17597,
    "preview": "import argparse\nimport os\nimport random\nimport shutil\nimport time\nimport warnings\nimport sys\n\nimport torch\nimport torch."
  },
  {
    "path": "GC_code/Fine-grained_classification/os_run.py",
    "chars": 1322,
    "preview": "\nimport os,time\n\n\n\n\nos.system(\"nohup python -W ignore main.py /home/yonghw/data/data/CUB_200_2011/ --model r50p -b 128 -"
  },
  {
    "path": "GC_code/ImageNet/SGD.py",
    "chars": 10518,
    "preview": "import torch\nfrom torch.optim.optimizer import Optimizer, required\n\n\n\nclass SGD_GCC(Optimizer):\n\n    def __init__(self, "
  },
  {
    "path": "GC_code/ImageNet/main.py",
    "chars": 16397,
    "preview": "import argparse\nimport os\nimport random\nimport shutil\nimport time\nimport warnings\nimport sys\n#nohup python -W ignore mai"
  },
  {
    "path": "GC_code/ImageNet/myresnet.py",
    "chars": 7140,
    "preview": "from __future__ import print_function, division, absolute_import\r\nimport torch.nn as nn\r\nimport math\r\nimport torch.utils"
  },
  {
    "path": "GC_code/ImageNet/myresnetgn.py",
    "chars": 7147,
    "preview": "from __future__ import print_function, division, absolute_import\r\nimport torch.nn as nn\r\nimport math\r\nimport torch.utils"
  },
  {
    "path": "GC_code/ImageNet/os_run.py",
    "chars": 353,
    "preview": "\nimport os,time\n\n\nos.system(\"#nohup python -W ignore main.py /ssd/data/yonghw/Imagenet/v0/ --model r50bn --alg sgd -b 25"
  },
  {
    "path": "GC_code/Mini_ImageNet/SGD.py",
    "chars": 10518,
    "preview": "import torch\nfrom torch.optim.optimizer import Optimizer, required\n\n\n\nclass SGD_GCC(Optimizer):\n\n    def __init__(self, "
  },
  {
    "path": "GC_code/Mini_ImageNet/main.py",
    "chars": 15800,
    "preview": "import argparse\nimport os\nimport random\nimport shutil\nimport time\nimport warnings\nimport sys\nimport torch\nimport torch.n"
  },
  {
    "path": "GC_code/Mini_ImageNet/os_run.py",
    "chars": 702,
    "preview": "#cifar100 e200 bs128  gs  2,4,8,16\nimport os,time\n\n#print('runing mini_imagenet.py')\n\n\nos.system(\"nohup  python -W ignor"
  },
  {
    "path": "GC_code/Mini_ImageNet/resnet_ws.py",
    "chars": 7797,
    "preview": "import torch.nn as nn\r\nimport torch.utils.model_zoo as model_zoo\r\n\r\nimport torch\r\nimport torch.nn as nn\r\nfrom torch.nn.p"
  },
  {
    "path": "README.md",
    "chars": 11780,
    "preview": "# Gradient Centralization\n\n## [Gradient Centralization: A New Optimization Technique for Deep Neural Networks](https://a"
  },
  {
    "path": "algorithm-GC/README.md",
    "chars": 3444,
    "preview": "# Advanced-optimizer-with-Gradient-Centralization\nAdvanced optimizer with Gradient-Centralization\nPlease Refer to\n## [Gr"
  },
  {
    "path": "algorithm-GC/algorithm/Adam.py",
    "chars": 11078,
    "preview": "import math\nimport torch\nfrom torch.optim.optimizer import Optimizer\nfrom .Centralization import centralized_gradient\n\nc"
  },
  {
    "path": "algorithm-GC/algorithm/Centralization.py",
    "chars": 456,
    "preview": "import torch\n#from torch.optim.optimizer import Optimizer, required\n\n\n\ndef centralized_gradient(x,use_gc=True,gc_conv_on"
  },
  {
    "path": "algorithm-GC/algorithm/Lookahead.py",
    "chars": 2449,
    "preview": "from collections import defaultdict\nfrom itertools import chain\nfrom torch.optim import Optimizer\nimport torch\nimport wa"
  },
  {
    "path": "algorithm-GC/algorithm/RAdam.py",
    "chars": 8846,
    "preview": "import math\nimport torch\nfrom torch.optim.optimizer import Optimizer\nfrom .Centralization import centralized_gradient\n\n\n"
  },
  {
    "path": "algorithm-GC/algorithm/Ranger.py",
    "chars": 7457,
    "preview": "import math\nimport torch\nfrom torch.optim.optimizer import Optimizer\nfrom .Centralization import centralized_gradient\n\n\n"
  },
  {
    "path": "algorithm-GC/algorithm/SGD.py",
    "chars": 4718,
    "preview": "import torch\nfrom torch.optim.optimizer import Optimizer, required\n\nfrom .Centralization import centralized_gradient\n\ncl"
  },
  {
    "path": "algorithm-GC/cifar/main.py",
    "chars": 10414,
    "preview": "'''Train CIFAR100 with PyTorch.'''\nfrom __future__ import print_function\n\nimport torch\nimport torch.nn as nn\nimport torc"
  },
  {
    "path": "algorithm-GC/cifar/models/__init__.py",
    "chars": 304,
    "preview": "from .vgg import *\nfrom .dpn import *\nfrom .lenet import *\nfrom .senet import *\nfrom .pnasnet import *\nfrom .densenet im"
  },
  {
    "path": "algorithm-GC/cifar/models/densenet.py",
    "chars": 3542,
    "preview": "'''DenseNet in PyTorch.'''\nimport math\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Bottl"
  },
  {
    "path": "algorithm-GC/cifar/models/dpn.py",
    "chars": 3562,
    "preview": "'''Dual Path Networks in PyTorch.'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Bottlene"
  },
  {
    "path": "algorithm-GC/cifar/models/googlenet.py",
    "chars": 3221,
    "preview": "'''GoogLeNet with PyTorch.'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Inception(nn.Mo"
  },
  {
    "path": "algorithm-GC/cifar/models/lenet.py",
    "chars": 699,
    "preview": "'''LeNet in PyTorch.'''\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nclass LeNet(nn.Module):\n    def __init__("
  },
  {
    "path": "algorithm-GC/cifar/models/mobilenet.py",
    "chars": 2025,
    "preview": "'''MobileNet in PyTorch.\n\nSee the paper \"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applicati"
  },
  {
    "path": "algorithm-GC/cifar/models/mobilenetv2.py",
    "chars": 3092,
    "preview": "'''MobileNetV2 in PyTorch.\n\nSee the paper \"Inverted Residuals and Linear Bottlenecks:\nMobile Networks for Classification"
  },
  {
    "path": "algorithm-GC/cifar/models/pnasnet.py",
    "chars": 4258,
    "preview": "'''PNASNet in PyTorch.\n\nPaper: Progressive Neural Architecture Search\n'''\nimport torch\nimport torch.nn as nn\nimport torc"
  },
  {
    "path": "algorithm-GC/cifar/models/preact_resnet.py",
    "chars": 4078,
    "preview": "'''Pre-activation ResNet in PyTorch.\n\nReference:\n[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun\n    Identity Mapp"
  },
  {
    "path": "algorithm-GC/cifar/models/resnet.py",
    "chars": 4195,
    "preview": "'''ResNet in PyTorch.\n\nFor Pre-activation ResNet, see 'preact_resnet.py'.\n\nReference:\n[1] Kaiming He, Xiangyu Zhang, Sha"
  },
  {
    "path": "algorithm-GC/cifar/models/resnext.py",
    "chars": 3478,
    "preview": "'''ResNeXt in PyTorch.\n\nSee the paper \"Aggregated Residual Transformations for Deep Neural Networks\" for more details.\n'"
  },
  {
    "path": "algorithm-GC/cifar/models/senet.py",
    "chars": 4027,
    "preview": "'''SENet in PyTorch.\n\nSENet is the winner of ImageNet-2017. The paper is not released yet.\n'''\nimport torch\nimport torch"
  },
  {
    "path": "algorithm-GC/cifar/models/shufflenet.py",
    "chars": 3551,
    "preview": "'''ShuffleNet in PyTorch.\n\nSee the paper \"ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Dev"
  },
  {
    "path": "algorithm-GC/cifar/models/vgg.py",
    "chars": 1467,
    "preview": "'''VGG11/13/16/19 in Pytorch.'''\nimport torch\nimport torch.nn as nn\n\n\ncfg = {\n    'VGG11': [64, 'M', 128, 'M', 256, 256,"
  },
  {
    "path": "algorithm-GC/cifar/nohup.out",
    "chars": 1982,
    "preview": "Traceback (most recent call last):\n  File \"main.py\", line 281, in <module>\n    train_acc=train(epoch,net,optimizer)\n  Fi"
  },
  {
    "path": "algorithm-GC/cifar/os_run.py",
    "chars": 51263,
    "preview": "#cifar100 e200 bs128  gs  2,4,8,16\nimport os,time\n#############################\n#r18\n##############\n\n#### sgd \n#os.syste"
  },
  {
    "path": "algorithm-GC/cifar/os_run2.py",
    "chars": 29425,
    "preview": "#cifar100 e200 bs128  gs  2,4,8,16\nimport os,time\n\n\n#r50\n##############\n\n\n### adam \nos.system(\"nohup  python  main.py --"
  }
]

About this extraction

This page contains the full source code of the Yonghongwei/Gradient-Centralization GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 57 files (394.5 KB, approximately 115.9k tokens) and a symbol index of 523 extracted functions, classes, methods, constants, and types. Use it with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — a free GitHub-repository-to-text converter for AI. Built by Nikandr Surkov.

Copied to clipboard!