Repository: Yonghongwei/Gradient-Centralization
Branch: master
Commit: ed2a608ccdbb
Files: 57
Total size: 394.5 KB

Directory structure:
gitextract_l162dn_3/

├── GC_code/
│   ├── CIFAR100/
│   │   ├── algorithm/
│   │   │   ├── Adagrad.py
│   │   │   ├── Adam.py
│   │   │   └── SGD.py
│   │   ├── main.py
│   │   ├── models/
│   │   │   ├── __init__.py
│   │   │   ├── densenet.py
│   │   │   ├── dpn.py
│   │   │   ├── googlenet.py
│   │   │   ├── lenet.py
│   │   │   ├── mobilenet.py
│   │   │   ├── mobilenetv2.py
│   │   │   ├── pnasnet.py
│   │   │   ├── preact_resnet.py
│   │   │   ├── resnet.py
│   │   │   ├── resnext.py
│   │   │   ├── senet.py
│   │   │   ├── shufflenet.py
│   │   │   └── vgg.py
│   │   └── os_run.py
│   ├── Fine-grained_classification/
│   │   ├── SGD.py
│   │   ├── main.py
│   │   └── os_run.py
│   ├── ImageNet/
│   │   ├── SGD.py
│   │   ├── main.py
│   │   ├── myresnet.py
│   │   ├── myresnetgn.py
│   │   └── os_run.py
│   └── Mini_ImageNet/
│       ├── SGD.py
│       ├── main.py
│       ├── os_run.py
│       └── resnet_ws.py
├── README.md
└── algorithm-GC/
    ├── README.md
    ├── algorithm/
    │   ├── Adam.py
    │   ├── Centralization.py
    │   ├── Lookahead.py
    │   ├── RAdam.py
    │   ├── Ranger.py
    │   └── SGD.py
    └── cifar/
        ├── main.py
        ├── models/
        │   ├── __init__.py
        │   ├── densenet.py
        │   ├── dpn.py
        │   ├── googlenet.py
        │   ├── lenet.py
        │   ├── mobilenet.py
        │   ├── mobilenetv2.py
        │   ├── pnasnet.py
        │   ├── preact_resnet.py
        │   ├── resnet.py
        │   ├── resnext.py
        │   ├── senet.py
        │   ├── shufflenet.py
        │   └── vgg.py
        ├── nohup.out
        ├── os_run.py
        └── os_run2.py

================================================
FILE CONTENTS
================================================

================================================
FILE: GC_code/CIFAR100/algorithm/Adagrad.py
================================================
import torch
from torch.optim.optimizer import Optimizer


class Adagrad_GCC(Optimizer):
    """Implements Adagrad with Gradient Centralization (GC) for Conv layers.

    The base algorithm is Adagrad, proposed in `Adaptive Subgradient Methods
    for Online Learning and Stochastic Optimization`_.  Before the squared
    gradient is accumulated, every dense gradient with more than three
    dimensions has its mean over all dimensions except the first subtracted
    from it.  Gradients with three or fewer dimensions are used unchanged.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-2)
        lr_decay (float, optional): learning rate decay (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        initial_accumulator_value (float, optional): value the per-element
            sum of squared gradients starts from (default: 0)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-10)

    .. _Adaptive Subgradient Methods for Online Learning and Stochastic
        Optimization: http://jmlr.org/papers/v12/duchi11a.html
    """

    def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0, initial_accumulator_value=0, eps=1e-10):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= lr_decay:
            raise ValueError("Invalid lr_decay value: {}".format(lr_decay))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        if not 0.0 <= initial_accumulator_value:
            raise ValueError("Invalid initial_accumulator_value value: {}".format(initial_accumulator_value))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))

        defaults = dict(lr=lr, lr_decay=lr_decay, eps=eps, weight_decay=weight_decay,
                        initial_accumulator_value=initial_accumulator_value)
        super(Adagrad_GCC, self).__init__(params, defaults)

        # State is created eagerly so that share_memory() can be called
        # before the first step.
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = 0
                state['sum'] = torch.full_like(p.data, initial_accumulator_value)

    def share_memory(self):
        """Move every parameter's squared-gradient accumulator to shared memory."""
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['sum'].share_memory_()

    def step(self, closure=None):
        """Performs a single optimization step.

        ``p.grad`` is left untouched: weight decay and centralization are
        applied to a working copy of the gradient.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure is
            given.

        Raises:
            RuntimeError: if ``weight_decay`` is non-zero and a gradient is
                sparse.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue

                grad = p.grad.data
                state = self.state[p]

                state['step'] += 1

                if group['weight_decay'] != 0:
                    if grad.is_sparse:
                        raise RuntimeError("weight_decay option is not compatible with sparse gradients")
                    grad = grad.add(p.data, alpha=group['weight_decay'])

                # GC operation for Conv layers.  Done out of place so the
                # caller's p.grad is never rewritten; sparse gradients are
                # skipped because the dense mean/subtract does not apply.
                if grad.dim() > 3 and not grad.is_sparse:
                    grad = grad - grad.mean(dim=tuple(range(1, grad.dim())), keepdim=True)

                clr = group['lr'] / (1 + (state['step'] - 1) * group['lr_decay'])

                if grad.is_sparse:
                    grad = grad.coalesce()  # the update is non-linear so indices must be unique
                    grad_indices = grad._indices()
                    grad_values = grad._values()
                    size = grad.size()

                    def make_sparse(values):
                        constructor = grad.new
                        if grad_indices.dim() == 0 or values.dim() == 0:
                            return constructor().resize_as_(grad)
                        return constructor(grad_indices, values, size)
                    state['sum'].add_(make_sparse(grad_values.pow(2)))
                    std = state['sum'].sparse_mask(grad)
                    std_values = std._values().sqrt_().add_(group['eps'])
                    p.data.add_(make_sparse(grad_values / std_values), alpha=-clr)
                else:
                    state['sum'].addcmul_(grad, grad, value=1)
                    std = state['sum'].sqrt().add_(group['eps'])
                    p.data.addcdiv_(grad, std, value=-clr)

        return loss
    
class Adagrad_GC(Optimizer):
    """Implements Adagrad with Gradient Centralization (GC) for Conv and FC layers.

    The base algorithm is Adagrad, proposed in `Adaptive Subgradient Methods
    for Online Learning and Stochastic Optimization`_.  Before the squared
    gradient is accumulated, every dense gradient with more than one
    dimension has its mean over all dimensions except the first subtracted
    from it.  One-dimensional gradients and sparse gradients are used
    unchanged.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-2)
        lr_decay (float, optional): learning rate decay (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        initial_accumulator_value (float, optional): value the per-element
            sum of squared gradients starts from (default: 0)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-10)

    .. _Adaptive Subgradient Methods for Online Learning and Stochastic
        Optimization: http://jmlr.org/papers/v12/duchi11a.html
    """

    def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0, initial_accumulator_value=0, eps=1e-10):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= lr_decay:
            raise ValueError("Invalid lr_decay value: {}".format(lr_decay))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        if not 0.0 <= initial_accumulator_value:
            raise ValueError("Invalid initial_accumulator_value value: {}".format(initial_accumulator_value))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))

        defaults = dict(lr=lr, lr_decay=lr_decay, eps=eps, weight_decay=weight_decay,
                        initial_accumulator_value=initial_accumulator_value)
        super(Adagrad_GC, self).__init__(params, defaults)

        # State is created eagerly so that share_memory() can be called
        # before the first step.
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = 0
                state['sum'] = torch.full_like(p.data, initial_accumulator_value)

    def share_memory(self):
        """Move every parameter's squared-gradient accumulator to shared memory."""
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['sum'].share_memory_()

    def step(self, closure=None):
        """Performs a single optimization step.

        ``p.grad`` is left untouched: weight decay and centralization are
        applied to a working copy of the gradient.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure is
            given.

        Raises:
            RuntimeError: if ``weight_decay`` is non-zero and a gradient is
                sparse.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue

                grad = p.grad.data
                state = self.state[p]

                state['step'] += 1

                if group['weight_decay'] != 0:
                    if grad.is_sparse:
                        raise RuntimeError("weight_decay option is not compatible with sparse gradients")
                    grad = grad.add(p.data, alpha=group['weight_decay'])

                # GC operation for Conv and FC layers.  Done out of place so
                # the caller's p.grad is never rewritten.  Sparse gradients
                # (typically 2-D) are skipped so that they can still reach the
                # sparse update branch below.
                if grad.dim() > 1 and not grad.is_sparse:
                    grad = grad - grad.mean(dim=tuple(range(1, grad.dim())), keepdim=True)

                clr = group['lr'] / (1 + (state['step'] - 1) * group['lr_decay'])

                if grad.is_sparse:
                    grad = grad.coalesce()  # the update is non-linear so indices must be unique
                    grad_indices = grad._indices()
                    grad_values = grad._values()
                    size = grad.size()

                    def make_sparse(values):
                        constructor = grad.new
                        if grad_indices.dim() == 0 or values.dim() == 0:
                            return constructor().resize_as_(grad)
                        return constructor(grad_indices, values, size)
                    state['sum'].add_(make_sparse(grad_values.pow(2)))
                    std = state['sum'].sparse_mask(grad)
                    std_values = std._values().sqrt_().add_(group['eps'])
                    p.data.add_(make_sparse(grad_values / std_values), alpha=-clr)
                else:
                    state['sum'].addcmul_(grad, grad, value=1)
                    std = state['sum'].sqrt().add_(group['eps'])
                    p.data.addcdiv_(grad, std, value=-clr)

        return loss
    

================================================
FILE: GC_code/CIFAR100/algorithm/Adam.py
================================================
import math
import torch
from torch.optim.optimizer import Optimizer

class Adam_GCC(Optimizer):
    r"""Implements Adam with Gradient Centralization (GC) for Conv layers.

    Before the moment estimates are updated, every gradient with more than
    three dimensions has its mean over all dimensions except the first
    subtracted from it.  Gradients with three or fewer dimensions are used
    unchanged.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to normalize by the running
            maximum of the second-moment estimate (default: False)
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(Adam_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Adam_GCC, self).__setstate__(state)
        # Restored groups that lack the 'amsgrad' key get it set to False.
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        ``p.grad`` is left untouched: weight decay and centralization are
        applied to a working copy of the gradient.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure is
            given.

        Raises:
            RuntimeError: if a gradient is sparse.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                # Out of place: an in-place add would leave the decay term in
                # p.grad and re-apply it if the gradient were reused.
                if group['weight_decay'] != 0:
                    grad = grad.add(p.data, alpha=group['weight_decay'])

                # GC operation for Conv layers
                if grad.dim() > 3:
                    grad = grad - grad.mean(dim=tuple(range(1, grad.dim())), keepdim=True)

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                step_size = group['lr'] / bias_correction1

                p.data.addcdiv_(exp_avg, denom, value=-step_size)

        return loss

class Adam_GCC2(Optimizer):
    r"""Implements Adam with centralization of the update for Conv layers.

    The moment estimates are computed from the (optionally weight-decayed)
    gradient as in Adam.  For parameters with more than three dimensions the
    resulting update ``step_size * exp_avg / denom`` has its mean over all
    dimensions except the first subtracted before it is applied.  Other
    parameters receive the plain Adam update.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to normalize by the running
            maximum of the second-moment estimate (default: False)
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(Adam_GCC2, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Adam_GCC2, self).__setstate__(state)
        # Restored groups that lack the 'amsgrad' key get it set to False.
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        ``p.grad`` is left untouched: weight decay is applied to a working
        copy of the gradient.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure is
            given.

        Raises:
            RuntimeError: if a gradient is sparse.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                # Out of place: an in-place add would leave the decay term in
                # p.grad and re-apply it if the gradient were reused.
                if group['weight_decay'] != 0:
                    grad = grad.add(p.data, alpha=group['weight_decay'])

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                step_size = group['lr'] / bias_correction1

                # GC operation for Conv layers, applied to the update itself
                if grad.dim() > 3:
                    # The product/quotient already yields a fresh tensor, so
                    # it can be centred in place without an extra copy.
                    delta = exp_avg.mul(step_size).div_(denom)
                    delta.sub_(delta.mean(dim=tuple(range(1, delta.dim())), keepdim=True))
                    p.data.sub_(delta)
                else:
                    p.data.addcdiv_(exp_avg, denom, value=-step_size)
        return loss

class Adam_GC(Optimizer):
    r"""Implements Adam with Gradient Centralization (GC) for Conv and FC layers.

    The base algorithm was proposed in `Adam: A Method for Stochastic
    Optimization`_.  Before the moment estimates are updated, every gradient
    with more than one dimension has its mean over all dimensions except the
    first subtracted from it.  One-dimensional gradients are used unchanged.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(Adam_GC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Adam_GC, self).__setstate__(state)
        # Restored groups that lack the 'amsgrad' key get it set to False.
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        ``p.grad`` is left untouched: weight decay and centralization are
        applied to a working copy of the gradient.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure is
            given.

        Raises:
            RuntimeError: if a gradient is sparse.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                # Out of place: an in-place add would leave the decay term in
                # p.grad and re-apply it if the gradient were reused.
                if group['weight_decay'] != 0:
                    grad = grad.add(p.data, alpha=group['weight_decay'])

                # GC operation for Conv layers and FC layers
                if grad.dim() > 1:
                    grad = grad - grad.mean(dim=tuple(range(1, grad.dim())), keepdim=True)

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                step_size = group['lr'] / bias_correction1

                p.data.addcdiv_(exp_avg, denom, value=-step_size)

        return loss


class Adam_GC2(Optimizer):
    r"""Implements Adam with centralization of the update for Conv and FC layers.

    The moment estimates are computed from the (optionally weight-decayed)
    gradient as in Adam.  For parameters with more than one dimension the
    resulting update ``step_size * exp_avg / denom`` has its mean over all
    dimensions except the first subtracted before it is applied.
    One-dimensional parameters receive the plain Adam update.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to normalize by the running
            maximum of the second-moment estimate (default: False)
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(Adam_GC2, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Adam_GC2, self).__setstate__(state)
        # Restored groups that lack the 'amsgrad' key get it set to False.
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        ``p.grad`` is left untouched: weight decay is applied to a working
        copy of the gradient.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure is
            given.

        Raises:
            RuntimeError: if a gradient is sparse.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                # Out of place: an in-place add would leave the decay term in
                # p.grad and re-apply it if the gradient were reused.
                if group['weight_decay'] != 0:
                    grad = grad.add(p.data, alpha=group['weight_decay'])

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                step_size = group['lr'] / bias_correction1

                # GC operation for Conv layers and FC layers, applied to the
                # update itself
                if grad.dim() > 1:
                    # The product/quotient already yields a fresh tensor, so
                    # it can be centred in place without an extra copy.
                    delta = exp_avg.mul(step_size).div_(denom)
                    delta.sub_(delta.mean(dim=tuple(range(1, delta.dim())), keepdim=True))
                    p.data.sub_(delta)
                else:
                    p.data.addcdiv_(exp_avg, denom, value=-step_size)
        return loss

class AdamW(Optimizer):
    r"""Implements Adam with decoupled weight decay.

    The moment estimates follow `Adam: A Method for Stochastic Optimization`_
    and are computed from the raw gradient.  Weight decay is not added to the
    gradient; instead ``weight_decay * p`` is added to the Adam direction
    ``exp_avg / denom`` and the sum is scaled by the bias-corrected step size
    ``lr * sqrt(1 - beta2 ** t) / (1 - beta1 ** t)``.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): decoupled weight decay coefficient
            (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(AdamW, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(AdamW, self).__setstate__(state)
        # Restored groups that lack the 'amsgrad' key get it set to False.
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure is
            given.

        Raises:
            RuntimeError: if a gradient is sparse.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                # Decoupled weight decay: the decay term joins the Adam
                # direction and is scaled by the same bias-corrected step.
                update = torch.mul(p.data, group['weight_decay']).addcdiv_(exp_avg, denom, value=1)
                p.data.add_(update, alpha=-step_size)

        return loss


class AdamW_GCC(Optimizer):
    r"""Implements AdamW with Gradient Centralization (GC) for conv layers.

    The moment estimates follow `Adam: A Method for Stochastic Optimization`_.
    Weight decay is not added to the gradient; instead ``weight_decay * p``
    is added to the normalized Adam direction, so it is scaled by the same
    bias-corrected step size as the rest of the update.

    Before the moments are updated, every gradient with more than three
    dimensions (conv kernels, per the original comment) is centralized: its
    mean over all dimensions except the first is subtracted. The centralized
    gradient is a new tensor, so ``p.grad`` is left untouched.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay coefficient applied to
            the parameters inside the update (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_

    Raises:
        ValueError: if ``lr``, ``eps`` or ``weight_decay`` is negative, or a
            beta lies outside ``[0, 1)``.

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        # Same check (and message) as the SGD-based optimizers of this project.
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(AdamW_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restores pickled state, defaulting ``amsgrad`` to ``False`` for
        parameter groups that do not carry the key."""
        super(AdamW_GCC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` if no closure was
            given.

        Raises:
            RuntimeError: if a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            amsgrad = group['amsgrad']
            beta1, beta2 = group['betas']

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']

                # GC operation for Conv layers. Computed out of place so the
                # gradient stored on the parameter is not overwritten.
                if grad.dim() > 3:
                    grad = grad - grad.mean(dim=tuple(range(1, grad.dim())), keepdim=True)

                state['step'] += 1

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                # Weight decay enters here (on the parameters), not through
                # the gradient: update = weight_decay * p + exp_avg / denom.
                update = torch.mul(p.data, group['weight_decay']).addcdiv_(exp_avg, denom)
                p.data.add_(update, alpha=-step_size)

        return loss
 
class AdamW_GC(Optimizer):
    r"""Implements AdamW with Gradient Centralization (GC) for conv and FC layers.

    The moment estimates follow `Adam: A Method for Stochastic Optimization`_.
    Weight decay is not added to the gradient; instead ``weight_decay * p``
    is added to the normalized Adam direction, so it is scaled by the same
    bias-corrected step size as the rest of the update.

    Before the moments are updated, every gradient with more than one
    dimension (conv and FC weights, per the original comment) is centralized:
    its mean over all dimensions except the first is subtracted. The
    centralized gradient is a new tensor, so ``p.grad`` is left untouched.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay coefficient applied to
            the parameters inside the update (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_

    Raises:
        ValueError: if ``lr``, ``eps`` or ``weight_decay`` is negative, or a
            beta lies outside ``[0, 1)``.

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        # Same check (and message) as the SGD-based optimizers of this project.
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(AdamW_GC, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restores pickled state, defaulting ``amsgrad`` to ``False`` for
        parameter groups that do not carry the key."""
        super(AdamW_GC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` if no closure was
            given.

        Raises:
            RuntimeError: if a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            amsgrad = group['amsgrad']
            beta1, beta2 = group['betas']

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']

                # GC operation for Conv and FC layers. Computed out of place
                # so the gradient stored on the parameter is not overwritten.
                if grad.dim() > 1:
                    grad = grad - grad.mean(dim=tuple(range(1, grad.dim())), keepdim=True)

                state['step'] += 1

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                # Weight decay enters here (on the parameters), not through
                # the gradient: update = weight_decay * p + exp_avg / denom.
                update = torch.mul(p.data, group['weight_decay']).addcdiv_(exp_avg, denom)
                p.data.add_(update, alpha=-step_size)

        return loss

class AdamW_GCC2(Optimizer):
    r"""Implements AdamW with centralization of the conv-layer update.

    The moment estimates follow `Adam: A Method for Stochastic Optimization`_
    and are computed from the raw gradient. Weight decay is not added to the
    gradient; instead ``weight_decay * p`` is added to the normalized Adam
    direction.

    For parameters with more than three dimensions (conv kernels, per the
    original comment) the final step ``step_size * (weight_decay * p +
    exp_avg / denom)`` is centralized before being applied: its mean over
    all dimensions except the first is subtracted. Other parameters receive
    the step unchanged.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay coefficient applied to
            the parameters inside the update (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_

    Raises:
        ValueError: if ``lr``, ``eps`` or ``weight_decay`` is negative, or a
            beta lies outside ``[0, 1)``.

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        # Same check (and message) as the SGD-based optimizers of this project.
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(AdamW_GCC2, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restores pickled state, defaulting ``amsgrad`` to ``False`` for
        parameter groups that do not carry the key."""
        super(AdamW_GCC2, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` if no closure was
            given.

        Raises:
            RuntimeError: if a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            amsgrad = group['amsgrad']
            beta1, beta2 = group['betas']

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']

                state['step'] += 1

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                # Weight decay enters here (on the parameters), not through
                # the gradient: update = weight_decay * p + exp_avg / denom.
                update = torch.mul(p.data, group['weight_decay']).addcdiv_(exp_avg, denom)

                # GC operation for Conv layers, applied to the final step
                # rather than to the gradient.
                if grad.dim() > 3:
                    # `update` is a fresh tensor, so it can be scaled and
                    # centralized in place without an extra copy.
                    delta = update.mul_(step_size)
                    delta.sub_(delta.mean(dim=tuple(range(1, delta.dim())), keepdim=True))
                    p.data.sub_(delta)
                else:
                    p.data.add_(update, alpha=-step_size)

        return loss

class AdamW_GC2(Optimizer):
    r"""Implements AdamW with centralization of the conv and FC layer update.

    The moment estimates follow `Adam: A Method for Stochastic Optimization`_
    and are computed from the raw gradient. Weight decay is not added to the
    gradient; instead ``weight_decay * p`` is added to the normalized Adam
    direction.

    For parameters with more than one dimension (conv and FC weights, per
    the original comment) the final step ``step_size * (weight_decay * p +
    exp_avg / denom)`` is centralized before being applied: its mean over
    all dimensions except the first is subtracted. Other parameters receive
    the step unchanged.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay coefficient applied to
            the parameters inside the update (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_

    Raises:
        ValueError: if ``lr``, ``eps`` or ``weight_decay`` is negative, or a
            beta lies outside ``[0, 1)``.

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        # Same check (and message) as the SGD-based optimizers of this project.
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(AdamW_GC2, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restores pickled state, defaulting ``amsgrad`` to ``False`` for
        parameter groups that do not carry the key."""
        super(AdamW_GC2, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` if no closure was
            given.

        Raises:
            RuntimeError: if a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            amsgrad = group['amsgrad']
            beta1, beta2 = group['betas']

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']

                state['step'] += 1

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                # Weight decay enters here (on the parameters), not through
                # the gradient: update = weight_decay * p + exp_avg / denom.
                update = torch.mul(p.data, group['weight_decay']).addcdiv_(exp_avg, denom)

                # GC operation for Conv and FC layers, applied to the final
                # step rather than to the gradient.
                if grad.dim() > 1:
                    # `update` is a fresh tensor, so it can be scaled and
                    # centralized in place without an extra copy.
                    delta = update.mul_(step_size)
                    delta.sub_(delta.mean(dim=tuple(range(1, delta.dim())), keepdim=True))
                    p.data.sub_(delta)
                else:
                    p.data.add_(update, alpha=-step_size)

        return loss


================================================
FILE: GC_code/CIFAR100/algorithm/SGD.py
================================================
import torch
from torch.optim.optimizer import Optimizer, required


class SGD_GCC(Optimizer):
    """Implements SGD with Gradient Centralization (GC) for conv layers.

    For each parameter the L2 term ``weight_decay * p`` is first added to the
    gradient. If the result has more than three dimensions (conv kernels,
    per the original comment) it is then centralized by subtracting its mean
    over all dimensions except the first. Momentum, dampening and Nesterov
    handling are applied to that tensor.

    Both the weight-decay term and the centralization are computed out of
    place, so ``p.grad`` is left untouched by ``step``.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float): learning rate
        momentum (float, optional): momentum factor (default: 0)
        dampening (float, optional): dampening for momentum (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        nesterov (bool, optional): enables Nesterov momentum (default: False)

    Raises:
        ValueError: if ``lr``, ``momentum`` or ``weight_decay`` is negative,
            or if ``nesterov`` is requested without a positive momentum and
            zero dampening.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restores pickled state, defaulting ``nesterov`` to ``False`` for
        parameter groups that do not carry the key."""
        super(SGD_GCC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` if no closure was
            given.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            lr = group['lr']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # Out of place: an in-place add here would leave the
                # weight-decay term baked into p.grad after the step.
                if weight_decay != 0:
                    d_p = d_p.add(p.data, alpha=weight_decay)

                # GC operation for Conv layers (also out of place).
                if d_p.dim() > 3:
                    d_p = d_p - d_p.mean(dim=tuple(range(1, d_p.dim())), keepdim=True)

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: the buffer starts as a copy of the
                        # update; dampening is not applied here.
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-lr)

        return loss

class SGD_GC(Optimizer):
    """Implements SGD with Gradient Centralization (GC) for conv and FC layers.

    For each parameter the L2 term ``weight_decay * p`` is first added to the
    gradient. If the result has more than one dimension (conv and FC
    weights, per the original comment) it is then centralized by subtracting
    its mean over all dimensions except the first. Momentum, dampening and
    Nesterov handling are applied to that tensor.

    Both the weight-decay term and the centralization are computed out of
    place, so ``p.grad`` is left untouched by ``step``.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float): learning rate
        momentum (float, optional): momentum factor (default: 0)
        dampening (float, optional): dampening for momentum (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        nesterov (bool, optional): enables Nesterov momentum (default: False)

    Raises:
        ValueError: if ``lr``, ``momentum`` or ``weight_decay`` is negative,
            or if ``nesterov`` is requested without a positive momentum and
            zero dampening.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GC, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restores pickled state, defaulting ``nesterov`` to ``False`` for
        parameter groups that do not carry the key."""
        super(SGD_GC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` if no closure was
            given.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            lr = group['lr']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # Out of place: an in-place add here would leave the
                # weight-decay term baked into p.grad after the step.
                if weight_decay != 0:
                    d_p = d_p.add(p.data, alpha=weight_decay)

                # GC operation for Conv layers and FC layers (also out of place).
                if d_p.dim() > 1:
                    d_p = d_p - d_p.mean(dim=tuple(range(1, d_p.dim())), keepdim=True)

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: the buffer starts as a copy of the
                        # update; dampening is not applied here.
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-lr)

        return loss


class SGDW(Optimizer):
    """Implements SGD with momentum and weight decay applied to the weights.

    The gradient is not modified by weight decay. Each step first applies
    the (momentum) gradient update ``p -= lr * d_p`` and then subtracts
    ``weight_decay * lr * p_old``, where ``p_old`` is the parameter value
    from before the step.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float): learning rate
        momentum (float, optional): momentum factor (default: 0)
        dampening (float, optional): dampening for momentum (default: 0)
        weight_decay (float, optional): weight decay coefficient (default: 0)
        nesterov (bool, optional): enables Nesterov momentum (default: False)

    Raises:
        ValueError: if ``lr``, ``momentum`` or ``weight_decay`` is negative,
            or if ``nesterov`` is requested without a positive momentum and
            zero dampening.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restores pickled state, defaulting ``nesterov`` to ``False`` for
        parameter groups that do not carry the key."""
        super(SGDW, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` if no closure was
            given.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            lr = group['lr']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # The decay term uses the pre-step weights, so snapshot them
                # before the gradient update; skipped when decay is off.
                old = torch.clone(p.data).detach() if weight_decay != 0 else None

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: the buffer starts as a copy of the
                        # gradient; dampening is not applied here.
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-lr)

                if old is not None:
                    p.data.add_(old, alpha=-weight_decay * lr)

        return loss


class SGDW_GCC(Optimizer):
    """Implements SGDW with Gradient Centralization (GC) for conv layers.

    Gradients with more than three dimensions (conv kernels, per the
    original comment) are centralized by subtracting their mean over all
    dimensions except the first; the centralization is computed out of
    place, so ``p.grad`` is left untouched. Weight decay does not enter the
    gradient: each step applies the (momentum) update ``p -= lr * d_p`` and
    then subtracts ``weight_decay * lr * p_old``, where ``p_old`` is the
    parameter value from before the step.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float): learning rate
        momentum (float, optional): momentum factor (default: 0)
        dampening (float, optional): dampening for momentum (default: 0)
        weight_decay (float, optional): weight decay coefficient (default: 0)
        nesterov (bool, optional): enables Nesterov momentum (default: False)

    Raises:
        ValueError: if ``lr``, ``momentum`` or ``weight_decay`` is negative,
            or if ``nesterov`` is requested without a positive momentum and
            zero dampening.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restores pickled state, defaulting ``nesterov`` to ``False`` for
        parameter groups that do not carry the key."""
        super(SGDW_GCC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` if no closure was
            given.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            lr = group['lr']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # The decay term uses the pre-step weights, so snapshot them
                # before the gradient update; skipped when decay is off.
                old = torch.clone(p.data).detach() if weight_decay != 0 else None

                # GC operation for Conv layers. Computed out of place so the
                # gradient stored on the parameter is not overwritten.
                if d_p.dim() > 3:
                    d_p = d_p - d_p.mean(dim=tuple(range(1, d_p.dim())), keepdim=True)

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: the buffer starts as a copy of the
                        # update; dampening is not applied here.
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-lr)

                if old is not None:
                    p.data.add_(old, alpha=-weight_decay * lr)

        return loss

    
class SGDW_GC(Optimizer):
    """SGD with momentum, decoupled weight decay and Gradient Centralization.

    Gradients with more than one dimension are centralized (their mean over
    all dimensions except the first is subtracted) before the momentum update.
    Weight decay is applied directly to the weights, scaled by the learning
    rate, instead of being added to the gradient.

    Arguments:
        params: iterable of parameters or dicts defining parameter groups.
        lr (float): learning rate (required).
        momentum (float): momentum factor (default: 0).
        dampening (float): dampening for momentum (default: 0).
        weight_decay (float): decoupled weight decay coefficient (default: 0).
        nesterov (bool): enables Nesterov momentum (default: False).

    Raises:
        ValueError: on a negative ``lr``, ``momentum`` or ``weight_decay``, or
            when ``nesterov`` is set without momentum or with dampening.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW_GC, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restore state and default ``'nesterov'`` to False where missing."""
        super(SGDW_GC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Note that centralization is done in place, so ``p.grad`` of every
        multi-dimensional parameter is modified by this call.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` without a closure.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            lr = group['lr']
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # Keep the pre-update weights only when the decoupled decay
                # term needs them; avoids a full copy per step otherwise.
                old = p.data.clone() if weight_decay != 0 else None

                #GC operation for Conv and FC layers
                if d_p.dim() > 1:
                    d_p.sub_(d_p.mean(dim=tuple(range(1, d_p.dim())), keepdim=True))

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: the buffer starts as a copy of the gradient.
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        # Keyword ``alpha`` replaces the deprecated
                        # add_(Number, Tensor) overload.
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-lr)

                if old is not None:
                    p.data.add_(old, alpha=-weight_decay * lr)

        return loss


================================================
FILE: GC_code/CIFAR100/main.py
================================================
'''Train CIFAR100 with PyTorch.'''
from __future__ import print_function

import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn


import torch.optim as optim
import torch.nn.functional as F

import torchvision
import torchvision.transforms as transforms


from torch.optim import lr_scheduler
import os
import argparse
from torchvision import datasets, models
from models import *
#from utils import progress_bar
import numpy as np

#import optimizers with GC
from algorithm.SGD import *
from algorithm.Adam import *
from algorithm.Adagrad import *


# Command-line options. --lr is the base rate: the Adam variants below use
# lr*0.01 and the Adagrad variants lr*0.1. --path is parsed but not referenced
# anywhere else in this script.
parser = argparse.ArgumentParser(description='PyTorch CIFAR100 Training')
parser.add_argument('--lr', default=0.1, type=float, help='learning rate')
parser.add_argument('--resume', '-r', action='store_true', help='resume from checkpoint')
parser.add_argument('--bs', default=128, type=int, help='batchsize')
parser.add_argument('--wd', default=0.0005, type=float, help='weight decay')
parser.add_argument('--alg', default='sgd', type=str, help='algorithm')
parser.add_argument('--epochs', default=200, type=int, help='epochs')
parser.add_argument('--path', default='logout/result', type=str, help='path')
parser.add_argument('--model', default='r50', type=str, help='model')


args = parser.parse_args()
# Restrict the process to the first GPU.
os.environ["CUDA_VISIBLE_DEVICES"]="0"


epochs=args.epochs
device = 'cuda' if torch.cuda.is_available() else 'cpu'
best_acc = 0  # best test accuracy
start_epoch = 0  # start from epoch 0 or last checkpoint epoch


# Data
print('==> Preparing data..')
# Training images get random crop (with 4px padding) and horizontal flip
# augmentation; test images are only converted and normalized. Both use the
# same per-channel mean/std constants.
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.507, 0.487, 0.441), (0.267, 0.256, 0.276)),
  ])
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.507, 0.487, 0.441), (0.267, 0.256, 0.276)),
  ])
# NOTE(review): the dataset root is a hard-coded absolute path; the data is
# downloaded there if missing.
trainset = torchvision.datasets.CIFAR100(root='/home/yonghw/data/cifar100/', train=True, download=True, transform=transform_train)
# drop_last=True discards the final incomplete training batch each epoch.
trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.bs, shuffle=True, num_workers=4,drop_last=True)
testset = torchvision.datasets.CIFAR100(root='/home/yonghw/data/cifar100/', train=False, download=True, transform=transform_test)
# The test batch size is fixed at 128 and does not follow --bs.
testloader = torch.utils.data.DataLoader(testset, batch_size=128, shuffle=False, num_workers=4)


# Model
print('==> Building model..')

Num_classes = 100

# Exactly one architecture is built. An unrecognised --model now fails here
# with a clear message instead of a NameError on `net` further down.
if args.model=='r18':
    net = ResNet18(Num_classes=Num_classes)
elif args.model=='r34':
    net = ResNet34(Num_classes=Num_classes)
elif args.model=='r50':
    net = ResNet50(Num_classes=Num_classes)
elif args.model=='r101':
    net = ResNet101(Num_classes=Num_classes)
elif args.model=='v11':
    net = VGG('VGG11',Num_classes=Num_classes)
elif args.model=='rx29':
    net = ResNeXt29_4x64d(Num_classes=Num_classes)
elif args.model=='d121':
    net = DenseNet121(Num_classes=Num_classes)
else:
    raise ValueError(
        "Unknown --model '{}'; expected one of: "
        "r18, r34, r50, r101, v11, rx29, d121".format(args.model))

if device == 'cuda':
    net = net.cuda()
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True


if args.resume:
    # Load checkpoint.
    print('==> Resuming from checkpoint..')
    assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!'
    # map_location lets a checkpoint written on a GPU be restored on a
    # CPU-only host (and vice versa).
    checkpoint = torch.load('./checkpoint/ckpt.t7', map_location=device)
    net.load_state_dict(checkpoint['net'])
    best_acc = checkpoint['acc']
    start_epoch = checkpoint['epoch']
    
criterion = nn.CrossEntropyLoss()

#optimizer
WD=args.wd
print('==> choose optimizer..')
# Exactly one optimizer is built. An unrecognised --alg now fails here with a
# clear message instead of a NameError on `optimizer` at the scheduler below.
# Learning-rate scaling: SGD family uses --lr, Adam family --lr*0.01,
# Adagrad family --lr*0.1.
if args.alg=='sgd':
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD)
elif args.alg=='sgdGC':
    optimizer = SGD_GC(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD)
elif args.alg=='sgdGCC':
    optimizer = SGD_GCC(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD)

elif args.alg=='adam':
    optimizer = optim.Adam(net.parameters(), lr=args.lr*0.01, weight_decay = WD)
elif args.alg=='adamGC':
    optimizer = Adam_GC(net.parameters(), lr=args.lr*0.01, weight_decay = WD)
elif args.alg=='adamGCC':
    optimizer = Adam_GCC(net.parameters(), lr=args.lr*0.01, weight_decay = WD)
elif args.alg=='adamGC2':
    optimizer = Adam_GC2(net.parameters(), lr=args.lr*0.01, weight_decay = WD)
elif args.alg=='adamGCC2':
    optimizer = Adam_GCC2(net.parameters(), lr=args.lr*0.01, weight_decay = WD)

elif args.alg=='adagrad':
    optimizer = optim.Adagrad(net.parameters(), lr=args.lr*0.1,weight_decay = WD)
elif args.alg=='adagradGC':
    optimizer = Adagrad_GC(net.parameters(), lr=args.lr*0.1,weight_decay = WD)
elif args.alg=='adagradGCC':
    optimizer = Adagrad_GCC(net.parameters(), lr=args.lr*0.1,weight_decay = WD)
elif args.alg=='adagradGC2':
    optimizer = Adagrad_GC2(net.parameters(), lr=args.lr*0.1,weight_decay = WD)
elif args.alg=='adagradGCC2':
    optimizer = Adagrad_GCC2(net.parameters(), lr=args.lr*0.1,weight_decay = WD)

elif args.alg=='sgdW':
    optimizer = SGDW(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD)
elif args.alg=='sgdWGC':
    optimizer = SGDW_GC(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD)
elif args.alg=='sgdWGCC':
    optimizer = SGDW_GCC(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD)

elif args.alg=='adamW':
    optimizer = AdamW(net.parameters(), lr=args.lr*0.01, weight_decay = WD)
# NOTE(review): the four adamW*GC* options below build the same Adam_* classes
# as the adam*GC* options above; confirm whether AdamW-based GC optimizers
# were intended here.
elif args.alg=='adamWGC':
    optimizer = Adam_GC(net.parameters(), lr=args.lr*0.01, weight_decay = WD)
elif args.alg=='adamWGCC':
    optimizer = Adam_GCC(net.parameters(), lr=args.lr*0.01, weight_decay = WD)
elif args.alg=='adamWGC2':
    optimizer = Adam_GC2(net.parameters(), lr=args.lr*0.01, weight_decay = WD)
elif args.alg=='adamWGCC2':
    optimizer = Adam_GCC2(net.parameters(), lr=args.lr*0.01, weight_decay = WD)
else:
    raise ValueError("Unknown --alg '{}'".format(args.alg))


# Multiply the learning rate by 0.1 every 60 scheduler steps (one step per epoch).
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=60, gamma=0.1)

# Training
def train(epoch,net,optimizer):
    """Run one training pass over the module-level ``trainloader``.

    Uses the module-level ``device`` and ``criterion``. ``epoch`` is only
    used for the progress message. Prints the mean batch loss and the
    accuracy as a fraction, and returns the accuracy as a percentage.
    """
    print('\nEpoch: %d' % epoch)
    net.train()
    loss_sum, n_correct, n_seen = 0, 0, 0
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Standard forward / backward / update cycle.
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # Running statistics for the epoch summary.
        loss_sum += loss.item()
        predicted = outputs.max(1)[1]
        n_seen += targets.size(0)
        n_correct += predicted.eq(targets).sum().item()
    print('Training: Loss: {:.4f} | Acc: {:.4f}'.format(loss_sum/(batch_idx+1),n_correct/n_seen))
    return 100.*n_correct/n_seen
    
# Testing
def test(epoch,net):
    """Evaluate ``net`` on the module-level ``testloader`` without gradients.

    Prints the mean batch loss and the accuracy as a fraction. When the
    accuracy (in percent) beats the module-level ``best_acc``, the model
    state, accuracy and epoch are written to ``./checkpoint/ckpt.t7`` and
    ``best_acc`` is updated. Returns the accuracy as a percentage.
    """
    global best_acc
    net.eval()
    loss_sum, n_correct, n_seen = 0, 0, 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs = inputs.to(device)
            targets = targets.to(device)
            outputs = net(inputs)

            loss_sum += criterion(outputs, targets).item()
            predicted = outputs.max(1)[1]
            n_seen += targets.size(0)
            n_correct += predicted.eq(targets).sum().item()
    print('Testing:Loss: {:.4f} | Acc: {:.4f}'.format(loss_sum/(batch_idx+1),n_correct/n_seen) )

    acc = 100.*n_correct/n_seen
    if not acc > best_acc:
        return acc

    # Save checkpoint.
    print('Saving..')
    snapshot = {'net': net.state_dict(), 'acc': acc, 'epoch': epoch}
    if not os.path.isdir('checkpoint'):
        os.mkdir('checkpoint')
    torch.save(snapshot, './checkpoint/ckpt.t7')
    best_acc = acc
    return acc


# Main loop: one training pass, one scheduler step and one evaluation per
# epoch. The range always covers `epochs` iterations starting at start_epoch.
for epoch in range(start_epoch, start_epoch+epochs):
    train_acc=train(epoch,net,optimizer)
    exp_lr_scheduler.step()
    val_acc=test(epoch,net)


================================================
FILE: GC_code/CIFAR100/models/__init__.py
================================================
from .vgg import *
from .dpn import *
from .lenet import *
from .senet import *
from .pnasnet import *
from .densenet import *
from .googlenet import *
from .shufflenet import *
from .resnet import *
from .resnext import *
from .preact_resnet import *
from .mobilenet import *
from .mobilenetv2 import *


================================================
FILE: GC_code/CIFAR100/models/densenet.py
================================================
'''DenseNet in PyTorch.'''
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


class Bottleneck(nn.Module):
    """BN-ReLU-Conv1x1 followed by BN-ReLU-Conv3x3.

    The ``growth_rate`` new feature maps are concatenated in front of the
    input along the channel dimension.
    """
    def __init__(self, in_planes, growth_rate):
        super(Bottleneck, self).__init__()
        inner_planes = 4*growth_rate
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, inner_planes, kernel_size=1, bias=False)
        self.bn2 = nn.BatchNorm2d(inner_planes)
        self.conv2 = nn.Conv2d(inner_planes, growth_rate, kernel_size=3, padding=1, bias=False)

    def forward(self, x):
        squeezed = self.conv1(F.relu(self.bn1(x)))
        new_features = self.conv2(F.relu(self.bn2(squeezed)))
        return torch.cat((new_features, x), 1)


class Transition(nn.Module):
    """BN-ReLU-Conv1x1 changing the channel count, then 2x2 average pooling."""
    def __init__(self, in_planes, out_planes):
        super(Transition, self).__init__()
        self.bn = nn.BatchNorm2d(in_planes)
        self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=False)

    def forward(self, x):
        activated = F.relu(self.bn(x))
        projected = self.conv(activated)
        return F.avg_pool2d(projected, 2)


class DenseNet(nn.Module):
    """Four dense stages separated by three channel-reducing transitions.

    ``block`` is called as ``block(in_planes, growth_rate)`` and is expected
    to add ``growth_rate`` channels; ``nblocks`` gives the number of blocks
    in each of the four stages; ``reduction`` scales the channel count at
    every transition.
    """
    def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10):
        super(DenseNet, self).__init__()
        self.growth_rate = growth_rate

        width = 2*growth_rate
        self.conv1 = nn.Conv2d(3, width, kernel_size=3, padding=1, bias=False)

        # Stages 1-3: a dense stage followed by a compressing transition,
        # registered as dense1/trans1 ... dense3/trans3.
        for idx in range(3):
            setattr(self, 'dense%d' % (idx+1), self._make_dense_layers(block, width, nblocks[idx]))
            width += nblocks[idx]*growth_rate
            compressed = int(math.floor(width*reduction))
            setattr(self, 'trans%d' % (idx+1), Transition(width, compressed))
            width = compressed

        # The last stage has no transition after it.
        self.dense4 = self._make_dense_layers(block, width, nblocks[3])
        width += nblocks[3]*growth_rate

        self.bn = nn.BatchNorm2d(width)
        self.linear = nn.Linear(width, num_classes)

    def _make_dense_layers(self, block, in_planes, nblock):
        """Stack ``nblock`` blocks; each one sees ``growth_rate`` more input channels."""
        stage = []
        width = in_planes
        for _ in range(nblock):
            stage.append(block(width, self.growth_rate))
            width += self.growth_rate
        return nn.Sequential(*stage)

    def forward(self, x):
        out = self.conv1(x)
        stages = ((self.dense1, self.trans1),
                  (self.dense2, self.trans2),
                  (self.dense3, self.trans3))
        for dense, trans in stages:
            out = trans(dense(out))
        out = self.dense4(out)
        out = F.relu(self.bn(out))
        out = F.avg_pool2d(out, 4)
        flat = out.view(out.size(0), -1)
        return self.linear(flat)

def DenseNet121(Num_classes=10):
    """Build a Bottleneck DenseNet with stages [6,12,24,16] and growth rate 32."""
    stage_sizes = [6,12,24,16]
    return DenseNet(Bottleneck, stage_sizes, growth_rate=32, num_classes=Num_classes)

def DenseNet169(Num_classes=10):
    """Build a Bottleneck DenseNet with stages [6,12,32,32] and growth rate 32."""
    stage_sizes = [6,12,32,32]
    return DenseNet(Bottleneck, stage_sizes, growth_rate=32, num_classes=Num_classes)

def DenseNet201(Num_classes=10):
    """Build a Bottleneck DenseNet with stages [6,12,48,32] and growth rate 32."""
    stage_sizes = [6,12,48,32]
    return DenseNet(Bottleneck, stage_sizes, growth_rate=32, num_classes=Num_classes)

def DenseNet161(Num_classes=10):
    """Build a Bottleneck DenseNet with stages [6,12,36,24] and growth rate 48."""
    stage_sizes = [6,12,36,24]
    return DenseNet(Bottleneck, stage_sizes, growth_rate=48, num_classes=Num_classes)

def densenet_cifar(Num_classes=10):
    """Build a Bottleneck DenseNet with stages [6,12,24,16] and growth rate 12."""
    stage_sizes = [6,12,24,16]
    return DenseNet(Bottleneck, stage_sizes, growth_rate=12, num_classes=Num_classes)

def test():
    """Smoke test: push one random 3x32x32 input through ``densenet_cifar`` and print the output."""
    model = densenet_cifar()
    sample = torch.randn(1,3,32,32)
    print(model(sample))

# test()


================================================
FILE: GC_code/CIFAR100/models/dpn.py
================================================
'''Dual Path Networks in PyTorch.'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class Bottleneck(nn.Module):
    """Dual-path bottleneck: 1x1 conv, grouped 3x3 conv (32 groups), 1x1 conv.

    The first ``out_planes`` channels of the shortcut and of the conv branch
    are added; the remaining channels of both are concatenated after them.
    A projection shortcut is used only when ``first_layer`` is true.
    """
    def __init__(self, last_planes, in_planes, out_planes, dense_depth, stride, first_layer):
        super(Bottleneck, self).__init__()
        self.out_planes = out_planes
        self.dense_depth = dense_depth
        branch_planes = out_planes+dense_depth

        self.conv1 = nn.Conv2d(last_planes, in_planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv2 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=32, bias=False)
        self.bn2 = nn.BatchNorm2d(in_planes)
        self.conv3 = nn.Conv2d(in_planes, branch_planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(branch_planes)

        if first_layer:
            self.shortcut = nn.Sequential(
                nn.Conv2d(last_planes, branch_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(branch_planes)
            )
        else:
            # Empty Sequential acts as the identity.
            self.shortcut = nn.Sequential()

    def forward(self, x):
        branch = F.relu(self.bn1(self.conv1(x)))
        branch = F.relu(self.bn2(self.conv2(branch)))
        branch = self.bn3(self.conv3(branch))
        skip = self.shortcut(x)
        d = self.out_planes
        residual_part = skip[:, :d] + branch[:, :d]
        merged = torch.cat([residual_part, skip[:, d:], branch[:, d:]], 1)
        return F.relu(merged)


class DPN(nn.Module):
    """Dual Path Network built from four stages of ``Bottleneck`` blocks.

    Arguments:
        cfg (dict): per-stage 4-tuples under the keys ``'in_planes'``,
            ``'out_planes'``, ``'num_blocks'`` and ``'dense_depth'``.
        num_classes (int): size of the classifier output. Defaults to 10,
            the value that was previously hard-coded.
    """
    def __init__(self, cfg, num_classes=10):
        super(DPN, self).__init__()
        in_planes, out_planes = cfg['in_planes'], cfg['out_planes']
        num_blocks, dense_depth = cfg['num_blocks'], cfg['dense_depth']

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        # Channel count produced by the most recently built block; updated
        # by _make_layer as the stages are constructed.
        self.last_planes = 64
        self.layer1 = self._make_layer(in_planes[0], out_planes[0], num_blocks[0], dense_depth[0], stride=1)
        self.layer2 = self._make_layer(in_planes[1], out_planes[1], num_blocks[1], dense_depth[1], stride=2)
        self.layer3 = self._make_layer(in_planes[2], out_planes[2], num_blocks[2], dense_depth[2], stride=2)
        self.layer4 = self._make_layer(in_planes[3], out_planes[3], num_blocks[3], dense_depth[3], stride=2)
        # Matches last_planes after the final block of layer4.
        self.linear = nn.Linear(out_planes[3]+(num_blocks[3]+1)*dense_depth[3], num_classes)

    def _make_layer(self, in_planes, out_planes, num_blocks, dense_depth, stride):
        """Build one stage; only its first block uses ``stride`` and a projection shortcut."""
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for i, block_stride in enumerate(strides):
            layers.append(Bottleneck(self.last_planes, in_planes, out_planes, dense_depth, block_stride, i==0))
            # Every block appends dense_depth channels to the dense path.
            self.last_planes = out_planes + (i+2) * dense_depth
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def DPN26():
    """Build a DPN with two blocks in each of its four stages."""
    return DPN(dict(
        in_planes=(96,192,384,768),
        out_planes=(256,512,1024,2048),
        num_blocks=(2,2,2,2),
        dense_depth=(16,32,24,128),
    ))

def DPN92():
    """Build a DPN with (3, 4, 20, 3) blocks in its four stages."""
    return DPN(dict(
        in_planes=(96,192,384,768),
        out_planes=(256,512,1024,2048),
        num_blocks=(3,4,20,3),
        dense_depth=(16,32,24,128),
    ))


def test():
    """Smoke test: push one random 3x32x32 input through ``DPN92`` and print the output."""
    model = DPN92()
    sample = torch.randn(1,3,32,32)
    print(model(sample))

# test()


================================================
FILE: GC_code/CIFAR100/models/googlenet.py
================================================
'''GoogLeNet with PyTorch.'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class Inception(nn.Module):
    """Four parallel branches whose outputs are concatenated on the channel axis.

    Branches: 1x1 conv; 1x1 -> 3x3 conv; 1x1 -> 3x3 -> 3x3 conv (standing in
    for a 5x5); 3x3 max-pool -> 1x1 conv. Every conv is followed by
    BatchNorm and an in-place ReLU.
    """
    def __init__(self, in_planes, n1x1, n3x3red, n3x3, n5x5red, n5x5, pool_planes):
        super(Inception, self).__init__()

        def unit(c_in, c_out, **conv_kwargs):
            # Conv -> BatchNorm -> in-place ReLU, returned as a list so
            # several units can be chained inside one nn.Sequential.
            return [nn.Conv2d(c_in, c_out, **conv_kwargs), nn.BatchNorm2d(c_out), nn.ReLU(True)]

        # 1x1 conv branch
        self.b1 = nn.Sequential(*unit(in_planes, n1x1, kernel_size=1))

        # 1x1 conv -> 3x3 conv branch
        self.b2 = nn.Sequential(*(
            unit(in_planes, n3x3red, kernel_size=1)
            + unit(n3x3red, n3x3, kernel_size=3, padding=1)
        ))

        # 1x1 conv -> 5x5 conv branch (two stacked 3x3 convs)
        self.b3 = nn.Sequential(*(
            unit(in_planes, n5x5red, kernel_size=1)
            + unit(n5x5red, n5x5, kernel_size=3, padding=1)
            + unit(n5x5, n5x5, kernel_size=3, padding=1)
        ))

        # 3x3 pool -> 1x1 conv branch
        self.b4 = nn.Sequential(
            nn.MaxPool2d(3, stride=1, padding=1),
            *unit(in_planes, pool_planes, kernel_size=1)
        )

    def forward(self, x):
        branches = (self.b1, self.b2, self.b3, self.b4)
        return torch.cat([branch(x) for branch in branches], 1)


class GoogLeNet(nn.Module):
    """Stem conv, nine ``Inception`` modules with two max-pool reductions,
    8x8 average pooling and a linear classifier.

    Arguments:
        num_classes (int): size of the classifier output. Defaults to 10,
            the value that was previously hard-coded.
    """
    def __init__(self, num_classes=10):
        super(GoogLeNet, self).__init__()
        self.pre_layers = nn.Sequential(
            nn.Conv2d(3, 192, kernel_size=3, padding=1),
            nn.BatchNorm2d(192),
            nn.ReLU(True),
        )

        # The first argument of each Inception is the channel count produced
        # by the previous module (sum of its four branch widths).
        self.a3 = Inception(192,  64,  96, 128, 16, 32, 32)
        self.b3 = Inception(256, 128, 128, 192, 32, 96, 64)

        # Shared by both spatial reductions in forward().
        self.maxpool = nn.MaxPool2d(3, stride=2, padding=1)

        self.a4 = Inception(480, 192,  96, 208, 16,  48,  64)
        self.b4 = Inception(512, 160, 112, 224, 24,  64,  64)
        self.c4 = Inception(512, 128, 128, 256, 24,  64,  64)
        self.d4 = Inception(512, 112, 144, 288, 32,  64,  64)
        self.e4 = Inception(528, 256, 160, 320, 32, 128, 128)

        self.a5 = Inception(832, 256, 160, 320, 32, 128, 128)
        self.b5 = Inception(832, 384, 192, 384, 48, 128, 128)

        self.avgpool = nn.AvgPool2d(8, stride=1)
        self.linear = nn.Linear(1024, num_classes)

    def forward(self, x):
        out = self.pre_layers(x)
        out = self.a3(out)
        out = self.b3(out)
        out = self.maxpool(out)
        out = self.a4(out)
        out = self.b4(out)
        out = self.c4(out)
        out = self.d4(out)
        out = self.e4(out)
        out = self.maxpool(out)
        out = self.a5(out)
        out = self.b5(out)
        out = self.avgpool(out)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def test():
    """Smoke test: push one random 3x32x32 input through ``GoogLeNet`` and print the output size."""
    model = GoogLeNet()
    sample = torch.randn(1,3,32,32)
    print(model(sample).size())

# test()


================================================
FILE: GC_code/CIFAR100/models/lenet.py
================================================
'''LeNet in PyTorch.'''
import torch.nn as nn
import torch.nn.functional as F

class LeNet(nn.Module):
    """Two 5x5 conv + max-pool stages followed by three linear layers.

    ``fc1`` expects 16*5*5 flattened features, which is what a 3x32x32
    input produces after the two conv/pool stages.

    Arguments:
        num_classes (int): size of the classifier output. Defaults to 10,
            the value that was previously hard-coded.
    """
    def __init__(self, num_classes=10):
        super(LeNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1   = nn.Linear(16*5*5, 120)
        self.fc2   = nn.Linear(120, 84)
        self.fc3   = nn.Linear(84, num_classes)

    def forward(self, x):
        out = F.relu(self.conv1(x))
        out = F.max_pool2d(out, 2)
        out = F.relu(self.conv2(out))
        out = F.max_pool2d(out, 2)
        out = out.view(out.size(0), -1)
        out = F.relu(self.fc1(out))
        out = F.relu(self.fc2(out))
        out = self.fc3(out)
        return out


================================================
FILE: GC_code/CIFAR100/models/mobilenet.py
================================================
'''MobileNet in PyTorch.

See the paper "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications"
for more details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class Block(nn.Module):
    '''Depthwise conv + Pointwise conv'''
    def __init__(self, in_planes, out_planes, stride=1):
        super(Block, self).__init__()
        # Depthwise 3x3: one filter per input channel (groups == in_planes).
        self.conv1 = nn.Conv2d(
            in_planes, in_planes, kernel_size=3, stride=stride,
            padding=1, groups=in_planes, bias=False)
        self.bn1 = nn.BatchNorm2d(in_planes)
        # Pointwise 1x1: mixes channels and sets the output width.
        self.conv2 = nn.Conv2d(
            in_planes, out_planes, kernel_size=1, stride=1,
            padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        depthwise = F.relu(self.bn1(self.conv1(x)))
        pointwise = F.relu(self.bn2(self.conv2(depthwise)))
        return pointwise


class MobileNet(nn.Module):
    """Stem conv, a stack of depthwise-separable ``Block`` layers described
    by ``cfg``, 2x2 average pooling and a linear classifier.
    """
    # (128,2) means conv planes=128, conv stride=2, by default conv stride=1
    cfg = [64, (128,2), 128, (256,2), 256, (512,2), 512, 512, 512, 512, 512, (1024,2), 1024]

    def __init__(self, num_classes=10):
        super(MobileNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.layers = self._make_layers(in_planes=32)
        self.linear = nn.Linear(1024, num_classes)

    def _make_layers(self, in_planes):
        """Turn ``cfg`` into a Sequential of Blocks, chaining the channel widths."""
        blocks = []
        width = in_planes
        for spec in self.cfg:
            if isinstance(spec, int):
                out_planes, stride = spec, 1
            else:
                out_planes, stride = spec[0], spec[1]
            blocks.append(Block(width, out_planes, stride))
            width = out_planes
        return nn.Sequential(*blocks)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layers(out)
        out = F.avg_pool2d(out, 2)
        flat = out.view(out.size(0), -1)
        return self.linear(flat)


def test():
    """Smoke test: push one random 3x32x32 input through ``MobileNet`` and print the output size."""
    model = MobileNet()
    sample = torch.randn(1,3,32,32)
    print(model(sample).size())

# test()


================================================
FILE: GC_code/CIFAR100/models/mobilenetv2.py
================================================
'''MobileNetV2 in PyTorch.

See the paper "Inverted Residuals and Linear Bottlenecks:
Mobile Networks for Classification, Detection and Segmentation" for more details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class Block(nn.Module):
    '''expand + depthwise + pointwise'''
    def __init__(self, in_planes, out_planes, expansion, stride):
        super(Block, self).__init__()
        self.stride = stride

        hidden = expansion * in_planes
        # 1x1 expansion
        self.conv1 = nn.Conv2d(in_planes, hidden, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn1 = nn.BatchNorm2d(hidden)
        # 3x3 depthwise (groups == channels), carries the stride
        self.conv2 = nn.Conv2d(hidden, hidden, kernel_size=3, stride=stride, padding=1, groups=hidden, bias=False)
        self.bn2 = nn.BatchNorm2d(hidden)
        # 1x1 projection, no activation afterwards
        self.conv3 = nn.Conv2d(hidden, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        needs_projection = stride == 1 and in_planes != out_planes
        if needs_projection:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False),
                nn.BatchNorm2d(out_planes),
            )
        else:
            # Empty Sequential acts as the identity.
            self.shortcut = nn.Sequential()

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        # The residual connection is only used by stride-1 blocks.
        if self.stride == 1:
            out = out + self.shortcut(x)
        return out


class MobileNetV2(nn.Module):
    """Stem conv, inverted-residual ``Block`` stack described by ``cfg``,
    a 1x1 conv to 1280 channels, 4x4 average pooling and a linear classifier.
    """
    # (expansion, out_planes, num_blocks, stride)
    cfg = [(1,  16, 1, 1),
           (6,  24, 2, 1),  # NOTE: change stride 2 -> 1 for CIFAR10
           (6,  32, 3, 2),
           (6,  64, 4, 2),
           (6,  96, 3, 1),
           (6, 160, 3, 2),
           (6, 320, 1, 1)]

    def __init__(self, num_classes=10):
        super(MobileNetV2, self).__init__()
        # NOTE: change conv1 stride 2 -> 1 for CIFAR10
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.layers = self._make_layers(in_planes=32)
        self.conv2 = nn.Conv2d(320, 1280, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(1280)
        self.linear = nn.Linear(1280, num_classes)

    def _make_layers(self, in_planes):
        """Expand ``cfg`` into Blocks; only the first block of a group uses the group's stride."""
        blocks = []
        width = in_planes
        for expansion, out_planes, num_blocks, stride in self.cfg:
            for block_stride in [stride] + [1]*(num_blocks-1):
                blocks.append(Block(width, out_planes, expansion, block_stride))
                width = out_planes
        return nn.Sequential(*blocks)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layers(out)
        out = F.relu(self.bn2(self.conv2(out)))
        # NOTE: change pooling kernel_size 7 -> 4 for CIFAR10
        out = F.avg_pool2d(out, 4)
        flat = out.view(out.size(0), -1)
        return self.linear(flat)


def test():
    """Smoke test: push two random 3x32x32 inputs through ``MobileNetV2`` and print the output size."""
    model = MobileNetV2()
    sample = torch.randn(2,3,32,32)
    print(model(sample).size())

# test()


================================================
FILE: GC_code/CIFAR100/models/pnasnet.py
================================================
'''PNASNet in PyTorch.

Paper: Progressive Neural Architecture Search
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class SepConv(nn.Module):
    '''Separable Convolution.'''
    def __init__(self, in_planes, out_planes, kernel_size, stride):
        super(SepConv, self).__init__()
        # "Same"-style padding for odd kernel sizes.
        same_padding = (kernel_size-1)//2
        # One group per input channel.
        self.conv1 = nn.Conv2d(
            in_planes, out_planes,
            kernel_size=kernel_size, stride=stride,
            padding=same_padding, groups=in_planes, bias=False)
        self.bn1 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        convolved = self.conv1(x)
        return self.bn1(convolved)


class CellA(nn.Module):
    """Sum of a 7x7 separable conv branch and a 3x3 max-pool branch, then ReLU.

    With ``stride == 2`` the pooled branch additionally goes through a
    1x1 conv + BatchNorm that changes its channel count to ``out_planes``.
    """
    def __init__(self, in_planes, out_planes, stride=1):
        super(CellA, self).__init__()
        self.stride = stride
        self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride)
        downsampling = stride==2
        if downsampling:
            self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
            self.bn1 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        conv_branch = self.sep_conv1(x)
        pool_branch = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1)
        if self.stride==2:
            pool_branch = self.bn1(self.conv1(pool_branch))
        return F.relu(conv_branch+pool_branch)

class CellB(nn.Module):
    """Two-branch cell: (7x7 sep-conv + 3x3 sep-conv) and (3x3 max-pool +
    5x5 sep-conv). Each sum is passed through ReLU, the two results are
    concatenated and reduced back to ``out_planes`` by 1x1 conv + BN + ReLU.
    """
    def __init__(self, in_planes, out_planes, stride=1):
        super(CellB, self).__init__()
        self.stride = stride
        # Left branch
        self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride)
        self.sep_conv2 = SepConv(in_planes, out_planes, kernel_size=3, stride=stride)
        # Right branch
        self.sep_conv3 = SepConv(in_planes, out_planes, kernel_size=5, stride=stride)
        downsampling = stride==2
        if downsampling:
            # Channel-matching 1x1 conv for the pooled input.
            self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
            self.bn1 = nn.BatchNorm2d(out_planes)
        # Reduce channels
        self.conv2 = nn.Conv2d(2*out_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        # Left branch
        left_a = self.sep_conv1(x)
        left_b = self.sep_conv2(x)
        # Right branch
        pooled = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1)
        if self.stride==2:
            pooled = self.bn1(self.conv1(pooled))
        right_b = self.sep_conv3(x)
        # Concat & reduce channels
        left = F.relu(left_a+left_b)
        right = F.relu(pooled+right_b)
        joined = torch.cat([left, right], 1)
        return F.relu(self.bn2(self.conv2(joined)))

class PNASNet(nn.Module):
    """PNASNet classifier built from a repeated cell type.

    Layout: 3x3 conv stem -> stage of ``num_cells`` cells -> stride-2 cell
    (doubles the width) -> stage -> stride-2 cell -> stage -> 8x8 average
    pool -> linear classifier.

    Args:
        cell_type: callable ``cell_type(in_planes, out_planes, stride=...)``
            returning an ``nn.Module`` (e.g. ``CellA`` or ``CellB``).
        num_cells: number of stride-1 cells in each of the three stages.
        num_planes: channel width of the stem and of the first stage.
        num_classes: size of the classifier output (defaults to 10).
    """

    def __init__(self, cell_type, num_cells, num_planes, num_classes=10):
        super(PNASNet, self).__init__()
        # Running channel count, updated as layers are created.
        self.in_planes = num_planes
        self.cell_type = cell_type

        self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(num_planes)

        # Honour the requested depth instead of a hard-coded 6 cells per stage.
        self.layer1 = self._make_layer(num_planes, num_cells=num_cells)
        self.layer2 = self._downsample(num_planes*2)
        self.layer3 = self._make_layer(num_planes*2, num_cells=num_cells)
        self.layer4 = self._downsample(num_planes*4)
        self.layer5 = self._make_layer(num_planes*4, num_cells=num_cells)

        self.linear = nn.Linear(num_planes*4, num_classes)

    def _make_layer(self, planes, num_cells):
        """Stack ``num_cells`` stride-1 cells producing ``planes`` channels."""
        layers = []
        for _ in range(num_cells):
            layers.append(self.cell_type(self.in_planes, planes, stride=1))
            self.in_planes = planes
        return nn.Sequential(*layers)

    def _downsample(self, planes):
        """Create one stride-2 cell that changes the width to ``planes``."""
        layer = self.cell_type(self.in_planes, planes, stride=2)
        self.in_planes = planes
        return layer

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)``.

        The fixed 8x8 average pool followed by the linear layer assumes the
        final feature map is 8x8 (e.g. 32x32 inputs).
        """
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.layer5(out)
        out = F.avg_pool2d(out, 8)
        out = self.linear(out.view(out.size(0), -1))
        return out


def PNASNetA():
    """Build a PNASNet from ``CellA`` cells (num_cells=6, num_planes=44)."""
    settings = dict(num_cells=6, num_planes=44)
    return PNASNet(CellA, **settings)

def PNASNetB():
    """Build a PNASNet from ``CellB`` cells (num_cells=6, num_planes=32)."""
    settings = dict(num_cells=6, num_planes=32)
    return PNASNet(CellB, **settings)


def test():
    """Smoke test: run PNASNetB on one random 3x32x32 input and print the output."""
    model = PNASNetB()
    sample = torch.randn(1, 3, 32, 32)
    print(model(sample))

# test()


================================================
FILE: GC_code/CIFAR100/models/preact_resnet.py
================================================
'''Pre-activation ResNet in PyTorch.

Reference:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
    Identity Mappings in Deep Residual Networks. arXiv:1603.05027
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class PreActBlock(nn.Module):
    '''Basic two-conv residual block in pre-activation (BN-ReLU-conv) form.

    A 1x1 projection shortcut is registered only when the stride is not 1 or
    the channel count changes; otherwise the raw input is used as residual.
    '''
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)

        shape_preserved = stride == 1 and in_planes == out_planes
        if not shape_preserved:
            projection = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
            self.shortcut = nn.Sequential(projection)

    def forward(self, x):
        '''Return ``conv2(relu(bn2(conv1(relu(bn1(x)))))) + residual``.'''
        activated = F.relu(self.bn1(x))
        # The projection (when present) consumes the pre-activated tensor.
        if hasattr(self, 'shortcut'):
            residual = self.shortcut(activated)
        else:
            residual = x
        out = self.conv1(activated)
        out = F.relu(self.bn2(out))
        out = self.conv2(out)
        out += residual
        return out


class PreActBottleneck(nn.Module):
    '''Bottleneck residual block (1x1 -> 3x3 -> 1x1) in pre-activation form.

    The block outputs ``expansion * planes`` channels. A 1x1 projection
    shortcut is registered only when the stride is not 1 or the channel count
    changes; otherwise the raw input is used as residual.
    '''
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)

        shape_preserved = stride == 1 and in_planes == out_planes
        if not shape_preserved:
            projection = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
            self.shortcut = nn.Sequential(projection)

    def forward(self, x):
        '''Apply the three BN-ReLU-conv stages and add the residual.'''
        activated = F.relu(self.bn1(x))
        # The projection (when present) consumes the pre-activated tensor.
        if hasattr(self, 'shortcut'):
            residual = self.shortcut(activated)
        else:
            residual = x
        out = self.conv1(activated)
        out = F.relu(self.bn2(out))
        out = self.conv2(out)
        out = F.relu(self.bn3(out))
        out = self.conv3(out)
        out += residual
        return out


class PreActResNet(nn.Module):
    """Pre-activation ResNet: 3x3 conv stem, four stages, 4x4 avg-pool, linear head.

    Args:
        block: block class with an ``expansion`` attribute, constructed as
            ``block(in_planes, planes, stride)``.
        num_blocks: number of blocks in each of the four stages.
        num_classes: size of the classifier output.
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super().__init__()
        # Running channel count consumed and updated by ``_make_layer``.
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage; only its first block uses ``stride``, the rest use 1."""
        stage = []
        for block_stride in [stride] + [1] * (num_blocks - 1):
            stage.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*stage)

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)``."""
        out = self.conv1(x)
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            out = stage(out)
        pooled = F.avg_pool2d(out, 4)
        flat = pooled.view(pooled.size(0), -1)
        return self.linear(flat)


def PreActResNet18():
    """Build a PreActResNet from ``PreActBlock`` with stage depths [2, 2, 2, 2]."""
    stage_depths = [2, 2, 2, 2]
    return PreActResNet(PreActBlock, stage_depths)

def PreActResNet34():
    """Build a PreActResNet from ``PreActBlock`` with stage depths [3, 4, 6, 3]."""
    stage_depths = [3, 4, 6, 3]
    return PreActResNet(PreActBlock, stage_depths)

def PreActResNet50():
    """Build a PreActResNet from ``PreActBottleneck`` with stage depths [3, 4, 6, 3]."""
    stage_depths = [3, 4, 6, 3]
    return PreActResNet(PreActBottleneck, stage_depths)

def PreActResNet101():
    """Build a PreActResNet from ``PreActBottleneck`` with stage depths [3, 4, 23, 3]."""
    stage_depths = [3, 4, 23, 3]
    return PreActResNet(PreActBottleneck, stage_depths)

def PreActResNet152():
    """Build a PreActResNet from ``PreActBottleneck`` with stage depths [3, 8, 36, 3]."""
    stage_depths = [3, 8, 36, 3]
    return PreActResNet(PreActBottleneck, stage_depths)


def test():
    """Smoke test: run PreActResNet18 on one random 3x32x32 input and print the output size."""
    model = PreActResNet18()
    sample = torch.randn(1, 3, 32, 32)
    scores = model(sample)
    print(scores.size())

# test()


================================================
FILE: GC_code/CIFAR100/models/resnet.py
================================================
'''ResNet in PyTorch.

For Pre-activation ResNet, see 'preact_resnet.py'.

Reference:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
    Deep Residual Learning for Image Recognition. arXiv:1512.03385
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicBlock(nn.Module):
    """Two-layer 3x3 residual block (conv-BN-ReLU, conv-BN, add, ReLU).

    The shortcut is an empty ``nn.Sequential`` (identity) unless the stride is
    not 1 or the channel count changes, in which case it is a 1x1 conv + BN.
    """
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        if stride != 1 or in_planes != out_planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ``relu(bn2(conv2(relu(bn1(conv1(x))))) + shortcut(x))``."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(hidden))
        out += self.shortcut(x)
        return F.relu(out)


class Bottleneck(nn.Module):
    """Three-layer bottleneck residual block (1x1 -> 3x3 -> 1x1).

    The block outputs ``expansion * planes`` channels. The shortcut is an
    empty ``nn.Sequential`` (identity) unless the stride is not 1 or the
    channel count changes, in which case it is a 1x1 conv + BN.
    """
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        if stride != 1 or in_planes != out_planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Apply the three conv-BN stages, add the shortcut and apply ReLU."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        hidden = F.relu(self.bn2(self.conv2(hidden)))
        out = self.bn3(self.conv3(hidden))
        out += self.shortcut(x)
        return F.relu(out)


class ResNet(nn.Module):
    """ResNet: 3x3 conv + BN stem, four stages, 4x4 avg-pool, linear head.

    Args:
        block: block class with an ``expansion`` attribute, constructed as
            ``block(in_planes, planes, stride)``.
        num_blocks: number of blocks in each of the four stages.
        num_classes: size of the classifier output.
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super().__init__()
        # Running channel count consumed and updated by ``_make_layer``.
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage; only its first block uses ``stride``, the rest use 1."""
        stage = []
        for block_stride in [stride] + [1] * (num_blocks - 1):
            stage.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*stage)

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)``."""
        out = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            out = stage(out)
        pooled = F.avg_pool2d(out, 4)
        flat = pooled.view(pooled.size(0), -1)
        return self.linear(flat)


def ResNet18(Num_classes=10):
    """Build a ResNet from ``BasicBlock`` with stage depths [2, 2, 2, 2]."""
    stage_depths = [2, 2, 2, 2]
    return ResNet(BasicBlock, stage_depths, num_classes=Num_classes)

def ResNet34(Num_classes=10):
    """Build a ResNet from ``BasicBlock`` with stage depths [3, 4, 6, 3]."""
    stage_depths = [3, 4, 6, 3]
    return ResNet(BasicBlock, stage_depths, num_classes=Num_classes)

def ResNet50(Num_classes=10):
    """Build a ResNet from ``Bottleneck`` with stage depths [3, 4, 6, 3]."""
    stage_depths = [3, 4, 6, 3]
    return ResNet(Bottleneck, stage_depths, num_classes=Num_classes)

def ResNet101(Num_classes=10):
    """Build a ResNet from ``Bottleneck`` with stage depths [3, 4, 23, 3]."""
    stage_depths = [3, 4, 23, 3]
    return ResNet(Bottleneck, stage_depths, num_classes=Num_classes)

def ResNet152(Num_classes=10):
    """Build a ResNet from ``Bottleneck`` with stage depths [3, 8, 36, 3]."""
    stage_depths = [3, 8, 36, 3]
    return ResNet(Bottleneck, stage_depths, num_classes=Num_classes)


def test():
    """Smoke test: run ResNet18 on one random 3x32x32 input and print the output size."""
    model = ResNet18()
    sample = torch.randn(1, 3, 32, 32)
    scores = model(sample)
    print(scores.size())

# test()


================================================
FILE: GC_code/CIFAR100/models/resnext.py
================================================
'''ResNeXt in PyTorch.

See the paper "Aggregated Residual Transformations for Deep Neural Networks" for more details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class Block(nn.Module):
    '''Residual block whose 3x3 convolution is grouped.

    The inner width is ``cardinality * bottleneck_width`` and the block
    outputs ``expansion`` times that many channels. The shortcut is an empty
    ``nn.Sequential`` (identity) unless the stride is not 1 or the channel
    count changes, in which case it is a 1x1 conv + BN.
    '''
    expansion = 2

    def __init__(self, in_planes, cardinality=32, bottleneck_width=4, stride=1):
        super().__init__()
        group_width = cardinality * bottleneck_width
        out_planes = self.expansion * group_width
        self.conv1 = nn.Conv2d(in_planes, group_width, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(group_width)
        self.conv2 = nn.Conv2d(group_width, group_width, kernel_size=3, stride=stride, padding=1, groups=cardinality, bias=False)
        self.bn2 = nn.BatchNorm2d(group_width)
        self.conv3 = nn.Conv2d(group_width, out_planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        if stride != 1 or in_planes != out_planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        '''Apply the three conv-BN stages, add the shortcut and apply ReLU.'''
        hidden = F.relu(self.bn1(self.conv1(x)))
        hidden = F.relu(self.bn2(self.conv2(hidden)))
        out = self.bn3(self.conv3(hidden))
        out += self.shortcut(x)
        return F.relu(out)


class ResNeXt(nn.Module):
    """ResNeXt: 1x1 conv + BN stem, three stages of ``Block``, 8x8 avg-pool, linear head.

    Args:
        num_blocks: number of blocks in each stage (the first three entries are used).
        cardinality: number of groups in each block's grouped convolution.
        bottleneck_width: per-group width of the first stage; it is doubled
            after every stage is built.
        num_classes: size of the classifier output.
    """

    def __init__(self, num_blocks, cardinality, bottleneck_width, num_classes=10):
        super().__init__()
        self.cardinality = cardinality
        self.bottleneck_width = bottleneck_width
        # Running channel count consumed and updated by ``_make_layer``.
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(num_blocks[0], 1)
        self.layer2 = self._make_layer(num_blocks[1], 2)
        self.layer3 = self._make_layer(num_blocks[2], 2)
        # self.layer4 = self._make_layer(num_blocks[3], 2)
        self.linear = nn.Linear(cardinality * bottleneck_width * 8, num_classes)

    def _make_layer(self, num_blocks, stride):
        """Build one stage; only its first block uses ``stride``, the rest use 1."""
        stage = []
        for block_stride in [stride] + [1] * (num_blocks - 1):
            stage.append(Block(self.in_planes, self.cardinality, self.bottleneck_width, block_stride))
            self.in_planes = Block.expansion * self.cardinality * self.bottleneck_width
        # Increase bottleneck_width by 2 after each stage.
        self.bottleneck_width *= 2
        return nn.Sequential(*stage)

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)``."""
        out = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3):
            out = stage(out)
        # out = self.layer4(out)
        pooled = F.avg_pool2d(out, 8)
        flat = pooled.view(pooled.size(0), -1)
        return self.linear(flat)


def ResNeXt29_2x64d(Num_classes=10):
    """Build a ResNeXt with 3 blocks per stage, cardinality 2 and bottleneck width 64."""
    options = dict(num_blocks=[3, 3, 3], cardinality=2, bottleneck_width=64)
    return ResNeXt(num_classes=Num_classes, **options)

def ResNeXt29_4x64d(Num_classes=10):
    """Build a ResNeXt with 3 blocks per stage, cardinality 4 and bottleneck width 64."""
    options = dict(num_blocks=[3, 3, 3], cardinality=4, bottleneck_width=64)
    return ResNeXt(num_classes=Num_classes, **options)

def ResNeXt29_8x64d(Num_classes=10):
    """Build a ResNeXt with 3 blocks per stage, cardinality 8 and bottleneck width 64."""
    options = dict(num_blocks=[3, 3, 3], cardinality=8, bottleneck_width=64)
    return ResNeXt(num_classes=Num_classes, **options)

def ResNeXt29_32x4d(Num_classes=10):
    """Build a ResNeXt with 3 blocks per stage, cardinality 32 and bottleneck width 4."""
    options = dict(num_blocks=[3, 3, 3], cardinality=32, bottleneck_width=4)
    return ResNeXt(num_classes=Num_classes, **options)

def test_resnext():
    """Smoke test: run ResNeXt29_2x64d on one random 3x32x32 input and print the output size."""
    model = ResNeXt29_2x64d()
    sample = torch.randn(1, 3, 32, 32)
    scores = model(sample)
    print(scores.size())

# test_resnext()


================================================
FILE: GC_code/CIFAR100/models/senet.py
================================================
'''SENet in PyTorch.

SENet is the winner of ImageNet-2017. See the paper "Squeeze-and-Excitation Networks" (arXiv:1709.01507) for more details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicBlock(nn.Module):
    """Two-layer 3x3 residual block with a squeeze-and-excitation gate.

    Args:
        in_planes: number of input channels.
        planes: number of output channels; the SE bottleneck has
            ``planes // 16`` channels.
        stride: stride of the first convolution (and of the projection
            shortcut, which is created when the stride is not 1 or the
            channel count changes).
    """

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes)
            )

        # SE layers
        self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1)  # Use nn.Conv2d instead of nn.Linear
        self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1)

    def forward(self, x):
        """Return ``relu(gate * bn2(conv2(relu(bn1(conv1(x))))) + shortcut(x))``."""
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # Squeeze: average-pool with a kernel equal to the feature-map height.
        w = F.avg_pool2d(out, out.size(2))
        w = F.relu(self.fc1(w))
        # torch.sigmoid replaces the deprecated F.sigmoid (same result).
        w = torch.sigmoid(self.fc2(w))
        # Excitation: per-channel gate broadcast over the spatial dims.
        out = out * w

        out += self.shortcut(x)
        out = F.relu(out)
        return out


class PreActBlock(nn.Module):
    """Pre-activation two-layer residual block with a squeeze-and-excitation gate.

    Args:
        in_planes: number of input channels.
        planes: number of output channels; the SE bottleneck has
            ``planes // 16`` channels.
        stride: stride of the first convolution. A 1x1 projection shortcut is
            registered only when the stride is not 1 or the channel count
            changes; otherwise the raw input is the residual.
    """

    def __init__(self, in_planes, planes, stride=1):
        super(PreActBlock, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)

        if stride != 1 or in_planes != planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False)
            )

        # SE layers
        self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1)
        self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1)

    def forward(self, x):
        """Return the SE-gated residual branch plus the shortcut (no final ReLU)."""
        out = F.relu(self.bn1(x))
        # The projection (when present) consumes the pre-activated tensor.
        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
        out = self.conv1(out)
        out = self.conv2(F.relu(self.bn2(out)))

        # Squeeze: average-pool with a kernel equal to the feature-map height.
        w = F.avg_pool2d(out, out.size(2))
        w = F.relu(self.fc1(w))
        # torch.sigmoid replaces the deprecated F.sigmoid (same result).
        w = torch.sigmoid(self.fc2(w))
        # Excitation: per-channel gate broadcast over the spatial dims.
        out = out * w

        out += shortcut
        return out


class SENet(nn.Module):
    """SENet: 3x3 conv + BN stem, four stages, 4x4 avg-pool, linear head.

    Args:
        block: block class constructed as ``block(in_planes, planes, stride)``.
        num_blocks: number of blocks in each of the four stages.
        num_classes: size of the classifier output.
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super().__init__()
        # Running channel count consumed and updated by ``_make_layer``.
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage; only its first block uses ``stride``, the rest use 1."""
        stage = []
        for block_stride in [stride] + [1] * (num_blocks - 1):
            stage.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes
        return nn.Sequential(*stage)

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)``."""
        out = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            out = stage(out)
        pooled = F.avg_pool2d(out, 4)
        flat = pooled.view(pooled.size(0), -1)
        return self.linear(flat)


def SENet18():
    """Build an SENet from the pre-activation SE block with stage depths [2, 2, 2, 2]."""
    stage_depths = [2, 2, 2, 2]
    return SENet(PreActBlock, stage_depths)


def test():
    """Smoke test: run SENet18 on one random 3x32x32 input and print the output size."""
    model = SENet18()
    sample = torch.randn(1, 3, 32, 32)
    scores = model(sample)
    print(scores.size())

# test()


================================================
FILE: GC_code/CIFAR100/models/shufflenet.py
================================================
'''ShuffleNet in PyTorch.

See the paper "ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices" for more details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class ShuffleBlock(nn.Module):
    """Channel-shuffle layer that interleaves the channels of ``groups`` groups.

    Args:
        groups: number of channel groups; the channel count of the input must
            be divisible by it.
    """

    def __init__(self, groups):
        super(ShuffleBlock, self).__init__()
        self.groups = groups

    def forward(self, x):
        '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,W] -> [N,C,H,W]'''
        N,C,H,W = x.size()
        g = self.groups
        # Floor division: ``view`` needs integer sizes, and ``/`` yields a float in Python 3.
        return x.view(N,g,C//g,H,W).permute(0,2,1,3,4).contiguous().view(N,C,H,W)


class Bottleneck(nn.Module):
    """ShuffleNet unit: grouped 1x1 conv, channel shuffle, depthwise 3x3, grouped 1x1 conv.

    Args:
        in_planes: number of input channels.
        out_planes: number of channels produced by the residual branch; the
            inner width is ``out_planes // 4``.
        stride: stride of the depthwise convolution. With stride 2 the
            shortcut is a 3x3 average pool and is concatenated with the
            branch output; otherwise the input is added to it.
        groups: number of groups for the 1x1 convolutions. The first 1x1
            convolution is ungrouped when ``in_planes == 24``.
    """

    def __init__(self, in_planes, out_planes, stride, groups):
        super(Bottleneck, self).__init__()
        self.stride = stride

        # Floor division: channel counts must be ints, and ``/`` yields a float in Python 3.
        mid_planes = out_planes//4
        g = 1 if in_planes==24 else groups
        self.conv1 = nn.Conv2d(in_planes, mid_planes, kernel_size=1, groups=g, bias=False)
        self.bn1 = nn.BatchNorm2d(mid_planes)
        self.shuffle1 = ShuffleBlock(groups=g)
        # groups == channels makes this a depthwise convolution.
        self.conv2 = nn.Conv2d(mid_planes, mid_planes, kernel_size=3, stride=stride, padding=1, groups=mid_planes, bias=False)
        self.bn2 = nn.BatchNorm2d(mid_planes)
        self.conv3 = nn.Conv2d(mid_planes, out_planes, kernel_size=1, groups=groups, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        self.shortcut = nn.Sequential()
        if stride == 2:
            self.shortcut = nn.Sequential(nn.AvgPool2d(3, stride=2, padding=1))

    def forward(self, x):
        """Return ``relu(cat(branch, shortcut))`` for stride 2, else ``relu(branch + shortcut)``."""
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.shuffle1(out)
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        res = self.shortcut(x)
        out = F.relu(torch.cat([out,res], 1)) if self.stride==2 else F.relu(out+res)
        return out


class ShuffleNet(nn.Module):
    """ShuffleNet: 1x1 conv + BN stem (24 channels), three stages, 4x4 avg-pool, linear head.

    Args:
        cfg: dict with the keys
            ``'out_planes'`` (output width of each of the three stages),
            ``'num_blocks'`` (number of units in each stage),
            ``'groups'`` (group count passed to every unit) and, optionally,
            ``'num_classes'`` (classifier size, defaults to 10).
    """

    def __init__(self, cfg):
        super(ShuffleNet, self).__init__()
        out_planes = cfg['out_planes']
        num_blocks = cfg['num_blocks']
        groups = cfg['groups']
        # Optional key so existing configs keep the original 10-way classifier.
        num_classes = cfg.get('num_classes', 10)

        self.conv1 = nn.Conv2d(3, 24, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(24)
        # Running channel count consumed and updated by ``_make_layer``.
        self.in_planes = 24
        self.layer1 = self._make_layer(out_planes[0], num_blocks[0], groups)
        self.layer2 = self._make_layer(out_planes[1], num_blocks[1], groups)
        self.layer3 = self._make_layer(out_planes[2], num_blocks[2], groups)
        self.linear = nn.Linear(out_planes[2], num_classes)

    def _make_layer(self, out_planes, num_blocks, groups):
        """Build one stage whose first unit has stride 2 and the rest stride 1.

        The stride-2 unit concatenates its input to its branch output, so its
        branch only produces ``out_planes - in_planes`` channels.
        """
        layers = []
        for i in range(num_blocks):
            stride = 2 if i == 0 else 1
            cat_planes = self.in_planes if i == 0 else 0
            layers.append(Bottleneck(self.in_planes, out_planes-cat_planes, stride=stride, groups=groups))
            self.in_planes = out_planes
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)``."""
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def ShuffleNetG2():
    """Build a ShuffleNet with 2 groups, stage widths 200/400/800 and 4/8/4 units."""
    settings = dict(
        out_planes=[200, 400, 800],
        num_blocks=[4, 8, 4],
        groups=2,
    )
    return ShuffleNet(settings)

def ShuffleNetG3():
    """Build a ShuffleNet with 3 groups, stage widths 240/480/960 and 4/8/4 units."""
    settings = dict(
        out_planes=[240, 480, 960],
        num_blocks=[4, 8, 4],
        groups=3,
    )
    return ShuffleNet(settings)


def test():
    """Smoke test: run ShuffleNetG2 on one random 3x32x32 input and print the output."""
    model = ShuffleNetG2()
    sample = torch.randn(1, 3, 32, 32)
    print(model(sample))

# test()


================================================
FILE: GC_code/CIFAR100/models/vgg.py
================================================
'''VGG11/13/16/19 in Pytorch.'''
import torch
import torch.nn as nn


# Layer layouts for each VGG variant, consumed by VGG._make_layers:
#   - an int adds a 3x3 convolution with that many output channels,
#     followed by BatchNorm and ReLU;
#   - 'M' adds a 2x2 max-pool with stride 2.
cfg = {
    'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}


class VGG(nn.Module):
    """VGG feature extractor followed by a single linear classifier.

    Args:
        vgg_name: key into the module-level ``cfg`` table selecting the layout.
        Num_classes: size of the classifier output.
    """

    def __init__(self, vgg_name,Num_classes=100):
        super().__init__()
        layout = cfg[vgg_name]
        self.features = self._make_layers(layout)
        self.classifier = nn.Linear(512, Num_classes)

    def forward(self, x):
        """Return class scores; the flattened features must have 512 elements per sample."""
        features = self.features(x)
        flat = features.view(features.size(0), -1)
        return self.classifier(flat)

    def _make_layers(self, cfg):
        """Translate a layout list into an ``nn.Sequential``.

        ``'M'`` becomes a 2x2 max-pool with stride 2; any other entry is the
        output width of a 3x3 conv + BatchNorm + ReLU triple. A 1x1 average
        pool with stride 1 is appended at the end.
        """
        modules = []
        channels = 3
        for entry in cfg:
            if entry == 'M':
                modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
                continue
            modules.extend([
                nn.Conv2d(channels, entry, kernel_size=3, padding=1),
                nn.BatchNorm2d(entry),
                nn.ReLU(inplace=True),
            ])
            channels = entry
        modules.append(nn.AvgPool2d(kernel_size=1, stride=1))
        return nn.Sequential(*modules)


def test():
    """Smoke test: run VGG11 on two random 3x32x32 inputs and print the output size."""
    model = VGG('VGG11')
    batch = torch.randn(2, 3, 32, 32)
    scores = model(batch)
    print(scores.size())

# test()


================================================
FILE: GC_code/CIFAR100/os_run.py
================================================

import os,time

# CIFAR-100 runs of main.py with --model r50: first --alg sgd, then --alg sgdGC.
# os.system waits for each command to finish, so the two runs execute one
# after the other. Each run's stdout is redirected to a log file under
# logout/; the shell redirection does not create that directory.

os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd --epochs 200  --model r50 > logout/r50_lr11_wd45_sgd.log ")

os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC --epochs 200  --model r50 > logout/r50_lr11_wd45_sgdGC.log ")


================================================
FILE: GC_code/Fine-grained_classification/SGD.py
================================================
import torch
from torch.optim.optimizer import Optimizer, required


class SGD_GCC(Optimizer):
    """SGD (optional momentum / Nesterov) with gradient centralization on >3-D gradients.

    Gradient centralization (GC) subtracts, from every slice along dim 0 of a
    gradient, that slice's mean over all remaining dims. This class applies
    it only to gradients with more than 3 dimensions (conv weights).

    L2 weight decay is added to the gradient before centralization. Note that
    both the weight-decay term and the centralization are written into
    ``p.grad`` in place.

    Args:
        params: iterable of parameters or dicts defining parameter groups.
        lr: learning rate (required, must be non-negative).
        momentum: momentum factor (must be non-negative).
        dampening: dampening applied to the gradient from the second step on.
        weight_decay: L2 penalty coefficient (must be non-negative).
        nesterov: enable Nesterov momentum; requires ``momentum > 0`` and
            ``dampening == 0``.

    Raises:
        ValueError: on a negative ``lr``, ``momentum`` or ``weight_decay``, or
            an invalid Nesterov configuration.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGD_GCC, self).__setstate__(state)
        # States restored without a 'nesterov' entry default to plain momentum.
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` without a closure.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                if weight_decay != 0:
                    # Keyword ``alpha`` replaces the deprecated add_(Number, Tensor) overload.
                    d_p.add_(p.data, alpha=weight_decay)

                #GC operation for Conv layers
                if d_p.dim() > 3:
                    d_p.sub_(d_p.mean(dim=tuple(range(1, d_p.dim())), keepdim=True))

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: the buffer starts as a copy of the gradient.
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])

        return loss

class SGD_GC(Optimizer):
    """SGD (optional momentum / Nesterov) with gradient centralization on >1-D gradients.

    Gradient centralization (GC) subtracts, from every slice along dim 0 of a
    gradient, that slice's mean over all remaining dims. This class applies
    it to every gradient with more than 1 dimension (conv and FC weights).

    L2 weight decay is added to the gradient before centralization. Note that
    both the weight-decay term and the centralization are written into
    ``p.grad`` in place.

    Args:
        params: iterable of parameters or dicts defining parameter groups.
        lr: learning rate (required, must be non-negative).
        momentum: momentum factor (must be non-negative).
        dampening: dampening applied to the gradient from the second step on.
        weight_decay: L2 penalty coefficient (must be non-negative).
        nesterov: enable Nesterov momentum; requires ``momentum > 0`` and
            ``dampening == 0``.

    Raises:
        ValueError: on a negative ``lr``, ``momentum`` or ``weight_decay``, or
            an invalid Nesterov configuration.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGD_GC, self).__setstate__(state)
        # States restored without a 'nesterov' entry default to plain momentum.
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` without a closure.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                if weight_decay != 0:
                    # Keyword ``alpha`` replaces the deprecated add_(Number, Tensor) overload.
                    d_p.add_(p.data, alpha=weight_decay)

                #GC operation for Conv layers and FC layers
                if d_p.dim() > 1:
                    d_p.sub_(d_p.mean(dim=tuple(range(1, d_p.dim())), keepdim=True))

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: the buffer starts as a copy of the gradient.
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])

        return loss


class SGDW(Optimizer):
    """SGD (optional momentum / Nesterov) with decoupled weight decay.

    Weight decay is not added to the gradient. After the gradient/momentum
    update, ``lr * weight_decay`` times the parameter value from before the
    update is subtracted from the parameter.

    Args:
        params: iterable of parameters or dicts defining parameter groups.
        lr: learning rate (required, must be non-negative).
        momentum: momentum factor (must be non-negative).
        dampening: dampening applied to the gradient from the second step on.
        weight_decay: decoupled decay coefficient (must be non-negative).
        nesterov: enable Nesterov momentum; requires ``momentum > 0`` and
            ``dampening == 0``.

    Raises:
        ValueError: on a negative ``lr``, ``momentum`` or ``weight_decay``, or
            an invalid Nesterov configuration.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGDW, self).__setstate__(state)
        # States restored without a 'nesterov' entry default to plain momentum.
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` without a closure.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # Snapshot of the weights before the update, used by the decoupled decay.
                old = torch.clone(p.data).detach()
                #if weight_decay != 0:
                #    d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: the buffer starts from zero, so it equals the gradient.
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        # Keyword ``alpha`` replaces the deprecated add_(Number, Tensor) overload.
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])

                if weight_decay != 0:
                    p.data.add_(old, alpha=-weight_decay*group['lr'])

        return loss


class SGDW_GCC(Optimizer):
    """SGD with decoupled weight decay plus Gradient Centralization (GC).

    GC is applied only to gradients with more than 3 dimensions (conv-layer
    weights): the mean over every dimension except the first is subtracted
    from the gradient before the momentum update. Weight decay is decoupled:
    it is applied to the parameter after the gradient step, using the
    parameter's value from before the step.

    Arguments:
        params (iterable): parameters to optimize or dicts defining
            parameter groups.
        lr (float): learning rate (required).
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): decoupled weight-decay coefficient
            (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).

    Raises:
        ValueError: for a negative ``lr``, ``momentum`` or ``weight_decay``,
            or when ``nesterov`` is requested without a positive momentum
            and zero dampening.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restore state; groups lacking ``'nesterov'`` get it set to ``False``."""
        super(SGDW_GCC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Note that the centralization is done in place, so ``p.grad`` of
        conv-layer weights holds the centralized gradient afterwards.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure
            was supplied.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            lr = group['lr']
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # Snapshot the pre-update weights only when the decay term is
                # actually going to be applied.
                old = torch.clone(p.data).detach() if weight_decay != 0 else None

                # GC operation for Conv layers: remove the mean taken over
                # all dimensions except the first.
                if d_p.dim() > 3:
                    d_p.sub_(d_p.mean(dim=tuple(range(1, d_p.dim())), keepdim=True))

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: buffer starts as a copy of the gradient.
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        # Keyword ``alpha`` replaces the deprecated
                        # ``add_(scalar, tensor)`` overload.
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-lr)

                if weight_decay != 0:
                    p.data.add_(old, alpha=-weight_decay * lr)

        return loss


================================================
FILE: GC_code/Fine-grained_classification/main.py
================================================
import argparse
import os
import random
import shutil
import time
import warnings
import sys

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models

from torch.optim import lr_scheduler

from SGD import SGD_GC #import SGD with GC


# Names of every lowercase, non-dunder callable exported by torchvision.models;
# used as the accepted values of --arch.
model_names = sorted(name for name in models.__dict__
    if name.islower() and not name.startswith("__")
    and callable(models.__dict__[name]))

parser = argparse.ArgumentParser(
    description='PyTorch fine-grained classification training (SGD vs. SGD + GC)')

parser.add_argument('-b', '--batch-size', default=256, type=int,
                    metavar='N',
                    help='mini-batch size (default: 256), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')

parser.add_argument('--lr', '--learning-rate', default=0.1*128/128, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')

parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50',
                    choices=model_names,
                    help='model architecture: ' +
                        ' | '.join(model_names) +
                        ' (default: resnet50)')

parser.add_argument('data', metavar='DIR',
                    help='path to dataset')

parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                    help='number of data loading workers (default: 4)')
parser.add_argument('--epochs', default=100, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')

parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)',
                    dest='weight_decay')
parser.add_argument('-p', '--print-freq', default=100, type=int,
                    metavar='N', help='print frequency (default: 100)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
                    help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')
# The values below are the only ones main_worker knows how to handle, so
# reject anything else at parse time instead of failing later.
parser.add_argument('--model', default='r50p', type=str,
                    choices=['r18', 'r18p', 'r50', 'r50p'],
                    help='backbone: r18 | r50 (random init) or r18p | r50p '
                         '(pre-trained weights) (default: r50p)')

parser.add_argument('--path', default='test', type=str,
                    help='tag used to name saved results (default: test)')
parser.add_argument('--alg', default='sgd', type=str,
                    choices=['sgd', 'sgdGC'],
                    help='optimizer: sgd | sgdGC (default: sgd)')

parser.add_argument('--dataset', default='cub', type=str,
                    choices=['cub', 'cars', 'dogs', 'fgvc'],
                    help='dataset key selecting the number of classes: '
                         'cub | cars | dogs | fgvc (default: cub)')

# Best top-1 validation accuracy seen so far; updated by main_worker.
best_acc1 = 0

def main():
    """Parse the command line and launch training in one or many processes.

    Seeds the RNGs when ``--seed`` is given, derives ``args.distributed`` and
    ``args.world_size``, then either spawns one ``main_worker`` per GPU
    (``--multiprocessing-distributed``) or calls ``main_worker`` directly.
    """
    args = parser.parse_args()
    # Default to the first four GPUs, but respect a device mask that the
    # user or a job scheduler has already placed in the environment.
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0,1,2,3")

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)


def main_worker(gpu, ngpus_per_node, args):
    """Build model, optimizers and data loaders, then train and/or evaluate.

    Two optimizers are used: one over the backbone parameters and one over
    the freshly created classification head (``fc``). Both follow the same
    MultiStepLR schedule (x0.1 at epochs 50 and 80).

    Arguments:
        gpu (int or None): GPU index for this process; stored in ``args.gpu``.
        ngpus_per_node (int): number of GPUs on this node; used to derive the
            global rank and to split batch size / workers per process.
        args (argparse.Namespace): parsed options. Mutated in place
            (``gpu``, ``rank``, ``batch_size``, ``workers``, ``start_epoch``).

    Raises:
        ValueError: if ``args.dataset``, ``args.model`` or ``args.alg`` is not
            one of the supported values.
    """
    global best_acc1
    args.gpu = gpu

    class_num = {'cub': 200, 'cars': 196, 'dogs': 120, 'fgvc': 100}
    # --model key -> (torchvision constructor, load pre-trained weights?)
    backbones = {
        'r18p': (models.resnet18, True),
        'r18': (models.resnet18, False),
        'r50p': (models.resnet50, True),
        'r50': (models.resnet50, False),
    }
    optimizer_classes = {'sgd': torch.optim.SGD, 'sgdGC': SGD_GC}

    # Fail fast with a clear message instead of a NameError/KeyError later on.
    if args.dataset not in class_num:
        raise ValueError("Unsupported --dataset '{}' (expected one of {})"
                         .format(args.dataset, sorted(class_num)))
    if args.model not in backbones:
        raise ValueError("Unsupported --model '{}' (expected one of {})"
                         .format(args.model, sorted(backbones)))
    if args.alg not in optimizer_classes:
        raise ValueError("Unsupported --alg '{}' (expected one of {})"
                         .format(args.alg, sorted(optimizer_classes)))

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    # create model
    build_backbone, use_pretrained = backbones[args.model]
    model = build_backbone(pretrained=True) if use_pretrained else build_backbone()
    # Size the new head from the backbone itself: resnet18 feeds 512 features
    # into fc, resnet50 feeds 2048, so a hard-coded 2048 breaks resnet18.
    model.fc = nn.Linear(in_features=model.fc.in_features,
                         out_features=class_num[args.dataset], bias=True)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model)
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # DataParallel will divide and allocate batch_size to all available
        # GPUs. The model built above is always a ResNet (no ``features``
        # sub-module), so the whole network is wrapped regardless of --arch;
        # the optimizer setup below relies on ``model.module``.
        model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    # Split parameters into the new classification head and the backbone.
    head_params = list(model.module.fc.parameters())
    new_param_ids = set(map(id, head_params))
    base_params = [p for p in model.parameters() if
            id(p) not in new_param_ids]
    # NOTE(review): 'lr_mult' is stored in the param groups but nothing in
    # this function reads it, so both groups train with args.lr. Confirm
    # whether a 0.1x backbone learning rate was intended.
    param_groups_base = [{'params': base_params, 'lr_mult': 0.1}]
    param_groups_new = [{'params': head_params, 'lr_mult': 1.0}]

    # choose optimizer
    optimizer_cls = optimizer_classes[args.alg]
    optimizer_base = optimizer_cls(param_groups_base, args.lr,
                                   momentum=args.momentum, weight_decay=args.weight_decay)
    optimizer_new = optimizer_cls(param_groups_new, args.lr,
                                  momentum=args.momentum, weight_decay=args.weight_decay)

    exp_lr_scheduler_new = lr_scheduler.MultiStepLR(optimizer_new, milestones=[50,80], gamma=0.1)
    exp_lr_scheduler_base = lr_scheduler.MultiStepLR(optimizer_base, milestones=[50,80], gamma=0.1)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None and torch.is_tensor(best_acc1):
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            # Optimizer / scheduler state is optional so that checkpoints
            # written without it (model weights only) can still be resumed.
            restorable = (
                ('optimizer_base', optimizer_base),
                ('optimizer_new', optimizer_new),
                ('scheduler_base', exp_lr_scheduler_base),
                ('scheduler_new', exp_lr_scheduler_new),
            )
            for key, target in restorable:
                if key in checkpoint:
                    target.load_state_dict(checkpoint[key])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
                    transforms.Resize(512),
                    transforms.RandomHorizontalFlip(),
                    transforms.CenterCrop(448),
                    transforms.ToTensor(),
                    normalize,
                ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler,drop_last=True)

    # NOTE(review): drop_last=True on the validation loader discards the final
    # partial batch, so accuracy is measured on a subset of the validation
    # set. Kept as-is to stay comparable with earlier runs.
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
                    transforms.Resize(512),
                    transforms.CenterCrop(448),
                    transforms.ToTensor(),
                    normalize,
                ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True,drop_last=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer_base, optimizer_new,epoch, args)
        exp_lr_scheduler_new.step()
        exp_lr_scheduler_base.step()
        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        # Only one process per node writes the checkpoint.
        if not args.multiprocessing_distributed or args.rank % ngpus_per_node == 0:
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer_base': optimizer_base.state_dict(),
                'optimizer_new': optimizer_new.state_dict(),
                'scheduler_base': exp_lr_scheduler_base.state_dict(),
                'scheduler_new': exp_lr_scheduler_new.state_dict(),
            }, is_best)
        #torch.save(model.module, './result_model/'+args.path+'.pth')


# train
def train(train_loader, model, criterion, optimizer_base, optimizer_new, epoch, args):
    """Train ``model`` for one epoch and print the average top-1/top-5 values.

    Arguments:
        train_loader: iterable yielding ``(images, target)`` batches.
        model: network to train; switched to train mode here.
        criterion: loss function called as ``criterion(output, target)``.
        optimizer_base: optimizer for the backbone parameters.
        optimizer_new: optimizer for the new classification head.
        epoch (int): epoch index, used only for the progress message.
        args: parsed options (not read by this function).
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()
    print('\nEpoch: %d' % epoch)
    end = time.time()
    for images, target in train_loader:
        # measure data loading time
        data_time.update(time.time() - end)

        images, target = images.to('cuda'), target.to('cuda')

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        batch_size = images.size(0)
        losses.update(loss.item(), batch_size)
        top1.update(acc1[0], batch_size)
        top5.update(acc5[0], batch_size)

        # compute gradient and do SGD step; both optimizers share the one
        # backward pass and each updates its own parameter subset
        optimizer_new.zero_grad()
        optimizer_base.zero_grad()
        loss.backward()
        optimizer_new.step()
        optimizer_base.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
    print('Training: Top1: {top1.avg:.4f}|Top5:{top5.avg:.4f}'.format(top1=top1, top5=top5))

# test
def validate(val_loader, model, criterion, args):
    """Evaluate ``model`` on ``val_loader`` and return the average top-1 value.

    Arguments:
        val_loader: iterable yielding ``(images, target)`` batches.
        model: network to evaluate; switched to eval mode here.
        criterion: loss function called as ``criterion(output, target)``.
        args: parsed options; ``args.gpu`` selects the device batches are
            moved to (images are moved only when it is not ``None``).

    Returns:
        ``top1.avg``: the sample-weighted mean of the top-1 values produced
        by ``accuracy`` over all batches.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for images, target in val_loader:
            if args.gpu is not None:
                images = images.cuda(args.gpu, non_blocking=True)
            target = target.cuda(args.gpu, non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            batch_size = images.size(0)
            losses.update(loss.item(), batch_size)
            top1.update(acc1[0], batch_size)
            top5.update(acc5[0], batch_size)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

        print('Testing: Top1: {top1.avg:.4f}|Top5:{top5.avg:.4f}'.format(top1=top1, top5=top5))
    return top1.avg


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Write ``state`` to ``filename`` and keep a copy of the best one.

    When ``is_best`` is truthy the file just written is also copied to
    ``model_best.pth.tar`` in the current working directory.
    """
    torch.save(state, filename)
    if not is_best:
        return
    shutil.copyfile(filename, 'model_best.pth.tar')


class AverageMeter(object):
    """Keeps the latest value together with a count-weighted running mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set the latest value, mean, weighted sum and count back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def adjust_learning_rate(optimizer, epoch, args):
    """Step-decay schedule: ``args.lr`` scaled by 0.1 once per 30 full epochs.

    The resulting rate is written into the ``'lr'`` entry of every parameter
    group of ``optimizer``.
    """
    decayed_lr = args.lr * (0.1 ** (epoch // 30))
    for group in optimizer.param_groups:
        group.update(lr=decayed_lr)


def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k.

    Arguments:
        output (Tensor): per-class scores, one row per sample.
        target (Tensor): ground-truth class index for each sample.
        topk (tuple of int): the k values to evaluate.

    Returns:
        list of Tensor: one single-element tensor per entry of ``topk``
        holding the fraction (not percentage) of samples whose target is
        among the k highest-scoring classes.
    """
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        # Indices of the maxk best classes per sample, best first; transposed
        # so that row i holds every sample's i-th guess.
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # reshape() rather than view(): ``correct`` can inherit the
            # non-contiguous layout of the transposed ``pred``, and view()
            # raises on such a slice when k > 1.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(1.0 / batch_size))
        return res


# Script entry point: start training only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    main()


================================================
FILE: GC_code/Fine-grained_classification/os_run.py
================================================

import os,time


def main():
    """Run the SGD baseline and SGD+GC back to back on every benchmark.

    For each dataset the two runs are executed sequentially through the
    shell, with stdout redirected to ``logout/<Prefix>_r50p_<alg>_b128_g4.log``.
    """
    # (dataset directory, --dataset key, log-file prefix)
    benchmarks = [
        ("/home/yonghw/data/data/CUB_200_2011/", "cub", "Cub"),
        ("/home/yonghw/data/data/Car196/", "cars", "Car"),
        ("/home/yonghw/data/data/fgvc_aricraft/", "fgvc", "Ari"),
        ("/home/yonghw/data/data/StanfordDogs/", "dogs", "Dog"),
    ]
    log_dir = "logout"
    # The shell redirect fails, and the run never starts, if the log
    # directory does not exist yet.
    os.makedirs(log_dir, exist_ok=True)

    for data_dir, dataset, prefix in benchmarks:
        for alg in ("sgd", "sgdGC"):
            log_file = "{}/{}_r50p_{}_b128_g4.log".format(log_dir, prefix, alg)
            command = (
                "nohup python -W ignore main.py {} --model r50p -b 128 "
                "--alg {} --dataset {} > {} "
            ).format(data_dir, alg, dataset, log_file)
            status = os.system(command)
            if status != 0:
                # Keep going with the remaining runs, but make the failure visible.
                print("command failed with status {}: {}".format(status, command))


if __name__ == "__main__":
    main()


================================================
FILE: GC_code/ImageNet/SGD.py
================================================
import torch
from torch.optim.optimizer import Optimizer, required


class SGD_GCC(Optimizer):
    """SGD with momentum and Gradient Centralization (GC) on conv-layer weights.

    L2 weight decay is folded into the gradient first; then, for gradients
    with more than 3 dimensions, the mean over every dimension except the
    first is subtracted. The (possibly centralized) gradient then drives a
    standard momentum / Nesterov update. Because decay is added before
    centralization, the decay term of those layers is centralized as well.

    Arguments:
        params (iterable): parameters to optimize or dicts defining
            parameter groups.
        lr (float): learning rate (required).
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): L2 penalty coefficient (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).

    Raises:
        ValueError: for a negative ``lr``, ``momentum`` or ``weight_decay``,
            or when ``nesterov`` is requested without a positive momentum
            and zero dampening.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restore state; groups lacking ``'nesterov'`` get it set to ``False``."""
        super(SGD_GCC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Weight decay and centralization are applied in place, so ``p.grad``
        holds the modified gradient afterwards.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure
            was supplied.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            lr = group['lr']
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                if weight_decay != 0:
                    # Keyword ``alpha`` replaces the deprecated
                    # ``add_(scalar, tensor)`` overload.
                    d_p.add_(p.data, alpha=weight_decay)

                # GC operation for Conv layers: remove the mean taken over
                # all dimensions except the first.
                if d_p.dim() > 3:
                    d_p.sub_(d_p.mean(dim=tuple(range(1, d_p.dim())), keepdim=True))

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: buffer starts as a copy of the gradient.
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-lr)

        return loss

class SGD_GC(Optimizer):
    """SGD with momentum and Gradient Centralization (GC) on conv and FC weights.

    L2 weight decay is folded into the gradient first; then, for gradients
    with more than 1 dimension, the mean over every dimension except the
    first is subtracted. The (possibly centralized) gradient then drives a
    standard momentum / Nesterov update. Because decay is added before
    centralization, the decay term of those layers is centralized as well.

    Arguments:
        params (iterable): parameters to optimize or dicts defining
            parameter groups.
        lr (float): learning rate (required).
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): L2 penalty coefficient (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).

    Raises:
        ValueError: for a negative ``lr``, ``momentum`` or ``weight_decay``,
            or when ``nesterov`` is requested without a positive momentum
            and zero dampening.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GC, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restore state; groups lacking ``'nesterov'`` get it set to ``False``."""
        super(SGD_GC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Weight decay and centralization are applied in place, so ``p.grad``
        holds the modified gradient afterwards.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure
            was supplied.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            lr = group['lr']
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                if weight_decay != 0:
                    # Keyword ``alpha`` replaces the deprecated
                    # ``add_(scalar, tensor)`` overload.
                    d_p.add_(p.data, alpha=weight_decay)

                # GC operation for Conv layers and FC layers: remove the mean
                # taken over all dimensions except the first.
                if d_p.dim() > 1:
                    d_p.sub_(d_p.mean(dim=tuple(range(1, d_p.dim())), keepdim=True))

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: buffer starts as a copy of the gradient.
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-lr)

        return loss


class SGDW(Optimizer):
    """SGD with momentum and decoupled weight decay.

    The momentum / Nesterov update is built from the raw gradient only.
    Weight decay is applied afterwards as a separate shrink step,
    ``p -= lr * weight_decay * p_old``, where ``p_old`` is the value the
    parameter had before the step.

    Arguments:
        params (iterable): parameters to optimize or dicts defining
            parameter groups.
        lr (float): learning rate (required).
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): decoupled weight-decay coefficient
            (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).

    Raises:
        ValueError: for a negative ``lr``, ``momentum`` or ``weight_decay``,
            or when ``nesterov`` is requested without a positive momentum
            and zero dampening.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restore state; groups lacking ``'nesterov'`` get it set to ``False``."""
        super(SGDW, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure
            was supplied.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            lr = group['lr']
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # Snapshot the pre-update weights only when the decay term is
                # actually going to be applied; otherwise the copy is wasted.
                old = torch.clone(p.data).detach() if weight_decay != 0 else None

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: buffer starts as a copy of the gradient.
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        # Keyword ``alpha`` replaces the deprecated
                        # ``add_(scalar, tensor)`` overload.
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-lr)

                if weight_decay != 0:
                    p.data.add_(old, alpha=-weight_decay * lr)

        return loss


class SGDW_GCC(Optimizer):
    """SGD with decoupled weight decay plus Gradient Centralization (GC).

    GC is applied only to gradients with more than 3 dimensions (conv-layer
    weights): the mean over every dimension except the first is subtracted
    from the gradient before the momentum update. Weight decay is decoupled:
    it is applied to the parameter after the gradient step, using the
    parameter's value from before the step.

    Arguments:
        params (iterable): parameters to optimize or dicts defining
            parameter groups.
        lr (float): learning rate (required).
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): decoupled weight-decay coefficient
            (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).

    Raises:
        ValueError: for a negative ``lr``, ``momentum`` or ``weight_decay``,
            or when ``nesterov`` is requested without a positive momentum
            and zero dampening.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restore state; groups lacking ``'nesterov'`` get it set to ``False``."""
        super(SGDW_GCC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Note that the centralization is done in place, so ``p.grad`` of
        conv-layer weights holds the centralized gradient afterwards.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure
            was supplied.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            lr = group['lr']
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # Snapshot the pre-update weights only when the decay term is
                # actually going to be applied.
                old = torch.clone(p.data).detach() if weight_decay != 0 else None

                # GC operation for Conv layers: remove the mean taken over
                # all dimensions except the first.
                if d_p.dim() > 3:
                    d_p.sub_(d_p.mean(dim=tuple(range(1, d_p.dim())), keepdim=True))

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: buffer starts as a copy of the gradient.
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        # Keyword ``alpha`` replaces the deprecated
                        # ``add_(scalar, tensor)`` overload.
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-lr)

                if weight_decay != 0:
                    p.data.add_(old, alpha=-weight_decay * lr)

        return loss


================================================
FILE: GC_code/ImageNet/main.py
================================================
import argparse
import os
import random
import shutil
import time
import warnings
import sys
#nohup python -W ignore main.py /mnt/v0/ --model r50bn --alg sgd1 -b 256 --gpug 1 --path r50bn_sgd1_b256_g4 > logout/r50bn_sgd1_b256_g4.log
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
#from myresnet_nbn import resnet18_nbn, resnet101_nbn,resnet50_nbn
from myresnet import resnet50, resnet101
from myresnetgn import resnet50gn, resnet101gn


from torch.optim import lr_scheduler


from SGD import SGD_GCC #import SGD with GC for Conv layer


# Names of every lowercase, public, callable entry in torchvision.models;
# used only to constrain and describe the --arch option.
model_names = sorted(name for name in models.__dict__
    if name.islower() and not name.startswith("__")
    and callable(models.__dict__[name]))

parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')

parser.add_argument('-b', '--batch-size', default=256, type=int,
                    metavar='N',
                    help='mini-batch size (default: 256), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')

parser.add_argument('--lr', '--learning-rate', default=0.1*128/128, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')

# NOTE: the network that is actually trained is selected by --model below;
# --arch is only consulted for the alexnet/vgg DataParallel special case and
# stored in checkpoints.
parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50',
                    choices=model_names,
                    help='model architecture: ' +
                        ' | '.join(model_names) +
                        ' (default: resnet50)')

parser.add_argument('data', metavar='DIR',
                    help='path to dataset')

parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                    help='number of data loading workers (default: 4)')
parser.add_argument('--epochs', default=100, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')

parser.add_argument('--bgn', default=1, type=int, help='bn group number')

parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)',
                    dest='weight_decay')
parser.add_argument('-p', '--print-freq', default=100, type=int,
                    metavar='N', help='print frequency (default: 100)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
                    help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')
parser.add_argument('--model', default='r50bn', type=str,
                    help='network to train: r50bn | r50gn | r101bn | r101gn '
                         '(default: r50bn)')
parser.add_argument('--path', default='test', type=str,
                    help='name of the model file saved under ./result_model/ '
                         '(default: test)')
parser.add_argument('--alg', default='sgd', type=str,
                    help='optimizer: sgd | sgdGC (default: sgd)')


# Best top-1 validation accuracy seen so far; updated by main_worker.
best_acc1 = 0
device_ids=[0,1,2,3,4,5,6,7]

def main():
    """Parse the command line and launch training in one or many processes.

    With ``--multiprocessing-distributed`` one worker process is spawned per
    visible GPU; otherwise ``main_worker`` is called directly in this process.
    """
    args = parser.parse_args()
    # Default to the first four GPUs, but do not clobber a device selection
    # the user already made through the environment.
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0,1,2,3")

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)


def main_worker(gpu, ngpus_per_node, args):
    """Build model, optimizer and data loaders, then train and validate.

    Arguments:
        gpu: GPU index assigned to this process, or ``None``.
        ngpus_per_node: number of GPUs visible on this node.
        args: parsed command-line namespace. It is mutated in place
            (``gpu``, ``rank``, ``batch_size``, ``workers``, ``start_epoch``).

    Raises:
        ValueError: if ``args.model`` or ``args.alg`` is not a supported name.
    """
    global best_acc1
    args.gpu = gpu

    # Fail fast with a clear message instead of a NameError further down.
    model_factories = {
        'r50bn': resnet50,
        'r50gn': resnet50gn,
        'r101bn': resnet101,
        'r101gn': resnet101gn,
    }
    if args.model not in model_factories:
        raise ValueError("Unknown --model '{}'; expected one of: {}".format(
            args.model, ', '.join(sorted(model_factories))))
    if args.alg not in ('sgd', 'sgdGC'):
        raise ValueError("Unknown --alg '{}'; expected one of: sgd, sgdGC".format(
            args.alg))

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
    # create model
    model = model_factories[args.model]()

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model)
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    # choose optimizer (args.alg was validated above)
    if args.alg == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = SGD_GCC(model.parameters(), args.lr,
                            momentum=args.momentum,
                            weight_decay=args.weight_decay)

    exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            # The optimizer state restores the already-decayed learning rate,
            # but the scheduler's epoch counter is not checkpointed. Align it
            # with the resumed epoch so later decays happen on schedule.
            exp_lr_scheduler.last_epoch = args.start_epoch
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler,drop_last=True)

    # NOTE(review): drop_last=True on the validation loader discards the final
    # partial batch, so reported accuracy may not cover every validation
    # sample. Kept as-is to stay comparable with existing results; confirm
    # whether this is intended.
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True,drop_last=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    # The per-epoch model dump below needs its target directory to exist.
    os.makedirs('./result_model', exist_ok=True)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)
        exp_lr_scheduler.step()
        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer' : optimizer.state_dict(),
            }, is_best)
        torch.save(model.module, './result_model/'+args.path+'.pth')

# train
def train(train_loader, model, criterion, optimizer, epoch, args):
    """Train ``model`` for one epoch and print the epoch-average accuracy.

    Arguments:
        train_loader: iterable yielding ``(images, target)`` batches.
        model: network to optimize; switched to train mode here.
        criterion: loss function applied to ``(output, target)``.
        optimizer: optimizer stepped once per batch.
        epoch: epoch index, used only for the progress message.
        args: parsed command-line namespace (currently unused here).
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    # switch to train mode
    model.train()
    print('\nEpoch: %d' % epoch)
    end = time.time()
    for images, target in train_loader:
        # measure data loading time
        data_time.update(time.time() - end)

        images, target = images.to('cuda'), target.to('cuda')

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
    print('Training: Top1: {top1.avg:.4f}|Top5:{top5.avg:.4f}'.format(top1=top1, top5=top5))
    #print('Training: Loss: {:.3f} | Acc: {:.3f}'.format(train_loss/(i+1),correct/total))

# validate
def validate(val_loader, model, criterion, args):
    """Evaluate ``model`` on ``val_loader`` and return the average top-1 score.

    Arguments:
        val_loader: iterable yielding ``(images, target)`` batches.
        model: network to evaluate; switched to eval mode here.
        criterion: loss function applied to ``(output, target)``.
        args: parsed command-line namespace; ``args.gpu`` selects the device
            the batches are moved to.

    Returns:
        The running average of the top-1 values produced by ``accuracy``.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for images, target in val_loader:
            if args.gpu is not None:
                images = images.cuda(args.gpu, non_blocking=True)
            target = target.cuda(args.gpu, non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
        print('Testing: Top1: {top1.avg:.4f}|Top5:{top5.avg:.4f}'.format(top1=top1, top5=top5))
    return top1.avg


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Serialize ``state`` to ``filename``; duplicate it when it is the best.

    Arguments:
        state: object handed to ``torch.save``.
        is_best: when true, the saved file is also copied to
            ``model_best.pth.tar`` in the current working directory.
        filename: destination path of the checkpoint.
    """
    torch.save(state, filename)
    if not is_best:
        return
    shutil.copyfile(filename, 'model_best.pth.tar')


class AverageMeter(object):
    """Keeps the latest value together with a weighted running mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, the running sum, the count and the mean."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the mean."""
        self.val = val
        weighted = val * n
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count


def adjust_learning_rate(optimizer, epoch, args):
    """Set every param group's LR to ``args.lr`` divided by 10 per 30 epochs."""
    completed_stages = epoch // 30
    lr = args.lr * (0.1 ** completed_stages)
    for group in optimizer.param_groups:
        group['lr'] = lr


def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k.

    Arguments:
        output: score tensor; the top-k is taken along dimension 1.
        target: tensor of ground-truth class indices, one per sample.
        topk: iterable of k values to evaluate.

    Returns:
        A list with one 1-element float tensor per entry of ``topk``, each
        holding the *fraction* (not percentage) of samples whose target is
        among the k highest-scoring classes.
    """
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # ``correct`` derives from a transposed tensor and may not be
            # contiguous, in which case ``view(-1)`` raises for k > 1;
            # ``reshape`` copies only when it has to.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(1.0 / batch_size))
        return res


if __name__ == '__main__':
    main()


================================================
FILE: GC_code/ImageNet/myresnet.py
================================================
from __future__ import print_function, division, absolute_import
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo


# Public names exported by ``from myresnet import *``.
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
           'resnet152']


# Checkpoint download URL for each architecture; read by the factory
# functions below when they are called with ``pretrained=True``.
model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}


def conv3x3(in_planes, out_planes, stride=1):
    """Return a biased 3x3 ``nn.Conv2d`` with padding 1 and the given stride."""
    conv_options = dict(kernel_size=3, stride=stride, padding=1, bias=True)
    return nn.Conv2d(in_planes, out_planes, **conv_options)


class BasicBlock(nn.Module):
    """Two 3x3 conv + BatchNorm layers wrapped by a residual connection."""

    # Ratio of the block's output channels to ``planes``.
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        # Submodules are registered in the order they are applied.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        # Optional module applied to the input before it is added back.
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # Use the input itself as the shortcut unless a projection was given.
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)


class Bottleneck(nn.Module):
    """1x1 -> 3x3 -> 1x1 conv stack (each with BatchNorm) plus a residual add."""

    # The last 1x1 conv widens the output to ``planes * 4`` channels.
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        out_planes = planes * 4
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=True)
        self.bn1 = nn.BatchNorm2d(planes)
        # Only the 3x3 conv carries the stride.
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=True)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=True)
        self.bn3 = nn.BatchNorm2d(out_planes)
        self.relu = nn.ReLU(inplace=True)
        # Optional module applied to the input before it is added back.
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # Use the input itself as the shortcut unless a projection was given.
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)

#from torch.legacy import nn as nnl

class ResNet(nn.Module):
    """ResNet built from BatchNorm residual blocks with biased convolutions.

    Arguments:
        block: residual block class; must expose an ``expansion`` attribute
            and accept ``(inplanes, planes, stride, downsample)``.
        layers: sequence of four ints, the number of blocks in each stage.
        num_classes: size of the final linear layer's output.
    """

    def __init__(self, block, layers, num_classes=1000):
        super(ResNet, self).__init__()
        # Channel count feeding the next block; advanced by _make_layer.
        self.inplanes = 64

        # Stem: 7x7 stride-2 conv, BatchNorm, ReLU, 3x3 stride-2 max pool.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=True)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four residual stages; every stage after the first uses stride 2.
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        # Head: fixed 7x7 average pool followed by the classifier.
        self.avgpool = nn.AvgPool2d(7)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        # Conv weights ~ N(0, sqrt(2 / (kh * kw * out_channels)));
        # BatchNorm starts with weight 1 and bias 0.
        for module in self.modules():
            if isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()
            elif isinstance(module, nn.Conv2d):
                kernel_h, kernel_w = module.kernel_size
                n = kernel_h * kernel_w * module.out_channels
                module.weight.data.normal_(0, math.sqrt(2. / n))

    def _make_layer(self, block, planes, blocks, stride=1):
        """Stack ``blocks`` blocks; only the first one gets stride/projection."""
        out_channels = planes * block.expansion

        downsample = None
        if stride != 1 or self.inplanes != out_channels:
            # 1x1 conv + BatchNorm so the shortcut matches the block output.
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, out_channels,
                          kernel_size=1, stride=stride, bias=True),
                nn.BatchNorm2d(out_channels),
            )

        stage = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = out_channels
        stage.extend(block(self.inplanes, planes) for _ in range(1, blocks))

        return nn.Sequential(*stage)

    def forward(self, x):
        x = self.conv1(x)
        # Keep a copy of the first convolution's output on the module.
        self.conv1_input = x.clone()
        x = self.maxpool(self.relu(self.bn1(x)))

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)


def resnet18(pretrained=False, **kwargs):
    """Constructs a ResNet-18 model (``BasicBlock``, stages of 2/2/2/2 blocks).

    Args:
        pretrained (bool): If True, load the state dict downloaded from
            ``model_urls['resnet18']`` into the model.
        **kwargs: forwarded to the ``ResNet`` constructor.
    """
    net = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
    if not pretrained:
        return net
    # NOTE(review): this network's convolutions are created with bias=True;
    # confirm the downloaded checkpoint provides matching keys.
    net.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
    return net


def resnet34(pretrained=False, **kwargs):
    """Constructs a ResNet-34 model (``BasicBlock``, stages of 3/4/6/3 blocks).

    Args:
        pretrained (bool): If True, load the state dict downloaded from
            ``model_urls['resnet34']`` into the model.
        **kwargs: forwarded to the ``ResNet`` constructor.
    """
    net = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
    if not pretrained:
        return net
    # NOTE(review): this network's convolutions are created with bias=True;
    # confirm the downloaded checkpoint provides matching keys.
    net.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
    return net


def resnet50(pretrained=False, **kwargs):
    """Constructs a ResNet-50 model (``Bottleneck``, stages of 3/4/6/3 blocks).

    Args:
        pretrained (bool): If True, load the state dict downloaded from
            ``model_urls['resnet50']`` into the model.
        **kwargs: forwarded to the ``ResNet`` constructor.
    """
    net = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if not pretrained:
        return net
    # NOTE(review): this network's convolutions are created with bias=True;
    # confirm the downloaded checkpoint provides matching keys.
    net.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
    return net


def resnet101(pretrained=False, **kwargs):
    """Constructs a ResNet-101 model (``Bottleneck``, stages of 3/4/23/3 blocks).

    Args:
        pretrained (bool): If True, load the state dict downloaded from
            ``model_urls['resnet101']`` into the model.
        **kwargs: forwarded to the ``ResNet`` constructor.
    """
    net = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
    if not pretrained:
        return net
    # NOTE(review): this network's convolutions are created with bias=True;
    # confirm the downloaded checkpoint provides matching keys.
    net.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
    return net


def resnet152(pretrained=False, **kwargs):
    """Constructs a ResNet-152 model (``Bottleneck``, stages of 3/8/36/3 blocks).

    Args:
        pretrained (bool): If True, load the state dict downloaded from
            ``model_urls['resnet152']`` into the model.
        **kwargs: forwarded to the ``ResNet`` constructor.
    """
    net = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
    if not pretrained:
        return net
    # NOTE(review): this network's convolutions are created with bias=True;
    # confirm the downloaded checkpoint provides matching keys.
    net.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
    return net


def test():
    """Smoke test: run a random batch through ResNet-18 and print the result."""
    # Imported locally because this module only imports ``torch.nn`` and
    # ``torch.utils.model_zoo`` at file level.
    import torch

    net = resnet18()
    net.eval()
    # A plain tensor is enough as model input; no ``Variable`` wrapper needed.
    x = torch.randn(2, 3, 224, 224)
    y = net(x)
    print(y.size())
    print(net)
#test()


================================================
FILE: GC_code/ImageNet/myresnetgn.py
================================================
from __future__ import print_function, division, absolute_import
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo


# Public names exported by ``from myresnetgn import *``. The factory
# functions in this module carry a ``gn`` suffix, so the exported names must
# match them; listing undefined names makes a star-import fail.
__all__ = ['ResNet', 'resnet18gn', 'resnet34gn', 'resnet50gn', 'resnet101gn',
           'resnet152gn']


# Checkpoint download URL for each architecture depth; read by the factory
# functions below when they are called with ``pretrained=True``.
model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}


def conv3x3(in_planes, out_planes, stride=1):
    """Return a biased 3x3 ``nn.Conv2d`` with padding 1 and the given stride."""
    conv_options = dict(kernel_size=3, stride=stride, padding=1, bias=True)
    return nn.Conv2d(in_planes, out_planes, **conv_options)


class BasicBlock(nn.Module):
    """Two 3x3 conv + GroupNorm(32 groups) layers with a residual connection."""

    # Ratio of the block's output channels to ``planes``.
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        # Attribute names keep the ``bn`` prefix although they hold GroupNorm.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.GroupNorm(32, planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.GroupNorm(32, planes)
        # Optional module applied to the input before it is added back.
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # Use the input itself as the shortcut unless a projection was given.
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)


class Bottleneck(nn.Module):
    """1x1 -> 3x3 -> 1x1 conv stack (each with GroupNorm) plus a residual add."""

    # The last 1x1 conv widens the output to ``planes * 4`` channels.
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        out_planes = planes * 4
        # Attribute names keep the ``bn`` prefix although they hold GroupNorm.
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=True)
        self.bn1 = nn.GroupNorm(32, planes)
        # Only the 3x3 conv carries the stride.
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=True)
        self.bn2 = nn.GroupNorm(32, planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=True)
        self.bn3 = nn.GroupNorm(32, out_planes)
        self.relu = nn.ReLU(inplace=True)
        # Optional module applied to the input before it is added back.
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # Use the input itself as the shortcut unless a projection was given.
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)

#from torch.legacy import nn as nnl

class ResNet(nn.Module):
    """ResNet built from GroupNorm residual blocks with biased convolutions.

    Arguments:
        block: residual block class; must expose an ``expansion`` attribute
            and accept ``(inplanes, planes, stride, downsample)``.
        layers: sequence of four ints, the number of blocks in each stage.
        num_classes: size of the final linear layer's output.
    """

    def __init__(self, block, layers, num_classes=1000):
        super(ResNet, self).__init__()
        # Channel count feeding the next block; advanced by _make_layer.
        self.inplanes = 64

        # Stem: 7x7 stride-2 conv, GroupNorm, ReLU, 3x3 stride-2 max pool.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=True)
        self.bn1 = nn.GroupNorm(32, 64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four residual stages; every stage after the first uses stride 2.
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        # Head: fixed 7x7 average pool followed by the classifier.
        self.avgpool = nn.AvgPool2d(7)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        # Conv weights ~ N(0, sqrt(2 / (kh * kw * out_channels)));
        # GroupNorm starts with weight 1 and bias 0.
        for module in self.modules():
            if isinstance(module, nn.GroupNorm):
                module.weight.data.fill_(1)
                module.bias.data.zero_()
            elif isinstance(module, nn.Conv2d):
                kernel_h, kernel_w = module.kernel_size
                n = kernel_h * kernel_w * module.out_channels
                module.weight.data.normal_(0, math.sqrt(2. / n))

    def _make_layer(self, block, planes, blocks, stride=1):
        """Stack ``blocks`` blocks; only the first one gets stride/projection."""
        out_channels = planes * block.expansion

        downsample = None
        if stride != 1 or self.inplanes != out_channels:
            # 1x1 conv + GroupNorm so the shortcut matches the block output.
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, out_channels,
                          kernel_size=1, stride=stride, bias=True),
                nn.GroupNorm(32, out_channels),
            )

        stage = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = out_channels
        stage.extend(block(self.inplanes, planes) for _ in range(1, blocks))

        return nn.Sequential(*stage)

    def forward(self, x):
        x = self.conv1(x)
        # Keep a copy of the first convolution's output on the module.
        self.conv1_input = x.clone()
        x = self.maxpool(self.relu(self.bn1(x)))

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)


def resnet18gn(pretrained=False, **kwargs):
    """Constructs a GroupNorm ResNet-18 (``BasicBlock``, stages of 2/2/2/2).

    Args:
        pretrained (bool): If True, load the state dict downloaded from
            ``model_urls['resnet18']`` into the model.
        **kwargs: forwarded to the ``ResNet`` constructor.
    """
    net = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
    if not pretrained:
        return net
    # NOTE(review): this network uses GroupNorm layers and bias=True convs;
    # confirm the downloaded checkpoint provides matching keys.
    net.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
    return net


def resnet34gn(pretrained=False, **kwargs):
    """Constructs a GroupNorm ResNet-34 (``BasicBlock``, stages of 3/4/6/3).

    Args:
        pretrained (bool): If True, load the state dict downloaded from
            ``model_urls['resnet34']`` into the model.
        **kwargs: forwarded to the ``ResNet`` constructor.
    """
    net = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
    if not pretrained:
        return net
    # NOTE(review): this network uses GroupNorm layers and bias=True convs;
    # confirm the downloaded checkpoint provides matching keys.
    net.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
    return net


def resnet50gn(pretrained=False, **kwargs):
    """Constructs a GroupNorm ResNet-50 (``Bottleneck``, stages of 3/4/6/3).

    Args:
        pretrained (bool): If True, load the state dict downloaded from
            ``model_urls['resnet50']`` into the model.
        **kwargs: forwarded to the ``ResNet`` constructor.
    """
    net = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if not pretrained:
        return net
    # NOTE(review): this network uses GroupNorm layers and bias=True convs;
    # confirm the downloaded checkpoint provides matching keys.
    net.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
    return net


def resnet101gn(pretrained=False, **kwargs):
    """Constructs a GroupNorm ResNet-101 (``Bottleneck``, stages of 3/4/23/3).

    Args:
        pretrained (bool): If True, load the state dict downloaded from
            ``model_urls['resnet101']`` into the model.
        **kwargs: forwarded to the ``ResNet`` constructor.
    """
    net = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
    if not pretrained:
        return net
    # NOTE(review): this network uses GroupNorm layers and bias=True convs;
    # confirm the downloaded checkpoint provides matching keys.
    net.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
    return net


def resnet152gn(pretrained=False, **kwargs):
    """Constructs a GroupNorm ResNet-152 (``Bottleneck``, stages of 3/8/36/3).

    Args:
        pretrained (bool): If True, load the state dict downloaded from
            ``model_urls['resnet152']`` into the model.
        **kwargs: forwarded to the ``ResNet`` constructor.
    """
    net = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
    if not pretrained:
        return net
    # NOTE(review): this network uses GroupNorm layers and bias=True convs;
    # confirm the downloaded checkpoint provides matching keys.
    net.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
    return net


def test():
    """Smoke test: run a random batch through the GroupNorm ResNet-18."""
    # Imported locally because this module only imports ``torch.nn`` and
    # ``torch.utils.model_zoo`` at file level.
    import torch

    net = resnet18gn()
    net.eval()
    x = torch.randn(2, 3, 224, 224)
    y = net(x)
    print(y.size())
    print(net)
#test()


================================================
FILE: GC_code/ImageNet/os_run.py
================================================

import os,time


# Launch lines for the two ImageNet runs (plain SGD and SGD with GC).
# Each string begins with '#', which a POSIX shell reads as the start of a
# comment, so as written these calls do not start any training job.
launch_commands = (
    "#nohup python -W ignore main.py /ssd/data/yonghw/Imagenet/v0/ --model r50bn --alg sgd -b 256 --path r50bn_sgd_b256_g4 > logout/r50bn_sgd_b256_g4.log &",
    "#nohup python -W ignore main.py /ssd/data/yonghw/Imagenet/v0/ --model r50bn --alg sgdGC -b 256 --path r50bn_sgdGC_b256_g4 > logout/r50bn_sgdGC_b256_g4.log &",
)

for launch_command in launch_commands:
    os.system(launch_command)


================================================
FILE: GC_code/Mini_ImageNet/SGD.py
================================================
import torch
from torch.optim.optimizer import Optimizer, required


class SGD_GCC(Optimizer):
    """SGD with momentum plus Gradient Centralization (GC) on conv weights.

    The update is the classic SGD-with-momentum rule with one extra step:
    for every parameter whose gradient has more than 3 dimensions (conv
    kernels), the mean over all dimensions except the first is subtracted
    from the gradient before the momentum update.

    Arguments:
        params (iterable): parameters to optimize or dicts defining
            parameter groups.
        lr (float): learning rate.
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): L2 penalty coefficient; the term
            ``weight_decay * p`` is added to the gradient (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).

    Raises:
        ValueError: on a negative ``lr``, ``momentum`` or ``weight_decay``,
            or when ``nesterov`` is requested without a positive momentum
            and zero dampening.

    Note:
        ``step`` modifies ``p.grad`` in place: both the weight-decay term and
        the centralization are written into the gradient tensor itself.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGD_GCC, self).__setstate__(state)
        # Restored param groups may lack the 'nesterov' key; default it.
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or None if no closure is given.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                if weight_decay != 0:
                    # Keyword ``alpha`` replaces the deprecated
                    # add_(Number, Tensor) positional overload.
                    d_p.add_(p.data, alpha=weight_decay)

                # GC operation for Conv layers: remove the mean taken over
                # every dimension except the first.
                if d_p.dim() > 3:
                    d_p.sub_(d_p.mean(dim=tuple(range(1, d_p.dim())), keepdim=True))

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: the buffer starts as a copy of the
                        # gradient (dampening is not applied here).
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])

        return loss

class SGD_GC(Optimizer):
    """SGD with momentum plus Gradient Centralization (GC) on conv and FC weights.

    The update is the classic SGD-with-momentum rule with one extra step:
    for every parameter whose gradient has more than 1 dimension (conv
    kernels and fully-connected weight matrices), the mean over all
    dimensions except the first is subtracted from the gradient before the
    momentum update. One-dimensional parameters are left untouched.

    Arguments:
        params (iterable): parameters to optimize or dicts defining
            parameter groups.
        lr (float): learning rate.
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): L2 penalty coefficient; the term
            ``weight_decay * p`` is added to the gradient (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).

    Raises:
        ValueError: on a negative ``lr``, ``momentum`` or ``weight_decay``,
            or when ``nesterov`` is requested without a positive momentum
            and zero dampening.

    Note:
        ``step`` modifies ``p.grad`` in place: both the weight-decay term and
        the centralization are written into the gradient tensor itself.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGD_GC, self).__setstate__(state)
        # Restored param groups may lack the 'nesterov' key; default it.
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or None if no closure is given.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                if weight_decay != 0:
                    # Keyword ``alpha`` replaces the deprecated
                    # add_(Number, Tensor) positional overload.
                    d_p.add_(p.data, alpha=weight_decay)

                # GC operation for Conv layers and FC layers: remove the mean
                # taken over every dimension except the first.
                if d_p.dim() > 1:
                    d_p.sub_(d_p.mean(dim=tuple(range(1, d_p.dim())), keepdim=True))

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: the buffer starts as a copy of the
                        # gradient (dampening is not applied here).
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])

        return loss


class SGDW(Optimizer):
    """SGD with momentum and decoupled weight decay.

    Weight decay is not added to the gradient. Instead, after the gradient
    (or momentum) step, ``lr * weight_decay * p_old`` is subtracted from the
    parameter, where ``p_old`` is the parameter value before the step.

    Arguments:
        params (iterable): parameters to optimize or dicts defining
            parameter groups.
        lr (float): learning rate.
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): decoupled weight decay coefficient
            (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).

    Raises:
        ValueError: on a negative ``lr``, ``momentum`` or ``weight_decay``,
            or when ``nesterov`` is requested without a positive momentum
            and zero dampening.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGDW, self).__setstate__(state)
        # Restored param groups may lack the 'nesterov' key; default it.
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or None if no closure is given.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # Snapshot the pre-step weights only when decay will use them;
                # the original cloned every parameter on every step.
                old = torch.clone(p.data).detach() if weight_decay != 0 else None

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: zeros * momentum + grad, i.e. the buffer
                        # starts as the gradient (dampening is not applied).
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        # Keyword ``alpha`` replaces the deprecated
                        # add_(Number, Tensor) positional overload.
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])

                # Decoupled weight decay, based on the pre-step weights.
                if old is not None:
                    p.data.add_(old, alpha=-weight_decay * group['lr'])

        return loss


class SGDW_GCC(Optimizer):
    """SGD with momentum, decoupled weight decay and GC on conv weights.

    For every parameter whose gradient has more than 3 dimensions (conv
    kernels), the mean over all dimensions except the first is subtracted
    from the gradient before the momentum update. Weight decay is not added
    to the gradient; after the step, ``lr * weight_decay * p_old`` is
    subtracted from the parameter, where ``p_old`` is the pre-step value.

    Arguments:
        params (iterable): parameters to optimize or dicts defining
            parameter groups.
        lr (float): learning rate.
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): decoupled weight decay coefficient
            (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).

    Raises:
        ValueError: on a negative ``lr``, ``momentum`` or ``weight_decay``,
            or when ``nesterov`` is requested without a positive momentum
            and zero dampening.

    Note:
        ``step`` modifies ``p.grad`` in place when centralizing it.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGDW_GCC, self).__setstate__(state)
        # Restored param groups may lack the 'nesterov' key; default it.
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or None if no closure is given.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # Snapshot the pre-step weights only when decay will use them;
                # the original cloned every parameter on every step.
                old = torch.clone(p.data).detach() if weight_decay != 0 else None

                # GC operation for Conv layers: remove the mean taken over
                # every dimension except the first.
                if d_p.dim() > 3:
                    d_p.sub_(d_p.mean(dim=tuple(range(1, d_p.dim())), keepdim=True))

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: zeros * momentum + grad, i.e. the buffer
                        # starts as the gradient (dampening is not applied).
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        # Keyword ``alpha`` replaces the deprecated
                        # add_(Number, Tensor) positional overload.
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])

                # Decoupled weight decay, based on the pre-step weights.
                if old is not None:
                    p.data.add_(old, alpha=-weight_decay * group['lr'])

        return loss


================================================
FILE: GC_code/Mini_ImageNet/main.py
================================================
import argparse
import os
import random
import shutil
import time
import warnings
import sys
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
from resnet_ws import l_resnet50

import torchvision.models as models
import math
import numpy as np
from torch.optim import lr_scheduler


from SGD import SGD_GC #import SGD with GC

# Sorted names of the lowercase, non-dunder callables exposed by
# torchvision.models; used below as the allowed values for --arch.
model_names = sorted(
    name
    for name, obj in models.__dict__.items()
    if name.islower() and not name.startswith("__") and callable(obj)
)

# Command-line interface. Fixes relative to the previous revision are limited
# to help strings that disagreed with the actual defaults or said only 'model'.
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')

parser.add_argument('-b', '--batch_size', default=256, type=int,
                    metavar='N',
                    help='mini-batch size (default: 256), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')

parser.add_argument('--lr', '--learning-rate', default=0.1*32/32, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')

# NOTE: the network that is actually built is selected by --model; --arch is
# only consulted for the alexnet/vgg DataParallel special case in main_worker.
parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50',
                    choices=model_names,
                    help='model architecture: ' +
                        ' | '.join(model_names) +
                        ' (default: resnet50)')

parser.add_argument('data', metavar='DIR',
                    help='path to dataset')

parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                    help='number of data loading workers (default: 4)')
parser.add_argument('--epochs', default=100, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')

parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)',
                    dest='weight_decay')
parser.add_argument('-p', '--print-freq', default=100, type=int,
                    metavar='N', help='print frequency (default: 100)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
                    help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')
parser.add_argument('--model', default='r18', type=str,
                    help='network to train: r50 | r50ws')
parser.add_argument('--path', default='test', type=str,
                    help='name tag for this run (currently unused)')
parser.add_argument('--alg', default='sgd', type=str,
                    help='optimizer: sgd | sgdGC')


# Best top-1 validation accuracy seen so far; updated by main_worker.
best_acc1 = 0
# NOTE(review): not referenced anywhere else in this script.
device_ids=[0,1,2,3,4,5,6,7]

def main():
    """Parse the command line and start training.

    Pins CUDA_VISIBLE_DEVICES to "0,1,2,3", optionally seeds the RNGs, works
    out whether the run is distributed, and then either spawns one
    ``main_worker`` process per visible GPU or calls ``main_worker`` directly.
    """
    args = parser.parse_args()
    os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3"

    seed = args.seed
    if seed is not None:
        random.seed(seed)
        torch.manual_seed(seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    # With env:// initialisation the world size comes from the environment
    # unless it was given explicitly.
    uses_env_init = args.dist_url == "env://"
    if uses_env_init and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.multiprocessing_distributed or args.world_size > 1

    ngpus_per_node = torch.cuda.device_count()
    if not args.multiprocessing_distributed:
        # Single process: run the worker in-place.
        main_worker(args.gpu, ngpus_per_node, args)
        return

    # One process per GPU on this node, so the total world size is scaled by
    # the number of GPUs before the workers are spawned.
    args.world_size = ngpus_per_node * args.world_size
    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))


def main_worker(gpu, ngpus_per_node, args):
    """Build the model, optimizer and data loaders, then train and validate.

    Arguments:
        gpu: GPU index for this process, or None; stored in ``args.gpu``.
        ngpus_per_node (int): number of GPUs on this node; used to derive the
            global rank and to split batch size / workers per process.
        args: parsed command-line namespace (mutated in place: ``gpu``,
            and in distributed mode ``rank``, ``batch_size``, ``workers``).

    Raises:
        ValueError: if ``args.model`` is not 'r50' / 'r50ws' or ``args.alg``
            is not 'sgd' / 'sgdGC'. Previously these cases fell through and
            failed later with a NameError on ``model`` / ``optimizer``.
    """
    global best_acc1
    args.gpu = gpu
    #momentum=pow(math.e,math.log(0.9)/64*args.batch_size/ngpus_per_node/args.bgn)
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)


    # create model
    num_classes=100
    if args.model=='r50':
        model = models.resnet50()
        model.fc= nn.Linear(in_features=2048, out_features=num_classes, bias=True)
    elif args.model=='r50ws':
        model =l_resnet50(num_classes=num_classes)
    else:
        raise ValueError(
            "Unsupported --model '{}'; expected 'r50' or 'r50ws'".format(args.model))

    # Re-initialise conv weights and norm-layer affine parameters.
    for m in model.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.GroupNorm):
                m.weight.data.uniform_()
                m.bias.data.zero_()

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model)
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) 
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)


    # choose optimizer
    if args.alg=='sgd':
       optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,momentum=args.momentum, weight_decay = args.weight_decay)
    elif args.alg=='sgdGC':
      optimizer = SGD_GC(model.parameters(), lr=args.lr,momentum=args.momentum, weight_decay = args.weight_decay)
    else:
        raise ValueError(
            "Unsupported --alg '{}'; expected 'sgd' or 'sgdGC'".format(args.alg))


    # Learning rate is multiplied by 0.1 every 30 epochs.
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)


    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
         ]))


    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler,drop_last=True)


    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        #adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)
        exp_lr_scheduler.step()
        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        # (is_best is only consumed by the disabled checkpoint code below)
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

#        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
#                and args.rank % ngpus_per_node == 0):
#            save_checkpoint({
#                'epoch': epoch + 1,
#                'arch': args.arch,
#                'state_dict': model.state_dict(),
#                'best_acc1': best_acc1,
#                'optimizer' : optimizer.state_dict(),
#            }, is_best)
            #torch.save(model.module, './result_model/'+args.path+'.pth')

#train
def train(train_loader, model, criterion, optimizer, epoch, args):
    """Train ``model`` for one epoch and print the epoch's averages.

    Arguments:
        train_loader: iterable yielding ``(images, target)`` batches.
        model: network to train; switched to train mode here.
        criterion: loss function called as ``criterion(output, target)``.
        optimizer: optimizer stepped once per batch.
        epoch (int): epoch index, used only for the progress print.
        args: parsed command-line namespace (not read by the active code).

    Batches are moved to the default CUDA device. At the end of the epoch the
    running top-1 accuracy and loss averages are printed.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()
    print('\nEpoch: %d' % epoch)
    end = time.time()
    for images, target in train_loader:
        # measure data loading time
        data_time.update(time.time() - end)

        images, target = images.to('cuda'), target.to('cuda')

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        batch_size = images.size(0)
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), batch_size)
        top1.update(acc1[0], batch_size)
        top5.update(acc5[0], batch_size)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

    print('Training: Top1: {top1.avg:.4f}|loss:{losses.avg:.4f}'.format(top1=top1, losses=losses))

# test
def validate(val_loader, model, criterion, args):
    """Evaluate ``model`` on ``val_loader`` and return the average top-1 accuracy.

    Arguments:
        val_loader: iterable yielding ``(images, target)`` batches.
        model: network to evaluate; switched to eval mode here.
        criterion: loss function called as ``criterion(output, target)``.
        args: parsed command-line namespace; ``args.gpu`` selects the device
            the batches are moved to (images are moved only when it is set).

    Returns:
        ``top1.avg``: the sample-weighted mean of the per-batch top-1 values
        produced by ``accuracy``.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for images, target in val_loader:
            if args.gpu is not None:
                images = images.cuda(args.gpu, non_blocking=True)
            target = target.cuda(args.gpu, non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            batch_size = images.size(0)
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), batch_size)
            top1.update(acc1[0], batch_size)
            top5.update(acc5[0], batch_size)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
        print('Testing: Top1: {top1.avg:.4f}|loss:{losses.avg:.4f}'.format(top1=top1, losses=losses))
    return top1.avg


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Serialize ``state`` to ``filename``; also copy it to 'model_best.pth.tar' when ``is_best``."""
    torch.save(state, filename)
    if not is_best:
        return
    shutil.copyfile(filename, 'model_best.pth.tar')


class AverageMeter(object):
    """Tracks the latest value and the running weighted average of a metric.

    Attributes set by ``reset`` / ``update``: ``val`` (last value), ``sum``
    (weighted total), ``count`` (total weight) and ``avg`` (``sum / count``).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed with weight ``n`` and refresh the average."""
        self.val = val
        weighted = val * n
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count


def adjust_learning_rate(optimizer, epoch, args):
    """Set every param group's lr to ``args.lr`` decayed by 10x per 30 epochs."""
    num_decays = epoch // 30
    new_lr = args.lr * (0.1 ** num_decays)
    for group in optimizer.param_groups:
        group['lr'] = new_lr


def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k.

    Arguments:
        output: 2-D score tensor; the top-k is taken along dimension 1.
        target: 1-D tensor of class indices, one per row of ``output``.
        topk (tuple of int): the values of k to evaluate.

    Returns:
        A list with one 1-element tensor per k, holding the fraction (in
        [0, 1], not a percentage) of samples whose target is among the k
        highest-scoring classes.
    """
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        # pred: (maxk, batch) after the transpose; row i is the i-th best guess.
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # reshape, not view: ``correct`` is derived from a transposed
            # tensor and may be non-contiguous, in which case view(-1) raises.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(1.0 / batch_size))
        return res


# Start training only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()


================================================
FILE: GC_code/Mini_ImageNet/os_run.py
================================================
#cifar100 e200 bs128  gs  2,4,8,16
import os,time

#print('runing mini_imagenet.py')


# Launch four mini-ImageNet runs (models r50 / r50ws, optimizers sgd / sgdGC),
# all with batch size 128. The commands do not end with '&', so os.system
# waits for each run to exit before the next one starts.
# Each run's stdout is redirected to a log file under logout/; that directory
# is not created here.

# r50 with plain SGD.
os.system("nohup  python -W ignore main.py /home/yonghw/data/mini_imagenet/split_mini/ --model r50  -b 128 --alg sgd   > logout/r50_b128_sgd.log  ")

# r50 with SGD + gradient centralization.
os.system("nohup  python -W ignore main.py /home/yonghw/data/mini_imagenet/split_mini/ --model r50  -b 128 --alg sgdGC   > logout/r50_b128_sgdGC.log  ")

# r50ws with plain SGD.
os.system("nohup  python -W ignore main.py /home/yonghw/data/mini_imagenet/split_mini/ --model r50ws  -b 128 --alg sgd   > logout/r50ws_b128_sgd.log  ")

# r50ws with SGD + gradient centralization.
os.system("nohup  python -W ignore main.py /home/yonghw/data/mini_imagenet/split_mini/ --model r50ws  -b 128 --alg sgdGC   > logout/r50ws_b128_sgdGC.log  ")


================================================
FILE: GC_code/Mini_ImageNet/resnet_ws.py
================================================
import torch.nn as nn
import torch.utils.model_zoo as model_zoo

import torch
import torch.nn as nn
from torch.nn.parameter import Parameter
from torch.nn import functional as F

#from .. import layers as L
import math

# Public names exported by a star-import of this module.
__all__ = ['ResNet', 'l_resnet18', 'l_resnet34', 'l_resnet50', 'l_resnet101',
           'l_resnet152']


class Conv2d(nn.Conv2d):
    """2-D convolution that standardizes its weight on every forward pass.

    For each output filter the weight is shifted to zero mean and divided by
    its standard deviation (plus 1e-5) before the convolution is applied.
    The stored ``self.weight`` parameter itself is not modified.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1, bias=True):
        super(Conv2d, self).__init__(
            in_channels, out_channels, kernel_size, stride=stride,
            padding=padding, dilation=dilation, groups=groups, bias=bias)

    def forward(self, x):
        """Convolve ``x`` with the standardized version of ``self.weight``."""
        kernel = self.weight

        # Per-filter mean, reduced one axis at a time over dims 1, 2 and 3.
        filter_mean = kernel.mean(dim=1, keepdim=True)
        filter_mean = filter_mean.mean(dim=2, keepdim=True)
        filter_mean = filter_mean.mean(dim=3, keepdim=True)
        centered = kernel - filter_mean

        # Per-filter std over the flattened filter; the epsilon keeps the
        # division finite when a filter is (nearly) constant.
        num_filters = centered.size(0)
        flat = centered.view(num_filters, -1)
        scale = flat.std(dim=1).view(-1, 1, 1, 1) + 1e-5
        standardized = centered / scale.expand_as(centered)

        return F.conv2d(x, standardized, self.bias, self.stride,
                        self.padding, self.dilation, self.groups)


def BatchNorm2d(num_features):
    """Return the normalization layer used throughout this file.

    Currently an ``nn.BatchNorm2d`` over ``num_features`` channels.
    """
    # Disabled alternative kept from the original:
    #   nn.GroupNorm(num_channels=num_features, num_groups=32)
    norm_layer = nn.BatchNorm2d(num_features)
    return norm_layer


def conv3x3(in_planes, out_planes, stride=1):
    """Return a bias-free 3x3 ``Conv2d`` with padding 1 and the given stride."""
    layer = Conv2d(in_planes, out_planes, 3, stride=stride, padding=1,
                   bias=False)
    return layer


def conv1x1(in_planes, out_planes, stride=1):
    """Return a bias-free 1x1 ``Conv2d`` with the given stride."""
    layer = Conv2d(in_planes, out_planes, 1, stride=stride, bias=False)
    return layer


class BasicBlock(nn.Module):
    """Residual block with two 3x3 convolutions.

    Arguments:
        inplanes: number of input channels.
        planes: number of channels produced by both convolutions.
        stride: stride of the first convolution.
        downsample: optional module applied to the input to produce the
            shortcut; when None the input itself is the shortcut.
    """
    # Output channels = planes * expansion.
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        # First conv carries the stride.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(main_branch(x) + shortcut(x))``."""
        # Main branch: conv-bn-relu, then conv-bn.
        residual = self.relu(self.bn1(self.conv1(x)))
        residual = self.bn2(self.conv2(residual))

        # Shortcut branch.
        shortcut = x
        if self.downsample is not None:
            shortcut = self.downsample(x)

        residual += shortcut
        return self.relu(residual)


class Bottleneck(nn.Module):
    """Residual block with a 1x1 -> 3x3 -> 1x1 convolution stack.

    Arguments:
        inplanes: number of input channels.
        planes: channel width of the first two convolutions; the last
            convolution outputs ``planes * expansion`` channels.
        stride: stride of the 3x3 convolution.
        downsample: optional module applied to the input to produce the
            shortcut; when None the input itself is the shortcut.
    """
    # Output channels = planes * expansion.
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        out_planes = planes * self.expansion
        self.conv1 = conv1x1(inplanes, planes)
        self.bn1 = BatchNorm2d(planes)
        # The 3x3 conv carries the stride.
        self.conv2 = conv3x3(planes, planes, stride)
        self.bn2 = BatchNorm2d(planes)
        self.conv3 = conv1x1(planes, out_planes)
        self.bn3 = BatchNorm2d(out_planes)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(main_branch(x) + shortcut(x))``."""
        # Main branch: two conv-bn-relu stages, then conv-bn.
        residual = self.relu(self.bn1(self.conv1(x)))
        residual = self.relu(self.bn2(self.conv2(residual)))
        residual = self.bn3(self.conv3(residual))

        # Shortcut branch.
        shortcut = x
        if self.downsample is not None:
            shortcut = self.downsample(x)

        residual += shortcut
        return self.relu(residual)


class ResNet(nn.Module):
    """ResNet: a convolutional stem, four residual stages and a linear classifier.

    Arguments:
        block: residual block class used for every stage; it must expose an
            ``expansion`` class attribute
        layers: sequence whose first four entries are the block counts of
            ``layer1`` .. ``layer4``
        num_classes (int, optional): output size of the final linear layer
            (default: 1000)
        zero_init_residual (bool, optional): if True, set the weight of the
            last batch norm of every ``Bottleneck`` (``bn3``) and
            ``BasicBlock`` (``bn2``) to zero (default: False)
    """

    def __init__(self, block, layers, num_classes=1000, zero_init_residual=False):
        super(ResNet, self).__init__()
        self.inplanes = 64
        self.conv1 = Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        self._init_parameters(zero_init_residual)

    def _init_parameters(self, zero_init_residual):
        """Initialise convolution and batch-norm parameters of all sub-modules."""
        for module in self.modules():
            if isinstance(module, Conv2d):
                # Normal init with std sqrt(2 / (kernel_h * kernel_w * out_channels)).
                fan_out = module.kernel_size[0] * module.kernel_size[1] * module.out_channels
                module.weight.data.normal_(0, math.sqrt(2. / fan_out))
            elif isinstance(module, nn.BatchNorm2d):
                # Batch-norm scale is drawn with ``uniform_()``; the shift is zeroed.
                module.weight.data.uniform_()
                module.bias.data.zero_()

        # Zero-initialize the last BN in each residual branch, so that the
        # residual branch starts with zeros and each residual block behaves
        # like an identity. This improves the model by 0.2~0.3% according to
        # https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for module in self.modules():
                if isinstance(module, Bottleneck):
                    nn.init.constant_(module.bn3.weight, 0)
                elif isinstance(module, BasicBlock):
                    nn.init.constant_(module.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one stage of ``blocks`` residual blocks and update ``self.inplanes``.

        Only the first block receives ``stride``. A 1x1-conv + batch-norm
        shortcut is attached to it when the stride is not 1 or the channel
        count changes.
        """
        out_planes = planes * block.expansion

        downsample = None
        if stride != 1 or self.inplanes != out_planes:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, out_planes, stride),
                BatchNorm2d(out_planes),
            )

        stage = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = out_planes
        stage.extend(block(self.inplanes, planes) for _ in range(1, blocks))

        return nn.Sequential(*stage)

    def forward(self, x):
        """Apply stem, the four stages, average pooling and the classifier."""
        pipeline = (
            self.conv1, self.bn1, self.relu, self.maxpool,
            self.layer1, self.layer2, self.layer3, self.layer4,
            self.avgpool,
        )
        for stage in pipeline:
            x = stage(x)

        # Flatten everything but the batch dimension before the linear layer.
        x = x.view(x.size(0), -1)
        return self.fc(x)


def l_resnet18(pretrained=False, **kwargs):
    """Construct a ResNet-18: ``BasicBlock`` with stage depths [2, 2, 2, 2].

    Args:
        pretrained (bool): accepted but not used by this function; no
            weights are loaded here
        **kwargs: forwarded to ``ResNet``
    """
    return ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)


def l_resnet34(pretrained=False, **kwargs):
    """Construct a ResNet-34: ``BasicBlock`` with stage depths [3, 4, 6, 3].

    Args:
        pretrained (bool): accepted but not used by this function; no
            weights are loaded here
        **kwargs: forwarded to ``ResNet``
    """
    return ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)


def l_resnet50(pretrained=False, **kwargs):
    """Construct a ResNet-50: ``Bottleneck`` with stage depths [3, 4, 6, 3].

    Args:
        pretrained (bool): accepted but not used by this function; no
            weights are loaded here
        **kwargs: forwarded to ``ResNet``
    """
    return ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)


def l_resnet101(pretrained=False, **kwargs):
    """Construct a ResNet-101: ``Bottleneck`` with stage depths [3, 4, 23, 3].

    Args:
        pretrained (bool): accepted but not used by this function; no
            weights are loaded here
        **kwargs: forwarded to ``ResNet``
    """
    return ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)


def l_resnet152(pretrained=False, **kwargs):
    """Construct a ResNet-152: ``Bottleneck`` with stage depths [3, 8, 36, 3].

    Args:
        pretrained (bool): accepted but not used by this function; no
            weights are loaded here
        **kwargs: forwarded to ``ResNet``
    """
    return ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)


================================================
FILE: README.md
================================================
# Gradient Centralization

## [Gradient Centralization: A New Optimization Technique for Deep Neural Networks](https://arxiv.org/abs/2004.01461)

***

## Introduction

* Gradient Centralization (GC) is a simple and effective optimization technique for Deep Neural Networks (DNNs), which operates directly on gradients by centralizing the gradient vectors to have zero mean. It can both speed up the training process and improve the final generalization performance of DNNs. GC is very simple to implement and can be easily embedded into existing gradient-based DNN optimizers with only a few lines of code. It can also be directly used to fine-tune pre-trained DNNs. Please refer to [algorithm-GC](https://github.com/Yonghongwei/Gradient-Centralization/blob/master/algorithm-GC/) for the code of more advanced optimizers.

<div  align="center"><img src="https://github.com/Yonghongwei/Gradient-Centralization/blob/master/fig/gradient.png" height="45%" width="45%" alt="Illustration of the GC operation on gradient matrix/tensor of weights in the fully-connected layer (left) and convolutional layer (right)."/></div>

* GC can be viewed as a projected gradient descent method with a constrained loss function.  The Lipschitzness of the constrained loss function and its gradient is better so that the training process becomes more efficient and stable.   Our experiments on various applications, including `general image classification`, `fine-grained image classification`, `detection and segmentation` and `Person ReID` demonstrate that GC can consistently improve the performance of DNN learning. 

<div  align="center"><img src="https://github.com/Yonghongwei/Gradient-Centralization/blob/master/fig/projected_Grad.png" height="50%" width="50%" alt=""/></div>

* The optimizers are provided in the files: [`SGD.py`](https://github.com/Yonghongwei/Gradient-Centralization/blob/master/GC_code/CIFAR100/algorithm/SGD.py), [`Adam.py`](https://github.com/Yonghongwei/Gradient-Centralization/blob/master/GC_code/CIFAR100/algorithm/Adam.py) and [`Adagrad.py`](https://github.com/Yonghongwei/Gradient-Centralization/blob/master/GC_code/CIFAR100/algorithm/Adagrad.py), including SGD_GC, SGD_GCC, SGDW_GCC, Adam_GC, Adam_GCC, Adam_GCC2, AdamW_GCC, AdamW_GCC2  and Adagrad_GCC. The optimizers with "_GC" use GC for both Conv layers and FC layers, and the optimizers with "_GCC" use GC only for Conv layers. For adaptive learning rate methods, keeping mean of weight vector unchanged usually works better. Please refer to Adam_GCC2 and AdamW_GCC2. We can use the following codes to import SGD_GC:
```python
from SGD import SGD_GC 
```

***

## Update
* 2020/04/07:Release a pytorch implementation of optimizers with GC, and provide some examples on classification task, including
general image classification (Mini-ImageNet, CIFAR100 and ImageNet) and fine-grained image classification (FGVC Aircraft, Stanford Cars, Stanford Dogs and CUB-200-2011).

* 2020/04/14:Release the code of GC on MMdetection and update some tables of experimental results.

* 2020/05/07:Release the code of GC on Person ReID and show some results on Market1501.

* 2020/08/08:Release the code of some advanced optimizers with GC.
***

## Citation
    @inproceedings{GradientCentra,
      title={Gradient Centralization: A New Optimization Technique for Deep Neural Networks},
      author={Hongwei Yong and Jianqiang Huang and Xiansheng Hua and Lei Zhang},
      booktitle={the European Conference on Computer Vision},
      year={2020}
    }

***
## Link to the other implementation of GC
* Gradient Centralization in TensorFlow [`https://github.com/Rishit-dagli/Gradient-Centralization-TensorFlow`](https://github.com/Rishit-dagli/Gradient-Centralization-TensorFlow)
* Gradient Centralization in Ranger optimizer [`https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer`](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer)


## Experiments
***

### General Image Classification
* Mini-ImageNet

The codes are in [`GC_code/Mini_ImageNet`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/GC_code/Mini_ImageNet). The split dataset can be downloaded from [here](https://drive.google.com/open?id=1XWRjPzwRWChNgvemqsylYM1ocpxhGtfy) (Google drive) or [here](https://pan.baidu.com/s/1Ah6Lu8OSfAVc3PZM-mPpvw) (Baidu drive, safe code: 1681). The following figure  is training loss (left) and testing accuracy (right) curves vs. training epoch on the Mini-ImageNet. The ResNet50 is used as the DNN model. The compared optimization techniques include BN, BN+GC, BN+WS and BN+WS+GC.

<div  align="center"><img src="https://github.com/Yonghongwei/Gradient-Centralization/blob/master/fig/miniIN_largeBN.png" height="60%" width="60%" alt=""/></div>

*  CIFAR100

The codes are in [`GC_code/CIFAR100`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/GC_code/CIFAR100).

*  ImageNet

The codes are in [`GC_code/ImageNet`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/GC_code/ImageNet). The following table is the Top-1 error rates on ImageNet w/o GC and w/ GC:
    
|Backbone       |  R50BN        |R50GN         | R101BN      | R101GN      |
| :-----------: | :-----------: | :----:       |:------:     |:-------:    |
| w/o GC        | 23.71         |24.50         |22.37        |23.34        |
| w/ GC         | 23.21         |23.53         |21.82        |22.14        |

The following figure  is the training error (left) and validation error (right) curves vs. training epoch on
ImageNet. The DNN model is ResNet50 with GN.
<div  align="center"><img src="https://github.com/Yonghongwei/Gradient-Centralization/blob/master/fig/Imagnet_r50GN2.png" height="60%" width="60%" alt=""/></div>


***

### Fine-grained Image Classification
The codes are in [`GC_code/Fine-grained_classification`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/GC_code/Fine-grained_classification).  The preprocessed dataset can be downloaded from [here](https://drive.google.com/open?id=1c3OnKq3EsMKK1OerWdouCG7hvN8Rv8yh). The following table is the testing accuracies on the four fine-grained image classification datasets with ResNet50:

|Datasets       | FGVC Aircraft |Stanford Cars |Stanford Dogs| CUB-200-2011|
| :-----------: | :-----------: | :----:       |:------:     |:-------:    |
| w/o GC        | 86.62         |88.66         |76.16        |82.07        |
| w/ GC         | 87.77         |90.03         |78.23        |83.40        |

The following figure is the training accuracy (solid line) and testing accuracy (dotted line) curves vs. training epoch on four fine-grained image classification datasets:

<div  align="center"><img src="https://github.com/Yonghongwei/Gradient-Centralization/blob/master/fig/fine_grid2_c.png" height="100%" width="100%" alt=""/></div>

***

### Object Detection and Segmentation
The codes are in [`MMdetection`](https://github.com/Yonghongwei/mmdetection). Please put [`SGD.py`](https://github.com/Yonghongwei/mmdetection/blob/master/tools/SGD.py) in [`MMdetection\tools\`](https://github.com/Yonghongwei/mmdetection/tree/master/tools), and update [`MMdetection\tools\train.py`](https://github.com/Yonghongwei/mmdetection/blob/master/tools/train.py). Then, if you want to use the SGD_GC optimizer, just update the optimizer in the [`configs`](https://github.com/Yonghongwei/mmdetection/blob/master/configs/) file. For example, if we want to use SGD_GC to optimize Faster_RCNN with a ResNet50 backbone and FPN, we update the 151st line in [`MMdetection/configs/faster_rcnn_r50_fpn_1x.py`](https://github.com/Yonghongwei/mmdetection/blob/master/configs/faster_rcnn_r50_fpn_1x.py). The following table shows the detection results on COCO using Faster-RCNN and FPN with various backbone models:

| Method        | Backbone      |  AP   | AP<sub>.5</sub> | AP<sub>.75</sub> | Backbone |  AP  | AP<sub>.5</sub> | AP<sub>.75</sub> |
| :-----------: | :-----------: | :----:|:------:|:-------: | :-----------: | :----:|:------:|:-------: |
| w/o GC        | R50           |  36.4 |  58.4  |  39.1    | X101-32x4d    |  40.1 |  62.0  |   43.8   |
| w/ GC         | R50           |  37.0 |  59.0  |  40.2    | X101-32x4d    |  40.7 |  62.7  |   43.9   |
| w/o GC        | R101          |  38.5 |  60.3  |  41.6    | X101-64x4d    |  41.3 |  63.3  |   45.2   |
| w/ GC         | R101          |  38.9 |  60.8  |  42.2    | X101-64x4d    |  41.6 |  63.8  |   45.4   |

The following table is the detection and segmentation results on COCO by using Mask-RCNN and FPN with various backbone models:

| Method        | Backbone      |  AP<sup>b</sup>  | AP<sup>b</sup><sub>.5</sub>| AP<sup>b</sup><sub>.75</sub>|  AP<sup>m</sup>   | AP<sup>m</sup><sub>.5</sub>| AP<sup>m</sup><sub>.75</sub> |
| :-----------: | :-----------: | :----:|:------:|:-------:| :----:|:------:|:-------: |
| w/o GC        | R50           | 37.4  | 59.0   | 40.6    | 34.1  | 55.5   | 36.1     |
| w/ GC         | R50           | 37.9  | 59.6   | 41.2    | 34.7  | 56.1   | 37.0     |
| w/o GC        | R101          | 39.4  | 60.9   | 43.3    | 35.9  | 57.7   | 38.4     |
| w/ GC         | R101          | 40.0  | 61.5   | 43.7    | 36.2  | 58.1   | 38.7     |
| w/o GC        | X101-32x4d    | 41.1  | 62.8   | 45.0    | 37.1  | 59.4   | 39.8     |
| w/ GC         | X101-32x4d    | 41.6  | 63.1   | 45.5    | 37.4  | 59.8   | 39.9     |
| w/o GC        | X101-64x4d    | 42.1  | 63.8   | 46.3    | 38.0  | 60.6   | 40.9     |
| w/ GC         | X101-64x4d    | 42.8  | 64.5   | 46.8    | 38.4  | 61.0   | 41.1     |
| w/o GC        | R50 (4c1f)    | 37.5  | 58.2   | 41.0    | 33.9  | 55.0   | 36.1     |
| w/ GC         | R50 (4c1f)    | 38.4  | 59.5   | 41.8    | 34.6  | 55.9   | 36.7     |
| w/o GC        | R101GN        | 41.1  | 61.7   | 44.9    | 36.9  | 58.7   | 39.3     |
| w/ GC         | R101GN        | 41.7  | 62.3   | 45.3    | 37.4  | 59.3   | 40.3     |
| w/o GC        | R50GN+WS      | 40.0  | 60.7   | 43.6    | 36.1  | 57.8   | 38.6     |
| w/ GC         | R50GN+WS      | 40.6  | 61.3   | 43.9    | 36.6  | 58.2   | 39.1     |

***

### Person ReId
The codes are in [`PersonReId`](https://github.com/Yonghongwei/reid-strong-baseline). Please put [`SGD.py`](https://github.com/Yonghongwei/reid-strong-baseline/tree/master/tools/SGD.py) in [`reid-strong-baseline\tools\`](https://github.com/Yonghongwei/reid-strong-baseline/tree/master/tools), and update [`reid-strong-baseline\solver\build.py`](https://github.com/Yonghongwei/reid-strong-baseline/blob/master/solver/build.py). For Market1501, please use the SGD_GCC algorithm with
learning rate 0.03 or 0.02 and weight decay 0.002. For example, you can change the '.sh' file with the following codes: 
```python
python3 tools/train.py --config_file='configs/softmax_triplet_with_center.yml' MODEL.DEVICE_ID "('0')" DATASETS.NAMES "('market1501')" DATASETS.ROOT_DIR "('/home/yonghw/data/reid/')" OUTPUT_DIR "('out_dir/market1501/test')" SOLVER.OPTIMIZER_NAME "('SGD_GCC')" SOLVER.BASE_LR "(0.03)" SOLVER.WEIGHT_DECAY "(0.002)" SOLVER.WEIGHT_DECAY_BIAS "(0.002)"
```
The results of Market1501 without reranking are shown in the following table:
| Method        | Backbone      |  MAP    | Top 1    |
| :-----------: | :-----------: |:------:|:-------: |
|  Adam*        | R18           | 77.8   |  91.7   |
| SGD_GCC       | R18           | 81.3   | 92.7    |
|  Adam*        | R50           | 85.9   | 94.5    |
| SGD_GCC       | R50           |  86.6  |  94.8   |
|  Adam*        | R101          |  87.1  | 94.5    |
| SGD_GCC       | R101          |  87.9  |  95.0   |

The results with * are reported by the authors in [reid-strong-baseline](https://github.com/michuanhaohao/reid-strong-baseline). Our reproduced results are slightly lower than the results provided by the authors.


================================================
FILE: algorithm-GC/README.md
================================================
# Advanced-optimizer-with-Gradient-Centralization
Advanced optimizer with Gradient-Centralization
Please Refer to
## [Gradient Centralization: A New Optimization Technique for Deep Neural Networks](https://arxiv.org/abs/2004.01461)

## Introduction

We embed GC into some advanced DNN optimizers, including [`SGD.py`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/algorithm-GC/algorithm/SGD.py),
[`Adam.py`](https://github.com/Yonghongwei/Advanced-optimizer-with-Gradient-Centralization/blob/master/algorithm/Adam.py), [`AdamW`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/algorithm-GC/algorithm/Adam.py), [`RAdam`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/algorithm-GC/algorithm/RAdam.py),[`Lookahead`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/algorithm-GC/algorithm/Lookahead.py)+[`SGD.py`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/algorithm-GC/algorithm/SGD.py), [`Lookahead`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/algorithm-GC/algorithm/Lookahead.py)+[`Adam.py`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/algorithm-GC/algorithm/Adam.py), [`Ranger`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/algorithm-GC/algorithm/Ranger.py).

There are three hyper-parameters: `use_gc`, `gc_conv_only` and `gc_loc`. `use_gc=True` means that the algorithm applies the GC operation; otherwise it does not. `gc_conv_only=True` means that the algorithm applies GC only to Conv layers; otherwise it applies GC to both Conv and FC layers. `gc_loc` controls the location of the GC operation for adaptive learning rate algorithms, including Adam, RAdam, Ranger and so on. There are two locations in the algorithm where GC can be applied: to the original gradient (`gc_loc=True`) or to the generalized gradient (`gc_loc=False`). The generalized gradient is the variable that is directly used to update the weights. For adaptive learning rate algorithms, we suggest `gc_loc=False`. For SGD, these two locations for GC are equivalent, so we do not introduce the hyper-parameter `gc_loc`.

We also give an example of how to use these algorithms in [`Cifar`](https://github.com/Yonghongwei/Gradient-Centralization/blob/master/algorithm-GC/cifar/main.py). 
For example: 

```python
# SGD
optimizer = SGD(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = args.weight_decay,use_gc=True, gc_conv_only=False) 
```

```python
# Adam
optimizer = Adam(net.parameters(), lr=args.lr, weight_decay = args.weight_decay,use_gc=True, gc_conv_only=False,gc_loc=False) 
```

```python
# RAdam
optimizer = RAdam(net.parameters(), lr=args.lr, weight_decay = args.weight_decay,use_gc=True, gc_conv_only=False,gc_loc=False)
```
```python
# lookahead+SGD
base_opt = SGD(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = args.weight_decay,use_gc=False, gc_conv_only=False)
optimizer = Lookahead(base_opt, k=5, alpha=0.5)
```
```python
# Ranger
optimizer = Ranger(net.parameters(), lr=args.lr, weight_decay = args.weight_decay,use_gc=True, gc_conv_only=False,gc_loc=False)
```
## References:
* Adam: https://arxiv.org/abs/1412.6980

* AdamW: https://arxiv.org/abs/1711.05101

* Lookahead: https://arxiv.org/abs/1907.08610

* RAdam: https://arxiv.org/abs/1908.03265, https://github.com/LiyuanLucasLiu/RAdam

* Ranger: https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer

* Gradient Centralization: https://arxiv.org/abs/2004.01461v2


================================================
FILE: algorithm-GC/algorithm/Adam.py
================================================
import math
import torch
from torch.optim.optimizer import Optimizer
from .Centralization import centralized_gradient

class Adam(Optimizer):
    r"""Adam optimizer with an optional Gradient Centralization (GC) step.

    Adam has been proposed in `Adam: A Method for Stochastic Optimization`_.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)
        use_gc (bool, optional): forwarded to ``centralized_gradient``
            (default: False)
        gc_conv_only (bool, optional): forwarded to ``centralized_gradient``
            (default: False)
        gc_loc (bool, optional): when truthy, ``centralized_gradient`` is
            applied to the (weight-decayed) gradient before the moment
            estimates are updated; when equal to ``False`` it is applied to
            the update direction ``exp_avg / denom`` instead (default: False)

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False, use_gc=False, gc_conv_only=False,
                 gc_loc=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        super(Adam, self).__init__(params, dict(
            lr=lr,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            amsgrad=amsgrad,
        ))
        # The GC switches live on the optimizer instance, not in the
        # per-group defaults.
        self.use_gc = use_gc
        self.gc_conv_only = gc_conv_only
        self.gc_loc = gc_loc

    def __setstate__(self, state):
        super(Adam, self).__setstate__(state)
        # Groups restored without an 'amsgrad' entry fall back to False.
        for param_group in self.param_groups:
            param_group.setdefault('amsgrad', False)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure is given.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                grad = p.grad
                if grad is None:
                    continue
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # Lazily create the per-parameter buffers on first use.
                if not state:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    if amsgrad:
                        # Running maximum of the second-moment average
                        state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)

                exp_avg = state['exp_avg']
                exp_avg_sq = state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                step_count = state['step']
                bias_correction1 = 1 - beta1 ** step_count
                bias_correction2 = 1 - beta2 ** step_count

                # L2 penalty is folded into the gradient (out of place).
                weight_decay = group['weight_decay']
                if weight_decay != 0:
                    grad = grad.add(p, alpha=weight_decay)
                # GC location 1: the gradient itself.
                if self.gc_loc:
                    grad = centralized_gradient(
                        grad, use_gc=self.use_gc, gc_conv_only=self.gc_conv_only)

                # Update the first and second moment running averages in place.
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Keep the element-wise maximum of all second-moment
                    # averages seen so far and normalise with it.
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    second_moment = max_exp_avg_sq
                else:
                    second_moment = exp_avg_sq
                denom = (second_moment.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                step_size = group['lr'] / bias_correction1

                # GC location 2: the update direction. The comparison is kept
                # as ``== False`` so that a non-bool falsy ``gc_loc`` such as
                # ``None`` skips both GC locations, exactly as before.
                update = exp_avg / denom
                if self.gc_loc == False:  # noqa: E712
                    update = centralized_gradient(
                        update, use_gc=self.use_gc, gc_conv_only=self.gc_conv_only)

                p.add_(update, alpha=-step_size)

        return loss


class AdamW(Optimizer):
    r"""AdamW optimizer with an optional Gradient Centralization (GC) step.

    The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_.
    The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay coefficient (default: 1e-2)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)
        use_gc (bool, optional): forwarded to ``centralized_gradient``
            (default: False)
        gc_conv_only (bool, optional): forwarded to ``centralized_gradient``
            (default: False)
        gc_loc (bool, optional): when truthy, ``centralized_gradient`` is
            applied to the gradient before the moment estimates are updated;
            when equal to ``False`` it is applied to the update direction
            (``exp_avg / denom`` plus the weight-decay term) instead
            (default: True)

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _Decoupled Weight Decay Regularization:
        https://arxiv.org/abs/1711.05101
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=1e-2, amsgrad=False, use_gc=False, gc_conv_only=False,
                 gc_loc=True):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        super(AdamW, self).__init__(params, dict(
            lr=lr,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            amsgrad=amsgrad,
        ))
        # The GC switches live on the optimizer instance, not in the
        # per-group defaults.
        self.use_gc = use_gc
        self.gc_conv_only = gc_conv_only
        self.gc_loc = gc_loc

    def __setstate__(self, state):
        super(AdamW, self).__setstate__(state)
        # Groups restored without an 'amsgrad' entry fall back to False.
        for param_group in self.param_groups:
            param_group.setdefault('amsgrad', False)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure is given.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                grad = p.grad
                if grad is None:
                    continue
                if grad.is_sparse:
                    raise RuntimeError('AdamW does not support sparse gradients')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # Lazily create the per-parameter buffers on first use.
                if not state:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    if amsgrad:
                        # Running maximum of the second-moment average
                        state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)

                exp_avg = state['exp_avg']
                exp_avg_sq = state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                step_count = state['step']
                bias_correction1 = 1 - beta1 ** step_count
                bias_correction2 = 1 - beta2 ** step_count

                # GC location 1: the gradient itself.
                if self.gc_loc:
                    grad = centralized_gradient(
                        grad, use_gc=self.use_gc, gc_conv_only=self.gc_conv_only)

                # Update the first and second moment running averages in place.
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Keep the element-wise maximum of all second-moment
                    # averages seen so far and normalise with it.
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    second_moment = max_exp_avg_sq
                else:
                    second_moment = exp_avg_sq
                denom = (second_moment.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                step_size = group['lr'] / bias_correction1

                # The weight-decay term is added to the update direction, so
                # it is scaled by ``step_size`` (lr / bias_correction1) below.
                update = (exp_avg / denom).add(p.data, alpha=group['weight_decay'])
                # GC location 2: the update direction. The comparison is kept
                # as ``== False`` so that a non-bool falsy ``gc_loc`` such as
                # ``None`` skips both GC locations, exactly as before.
                if self.gc_loc == False:  # noqa: E712
                    update = centralized_gradient(
                        update, use_gc=self.use_gc, gc_conv_only=self.gc_conv_only)

                p.add_(update, alpha=-step_size)

        return loss

================================================
FILE: algorithm-GC/algorithm/Centralization.py
================================================
import torch
#from torch.optim.optimizer import Optimizer, required


def centralized_gradient(x,use_gc=True,gc_conv_only=False):
    """Subtract from ``x``, in place, its mean over all dimensions except the first.

    Arguments:
        x: tensor to centralize; it is modified in place and also returned
        use_gc: when falsy, ``x`` is returned untouched
        gc_conv_only: when truthy, only tensors with more than 3 dimensions
            are centralized; otherwise every tensor with more than 1
            dimension is

    Returns:
        The same tensor object ``x``.
    """
    if not use_gc:
        return x

    rank = x.dim()
    # Tensors whose rank does not exceed this threshold are left alone.
    rank_threshold = 3 if gc_conv_only else 1
    if rank > rank_threshold:
        mean = x.mean(dim=tuple(range(1, rank)), keepdim=True)
        x.add_(-mean)
    return x


================================================
FILE: algorithm-GC/algorithm/Lookahead.py
================================================
from collections import defaultdict
from itertools import chain
from torch.optim import Optimizer
import torch
import warnings

class Lookahead(Optimizer):
    """Lookahead wrapper around another optimizer.

    The wrapped ("fast") optimizer performs the regular updates. On the first
    call to :meth:`step` and then once every ``k`` calls, each parameter's
    "slow" copy is moved a fraction ``alpha`` of the way towards the fast
    weights, and the fast weights are reset to that slow copy.

    Args:
        optimizer: inner optimizer whose ``step`` produces the fast weights.
        k (int): number of ``step`` calls between slow-weight synchronisations.
        alpha (float): interpolation factor applied when pulling the slow
            weights towards the fast weights.

    Note:
        ``Optimizer.__init__`` is not called. ``param_groups`` is the inner
        optimizer's own list (shared, not copied), so hyper-parameter changes
        made through this wrapper (e.g. by an LR scheduler) reach the inner
        optimizer.
    """

    def __init__(self, optimizer, k=5, alpha=0.5):
        self.optimizer = optimizer
        self.k = k
        self.alpha = alpha
        # Share the inner optimizer's groups so both objects see the same
        # hyper-parameters and the per-group "counter".
        self.param_groups = self.optimizer.param_groups
        # Optimizer.__init__ is bypassed, so expose the inner defaults for
        # code that reads ``optimizer.defaults``.
        self.defaults = getattr(self.optimizer, "defaults", {})
        self.state = defaultdict(dict)
        self.fast_state = self.optimizer.state
        for group in self.param_groups:
            group["counter"] = 0

    @staticmethod
    def _flat_params(param_groups):
        """Iterate over the ``params`` entries of all groups, in order."""
        return chain.from_iterable(group["params"] for group in param_groups)

    def update(self, group):
        """Synchronise slow and fast weights for every parameter in ``group``."""
        for fast in group["params"]:
            param_state = self.state[fast]
            if "slow_param" not in param_state:
                # First synchronisation: seed the slow copy from the fast weights.
                param_state["slow_param"] = torch.zeros_like(fast.data)
                param_state["slow_param"].copy_(fast.data)
            slow = param_state["slow_param"]
            slow += (fast.data - slow) * self.alpha
            fast.data.copy_(slow)

    def update_lookahead(self):
        """Force a slow/fast synchronisation of every parameter group."""
        for group in self.param_groups:
            self.update(group)

    def step(self, closure=None):
        """Run one inner-optimizer step, synchronising when the counter is 0.

        Returns:
            Whatever the inner optimizer's ``step`` returns.
        """
        loss = self.optimizer.step(closure)
        for group in self.param_groups:
            if group["counter"] == 0:
                self.update(group)
            group["counter"] += 1
            if group["counter"] >= self.k:
                group["counter"] = 0
        return loss

    def state_dict(self):
        """Return fast state, slow state and param groups as one dict.

        Slow-state entries use the same per-parameter keys as the inner
        optimizer's ``state_dict`` so that :meth:`load_state_dict` can map
        them back onto the live parameters.
        """
        fast_state_dict = self.optimizer.state_dict()
        param_groups = fast_state_dict["param_groups"]
        # Pair each live parameter with the key the inner optimizer packed it under.
        packed_key = dict(zip(self._flat_params(self.optimizer.param_groups),
                              self._flat_params(param_groups)))
        slow_state = {
            (packed_key.get(k, id(k)) if isinstance(k, torch.Tensor) else k): v
            for k, v in self.state.items()
        }
        return {
            "fast_state": fast_state_dict["state"],
            "slow_state": slow_state,
            "param_groups": param_groups,
        }

    def load_state_dict(self, state_dict):
        """Restore the inner optimizer state and the slow weights.

        Slow-state entries whose key cannot be matched to a current parameter
        (e.g. ``id``-keyed entries from older checkpoints) are skipped; the
        affected slow weights are re-seeded from the fast weights at the next
        synchronisation.
        """
        fast_state_dict = {
            "state": state_dict["fast_state"],
            "param_groups": state_dict["param_groups"],
        }
        # The inner optimizer validates the group layout against the checkpoint.
        self.optimizer.load_state_dict(fast_state_dict)
        # Loading may replace the inner optimizer's containers, so re-share them.
        self.param_groups = self.optimizer.param_groups
        self.fast_state = self.optimizer.state
        for group in self.param_groups:
            group.setdefault("counter", 0)

        live_param = dict(zip(self._flat_params(state_dict["param_groups"]),
                              self._flat_params(self.param_groups)))
        restored = defaultdict(dict)
        for key, entry in state_dict["slow_state"].items():
            param = live_param.get(key)
            if param is None:
                continue
            restored[param] = {
                name: (value.detach().clone().to(device=param.device, dtype=param.dtype)
                       if isinstance(value, torch.Tensor) else value)
                for name, value in entry.items()
            }
        self.state = restored

    def add_param_group(self, param_group):
        """Add a group to the inner optimizer with a fresh sync counter."""
        param_group["counter"] = 0
        self.optimizer.add_param_group(param_group)

================================================
FILE: algorithm-GC/algorithm/RAdam.py
================================================
import math
import torch
from torch.optim.optimizer import Optimizer
from .Centralization import centralized_gradient


class RAdam(Optimizer):
    """Rectified Adam with optional gradient centralization (GC).

    Args:
        params: iterable of parameters, or a list of param-group dicts.
        lr (float): learning rate.
        betas (tuple): decay rates of the running averages of the gradient
            and of its square.
        eps (float): value added to the square-root denominator.
        weight_decay (float): coefficient of the ``weight_decay * p`` term
            that is added to the update direction before it is scaled by the
            step size.
        degenerated_to_sgd (bool): while ``N_sma < 5``, take a bias-corrected
            momentum step if True; otherwise leave the parameter unchanged.
        use_gc (bool), gc_conv_only (bool): forwarded to
            ``centralized_gradient``.
        gc_loc (bool): if True the raw gradient is centralized before the
            moment updates; if False the final update direction is.

    Raises:
        ValueError: if ``lr``, ``eps`` or ``betas`` are out of range.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True,use_gc=False, gc_conv_only=False,gc_loc=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))

        self.degenerated_to_sgd = degenerated_to_sgd
        self.gc_loc=gc_loc
        self.use_gc=use_gc
        self.gc_conv_only=gc_conv_only

        # Groups that override ``betas`` get a private step-size cache, since
        # the cached values depend on the betas.
        if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict):
            for param in params:
                if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]):
                    param['buffer'] = [[None, None, None] for _ in range(10)]
        # ``buffer`` holds 10 slots of [step, N_sma, step_size], indexed by step % 10.
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, buffer=[[None, None, None] for _ in range(10)])
        super(RAdam, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(RAdam, self).__setstate__(state)

    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure (callable, optional): re-evaluates the model and returns
                the loss.

        Returns:
            The closure's return value, or ``None`` when no closure is given.

        Raises:
            RuntimeError: if a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError('RAdam does not support sparse gradients')

                # All arithmetic is done on an fp32 view/copy of the parameter.
                p_data_fp32 = p.data.float()

                state = self.state[p]

                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                if self.gc_loc:
                    grad = centralized_gradient(grad, use_gc=self.use_gc, gc_conv_only=self.gc_conv_only)

                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                state['step'] += 1
                buffered = group['buffer'][int(state['step'] % 10)]
                if state['step'] == buffered[0]:
                    # Another parameter already computed the values for this step.
                    N_sma, step_size = buffered[1], buffered[2]
                else:
                    buffered[0] = state['step']
                    beta2_t = beta2 ** state['step']
                    N_sma_max = 2 / (1 - beta2) - 1
                    N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
                    buffered[1] = N_sma

                    # more conservative since it's an approximated value
                    if N_sma >= 5:
                        step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
                    elif self.degenerated_to_sgd:
                        step_size = 1.0 / (1 - beta1 ** state['step'])
                    else:
                        # Sentinel: no update is taken for this step.
                        step_size = -1
                    buffered[2] = step_size

                # more conservative since it's an approximated value
                if N_sma >= 5:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    G_grad = exp_avg / denom
                elif step_size > 0:
                    # Copy: the in-place weight decay / GC below must not
                    # write into the stored first-moment estimate.
                    G_grad = exp_avg.clone()
                else:
                    # Rectification unavailable and SGD fallback disabled:
                    # leave the parameter untouched rather than reuse a stale
                    # direction with a negative step size.
                    continue

                if group['weight_decay'] != 0:
                    G_grad.add_(p_data_fp32, alpha=group['weight_decay'])
                # GC on the final update direction
                if not self.gc_loc:
                    G_grad = centralized_gradient(G_grad, use_gc=self.use_gc, gc_conv_only=self.gc_conv_only)
                p_data_fp32.add_(G_grad, alpha=-step_size * group['lr'])
                p.data.copy_(p_data_fp32)
        return loss

class PlainRAdam(Optimizer):
    """Rectified Adam without the step-size cache, with optional GC.

    Args:
        params: iterable of parameters, or a list of param-group dicts.
        lr (float): learning rate.
        betas (tuple): decay rates of the running averages of the gradient
            and of its square.
        eps (float): value added to the square-root denominator.
        weight_decay (float): coefficient of the ``weight_decay * p`` term
            added to the update direction before scaling by the step size.
        degenerated_to_sgd (bool): while ``N_sma < 5``, take a bias-corrected
            momentum step if True; otherwise leave the parameter unchanged.
        use_gc (bool), gc_conv_only (bool): forwarded to
            ``centralized_gradient``.
        gc_loc (bool): if True the raw gradient is centralized before the
            moment updates; if False the final update direction is.

    Raises:
        ValueError: if ``lr``, ``eps`` or ``betas`` are out of range.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True,use_gc=False, gc_conv_only=False,gc_loc=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))

        self.degenerated_to_sgd = degenerated_to_sgd
        self.gc_loc=gc_loc
        self.use_gc = use_gc
        self.gc_conv_only=gc_conv_only
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)

        super(PlainRAdam, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(PlainRAdam, self).__setstate__(state)

    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure (callable, optional): re-evaluates the model and returns
                the loss.

        Returns:
            The closure's return value, or ``None`` when no closure is given.

        Raises:
            RuntimeError: if a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError('RAdam does not support sparse gradients')

                # All arithmetic is done on an fp32 view/copy of the parameter.
                p_data_fp32 = p.data.float()

                state = self.state[p]

                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                if self.gc_loc:
                    grad = centralized_gradient(grad, use_gc=self.use_gc, gc_conv_only=self.gc_conv_only)

                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                state['step'] += 1
                beta2_t = beta2 ** state['step']
                N_sma_max = 2 / (1 - beta2) - 1
                N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)

                # ``step_size`` below already includes the learning rate.
                # more conservative since it's an approximated value
                if N_sma >= 5:
                    step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    G_grad = exp_avg / denom
                elif self.degenerated_to_sgd:
                    step_size = group['lr'] / (1 - beta1 ** state['step'])
                    # Copy: the in-place weight decay / GC below must not
                    # write into the stored first-moment estimate.
                    G_grad = exp_avg.clone()
                else:
                    # Rectification unavailable and SGD fallback disabled:
                    # no update direction exists for this step.
                    continue

                if group['weight_decay'] != 0:
                    G_grad.add_(p_data_fp32, alpha=group['weight_decay'])

                # GC on the final update direction
                if not self.gc_loc:
                    G_grad = centralized_gradient(G_grad, use_gc=self.use_gc, gc_conv_only=self.gc_conv_only)

                # The learning rate is part of ``step_size``; do not apply it again.
                p_data_fp32.add_(G_grad, alpha=-step_size)
                p.data.copy_(p_data_fp32)
        return loss

================================================
FILE: algorithm-GC/algorithm/Ranger.py
================================================
import math
import torch
from torch.optim.optimizer import Optimizer
from .Centralization import centralized_gradient


class Ranger(Optimizer):
    """RAdam-style update with integrated Lookahead and optional GC.

    Args:
        params: iterable of parameters, or a list of param-group dicts.
        lr (float): learning rate; must be > 0.
        alpha (float): Lookahead interpolation factor, in [0, 1].
        k (int): every ``k`` steps of a parameter its slow copy is
            synchronised with the fast weights; must be >= 1.
        N_sma_threshhold (number): the rectified (variance-normalised) update
            is used only while ``N_sma`` exceeds this value.
        betas (tuple): decay rates of the running averages of the gradient
            and of its square.
        eps (float): value added to the square-root denominator; must be > 0.
        weight_decay (float): coefficient of the ``weight_decay * p`` term
            added to the update direction before scaling by the step size.
        use_gc (bool), gc_conv_only (bool): forwarded to
            ``centralized_gradient``.
        gc_loc (bool): if True the raw gradient is centralized before the
            moment updates; if False the final update direction is.

    Raises:
        ValueError: if ``alpha``, ``k``, ``lr`` or ``eps`` are out of range.
    """

    def __init__(self, params, lr=1e-3,                       # lr
                 alpha=0.5, k=6, N_sma_threshhold=5,           # Ranger options
                 betas=(.95, 0.999), eps=1e-5, weight_decay=0,  # Adam options
                 # Gradient centralization on or off, applied to conv layers only or conv + fc layers
                 use_gc=False, gc_conv_only=False,gc_loc=False
                 ):

        # parameter checks
        if not 0.0 <= alpha <= 1.0:
            raise ValueError(f'Invalid slow update rate: {alpha}')
        if not 1 <= k:
            raise ValueError(f'Invalid lookahead steps: {k}')
        if not lr > 0:
            raise ValueError(f'Invalid Learning Rate: {lr}')
        if not eps > 0:
            raise ValueError(f'Invalid eps: {eps}')

        # prep defaults and init torch.optim base
        defaults = dict(lr=lr, alpha=alpha, k=k, step_counter=0, betas=betas,
                        N_sma_threshhold=N_sma_threshhold, eps=eps, weight_decay=weight_decay)
        super().__init__(params, defaults)

        # adjustable threshold
        self.N_sma_threshhold = N_sma_threshhold

        # look ahead params
        self.alpha = alpha
        self.k = k

        # 10 slots of [step, N_sma, step_size], indexed by step % 10 and
        # shared by all parameter groups.
        self.radam_buffer = [[None, None, None] for ind in range(10)]

        # gc on or off
        self.gc_loc=gc_loc
        self.use_gc = use_gc
        self.gc_conv_only=gc_conv_only

        print(
            f"Ranger optimizer loaded. \nGradient Centralization usage = {self.use_gc}")
        if (self.use_gc and self.gc_conv_only == False):
            print(f"GC applied to both conv and fc layers")
        elif (self.use_gc and self.gc_conv_only == True):
            print(f"GC applied to conv layers only")

    def __setstate__(self, state):
        print("set state called")
        super(Ranger, self).__setstate__(state)

    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure: optional callable that re-evaluates the model and
                returns the loss. Non-callable values (e.g. a loss passed as
                a float) are accepted and ignored.

        Returns:
            The closure's return value, or ``None`` if no callable was given.

        Raises:
            RuntimeError: if a parameter has a sparse gradient.
        """
        loss = None
        # Only real callables are invoked, so callers that hand in a plain
        # loss value instead of a closure keep working.
        if callable(closure):
            loss = closure()

        # Evaluate averages and grad, update param tensors
        for group in self.param_groups:

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data.float()

                if grad.is_sparse:
                    raise RuntimeError(
                        'Ranger optimizer does not support sparse gradients')

                # All arithmetic is done on an fp32 view/copy of the parameter.
                p_data_fp32 = p.data.float()

                state = self.state[p]  # get state dict for this param

                if len(state) == 0:  # first time this parameter is seen
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)

                    # look ahead weight storage now in state dict
                    state['slow_buffer'] = torch.empty_like(p.data)
                    state['slow_buffer'].copy_(p.data)

                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(
                        p_data_fp32)

                # begin computations
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                # GC on the raw gradient
                if self.gc_loc:
                    grad = centralized_gradient(grad, use_gc=self.use_gc, gc_conv_only=self.gc_conv_only)

                state['step'] += 1

                # compute variance mov avg
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                # compute mean moving avg
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)

                buffered = self.radam_buffer[int(state['step'] % 10)]

                if state['step'] == buffered[0]:
                    # Another parameter already computed the values for this step.
                    N_sma, step_size = buffered[1], buffered[2]
                else:
                    buffered[0] = state['step']
                    beta2_t = beta2 ** state['step']
                    N_sma_max = 2 / (1 - beta2) - 1
                    N_sma = N_sma_max - 2 * \
                        state['step'] * beta2_t / (1 - beta2_t)
                    buffered[1] = N_sma
                    if N_sma > self.N_sma_threshhold:
                        step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (
                            N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
                    else:
                        step_size = 1.0 / (1 - beta1 ** state['step'])
                    buffered[2] = step_size

                # build the update direction
                if N_sma > self.N_sma_threshhold:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    G_grad = exp_avg / denom
                else:
                    # Copy: the in-place weight decay / GC below must not
                    # write into the stored first-moment estimate.
                    G_grad = exp_avg.clone()

                if group['weight_decay'] != 0:
                    G_grad.add_(p_data_fp32, alpha=group['weight_decay'])
                # GC on the final update direction
                if not self.gc_loc:
                    G_grad = centralized_gradient(G_grad, use_gc=self.use_gc, gc_conv_only=self.gc_conv_only)

                p_data_fp32.add_(G_grad, alpha=-step_size * group['lr'])
                p.data.copy_(p_data_fp32)

                # integrated look ahead...
                # we do it at the param level instead of group level
                if state['step'] % group['k'] == 0:
                    # get access to slow param tensor
                    slow_p = state['slow_buffer']
                    # (fast weights - slow weights) * alpha
                    slow_p.add_(p.data - slow_p, alpha=self.alpha)
                    # copy interpolated weights to RAdam param tensor
                    p.data.copy_(slow_p)

        return loss

================================================
FILE: algorithm-GC/algorithm/SGD.py
================================================
import torch
from torch.optim.optimizer import Optimizer, required

from .Centralization import centralized_gradient

class SGD(Optimizer):
    r"""Implements stochastic gradient descent (optionally with momentum)
    and optional gradient centralization (GC).

    Nesterov momentum is based on the formula from
    `On the importance of initialization and momentum in deep learning`__.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float): learning rate
        momentum (float, optional): momentum factor (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        dampening (float, optional): dampening for momentum (default: 0)
        nesterov (bool, optional): enables Nesterov momentum (default: False)
        use_gc (bool, optional): centralize the (weight-decayed) gradient
            before the momentum update (default: False)
        gc_conv_only (bool, optional): forwarded to ``centralized_gradient``
            to restrict GC to tensors with more than 3 dimensions
            (default: False)

    Example:
        >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
        >>> optimizer.zero_grad()
        >>> loss_fn(model(input), target).backward()
        >>> optimizer.step()

    __ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf

    .. note::
        The implementation of SGD with Momentum/Nesterov subtly differs from
        Sutskever et. al. and implementations in some other frameworks.

        Considering the specific case of Momentum, the update can be written as

        .. math::
            \begin{aligned}
                v_{t+1} & = \mu * v_{t} + g_{t+1}, \\
                p_{t+1} & = p_{t} - \text{lr} * v_{t+1},
            \end{aligned}

        where :math:`p`, :math:`g`, :math:`v` and :math:`\mu` denote the 
        parameters, gradient, velocity, and momentum respectively.

        This is in contrast to Sutskever et. al. and
        other frameworks which employ an update of the form

        .. math::
            \begin{aligned}
                v_{t+1} & = \mu * v_{t} + \text{lr} * g_{t+1}, \\
                p_{t+1} & = p_{t} - v_{t+1}.
            \end{aligned}

        The Nesterov version is analogously modified.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False,use_gc=False, gc_conv_only=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov, use_gc=use_gc,gc_conv_only=gc_conv_only)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGD, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)
            # Param groups restored from a state that predates the GC options
            # (e.g. one saved by a plain SGD) lack these keys; fall back to
            # the values this optimizer was constructed with.
            group.setdefault('use_gc', self.defaults.get('use_gc', False))
            group.setdefault('gc_conv_only', self.defaults.get('gc_conv_only', False))

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The closure's return value, or ``None`` when no closure is given.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            use_gc = group['use_gc']
            gc_conv_only = group['gc_conv_only']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad
                if weight_decay != 0:
                    d_p = d_p.add(p, alpha=weight_decay)
                elif use_gc and d_p.dim() > 1:
                    # centralized_gradient works in place; without weight
                    # decay d_p is still p.grad itself, so copy it to keep
                    # the stored gradient unmodified.
                    d_p = d_p.clone()

                # GC operation
                d_p = centralized_gradient(d_p, use_gc=use_gc, gc_conv_only=gc_conv_only)

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.add_(d_p, alpha=-group['lr'])

        return loss


================================================
FILE: algorithm-GC/cifar/main.py
================================================
'''Train CIFAR100 with PyTorch.'''
from __future__ import print_function

import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn


import torch.optim as optim
import torch.nn.functional as F

import torchvision
import torchvision.transforms as transforms


from torch.optim import lr_scheduler
import os
import argparse
from torchvision import datasets, models
from models import *
#from utils import progress_bar
import numpy as np


import sys 
sys.path.append('../')
 
#import optimizers with GC
from algorithm.SGD import SGD
from algorithm.Adam import Adam,AdamW
from algorithm.RAdam import RAdam
from algorithm.Lookahead import Lookahead
from algorithm.Ranger import Ranger
#from algorithm.Adam import Adam_GCC,AdamW,AdamW_GCC
#from algorithm.Adagrad import Adagrad_GCC


parser = argparse.ArgumentParser(description='PyTorch CIFAR100 Training')
parser.add_argument('--lr', default=0.1, type=float, help='learning rate')
parser.add_argument('--resume', '-r', action='store_true', help='resume from checkpoint')
# The remaining options all share the (flag, default, type, help) shape.
for _flag, _default, _type, _help in (
    ('--bs', 128, int, 'batchsize'),
    ('--wd', 0.0005, float, 'weight decay'),
    ('--alg', 'sgd', str, 'algorithm'),
    ('--epochs', 200, int, 'epochs'),
    ('--path', 'logout/result', str, 'path'),
    ('--model', 'r50', str, 'model'),
    ('--gpug', 1, int, 'gpugroup'),
):
    parser.add_argument(_flag, default=_default, type=_type, help=_help)

args = parser.parse_args()

# --gpug values 10..17 restrict the process to GPU 0..7 respectively; any
# other value leaves CUDA_VISIBLE_DEVICES untouched.
if 10 <= args.gpug <= 17:
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpug - 10)

epochs = args.epochs
device = 'cuda' if torch.cuda.is_available() else 'cpu'
best_acc = 0  # best test accuracy
start_epoch = 0  # start from epoch 0 or last checkpoint epoch


# Data: CIFAR-100 loaders. Training uses random crop + horizontal flip;
# both splits share the same normalisation statistics.
print('==> Preparing data..')
_CIFAR100_ROOT = '/home/yonghw/data/cifar100/'
_normalize = transforms.Normalize((0.507, 0.487, 0.441), (0.267, 0.256, 0.276))

transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    _normalize,
])
transform_test = transforms.Compose([
    transforms.ToTensor(),
    _normalize,
])

trainset = torchvision.datasets.CIFAR100(
    root=_CIFAR100_ROOT, train=True, download=True, transform=transform_train)
# drop_last=True: every training batch has exactly args.bs samples.
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=args.bs, shuffle=True, num_workers=4, drop_last=True)

testset = torchvision.datasets.CIFAR100(
    root=_CIFAR100_ROOT, train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=128, shuffle=False, num_workers=4)


# Model
print('==> Building model..')

Num_classes = 100

# Map each --model key to a lazy constructor so that only the requested
# network is instantiated.
_MODEL_BUILDERS = {
    'r18': lambda: ResNet18(Num_classes=Num_classes),
    'r34': lambda: ResNet34(Num_classes=Num_classes),
    'r50': lambda: ResNet50(Num_classes=Num_classes),
    'r101': lambda: ResNet101(Num_classes=Num_classes),
    'v11': lambda: VGG('VGG11',Num_classes=Num_classes),
    'rx29': lambda: ResNeXt29_4x64d(Num_classes=Num_classes),
    'd121': lambda: DenseNet121(Num_classes=Num_classes),
}
# Fail here with a clear message instead of leaving `net` undefined and
# hitting a NameError further down.
if args.model not in _MODEL_BUILDERS:
    raise ValueError("Unknown --model '{}'; expected one of: {}".format(
        args.model, ', '.join(sorted(_MODEL_BUILDERS))))
net = _MODEL_BUILDERS[args.model]()

if device == 'cuda':
    net = net.cuda()
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True


if args.resume:
    # Load checkpoint.
    print('==> Resuming from checkpoint..')
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not os.path.isdir('checkpoint'):
        raise FileNotFoundError('Error: no checkpoint directory found!')
    # map_location lets a checkpoint written on a GPU machine be read on the
    # device selected for this run.
    checkpoint = torch.load('./checkpoint/ckpt.t7', map_location=device)
    net.load_state_dict(checkpoint['net'])
    best_acc = checkpoint['acc']
    start_epoch = checkpoint['epoch']

criterion = nn.CrossEntropyLoss()

#optimizer
WD=args.wd
print('==> choose optimizer..')

# Gradient-centralization variant selected by the suffix of --alg:
# no suffix = GC off, 'GC' = GC with gc_conv_only=False, 'GCC' = gc_conv_only=True.
_GC_SUFFIXES = {
    '': dict(use_gc=False, gc_conv_only=False),
    'GC': dict(use_gc=True, gc_conv_only=False),
    'GCC': dict(use_gc=True, gc_conv_only=True),
}
# Base optimizers. The adaptive methods run at 1% of --lr. The lambdas defer
# construction so only the selected optimizer is built.
_BASE_OPTIMIZERS = {
    'sgd': lambda **gc: SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=WD, **gc),
    'adam': lambda **gc: Adam(net.parameters(), lr=args.lr*0.01, weight_decay=WD, **gc),
    'adamW': lambda **gc: AdamW(net.parameters(), lr=args.lr*0.01, weight_decay=WD, **gc),
    'radam': lambda **gc: RAdam(net.parameters(), lr=args.lr*0.01, weight_decay=WD, **gc),
    'ranger': lambda **gc: Ranger(net.parameters(), lr=args.lr*0.01, weight_decay=WD, **gc),
}
# Bases that are also offered wrapped in Lookahead via an 'L' prefix.
_LOOKAHEAD_BASES = ('sgd', 'adam', 'radam')

# --alg name -> (builder, GC kwargs, wrap in Lookahead?)
_OPTIMIZER_RECIPES = {}
for _suffix, _gc_kwargs in _GC_SUFFIXES.items():
    for _base, _builder in _BASE_OPTIMIZERS.items():
        _OPTIMIZER_RECIPES[_base + _suffix] = (_builder, _gc_kwargs, False)
    for _base in _LOOKAHEAD_BASES:
        _OPTIMIZER_RECIPES['L' + _base + _suffix] = (_BASE_OPTIMIZERS[_base], _gc_kwargs, True)

# Fail here with a clear message instead of leaving `optimizer` undefined.
if args.alg not in _OPTIMIZER_RECIPES:
    raise ValueError("Unknown --alg '{}'; expected one of: {}".format(
        args.alg, ', '.join(sorted(_OPTIMIZER_RECIPES))))

_builder, _gc_kwargs, _wrap_lookahead = _OPTIMIZER_RECIPES[args.alg]
if _wrap_lookahead:
    base_opt = _builder(**_gc_kwargs)
    optimizer = Lookahead(base_opt, k=5, alpha=0.5)
else:
    optimizer = _builder(**_gc_kwargs)
    
    
# Decay the learning rate by 10x every 30% of the schedule: 60 epochs for a
# 200-epoch run and 120 for a 400-epoch run. Other epoch counts get the same
# proportional step instead of leaving the scheduler undefined.
exp_lr_scheduler = lr_scheduler.StepLR(
    optimizer, step_size=max(1, args.epochs * 3 // 10), gamma=0.1)
# Training
def train(epoch,net,optimizer):
    """Run one optimisation pass over ``trainloader``.

    Uses the module-level ``trainloader``, ``device`` and ``criterion``.
    Prints the mean per-batch loss and the accuracy as a fraction, and
    returns the accuracy as a percentage.
    """
    print('\nEpoch: %d' % epoch)
    net.train()
    running_loss = 0
    num_correct = 0
    num_seen = 0
    for batch_idx, (images, labels) in enumerate(trainloader):
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = net(images)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        # Accumulate statistics for the end-of-epoch report.
        running_loss += batch_loss.item()
        predicted = logits.max(1)[1]
        num_seen += labels.size(0)
        num_correct += predicted.eq(labels).sum().item()

    mean_loss = running_loss / (batch_idx + 1)
    print('Training: Loss: {:.4f} | Acc: {:.4f}'.format(mean_loss, num_correct / num_seen))
    return 100. * num_correct / num_seen
    
# Testing
def test(epoch,net):
    """Evaluate ``net`` on ``testloader`` and checkpoint it when it improves.

    Uses the module-level ``testloader``, ``device`` and ``criterion``.
    Prints the mean per-batch loss and the accuracy as a fraction. When the
    accuracy (in percent) beats the module-level ``best_acc``, the weights,
    accuracy and epoch are written to ``./checkpoint/ckpt.t7`` and
    ``best_acc`` is updated.

    Returns:
        The test accuracy as a percentage.
    """
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

    print('Testing:Loss: {:.4f} | Acc: {:.4f}'.format(test_loss/(batch_idx+1),correct/total) )

    # Save checkpoint.
    acc = 100.*correct/total
    if acc > best_acc:
        print('Saving..')
        state = {
            'net': net.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        # exist_ok=True replaces the isdir()/mkdir() pair, which could raise
        # FileExistsError if another run created the directory in between.
        os.makedirs('checkpoint', exist_ok=True)
        torch.save(state, './checkpoint/ckpt.t7')
        best_acc = acc
    return acc


# Epoch driver: one training pass, one scheduler tick, then one evaluation
# pass (which also handles checkpointing) per epoch.
epoch_range = range(start_epoch, start_epoch + epochs)
for epoch in epoch_range:
    train_acc = train(epoch, net, optimizer)
    exp_lr_scheduler.step()
    val_acc = test(epoch, net)


================================================
FILE: algorithm-GC/cifar/models/__init__.py
================================================
from .vgg import *
from .dpn import *
from .lenet import *
from .senet import *
from .pnasnet import *
from .densenet import *
from .googlenet import *
from .shufflenet import *
from .resnet import *
from .resnext import *
from .preact_resnet import *
from .mobilenet import *
from .mobilenetv2 import *


================================================
FILE: algorithm-GC/cifar/models/densenet.py
================================================
'''DenseNet in PyTorch.'''
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


class Bottleneck(nn.Module):
    '''DenseNet bottleneck layer: BN-ReLU-1x1 conv, then BN-ReLU-3x3 conv.

    The 1x1 conv widens to ``4*growth_rate`` channels and the 3x3 conv
    produces ``growth_rate`` channels, which are concatenated in front of the
    input along dim 1.
    '''

    def __init__(self, in_planes, growth_rate):
        super().__init__()
        inner_planes = 4 * growth_rate
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, inner_planes, kernel_size=1, bias=False)
        self.bn2 = nn.BatchNorm2d(inner_planes)
        self.conv2 = nn.Conv2d(inner_planes, growth_rate, kernel_size=3, padding=1, bias=False)

    def forward(self, x):
        squeezed = self.conv1(F.relu(self.bn1(x)))
        new_features = self.conv2(F.relu(self.bn2(squeezed)))
        return torch.cat([new_features, x], 1)


class Transition(nn.Module):
    '''Transition layer: BN-ReLU-1x1 conv followed by 2x2 average pooling.'''

    def __init__(self, in_planes, out_planes):
        super().__init__()
        self.bn = nn.BatchNorm2d(in_planes)
        self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=False)

    def forward(self, x):
        projected = self.conv(F.relu(self.bn(x)))
        return F.avg_pool2d(projected, 2)


class DenseNet(nn.Module):
    '''DenseNet: a 3x3 stem conv, four dense stages and a linear classifier.

    Args:
        block: layer class, called as ``block(in_planes, growth_rate)``.
        nblocks: number of layers in each of the four dense stages.
        growth_rate: channels added by every layer.
        reduction: channel scaling applied by each of the three transitions
            (the result is rounded down).
        num_classes: size of the final linear output.
    '''

    def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10):
        super().__init__()
        self.growth_rate = growth_rate

        width = 2 * growth_rate
        self.conv1 = nn.Conv2d(3, width, kernel_size=3, padding=1, bias=False)

        # Stages 1-3 are each followed by a channel-reducing transition.
        self.dense1, self.trans1, width = self._dense_stage(block, width, nblocks[0], reduction)
        self.dense2, self.trans2, width = self._dense_stage(block, width, nblocks[1], reduction)
        self.dense3, self.trans3, width = self._dense_stage(block, width, nblocks[2], reduction)

        # The last stage feeds BN + classifier directly, with no transition.
        self.dense4 = self._make_dense_layers(block, width, nblocks[3])
        width += nblocks[3] * growth_rate

        self.bn = nn.BatchNorm2d(width)
        self.linear = nn.Linear(width, num_classes)

    def _dense_stage(self, block, in_planes, nblock, reduction):
        '''Build one dense stack and its transition; return (dense, transition, out_planes).'''
        dense = self._make_dense_layers(block, in_planes, nblock)
        grown = in_planes + nblock * self.growth_rate
        reduced = int(math.floor(grown * reduction))
        return dense, Transition(grown, reduced), reduced

    def _make_dense_layers(self, block, in_planes, nblock):
        # Layer i sees the stage input plus the growth of the i layers before it.
        stack = [block(in_planes + i * self.growth_rate, self.growth_rate)
                 for i in range(nblock)]
        return nn.Sequential(*stack)

    def forward(self, x):
        feats = self.conv1(x)
        stages = ((self.dense1, self.trans1),
                  (self.dense2, self.trans2),
                  (self.dense3, self.trans3))
        for dense, trans in stages:
            feats = trans(dense(feats))
        feats = self.dense4(feats)
        pooled = F.avg_pool2d(F.relu(self.bn(feats)), 4)
        flat = pooled.view(pooled.size(0), -1)
        return self.linear(flat)

def DenseNet121():
    '''DenseNet with dense stages of 6/12/24/16 bottlenecks and growth rate 32.'''
    return DenseNet(block=Bottleneck, nblocks=[6, 12, 24, 16], growth_rate=32)

def DenseNet169():
    '''DenseNet with dense stages of 6/12/32/32 bottlenecks and growth rate 32.'''
    return DenseNet(block=Bottleneck, nblocks=[6, 12, 32, 32], growth_rate=32)

def DenseNet201():
    '''DenseNet with dense stages of 6/12/48/32 bottlenecks and growth rate 32.'''
    return DenseNet(block=Bottleneck, nblocks=[6, 12, 48, 32], growth_rate=32)

def DenseNet161():
    '''DenseNet with dense stages of 6/12/36/24 bottlenecks and growth rate 48.'''
    return DenseNet(block=Bottleneck, nblocks=[6, 12, 36, 24], growth_rate=48)

def densenet_cifar():
    '''DenseNet with dense stages of 6/12/24/16 bottlenecks and growth rate 12.'''
    return DenseNet(block=Bottleneck, nblocks=[6, 12, 24, 16], growth_rate=12)

def test():
    '''Smoke test: push one random 1x3x32x32 input through densenet_cifar and print the output.'''
    model = densenet_cifar()
    sample = torch.randn(1, 3, 32, 32)
    print(model(sample))

# test()


================================================
FILE: algorithm-GC/cifar/models/dpn.py
================================================
'''Dual Path Networks in PyTorch.'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class Bottleneck(nn.Module):
    '''Dual-path bottleneck: 1x1 conv, grouped 3x3 conv (32 groups), 1x1 conv.

    The first ``out_planes`` channels of the block output are summed with the
    first ``out_planes`` channels of the shortcut; the remaining channels of
    both are concatenated after that sum. When ``first_layer`` is true the
    shortcut is a strided 1x1 conv + BN projection, otherwise it is empty.
    '''

    def __init__(self, last_planes, in_planes, out_planes, dense_depth, stride, first_layer):
        super().__init__()
        self.out_planes = out_planes
        self.dense_depth = dense_depth
        total_out = out_planes + dense_depth

        self.conv1 = nn.Conv2d(last_planes, in_planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv2 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=32, bias=False)
        self.bn2 = nn.BatchNorm2d(in_planes)
        self.conv3 = nn.Conv2d(in_planes, total_out, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(total_out)

        self.shortcut = nn.Sequential()
        if first_layer:
            self.shortcut = nn.Sequential(
                nn.Conv2d(last_planes, total_out, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(total_out)
            )

    def forward(self, x):
        branch = F.relu(self.bn1(self.conv1(x)))
        branch = F.relu(self.bn2(self.conv2(branch)))
        branch = self.bn3(self.conv3(branch))
        skip = self.shortcut(x)

        split = self.out_planes
        summed = skip[:, :split, :, :] + branch[:, :split, :, :]
        skip_extra = skip[:, split:, :, :]
        branch_extra = branch[:, split:, :, :]
        return F.relu(torch.cat([summed, skip_extra, branch_extra], 1))


class DPN(nn.Module):
    '''Dual Path Network: a 3x3 stem, four bottleneck stages and a linear classifier.

    Args:
        cfg: dict with the keys ``'in_planes'``, ``'out_planes'``,
            ``'num_blocks'`` and ``'dense_depth'``, each indexable with one
            entry per stage (four stages).
        num_classes: size of the final linear output. Defaults to 10, the
            value that used to be hard-coded, so existing ``DPN(cfg)`` calls
            are unaffected; it matches the ``num_classes`` argument of the
            other models in this package.
    '''

    def __init__(self, cfg, num_classes=10):
        super(DPN, self).__init__()
        in_planes, out_planes = cfg['in_planes'], cfg['out_planes']
        num_blocks, dense_depth = cfg['num_blocks'], cfg['dense_depth']

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        # Channel count of the tensor entering the next block; updated by _make_layer.
        self.last_planes = 64
        self.layer1 = self._make_layer(in_planes[0], out_planes[0], num_blocks[0], dense_depth[0], stride=1)
        self.layer2 = self._make_layer(in_planes[1], out_planes[1], num_blocks[1], dense_depth[1], stride=2)
        self.layer3 = self._make_layer(in_planes[2], out_planes[2], num_blocks[2], dense_depth[2], stride=2)
        self.layer4 = self._make_layer(in_planes[3], out_planes[3], num_blocks[3], dense_depth[3], stride=2)
        # The last stage ends with out_planes + (num_blocks + 1) * dense_depth channels.
        self.linear = nn.Linear(out_planes[3]+(num_blocks[3]+1)*dense_depth[3], num_classes)

    def _make_layer(self, in_planes, out_planes, num_blocks, dense_depth, stride):
        '''Stack the bottlenecks of one stage; only the first one uses ``stride``.'''
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for i,stride in enumerate(strides):
            # Only block 0 gets the projection shortcut (first_layer=True).
            layers.append(Bottleneck(self.last_planes, in_planes, out_planes, dense_depth, stride, i==0))
            # Every block appends dense_depth channels to the concatenated part.
            self.last_planes = out_planes + (i+2) * dense_depth
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def DPN26():
    '''Build a DPN with two bottlenecks in each of the four stages.'''
    return DPN({
        'in_planes': (96,192,384,768),
        'out_planes': (256,512,1024,2048),
        'num_blocks': (2,2,2,2),
        'dense_depth': (16,32,24,128),
    })

def DPN92():
    '''Build a DPN with 3/4/20/3 bottlenecks in its four stages.'''
    return DPN({
        'in_planes': (96,192,384,768),
        'out_planes': (256,512,1024,2048),
        'num_blocks': (3,4,20,3),
        'dense_depth': (16,32,24,128),
    })


def test():
    '''Smoke test: push one random 1x3x32x32 input through DPN92 and print the output.'''
    model = DPN92()
    sample = torch.randn(1, 3, 32, 32)
    print(model(sample))

# test()


================================================
FILE: algorithm-GC/cifar/models/googlenet.py
================================================
'''GoogLeNet with PyTorch.'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class Inception(nn.Module):
    '''Inception module with four parallel branches concatenated along dim 1.

    Branches: a 1x1 conv (``n1x1``); a 1x1 reduction followed by a 3x3 conv
    (``n3x3red`` -> ``n3x3``); a 1x1 reduction followed by two stacked 3x3
    convs (``n5x5red`` -> ``n5x5`` -> ``n5x5``); and a 3x3 max-pool followed
    by a 1x1 conv (``pool_planes``). Every conv is followed by BatchNorm and
    an in-place ReLU.
    '''

    def __init__(self, in_planes, n1x1, n3x3red, n3x3, n5x5red, n5x5, pool_planes):
        super().__init__()
        unit = self._conv_bn_relu

        # 1x1 conv branch
        self.b1 = nn.Sequential(*unit(in_planes, n1x1, kernel_size=1))

        # 1x1 conv -> 3x3 conv branch
        self.b2 = nn.Sequential(
            *unit(in_planes, n3x3red, kernel_size=1),
            *unit(n3x3red, n3x3, kernel_size=3, padding=1),
        )

        # 1x1 conv -> two 3x3 convs (stands in for the 5x5 conv branch)
        self.b3 = nn.Sequential(
            *unit(in_planes, n5x5red, kernel_size=1),
            *unit(n5x5red, n5x5, kernel_size=3, padding=1),
            *unit(n5x5, n5x5, kernel_size=3, padding=1),
        )

        # 3x3 pool -> 1x1 conv branch
        self.b4 = nn.Sequential(
            nn.MaxPool2d(3, stride=1, padding=1),
            *unit(in_planes, pool_planes, kernel_size=1),
        )

    @staticmethod
    def _conv_bn_relu(in_ch, out_ch, **conv_kwargs):
        '''Return the [Conv2d, BatchNorm2d, ReLU(inplace)] triple shared by all branches.'''
        return [
            nn.Conv2d(in_ch, out_ch, **conv_kwargs),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(True),
        ]

    def forward(self, x):
        branch_outputs = [branch(x) for branch in (self.b1, self.b2, self.b3, self.b4)]
        return torch.cat(branch_outputs, 1)


class GoogLeNet(nn.Module):
    '''GoogLeNet: a 3x3 conv stem, nine Inception modules and a linear classifier.

    Args:
        num_classes: size of the final linear output. Defaults to 10, the
            value that used to be hard-coded, so existing ``GoogLeNet()``
            calls are unaffected; it matches the ``num_classes`` argument of
            the other models in this package.
    '''

    def __init__(self, num_classes=10):
        super(GoogLeNet, self).__init__()
        self.pre_layers = nn.Sequential(
            nn.Conv2d(3, 192, kernel_size=3, padding=1),
            nn.BatchNorm2d(192),
            nn.ReLU(True),
        )

        # Each module's input width is the sum of the previous module's four
        # branch widths (e.g. 64 + 128 + 32 + 32 = 256 after a3).
        self.a3 = Inception(192,  64,  96, 128, 16, 32, 32)
        self.b3 = Inception(256, 128, 128, 192, 32, 96, 64)

        # Shared (parameter-free) down-sampling, applied after b3 and after e4.
        self.maxpool = nn.MaxPool2d(3, stride=2, padding=1)

        self.a4 = Inception(480, 192,  96, 208, 16,  48,  64)
        self.b4 = Inception(512, 160, 112, 224, 24,  64,  64)
        self.c4 = Inception(512, 128, 128, 256, 24,  64,  64)
        self.d4 = Inception(512, 112, 144, 288, 32,  64,  64)
        self.e4 = Inception(528, 256, 160, 320, 32, 128, 128)

        self.a5 = Inception(832, 256, 160, 320, 32, 128, 128)
        self.b5 = Inception(832, 384, 192, 384, 48, 128, 128)

        self.avgpool = nn.AvgPool2d(8, stride=1)
        self.linear = nn.Linear(1024, num_classes)

    def forward(self, x):
        out = self.pre_layers(x)
        out = self.a3(out)
        out = self.b3(out)
        out = self.maxpool(out)
        out = self.a4(out)
        out = self.b4(out)
        out = self.c4(out)
        out = self.d4(out)
        out = self.e4(out)
        out = self.maxpool(out)
        out = self.a5(out)
        out = self.b5(out)
        out = self.avgpool(out)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def test():
    '''Smoke test: push one random 1x3x32x32 input through GoogLeNet and print the output size.'''
    model = GoogLeNet()
    sample = torch.randn(1, 3, 32, 32)
    print(model(sample).size())

# test()


================================================
FILE: algorithm-GC/cifar/models/lenet.py
================================================
'''LeNet in PyTorch.'''
import torch.nn as nn
import torch.nn.functional as F

class LeNet(nn.Module):
    '''LeNet: two 5x5 convs with 2x2 max-pooling, then three linear layers.

    ``fc1`` expects 16*5*5 flattened features, which is what a 3x32x32 input
    produces after the two conv/pool stages.

    Args:
        num_classes: size of the final linear output. Defaults to 10, the
            value that used to be hard-coded, so existing ``LeNet()`` calls
            are unaffected; it matches the ``num_classes`` argument of the
            other models in this package.
    '''

    def __init__(self, num_classes=10):
        super(LeNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1   = nn.Linear(16*5*5, 120)
        self.fc2   = nn.Linear(120, 84)
        self.fc3   = nn.Linear(84, num_classes)

    def forward(self, x):
        out = F.relu(self.conv1(x))
        out = F.max_pool2d(out, 2)
        out = F.relu(self.conv2(out))
        out = F.max_pool2d(out, 2)
        # Flatten everything except the batch dimension for the linear layers.
        out = out.view(out.size(0), -1)
        out = F.relu(self.fc1(out))
        out = F.relu(self.fc2(out))
        out = self.fc3(out)
        return out


================================================
FILE: algorithm-GC/cifar/models/mobilenet.py
================================================
'''MobileNet in PyTorch.

See the paper "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications"
for more details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class Block(nn.Module):
    '''Depthwise 3x3 conv (one group per input channel) followed by a pointwise 1x1 conv.

    Each conv is followed by BatchNorm and ReLU; ``stride`` applies to the
    depthwise conv only.
    '''

    def __init__(self, in_planes, out_planes, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(
            in_planes, in_planes, kernel_size=3, stride=stride,
            padding=1, groups=in_planes, bias=False,
        )
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv2 = nn.Conv2d(
            in_planes, out_planes, kernel_size=1, stride=1,
            padding=0, bias=False,
        )
        self.bn2 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        depthwise = F.relu(self.bn1(self.conv1(x)))
        return F.relu(self.bn2(self.conv2(depthwise)))


class MobileNet(nn.Module):
    '''MobileNet: a 3x3 stem conv, a stack of depthwise-separable blocks and a linear classifier.'''

    # One entry per block: an int is the output width with stride 1,
    # a tuple is (output width, stride), e.g. (128,2).
    cfg = [64, (128,2), 128, (256,2), 256, (512,2), 512, 512, 512, 512, 512, (1024,2), 1024]

    def __init__(self, num_classes=10):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.layers = self._make_layers(in_planes=32)
        self.linear = nn.Linear(1024, num_classes)

    def _make_layers(self, in_planes):
        '''Build the Block stack described by ``cfg``, chaining channel widths.'''
        blocks = []
        for entry in self.cfg:
            if isinstance(entry, int):
                out_planes, stride = entry, 1
            else:
                out_planes, stride = entry[0], entry[1]
            blocks.append(Block(in_planes, out_planes, stride))
            in_planes = out_planes
        return nn.Sequential(*blocks)

    def forward(self, x):
        feats = F.relu(self.bn1(self.conv1(x)))
        feats = self.layers(feats)
        pooled = F.avg_pool2d(feats, 2)
        flat = pooled.view(pooled.size(0), -1)
        return self.linear(flat)


def test():
    '''Smoke test: push one random 1x3x32x32 input through MobileNet and print the output size.'''
    model = MobileNet()
    sample = torch.randn(1, 3, 32, 32)
    print(model(sample).size())

# test()


================================================
FILE: algorithm-GC/cifar/models/mobilenetv2.py
================================================
'''MobileNetV2 in PyTorch.

See the paper "Inverted Residuals and Linear Bottlenecks:
Mobile Networks for Classification, Detection and Segmentation" for more details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class Block(nn.Module):
    '''Inverted-residual block: 1x1 expand, depthwise 3x3, 1x1 project.

    The expanded width is ``expansion * in_planes``. A shortcut is added to
    the output only when ``stride == 1``: an empty ``nn.Sequential`` when the
    channel counts match, a 1x1 conv + BN projection otherwise.
    '''

    def __init__(self, in_planes, out_planes, expansion, stride):
        super().__init__()
        self.stride = stride
        hidden = expansion * in_planes

        self.conv1 = nn.Conv2d(in_planes, hidden, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn1 = nn.BatchNorm2d(hidden)
        self.conv2 = nn.Conv2d(hidden, hidden, kernel_size=3, stride=stride, padding=1, groups=hidden, bias=False)
        self.bn2 = nn.BatchNorm2d(hidden)
        self.conv3 = nn.Conv2d(hidden, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        self.shortcut = nn.Sequential()
        needs_projection = stride == 1 and in_planes != out_planes
        if needs_projection:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False),
                nn.BatchNorm2d(out_planes),
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        # No residual connection when the block down-samples.
        if self.stride == 1:
            out = out + self.shortcut(x)
        return out


class MobileNetV2(nn.Module):
    '''MobileNetV2: a 3x3 stem, inverted-residual stages, a 1x1 head conv and a linear classifier.'''

    # (expansion, out_planes, num_blocks, stride)
    cfg = [(1,  16, 1, 1),
           (6,  24, 2, 1),  # NOTE: change stride 2 -> 1 for CIFAR10
           (6,  32, 3, 2),
           (6,  64, 4, 2),
           (6,  96, 3, 1),
           (6, 160, 3, 2),
           (6, 320, 1, 1)]

    def __init__(self, num_classes=10):
        super().__init__()
        # NOTE: change conv1 stride 2 -> 1 for CIFAR10
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.layers = self._make_layers(in_planes=32)
        self.conv2 = nn.Conv2d(320, 1280, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(1280)
        self.linear = nn.Linear(1280, num_classes)

    def _make_layers(self, in_planes):
        '''Build every stage in ``cfg``; only the first block of a stage uses its stride.'''
        blocks = []
        for expansion, out_planes, num_blocks, first_stride in self.cfg:
            stage_strides = [first_stride] + [1] * (num_blocks - 1)
            for block_stride in stage_strides:
                blocks.append(Block(in_planes, out_planes, expansion, block_stride))
                in_planes = out_planes
        return nn.Sequential(*blocks)

    def forward(self, x):
        feats = F.relu(self.bn1(self.conv1(x)))
        feats = self.layers(feats)
        feats = F.relu(self.bn2(self.conv2(feats)))
        # NOTE: change pooling kernel_size 7 -> 4 for CIFAR10
        pooled = F.avg_pool2d(feats, 4)
        flat = pooled.view(pooled.size(0), -1)
        return self.linear(flat)


def test():
    '''Smoke test: push a random 2x3x32x32 batch through MobileNetV2 and print the output size.'''
    model = MobileNetV2()
    batch = torch.randn(2, 3, 32, 32)
    print(model(batch).size())

# test()


================================================
FILE: algorithm-GC/cifar/models/pnasnet.py
================================================
'''PNASNet in PyTorch.

Paper: Progressive Neural Architecture Search
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class SepConv(nn.Module):
    '''Grouped convolution (``groups=in_planes``) followed by BatchNorm.

    Padding is ``(kernel_size-1)//2``; no activation is applied.
    '''

    def __init__(self, in_planes, out_planes, kernel_size, stride):
        super().__init__()
        same_padding = (kernel_size - 1) // 2
        self.conv1 = nn.Conv2d(
            in_planes,
            out_planes,
            kernel_size,
            stride,
            padding=same_padding,
            bias=False,
            groups=in_planes,
        )
        self.bn1 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        convolved = self.conv1(x)
        return self.bn1(convolved)


class CellA(nn.Module):
    '''Cell with two summed branches: a 7x7 SepConv and a 3x3 max-pool.

    When ``stride == 2`` the pooled branch additionally goes through a 1x1
    conv + BN that maps it to ``out_planes`` channels. The sum is passed
    through ReLU.
    '''

    def __init__(self, in_planes, out_planes, stride=1):
        super().__init__()
        self.stride = stride
        self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride)
        if stride == 2:
            self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
            self.bn1 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        conv_branch = self.sep_conv1(x)
        pool_branch = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1)
        if self.stride == 2:
            pool_branch = self.bn1(self.conv1(pool_branch))
        return F.relu(conv_branch + pool_branch)

class CellB(nn.Module):
    '''Cell with two branch pairs that are summed, concatenated and reduced.

    Left pair: 7x7 SepConv + 3x3 SepConv. Right pair: 3x3 max-pool (with a
    1x1 conv + BN when ``stride == 2``) + 5x5 SepConv. Each pair is summed and
    passed through ReLU; the two results are concatenated along dim 1 and
    reduced back to ``out_planes`` channels by a 1x1 conv + BN + ReLU.
    '''

    def __init__(self, in_planes, out_planes, stride=1):
        super().__init__()
        self.stride = stride
        # Left branch
        self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride)
        self.sep_conv2 = SepConv(in_planes, out_planes, kernel_size=3, stride=stride)
        # Right branch
        self.sep_conv3 = SepConv(in_planes, out_planes, kernel_size=5, stride=stride)
        if stride == 2:
            self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
            self.bn1 = nn.BatchNorm2d(out_planes)
        # Reduce channels
        self.conv2 = nn.Conv2d(2 * out_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        # Left branch
        left_a = self.sep_conv1(x)
        left_b = self.sep_conv2(x)
        # Right branch
        pooled = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1)
        if self.stride == 2:
            pooled = self.bn1(self.conv1(pooled))
        right_conv = self.sep_conv3(x)
        # Concat & reduce channels
        left = F.relu(left_a + left_b)
        right = F.relu(pooled + right_conv)
        merged = torch.cat([left, right], 1)
        return F.relu(self.bn2(self.conv2(merged)))

class PNASNet(nn.Module):
    '''PNASNet: a 3x3 stem, three cell stacks separated by two stride-2 cells, and a linear classifier.

    Args:
        cell_type: cell factory, called as
            ``cell_type(in_planes, planes, stride=...)``.
        num_cells: number of cells in each of the three stacks. This argument
            used to be ignored (every stack was built with a literal 6); it
            is now honoured. The factories in this file pass 6, so their
            networks are unchanged.
        num_planes: stem width; the stacks use 1x, 2x and 4x this width.
        num_classes: size of the final linear output. Defaults to 10, the
            value that used to be hard-coded.
    '''

    def __init__(self, cell_type, num_cells, num_planes, num_classes=10):
        super(PNASNet, self).__init__()
        # Channel count of the tensor entering the next cell; updated by
        # _make_layer and _downsample as layers are created.
        self.in_planes = num_planes
        self.cell_type = cell_type

        self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(num_planes)

        self.layer1 = self._make_layer(num_planes, num_cells=num_cells)
        self.layer2 = self._downsample(num_planes*2)
        self.layer3 = self._make_layer(num_planes*2, num_cells=num_cells)
        self.layer4 = self._downsample(num_planes*4)
        self.layer5 = self._make_layer(num_planes*4, num_cells=num_cells)

        self.linear = nn.Linear(num_planes*4, num_classes)

    def _make_layer(self, planes, num_cells):
        '''Stack ``num_cells`` stride-1 cells that output ``planes`` channels.'''
        layers = []
        for _ in range(num_cells):
            layers.append(self.cell_type(self.in_planes, planes, stride=1))
            self.in_planes = planes
        return nn.Sequential(*layers)

    def _downsample(self, planes):
        '''Create a single stride-2 cell that outputs ``planes`` channels.'''
        layer = self.cell_type(self.in_planes, planes, stride=2)
        self.in_planes = planes
        return layer

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.layer5(out)
        out = F.avg_pool2d(out, 8)
        out = self.linear(out.view(out.size(0), -1))
        return out


def PNASNetA():
    '''Build a PNASNet from CellA with num_cells=6 and a stem width of 44.'''
    return PNASNet(cell_type=CellA, num_cells=6, num_planes=44)

def PNASNetB():
    '''Build a PNASNet from CellB with num_cells=6 and a stem width of 32.'''
    return PNASNet(cell_type=CellB, num_cells=6, num_planes=32)


def test():
    '''Smoke test: push one random 1x3x32x32 input through PNASNetB and print the output.'''
    model = PNASNetB()
    sample = torch.randn(1, 3, 32, 32)
    print(model(sample))

# test()


================================================
FILE: algorithm-GC/cifar/models/preact_resnet.py
================================================
'''Pre-activation ResNet in PyTorch.

Reference:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
    Identity Mappings in Deep Residual Networks. arXiv:1603.05027
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class PreActBlock(nn.Module):
    '''Two-conv residual block in pre-activation (BN-ReLU-conv) order.

    A 1x1 conv ``shortcut`` attribute is created only when the stride or the
    channel count changes; otherwise the block input itself is added back.
    '''
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)

        shape_changes = stride != 1 or in_planes != out_planes
        if shape_changes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
            )

    def forward(self, x):
        activated = F.relu(self.bn1(x))
        # The projection consumes the pre-activated tensor; the identity
        # path uses the untouched input.
        if hasattr(self, 'shortcut'):
            residual = self.shortcut(activated)
        else:
            residual = x
        out = self.conv1(activated)
        out = self.conv2(F.relu(self.bn2(out)))
        out += residual
        return out


class PreActBottleneck(nn.Module):
    '''Three-conv (1x1, 3x3, 1x1) bottleneck in pre-activation order.

    The output has ``expansion * planes`` channels. A 1x1 conv ``shortcut``
    attribute is created only when the stride or the channel count changes;
    otherwise the block input itself is added back.
    '''
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)

        shape_changes = stride != 1 or in_planes != out_planes
        if shape_changes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
            )

    def forward(self, x):
        activated = F.relu(self.bn1(x))
        if hasattr(self, 'shortcut'):
            residual = self.shortcut(activated)
        else:
            residual = x
        out = self.conv1(activated)
        out = self.conv2(F.relu(self.bn2(out)))
        out = self.conv3(F.relu(self.bn3(out)))
        out += residual
        return out


class PreActResNet(nn.Module):
    '''Pre-activation ResNet: a 3x3 stem conv, four residual stages and a linear classifier.

    Args:
        block: block class exposing an ``expansion`` attribute, called as
            ``block(in_planes, planes, stride)``.
        num_blocks: number of blocks in each of the four stages.
        num_classes: size of the final linear output.
    '''

    def __init__(self, block, num_blocks, num_classes=10):
        super().__init__()
        # Channel count entering the next block; updated by _make_layer.
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        '''Build one stage; only its first block uses ``stride``, the rest use 1.'''
        per_block_strides = [stride] + [1] * (num_blocks - 1)
        stage = []
        for block_stride in per_block_strides:
            stage.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*stage)

    def forward(self, x):
        feats = self.conv1(x)
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            feats = stage(feats)
        pooled = F.avg_pool2d(feats, 4)
        flat = pooled.view(pooled.size(0), -1)
        return self.linear(flat)


def PreActResNet18():
    '''PreActResNet built from PreActBlock with 2/2/2/2 blocks per stage.'''
    return PreActResNet(block=PreActBlock, num_blocks=[2, 2, 2, 2])

def PreActResNet34():
    '''PreActResNet built from PreActBlock with 3/4/6/3 blocks per stage.'''
    return PreActResNet(block=PreActBlock, num_blocks=[3, 4, 6, 3])

def PreActResNet50():
    '''PreActResNet built from PreActBottleneck with 3/4/6/3 blocks per stage.'''
    return PreActResNet(block=PreActBottleneck, num_blocks=[3, 4, 6, 3])

def PreActResNet101():
    '''PreActResNet built from PreActBottleneck with 3/4/23/3 blocks per stage.'''
    return PreActResNet(block=PreActBottleneck, num_blocks=[3, 4, 23, 3])

def PreActResNet152():
    '''PreActResNet built from PreActBottleneck with 3/8/36/3 blocks per stage.'''
    return PreActResNet(block=PreActBottleneck, num_blocks=[3, 8, 36, 3])


def test():
    '''Smoke test: push one random 1x3x32x32 input through PreActResNet18 and print the output size.'''
    model = PreActResNet18()
    sample = torch.randn(1, 3, 32, 32)
    print(model(sample).size())

# test()


================================================
FILE: algorithm-GC/cifar/models/resnet.py
================================================
'''ResNet in PyTorch.

For Pre-activation ResNet, see 'preact_resnet.py'.

Reference:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
    Deep Residual Learning for Image Recognition. arXiv:1512.03385
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicBlock(nn.Module):
    '''Residual block of two 3x3 conv + BN layers with a ReLU after the addition.

    The shortcut is an empty ``nn.Sequential`` unless the stride or the
    channel count changes, in which case it is a strided 1x1 conv + BN.
    '''
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.shortcut = nn.Sequential()
        shape_changes = stride != 1 or in_planes != out_planes
        if shape_changes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes)
            )

    def forward(self, x):
        residual = F.relu(self.bn1(self.conv1(x)))
        residual = self.bn2(self.conv2(residual))
        residual += self.shortcut(x)
        return F.relu(residual)


class Bottleneck(nn.Module):
    '''Residual bottleneck (1x1, 3x3, 1x1 conv + BN) with a ReLU after the addition.

    The output has ``expansion * planes`` channels. The shortcut is an empty
    ``nn.Sequential`` unless the stride or the channel count changes, in
    which case it is a strided 1x1 conv + BN.
    '''
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        self.shortcut = nn.Sequential()
        shape_changes = stride != 1 or in_planes != out_planes
        if shape_changes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes)
            )

    def forward(self, x):
        residual = F.relu(self.bn1(self.conv1(x)))
        residual = F.relu(self.bn2(self.conv2(residual)))
        residual = self.bn3(self.conv3(residual))
        residual += self.shortcut(x)
        return F.relu(residual)


class ResNet(nn.Module):
    '''ResNet: a 3x3 stem conv + BN + ReLU, four residual stages and a linear classifier.

    Args:
        block: block class exposing an ``expansion`` attribute, called as
            ``block(in_planes, planes, stride)``.
        num_blocks: number of blocks in each of the four stages.
        num_classes: size of the final linear output.
    '''

    def __init__(self, block, num_blocks, num_classes=10):
        super().__init__()
        # Channel count entering the next block; updated by _make_layer.
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        '''Build one stage; only its first block uses ``stride``, the rest use 1.'''
        per_block_strides = [stride] + [1] * (num_blocks - 1)
        stage = []
        for block_stride in per_block_strides:
            stage.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*stage)

    def forward(self, x):
        feats = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            feats = stage(feats)
        pooled = F.avg_pool2d(feats, 4)
        flat = pooled.view(pooled.size(0), -1)
        return self.linear(flat)


def ResNet18(Num_classes=10):
    """Build a ResNet from BasicBlock with stage depths [2, 2, 2, 2]."""
    stage_depths = [2, 2, 2, 2]
    return ResNet(BasicBlock, stage_depths, num_classes=Num_classes)

def ResNet34(Num_classes=10):
    """Build a ResNet from BasicBlock with stage depths [3, 4, 6, 3]."""
    stage_depths = [3, 4, 6, 3]
    return ResNet(BasicBlock, stage_depths, num_classes=Num_classes)

def ResNet50(Num_classes=10):
    """Build a ResNet from Bottleneck with stage depths [3, 4, 6, 3]."""
    stage_depths = [3, 4, 6, 3]
    return ResNet(Bottleneck, stage_depths, num_classes=Num_classes)

def ResNet101(Num_classes=10):
    """Build a ResNet from Bottleneck with stage depths [3, 4, 23, 3]."""
    stage_depths = [3, 4, 23, 3]
    return ResNet(Bottleneck, stage_depths, num_classes=Num_classes)

def ResNet152(Num_classes=10):
    """Build a ResNet from Bottleneck with stage depths [3, 8, 36, 3]."""
    stage_depths = [3, 8, 36, 3]
    return ResNet(Bottleneck, stage_depths, num_classes=Num_classes)


def test():
    """Smoke test: push one random 3x32x32 image through ResNet18 and print the output size."""
    model = ResNet18()
    dummy_batch = torch.randn(1, 3, 32, 32)
    logits = model(dummy_batch)
    print(logits.size())

# test()


================================================
FILE: algorithm-GC/cifar/models/resnext.py
================================================
'''ResNeXt in PyTorch.

See the paper "Aggregated Residual Transformations for Deep Neural Networks" for more details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class Block(nn.Module):
    '''Residual block whose 3x3 convolution is grouped.

    The block is 1x1 conv -> grouped 3x3 conv -> 1x1 conv, each followed by
    batch norm. The internal width is ``cardinality * bottleneck_width`` and
    the output has ``expansion`` times that many channels.
    '''
    expansion = 2

    def __init__(self, in_planes, cardinality=32, bottleneck_width=4, stride=1):
        super(Block, self).__init__()
        width = cardinality * bottleneck_width
        out_planes = self.expansion * width

        self.conv1 = nn.Conv2d(in_planes, width, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(width)
        # Only the 3x3 convolution is grouped and carries the stride.
        self.conv2 = nn.Conv2d(
            width, width, kernel_size=3, stride=stride, padding=1,
            groups=cardinality, bias=False,
        )
        self.bn2 = nn.BatchNorm2d(width)
        self.conv3 = nn.Conv2d(width, out_planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        # Project the input with a 1x1 conv + BN when the spatial size or the
        # channel count changes; otherwise the shortcut is an empty Sequential.
        if stride != 1 or in_planes != out_planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        '''Return ReLU(branch(x) + shortcut(x)).'''
        branch = F.relu(self.bn1(self.conv1(x)))
        branch = F.relu(self.bn2(self.conv2(branch)))
        branch = self.bn3(self.conv3(branch))
        branch += self.shortcut(x)
        return F.relu(branch)


class ResNeXt(nn.Module):
    '''Three-stage ResNeXt built from ``Block`` with a linear classifier.

    Args:
        num_blocks: indexable giving the block count of each of the three stages.
        cardinality: number of groups in each block's 3x3 convolution.
        bottleneck_width: per-group width of the first stage; it is doubled
            after every stage.
        num_classes: number of outputs of the final linear layer.
    '''

    def __init__(self, num_blocks, cardinality, bottleneck_width, num_classes=10):
        super(ResNeXt, self).__init__()
        self.cardinality = cardinality
        self.bottleneck_width = bottleneck_width
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)

        first_depth, second_depth, third_depth = num_blocks[0], num_blocks[1], num_blocks[2]
        self.layer1 = self._make_layer(first_depth, 1)
        self.layer2 = self._make_layer(second_depth, 2)
        self.layer3 = self._make_layer(third_depth, 2)

        # self.bottleneck_width has been doubled by each _make_layer call, so
        # the classifier width is derived from the constructor argument.
        classifier_in = cardinality * bottleneck_width * 8
        self.linear = nn.Linear(classifier_in, num_classes)

    def _make_layer(self, num_blocks, stride):
        '''Build one stage; the first block uses ``stride``, the rest use stride 1.'''
        stage = []
        for block_stride in [stride] + [1] * (num_blocks - 1):
            stage.append(Block(self.in_planes, self.cardinality, self.bottleneck_width, block_stride))
            self.in_planes = Block.expansion * self.cardinality * self.bottleneck_width
        # Double the bottleneck width for the next stage.
        self.bottleneck_width *= 2
        return nn.Sequential(*stage)

    def forward(self, x):
        '''Return the classifier output for input batch ``x``.'''
        out = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3):
            out = stage(out)
        # 8x8 average pooling, then flatten everything but the batch dimension.
        pooled = F.avg_pool2d(out, 8)
        flat = pooled.view(pooled.size(0), -1)
        return self.linear(flat)


def ResNeXt29_2x64d():
    '''Build a ResNeXt with three 3-block stages, cardinality 2 and bottleneck width 64.'''
    stage_depths = [3, 3, 3]
    return ResNeXt(num_blocks=stage_depths, cardinality=2, bottleneck_width=64)

def ResNeXt29_4x64d():
    '''Build a ResNeXt with three 3-block stages, cardinality 4 and bottleneck width 64.'''
    stage_depths = [3, 3, 3]
    return ResNeXt(num_blocks=stage_depths, cardinality=4, bottleneck_width=64)

def ResNeXt29_8x64d():
    '''Build a ResNeXt with three 3-block stages, cardinality 8 and bottleneck width 64.'''
    stage_depths = [3, 3, 3]
    return ResNeXt(num_blocks=stage_depths, cardinality=8, bottleneck_width=64)

def ResNeXt29_32x4d():
    '''Build a ResNeXt with three 3-block stages, cardinality 32 and bottleneck width 4.'''
    stage_depths = [3, 3, 3]
    return ResNeXt(num_blocks=stage_depths, cardinality=32, bottleneck_width=4)

def test_resnext():
    '''Smoke test: run one random 3x32x32 image through ResNeXt29_2x64d and print the output size.'''
    model = ResNeXt29_2x64d()
    dummy_batch = torch.randn(1, 3, 32, 32)
    logits = model(dummy_batch)
    print(logits.size())

# test_resnext()


================================================
FILE: algorithm-GC/cifar/models/senet.py
================================================
'''SENet in PyTorch.

SENet is the winner of ImageNet-2017. See the paper "Squeeze-and-Excitation Networks" for more details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicBlock(nn.Module):
    '''Residual block (conv-BN-ReLU, conv-BN) with a squeeze-and-excitation gate.

    The gate is computed from the globally average-pooled branch output by two
    1x1 convolutions (``planes -> planes//16 -> planes``) and a sigmoid, and
    rescales the branch channel-wise before the shortcut is added.

    Args:
        in_planes: number of input channels.
        planes: number of output channels.
        stride: stride of the first 3x3 convolution (and of the projection
            shortcut, when one is created).
    '''

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # Identity shortcut unless the spatial size or channel count changes,
        # in which case the input is projected with a 1x1 conv + BN.
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes)
            )

        # SE layers
        self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1)  # Use nn.Conv2d instead of nn.Linear
        self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1)

    def forward(self, x):
        '''Return ReLU(gate * branch(x) + shortcut(x)).'''
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # Squeeze: global average pool down to 1x1 per channel. Adaptive
        # pooling gives the same result as a full-size square kernel on square
        # maps and also handles non-square feature maps.
        w = F.adaptive_avg_pool2d(out, 1)
        # Excitation: bottleneck of two 1x1 convs, then a sigmoid gate.
        # torch.sigmoid replaces the deprecated F.sigmoid.
        w = F.relu(self.fc1(w))
        w = torch.sigmoid(self.fc2(w))
        # Rescale every channel by its gate (broadcast over H and W).
        out = out * w

        out += self.shortcut(x)
        out = F.relu(out)
        return out


class PreActBlock(nn.Module):
    '''Pre-activation residual block (BN-ReLU-conv twice) with a squeeze-and-excitation gate.

    The gate is computed from the globally average-pooled branch output by two
    1x1 convolutions (``planes -> planes//16 -> planes``) and a sigmoid, and
    rescales the branch channel-wise before the shortcut is added. No
    activation is applied after the addition.

    Args:
        in_planes: number of input channels.
        planes: number of output channels.
        stride: stride of the first 3x3 convolution (and of the projection
            shortcut, when one is created).
    '''

    def __init__(self, in_planes, planes, stride=1):
        super(PreActBlock, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)

        # The ``shortcut`` attribute only exists when a projection is needed;
        # forward() checks for it with hasattr and otherwise reuses the input.
        if stride != 1 or in_planes != planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False)
            )

        # SE layers
        self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1)
        self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1)

    def forward(self, x):
        '''Return gate * branch(x) + shortcut, with no final activation.'''
        out = F.relu(self.bn1(x))
        # The projection shortcut consumes the pre-activated input; the
        # identity shortcut uses the raw input.
        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
        out = self.conv1(out)
        out = self.conv2(F.relu(self.bn2(out)))

        # Squeeze: global average pool down to 1x1 per channel. Adaptive
        # pooling gives the same result as a full-size square kernel on square
        # maps and also handles non-square feature maps.
        w = F.adaptive_avg_pool2d(out, 1)
        # Excitation: bottleneck of two 1x1 convs, then a sigmoid gate.
        # torch.sigmoid replaces the deprecated F.sigmoid.
        w = F.relu(self.fc1(w))
        w = torch.sigmoid(self.fc2(w))
        # Rescale every channel by its gate (broadcast over H and W).
        out = out * w

        out += shortcut
        return out


class SENet(nn.Module):
    '''Four-stage network plus linear classifier, built from a given SE block type.

    Args:
        block: block class; it is called as ``block(in_planes, planes, stride)``.
        num_blocks: indexable giving the block count of each of the four stages.
        num_classes: number of outputs of the final linear layer.
    '''

    def __init__(self, block, num_blocks, num_classes=10):
        super(SENet, self).__init__()
        # Channel count entering the next block; _make_layer advances it.
        self.in_planes = 64

        # Stem: one 3x3 convolution on 3-channel input, then batch norm.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)

        # Four stages; every stage after the first starts with a stride-2 block.
        self.layer1 = self._make_layer(block, planes=64, num_blocks=num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, planes=128, num_blocks=num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, planes=256, num_blocks=num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, planes=512, num_blocks=num_blocks[3], stride=2)

        self.linear = nn.Linear(512, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        '''Build one stage; the first block uses ``stride``, the rest use stride 1.'''
        stride_plan = [stride] + [1] * (num_blocks - 1)
        stage = []
        for block_stride in stride_plan:
            stage.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes
        return nn.Sequential(*stage)

    def forward(self, x):
        '''Return the classifier output for input batch ``x``.'''
        out = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            out = stage(out)
        # 4x4 average pooling, then flatten everything but the batch dimension.
        pooled = F.avg_pool2d(out, 4)
        flat = pooled.view(pooled.size(0), -1)
        return self.linear(flat)


def SENet18():
    '''Build an SENet from PreActBlock with stage depths [2, 2, 2, 2].'''
    stage_depths = [2, 2, 2, 2]
    return SENet(PreActBlock, stage_depths)


def test():
    '''Smoke test: run one random 3x32x32 image through SENet18 and print the output size.'''
    model = SENet18()
    dummy_batch = torch.randn(1, 3, 32, 32)
    logits = model(dummy_batch)
    print(logits.size())

# test()


================================================
FILE: algorithm-GC/cifar/models/shufflenet.py
================================================
'''ShuffleNet in PyTorch.

See the paper "ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices" for more details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class ShuffleBlock(nn.Module):
    '''Channel shuffle: interleave the channels of ``groups`` contiguous channel groups.

    Args:
        groups: number of channel groups; the input's channel count is split
            evenly into this many groups.
    '''

    def __init__(self, groups):
        super(ShuffleBlock, self).__init__()
        self.groups = groups

    def forward(self, x):
        '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,W] -> [N,C,H,W]'''
        N,C,H,W = x.size()
        g = self.groups
        # Floor division keeps the view size an int; "/" yields a float on
        # Python 3, which Tensor.view rejects.
        return x.view(N,g,C//g,H,W).permute(0,2,1,3,4).contiguous().view(N,C,H,W)


class Bottleneck(nn.Module):
    '''ShuffleNet unit: grouped 1x1 conv -> channel shuffle -> depthwise 3x3 conv -> grouped 1x1 conv.

    With ``stride == 2`` the shortcut is a 3x3 average pool and is concatenated
    with the branch along the channel axis; otherwise the shortcut is the
    identity and is added to the branch.

    Args:
        in_planes: number of input channels.
        out_planes: number of channels produced by the convolution branch.
        stride: stride of the depthwise convolution; 2 selects the
            concatenating variant.
        groups: group count of the 1x1 convolutions (the first 1x1 conv is
            left ungrouped when ``in_planes == 24``).
    '''

    def __init__(self, in_planes, out_planes, stride, groups):
        super(Bottleneck, self).__init__()
        self.stride = stride

        # Floor division: channel counts must be ints, and "/" yields a float
        # on Python 3, which nn.Conv2d / nn.BatchNorm2d cannot use.
        mid_planes = out_planes // 4
        g = 1 if in_planes==24 else groups
        self.conv1 = nn.Conv2d(in_planes, mid_planes, kernel_size=1, groups=g, bias=False)
        self.bn1 = nn.BatchNorm2d(mid_planes)
        self.shuffle1 = ShuffleBlock(groups=g)
        # groups == channels makes this a depthwise convolution.
        self.conv2 = nn.Conv2d(mid_planes, mid_planes, kernel_size=3, stride=stride, padding=1, groups=mid_planes, bias=False)
        self.bn2 = nn.BatchNorm2d(mid_planes)
        self.conv3 = nn.Conv2d(mid_planes, out_planes, kernel_size=1, groups=groups, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        self.shortcut = nn.Sequential()
        if stride == 2:
            self.shortcut = nn.Sequential(nn.AvgPool2d(3, stride=2, padding=1))

    def forward(self, x):
        '''Return ReLU of the branch combined with the shortcut (concat for stride 2, sum otherwise).'''
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.shuffle1(out)
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        res = self.shortcut(x)
        out = F.relu(torch.cat([out,res], 1)) if self.stride==2 else F.relu(out+res)
        return out


class ShuffleNet(nn.Module):
    '''Three-stage ShuffleNet with a 10-way linear classifier.

    Args:
        cfg: mapping with keys ``'out_planes'`` (output channels of the three
            stages), ``'num_blocks'`` (block count of the three stages) and
            ``'groups'`` (group count passed to every Bottleneck).
    '''

    def __init__(self, cfg):
        super(ShuffleNet, self).__init__()
        stage_widths = cfg['out_planes']
        stage_depths = cfg['num_blocks']
        groups = cfg['groups']

        # Stem: 1x1 convolution from 3 to 24 channels, then batch norm.
        self.conv1 = nn.Conv2d(3, 24, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(24)
        # Channel count entering the next block; _make_layer advances it.
        self.in_planes = 24
        self.layer1 = self._make_layer(stage_widths[0], stage_depths[0], groups)
        self.layer2 = self._make_layer(stage_widths[1], stage_depths[1], groups)
        self.layer3 = self._make_layer(stage_widths[2], stage_depths[2], groups)
        self.linear = nn.Linear(stage_widths[2], 10)

    def _make_layer(self, out_planes, num_blocks, groups):
        '''Build one stage whose first block has stride 2 and the rest stride 1.'''
        blocks = []
        for index in range(num_blocks):
            is_first = index == 0
            stride = 2 if is_first else 1
            # The stride-2 block concatenates its input onto the branch output,
            # so its branch only has to supply the remaining channels.
            cat_planes = self.in_planes if is_first else 0
            unit = Bottleneck(self.in_planes, out_planes-cat_planes, stride=stride, groups=groups)
            blocks.append(unit)
            self.in_planes = out_planes
        return nn.Sequential(*blocks)

    def forward(self, x):
        '''Return the classifier output for input batch ``x``.'''
        out = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3):
            out = stage(out)
        # 4x4 average pooling, then flatten everything but the batch dimension.
        pooled = F.avg_pool2d(out, 4)
        flat = pooled.view(pooled.size(0), -1)
        return self.linear(flat)


def ShuffleNetG2():
    '''Build a ShuffleNet with 2 groups and stage widths 200/400/800.'''
    return ShuffleNet({
        'out_planes': [200,400,800],
        'num_blocks': [4,8,4],
        'groups': 2,
    })

def ShuffleNetG3():
    '''Build a ShuffleNet with 3 groups and stage widths 240/480/960.'''
    return ShuffleNet({
        'out_planes': [240,480,960],
        'num_blocks': [4,8,4],
        'groups': 3,
    })


def test():
    '''Smoke test: run one random 3x32x32 image through ShuffleNetG2 and print the output tensor.'''
    model = ShuffleNetG2()
    dummy_batch = torch.randn(1, 3, 32, 32)
    logits = model(dummy_batch)
    print(logits)

# test()


================================================
FILE: algorithm-GC/cifar/models/vgg.py
================================================
'''VGG11/13/16/19 in Pytorch.'''
import torch
import torch.nn as nn


# Layer plans for each VGG variant, consumed by VGG._make_layers:
# an integer adds a 3x3 convolution (followed by BatchNorm and ReLU) with that
# many output channels, and 'M' adds a 2x2 max-pooling layer with stride 2.
cfg = {
    'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}


class VGG(nn.Module):
    '''VGG feature stack followed by a single linear classifier.

    Args:
        vgg_name: key into the module-level ``cfg`` table selecting the layer plan.
        Num_classes: number of outputs of the linear classifier.
    '''

    def __init__(self, vgg_name,Num_classes=100):
        super(VGG, self).__init__()
        layer_plan = cfg[vgg_name]
        self.features = self._make_layers(layer_plan)
        self.classifier = nn.Linear(512, Num_classes)

    def forward(self, x):
        '''Return the classifier output for input batch ``x``.'''
        feats = self.features(x)
        # Flatten everything but the batch dimension for the linear layer.
        flat = feats.view(feats.size(0), -1)
        return self.classifier(flat)

    def _make_layers(self, cfg):
        '''Translate a layer plan into an nn.Sequential.

        Each ``'M'`` entry becomes a 2x2 max pool with stride 2; every other
        entry is taken as an output channel count and becomes a 3x3 conv,
        BatchNorm and in-place ReLU. A 1x1 average pool closes the stack.
        '''
        modules = []
        channels = 3
        for spec in cfg:
            if spec == 'M':
                modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
                continue
            modules.extend([
                nn.Conv2d(channels, spec, kernel_size=3, padding=1),
                nn.BatchNorm2d(spec),
                nn.ReLU(inplace=True),
            ])
            channels = spec
        modules.append(nn.AvgPool2d(kernel_size=1, stride=1))
        return nn.Sequential(*modules)


def test():
    '''Smoke test: run two random 3x32x32 images through VGG11 and print the output size.'''
    model = VGG('VGG11')
    dummy_batch = torch.randn(2, 3, 32, 32)
    logits = model(dummy_batch)
    print(logits.size())

# test()


================================================
FILE: algorithm-GC/cifar/nohup.out
================================================
Traceback (most recent call last):
  File "main.py", line 281, in <module>
    train_acc=train(epoch,net,optimizer)
  File "main.py", line 227, in train
    outputs = net(inputs)
  File "/home/yonghw/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/yonghw/anaconda3/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 153, in forward
    return self.module(*inputs[0], **kwargs[0])
  File "/home/yonghw/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/yonghw/mycode/Opt_GC/cifar/models/resnet.py", line 90, in forward
    out = self.layer1(out)
  File "/home/yonghw/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/yonghw/anaconda3/lib/python3.7/site-packages/torch/nn/modules/container.py", line 100, in forward
    input = module(input)
  File "/home/yonghw/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/yonghw/mycode/Opt_GC/cifar/models/resnet.py", line 61, in forward
    out = self.bn3(self.conv3(out))
  File "/home/yonghw/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/yonghw/anaconda3/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 349, in forward
    return self._conv_forward(input, self.weight)
  File "/home/yonghw/anaconda3/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 346, in _conv_forward
    self.padding, self.dilation, self.groups)
RuntimeError: CUDA out of memory. Tried to allocate 128.00 MiB (GPU 0; 15.90 GiB total capacity; 1.06 GiB already allocated; 31.38 MiB free; 1.23 GiB reserved in total by PyTorch)
Terminated


================================================
FILE: algorithm-GC/cifar/os_run.py
================================================
#cifar100 e200 bs128  gs  2,4,8,16
import os,time
#############################
#r18
##############

#### sgd 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_sgd_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_sgd_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_sgd_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_sgd_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_sgd_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_sgd_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_sgd_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_sgd_8.log ")
#time.sleep(500)
#
#### sgdGC 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_sgdGC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_sgdGC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_sgdGC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_sgdGC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_sgdGC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_sgdGC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_sgdGC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_sgdGC_8.log ")
#time.sleep(500)
#
#### sgdGCC 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_sgdGCC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_sgdGCC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_sgdGCC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_sgdGCC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_sgdGCC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_sgdGCC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_sgdGCC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_sgdGCC_8.log ")
#time.sleep(500)
#
###############
###############
#
#### adam 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_adam_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_adam_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_adam_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_adam_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_adam_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_adam_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_adam_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_adam_8.log ")
#
#time.sleep(500)
#### adamGC 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_adamGC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_adamGC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_adamGC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_adamGC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_adamGC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_adamGC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_adamGC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_adamGC_8.log ")
#time.sleep(500)
#
#### adamGCC 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_adamGCC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_adamGCC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_adamGCC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_adamGCC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_adamGCC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_adamGCC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_adamGCC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_adamGCC_8.log ")
#time.sleep(500)
#
###############
###############
#
#### adamW 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_adamW_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_adamW_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_adamW_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_adamW_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_adamW_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_adamW_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_adamW_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_adamW_8.log ")
#
#time.sleep(500)
#### adamWGC 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_adamWGC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_adamWGC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_adamWGC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_adamWGC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_adamWGC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_adamWGC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_adamWGC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_adamWGC_8.log ")
#time.sleep(500)
#
#### adamWGCC 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_adamWGCC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_adamWGCC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_adamWGCC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_adamWGCC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_adamWGCC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_adamWGCC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_adamWGCC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_adamWGCC_8.log ")
#time.sleep(500)
#
###############
###############
#
#### radam 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_radam_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_radam_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_radam_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_radam_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_radam_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_radam_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_radam_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_radam_8.log ")
#
#time.sleep(500)
#### radamGC 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_radamGC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_radamGC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_radamGC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_radamGC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_radamGC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_radamGC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_radamGC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_radamGC_8.log ")
#time.sleep(500)
#
#### radamGCC 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_radamGCC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_radamGCC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_radamGCC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_radamGCC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_radamGCC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_radamGCC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_radamGCC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_radamGCC_8.log ")
#time.sleep(500)
#
###############
###############
#
#### Lsgd 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_Lsgd_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_Lsgd_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_Lsgd_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_Lsgd_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_Lsgd_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_Lsgd_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_Lsgd_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_Lsgd_8.log ")
#time.sleep(500)
#
#### LsgdGC 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_LsgdGC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_LsgdGC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_LsgdGC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_LsgdGC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_LsgdGC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_LsgdGC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_LsgdGC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_LsgdGC_8.log ")
#time.sleep(500)
#
#### LsgdGCC 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_LsgdGCC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_LsgdGCC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_LsgdGCC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_LsgdGCC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_LsgdGCC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_LsgdGCC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_LsgdGCC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_LsgdGCC_8.log ")
#time.sleep(500)
#
###############
###############
#
#### Ladam 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_Ladam_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_Ladam_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_Ladam_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_Ladam_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_Ladam_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_Ladam_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_Ladam_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_Ladam_8.log ")
#
#time.sleep(500)
#### LadamGC 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_LadamGC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_LadamGC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_LadamGC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_LadamGC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_LadamGC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_LadamGC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_LadamGC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_LadamGC_8.log ")
#time.sleep(500)
#
#### LadamGCC 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_LadamGCC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_LadamGCC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_LadamGCC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_LadamGCC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_LadamGCC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_LadamGCC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_LadamGCC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_LadamGCC_8.log ")
#time.sleep(500)
#
###############
###############
#
#### ranger
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger  --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_ranger_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_ranger_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_ranger_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_ranger_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_ranger_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_ranger_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_ranger_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_ranger_8.log ")
#
#time.sleep(500)
#### rangerGC
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_rangerGC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_rangerGC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_rangerGC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_rangerGC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_rangerGC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_rangerGC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_rangerGC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_rangerGC_8.log ")
#time.sleep(500)
#
#### rangerGCC
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_rangerGCC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_rangerGCC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_rangerGCC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_rangerGCC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_rangerGCC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_rangerGCC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_rangerGCC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_rangerGCC_8.log ")
#time.sleep(500)
#
###############
#
##r50
###############
#
#### sgd 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_sgd_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_sgd_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_sgd_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_sgd_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_sgd_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_sgd_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_sgd_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_sgd_8.log ")
#time.sleep(500)
#
#### sgdGC 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_sgdGC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_sgdGC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_sgdGC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_sgdGC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_sgdGC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_sgdGC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_sgdGC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_sgdGC_8.log ")
#time.sleep(500)
#
#### sgdGCC 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_sgdGCC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_sgdGCC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_sgdGCC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_sgdGCC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_sgdGCC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_sgdGCC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_sgdGCC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_sgdGCC_8.log ")
#time.sleep(500)
#
###############
###############
#
#### adam 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_adam_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_adam_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_adam_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_adam_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_adam_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_adam_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_adam_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_adam_8.log ")
#
#time.sleep(500)
#### adamGC 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_adamGC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_adamGC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_adamGC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_adamGC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_adamGC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_adamGC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_adamGC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_adamGC_8.log ")
#time.sleep(500)
#
#### adamGCC 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_adamGCC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_adamGCC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_adamGCC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_adamGCC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_adamGCC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_adamGCC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_adamGCC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_adamGCC_8.log ")
#time.sleep(500)
#
###############
###############
#
#### adamW 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_adamW_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_adamW_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_adamW_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_adamW_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_adamW_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_adamW_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_adamW_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_adamW_8.log ")
#
#time.sleep(500)
#### adamWGC 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_adamWGC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_adamWGC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_adamWGC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_adamWGC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_adamWGC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_adamWGC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_adamWGC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_adamWGC_8.log ")
#time.sleep(500)

# adamWGCC on ResNet-50: start eight runs, one per GPU id 10-17, each writing
# to logout/r50_lr11_wd45_adamWGCC_<run>.log. Runs 1-7 are sent to the
# background with '&'; run 8 has no '&', so os.system waits for it to exit.
for run_idx in range(1, 9):
    tail = " " if run_idx == 8 else " &"
    os.system(
        "nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC"
        + "   --epochs 200  --model r50 --gpug " + str(run_idx + 9)
        + " > logout/r50_lr11_wd45_adamWGCC_" + str(run_idx) + ".log"
        + tail
    )
# Pause 500 seconds before the next batch is launched.
time.sleep(500)

##############
##############

# RAdam family on ResNet-50 (radam, radamGC, radamGCC).
# For each variant, start eight runs on GPU ids 10-17 with log suffixes 1-8.
# Runs 1-7 are sent to the background with '&'; run 8 has no '&', so
# os.system waits for it to exit. A 500-second pause follows every batch.
for variant in ("radam", "radamGC", "radamGCC"):
    for run_idx in range(1, 9):
        tail = " " if run_idx == 8 else " &"
        os.system(
            "nohup  python  main.py --lr 0.1 --wd 0.0005 --alg " + variant
            + "   --epochs 200  --model r50 --gpug " + str(run_idx + 9)
            + " > logout/r50_lr11_wd45_" + variant + "_" + str(run_idx) + ".log"
            + tail
        )
    time.sleep(500)

##############
##############

# Lsgd family on ResNet-50 (Lsgd, LsgdGC, LsgdGCC).
# For each variant, start eight runs on GPU ids 10-17 with log suffixes 1-8.
# Runs 1-7 are sent to the background with '&'; run 8 has no '&', so
# os.system waits for it to exit. A 500-second pause follows every batch.
for variant in ("Lsgd", "LsgdGC", "LsgdGCC"):
    for run_idx in range(1, 9):
        tail = " " if run_idx == 8 else " &"
        os.system(
            "nohup  python  main.py --lr 0.1 --wd 0.0005 --alg " + variant
            + "   --epochs 200  --model r50 --gpug " + str(run_idx + 9)
            + " > logout/r50_lr11_wd45_" + variant + "_" + str(run_idx) + ".log"
            + tail
        )
    time.sleep(500)

##############
##############

# Ladam family on ResNet-50 (Ladam, LadamGC, LadamGCC).
# For each variant, start eight runs on GPU ids 10-17 with log suffixes 1-8.
# Runs 1-7 are sent to the background with '&'; run 8 has no '&', so
# os.system waits for it to exit. A 500-second pause follows every batch.
for variant in ("Ladam", "LadamGC", "LadamGCC"):
    for run_idx in range(1, 9):
        tail = " " if run_idx == 8 else " &"
        os.system(
            "nohup  python  main.py --lr 0.1 --wd 0.0005 --alg " + variant
            + "   --epochs 200  --model r50 --gpug " + str(run_idx + 9)
            + " > logout/r50_lr11_wd45_" + variant + "_" + str(run_idx) + ".log"
            + tail
        )
    time.sleep(500)

##############
##############

# Ranger family on ResNet-50 (ranger, rangerGC, rangerGCC).
# For each variant, start eight runs on GPU ids 10-17 with log suffixes 1-8.
# Runs 1-7 are sent to the background with '&'; run 8 has no '&', so
# os.system waits for it to exit. A 500-second pause follows every batch.
for variant in ("ranger", "rangerGC", "rangerGCC"):
    for run_idx in range(1, 9):
        tail = " " if run_idx == 8 else " &"
        os.system(
            "nohup  python  main.py --lr 0.1 --wd 0.0005 --alg " + variant
            + "   --epochs 200  --model r50 --gpug " + str(run_idx + 9)
            + " > logout/r50_lr11_wd45_" + variant + "_" + str(run_idx) + ".log"
            + tail
        )
    time.sleep(500)

##############


================================================
FILE: algorithm-GC/cifar/os_run2.py
================================================
#cifar100 e200 bs128  gs  2,4,8,16
import os,time


#r50
##############
# Learning-rate sweep for --model r50 over the adam, adamGC and adamGCC values
# of main.py's --alg option.
#
# For every (learning rate, algorithm) pair, eight runs of main.py are
# launched, one per --gpug value 10..17.  Each run writes its stdout to
#     logout2/r50_<lr tag>_wd45_<algorithm>_<run index>.log
#
# Scheduling: the first seven commands of a group end in "&", so the shell
# puts them in the background and os.system() returns immediately.  The eighth
# command has no "&", so os.system() blocks until that single run exits.  A
# fixed pause follows before the next group is launched.
# NOTE(review): nothing here waits for the seven background runs; the script
# relies on them finishing at about the same time as the foreground run plus
# the pause.  Confirm that this is acceptable for the target machine.

# Directory that receives the per-run log files.
LOG_DIR = "logout2"

# Values passed to --gpug, one run per value.
# NOTE(review): presumably a GPU selector understood by main.py -- confirm.
GPU_IDS = tuple(range(10, 18))

# (value passed to --lr, tag used for that value in the log file name).
LR_SWEEP = (
    ("0.01", "lr21"),
    ("0.05", "lr25"),
    ("0.15", "lr115"),
)

# Values passed to --alg, in launch order within each learning rate.
ALGORITHMS = ("adam", "adamGC", "adamGCC")

# Seconds to pause after the foreground run of each group has returned.
GROUP_PAUSE_SECONDS = 500

# The spacing inside the template is kept exactly as in the hand-written
# commands so the generated command lines are unchanged.
COMMAND_TEMPLATE = (
    "nohup  python  main.py --lr {lr} --wd 0.0005 --alg {alg}   --epochs 200"
    "  --model r50 --gpug {gpu} > {log_dir}/r50_{lr_tag}_wd45_{alg}_{run}.log {tail}"
)

# The shell opens the redirect target before main.py starts, so a missing log
# directory would make every launch fail immediately.  Create it up front.
if not os.path.isdir(LOG_DIR):
    os.makedirs(LOG_DIR)

for lr, lr_tag in LR_SWEEP:
    for alg in ALGORITHMS:
        last_run = len(GPU_IDS)
        for run, gpu in enumerate(GPU_IDS, 1):
            # Background every run except the last one, which acts as the
            # blocking call for the whole group.
            tail = "&" if run < last_run else ""
            os.system(COMMAND_TEMPLATE.format(
                lr=lr,
                alg=alg,
                gpu=gpu,
                log_dir=LOG_DIR,
                lr_tag=lr_tag,
                run=run,
                tail=tail,
            ))
        time.sleep(GROUP_PAUSE_SECONDS)


#
###############
###############
#
#### adamW 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_adamW_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_adamW_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_adamW_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_adamW_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_adamW_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_adamW_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_adamW_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_adamW_8.log ")
#
#time.sleep(500)
#### adamWGC 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_adamWGC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_adamWGC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_adamWGC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_adamWGC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_adamWGC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_adamWGC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_adamWGC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_adamWGC_8.log ")
#time.sleep(500)

### adamWGCC 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_adamWGCC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_adamWGCC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_adamWGCC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_adamWGCC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_adamWGCC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_adamWGCC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_adamWGCC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_adamWGCC_8.log ")
#time.sleep(500)
#
###############
###############
#
#### radam 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_radam_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_radam_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_radam_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_radam_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_radam_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_radam_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_radam_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_radam_8.log ")
#
#time.sleep(500)
#### radamGC 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_radamGC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_radamGC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_radamGC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_radamGC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_radamGC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_radamGC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_radamGC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_radamGC_8.log ")
#time.sleep(500)
#
#### radamGCC 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_radamGCC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_radamGCC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_radamGCC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_radamGCC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_radamGCC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_radamGCC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_radamGCC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_radamGCC_8.log ")
#time.sleep(500)
#
###############
###############
#
#### Lsgd 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_Lsgd_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_Lsgd_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_Lsgd_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_Lsgd_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_Lsgd_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_Lsgd_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_Lsgd_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_Lsgd_8.log ")
#time.sleep(500)
#
#### LsgdGC 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_LsgdGC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_LsgdGC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_LsgdGC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_LsgdGC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_LsgdGC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_LsgdGC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_LsgdGC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_LsgdGC_8.log ")
#time.sleep(500)
#
#### LsgdGCC 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_LsgdGCC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_LsgdGCC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_LsgdGCC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_LsgdGCC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_LsgdGCC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_LsgdGCC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_LsgdGCC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_LsgdGCC_8.log ")
#time.sleep(500)
#
###############
###############
#
#### Ladam 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_Ladam_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_Ladam_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_Ladam_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_Ladam_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_Ladam_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_Ladam_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_Ladam_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_Ladam_8.log ")
#
#time.sleep(500)
#### LadamGC 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_LadamGC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_LadamGC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_LadamGC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_LadamGC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_LadamGC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_LadamGC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_LadamGC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_LadamGC_8.log ")
#time.sleep(500)
#
#### LadamGCC 
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_LadamGCC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_LadamGCC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_LadamGCC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_LadamGCC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_LadamGCC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_LadamGCC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_LadamGCC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_LadamGCC_8.log ")
#time.sleep(500)
#
###############
###############
#
#### ranger
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_ranger_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_ranger_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_ranger_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_ranger_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_ranger_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_ranger_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_ranger_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_ranger_8.log ")
#
#time.sleep(500)
#### rangerGC
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_rangerGC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_rangerGC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_rangerGC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_rangerGC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_rangerGC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_rangerGC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_rangerGC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_rangerGC_8.log ")
#time.sleep(500)
#
#### rangerGCC
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_rangerGCC_1.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_rangerGCC_2.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_rangerGCC_3.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_rangerGCC_4.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_rangerGCC_5.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_rangerGCC_6.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_rangerGCC_7.log &")
#os.system("nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_rangerGCC_8.log ")
#time.sleep(500)

##############