[
  {
    "path": "GC_code/CIFAR100/algorithm/Adagrad.py",
    "content": "import torch\r\nfrom torch.optim.optimizer import Optimizer\r\n\r\n\r\nclass Adagrad_GCC(Optimizer):\r\n    \"\"\"Implements Adagrad algorithm.\r\n\r\n    It has been proposed in `Adaptive Subgradient Methods for Online Learning\r\n    and Stochastic Optimization`_.\r\n\r\n    Arguments:\r\n        params (iterable): iterable of parameters to optimize or dicts defining\r\n            parameter groups\r\n        lr (float, optional): learning rate (default: 1e-2)\r\n        lr_decay (float, optional): learning rate decay (default: 0)\r\n        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)\r\n        eps (float, optional): term added to the denominator to improve\r\n            numerical stability (default: 1e-10)\r\n\r\n    .. _Adaptive Subgradient Methods for Online Learning and Stochastic\r\n        Optimization: http://jmlr.org/papers/v12/duchi11a.html\r\n    \"\"\"\r\n\r\n    def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0, initial_accumulator_value=0, eps=1e-10):\r\n        if not 0.0 <= lr:\r\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\r\n        if not 0.0 <= lr_decay:\r\n            raise ValueError(\"Invalid lr_decay value: {}\".format(lr_decay))\r\n        if not 0.0 <= weight_decay:\r\n            raise ValueError(\"Invalid weight_decay value: {}\".format(weight_decay))\r\n        if not 0.0 <= initial_accumulator_value:\r\n            raise ValueError(\"Invalid initial_accumulator_value value: {}\".format(initial_accumulator_value))\r\n        if not 0.0 <= eps:\r\n            raise ValueError(\"Invalid epsilon value: {}\".format(eps))\r\n\r\n        defaults = dict(lr=lr, lr_decay=lr_decay, eps=eps, weight_decay=weight_decay,\r\n                        initial_accumulator_value=initial_accumulator_value)\r\n        super(Adagrad_GCC, self).__init__(params, defaults)\r\n\r\n        for group in self.param_groups:\r\n            for p in group['params']:\r\n                state = self.state[p]\r\n                state['step'] = 0\r\n                state['sum'] = torch.full_like(p.data, initial_accumulator_value)\r\n\r\n    def share_memory(self):\r\n        for group in self.param_groups:\r\n            for p in group['params']:\r\n                state = self.state[p]\r\n                state['sum'].share_memory_()\r\n\r\n    def step(self, closure=None):\r\n        \"\"\"Performs a single optimization step.\r\n\r\n        Arguments:\r\n            closure (callable, optional): A closure that reevaluates the model\r\n                and returns the loss.\r\n        \"\"\"\r\n        loss = None\r\n        if closure is not None:\r\n            loss = closure()\r\n\r\n        for group in self.param_groups:\r\n            for p in group['params']:\r\n                if p.grad is None:\r\n                    continue\r\n\r\n                grad = p.grad.data\r\n                state = self.state[p]\r\n\r\n                state['step'] += 1\r\n\r\n                if group['weight_decay'] != 0:\r\n                    if p.grad.data.is_sparse:\r\n                        raise RuntimeError(\"weight_decay option is not compatible with sparse gradients\")\r\n                    grad = grad.add(group['weight_decay'], p.data)\r\n                    \r\n                 #GC operation for Conv layers                  \r\n                if len(list(grad.size()))>3:\r\n                    grad.add_(-grad.mean(dim = tuple(range(1,len(list(grad.size())))), keepdim = True))\r\n\r\n                clr = 
group['lr'] / (1 + (state['step'] - 1) * group['lr_decay'])\r\n\r\n                if grad.is_sparse:\r\n                    grad = grad.coalesce()  # the update is non-linear so indices must be unique\r\n                    grad_indices = grad._indices()\r\n                    grad_values = grad._values()\r\n                    size = grad.size()\r\n\r\n                    def make_sparse(values):\r\n                        constructor = grad.new\r\n                        if grad_indices.dim() == 0 or values.dim() == 0:\r\n                            return constructor().resize_as_(grad)\r\n                        return constructor(grad_indices, values, size)\r\n                    state['sum'].add_(make_sparse(grad_values.pow(2)))\r\n                    std = state['sum'].sparse_mask(grad)\r\n                    std_values = std._values().sqrt_().add_(group['eps'])\r\n                    p.data.add_(-clr, make_sparse(grad_values / std_values))\r\n                else:\r\n                    state['sum'].addcmul_(1, grad, grad)\r\n                    std = state['sum'].sqrt().add_(group['eps'])\r\n                    p.data.addcdiv_(-clr, grad, std)\r\n\r\n        return loss\r\n    \r\nclass Adagrad_GC(Optimizer):\r\n    \"\"\"Implements Adagrad algorithm.\r\n\r\n    It has been proposed in `Adaptive Subgradient Methods for Online Learning\r\n    and Stochastic Optimization`_.\r\n\r\n    Arguments:\r\n        params (iterable): iterable of parameters to optimize or dicts defining\r\n            parameter groups\r\n        lr (float, optional): learning rate (default: 1e-2)\r\n        lr_decay (float, optional): learning rate decay (default: 0)\r\n        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)\r\n        eps (float, optional): term added to the denominator to improve\r\n            numerical stability (default: 1e-10)\r\n\r\n    .. 
_Adaptive Subgradient Methods for Online Learning and Stochastic\r\n        Optimization: http://jmlr.org/papers/v12/duchi11a.html\r\n    \"\"\"\r\n\r\n    def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0, initial_accumulator_value=0, eps=1e-10):\r\n        if not 0.0 <= lr:\r\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\r\n        if not 0.0 <= lr_decay:\r\n            raise ValueError(\"Invalid lr_decay value: {}\".format(lr_decay))\r\n        if not 0.0 <= weight_decay:\r\n            raise ValueError(\"Invalid weight_decay value: {}\".format(weight_decay))\r\n        if not 0.0 <= initial_accumulator_value:\r\n            raise ValueError(\"Invalid initial_accumulator_value value: {}\".format(initial_accumulator_value))\r\n        if not 0.0 <= eps:\r\n            raise ValueError(\"Invalid epsilon value: {}\".format(eps))\r\n\r\n        defaults = dict(lr=lr, lr_decay=lr_decay, eps=eps, weight_decay=weight_decay,\r\n                        initial_accumulator_value=initial_accumulator_value)\r\n        super(Adagrad_GC, self).__init__(params, defaults)\r\n\r\n        for group in self.param_groups:\r\n            for p in group['params']:\r\n                state = self.state[p]\r\n                state['step'] = 0\r\n                state['sum'] = torch.full_like(p.data, initial_accumulator_value)\r\n\r\n    def share_memory(self):\r\n        for group in self.param_groups:\r\n            for p in group['params']:\r\n                state = self.state[p]\r\n                state['sum'].share_memory_()\r\n\r\n    def step(self, closure=None):\r\n        \"\"\"Performs a single optimization step.\r\n\r\n        Arguments:\r\n            closure (callable, optional): A closure that reevaluates the model\r\n                and returns the loss.\r\n        \"\"\"\r\n        loss = None\r\n        if closure is not None:\r\n            loss = closure()\r\n\r\n        for group in self.param_groups:\r\n            for p in group['params']:\r\n                if p.grad is None:\r\n                    continue\r\n\r\n                grad = p.grad.data\r\n                state = self.state[p]\r\n\r\n                state['step'] += 1\r\n\r\n                if group['weight_decay'] != 0:\r\n                    if p.grad.data.is_sparse:\r\n                        raise RuntimeError(\"weight_decay option is not compatible with sparse gradients\")\r\n                    grad = grad.add(group['weight_decay'], p.data)\r\n                    \r\n                 #GC operation for Conv layers                  \r\n                if len(list(grad.size()))>1:\r\n                    grad.add_(-grad.mean(dim = tuple(range(1,len(list(grad.size())))), keepdim = True))\r\n\r\n                clr = group['lr'] / (1 + (state['step'] - 1) * group['lr_decay'])\r\n\r\n                if grad.is_sparse:\r\n                    grad = grad.coalesce()  # the update is non-linear so indices must be unique\r\n                    grad_indices = grad._indices()\r\n                    grad_values = grad._values()\r\n                    size = grad.size()\r\n\r\n                    def make_sparse(values):\r\n                        constructor = grad.new\r\n                        if grad_indices.dim() == 0 or values.dim() == 0:\r\n                            return constructor().resize_as_(grad)\r\n                        return constructor(grad_indices, values, size)\r\n                    state['sum'].add_(make_sparse(grad_values.pow(2)))\r\n                    std = 
state['sum'].sparse_mask(grad)\r\n                    std_values = std._values().sqrt_().add_(group['eps'])\r\n                    p.data.add_(-clr, make_sparse(grad_values / std_values))\r\n                else:\r\n                    state['sum'].addcmul_(1, grad, grad)\r\n                    std = state['sum'].sqrt().add_(group['eps'])\r\n                    p.data.addcdiv_(-clr, grad, std)\r\n\r\n        return loss\r\n    \r\n"
  },
  {
    "path": "GC_code/CIFAR100/algorithm/Adam.py",
    "content": "import math\nimport torch\nfrom torch.optim.optimizer import Optimizer\n\nclass Adam_GCC(Optimizer):\n    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,\n                 weight_decay=0, amsgrad=False):\n        if not 0.0 <= lr:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if not 0.0 <= eps:\n            raise ValueError(\"Invalid epsilon value: {}\".format(eps))\n        if not 0.0 <= betas[0] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 0: {}\".format(betas[0]))\n        if not 0.0 <= betas[1] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 1: {}\".format(betas[1]))\n        defaults = dict(lr=lr, betas=betas, eps=eps,\n                        weight_decay=weight_decay, amsgrad=amsgrad)\n        super(Adam_GCC, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(Adam_GCC, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('amsgrad', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                grad = p.grad.data\n                if grad.is_sparse:\n                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')\n                amsgrad = group['amsgrad']\n\n                state = self.state[p]\n\n                # State initialization\n                if len(state) == 0:\n                    state['step'] = 0\n                    # Exponential moving average of gradient values\n                    state['exp_avg'] = torch.zeros_like(p.data)\n                    # Exponential moving average of squared gradient values\n                    state['exp_avg_sq'] = torch.zeros_like(p.data)\n                    if amsgrad:\n                        # Maintains max of all exp. moving avg. of sq. grad. values\n                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)\n\n                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']\n                if amsgrad:\n                    max_exp_avg_sq = state['max_exp_avg_sq']\n                beta1, beta2 = group['betas']\n\n                state['step'] += 1\n                bias_correction1 = 1 - beta1 ** state['step']\n                bias_correction2 = 1 - beta2 ** state['step']\n\n                if group['weight_decay'] != 0:\n                    grad.add_(group['weight_decay'], p.data)\n\n                #GC operation for Conv layers\n                if len(list(grad.size()))>3:                    \n                    grad.add_(-grad.mean(dim = tuple(range(1,len(list(grad.size())))), keepdim = True))\n                    \n                # Decay the first and second moment running average coefficient\n                exp_avg.mul_(beta1).add_(1 - beta1, grad)\n                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)\n                if amsgrad:\n                    # Maintains the maximum of all 2nd moment running avg. 
till now\n                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)\n                    # Use the max. for normalizing running avg. of gradient\n                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])\n                else:\n                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])\n\n                step_size = group['lr'] / bias_correction1\n\n                p.data.addcdiv_(-step_size, exp_avg, denom)\n\n        return loss\n\nclass Adam_GCC2(Optimizer):\n    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,\n                 weight_decay=0, amsgrad=False):\n        if not 0.0 <= lr:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if not 0.0 <= eps:\n            raise ValueError(\"Invalid epsilon value: {}\".format(eps))\n        if not 0.0 <= betas[0] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 0: {}\".format(betas[0]))\n        if not 0.0 <= betas[1] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 1: {}\".format(betas[1]))\n        defaults = dict(lr=lr, betas=betas, eps=eps,\n                        weight_decay=weight_decay, amsgrad=amsgrad)\n        super(Adam_GCC2, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(Adam_GCC2, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('amsgrad', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                grad = p.grad.data\n                if grad.is_sparse:\n                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')\n                amsgrad = group['amsgrad']\n\n                state = self.state[p]\n\n                # State initialization\n                if len(state) == 0:\n                    state['step'] = 0\n                    # Exponential moving average of gradient values\n                    state['exp_avg'] = torch.zeros_like(p.data)\n                    # Exponential moving average of squared gradient values\n                    state['exp_avg_sq'] = torch.zeros_like(p.data)\n                    if amsgrad:\n                        # Maintains max of all exp. moving avg. of sq. grad. 
values\n                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)\n\n                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']\n                if amsgrad:\n                    max_exp_avg_sq = state['max_exp_avg_sq']\n                beta1, beta2 = group['betas']\n\n                state['step'] += 1\n                bias_correction1 = 1 - beta1 ** state['step']\n                bias_correction2 = 1 - beta2 ** state['step']\n\n                if group['weight_decay'] != 0:\n                    grad.add_(group['weight_decay'], p.data)\n                    \n                # Decay the first and second moment running average coefficient\n                exp_avg.mul_(beta1).add_(1 - beta1, grad)\n                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)\n                if amsgrad:\n                    # Maintains the maximum of all 2nd moment running avg. till now\n                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)\n                    # Use the max. for normalizing running avg. of gradient\n                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])\n                else:\n                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])\n\n                step_size = group['lr'] / bias_correction1\n                #GC operation for Conv layers                \n                if len(list(grad.size()))>3:\n                  delta=(step_size*exp_avg/denom).clone()\n                  delta.add_(-delta.mean(dim = tuple(range(1,len(list(grad.size())))), keepdim = True))\n                  p.data.add_(-delta)\n                else:\n                  p.data.addcdiv_(-step_size, exp_avg, denom)\n        return loss    \n\nclass Adam_GC(Optimizer):\n    r\"\"\"Implements Adam algorithm.\n\n    It has been proposed in `Adam: A Method for Stochastic Optimization`_.\n\n    Arguments:\n        params (iterable): iterable of parameters to optimize or dicts defining\n            parameter groups\n        lr (float, optional): learning rate (default: 1e-3)\n        betas (Tuple[float, float], optional): coefficients used for computing\n            running averages of gradient and its square (default: (0.9, 0.999))\n        eps (float, optional): term added to the denominator to improve\n            numerical stability (default: 1e-8)\n        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)\n        amsgrad (boolean, optional): whether to use the AMSGrad variant of this\n            algorithm from the paper `On the Convergence of Adam and Beyond`_\n            (default: False)\n\n    .. _Adam\\: A Method for Stochastic Optimization:\n        https://arxiv.org/abs/1412.6980\n    .. 
_On the Convergence of Adam and Beyond:\n        https://openreview.net/forum?id=ryQu7f-RZ\n    \"\"\"\n\n    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,\n                 weight_decay=0, amsgrad=False):\n        if not 0.0 <= lr:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if not 0.0 <= eps:\n            raise ValueError(\"Invalid epsilon value: {}\".format(eps))\n        if not 0.0 <= betas[0] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 0: {}\".format(betas[0]))\n        if not 0.0 <= betas[1] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 1: {}\".format(betas[1]))\n        defaults = dict(lr=lr, betas=betas, eps=eps,\n                        weight_decay=weight_decay, amsgrad=amsgrad)\n        super(Adam_GC, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(Adam_GC, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('amsgrad', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                grad = p.grad.data\n                if grad.is_sparse:\n                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')\n                amsgrad = group['amsgrad']\n\n                state = self.state[p]\n\n                # State initialization\n                if len(state) == 0:\n                    state['step'] = 0\n                    # Exponential moving average of gradient values\n                    state['exp_avg'] = torch.zeros_like(p.data)\n                    # Exponential moving average of squared gradient values\n                    state['exp_avg_sq'] = torch.zeros_like(p.data)\n                    if amsgrad:\n                        # Maintains max of all exp. moving avg. of sq. grad. values\n                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)\n\n                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']\n                if amsgrad:\n                    max_exp_avg_sq = state['max_exp_avg_sq']\n                beta1, beta2 = group['betas']\n\n                state['step'] += 1\n                bias_correction1 = 1 - beta1 ** state['step']\n                bias_correction2 = 1 - beta2 ** state['step']\n\n                if group['weight_decay'] != 0:\n                    grad.add_(group['weight_decay'], p.data)\n                   \n                #GC operation for Conv layers and FC layers   \n                if len(list(grad.size()))>1:\n                   grad.add_(-grad.mean(dim = tuple(range(1,len(list(grad.size())))), keepdim = True))\n\n                # Decay the first and second moment running average coefficient\n                exp_avg.mul_(beta1).add_(1 - beta1, grad)\n                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)\n                if amsgrad:\n                    # Maintains the maximum of all 2nd moment running avg. 
till now\n                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)\n                    # Use the max. for normalizing running avg. of gradient\n                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])\n                else:\n                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])\n\n                step_size = group['lr'] / bias_correction1\n\n                p.data.addcdiv_(-step_size, exp_avg, denom)\n\n        return loss\n\n\nclass Adam_GC2(Optimizer):\n    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,\n                 weight_decay=0, amsgrad=False):\n        if not 0.0 <= lr:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if not 0.0 <= eps:\n            raise ValueError(\"Invalid epsilon value: {}\".format(eps))\n        if not 0.0 <= betas[0] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 0: {}\".format(betas[0]))\n        if not 0.0 <= betas[1] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 1: {}\".format(betas[1]))\n        defaults = dict(lr=lr, betas=betas, eps=eps,\n                        weight_decay=weight_decay, amsgrad=amsgrad)\n        super(Adam_GC2, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(Adam_GC2, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('amsgrad', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                grad = p.grad.data\n                if grad.is_sparse:\n                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')\n                amsgrad = group['amsgrad']\n\n                state = self.state[p]\n\n                # State initialization\n                if len(state) == 0:\n                    state['step'] = 0\n                    # Exponential moving average of gradient values\n                    state['exp_avg'] = torch.zeros_like(p.data)\n                    # Exponential moving average of squared gradient values\n                    state['exp_avg_sq'] = torch.zeros_like(p.data)\n                    if amsgrad:\n                        # Maintains max of all exp. moving avg. of sq. grad. 
values\n                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)\n\n                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']\n                if amsgrad:\n                    max_exp_avg_sq = state['max_exp_avg_sq']\n                beta1, beta2 = group['betas']\n\n                state['step'] += 1\n                bias_correction1 = 1 - beta1 ** state['step']\n                bias_correction2 = 1 - beta2 ** state['step']\n\n                if group['weight_decay'] != 0:\n                    grad.add_(group['weight_decay'], p.data)\n                    \n                # Decay the first and second moment running average coefficient\n                exp_avg.mul_(beta1).add_(1 - beta1, grad)\n                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)\n                if amsgrad:\n                    # Maintains the maximum of all 2nd moment running avg. till now\n                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)\n                    # Use the max. for normalizing running avg. of gradient\n                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])\n                else:\n                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])\n\n                step_size = group['lr'] / bias_correction1\n                #GC operation for Conv layers and FC layers               \n                if len(list(grad.size()))>1:\n                  delta=(step_size*exp_avg/denom).clone()\n                  delta.add_(-delta.mean(dim = tuple(range(1,len(list(grad.size())))), keepdim = True))\n                  p.data.add_(-delta)\n                else:\n                  p.data.addcdiv_(-step_size, exp_avg, denom)\n        return loss\n\nclass AdamW(Optimizer):\n    \"\"\"Implements Adam algorithm.\n    It has been proposed in `Adam: A Method for Stochastic Optimization`_.\n    Arguments:\n        params (iterable): iterable of parameters to optimize or dicts defining\n            parameter groups\n        lr (float, optional): learning rate (default: 1e-3)\n        betas (Tuple[float, float], optional): coefficients used for computing\n            running averages of gradient and its square (default: (0.9, 0.999))\n        eps (float, optional): term added to the denominator to improve\n            numerical stability (default: 1e-8)\n        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)\n        amsgrad (boolean, optional): whether to use the AMSGrad variant of this\n            algorithm from the paper `On the Convergence of Adam and Beyond`_\n    .. _Adam\\: A Method for Stochastic Optimization:\n        https://arxiv.org/abs/1412.6980\n    .. 
_On the Convergence of Adam and Beyond:\n        https://openreview.net/forum?id=ryQu7f-RZ\n    \"\"\"\n\n    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,\n                 weight_decay=0, amsgrad=False):\n        if not 0.0 <= lr:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if not 0.0 <= eps:\n            raise ValueError(\"Invalid epsilon value: {}\".format(eps))\n        if not 0.0 <= betas[0] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 0: {}\".format(betas[0]))\n        if not 0.0 <= betas[1] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 1: {}\".format(betas[1]))\n        defaults = dict(lr=lr, betas=betas, eps=eps,\n                        weight_decay=weight_decay, amsgrad=amsgrad)\n        super(AdamW, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(AdamW, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('amsgrad', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                grad = p.grad.data\n                if grad.is_sparse:\n                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')\n                amsgrad = group['amsgrad']\n\n                state = self.state[p]\n\n                # State initialization\n                if len(state) == 0:\n                    state['step'] = 0\n                    # Exponential moving average of gradient values\n                    state['exp_avg'] = torch.zeros_like(p.data)\n                    # Exponential moving average of squared gradient values\n                    state['exp_avg_sq'] = torch.zeros_like(p.data)\n                    if amsgrad:\n                        # Maintains max of all exp. moving avg. of sq. grad. values\n                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)\n\n                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']\n                if amsgrad:\n                    max_exp_avg_sq = state['max_exp_avg_sq']\n                beta1, beta2 = group['betas']\n\n                state['step'] += 1\n\n                # if group['weight_decay'] != 0:\n                #     grad = grad.add(group['weight_decay'], p.data)\n\n                # Decay the first and second moment running average coefficient\n                exp_avg.mul_(beta1).add_(1 - beta1, grad)\n                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)\n                if amsgrad:\n                    # Maintains the maximum of all 2nd moment running avg. till now\n                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)\n                    # Use the max. for normalizing running avg. 
of gradient\n                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])\n                else:\n                    denom = exp_avg_sq.sqrt().add_(group['eps'])\n\n                bias_correction1 = 1 - beta1 ** state['step']\n                bias_correction2 = 1 - beta2 ** state['step']\n                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1\n\n                # p.data.addcdiv_(-step_size, exp_avg, denom)\n                p.data.add_(-step_size,  torch.mul(p.data, group['weight_decay']).addcdiv_(1, exp_avg, denom) )\n\n        return loss\n\n\n\nclass AdamW_GCC(Optimizer):\n    \"\"\"Implements Adam algorithm.\n    It has been proposed in `Adam: A Method for Stochastic Optimization`_.\n    Arguments:\n        params (iterable): iterable of parameters to optimize or dicts defining\n            parameter groups\n        lr (float, optional): learning rate (default: 1e-3)\n        betas (Tuple[float, float], optional): coefficients used for computing\n            running averages of gradient and its square (default: (0.9, 0.999))\n        eps (float, optional): term added to the denominator to improve\n            numerical stability (default: 1e-8)\n        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)\n        amsgrad (boolean, optional): whether to use the AMSGrad variant of this\n            algorithm from the paper `On the Convergence of Adam and Beyond`_\n    .. _Adam\\: A Method for Stochastic Optimization:\n        https://arxiv.org/abs/1412.6980\n    .. _On the Convergence of Adam and Beyond:\n        https://openreview.net/forum?id=ryQu7f-RZ\n    \"\"\"\n\n    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,\n                 weight_decay=0, amsgrad=False):\n        if not 0.0 <= lr:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if not 0.0 <= eps:\n            raise ValueError(\"Invalid epsilon value: {}\".format(eps))\n        if not 0.0 <= betas[0] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 0: {}\".format(betas[0]))\n        if not 0.0 <= betas[1] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 1: {}\".format(betas[1]))\n        defaults = dict(lr=lr, betas=betas, eps=eps,\n                        weight_decay=weight_decay, amsgrad=amsgrad)\n        super(AdamW_GCC, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(AdamW_GCC, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('amsgrad', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                grad = p.grad.data\n                if grad.is_sparse:\n                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')\n                amsgrad = group['amsgrad']\n\n                state = self.state[p]\n\n                # State initialization\n                if len(state) == 0:\n                    state['step'] = 0\n                    # Exponential moving average of gradient values\n                 
   state['exp_avg'] = torch.zeros_like(p.data)\n                    # Exponential moving average of squared gradient values\n                    state['exp_avg_sq'] = torch.zeros_like(p.data)\n                    if amsgrad:\n                        # Maintains max of all exp. moving avg. of sq. grad. values\n                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)\n\n                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']\n                if amsgrad:\n                    max_exp_avg_sq = state['max_exp_avg_sq']\n                beta1, beta2 = group['betas']\n\n                #GC operation for Conv layers\n                if len(list(grad.size()))>3:                    \n                   grad.add_(-grad.mean(dim = tuple(range(1,len(list(grad.size())))), keepdim = True))\n\n                state['step'] += 1\n\n                # if group['weight_decay'] != 0:\n                #     grad = grad.add(group['weight_decay'], p.data)\n\n                # Decay the first and second moment running average coefficient\n                exp_avg.mul_(beta1).add_(1 - beta1, grad)\n                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)\n                if amsgrad:\n                    # Maintains the maximum of all 2nd moment running avg. till now\n                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)\n                    # Use the max. for normalizing running avg. of gradient\n                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])\n                else:\n                    denom = exp_avg_sq.sqrt().add_(group['eps'])\n\n                bias_correction1 = 1 - beta1 ** state['step']\n                bias_correction2 = 1 - beta2 ** state['step']\n                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1\n\n                # p.data.addcdiv_(-step_size, exp_avg, denom)\n                p.data.add_(-step_size,  torch.mul(p.data, group['weight_decay']).addcdiv_(1, exp_avg, denom) )\n\n        return loss\n \nclass AdamW_GC(Optimizer):\n    \"\"\"Implements Adam algorithm.\n    It has been proposed in `Adam: A Method for Stochastic Optimization`_.\n    Arguments:\n        params (iterable): iterable of parameters to optimize or dicts defining\n            parameter groups\n        lr (float, optional): learning rate (default: 1e-3)\n        betas (Tuple[float, float], optional): coefficients used for computing\n            running averages of gradient and its square (default: (0.9, 0.999))\n        eps (float, optional): term added to the denominator to improve\n            numerical stability (default: 1e-8)\n        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)\n        amsgrad (boolean, optional): whether to use the AMSGrad variant of this\n            algorithm from the paper `On the Convergence of Adam and Beyond`_\n    .. _Adam\\: A Method for Stochastic Optimization:\n        https://arxiv.org/abs/1412.6980\n    .. 
_On the Convergence of Adam and Beyond:\n        https://openreview.net/forum?id=ryQu7f-RZ\n    \"\"\"\n\n    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,\n                 weight_decay=0, amsgrad=False):\n        if not 0.0 <= lr:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if not 0.0 <= eps:\n            raise ValueError(\"Invalid epsilon value: {}\".format(eps))\n        if not 0.0 <= betas[0] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 0: {}\".format(betas[0]))\n        if not 0.0 <= betas[1] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 1: {}\".format(betas[1]))\n        defaults = dict(lr=lr, betas=betas, eps=eps,\n                        weight_decay=weight_decay, amsgrad=amsgrad)\n        super(AdamW_GC, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(AdamW_GC, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('amsgrad', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                grad = p.grad.data\n                if grad.is_sparse:\n                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')\n                amsgrad = group['amsgrad']\n\n                state = self.state[p]\n\n                # State initialization\n                if len(state) == 0:\n                    state['step'] = 0\n                    # Exponential moving average of gradient values\n                    state['exp_avg'] = torch.zeros_like(p.data)\n                    # Exponential moving average of squared gradient values\n                    state['exp_avg_sq'] = torch.zeros_like(p.data)\n                    if amsgrad:\n                        # Maintains max of all exp. moving avg. of sq. grad. values\n                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)\n\n                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']\n                if amsgrad:\n                    max_exp_avg_sq = state['max_exp_avg_sq']\n                beta1, beta2 = group['betas']\n\n                #GC operation for Conv and FC layers\n                if len(list(grad.size()))>1:                    \n                   grad.add_(-grad.mean(dim = tuple(range(1,len(list(grad.size())))), keepdim = True))\n\n                state['step'] += 1\n\n                # if group['weight_decay'] != 0:\n                #     grad = grad.add(group['weight_decay'], p.data)\n\n                # Decay the first and second moment running average coefficient\n                exp_avg.mul_(beta1).add_(1 - beta1, grad)\n                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)\n                if amsgrad:\n                    # Maintains the maximum of all 2nd moment running avg. till now\n                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)\n                    # Use the max. for normalizing running avg. 
of gradient\n                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])\n                else:\n                    denom = exp_avg_sq.sqrt().add_(group['eps'])\n\n                bias_correction1 = 1 - beta1 ** state['step']\n                bias_correction2 = 1 - beta2 ** state['step']\n                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1\n\n                # p.data.addcdiv_(-step_size, exp_avg, denom)\n                p.data.add_(-step_size,  torch.mul(p.data, group['weight_decay']).addcdiv_(1, exp_avg, denom) )\n\n        return loss\n\nclass AdamW_GCC2(Optimizer):\n    \"\"\"Implements Adam algorithm.\n    It has been proposed in `Adam: A Method for Stochastic Optimization`_.\n    Arguments:\n        params (iterable): iterable of parameters to optimize or dicts defining\n            parameter groups\n        lr (float, optional): learning rate (default: 1e-3)\n        betas (Tuple[float, float], optional): coefficients used for computing\n            running averages of gradient and its square (default: (0.9, 0.999))\n        eps (float, optional): term added to the denominator to improve\n            numerical stability (default: 1e-8)\n        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)\n        amsgrad (boolean, optional): whether to use the AMSGrad variant of this\n            algorithm from the paper `On the Convergence of Adam and Beyond`_\n    .. _Adam\\: A Method for Stochastic Optimization:\n        https://arxiv.org/abs/1412.6980\n    .. _On the Convergence of Adam and Beyond:\n        https://openreview.net/forum?id=ryQu7f-RZ\n    \"\"\"\n\n    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,\n                 weight_decay=0, amsgrad=False):\n        if not 0.0 <= lr:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if not 0.0 <= eps:\n            raise ValueError(\"Invalid epsilon value: {}\".format(eps))\n        if not 0.0 <= betas[0] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 0: {}\".format(betas[0]))\n        if not 0.0 <= betas[1] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 1: {}\".format(betas[1]))\n        defaults = dict(lr=lr, betas=betas, eps=eps,\n                        weight_decay=weight_decay, amsgrad=amsgrad)\n        super(AdamW_GCC2, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(AdamW_GCC2, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('amsgrad', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                grad = p.grad.data\n                if grad.is_sparse:\n                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')\n                amsgrad = group['amsgrad']\n\n                state = self.state[p]\n\n                # State initialization\n                if len(state) == 0:\n                    state['step'] = 0\n                    # Exponential moving average of gradient values\n                  
  state['exp_avg'] = torch.zeros_like(p.data)\n                    # Exponential moving average of squared gradient values\n                    state['exp_avg_sq'] = torch.zeros_like(p.data)\n                    if amsgrad:\n                        # Maintains max of all exp. moving avg. of sq. grad. values\n                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)\n\n                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']\n                if amsgrad:\n                    max_exp_avg_sq = state['max_exp_avg_sq']\n                beta1, beta2 = group['betas']\n\n                state['step'] += 1\n\n                # if group['weight_decay'] != 0:\n                #     grad = grad.add(group['weight_decay'], p.data)\n\n                # Decay the first and second moment running average coefficient\n                exp_avg.mul_(beta1).add_(1 - beta1, grad)\n                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)\n                if amsgrad:\n                    # Maintains the maximum of all 2nd moment running avg. till now\n                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)\n                    # Use the max. for normalizing running avg. of gradient\n                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])\n                else:\n                    denom = exp_avg_sq.sqrt().add_(group['eps'])\n\n                bias_correction1 = 1 - beta1 ** state['step']\n                bias_correction2 = 1 - beta2 ** state['step']\n                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1\n\n                # GC operation for Conv layers\n                if len(list(grad.size()))>3:\n                  delta=(step_size*torch.mul(p.data, group['weight_decay']).addcdiv_(1, exp_avg, denom)).clone()\n                  delta.add_(-delta.mean(dim = tuple(range(1,len(list(grad.size())))), keepdim = True))\n                  p.data.add_(-delta)\n                else:\n                  p.data.add_(-step_size,  torch.mul(p.data, group['weight_decay']).addcdiv_(1, exp_avg, denom) )\n                             \n        return loss\n\nclass AdamW_GC2(Optimizer):\n    \"\"\"Implements Adam algorithm.\n    It has been proposed in `Adam: A Method for Stochastic Optimization`_.\n    Arguments:\n        params (iterable): iterable of parameters to optimize or dicts defining\n            parameter groups\n        lr (float, optional): learning rate (default: 1e-3)\n        betas (Tuple[float, float], optional): coefficients used for computing\n            running averages of gradient and its square (default: (0.9, 0.999))\n        eps (float, optional): term added to the denominator to improve\n            numerical stability (default: 1e-8)\n        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)\n        amsgrad (boolean, optional): whether to use the AMSGrad variant of this\n            algorithm from the paper `On the Convergence of Adam and Beyond`_\n    .. _Adam\\: A Method for Stochastic Optimization:\n        https://arxiv.org/abs/1412.6980\n    .. 
_On the Convergence of Adam and Beyond:\n        https://openreview.net/forum?id=ryQu7f-RZ\n    \"\"\"\n\n    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,\n                 weight_decay=0, amsgrad=False):\n        if not 0.0 <= lr:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if not 0.0 <= eps:\n            raise ValueError(\"Invalid epsilon value: {}\".format(eps))\n        if not 0.0 <= betas[0] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 0: {}\".format(betas[0]))\n        if not 0.0 <= betas[1] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 1: {}\".format(betas[1]))\n        defaults = dict(lr=lr, betas=betas, eps=eps,\n                        weight_decay=weight_decay, amsgrad=amsgrad)\n        super(AdamW_GC2, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(AdamW_GC2, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('amsgrad', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                grad = p.grad.data\n                if grad.is_sparse:\n                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')\n                amsgrad = group['amsgrad']\n\n                state = self.state[p]\n\n                # State initialization\n                if len(state) == 0:\n                    state['step'] = 0\n                    # Exponential moving average of gradient values\n                    state['exp_avg'] = torch.zeros_like(p.data)\n                    # Exponential moving average of squared gradient values\n                    state['exp_avg_sq'] = torch.zeros_like(p.data)\n                    if amsgrad:\n                        # Maintains max of all exp. moving avg. of sq. grad. values\n                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)\n\n                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']\n                if amsgrad:\n                    max_exp_avg_sq = state['max_exp_avg_sq']\n                beta1, beta2 = group['betas']\n\n                state['step'] += 1\n\n                # if group['weight_decay'] != 0:\n                #     grad = grad.add(group['weight_decay'], p.data)\n\n                # Decay the first and second moment running average coefficient\n                exp_avg.mul_(beta1).add_(1 - beta1, grad)\n                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)\n                if amsgrad:\n                    # Maintains the maximum of all 2nd moment running avg. till now\n                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)\n                    # Use the max. for normalizing running avg. 
of gradient\n                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])\n                else:\n                    denom = exp_avg_sq.sqrt().add_(group['eps'])\n\n                bias_correction1 = 1 - beta1 ** state['step']\n                bias_correction2 = 1 - beta2 ** state['step']\n                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1\n\n                # GC operation for Conv and FC layers\n                if len(list(grad.size()))>1:\n                  delta=(step_size*torch.mul(p.data, group['weight_decay']).addcdiv_(1, exp_avg, denom)).clone()\n                  delta.add_(-delta.mean(dim = tuple(range(1,len(list(grad.size())))), keepdim = True))\n                  p.data.add_(-delta)\n                else:\n                  p.data.add_(-step_size,  torch.mul(p.data, group['weight_decay']).addcdiv_(1, exp_avg, denom) )\n                \n               \n        return loss\n"
  },
  {
    "path": "GC_code/CIFAR100/algorithm/SGD.py",
    "content": "import torch\nfrom torch.optim.optimizer import Optimizer, required\n\n\n\nclass SGD_GCC(Optimizer):\n\n    def __init__(self, params, lr=required, momentum=0, dampening=0,\n                 weight_decay=0, nesterov=False):\n        if lr is not required and lr < 0.0:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if momentum < 0.0:\n            raise ValueError(\"Invalid momentum value: {}\".format(momentum))\n        if weight_decay < 0.0:\n            raise ValueError(\"Invalid weight_decay value: {}\".format(weight_decay))\n\n        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,\n                        weight_decay=weight_decay, nesterov=nesterov)\n        if nesterov and (momentum <= 0 or dampening != 0):\n            raise ValueError(\"Nesterov momentum requires a momentum and zero dampening\")\n        super(SGD_GCC, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(SGD_GCC, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('nesterov', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            weight_decay = group['weight_decay']\n            momentum = group['momentum']\n            dampening = group['dampening']\n            nesterov = group['nesterov']\n\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                d_p = p.grad.data\n\n                if weight_decay != 0:\n                    d_p.add_(weight_decay, p.data)\n\n                #GC operation for Conv layers\n                if len(list(d_p.size()))>3:\n                   d_p.add_(-d_p.mean(dim = tuple(range(1,len(list(d_p.size())))), keepdim = True))\n                   \n                if momentum != 0:\n                    param_state = self.state[p]\n                    if 'momentum_buffer' not in param_state:\n                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()\n                    else:\n                        buf = param_state['momentum_buffer']\n                        buf.mul_(momentum).add_(1 - dampening, d_p)\n                    if nesterov:\n                        d_p = d_p.add(momentum, buf)\n                    else:\n                        d_p = buf\n\n                p.data.add_(-group['lr'], d_p)\n\n        return loss\n\nclass SGD_GC(Optimizer):\n\n    def __init__(self, params, lr=required, momentum=0, dampening=0,\n                 weight_decay=0, nesterov=False):\n        if lr is not required and lr < 0.0:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if momentum < 0.0:\n            raise ValueError(\"Invalid momentum value: {}\".format(momentum))\n        if weight_decay < 0.0:\n            raise ValueError(\"Invalid weight_decay value: {}\".format(weight_decay))\n\n        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,\n                        weight_decay=weight_decay, nesterov=nesterov)\n        if nesterov and (momentum <= 0 or dampening != 0):\n            raise ValueError(\"Nesterov momentum requires a momentum and zero dampening\")\n        super(SGD_GC, 
self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(SGD_GC, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('nesterov', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            weight_decay = group['weight_decay']\n            momentum = group['momentum']\n            dampening = group['dampening']\n            nesterov = group['nesterov']\n\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                d_p = p.grad.data\n\n                if weight_decay != 0:\n                    d_p.add_(weight_decay, p.data)\n\n                #GC operation for Conv layers and FC layers\n                if len(list(d_p.size()))>1:\n                   d_p.add_(-d_p.mean(dim = tuple(range(1,len(list(d_p.size())))), keepdim = True))\n\n                if momentum != 0:\n                    param_state = self.state[p]\n                    if 'momentum_buffer' not in param_state:\n                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()\n                    else:\n                        buf = param_state['momentum_buffer']\n                        buf.mul_(momentum).add_(1 - dampening, d_p)\n                    if nesterov:\n                        d_p = d_p.add(momentum, buf)\n                    else:\n                        d_p = buf\n\n                p.data.add_(-group['lr'], d_p)\n\n        return loss\n\n\nclass SGDW(Optimizer):\n\n    def __init__(self, params, lr=required, momentum=0, dampening=0,\n                 weight_decay=0, nesterov=False):\n        if lr is not required and lr < 0.0:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if momentum < 0.0:\n            raise ValueError(\"Invalid momentum value: {}\".format(momentum))\n        if weight_decay < 0.0:\n            raise ValueError(\"Invalid weight_decay value: {}\".format(weight_decay))\n\n        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,\n                        weight_decay=weight_decay, nesterov=nesterov)\n        if nesterov and (momentum <= 0 or dampening != 0):\n            raise ValueError(\"Nesterov momentum requires a momentum and zero dampening\")\n        super(SGDW, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(SGDW, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('nesterov', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            weight_decay = group['weight_decay']\n            momentum = group['momentum']\n            dampening = group['dampening']\n            nesterov = group['nesterov']\n\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                d_p = p.grad.data\n\n                
old = torch.clone(p.data).detach()\n                #if weight_decay != 0:\n                #    d_p.add_(weight_decay, p.data)\n                if momentum != 0:\n                    param_state = self.state[p]\n                    if 'momentum_buffer' not in param_state:\n                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)\n                        buf.mul_(momentum).add_(d_p)\n                    else:\n                        buf = param_state['momentum_buffer']\n                        buf.mul_(momentum).add_(1 - dampening, d_p)\n                    if nesterov:\n                        d_p = d_p.add(momentum, buf)\n                    else:\n                        d_p = buf\n\n                p.data.add_(-group['lr'], d_p)\n\n                if weight_decay != 0:\n                    p.data.add_(-weight_decay*group['lr'], old)\n\n        return loss\n\n\n\nclass SGDW_GCC(Optimizer):\n    def __init__(self, params, lr=required, momentum=0, dampening=0,\n                 weight_decay=0, nesterov=False):\n        if lr is not required and lr < 0.0:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if momentum < 0.0:\n            raise ValueError(\"Invalid momentum value: {}\".format(momentum))\n        if weight_decay < 0.0:\n            raise ValueError(\"Invalid weight_decay value: {}\".format(weight_decay))\n\n        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,\n                        weight_decay=weight_decay, nesterov=nesterov)\n        if nesterov and (momentum <= 0 or dampening != 0):\n            raise ValueError(\"Nesterov momentum requires a momentum and zero dampening\")\n        super(SGDW_GCC, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(SGDW_GCC, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('nesterov', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            weight_decay = group['weight_decay']\n            momentum = group['momentum']\n            dampening = group['dampening']\n            nesterov = group['nesterov']\n\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                d_p = p.grad.data\n\n                old = torch.clone(p.data).detach()\n                #if weight_decay != 0:\n                #    d_p.add_(weight_decay, p.data)\n                \n                #GC operation for Conv layers\n                if len(list(d_p.size()))>3:\n                   d_p.add_(-d_p.mean(dim = tuple(range(1,len(list(d_p.size())))), keepdim = True))\n\n\n                if momentum != 0:\n                    param_state = self.state[p]\n                    if 'momentum_buffer' not in param_state:\n                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)\n                        buf.mul_(momentum).add_(d_p)\n                    else:\n                        buf = param_state['momentum_buffer']\n                        buf.mul_(momentum).add_(1 - dampening, d_p)\n                    if nesterov:\n                        d_p = d_p.add(momentum, buf)\n                    else:\n   
                     d_p = buf\n\n                p.data.add_(-group['lr'], d_p)\n\n                if weight_decay != 0:\n                    p.data.add_(-weight_decay*group['lr'], old)\n\n        return loss\n\n    \nclass SGDW_GC(Optimizer):\n    def __init__(self, params, lr=required, momentum=0, dampening=0,\n                 weight_decay=0, nesterov=False):\n        if lr is not required and lr < 0.0:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if momentum < 0.0:\n            raise ValueError(\"Invalid momentum value: {}\".format(momentum))\n        if weight_decay < 0.0:\n            raise ValueError(\"Invalid weight_decay value: {}\".format(weight_decay))\n\n        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,\n                        weight_decay=weight_decay, nesterov=nesterov)\n        if nesterov and (momentum <= 0 or dampening != 0):\n            raise ValueError(\"Nesterov momentum requires a momentum and zero dampening\")\n        super(SGDW_GC, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(SGDW_GC, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('nesterov', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            weight_decay = group['weight_decay']\n            momentum = group['momentum']\n            dampening = group['dampening']\n            nesterov = group['nesterov']\n\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                d_p = p.grad.data\n\n                old = torch.clone(p.data).detach()\n                #if weight_decay != 0:\n                #    d_p.add_(weight_decay, p.data)\n                \n                #GC operation for Conv and FC layers\n                if len(list(d_p.size()))>1:\n                   d_p.add_(-d_p.mean(dim = tuple(range(1,len(list(d_p.size())))), keepdim = True))\n\n\n                if momentum != 0:\n                    param_state = self.state[p]\n                    if 'momentum_buffer' not in param_state:\n                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)\n                        buf.mul_(momentum).add_(d_p)\n                    else:\n                        buf = param_state['momentum_buffer']\n                        buf.mul_(momentum).add_(1 - dampening, d_p)\n                    if nesterov:\n                        d_p = d_p.add(momentum, buf)\n                    else:\n                        d_p = buf\n\n                p.data.add_(-group['lr'], d_p)\n\n                if weight_decay != 0:\n                    p.data.add_(-weight_decay*group['lr'], old)\n\n        return loss\n"
  },
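  {
    "path": "GC_code/CIFAR100/algorithm/gc_demo.py",
    "content": "'''Illustrative sketch, not part of the original repo (file name and path are hypothetical): a standalone check of the GC operation used by SGD_GC / SGD_GCC above.\n\nGradient Centralization removes, from each slice of the gradient along dim 0 (one slice per output channel or neuron), the mean over all remaining dimensions; for a flattened slice g this is the projection (I - ee^T/M) g onto the zero-mean hyperplane, where M is the slice size. This demo applies the same expression as the optimizers and checks that every per-slice mean is numerically zero.'''\nimport torch\n\n\ndef centralize(grad):\n    # Same expression as the GC line inside SGD_GC: only tensors with more\n    # than one dimension (Conv and FC weights) are centralized.\n    if len(list(grad.size())) > 1:\n        grad = grad - grad.mean(dim=tuple(range(1, len(list(grad.size())))), keepdim=True)\n    return grad\n\n\nif __name__ == '__main__':\n    conv_grad = torch.randn(64, 3, 3, 3)  # conv weight gradient: (out, in, kH, kW)\n    fc_grad = torch.randn(100, 512)       # FC weight gradient: (out, in)\n    bias_grad = torch.randn(64)           # 1-D tensors are left untouched by GC\n\n    for g in (conv_grad, fc_grad):\n        per_slice_mean = centralize(g).view(g.size(0), -1).mean(dim=1)\n        assert per_slice_mean.abs().max().item() < 1e-6  # each output slice now has zero mean\n\n    assert torch.equal(centralize(bias_grad), bias_grad)\n    print('GC check passed: every output slice has zero mean after centralization.')\n"
  },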
  {
    "path": "GC_code/CIFAR100/main.py",
    "content": "'''Train CIFAR100 with PyTorch.'''\nfrom __future__ import print_function\n\nimport torch\nimport torch.nn as nn\nimport torch.backends.cudnn as cudnn\n\n\nimport torch.optim as optim\nimport torch.nn.functional as F\n\nimport torchvision\nimport torchvision.transforms as transforms\n\n\nfrom torch.optim import lr_scheduler\nimport os\nimport argparse\nfrom torchvision import datasets, models\nfrom models import *\n#from utils import progress_bar\nimport numpy as np\n\n#import optimizers with GC\nfrom algorithm.SGD import *\nfrom algorithm.Adam import *\nfrom algorithm.Adagrad import *\n\n\nparser = argparse.ArgumentParser(description='PyTorch CIFAR100 Training')\nparser.add_argument('--lr', default=0.1, type=float, help='learning rate')\nparser.add_argument('--resume', '-r', action='store_true', help='resume from checkpoint')\nparser.add_argument('--bs', default=128, type=int, help='batchsize')\nparser.add_argument('--wd', default=0.0005, type=float, help='weight decay')\nparser.add_argument('--alg', default='sgd', type=str, help='algorithm')\nparser.add_argument('--epochs', default=200, type=int, help='epochs')\nparser.add_argument('--path', default='logout/result', type=str, help='path')\nparser.add_argument('--model', default='r50', type=str, help='model')\n\n\nargs = parser.parse_args()\nos.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n\n\nepochs=args.epochs\ndevice = 'cuda' if torch.cuda.is_available() else 'cpu'\nbest_acc = 0  # best test accuracy\nstart_epoch = 0  # start from epoch 0 or last checkpoint epoch\n\n\n\n# Data\nprint('==> Preparing data..')\ntransform_train = transforms.Compose([\n    transforms.RandomCrop(32, padding=4),\n    transforms.RandomHorizontalFlip(),\n    transforms.ToTensor(),\n    transforms.Normalize((0.507, 0.487, 0.441), (0.267, 0.256, 0.276)),\n  ])\ntransform_test = transforms.Compose([\n    transforms.ToTensor(),\n    transforms.Normalize((0.507, 0.487, 0.441), (0.267, 0.256, 0.276)),\n  ])\ntrainset = torchvision.datasets.CIFAR100(root='/home/yonghw/data/cifar100/', train=True, download=True, transform=transform_train)\ntrainloader = torch.utils.data.DataLoader(trainset, batch_size=args.bs, shuffle=True, num_workers=4,drop_last=True)\ntestset = torchvision.datasets.CIFAR100(root='/home/yonghw/data/cifar100/', train=False, download=True, transform=transform_test)\ntestloader = torch.utils.data.DataLoader(testset, batch_size=128, shuffle=False, num_workers=4)\n\n\n\n\n# Model\nprint('==> Building model..')\n\nNum_classes = 100\n\nif args.model=='r18':\n    net = ResNet18(Num_classes=Num_classes)\nif args.model=='r34':\n    net = ResNet34(Num_classes=Num_classes)\nif args.model=='r50':\n    net = ResNet50(Num_classes=Num_classes)\nif args.model=='r101':\n    net = ResNet101(Num_classes=Num_classes)\nif args.model=='v11':\n    net = VGG('VGG11',Num_classes=Num_classes)\nif args.model=='rx29':\n    net = ResNeXt29_4x64d(Num_classes=Num_classes)\nif args.model=='d121':\n    net = DenseNet121(Num_classes=Num_classes)\n\nif device == 'cuda':\n    net = net.cuda()\n    net = torch.nn.DataParallel(net)\n    cudnn.benchmark = True\n\n\nif args.resume:\n    # Load checkpoint.\n    print('==> Resuming from checkpoint..')\n    assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!'\n    checkpoint = torch.load('./checkpoint/ckpt.t7')\n    net.load_state_dict(checkpoint['net'])\n    best_acc = checkpoint['acc']\n    start_epoch = checkpoint['epoch']\n    \ncriterion = nn.CrossEntropyLoss()\n\n#optimizer\nWD=args.wd\nprint('==> choose 
optimizer..')\nif args.alg=='sgd':\n    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD)\nif args.alg=='sgdGC':\n    optimizer = SGD_GC(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD)\nif args.alg=='sgdGCC':\n    optimizer = SGD_GCC(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD)\n    \nif args.alg=='adam':\n    optimizer = optim.Adam(net.parameters(), lr=args.lr*0.01, weight_decay = WD)\nif args.alg=='adamGC':\n    optimizer = Adam_GC(net.parameters(), lr=args.lr*0.01, weight_decay = WD)\nif args.alg=='adamGCC':\n    optimizer = Adam_GCC(net.parameters(), lr=args.lr*0.01, weight_decay = WD)\nif args.alg=='adamGC2':\n    optimizer = Adam_GC2(net.parameters(), lr=args.lr*0.01, weight_decay = WD)\nif args.alg=='adamGCC2':\n    optimizer = Adam_GCC2(net.parameters(), lr=args.lr*0.01, weight_decay = WD) \n\n\nif args.alg=='adagrad':\n    optimizer = optim.Adagrad(net.parameters(), lr=args.lr*0.1,weight_decay = WD)\nif args.alg=='adagradGC':\n    optimizer = Adagrad_GC(net.parameters(), lr=args.lr*0.1,weight_decay = WD)\nif args.alg=='adagradGCC':\n    optimizer = Adagrad_GCC(net.parameters(), lr=args.lr*0.1,weight_decay = WD)\nif args.alg=='adagradGC2':\n    optimizer = Adagrad_GC2(net.parameters(), lr=args.lr*0.1,weight_decay = WD)\nif args.alg=='adagradGCC2':\n    optimizer = Adagrad_GCC2(net.parameters(), lr=args.lr*0.1,weight_decay = WD)\n    \n    \nif args.alg=='sgdW':\n    optimizer = SGDW(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD)\nif args.alg=='sgdWGC':\n    optimizer = SGDW_GC(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD)\nif args.alg=='sgdWGCC':\n    optimizer = SGDW_GCC(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD)\n    \nif args.alg=='adamW':\n    optimizer = AdamW(net.parameters(), lr=args.lr*0.01, weight_decay = WD)\nif args.alg=='adamWGC':\n    optimizer = Adam_GC(net.parameters(), lr=args.lr*0.01, weight_decay = WD)\nif args.alg=='adamWGCC':\n    optimizer = Adam_GCC(net.parameters(), lr=args.lr*0.01, weight_decay = WD)\nif args.alg=='adamWGC2':\n    optimizer = Adam_GC2(net.parameters(), lr=args.lr*0.01, weight_decay = WD)\nif args.alg=='adamWGCC2':\n    optimizer = Adam_GCC2(net.parameters(), lr=args.lr*0.01, weight_decay = WD)\n    \n    \nexp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=60, gamma=0.1)\n\n# Training\ndef train(epoch,net,optimizer):\n    print('\\nEpoch: %d' % epoch)\n    net.train()\n    train_loss = 0\n    correct = 0\n    total = 0\n    for batch_idx, (inputs, targets) in enumerate(trainloader):\n        inputs, targets = inputs.to(device), targets.to(device)\n        optimizer.zero_grad()\n        outputs = net(inputs)\n        loss = criterion(outputs, targets)\n        loss.backward()\n        optimizer.step()\n\n        train_loss += loss.item()\n        _, predicted = outputs.max(1)\n        total += targets.size(0)\n        correct += predicted.eq(targets).sum().item()\n    print('Training: Loss: {:.4f} | Acc: {:.4f}'.format(train_loss/(batch_idx+1),correct/total))\n    #        progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'\n    #            % (train_loss/(batch_idx+1), 100.*correct/total, correct, total))\n    acc=100.*correct/total\n    return acc\n    \n# Testing\ndef test(epoch,net):\n    global best_acc\n    net.eval()\n    test_loss = 0\n    correct = 0\n    total = 0\n    with torch.no_grad():\n      for batch_idx, (inputs, targets) in enumerate(testloader):\n            inputs, targets = 
inputs.to(device), targets.to(device)\n            outputs = net(inputs)\n            loss = criterion(outputs, targets)\n\n            test_loss += loss.item()\n            _, predicted = outputs.max(1)\n            total += targets.size(0)\n            correct += predicted.eq(targets).sum().item()\n\n            #progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'\n                #% (test_loss/(batch_idx+1), 100.*correct/total, correct, total))\n    print('Testing: Loss: {:.4f} | Acc: {:.4f}'.format(test_loss/(batch_idx+1),correct/total))\n\n    # Save checkpoint.\n    acc = 100.*correct/total\n    if acc > best_acc:\n        print('Saving..')\n        state = {\n            'net': net.state_dict(),\n            'acc': acc,\n            'epoch': epoch,\n        }\n        if not os.path.isdir('checkpoint'):\n            os.mkdir('checkpoint')\n        torch.save(state, './checkpoint/ckpt.t7')\n        best_acc = acc\n    return acc\n\n\nfor epoch in range(start_epoch, start_epoch+epochs):\n    train_acc=train(epoch,net,optimizer)\n    exp_lr_scheduler.step()\n    val_acc=test(epoch,net)\n\n"
  },
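  {
    "path": "GC_code/CIFAR100/usage_demo.py",
    "content": "'''Illustrative sketch, not part of the original repo (file name, model and data are made up for demonstration): SGD_GC is a drop-in replacement for torch.optim.SGD, which is how main.py selects it via --alg sgdGC. This runs a single training step on a toy convnet; run it from the CIFAR100 directory so the algorithm package is importable.'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom algorithm.SGD import SGD_GC\n\nnet = nn.Sequential(\n    nn.Conv2d(3, 8, kernel_size=3, padding=1),\n    nn.ReLU(),\n    nn.AdaptiveAvgPool2d(1),\n    nn.Flatten(),\n    nn.Linear(8, 10),\n)\n# Same constructor arguments as optim.SGD in main.py.\noptimizer = SGD_GC(net.parameters(), lr=0.1, momentum=0.9, weight_decay=0.0005)\n\nx = torch.randn(4, 3, 32, 32)\ny = torch.randint(0, 10, (4,))\n\noptimizer.zero_grad()\nloss = F.cross_entropy(net(x), y)\nloss.backward()\noptimizer.step()  # gradients are centralized inside step() before the parameter update\nprint('one SGD_GC step done, loss = {:.4f}'.format(loss.item()))\n"
  },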
  {
    "path": "GC_code/CIFAR100/models/__init__.py",
    "content": "from .vgg import *\nfrom .dpn import *\nfrom .lenet import *\nfrom .senet import *\nfrom .pnasnet import *\nfrom .densenet import *\nfrom .googlenet import *\nfrom .shufflenet import *\nfrom .resnet import *\nfrom .resnext import *\nfrom .preact_resnet import *\nfrom .mobilenet import *\nfrom .mobilenetv2 import *\n"
  },
  {
    "path": "GC_code/CIFAR100/models/densenet.py",
    "content": "'''DenseNet in PyTorch.'''\nimport math\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Bottleneck(nn.Module):\n    def __init__(self, in_planes, growth_rate):\n        super(Bottleneck, self).__init__()\n        self.bn1 = nn.BatchNorm2d(in_planes)\n        self.conv1 = nn.Conv2d(in_planes, 4*growth_rate, kernel_size=1, bias=False)\n        self.bn2 = nn.BatchNorm2d(4*growth_rate)\n        self.conv2 = nn.Conv2d(4*growth_rate, growth_rate, kernel_size=3, padding=1, bias=False)\n\n    def forward(self, x):\n        out = self.conv1(F.relu(self.bn1(x)))\n        out = self.conv2(F.relu(self.bn2(out)))\n        out = torch.cat([out,x], 1)\n        return out\n\n\nclass Transition(nn.Module):\n    def __init__(self, in_planes, out_planes):\n        super(Transition, self).__init__()\n        self.bn = nn.BatchNorm2d(in_planes)\n        self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=False)\n\n    def forward(self, x):\n        out = self.conv(F.relu(self.bn(x)))\n        out = F.avg_pool2d(out, 2)\n        return out\n\n\nclass DenseNet(nn.Module):\n    def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10):\n        super(DenseNet, self).__init__()\n        self.growth_rate = growth_rate\n\n        num_planes = 2*growth_rate\n        self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, padding=1, bias=False)\n\n        self.dense1 = self._make_dense_layers(block, num_planes, nblocks[0])\n        num_planes += nblocks[0]*growth_rate\n        out_planes = int(math.floor(num_planes*reduction))\n        self.trans1 = Transition(num_planes, out_planes)\n        num_planes = out_planes\n\n        self.dense2 = self._make_dense_layers(block, num_planes, nblocks[1])\n        num_planes += nblocks[1]*growth_rate\n        out_planes = int(math.floor(num_planes*reduction))\n        self.trans2 = Transition(num_planes, out_planes)\n        num_planes = out_planes\n\n        self.dense3 = self._make_dense_layers(block, num_planes, nblocks[2])\n        num_planes += nblocks[2]*growth_rate\n        out_planes = int(math.floor(num_planes*reduction))\n        self.trans3 = Transition(num_planes, out_planes)\n        num_planes = out_planes\n\n        self.dense4 = self._make_dense_layers(block, num_planes, nblocks[3])\n        num_planes += nblocks[3]*growth_rate\n\n        self.bn = nn.BatchNorm2d(num_planes)\n        self.linear = nn.Linear(num_planes, num_classes)\n\n    def _make_dense_layers(self, block, in_planes, nblock):\n        layers = []\n        for i in range(nblock):\n            layers.append(block(in_planes, self.growth_rate))\n            in_planes += self.growth_rate\n        return nn.Sequential(*layers)\n\n    def forward(self, x):\n        out = self.conv1(x)\n        out = self.trans1(self.dense1(out))\n        out = self.trans2(self.dense2(out))\n        out = self.trans3(self.dense3(out))\n        out = self.dense4(out)\n        out = F.avg_pool2d(F.relu(self.bn(out)), 4)\n        out = out.view(out.size(0), -1)\n        out = self.linear(out)\n        return out\n\ndef DenseNet121(Num_classes=10):\n    return DenseNet(Bottleneck, [6,12,24,16], growth_rate=32, num_classes=Num_classes)\n\ndef DenseNet169(Num_classes=10):\n    return DenseNet(Bottleneck, [6,12,32,32], growth_rate=32, num_classes=Num_classes)\n\ndef DenseNet201(Num_classes=10):\n    return DenseNet(Bottleneck, [6,12,48,32], growth_rate=32, num_classes=Num_classes)\n\ndef DenseNet161(Num_classes=10):\n    return 
DenseNet(Bottleneck, [6,12,36,24], growth_rate=48, num_classes=Num_classes)\n\ndef densenet_cifar(Num_classes=10):\n    return DenseNet(Bottleneck, [6,12,24,16], growth_rate=12, num_classes=Num_classes)\n\ndef test():\n    net = densenet_cifar()\n    x = torch.randn(1,3,32,32)\n    y = net(x)\n    print(y)\n\n# test()\n"
  },
  {
    "path": "GC_code/CIFAR100/models/dpn.py",
    "content": "'''Dual Path Networks in PyTorch.'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Bottleneck(nn.Module):\n    def __init__(self, last_planes, in_planes, out_planes, dense_depth, stride, first_layer):\n        super(Bottleneck, self).__init__()\n        self.out_planes = out_planes\n        self.dense_depth = dense_depth\n\n        self.conv1 = nn.Conv2d(last_planes, in_planes, kernel_size=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(in_planes)\n        self.conv2 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=32, bias=False)\n        self.bn2 = nn.BatchNorm2d(in_planes)\n        self.conv3 = nn.Conv2d(in_planes, out_planes+dense_depth, kernel_size=1, bias=False)\n        self.bn3 = nn.BatchNorm2d(out_planes+dense_depth)\n\n        self.shortcut = nn.Sequential()\n        if first_layer:\n            self.shortcut = nn.Sequential(\n                nn.Conv2d(last_planes, out_planes+dense_depth, kernel_size=1, stride=stride, bias=False),\n                nn.BatchNorm2d(out_planes+dense_depth)\n            )\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = F.relu(self.bn2(self.conv2(out)))\n        out = self.bn3(self.conv3(out))\n        x = self.shortcut(x)\n        d = self.out_planes\n        out = torch.cat([x[:,:d,:,:]+out[:,:d,:,:], x[:,d:,:,:], out[:,d:,:,:]], 1)\n        out = F.relu(out)\n        return out\n\n\nclass DPN(nn.Module):\n    def __init__(self, cfg):\n        super(DPN, self).__init__()\n        in_planes, out_planes = cfg['in_planes'], cfg['out_planes']\n        num_blocks, dense_depth = cfg['num_blocks'], cfg['dense_depth']\n\n        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(64)\n        self.last_planes = 64\n        self.layer1 = self._make_layer(in_planes[0], out_planes[0], num_blocks[0], dense_depth[0], stride=1)\n        self.layer2 = self._make_layer(in_planes[1], out_planes[1], num_blocks[1], dense_depth[1], stride=2)\n        self.layer3 = self._make_layer(in_planes[2], out_planes[2], num_blocks[2], dense_depth[2], stride=2)\n        self.layer4 = self._make_layer(in_planes[3], out_planes[3], num_blocks[3], dense_depth[3], stride=2)\n        self.linear = nn.Linear(out_planes[3]+(num_blocks[3]+1)*dense_depth[3], 10)\n\n    def _make_layer(self, in_planes, out_planes, num_blocks, dense_depth, stride):\n        strides = [stride] + [1]*(num_blocks-1)\n        layers = []\n        for i,stride in enumerate(strides):\n            layers.append(Bottleneck(self.last_planes, in_planes, out_planes, dense_depth, stride, i==0))\n            self.last_planes = out_planes + (i+2) * dense_depth\n        return nn.Sequential(*layers)\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = self.layer1(out)\n        out = self.layer2(out)\n        out = self.layer3(out)\n        out = self.layer4(out)\n        out = F.avg_pool2d(out, 4)\n        out = out.view(out.size(0), -1)\n        out = self.linear(out)\n        return out\n\n\ndef DPN26():\n    cfg = {\n        'in_planes': (96,192,384,768),\n        'out_planes': (256,512,1024,2048),\n        'num_blocks': (2,2,2,2),\n        'dense_depth': (16,32,24,128)\n    }\n    return DPN(cfg)\n\ndef DPN92():\n    cfg = {\n        'in_planes': (96,192,384,768),\n        'out_planes': (256,512,1024,2048),\n        'num_blocks': (3,4,20,3),\n        'dense_depth': (16,32,24,128)\n    }\n    
return DPN(cfg)\n\n\ndef test():\n    net = DPN92()\n    x = torch.randn(1,3,32,32)\n    y = net(x)\n    print(y)\n\n# test()\n"
  },
  {
    "path": "GC_code/CIFAR100/models/googlenet.py",
    "content": "'''GoogLeNet with PyTorch.'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Inception(nn.Module):\n    def __init__(self, in_planes, n1x1, n3x3red, n3x3, n5x5red, n5x5, pool_planes):\n        super(Inception, self).__init__()\n        # 1x1 conv branch\n        self.b1 = nn.Sequential(\n            nn.Conv2d(in_planes, n1x1, kernel_size=1),\n            nn.BatchNorm2d(n1x1),\n            nn.ReLU(True),\n        )\n\n        # 1x1 conv -> 3x3 conv branch\n        self.b2 = nn.Sequential(\n            nn.Conv2d(in_planes, n3x3red, kernel_size=1),\n            nn.BatchNorm2d(n3x3red),\n            nn.ReLU(True),\n            nn.Conv2d(n3x3red, n3x3, kernel_size=3, padding=1),\n            nn.BatchNorm2d(n3x3),\n            nn.ReLU(True),\n        )\n\n        # 1x1 conv -> 5x5 conv branch\n        self.b3 = nn.Sequential(\n            nn.Conv2d(in_planes, n5x5red, kernel_size=1),\n            nn.BatchNorm2d(n5x5red),\n            nn.ReLU(True),\n            nn.Conv2d(n5x5red, n5x5, kernel_size=3, padding=1),\n            nn.BatchNorm2d(n5x5),\n            nn.ReLU(True),\n            nn.Conv2d(n5x5, n5x5, kernel_size=3, padding=1),\n            nn.BatchNorm2d(n5x5),\n            nn.ReLU(True),\n        )\n\n        # 3x3 pool -> 1x1 conv branch\n        self.b4 = nn.Sequential(\n            nn.MaxPool2d(3, stride=1, padding=1),\n            nn.Conv2d(in_planes, pool_planes, kernel_size=1),\n            nn.BatchNorm2d(pool_planes),\n            nn.ReLU(True),\n        )\n\n    def forward(self, x):\n        y1 = self.b1(x)\n        y2 = self.b2(x)\n        y3 = self.b3(x)\n        y4 = self.b4(x)\n        return torch.cat([y1,y2,y3,y4], 1)\n\n\nclass GoogLeNet(nn.Module):\n    def __init__(self):\n        super(GoogLeNet, self).__init__()\n        self.pre_layers = nn.Sequential(\n            nn.Conv2d(3, 192, kernel_size=3, padding=1),\n            nn.BatchNorm2d(192),\n            nn.ReLU(True),\n        )\n\n        self.a3 = Inception(192,  64,  96, 128, 16, 32, 32)\n        self.b3 = Inception(256, 128, 128, 192, 32, 96, 64)\n\n        self.maxpool = nn.MaxPool2d(3, stride=2, padding=1)\n\n        self.a4 = Inception(480, 192,  96, 208, 16,  48,  64)\n        self.b4 = Inception(512, 160, 112, 224, 24,  64,  64)\n        self.c4 = Inception(512, 128, 128, 256, 24,  64,  64)\n        self.d4 = Inception(512, 112, 144, 288, 32,  64,  64)\n        self.e4 = Inception(528, 256, 160, 320, 32, 128, 128)\n\n        self.a5 = Inception(832, 256, 160, 320, 32, 128, 128)\n        self.b5 = Inception(832, 384, 192, 384, 48, 128, 128)\n\n        self.avgpool = nn.AvgPool2d(8, stride=1)\n        self.linear = nn.Linear(1024, 10)\n\n    def forward(self, x):\n        out = self.pre_layers(x)\n        out = self.a3(out)\n        out = self.b3(out)\n        out = self.maxpool(out)\n        out = self.a4(out)\n        out = self.b4(out)\n        out = self.c4(out)\n        out = self.d4(out)\n        out = self.e4(out)\n        out = self.maxpool(out)\n        out = self.a5(out)\n        out = self.b5(out)\n        out = self.avgpool(out)\n        out = out.view(out.size(0), -1)\n        out = self.linear(out)\n        return out\n\n\ndef test():\n    net = GoogLeNet()\n    x = torch.randn(1,3,32,32)\n    y = net(x)\n    print(y.size())\n\n# test()\n"
  },
  {
    "path": "GC_code/CIFAR100/models/lenet.py",
    "content": "'''LeNet in PyTorch.'''\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nclass LeNet(nn.Module):\n    def __init__(self):\n        super(LeNet, self).__init__()\n        self.conv1 = nn.Conv2d(3, 6, 5)\n        self.conv2 = nn.Conv2d(6, 16, 5)\n        self.fc1   = nn.Linear(16*5*5, 120)\n        self.fc2   = nn.Linear(120, 84)\n        self.fc3   = nn.Linear(84, 10)\n\n    def forward(self, x):\n        out = F.relu(self.conv1(x))\n        out = F.max_pool2d(out, 2)\n        out = F.relu(self.conv2(out))\n        out = F.max_pool2d(out, 2)\n        out = out.view(out.size(0), -1)\n        out = F.relu(self.fc1(out))\n        out = F.relu(self.fc2(out))\n        out = self.fc3(out)\n        return out\n"
  },
  {
    "path": "GC_code/CIFAR100/models/mobilenet.py",
    "content": "'''MobileNet in PyTorch.\n\nSee the paper \"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications\"\nfor more details.\n'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Block(nn.Module):\n    '''Depthwise conv + Pointwise conv'''\n    def __init__(self, in_planes, out_planes, stride=1):\n        super(Block, self).__init__()\n        self.conv1 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=in_planes, bias=False)\n        self.bn1 = nn.BatchNorm2d(in_planes)\n        self.conv2 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)\n        self.bn2 = nn.BatchNorm2d(out_planes)\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = F.relu(self.bn2(self.conv2(out)))\n        return out\n\n\nclass MobileNet(nn.Module):\n    # (128,2) means conv planes=128, conv stride=2, by default conv stride=1\n    cfg = [64, (128,2), 128, (256,2), 256, (512,2), 512, 512, 512, 512, 512, (1024,2), 1024]\n\n    def __init__(self, num_classes=10):\n        super(MobileNet, self).__init__()\n        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(32)\n        self.layers = self._make_layers(in_planes=32)\n        self.linear = nn.Linear(1024, num_classes)\n\n    def _make_layers(self, in_planes):\n        layers = []\n        for x in self.cfg:\n            out_planes = x if isinstance(x, int) else x[0]\n            stride = 1 if isinstance(x, int) else x[1]\n            layers.append(Block(in_planes, out_planes, stride))\n            in_planes = out_planes\n        return nn.Sequential(*layers)\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = self.layers(out)\n        out = F.avg_pool2d(out, 2)\n        out = out.view(out.size(0), -1)\n        out = self.linear(out)\n        return out\n\n\ndef test():\n    net = MobileNet()\n    x = torch.randn(1,3,32,32)\n    y = net(x)\n    print(y.size())\n\n# test()\n"
  },
  {
    "path": "GC_code/CIFAR100/models/mobilenetv2.py",
    "content": "'''MobileNetV2 in PyTorch.\n\nSee the paper \"Inverted Residuals and Linear Bottlenecks:\nMobile Networks for Classification, Detection and Segmentation\" for more details.\n'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Block(nn.Module):\n    '''expand + depthwise + pointwise'''\n    def __init__(self, in_planes, out_planes, expansion, stride):\n        super(Block, self).__init__()\n        self.stride = stride\n\n        planes = expansion * in_planes\n        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, stride=1, padding=0, bias=False)\n        self.bn1 = nn.BatchNorm2d(planes)\n        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, groups=planes, bias=False)\n        self.bn2 = nn.BatchNorm2d(planes)\n        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)\n        self.bn3 = nn.BatchNorm2d(out_planes)\n\n        self.shortcut = nn.Sequential()\n        if stride == 1 and in_planes != out_planes:\n            self.shortcut = nn.Sequential(\n                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False),\n                nn.BatchNorm2d(out_planes),\n            )\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = F.relu(self.bn2(self.conv2(out)))\n        out = self.bn3(self.conv3(out))\n        out = out + self.shortcut(x) if self.stride==1 else out\n        return out\n\n\nclass MobileNetV2(nn.Module):\n    # (expansion, out_planes, num_blocks, stride)\n    cfg = [(1,  16, 1, 1),\n           (6,  24, 2, 1),  # NOTE: change stride 2 -> 1 for CIFAR10\n           (6,  32, 3, 2),\n           (6,  64, 4, 2),\n           (6,  96, 3, 1),\n           (6, 160, 3, 2),\n           (6, 320, 1, 1)]\n\n    def __init__(self, num_classes=10):\n        super(MobileNetV2, self).__init__()\n        # NOTE: change conv1 stride 2 -> 1 for CIFAR10\n        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(32)\n        self.layers = self._make_layers(in_planes=32)\n        self.conv2 = nn.Conv2d(320, 1280, kernel_size=1, stride=1, padding=0, bias=False)\n        self.bn2 = nn.BatchNorm2d(1280)\n        self.linear = nn.Linear(1280, num_classes)\n\n    def _make_layers(self, in_planes):\n        layers = []\n        for expansion, out_planes, num_blocks, stride in self.cfg:\n            strides = [stride] + [1]*(num_blocks-1)\n            for stride in strides:\n                layers.append(Block(in_planes, out_planes, expansion, stride))\n                in_planes = out_planes\n        return nn.Sequential(*layers)\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = self.layers(out)\n        out = F.relu(self.bn2(self.conv2(out)))\n        # NOTE: change pooling kernel_size 7 -> 4 for CIFAR10\n        out = F.avg_pool2d(out, 4)\n        out = out.view(out.size(0), -1)\n        out = self.linear(out)\n        return out\n\n\ndef test():\n    net = MobileNetV2()\n    x = torch.randn(2,3,32,32)\n    y = net(x)\n    print(y.size())\n\n# test()\n"
  },
  {
    "path": "GC_code/CIFAR100/models/pnasnet.py",
    "content": "'''PNASNet in PyTorch.\n\nPaper: Progressive Neural Architecture Search\n'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass SepConv(nn.Module):\n    '''Separable Convolution.'''\n    def __init__(self, in_planes, out_planes, kernel_size, stride):\n        super(SepConv, self).__init__()\n        self.conv1 = nn.Conv2d(in_planes, out_planes,\n                               kernel_size, stride,\n                               padding=(kernel_size-1)//2,\n                               bias=False, groups=in_planes)\n        self.bn1 = nn.BatchNorm2d(out_planes)\n\n    def forward(self, x):\n        return self.bn1(self.conv1(x))\n\n\nclass CellA(nn.Module):\n    def __init__(self, in_planes, out_planes, stride=1):\n        super(CellA, self).__init__()\n        self.stride = stride\n        self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride)\n        if stride==2:\n            self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)\n            self.bn1 = nn.BatchNorm2d(out_planes)\n\n    def forward(self, x):\n        y1 = self.sep_conv1(x)\n        y2 = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1)\n        if self.stride==2:\n            y2 = self.bn1(self.conv1(y2))\n        return F.relu(y1+y2)\n\nclass CellB(nn.Module):\n    def __init__(self, in_planes, out_planes, stride=1):\n        super(CellB, self).__init__()\n        self.stride = stride\n        # Left branch\n        self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride)\n        self.sep_conv2 = SepConv(in_planes, out_planes, kernel_size=3, stride=stride)\n        # Right branch\n        self.sep_conv3 = SepConv(in_planes, out_planes, kernel_size=5, stride=stride)\n        if stride==2:\n            self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)\n            self.bn1 = nn.BatchNorm2d(out_planes)\n        # Reduce channels\n        self.conv2 = nn.Conv2d(2*out_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)\n        self.bn2 = nn.BatchNorm2d(out_planes)\n\n    def forward(self, x):\n        # Left branch\n        y1 = self.sep_conv1(x)\n        y2 = self.sep_conv2(x)\n        # Right branch\n        y3 = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1)\n        if self.stride==2:\n            y3 = self.bn1(self.conv1(y3))\n        y4 = self.sep_conv3(x)\n        # Concat & reduce channels\n        b1 = F.relu(y1+y2)\n        b2 = F.relu(y3+y4)\n        y = torch.cat([b1,b2], 1)\n        return F.relu(self.bn2(self.conv2(y)))\n\nclass PNASNet(nn.Module):\n    def __init__(self, cell_type, num_cells, num_planes):\n        super(PNASNet, self).__init__()\n        self.in_planes = num_planes\n        self.cell_type = cell_type\n\n        self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, stride=1, padding=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(num_planes)\n\n        self.layer1 = self._make_layer(num_planes, num_cells=6)\n        self.layer2 = self._downsample(num_planes*2)\n        self.layer3 = self._make_layer(num_planes*2, num_cells=6)\n        self.layer4 = self._downsample(num_planes*4)\n        self.layer5 = self._make_layer(num_planes*4, num_cells=6)\n\n        self.linear = nn.Linear(num_planes*4, 10)\n\n    def _make_layer(self, planes, num_cells):\n        layers = []\n        for _ in range(num_cells):\n            layers.append(self.cell_type(self.in_planes, planes, 
stride=1))\n            self.in_planes = planes\n        return nn.Sequential(*layers)\n\n    def _downsample(self, planes):\n        layer = self.cell_type(self.in_planes, planes, stride=2)\n        self.in_planes = planes\n        return layer\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = self.layer1(out)\n        out = self.layer2(out)\n        out = self.layer3(out)\n        out = self.layer4(out)\n        out = self.layer5(out)\n        out = F.avg_pool2d(out, 8)\n        out = self.linear(out.view(out.size(0), -1))\n        return out\n\n\ndef PNASNetA():\n    return PNASNet(CellA, num_cells=6, num_planes=44)\n\ndef PNASNetB():\n    return PNASNet(CellB, num_cells=6, num_planes=32)\n\n\ndef test():\n    net = PNASNetB()\n    x = torch.randn(1,3,32,32)\n    y = net(x)\n    print(y)\n\n# test()\n"
  },
  {
    "path": "GC_code/CIFAR100/models/preact_resnet.py",
    "content": "'''Pre-activation ResNet in PyTorch.\n\nReference:\n[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun\n    Identity Mappings in Deep Residual Networks. arXiv:1603.05027\n'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass PreActBlock(nn.Module):\n    '''Pre-activation version of the BasicBlock.'''\n    expansion = 1\n\n    def __init__(self, in_planes, planes, stride=1):\n        super(PreActBlock, self).__init__()\n        self.bn1 = nn.BatchNorm2d(in_planes)\n        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)\n        self.bn2 = nn.BatchNorm2d(planes)\n        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)\n\n        if stride != 1 or in_planes != self.expansion*planes:\n            self.shortcut = nn.Sequential(\n                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False)\n            )\n\n    def forward(self, x):\n        out = F.relu(self.bn1(x))\n        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x\n        out = self.conv1(out)\n        out = self.conv2(F.relu(self.bn2(out)))\n        out += shortcut\n        return out\n\n\nclass PreActBottleneck(nn.Module):\n    '''Pre-activation version of the original Bottleneck module.'''\n    expansion = 4\n\n    def __init__(self, in_planes, planes, stride=1):\n        super(PreActBottleneck, self).__init__()\n        self.bn1 = nn.BatchNorm2d(in_planes)\n        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)\n        self.bn2 = nn.BatchNorm2d(planes)\n        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)\n        self.bn3 = nn.BatchNorm2d(planes)\n        self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False)\n\n        if stride != 1 or in_planes != self.expansion*planes:\n            self.shortcut = nn.Sequential(\n                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False)\n            )\n\n    def forward(self, x):\n        out = F.relu(self.bn1(x))\n        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x\n        out = self.conv1(out)\n        out = self.conv2(F.relu(self.bn2(out)))\n        out = self.conv3(F.relu(self.bn3(out)))\n        out += shortcut\n        return out\n\n\nclass PreActResNet(nn.Module):\n    def __init__(self, block, num_blocks, num_classes=10):\n        super(PreActResNet, self).__init__()\n        self.in_planes = 64\n\n        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)\n        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)\n        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)\n        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)\n        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)\n        self.linear = nn.Linear(512*block.expansion, num_classes)\n\n    def _make_layer(self, block, planes, num_blocks, stride):\n        strides = [stride] + [1]*(num_blocks-1)\n        layers = []\n        for stride in strides:\n            layers.append(block(self.in_planes, planes, stride))\n            self.in_planes = planes * block.expansion\n        return nn.Sequential(*layers)\n\n    def forward(self, x):\n        out = self.conv1(x)\n        out = self.layer1(out)\n        out = self.layer2(out)\n        out = self.layer3(out)\n        out 
= self.layer4(out)\n        out = F.avg_pool2d(out, 4)\n        out = out.view(out.size(0), -1)\n        out = self.linear(out)\n        return out\n\n\ndef PreActResNet18():\n    return PreActResNet(PreActBlock, [2,2,2,2])\n\ndef PreActResNet34():\n    return PreActResNet(PreActBlock, [3,4,6,3])\n\ndef PreActResNet50():\n    return PreActResNet(PreActBottleneck, [3,4,6,3])\n\ndef PreActResNet101():\n    return PreActResNet(PreActBottleneck, [3,4,23,3])\n\ndef PreActResNet152():\n    return PreActResNet(PreActBottleneck, [3,8,36,3])\n\n\ndef test():\n    net = PreActResNet18()\n    y = net((torch.randn(1,3,32,32)))\n    print(y.size())\n\n# test()\n"
  },
  {
    "path": "GC_code/CIFAR100/models/resnet.py",
    "content": "'''ResNet in PyTorch.\n\nFor Pre-activation ResNet, see 'preact_resnet.py'.\n\nReference:\n[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun\n    Deep Residual Learning for Image Recognition. arXiv:1512.03385\n'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass BasicBlock(nn.Module):\n    expansion = 1\n\n    def __init__(self, in_planes, planes, stride=1):\n        super(BasicBlock, self).__init__()\n        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(planes)\n        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)\n        self.bn2 = nn.BatchNorm2d(planes)\n\n        self.shortcut = nn.Sequential()\n        if stride != 1 or in_planes != self.expansion*planes:\n            self.shortcut = nn.Sequential(\n                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),\n                nn.BatchNorm2d(self.expansion*planes)\n            )\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = self.bn2(self.conv2(out))\n        out += self.shortcut(x)\n        out = F.relu(out)\n        return out\n\n\nclass Bottleneck(nn.Module):\n    expansion = 4\n\n    def __init__(self, in_planes, planes, stride=1):\n        super(Bottleneck, self).__init__()\n        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(planes)\n        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)\n        self.bn2 = nn.BatchNorm2d(planes)\n        self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False)\n        self.bn3 = nn.BatchNorm2d(self.expansion*planes)\n\n        self.shortcut = nn.Sequential()\n        if stride != 1 or in_planes != self.expansion*planes:\n            self.shortcut = nn.Sequential(\n                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),\n                nn.BatchNorm2d(self.expansion*planes)\n            )\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = F.relu(self.bn2(self.conv2(out)))\n        out = self.bn3(self.conv3(out))\n        out += self.shortcut(x)\n        out = F.relu(out)\n        return out\n\n\nclass ResNet(nn.Module):\n    def __init__(self, block, num_blocks, num_classes=10):\n        super(ResNet, self).__init__()\n        self.in_planes = 64\n\n        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(64)\n        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)\n        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)\n        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)\n        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)\n        self.linear = nn.Linear(512*block.expansion, num_classes)\n\n    def _make_layer(self, block, planes, num_blocks, stride):\n        strides = [stride] + [1]*(num_blocks-1)\n        layers = []\n        for stride in strides:\n            layers.append(block(self.in_planes, planes, stride))\n            self.in_planes = planes * block.expansion\n        return nn.Sequential(*layers)\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = self.layer1(out)\n        out = self.layer2(out)\n        out = 
self.layer3(out)\n        out = self.layer4(out)\n        out = F.avg_pool2d(out, 4)\n        out = out.view(out.size(0), -1)\n        out = self.linear(out)\n        return out\n\n\ndef ResNet18(Num_classes=10):\n    return ResNet(BasicBlock, [2,2,2,2],num_classes=Num_classes)\n\ndef ResNet34(Num_classes=10):\n    return ResNet(BasicBlock, [3,4,6,3],num_classes=Num_classes)\n\ndef ResNet50(Num_classes=10):\n    return ResNet(Bottleneck, [3,4,6,3],num_classes=Num_classes)\n\ndef ResNet101(Num_classes=10):\n    return ResNet(Bottleneck, [3,4,23,3],num_classes=Num_classes)\n\ndef ResNet152(Num_classes=10):\n    return ResNet(Bottleneck, [3,8,36,3],num_classes=Num_classes)\n\n\ndef test():\n    net = ResNet18()\n    y = net(torch.randn(1,3,32,32))\n    print(y.size())\n\n# test()\n"
  },
  {
    "path": "GC_code/CIFAR100/models/resnext.py",
    "content": "'''ResNeXt in PyTorch.\n\nSee the paper \"Aggregated Residual Transformations for Deep Neural Networks\" for more details.\n'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Block(nn.Module):\n    '''Grouped convolution block.'''\n    expansion = 2\n\n    def __init__(self, in_planes, cardinality=32, bottleneck_width=4, stride=1):\n        super(Block, self).__init__()\n        group_width = cardinality * bottleneck_width\n        self.conv1 = nn.Conv2d(in_planes, group_width, kernel_size=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(group_width)\n        self.conv2 = nn.Conv2d(group_width, group_width, kernel_size=3, stride=stride, padding=1, groups=cardinality, bias=False)\n        self.bn2 = nn.BatchNorm2d(group_width)\n        self.conv3 = nn.Conv2d(group_width, self.expansion*group_width, kernel_size=1, bias=False)\n        self.bn3 = nn.BatchNorm2d(self.expansion*group_width)\n\n        self.shortcut = nn.Sequential()\n        if stride != 1 or in_planes != self.expansion*group_width:\n            self.shortcut = nn.Sequential(\n                nn.Conv2d(in_planes, self.expansion*group_width, kernel_size=1, stride=stride, bias=False),\n                nn.BatchNorm2d(self.expansion*group_width)\n            )\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = F.relu(self.bn2(self.conv2(out)))\n        out = self.bn3(self.conv3(out))\n        out += self.shortcut(x)\n        out = F.relu(out)\n        return out\n\n\nclass ResNeXt(nn.Module):\n    def __init__(self, num_blocks, cardinality, bottleneck_width, num_classes=10):\n        super(ResNeXt, self).__init__()\n        self.cardinality = cardinality\n        self.bottleneck_width = bottleneck_width\n        self.in_planes = 64\n\n        self.conv1 = nn.Conv2d(3, 64, kernel_size=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(64)\n        self.layer1 = self._make_layer(num_blocks[0], 1)\n        self.layer2 = self._make_layer(num_blocks[1], 2)\n        self.layer3 = self._make_layer(num_blocks[2], 2)\n        # self.layer4 = self._make_layer(num_blocks[3], 2)\n        self.linear = nn.Linear(cardinality*bottleneck_width*8, num_classes)\n\n    def _make_layer(self, num_blocks, stride):\n        strides = [stride] + [1]*(num_blocks-1)\n        layers = []\n        for stride in strides:\n            layers.append(Block(self.in_planes, self.cardinality, self.bottleneck_width, stride))\n            self.in_planes = Block.expansion * self.cardinality * self.bottleneck_width\n        # Increase bottleneck_width by 2 after each stage.\n        self.bottleneck_width *= 2\n        return nn.Sequential(*layers)\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = self.layer1(out)\n        out = self.layer2(out)\n        out = self.layer3(out)\n        # out = self.layer4(out)\n        out = F.avg_pool2d(out, 8)\n        out = out.view(out.size(0), -1)\n        out = self.linear(out)\n        return out\n\n\ndef ResNeXt29_2x64d(Num_classes=10):\n    return ResNeXt(num_blocks=[3,3,3], cardinality=2, bottleneck_width=64,num_classes=Num_classes)\n\ndef ResNeXt29_4x64d(Num_classes=10):\n    return ResNeXt(num_blocks=[3,3,3], cardinality=4, bottleneck_width=64,num_classes=Num_classes)\n\ndef ResNeXt29_8x64d(Num_classes=10):\n    return ResNeXt(num_blocks=[3,3,3], cardinality=8, bottleneck_width=64,num_classes=Num_classes)\n\ndef ResNeXt29_32x4d(Num_classes=10):\n    return ResNeXt(num_blocks=[3,3,3], 
cardinality=32, bottleneck_width=4,num_classes=Num_classes)\n\ndef test_resnext():\n    net = ResNeXt29_2x64d()\n    x = torch.randn(1,3,32,32)\n    y = net(x)\n    print(y.size())\n\n# test_resnext()\n"
  },
  {
    "path": "GC_code/CIFAR100/models/senet.py",
    "content": "'''SENet in PyTorch.\n\nSENet is the winner of ImageNet-2017. The paper is not released yet.\n'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass BasicBlock(nn.Module):\n    def __init__(self, in_planes, planes, stride=1):\n        super(BasicBlock, self).__init__()\n        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(planes)\n        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)\n        self.bn2 = nn.BatchNorm2d(planes)\n\n        self.shortcut = nn.Sequential()\n        if stride != 1 or in_planes != planes:\n            self.shortcut = nn.Sequential(\n                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False),\n                nn.BatchNorm2d(planes)\n            )\n\n        # SE layers\n        self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1)  # Use nn.Conv2d instead of nn.Linear\n        self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1)\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = self.bn2(self.conv2(out))\n\n        # Squeeze\n        w = F.avg_pool2d(out, out.size(2))\n        w = F.relu(self.fc1(w))\n        w = F.sigmoid(self.fc2(w))\n        # Excitation\n        out = out * w  # New broadcasting feature from v0.2!\n\n        out += self.shortcut(x)\n        out = F.relu(out)\n        return out\n\n\nclass PreActBlock(nn.Module):\n    def __init__(self, in_planes, planes, stride=1):\n        super(PreActBlock, self).__init__()\n        self.bn1 = nn.BatchNorm2d(in_planes)\n        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)\n        self.bn2 = nn.BatchNorm2d(planes)\n        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)\n\n        if stride != 1 or in_planes != planes:\n            self.shortcut = nn.Sequential(\n                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False)\n            )\n\n        # SE layers\n        self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1)\n        self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1)\n\n    def forward(self, x):\n        out = F.relu(self.bn1(x))\n        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x\n        out = self.conv1(out)\n        out = self.conv2(F.relu(self.bn2(out)))\n\n        # Squeeze\n        w = F.avg_pool2d(out, out.size(2))\n        w = F.relu(self.fc1(w))\n        w = F.sigmoid(self.fc2(w))\n        # Excitation\n        out = out * w\n\n        out += shortcut\n        return out\n\n\nclass SENet(nn.Module):\n    def __init__(self, block, num_blocks, num_classes=10):\n        super(SENet, self).__init__()\n        self.in_planes = 64\n\n        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(64)\n        self.layer1 = self._make_layer(block,  64, num_blocks[0], stride=1)\n        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)\n        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)\n        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)\n        self.linear = nn.Linear(512, num_classes)\n\n    def _make_layer(self, block, planes, num_blocks, stride):\n        strides = [stride] + [1]*(num_blocks-1)\n        layers = []\n        for stride in strides:\n            
layers.append(block(self.in_planes, planes, stride))\n            self.in_planes = planes\n        return nn.Sequential(*layers)\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = self.layer1(out)\n        out = self.layer2(out)\n        out = self.layer3(out)\n        out = self.layer4(out)\n        out = F.avg_pool2d(out, 4)\n        out = out.view(out.size(0), -1)\n        out = self.linear(out)\n        return out\n\n\ndef SENet18():\n    return SENet(PreActBlock, [2,2,2,2])\n\n\ndef test():\n    net = SENet18()\n    y = net(torch.randn(1,3,32,32))\n    print(y.size())\n\n# test()\n"
  },
  {
    "path": "GC_code/CIFAR100/models/shufflenet.py",
    "content": "'''ShuffleNet in PyTorch.\n\nSee the paper \"ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices\" for more details.\n'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass ShuffleBlock(nn.Module):\n    def __init__(self, groups):\n        super(ShuffleBlock, self).__init__()\n        self.groups = groups\n\n    def forward(self, x):\n        '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]'''\n        N,C,H,W = x.size()\n        g = self.groups\n        return x.view(N,g,C/g,H,W).permute(0,2,1,3,4).contiguous().view(N,C,H,W)\n\n\nclass Bottleneck(nn.Module):\n    def __init__(self, in_planes, out_planes, stride, groups):\n        super(Bottleneck, self).__init__()\n        self.stride = stride\n\n        mid_planes = out_planes/4\n        g = 1 if in_planes==24 else groups\n        self.conv1 = nn.Conv2d(in_planes, mid_planes, kernel_size=1, groups=g, bias=False)\n        self.bn1 = nn.BatchNorm2d(mid_planes)\n        self.shuffle1 = ShuffleBlock(groups=g)\n        self.conv2 = nn.Conv2d(mid_planes, mid_planes, kernel_size=3, stride=stride, padding=1, groups=mid_planes, bias=False)\n        self.bn2 = nn.BatchNorm2d(mid_planes)\n        self.conv3 = nn.Conv2d(mid_planes, out_planes, kernel_size=1, groups=groups, bias=False)\n        self.bn3 = nn.BatchNorm2d(out_planes)\n\n        self.shortcut = nn.Sequential()\n        if stride == 2:\n            self.shortcut = nn.Sequential(nn.AvgPool2d(3, stride=2, padding=1))\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = self.shuffle1(out)\n        out = F.relu(self.bn2(self.conv2(out)))\n        out = self.bn3(self.conv3(out))\n        res = self.shortcut(x)\n        out = F.relu(torch.cat([out,res], 1)) if self.stride==2 else F.relu(out+res)\n        return out\n\n\nclass ShuffleNet(nn.Module):\n    def __init__(self, cfg):\n        super(ShuffleNet, self).__init__()\n        out_planes = cfg['out_planes']\n        num_blocks = cfg['num_blocks']\n        groups = cfg['groups']\n\n        self.conv1 = nn.Conv2d(3, 24, kernel_size=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(24)\n        self.in_planes = 24\n        self.layer1 = self._make_layer(out_planes[0], num_blocks[0], groups)\n        self.layer2 = self._make_layer(out_planes[1], num_blocks[1], groups)\n        self.layer3 = self._make_layer(out_planes[2], num_blocks[2], groups)\n        self.linear = nn.Linear(out_planes[2], 10)\n\n    def _make_layer(self, out_planes, num_blocks, groups):\n        layers = []\n        for i in range(num_blocks):\n            stride = 2 if i == 0 else 1\n            cat_planes = self.in_planes if i == 0 else 0\n            layers.append(Bottleneck(self.in_planes, out_planes-cat_planes, stride=stride, groups=groups))\n            self.in_planes = out_planes\n        return nn.Sequential(*layers)\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = self.layer1(out)\n        out = self.layer2(out)\n        out = self.layer3(out)\n        out = F.avg_pool2d(out, 4)\n        out = out.view(out.size(0), -1)\n        out = self.linear(out)\n        return out\n\n\ndef ShuffleNetG2():\n    cfg = {\n        'out_planes': [200,400,800],\n        'num_blocks': [4,8,4],\n        'groups': 2\n    }\n    return ShuffleNet(cfg)\n\ndef ShuffleNetG3():\n    cfg = {\n        'out_planes': [240,480,960],\n        'num_blocks': [4,8,4],\n        'groups': 3\n    }\n    return 
ShuffleNet(cfg)\n\n\ndef test():\n    net = ShuffleNetG2()\n    x = torch.randn(1,3,32,32)\n    y = net(x)\n    print(y)\n\n# test()\n"
  },
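  {
    "path": "GC_code/CIFAR100/models/shuffle_check.py",
    "content": "'''Minimal sketch (not part of the training pipeline): check that ShuffleBlock\nis a pure channel permutation. Assumes it sits next to shufflenet.py; the file\nname and the tiny 12-channel input are illustrative only.\n'''\nimport torch\n\nfrom shufflenet import ShuffleBlock\n\nx = torch.arange(12.).view(1, 12, 1, 1)  # 12 channels, easy to trace by eye\ny = ShuffleBlock(groups=3)(x)\n\n# a permutation keeps the multiset of channel values intact\nassert torch.equal(torch.sort(y.flatten())[0], torch.sort(x.flatten())[0])\n\n# with C=12, g=3: channel k moves to position (k % 4) * 3 + k // 4\nprint(y.flatten().tolist())  # [0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11]\n"
  },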
  {
    "path": "GC_code/CIFAR100/models/vgg.py",
    "content": "'''VGG11/13/16/19 in Pytorch.'''\nimport torch\nimport torch.nn as nn\n\n\ncfg = {\n    'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],\n    'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],\n    'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],\n    'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],\n}\n\n\nclass VGG(nn.Module):\n    def __init__(self, vgg_name,Num_classes=100):\n        super(VGG, self).__init__()\n        self.features = self._make_layers(cfg[vgg_name])\n        self.classifier = nn.Linear(512, Num_classes)\n\n    def forward(self, x):\n        out = self.features(x)\n        out = out.view(out.size(0), -1)\n        out = self.classifier(out)\n        return out\n\n    def _make_layers(self, cfg):\n        layers = []\n        in_channels = 3\n        for x in cfg:\n            if x == 'M':\n                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]\n            else:\n                layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1),\n                           nn.BatchNorm2d(x),\n                           nn.ReLU(inplace=True)]\n                in_channels = x\n        layers += [nn.AvgPool2d(kernel_size=1, stride=1)]\n        return nn.Sequential(*layers)\n\n\ndef test():\n    net = VGG('VGG11')\n    x = torch.randn(2,3,32,32)\n    y = net(x)\n    print(y.size())\n\n# test()\n"
  },
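  {
    "path": "GC_code/CIFAR100/models/vgg_shape_check.py",
    "content": "'''Minimal sketch (not part of the training pipeline): why the classifier in\nvgg.py is nn.Linear(512, num_classes). Every cfg ends in 512 channels and has\nfive 'M' poolings, so a 32x32 CIFAR input leaves features of size 512x1x1.\nAssumes this file sits next to vgg.py.\n'''\nimport torch\n\nfrom vgg import VGG, cfg\n\nfor name in cfg:\n    net = VGG(name).eval()  # eval mode: BatchNorm uses running statistics\n    with torch.no_grad():\n        feat = net.features(torch.randn(1, 3, 32, 32))\n    assert feat.shape == (1, 512, 1, 1), (name, feat.shape)\n    print(name, 'feature map:', tuple(feat.shape))\n"
  },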
  {
    "path": "GC_code/CIFAR100/os_run.py",
    "content": "\nimport os,time\n\n#cifar100 sgd & sgdGCC\n\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd --epochs 200  --model r50 > logout/r50_lr11_wd45_sgd.log \")\n\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC --epochs 200  --model r50 > logout/r50_lr11_wd45_sgdGC.log \")\n"
  },
  {
    "path": "GC_code/Fine-grained_classification/SGD.py",
    "content": "import torch\nfrom torch.optim.optimizer import Optimizer, required\n\n\n\nclass SGD_GCC(Optimizer):\n\n    def __init__(self, params, lr=required, momentum=0, dampening=0,\n                 weight_decay=0, nesterov=False):\n        if lr is not required and lr < 0.0:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if momentum < 0.0:\n            raise ValueError(\"Invalid momentum value: {}\".format(momentum))\n        if weight_decay < 0.0:\n            raise ValueError(\"Invalid weight_decay value: {}\".format(weight_decay))\n\n        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,\n                        weight_decay=weight_decay, nesterov=nesterov)\n        if nesterov and (momentum <= 0 or dampening != 0):\n            raise ValueError(\"Nesterov momentum requires a momentum and zero dampening\")\n        super(SGD_GCC, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(SGD_GCC, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('nesterov', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            weight_decay = group['weight_decay']\n            momentum = group['momentum']\n            dampening = group['dampening']\n            nesterov = group['nesterov']\n\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                d_p = p.grad.data\n\n                if weight_decay != 0:\n                    d_p.add_(weight_decay, p.data)\n\n                #GC operation for Conv layers\n                if len(list(d_p.size()))>3:\n                   d_p.add_(-d_p.mean(dim = tuple(range(1,len(list(d_p.size())))), keepdim = True))\n                   \n                if momentum != 0:\n                    param_state = self.state[p]\n                    if 'momentum_buffer' not in param_state:\n                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()\n                    else:\n                        buf = param_state['momentum_buffer']\n                        buf.mul_(momentum).add_(1 - dampening, d_p)\n                    if nesterov:\n                        d_p = d_p.add(momentum, buf)\n                    else:\n                        d_p = buf\n\n                p.data.add_(-group['lr'], d_p)\n\n        return loss\n\nclass SGD_GC(Optimizer):\n\n    def __init__(self, params, lr=required, momentum=0, dampening=0,\n                 weight_decay=0, nesterov=False):\n        if lr is not required and lr < 0.0:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if momentum < 0.0:\n            raise ValueError(\"Invalid momentum value: {}\".format(momentum))\n        if weight_decay < 0.0:\n            raise ValueError(\"Invalid weight_decay value: {}\".format(weight_decay))\n\n        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,\n                        weight_decay=weight_decay, nesterov=nesterov)\n        if nesterov and (momentum <= 0 or dampening != 0):\n            raise ValueError(\"Nesterov momentum requires a momentum and zero dampening\")\n        super(SGD_GC, 
self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(SGD_GC, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('nesterov', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            weight_decay = group['weight_decay']\n            momentum = group['momentum']\n            dampening = group['dampening']\n            nesterov = group['nesterov']\n\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                d_p = p.grad.data\n\n                if weight_decay != 0:\n                    d_p.add_(weight_decay, p.data)\n\n                # GC operation for Conv and FC layers: remove the mean over all dims\n                # except dim 0, i.e. per filter (Conv) or per output neuron (FC)\n                if len(list(d_p.size()))>1:\n                    d_p.add_(-d_p.mean(dim = tuple(range(1,len(list(d_p.size())))), keepdim = True))\n\n                if momentum != 0:\n                    param_state = self.state[p]\n                    if 'momentum_buffer' not in param_state:\n                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()\n                    else:\n                        buf = param_state['momentum_buffer']\n                        buf.mul_(momentum).add_(1 - dampening, d_p)\n                    if nesterov:\n                        d_p = d_p.add(momentum, buf)\n                    else:\n                        d_p = buf\n\n                p.data.add_(-group['lr'], d_p)\n\n        return loss\n\n\nclass SGDW(Optimizer):\n    \"\"\"SGD with decoupled weight decay (SGDW): the decay term -lr*wd*w is\n    applied to the weights directly after the momentum step instead of being\n    folded into the gradient.\n    \"\"\"\n\n    def __init__(self, params, lr=required, momentum=0, dampening=0,\n                 weight_decay=0, nesterov=False):\n        if lr is not required and lr < 0.0:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if momentum < 0.0:\n            raise ValueError(\"Invalid momentum value: {}\".format(momentum))\n        if weight_decay < 0.0:\n            raise ValueError(\"Invalid weight_decay value: {}\".format(weight_decay))\n\n        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,\n                        weight_decay=weight_decay, nesterov=nesterov)\n        if nesterov and (momentum <= 0 or dampening != 0):\n            raise ValueError(\"Nesterov momentum requires a momentum and zero dampening\")\n        super(SGDW, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(SGDW, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('nesterov', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            weight_decay = group['weight_decay']\n            momentum = group['momentum']\n            dampening = group['dampening']\n            nesterov = group['nesterov']\n\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                d_p = p.grad.data\n\n                old = torch.clone(p.data).detach()\n                # decoupled weight decay: not added to d_p here; it is applied\n                # directly to the weights after the momentum step (see below)\n                if momentum != 0:\n                    param_state = self.state[p]\n                    if 'momentum_buffer' not in param_state:\n                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)\n                        buf.mul_(momentum).add_(d_p)\n                    else:\n                        buf = param_state['momentum_buffer']\n                        buf.mul_(momentum).add_(1 - dampening, d_p)\n                    if nesterov:\n                        d_p = d_p.add(momentum, buf)\n                    else:\n                        d_p = buf\n\n                p.data.add_(-group['lr'], d_p)\n\n                if weight_decay != 0:\n                    p.data.add_(-weight_decay*group['lr'], old)\n\n        return loss\n\n\nclass SGDW_GCC(Optimizer):\n    \"\"\"SGDW with Gradient Centralization applied to Conv layers only.\"\"\"\n\n    def __init__(self, params, lr=required, momentum=0, dampening=0,\n                 weight_decay=0, nesterov=False):\n        if lr is not required and lr < 0.0:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if momentum < 0.0:\n            raise ValueError(\"Invalid momentum value: {}\".format(momentum))\n        if weight_decay < 0.0:\n            raise ValueError(\"Invalid weight_decay value: {}\".format(weight_decay))\n\n        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,\n                        weight_decay=weight_decay, nesterov=nesterov)\n        if nesterov and (momentum <= 0 or dampening != 0):\n            raise ValueError(\"Nesterov momentum requires a momentum and zero dampening\")\n        super(SGDW_GCC, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(SGDW_GCC, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('nesterov', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            weight_decay = group['weight_decay']\n            momentum = group['momentum']\n            dampening = group['dampening']\n            nesterov = group['nesterov']\n\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                d_p = p.grad.data\n\n                old = torch.clone(p.data).detach()\n                # decoupled weight decay: applied to the weights after the step\n\n                # GC operation for Conv layers: remove the per-filter gradient mean\n                if len(list(d_p.size()))>3:\n                    d_p.add_(-d_p.mean(dim = tuple(range(1,len(list(d_p.size())))), keepdim = True))\n\n                if momentum != 0:\n                    param_state = self.state[p]\n                    if 'momentum_buffer' not in param_state:\n                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)\n                        buf.mul_(momentum).add_(d_p)\n                    else:\n                        buf = param_state['momentum_buffer']\n                        buf.mul_(momentum).add_(1 - dampening, d_p)\n                    if nesterov:\n                        d_p = d_p.add(momentum, buf)\n                    else:\n                        d_p = buf\n\n                p.data.add_(-group['lr'], d_p)\n\n                if weight_decay != 0:\n                    p.data.add_(-weight_decay*group['lr'], old)\n\n        return loss\n"
  },
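  {
    "path": "GC_code/Fine-grained_classification/gc_op_sketch.py",
    "content": "'''Minimal sketch (not wired into training): the gradient-centralization step\nused inside SGD_GCC / SGD_GC, pulled out as a standalone function. For a Conv\nweight gradient of shape [out_ch, in_ch, kH, kW], the mean over dims 1..3 is\nremoved from each filter, so the centralized gradient has zero mean per\nfilter. Only torch is assumed; the shapes below are illustrative.\n'''\nimport torch\n\ndef centralize(grad):\n    # same operation as in SGD.py: grad <- grad - mean(grad over dims 1..end)\n    return grad - grad.mean(dim=tuple(range(1, grad.dim())), keepdim=True)\n\nconv_grad = torch.randn(8, 3, 3, 3)  # [out_ch, in_ch, kH, kW]\ngc = centralize(conv_grad)\nper_filter_mean = gc.mean(dim=(1, 2, 3))\nassert per_filter_mean.abs().max().item() < 1e-6\nprint('max |per-filter mean| after GC:', per_filter_mean.abs().max().item())\n"
  },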
  {
    "path": "GC_code/Fine-grained_classification/main.py",
    "content": "import argparse\nimport os\nimport random\nimport shutil\nimport time\nimport warnings\nimport sys\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.parallel\nimport torch.backends.cudnn as cudnn\nimport torch.distributed as dist\nimport torch.optim\nimport torch.multiprocessing as mp\nimport torch.utils.data\nimport torch.utils.data.distributed\nimport torchvision.transforms as transforms\nimport torchvision.datasets as datasets\nimport torchvision.models as models\n\nfrom torch.optim import lr_scheduler\n\nfrom SGD import SGD_GC #import SGD with GC\n\n\nmodel_names = sorted(name for name in models.__dict__\n    if name.islower() and not name.startswith(\"__\")\n    and callable(models.__dict__[name]))\n\nparser = argparse.ArgumentParser(description='PyTorch ImageNet Training')\n\nparser.add_argument('-b', '--batch-size', default=256, type=int,\n                    metavar='N',\n                    help='mini-batch size (default: 256), this is the total '\n                         'batch size of all GPUs on the current node when '\n                         'using Data Parallel or Distributed Data Parallel')\n\nparser.add_argument('--lr', '--learning-rate', default=0.1*128/128, type=float,\n                    metavar='LR', help='initial learning rate', dest='lr')\n\nparser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50',\n                    choices=model_names,\n                    help='model architecture: ' +\n                        ' | '.join(model_names) +\n                        ' (default: resnet18)')\n\nparser.add_argument('data', metavar='DIR',\n                    help='path to dataset')\n\nparser.add_argument('-j', '--workers', default=4, type=int, metavar='N',\n                    help='number of data loading workers (default: 4)')\nparser.add_argument('--epochs', default=100, type=int, metavar='N',\n                    help='number of total epochs to run')\nparser.add_argument('--start-epoch', default=0, type=int, metavar='N',\n                    help='manual epoch number (useful on restarts)')\n\nparser.add_argument('--momentum', default=0.9, type=float, metavar='M',\n                    help='momentum')\nparser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,\n                    metavar='W', help='weight decay (default: 1e-4)',\n                    dest='weight_decay')\nparser.add_argument('-p', '--print-freq', default=100, type=int,\n                    metavar='N', help='print frequency (default: 10)')\nparser.add_argument('--resume', default='', type=str, metavar='PATH',\n                    help='path to latest checkpoint (default: none)')\nparser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',\n                    help='evaluate model on validation set')\nparser.add_argument('--pretrained', dest='pretrained', action='store_true',\n                    help='use pre-trained model')\nparser.add_argument('--world-size', default=-1, type=int,\n                    help='number of nodes for distributed training')\nparser.add_argument('--rank', default=-1, type=int,\n                    help='node rank for distributed training')\nparser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,\n                    help='url used to set up distributed training')\nparser.add_argument('--dist-backend', default='nccl', type=str,\n                    help='distributed backend')\nparser.add_argument('--seed', default=None, type=int,\n                    help='seed for initializing 
training. ')\nparser.add_argument('--gpu', default=None, type=int,\n                    help='GPU id to use.')\nparser.add_argument('--multiprocessing-distributed', action='store_true',\n                    help='Use multi-processing distributed training to launch '\n                         'N processes per node, which has N GPUs. This is the '\n                         'fastest way to use PyTorch for either single node or '\n                         'multi node data parallel training')\nparser.add_argument('--model', default='r50p', type=str, help='model')\n\nparser.add_argument('--path', default='test', type=str, help='model')\nparser.add_argument('--alg', default='sgd', type=str, help='algorithm')\n\nparser.add_argument('--dataset', default='cub', type=str, help='model')\n\nbest_acc1 = 0\n\ndef main():\n    args = parser.parse_args()\n    os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0,1,2,3\"\n\n    if args.seed is not None:\n        random.seed(args.seed)\n        torch.manual_seed(args.seed)\n        cudnn.deterministic = True\n        warnings.warn('You have chosen to seed training. '\n                      'This will turn on the CUDNN deterministic setting, '\n                      'which can slow down your training considerably! '\n                      'You may see unexpected behavior when restarting '\n                      'from checkpoints.')\n\n    if args.gpu is not None:\n        warnings.warn('You have chosen a specific GPU. This will completely '\n                      'disable data parallelism.')\n\n    if args.dist_url == \"env://\" and args.world_size == -1:\n        args.world_size = int(os.environ[\"WORLD_SIZE\"])\n\n    args.distributed = args.world_size > 1 or args.multiprocessing_distributed\n\n    ngpus_per_node = torch.cuda.device_count()\n    if args.multiprocessing_distributed:\n        # Since we have ngpus_per_node processes per node, the total world_size\n        # needs to be adjusted accordingly\n        args.world_size = ngpus_per_node * args.world_size\n        # Use torch.multiprocessing.spawn to launch distributed processes: the\n        # main_worker process function\n        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))\n    else:\n        # Simply call main_worker function\n        main_worker(args.gpu, ngpus_per_node, args)\n\n\ndef main_worker(gpu, ngpus_per_node, args):\n    global best_acc1\n    args.gpu = gpu\n\n    class_num={'cub':200,'cars':196,'dogs':120,'fgvc':100}\n    if args.gpu is not None:\n        print(\"Use GPU: {} for training\".format(args.gpu))\n\n    if args.distributed:\n        if args.dist_url == \"env://\" and args.rank == -1:\n            args.rank = int(os.environ[\"RANK\"])\n        if args.multiprocessing_distributed:\n            # For multiprocessing distributed training, rank needs to be the\n            # global rank among all the processes\n            args.rank = args.rank * ngpus_per_node + gpu\n        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,\n                                world_size=args.world_size, rank=args.rank)\n    # create model\n    if args.model=='r18p':\n      model =models.resnet18(pretrained=True)\n      model.fc= nn.Linear(in_features=2048, out_features=class_num[args.dataset], bias=True)\n    if args.model=='r18':\n      model =models.resnet18()\n      model.fc= nn.Linear(in_features=2048, out_features=class_num[args.dataset], bias=True)\n    if args.model=='r50p':\n      model =models.resnet50(pretrained=True)\n      model.fc= 
nn.Linear(in_features=2048, out_features=class_num[args.dataset], bias=True)\n    if args.model=='r50':\n      model =models.resnet50()\n      model.fc= nn.Linear(in_features=2048, out_features=class_num[args.dataset], bias=True)\n\n\n    if args.distributed:\n        # For multiprocessing distributed, DistributedDataParallel constructor\n        # should always set the single device scope, otherwise,\n        # DistributedDataParallel will use all available devices.\n        if args.gpu is not None:\n            torch.cuda.set_device(args.gpu)\n            model.cuda(args.gpu)\n            # When using a single GPU per process and per\n            # DistributedDataParallel, we need to divide the batch size\n            # ourselves based on the total number of GPUs we have\n            args.batch_size = int(args.batch_size / ngpus_per_node)\n            args.workers = int(args.workers / ngpus_per_node)\n            model = torch.nn.parallel.DistributedDataParallel(model)\n        else:\n            model.cuda()\n            # DistributedDataParallel will divide and allocate batch_size to all\n            # available GPUs if device_ids are not set\n            model = torch.nn.parallel.DistributedDataParallel(model)\n    else:\n        # DataParallel will divide and allocate batch_size to all available GPUs\n        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):\n            model.features = torch.nn.DataParallel(model.features)\n            model.cuda()\n        else:\n            model = torch.nn.DataParallel(model).cuda()\n\n    # define loss function (criterion) and optimizer\n    criterion = nn.CrossEntropyLoss().cuda(args.gpu)\n\n    # choose optimizer: split the pretrained backbone from the newly added fc head\n    # (every supported model has a replaced fc layer, so no model check is needed;\n    # note that 'lr_mult' is stored in the param groups but torch.optim ignores it)\n    new_param_ids = set(map(id, model.module.fc.parameters()))\n    base_params = [p for p in model.parameters() if\n            id(p) not in new_param_ids]\n    param_groups_base = [{'params': base_params, 'lr_mult': 0.1}]\n    param_groups_new = [{'params': model.module.fc.parameters(), 'lr_mult': 1.0}]\n\n    if args.alg=='sgd':\n       optimizer_base = torch.optim.SGD(param_groups_base, args.lr, momentum=args.momentum,weight_decay=args.weight_decay)\n       optimizer_new= torch.optim.SGD(param_groups_new, args.lr, momentum=args.momentum,weight_decay=args.weight_decay)\n    if args.alg=='sgdGC':\n       optimizer_base = SGD_GC(param_groups_base, args.lr, momentum=args.momentum,weight_decay=args.weight_decay)\n       optimizer_new= SGD_GC(param_groups_new, args.lr, momentum=args.momentum,weight_decay=args.weight_decay)\n\n    exp_lr_scheduler_new = lr_scheduler.MultiStepLR(optimizer_new, milestones=[50,80], gamma=0.1)\n    exp_lr_scheduler_base = lr_scheduler.MultiStepLR(optimizer_base, milestones=[50,80], gamma=0.1)\n\n    # optionally resume from a checkpoint\n    if args.resume:\n        if os.path.isfile(args.resume):\n            print(\"=> loading checkpoint '{}'\".format(args.resume))\n            checkpoint = torch.load(args.resume)\n            args.start_epoch = checkpoint['epoch']\n            best_acc1 = checkpoint['best_acc1']\n            if args.gpu is not None:\n                # best_acc1 may be from a checkpoint from a different GPU\n                best_acc1 = best_acc1.to(args.gpu)\n            model.load_state_dict(checkpoint['state_dict'])\n            # optimizer state is not written by save_checkpoint() below,\n            # so only the model weights are restored here\n            print(\"=> loaded checkpoint '{}' (epoch {})\"\n
.format(args.resume, checkpoint['epoch']))\n        else:\n            print(\"=> no checkpoint found at '{}'\".format(args.resume))\n\n    cudnn.benchmark = True\n\n    # Data loading code\n    traindir = os.path.join(args.data, 'train')\n    valdir = os.path.join(args.data, 'val')\n    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],\n                                     std=[0.229, 0.224, 0.225])\n\n    train_dataset = datasets.ImageFolder(\n        traindir,\n        transforms.Compose([\n                    transforms.Resize(512),\n                    transforms.RandomHorizontalFlip(),\n                    transforms.CenterCrop(448),\n                    transforms.ToTensor(),\n                    normalize,\n                ]))\n\n    if args.distributed:\n        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)\n    else:\n        train_sampler = None\n\n    train_loader = torch.utils.data.DataLoader(\n        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),\n        num_workers=args.workers, pin_memory=True, sampler=train_sampler,drop_last=True)\n\n    val_loader = torch.utils.data.DataLoader(\n        datasets.ImageFolder(valdir, transforms.Compose([\n                    transforms.Resize(512),\n                    transforms.CenterCrop(448),\n                    transforms.ToTensor(),\n                    normalize,\n                ])),\n        batch_size=args.batch_size, shuffle=False,\n        num_workers=args.workers, pin_memory=True,drop_last=True)\n\n    if args.evaluate:\n        validate(val_loader, model, criterion, args)\n        return\n\n    for epoch in range(args.start_epoch, args.epochs):\n        if args.distributed:\n            train_sampler.set_epoch(epoch)\n        #adjust_learning_rate(optimizer, epoch, args)\n\n        # train for one epoch\n        train(train_loader, model, criterion, optimizer_base, optimizer_new,epoch, args)\n        #exp_lr_scheduler.step()\n        exp_lr_scheduler_new.step()\n        exp_lr_scheduler_base.step()\n        # evaluate on validation set\n        acc1 = validate(val_loader, model, criterion, args)\n\n        # remember best acc@1 and save checkpoint\n        is_best = acc1 > best_acc1\n        best_acc1 = max(acc1, best_acc1)\n\n        if not args.multiprocessing_distributed or (args.multiprocessing_distributed\n                and args.rank % ngpus_per_node == 0):\n            save_checkpoint({\n                'epoch': epoch + 1,\n                'arch': args.arch,\n                'state_dict': model.state_dict(),\n                'best_acc1': best_acc1,\n                #'optimizer' : optimizer.state_dict(),\n            }, is_best)\n        #torch.save(model.module, './result_model/'+args.path+'.pth')\n\n\n# train\ndef train(train_loader, model, criterion, optimizer_base, optimizer_new, epoch, args):\n    batch_time = AverageMeter()\n    data_time = AverageMeter()\n    losses = AverageMeter()\n    top1 = AverageMeter()\n    top5 = AverageMeter()\n    total = 0\n    train_loss = 0\n    correct = 0\n    # switch to train mode\n    model.train()\n    print('\\nEpoch: %d' % epoch)\n    end = time.time()\n    for i, (input, target) in enumerate(train_loader):\n        # measure data loading time\n        data_time.update(time.time() - end)\n\n        #if args.gpu is not None:\n         #input = input.cuda(args.gpu, non_blocking=True)\n        #target = target.cuda(args.gpu, non_blocking=True)\n        input, target = input.to('cuda'), 
target.to('cuda')\n\n        # compute output\n        output = model(input)\n        loss = criterion(output, target)\n\n        # measure accuracy and record loss\n        acc1, acc5 = accuracy(output, target, topk=(1, 5))\n        losses.update(loss.item(), input.size(0))\n        top1.update(acc1[0], input.size(0))\n        top5.update(acc5[0], input.size(0))\n\n        _, predicted = output.max(1)\n        correct += predicted.eq(target).sum().item()\n\n        train_loss += loss.item()\n        #correct +=acc1[0]\n        total += target.size(0)\n        # compute gradient and do SGD step\n        optimizer_new.zero_grad()\n        optimizer_base.zero_grad()\n        loss.backward()\n        optimizer_new.step()\n        optimizer_base.step()\n        # measure elapsed time\n        batch_time.update(time.time() - end)\n        end = time.time()\n    print('Training: Top1: {top1.avg:.4f}|Top5:{top5.avg:.4f}'.format(top1=top1, top5=top5))\n    #print('Training: Loss: {:.3f} | Acc: {:.3f}'.format(train_loss/(i+1),correct/total))\n\n# validate\ndef validate(val_loader, model, criterion, args):\n    batch_time = AverageMeter()\n    losses = AverageMeter()\n    top1 = AverageMeter()\n    top5 = AverageMeter()\n\n    val_loss = 0\n    total = 0\n    correct = 0\n    # switch to evaluate mode\n    model.eval()\n\n    with torch.no_grad():\n        end = time.time()\n        for i, (input, target) in enumerate(val_loader):\n            if args.gpu is not None:\n                input = input.cuda(args.gpu, non_blocking=True)\n            target = target.cuda(args.gpu, non_blocking=True)\n\n            # compute output\n            output = model(input)\n            loss = criterion(output, target)\n\n            # measure accuracy and record loss\n            acc1, acc5 = accuracy(output, target, topk=(1, 5))\n            losses.update(loss.item(), input.size(0))\n            top1.update(acc1[0], input.size(0))\n            top5.update(acc5[0], input.size(0))\n\n            _, predicted = output.max(1)\n            total += target.size(0)\n            correct += predicted.eq(target).sum().item()\n\n            # record the loss once per batch\n            val_loss += loss.item()\n            # measure elapsed time\n            batch_time.update(time.time() - end)\n            end = time.time()\n\n        print('Testing: Top1: {top1.avg:.4f}|Top5:{top5.avg:.4f}'.format(top1=top1, top5=top5))\n    return top1.avg\n\n\ndef save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):\n    torch.save(state, filename)\n    if is_best:\n        shutil.copyfile(filename, 'model_best.pth.tar')\n\n\nclass AverageMeter(object):\n    \"\"\"Computes and stores the average and current value\"\"\"\n    def __init__(self):\n        self.reset()\n\n    def reset(self):\n        self.val = 0\n        self.avg = 0\n        self.sum = 0\n        self.count = 0\n\n    def update(self, val, n=1):\n        self.val = val\n        self.sum += val * n\n        self.count += n\n        self.avg = self.sum / self.count\n\n\ndef adjust_learning_rate(optimizer, epoch, args):\n    \"\"\"Sets the learning rate to the initial LR decayed by 10 every 30 epochs\"\"\"\n    lr = args.lr * (0.1 ** (epoch // 30))\n    for param_group in optimizer.param_groups:\n        param_group['lr'] = lr\n\n\ndef accuracy(output, target, topk=(1,)):\n    \"\"\"Computes the accuracy over the k top predictions for the specified values of k\"\"\"\n    with torch.no_grad():\n        maxk = max(topk)\n        batch_size = target.size(0)\n\n        _,
pred = output.topk(maxk, 1, True, True)\n        pred = pred.t()\n        correct = pred.eq(target.view(1, -1).expand_as(pred))\n\n        res = []\n        for k in topk:\n            correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)\n            res.append(correct_k.mul_(1.0 / batch_size))\n        return res\n\n\nif __name__ == '__main__':\n    main()\n"
  },
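  {
    "path": "GC_code/Fine-grained_classification/finetune_sketch.py",
    "content": "'''Minimal sketch (random data, toy model) of the two-optimizer scheme in\nmain.py: one SGD_GC instance for the pretrained backbone at a 0.1x learning\nrate and one for the freshly initialized head. The toy backbone/head and all\nhyper-parameters here are stand-ins. Assumes SGD.py sits alongside.\n'''\nimport torch\nimport torch.nn as nn\n\nfrom SGD import SGD_GC\n\nbackbone = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.ReLU(),\n                         nn.AdaptiveAvgPool2d(1), nn.Flatten())\nhead = nn.Linear(8, 5)  # stands in for the replaced fc layer\nmodel = nn.Sequential(backbone, head)\n\n# main.py keeps two optimizers so the backbone can use a smaller step size\nopt_base = SGD_GC(backbone.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)\nopt_new = SGD_GC(head.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4)\n\ncriterion = nn.CrossEntropyLoss()\nx, y = torch.randn(4, 3, 32, 32), torch.randint(0, 5, (4,))\nfor _ in range(3):\n    loss = criterion(model(x), y)\n    opt_base.zero_grad()\n    opt_new.zero_grad()\n    loss.backward()\n    opt_base.step()\n    opt_new.step()\n    print(loss.item())\n"
  },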
  {
    "path": "GC_code/Fine-grained_classification/os_run.py",
    "content": "\nimport os,time\n\n\n\n\nos.system(\"nohup python -W ignore main.py /home/yonghw/data/data/CUB_200_2011/ --model r50p -b 128 --alg sgd --dataset cub  > logout/Cub_r50p_sgd_b128_g4.log \")\nos.system(\"nohup python -W ignore main.py /home/yonghw/data/data/CUB_200_2011/ --model r50p -b 128  --alg sgdGC --dataset cub > logout/Cub_r50p_sgdGC_b128_g4.log \")\n\nos.system(\"nohup python -W ignore main.py /home/yonghw/data/data/Car196/ --model r50p -b 128 --alg sgd --dataset cars > logout/Car_r50p_sgd_b128_g4.log \")\nos.system(\"nohup python -W ignore main.py /home/yonghw/data/data/Car196/ --model r50p -b 128 --alg sgdGC --dataset cars> logout/Car_r50p_sgdGC_b128_g4.log \")\n\nos.system(\"nohup python -W ignore main.py /home/yonghw/data/data/fgvc_aricraft/ --model r50p  -b 128 --alg sgd --dataset fgvc > logout/Ari_r50p_sgd_b128_g4.log \")\nos.system(\"nohup python -W ignore main.py /home/yonghw/data/data/fgvc_aricraft/ --model r50p  -b 128 --alg sgdGC --dataset fgvc > logout/Ari_r50p_sgdGC_b128_g4.log \")\n\nos.system(\"nohup python -W ignore main.py /home/yonghw/data/data/StanfordDogs/ --model r50p  -b 128  --alg sgd --dataset dogs > logout/Dog_r50p_sgd_b128_g4.log \")\nos.system(\"nohup python -W ignore main.py /home/yonghw/data/data/StanfordDogs/ --model r50p  -b 128  --alg sgdGC --dataset dogs > logout/Dog_r50p_sgdGC_b128_g4.log \")\n"
  },
  {
    "path": "GC_code/ImageNet/SGD.py",
    "content": "import torch\nfrom torch.optim.optimizer import Optimizer, required\n\n\n\nclass SGD_GCC(Optimizer):\n\n    def __init__(self, params, lr=required, momentum=0, dampening=0,\n                 weight_decay=0, nesterov=False):\n        if lr is not required and lr < 0.0:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if momentum < 0.0:\n            raise ValueError(\"Invalid momentum value: {}\".format(momentum))\n        if weight_decay < 0.0:\n            raise ValueError(\"Invalid weight_decay value: {}\".format(weight_decay))\n\n        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,\n                        weight_decay=weight_decay, nesterov=nesterov)\n        if nesterov and (momentum <= 0 or dampening != 0):\n            raise ValueError(\"Nesterov momentum requires a momentum and zero dampening\")\n        super(SGD_GCC, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(SGD_GCC, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('nesterov', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            weight_decay = group['weight_decay']\n            momentum = group['momentum']\n            dampening = group['dampening']\n            nesterov = group['nesterov']\n\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                d_p = p.grad.data\n\n                if weight_decay != 0:\n                    d_p.add_(weight_decay, p.data)\n\n                #GC operation for Conv layers\n                if len(list(d_p.size()))>3:\n                   d_p.add_(-d_p.mean(dim = tuple(range(1,len(list(d_p.size())))), keepdim = True))\n                   \n                if momentum != 0:\n                    param_state = self.state[p]\n                    if 'momentum_buffer' not in param_state:\n                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()\n                    else:\n                        buf = param_state['momentum_buffer']\n                        buf.mul_(momentum).add_(1 - dampening, d_p)\n                    if nesterov:\n                        d_p = d_p.add(momentum, buf)\n                    else:\n                        d_p = buf\n\n                p.data.add_(-group['lr'], d_p)\n\n        return loss\n\nclass SGD_GC(Optimizer):\n\n    def __init__(self, params, lr=required, momentum=0, dampening=0,\n                 weight_decay=0, nesterov=False):\n        if lr is not required and lr < 0.0:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if momentum < 0.0:\n            raise ValueError(\"Invalid momentum value: {}\".format(momentum))\n        if weight_decay < 0.0:\n            raise ValueError(\"Invalid weight_decay value: {}\".format(weight_decay))\n\n        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,\n                        weight_decay=weight_decay, nesterov=nesterov)\n        if nesterov and (momentum <= 0 or dampening != 0):\n            raise ValueError(\"Nesterov momentum requires a momentum and zero dampening\")\n        super(SGD_GC, 
self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(SGD_GC, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('nesterov', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            weight_decay = group['weight_decay']\n            momentum = group['momentum']\n            dampening = group['dampening']\n            nesterov = group['nesterov']\n\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                d_p = p.grad.data\n\n                if weight_decay != 0:\n                    d_p.add_(weight_decay, p.data)\n\n                # GC operation for Conv and FC layers: remove the mean over all dims\n                # except dim 0, i.e. per filter (Conv) or per output neuron (FC)\n                if len(list(d_p.size()))>1:\n                    d_p.add_(-d_p.mean(dim = tuple(range(1,len(list(d_p.size())))), keepdim = True))\n\n                if momentum != 0:\n                    param_state = self.state[p]\n                    if 'momentum_buffer' not in param_state:\n                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()\n                    else:\n                        buf = param_state['momentum_buffer']\n                        buf.mul_(momentum).add_(1 - dampening, d_p)\n                    if nesterov:\n                        d_p = d_p.add(momentum, buf)\n                    else:\n                        d_p = buf\n\n                p.data.add_(-group['lr'], d_p)\n\n        return loss\n\n\nclass SGDW(Optimizer):\n    \"\"\"SGD with decoupled weight decay (SGDW): the decay term -lr*wd*w is\n    applied to the weights directly after the momentum step instead of being\n    folded into the gradient.\n    \"\"\"\n\n    def __init__(self, params, lr=required, momentum=0, dampening=0,\n                 weight_decay=0, nesterov=False):\n        if lr is not required and lr < 0.0:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if momentum < 0.0:\n            raise ValueError(\"Invalid momentum value: {}\".format(momentum))\n        if weight_decay < 0.0:\n            raise ValueError(\"Invalid weight_decay value: {}\".format(weight_decay))\n\n        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,\n                        weight_decay=weight_decay, nesterov=nesterov)\n        if nesterov and (momentum <= 0 or dampening != 0):\n            raise ValueError(\"Nesterov momentum requires a momentum and zero dampening\")\n        super(SGDW, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(SGDW, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('nesterov', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            weight_decay = group['weight_decay']\n            momentum = group['momentum']\n            dampening = group['dampening']\n            nesterov = group['nesterov']\n\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                d_p = p.grad.data\n\n                old = torch.clone(p.data).detach()\n                # decoupled weight decay: not added to d_p here; it is applied\n                # directly to the weights after the momentum step (see below)\n                if momentum != 0:\n                    param_state = self.state[p]\n                    if 'momentum_buffer' not in param_state:\n                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)\n                        buf.mul_(momentum).add_(d_p)\n                    else:\n                        buf = param_state['momentum_buffer']\n                        buf.mul_(momentum).add_(1 - dampening, d_p)\n                    if nesterov:\n                        d_p = d_p.add(momentum, buf)\n                    else:\n                        d_p = buf\n\n                p.data.add_(-group['lr'], d_p)\n\n                if weight_decay != 0:\n                    p.data.add_(-weight_decay*group['lr'], old)\n\n        return loss\n\n\nclass SGDW_GCC(Optimizer):\n    \"\"\"SGDW with Gradient Centralization applied to Conv layers only.\"\"\"\n\n    def __init__(self, params, lr=required, momentum=0, dampening=0,\n                 weight_decay=0, nesterov=False):\n        if lr is not required and lr < 0.0:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if momentum < 0.0:\n            raise ValueError(\"Invalid momentum value: {}\".format(momentum))\n        if weight_decay < 0.0:\n            raise ValueError(\"Invalid weight_decay value: {}\".format(weight_decay))\n\n        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,\n                        weight_decay=weight_decay, nesterov=nesterov)\n        if nesterov and (momentum <= 0 or dampening != 0):\n            raise ValueError(\"Nesterov momentum requires a momentum and zero dampening\")\n        super(SGDW_GCC, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(SGDW_GCC, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('nesterov', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            weight_decay = group['weight_decay']\n            momentum = group['momentum']\n            dampening = group['dampening']\n            nesterov = group['nesterov']\n\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                d_p = p.grad.data\n\n                old = torch.clone(p.data).detach()\n                # decoupled weight decay: applied to the weights after the step\n\n                # GC operation for Conv layers: remove the per-filter gradient mean\n                if len(list(d_p.size()))>3:\n                    d_p.add_(-d_p.mean(dim = tuple(range(1,len(list(d_p.size())))), keepdim = True))\n\n                if momentum != 0:\n                    param_state = self.state[p]\n                    if 'momentum_buffer' not in param_state:\n                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)\n                        buf.mul_(momentum).add_(d_p)\n                    else:\n                        buf = param_state['momentum_buffer']\n                        buf.mul_(momentum).add_(1 - dampening, d_p)\n                    if nesterov:\n                        d_p = d_p.add(momentum, buf)\n                    else:\n                        d_p = buf\n\n                p.data.add_(-group['lr'], d_p)\n\n                if weight_decay != 0:\n                    p.data.add_(-weight_decay*group['lr'], old)\n\n        return loss\n"
  },
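  {
    "path": "GC_code/ImageNet/sgdw_vs_sgd_sketch.py",
    "content": "'''Minimal sketch (not wired into training): how SGDW in SGD.py decouples\nweight decay from momentum. With momentum, L2-coupled decay enters the\nmomentum buffer, while SGDW applies -lr*wd*w directly to the weights, so the\ntrajectories diverge from the second step on. The scalar weight, constant\ngradient and hyper-parameters below are illustrative only.\n'''\nimport torch\n\nfrom SGD import SGD_GCC, SGDW\n\ndef run(opt_cls):\n    w = torch.nn.Parameter(torch.tensor([2.0]))\n    opt = opt_cls([w], lr=0.1, momentum=0.9, weight_decay=0.5)\n    for _ in range(2):\n        w.grad = torch.tensor([1.0])  # fresh constant gradient each step\n        opt.step()\n    return w.item()\n\n# hand-checked: L2-coupled SGD ends at 1.43, decoupled SGDW at 1.52\nprint('SGD + L2 weight decay:', run(SGD_GCC))\nprint('SGDW (decoupled)     :', run(SGDW))\n"
  },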
  {
    "path": "GC_code/ImageNet/main.py",
    "content": "import argparse\nimport os\nimport random\nimport shutil\nimport time\nimport warnings\nimport sys\n#nohup python -W ignore main.py /mnt/v0/ --model r50bn --alg sgd1 -b 256 --gpug 1 --path r50bn_sgd1_b256_g4 > logout/r50bn_sgd1_b256_g4.log\nimport torch\nimport torch.nn as nn\nimport torch.nn.parallel\nimport torch.backends.cudnn as cudnn\nimport torch.distributed as dist\nimport torch.optim\nimport torch.multiprocessing as mp\nimport torch.utils.data\nimport torch.utils.data.distributed\nimport torchvision.transforms as transforms\nimport torchvision.datasets as datasets\nimport torchvision.models as models\n#from myresnet_nbn import resnet18_nbn, resnet101_nbn,resnet50_nbn\nfrom myresnet import resnet50, resnet101\nfrom myresnetgn import resnet50gn, resnet101gn\n\n\nfrom torch.optim import lr_scheduler\n\n\nfrom SGD import SGD_GCC #import SGD with GC for Conv layer\n\n\nmodel_names = sorted(name for name in models.__dict__\n    if name.islower() and not name.startswith(\"__\")\n    and callable(models.__dict__[name]))\n\nparser = argparse.ArgumentParser(description='PyTorch ImageNet Training')\n\nparser.add_argument('-b', '--batch-size', default=256, type=int,\n                    metavar='N',\n                    help='mini-batch size (default: 256), this is the total '\n                         'batch size of all GPUs on the current node when '\n                         'using Data Parallel or Distributed Data Parallel')\n\nparser.add_argument('--lr', '--learning-rate', default=0.1*128/128, type=float,\n                    metavar='LR', help='initial learning rate', dest='lr')\n\nparser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50',\n                    choices=model_names,\n                    help='model architecture: ' +\n                        ' | '.join(model_names) +\n                        ' (default: resnet18)')\n\nparser.add_argument('data', metavar='DIR',\n                    help='path to dataset')\n\nparser.add_argument('-j', '--workers', default=4, type=int, metavar='N',\n                    help='number of data loading workers (default: 4)')\nparser.add_argument('--epochs', default=100, type=int, metavar='N',\n                    help='number of total epochs to run')\nparser.add_argument('--start-epoch', default=0, type=int, metavar='N',\n                    help='manual epoch number (useful on restarts)')\n\nparser.add_argument('--bgn', default=1, type=int, help='bn group number')\n\nparser.add_argument('--momentum', default=0.9, type=float, metavar='M',\n                    help='momentum')\nparser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,\n                    metavar='W', help='weight decay (default: 1e-4)',\n                    dest='weight_decay')\nparser.add_argument('-p', '--print-freq', default=100, type=int,\n                    metavar='N', help='print frequency (default: 10)')\nparser.add_argument('--resume', default='', type=str, metavar='PATH',\n                    help='path to latest checkpoint (default: none)')\nparser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',\n                    help='evaluate model on validation set')\nparser.add_argument('--pretrained', dest='pretrained', action='store_true',\n                    help='use pre-trained model')\nparser.add_argument('--world-size', default=-1, type=int,\n                    help='number of nodes for distributed training')\nparser.add_argument('--rank', default=-1, type=int,\n                    help='node rank for 
distributed training')\nparser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,\n                    help='url used to set up distributed training')\nparser.add_argument('--dist-backend', default='nccl', type=str,\n                    help='distributed backend')\nparser.add_argument('--seed', default=None, type=int,\n                    help='seed for initializing training. ')\nparser.add_argument('--gpu', default=None, type=int,\n                    help='GPU id to use.')\nparser.add_argument('--multiprocessing-distributed', action='store_true',\n                    help='Use multi-processing distributed training to launch '\n                         'N processes per node, which has N GPUs. This is the '\n                         'fastest way to use PyTorch for either single node or '\n                         'multi node data parallel training')\nparser.add_argument('--model', default='r50bn', type=str, help='model')\nparser.add_argument('--path', default='test', type=str, help='model')\nparser.add_argument('--alg', default='sgd', type=str, help='algorithm')\n\n\nbest_acc1 = 0\ndevice_ids=[0,1,2,3,4,5,6,7]\n\ndef main():\n    args = parser.parse_args()\n    os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0,1,2,3\"\n\n\n    if args.seed is not None:\n        random.seed(args.seed)\n        torch.manual_seed(args.seed)\n        cudnn.deterministic = True\n        warnings.warn('You have chosen to seed training. '\n                      'This will turn on the CUDNN deterministic setting, '\n                      'which can slow down your training considerably! '\n                      'You may see unexpected behavior when restarting '\n                      'from checkpoints.')\n\n    if args.gpu is not None:\n        warnings.warn('You have chosen a specific GPU. 
This will completely '\n                      'disable data parallelism.')\n\n    if args.dist_url == \"env://\" and args.world_size == -1:\n        args.world_size = int(os.environ[\"WORLD_SIZE\"])\n\n    args.distributed = args.world_size > 1 or args.multiprocessing_distributed\n\n    ngpus_per_node = torch.cuda.device_count()\n    if args.multiprocessing_distributed:\n        # Since we have ngpus_per_node processes per node, the total world_size\n        # needs to be adjusted accordingly\n        args.world_size = ngpus_per_node * args.world_size\n        # Use torch.multiprocessing.spawn to launch distributed processes: the\n        # main_worker process function\n        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))\n    else:\n        # Simply call main_worker function\n        main_worker(args.gpu, ngpus_per_node, args)\n\n\ndef main_worker(gpu, ngpus_per_node, args):\n    global best_acc1\n    args.gpu = gpu\n\n    if args.gpu is not None:\n        print(\"Use GPU: {} for training\".format(args.gpu))\n\n    if args.distributed:\n        if args.dist_url == \"env://\" and args.rank == -1:\n            args.rank = int(os.environ[\"RANK\"])\n        if args.multiprocessing_distributed:\n            # For multiprocessing distributed training, rank needs to be the\n            # global rank among all the processes\n            args.rank = args.rank * ngpus_per_node + gpu\n        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,\n                                world_size=args.world_size, rank=args.rank)\n    # create model\n    if args.model=='r50bn':\n      model = resnet50()\n    if args.model=='r50gn':\n      model = resnet50gn()\n\n    if args.model=='r101bn':\n      model = resnet101()\n    if args.model=='r101gn':\n      model = resnet101gn()\n\n\n\n    if args.distributed:\n        # For multiprocessing distributed, DistributedDataParallel constructor\n        # should always set the single device scope, otherwise,\n        # DistributedDataParallel will use all available devices.\n        if args.gpu is not None:\n            torch.cuda.set_device(args.gpu)\n            model.cuda(args.gpu)\n            # When using a single GPU per process and per\n            # DistributedDataParallel, we need to divide the batch size\n            # ourselves based on the total number of GPUs we have\n            args.batch_size = int(args.batch_size / ngpus_per_node)\n            args.workers = int(args.workers / ngpus_per_node)\n            model = torch.nn.parallel.DistributedDataParallel(model)\n        else:\n            model.cuda()\n            # DistributedDataParallel will divide and allocate batch_size to all\n            # available GPUs if device_ids are not set\n            model = torch.nn.parallel.DistributedDataParallel(model)\n    else:\n        # DataParallel will divide and allocate batch_size to all available GPUs\n        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):\n            model.features = torch.nn.DataParallel(model.features)\n            model.cuda()\n        else:\n            model = torch.nn.DataParallel(model).cuda()\n\n    # define loss function (criterion) and optimizer\n    criterion = nn.CrossEntropyLoss().cuda(args.gpu)\n    \n    # choose optimizer\n    if args.alg=='sgd':\n      optimizer =torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum,weight_decay=args.weight_decay)\n    if args.alg=='sgdGC':\n      optimizer = SGD_GCC(model.parameters(), args.lr, 
momentum=args.momentum,weight_decay=args.weight_decay)\n\n    exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)\n    # optionally resume from a checkpoint\n    if args.resume:\n        if os.path.isfile(args.resume):\n            print(\"=> loading checkpoint '{}'\".format(args.resume))\n            checkpoint = torch.load(args.resume)\n            args.start_epoch = checkpoint['epoch']\n            best_acc1 = checkpoint['best_acc1']\n            if args.gpu is not None:\n                # best_acc1 may be from a checkpoint from a different GPU\n                best_acc1 = best_acc1.to(args.gpu)\n            model.load_state_dict(checkpoint['state_dict'])\n            optimizer.load_state_dict(checkpoint['optimizer'])\n            print(\"=> loaded checkpoint '{}' (epoch {})\"\n                  .format(args.resume, checkpoint['epoch']))\n        else:\n            print(\"=> no checkpoint found at '{}'\".format(args.resume))\n\n    cudnn.benchmark = True\n\n    # Data loading code\n    traindir = os.path.join(args.data, 'train')\n    valdir = os.path.join(args.data, 'val')\n    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],\n                                     std=[0.229, 0.224, 0.225])\n\n    train_dataset = datasets.ImageFolder(\n        traindir,\n        transforms.Compose([\n            transforms.RandomResizedCrop(224),\n            transforms.RandomHorizontalFlip(),\n            transforms.ToTensor(),\n            normalize,\n        ]))\n\n    if args.distributed:\n        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)\n    else:\n        train_sampler = None\n\n    train_loader = torch.utils.data.DataLoader(\n        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),\n        num_workers=args.workers, pin_memory=True, sampler=train_sampler,drop_last=True)\n\n    val_loader = torch.utils.data.DataLoader(\n        datasets.ImageFolder(valdir, transforms.Compose([\n            transforms.Resize(256),\n            transforms.CenterCrop(224),\n            transforms.ToTensor(),\n            normalize,\n        ])),\n        batch_size=args.batch_size, shuffle=False,\n        num_workers=args.workers, pin_memory=True,drop_last=True)\n\n    if args.evaluate:\n        validate(val_loader, model, criterion, args)\n        return\n\n    for epoch in range(args.start_epoch, args.epochs):\n        if args.distributed:\n            train_sampler.set_epoch(epoch)\n        #adjust_learning_rate(optimizer, epoch, args)\n\n        # train for one epoch\n        train(train_loader, model, criterion, optimizer, epoch, args)\n        exp_lr_scheduler.step()\n        # evaluate on validation set\n        acc1 = validate(val_loader, model, criterion, args)\n\n        # remember best acc@1 and save checkpoint\n        is_best = acc1 > best_acc1\n        best_acc1 = max(acc1, best_acc1)\n\n        if not args.multiprocessing_distributed or (args.multiprocessing_distributed\n                and args.rank % ngpus_per_node == 0):\n            save_checkpoint({\n                'epoch': epoch + 1,\n                'arch': args.arch,\n                'state_dict': model.state_dict(),\n                'best_acc1': best_acc1,\n                'optimizer' : optimizer.state_dict(),\n            }, is_best)\n        torch.save(model.module, './result_model/'+args.path+'.pth')\n\n# train\ndef train(train_loader, model, criterion, optimizer, epoch, args):\n    batch_time = AverageMeter()\n    data_time = 
AverageMeter()\n    losses = AverageMeter()\n    top1 = AverageMeter()\n    top5 = AverageMeter()\n    total = 0\n    train_loss = 0\n    correct = 0\n    # switch to train mode\n    model.train()\n    print('\\nEpoch: %d' % epoch)\n    end = time.time()\n    for i, (input, target) in enumerate(train_loader):\n        # measure data loading time\n        data_time.update(time.time() - end)\n\n        #if args.gpu is not None:\n         #input = input.cuda(args.gpu, non_blocking=True)\n        #target = target.cuda(args.gpu, non_blocking=True)\n        input, target = input.to('cuda'), target.to('cuda')\n\n        # compute output\n        output = model(input)\n        loss = criterion(output, target)\n\n        # measure accuracy and record loss\n        acc1, acc5 = accuracy(output, target, topk=(1, 5))\n        losses.update(loss.item(), input.size(0))\n        top1.update(acc1[0], input.size(0))\n        top5.update(acc5[0], input.size(0))\n\n        _, predicted = output.max(1)\n        correct += predicted.eq(target).sum().item()\n\n        train_loss += loss.item()\n        #correct +=acc1[0]\n        total += target.size(0)\n        # compute gradient and do SGD step\n        optimizer.zero_grad()\n        loss.backward()\n        optimizer.step()\n        # measure elapsed time\n        batch_time.update(time.time() - end)\n        end = time.time()\n    print('Training: Top1: {top1.avg:.4f}|Top5:{top5.avg:.4f}'.format(top1=top1, top5=top5))\n    #print('Training: Loss: {:.3f} | Acc: {:.3f}'.format(train_loss/(i+1),correct/total))\n\n# validate\ndef validate(val_loader, model, criterion, args):\n    batch_time = AverageMeter()\n    losses = AverageMeter()\n    top1 = AverageMeter()\n    top5 = AverageMeter()\n\n    val_loss = 0\n    total = 0\n    correct = 0\n    # switch to evaluate mode\n    model.eval()\n\n    with torch.no_grad():\n        end = time.time()\n        for i, (input, target) in enumerate(val_loader):\n            if args.gpu is not None:\n                input = input.cuda(args.gpu, non_blocking=True)\n            target = target.cuda(args.gpu, non_blocking=True)\n\n            # compute output\n            output = model(input)\n            loss = criterion(output, target)\n\n            # measure accuracy and record loss\n            acc1, acc5 = accuracy(output, target, topk=(1, 5))\n            losses.update(loss.item(), input.size(0))\n            top1.update(acc1[0], input.size(0))\n            top5.update(acc5[0], input.size(0))\n\n            _, predicted = output.max(1)\n            total += target.size(0)\n            correct += predicted.eq(target).sum().item()\n\n            # record the loss once per batch\n            val_loss += loss.item()\n            # measure elapsed time\n            batch_time.update(time.time() - end)\n            end = time.time()\n        print('Testing: Top1: {top1.avg:.4f}|Top5:{top5.avg:.4f}'.format(top1=top1, top5=top5))\n    return top1.avg\n\n\ndef save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):\n    torch.save(state, filename)\n    if is_best:\n        shutil.copyfile(filename, 'model_best.pth.tar')\n\n\nclass AverageMeter(object):\n    \"\"\"Computes and stores the average and current value\"\"\"\n    def __init__(self):\n        self.reset()\n\n    def reset(self):\n        self.val = 0\n        self.avg = 0\n        self.sum = 0\n        self.count = 0\n\n    def update(self, val, n=1):\n        self.val = val\n        self.sum += val * n\n        self.count += n\n        self.avg = self.sum /
self.count\n\n\ndef adjust_learning_rate(optimizer, epoch, args):\n    \"\"\"Sets the learning rate to the initial LR decayed by 10 every 30 epochs\"\"\"\n    lr = args.lr * (0.1 ** (epoch // 30))\n    for param_group in optimizer.param_groups:\n        param_group['lr'] = lr\n\n\ndef accuracy(output, target, topk=(1,)):\n    \"\"\"Computes the accuracy over the k top predictions for the specified values of k\"\"\"\n    with torch.no_grad():\n        maxk = max(topk)\n        batch_size = target.size(0)\n\n        _, pred = output.topk(maxk, 1, True, True)\n        pred = pred.t()\n        correct = pred.eq(target.view(1, -1).expand_as(pred))\n\n        res = []\n        for k in topk:\n            correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)\n            res.append(correct_k.mul_(1.0 / batch_size))\n        return res\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "GC_code/ImageNet/myresnet.py",
    "content": "from __future__ import print_function, division, absolute_import\r\nimport torch.nn as nn\r\nimport math\r\nimport torch.utils.model_zoo as model_zoo\r\n\r\n\r\n__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',\r\n           'resnet152']\r\n\r\n\r\nmodel_urls = {\r\n    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',\r\n    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',\r\n    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',\r\n    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',\r\n    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',\r\n}\r\n\r\n\r\ndef conv3x3(in_planes, out_planes, stride=1):\r\n    \"3x3 convolution with padding\"\r\n    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,\r\n                     padding=1, bias=True)\r\n\r\n\r\nclass BasicBlock(nn.Module):\r\n    expansion = 1\r\n\r\n    def __init__(self, inplanes, planes, stride=1, downsample=None):\r\n        super(BasicBlock, self).__init__()\r\n        self.conv1 = conv3x3(inplanes, planes, stride)\r\n        self.bn1 = nn.BatchNorm2d(planes)\r\n        self.relu = nn.ReLU(inplace=True)\r\n        self.conv2 = conv3x3(planes, planes)\r\n        self.bn2 = nn.BatchNorm2d(planes)\r\n        self.downsample = downsample\r\n        self.stride = stride\r\n\r\n    def forward(self, x):\r\n        residual = x\r\n\r\n        out = self.conv1(x)\r\n        out = self.bn1(out)\r\n        out = self.relu(out)\r\n\r\n        out = self.conv2(out)\r\n        out = self.bn2(out)\r\n\r\n        if self.downsample is not None:\r\n            residual = self.downsample(x)\r\n\r\n        out += residual\r\n        out = self.relu(out)\r\n\r\n        return out\r\n\r\n\r\nclass Bottleneck(nn.Module):\r\n    expansion = 4\r\n\r\n    def __init__(self, inplanes, planes, stride=1, downsample=None):\r\n        super(Bottleneck, self).__init__()\r\n        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=True)\r\n        self.bn1 = nn.BatchNorm2d(planes)\r\n        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,\r\n                               padding=1, bias=True)\r\n        self.bn2 = nn.BatchNorm2d(planes)\r\n        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=True)\r\n        self.bn3 = nn.BatchNorm2d(planes * 4)\r\n        self.relu = nn.ReLU(inplace=True)\r\n        self.downsample = downsample\r\n        self.stride = stride\r\n\r\n    def forward(self, x):\r\n        residual = x\r\n\r\n        out = self.conv1(x)\r\n        out = self.bn1(out)\r\n        out = self.relu(out)\r\n\r\n        out = self.conv2(out)\r\n        out = self.bn2(out)\r\n        out = self.relu(out)\r\n\r\n        out = self.conv3(out)\r\n        out = self.bn3(out)\r\n\r\n        if self.downsample is not None:\r\n            residual = self.downsample(x)\r\n\r\n        out += residual\r\n        out = self.relu(out)\r\n\r\n        return out\r\n\r\n#from torch.legacy import nn as nnl\r\n\r\nclass ResNet(nn.Module):\r\n\r\n    def __init__(self, block, layers, num_classes=1000):\r\n        self.inplanes = 64\r\n        super(ResNet, self).__init__()\r\n        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,\r\n                                bias=True)\r\n        #self.conv1 = nnl.SpatialConvolution(3, 64, 7, 7, 2, 2, 3, 3)\r\n        self.bn1 = nn.BatchNorm2d(64)\r\n        self.relu = 
nn.ReLU(inplace=True)\r\n        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)\r\n        self.layer1 = self._make_layer(block, 64, layers[0])\r\n        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)\r\n        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)\r\n        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)\r\n        self.avgpool = nn.AvgPool2d(7)\r\n        self.fc = nn.Linear(512 * block.expansion, num_classes)\r\n\r\n        for m in self.modules():\r\n            if isinstance(m, nn.Conv2d):\r\n                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels\r\n                m.weight.data.normal_(0, math.sqrt(2. / n))\r\n            elif isinstance(m, nn.BatchNorm2d):\r\n                m.weight.data.fill_(1)\r\n                m.bias.data.zero_()\r\n\r\n    def _make_layer(self, block, planes, blocks, stride=1):\r\n        downsample = None\r\n        if stride != 1 or self.inplanes != planes * block.expansion:\r\n            downsample = nn.Sequential(\r\n                nn.Conv2d(self.inplanes, planes * block.expansion,\r\n                          kernel_size=1, stride=stride, bias=True),\r\n                nn.BatchNorm2d(planes * block.expansion),\r\n            )\r\n\r\n        layers = []\r\n        layers.append(block(self.inplanes, planes, stride, downsample))\r\n        self.inplanes = planes * block.expansion\r\n        for i in range(1, blocks):\r\n            layers.append(block(self.inplanes, planes))\r\n\r\n        return nn.Sequential(*layers)\r\n\r\n    def forward(self, x):\r\n        x = self.conv1(x)\r\n        self.conv1_input = x.clone()\r\n        x = self.bn1(x)\r\n        x = self.relu(x)\r\n        x = self.maxpool(x)\r\n\r\n        x = self.layer1(x)\r\n        x = self.layer2(x)\r\n        x = self.layer3(x)\r\n        x = self.layer4(x)\r\n\r\n        x = self.avgpool(x)\r\n        x = x.view(x.size(0), -1)\r\n        x = self.fc(x)\r\n\r\n        return x\r\n\r\n\r\ndef resnet18(pretrained=False, **kwargs):\r\n    \"\"\"Constructs a ResNet-18 model.\r\n    Args:\r\n        pretrained (bool): If True, returns a model pre-trained on ImageNet\r\n    \"\"\"\r\n    model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)\r\n    if pretrained:\r\n        model.load_state_dict(model_zoo.load_url(model_urls['resnet18']))\r\n    return model\r\n\r\n\r\ndef resnet34(pretrained=False, **kwargs):\r\n    \"\"\"Constructs a ResNet-34 model.\r\n    Args:\r\n        pretrained (bool): If True, returns a model pre-trained on ImageNet\r\n    \"\"\"\r\n    model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)\r\n    if pretrained:\r\n        model.load_state_dict(model_zoo.load_url(model_urls['resnet34']))\r\n    return model\r\n\r\n\r\ndef resnet50(pretrained=False, **kwargs):\r\n    \"\"\"Constructs a ResNet-50 model.\r\n    Args:\r\n        pretrained (bool): If True, returns a model pre-trained on ImageNet\r\n    \"\"\"\r\n    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)\r\n    if pretrained:\r\n        model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))\r\n    return model\r\n\r\n\r\ndef resnet101(pretrained=False, **kwargs):\r\n    \"\"\"Constructs a ResNet-101 model.\r\n    Args:\r\n        pretrained (bool): If True, returns a model pre-trained on ImageNet\r\n    \"\"\"\r\n    model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)\r\n    if pretrained:\r\n        model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))\r\n    return model\r\n\r\n\r\ndef 
resnet152(pretrained=False, **kwargs):\r\n    \"\"\"Constructs a ResNet-152 model.\r\n    Args:\r\n        pretrained (bool): If True, returns a model pre-trained on ImageNet\r\n    \"\"\"\r\n    model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)\r\n    if pretrained:\r\n        model.load_state_dict(model_zoo.load_url(model_urls['resnet152']))\r\n    return model\r\n\r\n\r\ndef test():\r\n    # local import: this module only imports torch.nn/torch.utils at the top\r\n    import torch\r\n    net = resnet18()\r\n    net.eval()\r\n    # Variable is deprecated; a plain tensor works in modern PyTorch\r\n    x = torch.randn(2, 3, 224, 224)\r\n    y = net(x)\r\n    print(y.size())\r\n    print(net)\r\n#test()\r\n"
  },
  {
    "path": "GC_code/ImageNet/myresnetgn.py",
    "content": "from __future__ import print_function, division, absolute_import\r\nimport torch.nn as nn\r\nimport math\r\nimport torch.utils.model_zoo as model_zoo\r\n\r\n\r\n__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',\r\n           'resnet152']\r\n\r\n\r\nmodel_urls = {\r\n    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',\r\n    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',\r\n    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',\r\n    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',\r\n    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',\r\n}\r\n\r\n\r\ndef conv3x3(in_planes, out_planes, stride=1):\r\n    \"3x3 convolution with padding\"\r\n    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,\r\n                     padding=1, bias=True)\r\n\r\n\r\nclass BasicBlock(nn.Module):\r\n    expansion = 1\r\n\r\n    def __init__(self, inplanes, planes, stride=1, downsample=None):\r\n        super(BasicBlock, self).__init__()\r\n        self.conv1 = conv3x3(inplanes, planes, stride)\r\n        self.bn1 = nn.GroupNorm(32,planes)\r\n        self.relu = nn.ReLU(inplace=True)\r\n        self.conv2 = conv3x3(planes, planes)\r\n        self.bn2 = nn.GroupNorm(32,planes)\r\n        self.downsample = downsample\r\n        self.stride = stride\r\n\r\n    def forward(self, x):\r\n        residual = x\r\n\r\n        out = self.conv1(x)\r\n        out = self.bn1(out)\r\n        out = self.relu(out)\r\n\r\n        out = self.conv2(out)\r\n        out = self.bn2(out)\r\n\r\n        if self.downsample is not None:\r\n            residual = self.downsample(x)\r\n\r\n        out += residual\r\n        out = self.relu(out)\r\n\r\n        return out\r\n\r\n\r\nclass Bottleneck(nn.Module):\r\n    expansion = 4\r\n\r\n    def __init__(self, inplanes, planes, stride=1, downsample=None):\r\n        super(Bottleneck, self).__init__()\r\n        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=True)\r\n        self.bn1 = nn.GroupNorm(32,planes)\r\n        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,\r\n                               padding=1, bias=True)\r\n        self.bn2 = nn.GroupNorm(32,planes)\r\n        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=True)\r\n        self.bn3 = nn.GroupNorm(32,planes * 4)\r\n        self.relu = nn.ReLU(inplace=True)\r\n        self.downsample = downsample\r\n        self.stride = stride\r\n\r\n    def forward(self, x):\r\n        residual = x\r\n\r\n        out = self.conv1(x)\r\n        out = self.bn1(out)\r\n        out = self.relu(out)\r\n\r\n        out = self.conv2(out)\r\n        out = self.bn2(out)\r\n        out = self.relu(out)\r\n\r\n        out = self.conv3(out)\r\n        out = self.bn3(out)\r\n\r\n        if self.downsample is not None:\r\n            residual = self.downsample(x)\r\n\r\n        out += residual\r\n        out = self.relu(out)\r\n\r\n        return out\r\n\r\n#from torch.legacy import nn as nnl\r\n\r\nclass ResNet(nn.Module):\r\n\r\n    def __init__(self, block, layers, num_classes=1000):\r\n        self.inplanes = 64\r\n        super(ResNet, self).__init__()\r\n        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,\r\n                                bias=True)\r\n        #self.conv1 = nnl.SpatialConvolution(3, 64, 7, 7, 2, 2, 3, 3)\r\n        self.bn1 = nn.GroupNorm(32,64)\r\n        self.relu = 
nn.ReLU(inplace=True)\r\n        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)\r\n        self.layer1 = self._make_layer(block, 64, layers[0])\r\n        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)\r\n        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)\r\n        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)\r\n        self.avgpool = nn.AvgPool2d(7)\r\n        self.fc = nn.Linear(512 * block.expansion, num_classes)\r\n\r\n        for m in self.modules():\r\n            if isinstance(m, nn.Conv2d):\r\n                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels\r\n                m.weight.data.normal_(0, math.sqrt(2. / n))\r\n            elif isinstance(m, nn.GroupNorm):\r\n                m.weight.data.fill_(1)\r\n                m.bias.data.zero_()\r\n\r\n    def _make_layer(self, block, planes, blocks, stride=1):\r\n        downsample = None\r\n        if stride != 1 or self.inplanes != planes * block.expansion:\r\n            downsample = nn.Sequential(\r\n                nn.Conv2d(self.inplanes, planes * block.expansion,\r\n                          kernel_size=1, stride=stride, bias=True),\r\n                nn.GroupNorm(32,planes * block.expansion),\r\n            )\r\n\r\n        layers = []\r\n        layers.append(block(self.inplanes, planes, stride, downsample))\r\n        self.inplanes = planes * block.expansion\r\n        for i in range(1, blocks):\r\n            layers.append(block(self.inplanes, planes))\r\n\r\n        return nn.Sequential(*layers)\r\n\r\n    def forward(self, x):\r\n        x = self.conv1(x)\r\n        self.conv1_input = x.clone()\r\n        x = self.bn1(x)\r\n        x = self.relu(x)\r\n        x = self.maxpool(x)\r\n\r\n        x = self.layer1(x)\r\n        x = self.layer2(x)\r\n        x = self.layer3(x)\r\n        x = self.layer4(x)\r\n\r\n        x = self.avgpool(x)\r\n        x = x.view(x.size(0), -1)\r\n        x = self.fc(x)\r\n\r\n        return x\r\n\r\n\r\ndef resnet18gn(pretrained=False, **kwargs):\r\n    \"\"\"Constructs a ResNet-18 model.\r\n    Args:\r\n        pretrained (bool): If True, returns a model pre-trained on ImageNet\r\n    \"\"\"\r\n    model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)\r\n    if pretrained:\r\n        model.load_state_dict(model_zoo.load_url(model_urls['resnet18']))\r\n    return model\r\n\r\n\r\ndef resnet34gn(pretrained=False, **kwargs):\r\n    \"\"\"Constructs a ResNet-34 model.\r\n    Args:\r\n        pretrained (bool): If True, returns a model pre-trained on ImageNet\r\n    \"\"\"\r\n    model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)\r\n    if pretrained:\r\n        model.load_state_dict(model_zoo.load_url(model_urls['resnet34']))\r\n    return model\r\n\r\n\r\ndef resnet50gn(pretrained=False, **kwargs):\r\n    \"\"\"Constructs a ResNet-50 model.\r\n    Args:\r\n        pretrained (bool): If True, returns a model pre-trained on ImageNet\r\n    \"\"\"\r\n    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)\r\n    if pretrained:\r\n        model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))\r\n    return model\r\n\r\n\r\ndef resnet101gn(pretrained=False, **kwargs):\r\n    \"\"\"Constructs a ResNet-101 model.\r\n    Args:\r\n        pretrained (bool): If True, returns a model pre-trained on ImageNet\r\n    \"\"\"\r\n    model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)\r\n    if pretrained:\r\n        model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))\r\n    return model\r\n\r\n\r\ndef 
resnet152gn(pretrained=False, **kwargs):\r\n    \"\"\"Constructs a ResNet-152 model.\r\n    Args:\r\n        pretrained (bool): If True, returns a model pre-trained on ImageNet\r\n    \"\"\"\r\n    model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)\r\n    if pretrained:\r\n        model.load_state_dict(model_zoo.load_url(model_urls['resnet152']))\r\n    return model\r\n\r\n\r\ndef test():\r\n    # local import: this module only imports torch.nn/torch.utils at the top\r\n    import torch\r\n    net = resnet18gn()\r\n    net.eval()\r\n    x = torch.randn(2, 3, 224, 224)\r\n    y = net(x)\r\n    print(y.size())\r\n    print(net)\r\n#test()\r\n"
  },
  {
    "path": "GC_code/ImageNet/os_run.py",
    "content": "\nimport os,time\n\n\nos.system(\"#nohup python -W ignore main.py /ssd/data/yonghw/Imagenet/v0/ --model r50bn --alg sgd -b 256 --path r50bn_sgd_b256_g4 > logout/r50bn_sgd_b256_g4.log &\")\n\nos.system(\"#nohup python -W ignore main.py /ssd/data/yonghw/Imagenet/v0/ --model r50bn --alg sgdGC -b 256 --path r50bn_sgdGC_b256_g4 > logout/r50bn_sgdGC_b256_g4.log &\")\n"
  },
  {
    "path": "GC_code/Mini_ImageNet/SGD.py",
    "content": "import torch\nfrom torch.optim.optimizer import Optimizer, required\n\n\n\nclass SGD_GCC(Optimizer):\n\n    def __init__(self, params, lr=required, momentum=0, dampening=0,\n                 weight_decay=0, nesterov=False):\n        if lr is not required and lr < 0.0:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if momentum < 0.0:\n            raise ValueError(\"Invalid momentum value: {}\".format(momentum))\n        if weight_decay < 0.0:\n            raise ValueError(\"Invalid weight_decay value: {}\".format(weight_decay))\n\n        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,\n                        weight_decay=weight_decay, nesterov=nesterov)\n        if nesterov and (momentum <= 0 or dampening != 0):\n            raise ValueError(\"Nesterov momentum requires a momentum and zero dampening\")\n        super(SGD_GCC, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(SGD_GCC, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('nesterov', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            weight_decay = group['weight_decay']\n            momentum = group['momentum']\n            dampening = group['dampening']\n            nesterov = group['nesterov']\n\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                d_p = p.grad.data\n\n                if weight_decay != 0:\n                    d_p.add_(weight_decay, p.data)\n\n                #GC operation for Conv layers\n                if len(list(d_p.size()))>3:\n                   d_p.add_(-d_p.mean(dim = tuple(range(1,len(list(d_p.size())))), keepdim = True))\n                   \n                if momentum != 0:\n                    param_state = self.state[p]\n                    if 'momentum_buffer' not in param_state:\n                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()\n                    else:\n                        buf = param_state['momentum_buffer']\n                        buf.mul_(momentum).add_(1 - dampening, d_p)\n                    if nesterov:\n                        d_p = d_p.add(momentum, buf)\n                    else:\n                        d_p = buf\n\n                p.data.add_(-group['lr'], d_p)\n\n        return loss\n\nclass SGD_GC(Optimizer):\n\n    def __init__(self, params, lr=required, momentum=0, dampening=0,\n                 weight_decay=0, nesterov=False):\n        if lr is not required and lr < 0.0:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if momentum < 0.0:\n            raise ValueError(\"Invalid momentum value: {}\".format(momentum))\n        if weight_decay < 0.0:\n            raise ValueError(\"Invalid weight_decay value: {}\".format(weight_decay))\n\n        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,\n                        weight_decay=weight_decay, nesterov=nesterov)\n        if nesterov and (momentum <= 0 or dampening != 0):\n            raise ValueError(\"Nesterov momentum requires a momentum and zero dampening\")\n        super(SGD_GC, 
self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(SGD_GC, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('nesterov', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            weight_decay = group['weight_decay']\n            momentum = group['momentum']\n            dampening = group['dampening']\n            nesterov = group['nesterov']\n\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                d_p = p.grad.data\n\n                if weight_decay != 0:\n                    d_p.add_(weight_decay, p.data)\n\n                #GC operation for Conv layers and FC layers\n                if len(list(d_p.size()))>1:\n                   d_p.add_(-d_p.mean(dim = tuple(range(1,len(list(d_p.size())))), keepdim = True))\n\n                if momentum != 0:\n                    param_state = self.state[p]\n                    if 'momentum_buffer' not in param_state:\n                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()\n                    else:\n                        buf = param_state['momentum_buffer']\n                        buf.mul_(momentum).add_(1 - dampening, d_p)\n                    if nesterov:\n                        d_p = d_p.add(momentum, buf)\n                    else:\n                        d_p = buf\n\n                p.data.add_(-group['lr'], d_p)\n\n        return loss\n\n\nclass SGDW(Optimizer):\n\n    def __init__(self, params, lr=required, momentum=0, dampening=0,\n                 weight_decay=0, nesterov=False):\n        if lr is not required and lr < 0.0:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if momentum < 0.0:\n            raise ValueError(\"Invalid momentum value: {}\".format(momentum))\n        if weight_decay < 0.0:\n            raise ValueError(\"Invalid weight_decay value: {}\".format(weight_decay))\n\n        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,\n                        weight_decay=weight_decay, nesterov=nesterov)\n        if nesterov and (momentum <= 0 or dampening != 0):\n            raise ValueError(\"Nesterov momentum requires a momentum and zero dampening\")\n        super(SGDW, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(SGDW, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('nesterov', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            weight_decay = group['weight_decay']\n            momentum = group['momentum']\n            dampening = group['dampening']\n            nesterov = group['nesterov']\n\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                d_p = p.grad.data\n\n                
old = torch.clone(p.data).detach()\n                #if weight_decay != 0:\n                #    d_p.add_(weight_decay, p.data)\n                if momentum != 0:\n                    param_state = self.state[p]\n                    if 'momentum_buffer' not in param_state:\n                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)\n                        buf.mul_(momentum).add_(d_p)\n                    else:\n                        buf = param_state['momentum_buffer']\n                        buf.mul_(momentum).add_(1 - dampening, d_p)\n                    if nesterov:\n                        d_p = d_p.add(momentum, buf)\n                    else:\n                        d_p = buf\n\n                p.data.add_(-group['lr'], d_p)\n\n                if weight_decay != 0:\n                    p.data.add_(-weight_decay*group['lr'], old)\n\n        return loss\n\n\n\nclass SGDW_GCC(Optimizer):\n    def __init__(self, params, lr=required, momentum=0, dampening=0,\n                 weight_decay=0, nesterov=False):\n        if lr is not required and lr < 0.0:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if momentum < 0.0:\n            raise ValueError(\"Invalid momentum value: {}\".format(momentum))\n        if weight_decay < 0.0:\n            raise ValueError(\"Invalid weight_decay value: {}\".format(weight_decay))\n\n        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,\n                        weight_decay=weight_decay, nesterov=nesterov)\n        if nesterov and (momentum <= 0 or dampening != 0):\n            raise ValueError(\"Nesterov momentum requires a momentum and zero dampening\")\n        super(SGDW_GCC, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(SGDW_GCC, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('nesterov', False)\n\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n            weight_decay = group['weight_decay']\n            momentum = group['momentum']\n            dampening = group['dampening']\n            nesterov = group['nesterov']\n\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                d_p = p.grad.data\n\n                old = torch.clone(p.data).detach()\n                #if weight_decay != 0:\n                #    d_p.add_(weight_decay, p.data)\n                \n                #GC operation for Conv layers\n                if len(list(d_p.size()))>3:\n                   d_p.add_(-d_p.mean(dim = tuple(range(1,len(list(d_p.size())))), keepdim = True))\n\n\n                if momentum != 0:\n                    param_state = self.state[p]\n                    if 'momentum_buffer' not in param_state:\n                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)\n                        buf.mul_(momentum).add_(d_p)\n                    else:\n                        buf = param_state['momentum_buffer']\n                        buf.mul_(momentum).add_(1 - dampening, d_p)\n                    if nesterov:\n                        d_p = d_p.add(momentum, buf)\n                    else:\n   
                     d_p = buf\n\n                p.data.add_(-group['lr'], d_p)\n\n                if weight_decay != 0:\n                    p.data.add_(-weight_decay*group['lr'], old)\n\n        return loss\n"
  },
  {
    "path": "GC_code/Mini_ImageNet/main.py",
    "content": "import argparse\nimport os\nimport random\nimport shutil\nimport time\nimport warnings\nimport sys\nimport torch\nimport torch.nn as nn\nimport torch.nn.parallel\nimport torch.backends.cudnn as cudnn\nimport torch.distributed as dist\nimport torch.optim\nimport torch.multiprocessing as mp\nimport torch.utils.data\nimport torch.utils.data.distributed\nimport torchvision.transforms as transforms\nimport torchvision.datasets as datasets\nimport torchvision.models as models\nfrom resnet_ws import l_resnet50\n\nimport torchvision.models as models\nimport math\nimport numpy as np\nfrom torch.optim import lr_scheduler\n\n\nfrom SGD import SGD_GC #import SGD with GC\n\nmodel_names = sorted(name for name in models.__dict__\n    if name.islower() and not name.startswith(\"__\")\n    and callable(models.__dict__[name]))\n\nparser = argparse.ArgumentParser(description='PyTorch ImageNet Training')\n\nparser.add_argument('-b', '--batch_size', default=256, type=int,\n                    metavar='N',\n                    help='mini-batch size (default: 256), this is the total '\n                         'batch size of all GPUs on the current node when '\n                         'using Data Parallel or Distributed Data Parallel')\n\nparser.add_argument('--lr', '--learning-rate', default=0.1*32/32, type=float,\n                    metavar='LR', help='initial learning rate', dest='lr')\n\nparser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50',\n                    choices=model_names,\n                    help='model architecture: ' +\n                        ' | '.join(model_names) +\n                        ' (default: resnet18)')\n\nparser.add_argument('data', metavar='DIR',\n                    help='path to dataset')\n\nparser.add_argument('-j', '--workers', default=4, type=int, metavar='N',\n                    help='number of data loading workers (default: 4)')\nparser.add_argument('--epochs', default=100, type=int, metavar='N',\n                    help='number of total epochs to run')\nparser.add_argument('--start-epoch', default=0, type=int, metavar='N',\n                    help='manual epoch number (useful on restarts)')\n\nparser.add_argument('--momentum', default=0.9, type=float, metavar='M',\n                    help='momentum')\nparser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,\n                    metavar='W', help='weight decay (default: 1e-4)',\n                    dest='weight_decay')\nparser.add_argument('-p', '--print-freq', default=100, type=int,\n                    metavar='N', help='print frequency (default: 10)')\nparser.add_argument('--resume', default='', type=str, metavar='PATH',\n                    help='path to latest checkpoint (default: none)')\nparser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',\n                    help='evaluate model on validation set')\nparser.add_argument('--pretrained', dest='pretrained', action='store_true',\n                    help='use pre-trained model')\nparser.add_argument('--world-size', default=-1, type=int,\n                    help='number of nodes for distributed training')\nparser.add_argument('--rank', default=-1, type=int,\n                    help='node rank for distributed training')\nparser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,\n                    help='url used to set up distributed training')\nparser.add_argument('--dist-backend', default='nccl', type=str,\n                    help='distributed 
backend')\nparser.add_argument('--seed', default=None, type=int,\n                    help='seed for initializing training. ')\nparser.add_argument('--gpu', default=None, type=int,\n                    help='GPU id to use.')\nparser.add_argument('--multiprocessing-distributed', action='store_true',\n                    help='Use multi-processing distributed training to launch '\n                         'N processes per node, which has N GPUs. This is the '\n                         'fastest way to use PyTorch for either single node or '\n                         'multi node data parallel training')\nparser.add_argument('--model', default='r18', type=str, help='model')\nparser.add_argument('--path', default='test', type=str, help='model')\nparser.add_argument('--alg', default='sgd', type=str, help='model')\n\n\nbest_acc1 = 0\ndevice_ids=[0,1,2,3,4,5,6,7]\n\ndef main():\n    args = parser.parse_args()\n    os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0,1,2,3\"\n\n    if args.seed is not None:\n        random.seed(args.seed)\n        torch.manual_seed(args.seed)\n        cudnn.deterministic = True\n        warnings.warn('You have chosen to seed training. '\n                      'This will turn on the CUDNN deterministic setting, '\n                      'which can slow down your training considerably! '\n                      'You may see unexpected behavior when restarting '\n                      'from checkpoints.')\n\n    if args.gpu is not None:\n        warnings.warn('You have chosen a specific GPU. This will completely '\n                      'disable data parallelism.')\n\n    if args.dist_url == \"env://\" and args.world_size == -1:\n        args.world_size = int(os.environ[\"WORLD_SIZE\"])\n\n    args.distributed = args.world_size > 1 or args.multiprocessing_distributed\n\n    ngpus_per_node = torch.cuda.device_count()\n    if args.multiprocessing_distributed:\n        # Since we have ngpus_per_node processes per node, the total world_size\n        # needs to be adjusted accordingly\n        args.world_size = ngpus_per_node * args.world_size\n        # Use torch.multiprocessing.spawn to launch distributed processes: the\n        # main_worker process function\n        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))\n    else:\n        # Simply call main_worker function\n        main_worker(args.gpu, ngpus_per_node, args)\n\n\ndef main_worker(gpu, ngpus_per_node, args):\n    global best_acc1\n    args.gpu = gpu\n    #momentum=pow(math.e,math.log(0.9)/64*args.batch_size/ngpus_per_node/args.bgn)\n    if args.gpu is not None:\n        print(\"Use GPU: {} for training\".format(args.gpu))\n\n    if args.distributed:\n        if args.dist_url == \"env://\" and args.rank == -1:\n            args.rank = int(os.environ[\"RANK\"])\n        if args.multiprocessing_distributed:\n            # For multiprocessing distributed training, rank needs to be the\n            # global rank among all the processes\n            args.rank = args.rank * ngpus_per_node + gpu\n        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,\n                                world_size=args.world_size, rank=args.rank)\n\n\n    # create model\n    num_classes=100\n    if args.model=='r50':\n        model = models.resnet50()\n        model.fc= nn.Linear(in_features=2048, out_features=num_classes, bias=True)\n    if args.model=='r50ws':\n      model =l_resnet50(num_classes=num_classes)\n\n    for m in model.modules():\n            if isinstance(m, nn.Conv2d):\n            
    n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels\n                m.weight.data.normal_(0, math.sqrt(2. / n))\n            elif isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.GroupNorm):\n                m.weight.data.uniform_()\n                m.bias.data.zero_()\n\n    if args.distributed:\n        # For multiprocessing distributed, DistributedDataParallel constructor\n        # should always set the single device scope, otherwise,\n        # DistributedDataParallel will use all available devices.\n        if args.gpu is not None:\n            torch.cuda.set_device(args.gpu)\n            model.cuda(args.gpu)\n            # When using a single GPU per process and per\n            # DistributedDataParallel, we need to divide the batch size\n            # ourselves based on the total number of GPUs we have\n            args.batch_size = int(args.batch_size / ngpus_per_node)\n            args.workers = int(args.workers / ngpus_per_node)\n            model = torch.nn.parallel.DistributedDataParallel(model)\n        else:\n            model.cuda()\n            # DistributedDataParallel will divide and allocate batch_size to all\n            # available GPUs if device_ids are not set\n            model = torch.nn.parallel.DistributedDataParallel(model)\n    else:\n        # DataParallel will divide and allocate batch_size to all available GPUs\n        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):\n            model.features = torch.nn.DataParallel(model.features)\n            model.cuda()\n        else:\n            model = torch.nn.DataParallel(model).cuda()\n\n    # define loss function (criterion) \n    criterion = nn.CrossEntropyLoss().cuda(args.gpu)\n\n\n    # choose optimizer\n    if args.alg=='sgd':\n       optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,momentum=args.momentum, weight_decay = args.weight_decay)\n    if args.alg=='sgdGC':\n      optimizer = SGD_GC(model.parameters(), lr=args.lr,momentum=args.momentum, weight_decay = args.weight_decay)\n\n\n\n    exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)\n\n\n    cudnn.benchmark = True\n\n    # Data loading code\n    traindir = os.path.join(args.data, 'train')\n    valdir = os.path.join(args.data, 'val')\n    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],\n                                     std=[0.229, 0.224, 0.225])\n\n    train_dataset = datasets.ImageFolder(\n        traindir,\n        transforms.Compose([\n            transforms.RandomResizedCrop(224),\n            transforms.RandomHorizontalFlip(),\n            transforms.ToTensor(),\n            normalize,\n         ]))\n\n\n    if args.distributed:\n        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)\n    else:\n        train_sampler = None\n\n    train_loader = torch.utils.data.DataLoader(\n        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),\n        num_workers=args.workers, pin_memory=True, sampler=train_sampler,drop_last=True)\n\n\n    val_loader = torch.utils.data.DataLoader(\n        datasets.ImageFolder(valdir, transforms.Compose([\n            transforms.Resize(256),\n            transforms.CenterCrop(224),\n            transforms.ToTensor(),\n            normalize,\n        ])),\n        batch_size=args.batch_size, shuffle=False,\n        num_workers=args.workers, pin_memory=True)\n\n    if args.evaluate:\n        validate(val_loader, model, criterion, args)\n        return\n\n    for epoch in 
range(args.start_epoch, args.epochs):\n        if args.distributed:\n            train_sampler.set_epoch(epoch)\n        #adjust_learning_rate(optimizer, epoch, args)\n\n        # train for one epoch\n        train(train_loader, model, criterion, optimizer, epoch, args)\n        exp_lr_scheduler.step()\n        # evaluate on validation set\n        acc1 = validate(val_loader, model, criterion, args)\n\n        # remember best acc@1 and save checkpoint\n        is_best = acc1 > best_acc1\n        best_acc1 = max(acc1, best_acc1)\n\n#        if not args.multiprocessing_distributed or (args.multiprocessing_distributed\n#                and args.rank % ngpus_per_node == 0):\n#            save_checkpoint({\n#                'epoch': epoch + 1,\n#                'arch': args.arch,\n#                'state_dict': model.state_dict(),\n#                'best_acc1': best_acc1,\n#                'optimizer' : optimizer.state_dict(),\n#            }, is_best)\n            #torch.save(model.module, './result_model/'+args.path+'.pth')\n\n#train\ndef train(train_loader, model, criterion, optimizer, epoch, args):\n    batch_time = AverageMeter()\n    data_time = AverageMeter()\n    losses = AverageMeter()\n    top1 = AverageMeter()\n    top5 = AverageMeter()\n    total = 0\n    train_loss = 0\n    correct = 0\n    # switch to train mode\n    model.train()\n    print('\\nEpoch: %d' % epoch)\n    end = time.time()\n    for i, (input, target) in enumerate(train_loader):\n        # measure data loading time\n        data_time.update(time.time() - end)\n\n        #if args.gpu is not None:\n         #input = input.cuda(args.gpu, non_blocking=True)\n        #target = target.cuda(args.gpu, non_blocking=True)\n        input, target = input.to('cuda'), target.to('cuda')\n\n\n\n        # compute output\n        output = model(input)\n        loss = criterion(output, target)\n\n        # measure accuracy and record loss\n        acc1, acc5 = accuracy(output, target, topk=(1, 5))\n        losses.update(loss.item(), input.size(0))\n        top1.update(acc1[0], input.size(0))\n        top5.update(acc5[0], input.size(0))\n\n        _, predicted = output.max(1)\n        correct += predicted.eq(target).sum().item()\n\n        train_loss += loss.item()\n        #correct +=acc1[0]\n        total += target.size(0)\n        # compute gradient and do SGD step\n        optimizer.zero_grad()\n        loss.backward()\n        optimizer.step()\n\n\n        # measure elapsed time\n        batch_time.update(time.time() - end)\n        end = time.time()\n\n    print('Training: Top1: {top1.avg:.4f}|loss:{losses.avg:.4f}'.format(top1=top1, losses=losses))\n    #print('Training: top1: {:.4f} '.format(correct/total))\n\n# test\ndef validate(val_loader, model, criterion, args):\n    batch_time = AverageMeter()\n    losses = AverageMeter()\n    top1 = AverageMeter()\n    top5 = AverageMeter()\n\n    val_loss = 0\n    total = 0\n    correct = 0\n    # switch to evaluate mode\n    model.eval()\n\n    with torch.no_grad():\n        end = time.time()\n        for i, (input, target) in enumerate(val_loader):\n            if args.gpu is not None:\n                input = input.cuda(args.gpu, non_blocking=True)\n            target = target.cuda(args.gpu, non_blocking=True)\n\n            # compute output\n            output = model(input)\n            loss = criterion(output, target)\n\n            # measure accuracy and record loss\n            acc1, acc5 = accuracy(output, target, topk=(1, 5))\n            losses.update(loss.item(), 
input.size(0))\n            top1.update(acc1[0], input.size(0))\n            top5.update(acc5[0], input.size(0))\n\n            _, predicted = output.max(1)\n            total += target.size(0)\n            correct += predicted.eq(target).sum().item()\n\n            val_loss += loss.item()\n            # measure elapsed time\n            batch_time.update(time.time() - end)\n            end = time.time()\n        print('Testing: Top1: {top1.avg:.4f}|loss:{losses.avg:.4f}'.format(top1=top1, losses=losses))\n        #print('Testing: top1: {:.4f} '.format(correct/total))\n    return top1.avg\n\n\ndef save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):\n    torch.save(state, filename)\n    if is_best:\n        shutil.copyfile(filename, 'model_best.pth.tar')\n\n\nclass AverageMeter(object):\n    \"\"\"Computes and stores the average and current value\"\"\"\n    def __init__(self):\n        self.reset()\n\n    def reset(self):\n        self.val = 0\n        self.avg = 0\n        self.sum = 0\n        self.count = 0\n\n    def update(self, val, n=1):\n        self.val = val\n        self.sum += val * n\n        self.count += n\n        self.avg = self.sum / self.count\n\n\ndef adjust_learning_rate(optimizer, epoch, args):\n    \"\"\"Sets the learning rate to the initial LR decayed by 10 every 30 epochs\"\"\"\n    lr = args.lr * (0.1 ** (epoch // 30))\n    for param_group in optimizer.param_groups:\n        param_group['lr'] = lr\n\n\ndef accuracy(output, target, topk=(1,)):\n    \"\"\"Computes the accuracy over the k top predictions for the specified values of k\"\"\"\n    with torch.no_grad():\n        maxk = max(topk)\n        batch_size = target.size(0)\n\n        _, pred = output.topk(maxk, 1, True, True)\n        pred = pred.t()\n        correct = pred.eq(target.view(1, -1).expand_as(pred))\n\n        res = []\n        for k in topk:\n            correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)\n            res.append(correct_k.mul_(1.0 / batch_size))\n        return res\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "GC_code/Mini_ImageNet/os_run.py",
    "content": "#cifar100 e200 bs128  gs  2,4,8,16\nimport os,time\n\n#print('runing mini_imagenet.py')\n\n\nos.system(\"nohup  python -W ignore main.py /home/yonghw/data/mini_imagenet/split_mini/ --model r50  -b 128 --alg sgd   > logout/r50_b128_sgd.log  \")\n\nos.system(\"nohup  python -W ignore main.py /home/yonghw/data/mini_imagenet/split_mini/ --model r50  -b 128 --alg sgdGC   > logout/r50_b128_sgdGC.log  \")\n\nos.system(\"nohup  python -W ignore main.py /home/yonghw/data/mini_imagenet/split_mini/ --model r50ws  -b 128 --alg sgd   > logout/r50ws_b128_sgd.log  \")\n\nos.system(\"nohup  python -W ignore main.py /home/yonghw/data/mini_imagenet/split_mini/ --model r50ws  -b 128 --alg sgdGC   > logout/r50ws_b128_sgdGC.log  \")\n"
  },
  {
    "path": "GC_code/Mini_ImageNet/resnet_ws.py",
    "content": "import torch.nn as nn\r\nimport torch.utils.model_zoo as model_zoo\r\n\r\nimport torch\r\nimport torch.nn as nn\r\nfrom torch.nn.parameter import Parameter\r\nfrom torch.nn import functional as F\r\n\r\n#from .. import layers as L\r\nimport math\r\n\r\n__all__ = ['ResNet', 'l_resnet18', 'l_resnet34', 'l_resnet50', 'l_resnet101',\r\n           'l_resnet152']\r\n\r\n\r\nclass Conv2d(nn.Conv2d):\r\n\r\n    def __init__(self, in_channels, out_channels, kernel_size, stride=1,\r\n                 padding=0, dilation=1, groups=1, bias=True):\r\n        super(Conv2d, self).__init__(in_channels, out_channels, kernel_size, stride,\r\n                 padding, dilation, groups, bias)\r\n\r\n    def forward(self, x):\r\n        # return super(Conv2d, self).forward(x)\r\n        weight = self.weight\r\n        weight_mean = weight.mean(dim=1, keepdim=True).mean(dim=2,\r\n                                  keepdim=True).mean(dim=3, keepdim=True)\r\n        weight = weight - weight_mean\r\n        std = weight.view(weight.size(0), -1).std(dim=1).view(-1, 1, 1, 1) + 1e-5\r\n        weight = weight / std.expand_as(weight)\r\n        return F.conv2d(x, weight, self.bias, self.stride,\r\n                        self.padding, self.dilation, self.groups)\r\n\r\n\r\ndef BatchNorm2d(num_features):\r\n\r\n    #return nn.GroupNorm(num_channels=num_features, num_groups=32)\r\n    return nn.BatchNorm2d(num_features=num_features)\r\n\r\n\r\ndef conv3x3(in_planes, out_planes, stride=1):\r\n    \"\"\"3x3 convolution with padding\"\"\"\r\n    return Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,\r\n                     padding=1, bias=False)\r\n\r\n\r\ndef conv1x1(in_planes, out_planes, stride=1):\r\n    \"\"\"1x1 convolution\"\"\"\r\n    return Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)\r\n\r\n\r\nclass BasicBlock(nn.Module):\r\n    expansion = 1\r\n\r\n    def __init__(self, inplanes, planes, stride=1, downsample=None):\r\n        super(BasicBlock, self).__init__()\r\n        self.conv1 = conv3x3(inplanes, planes, stride)\r\n        self.bn1 = BatchNorm2d(planes)\r\n        self.relu = nn.ReLU(inplace=True)\r\n        self.conv2 = conv3x3(planes, planes)\r\n        self.bn2 = BatchNorm2d(planes)\r\n        self.downsample = downsample\r\n        self.stride = stride\r\n\r\n    def forward(self, x):\r\n        identity = x\r\n\r\n        out = self.conv1(x)\r\n        out = self.bn1(out)\r\n        out = self.relu(out)\r\n\r\n        out = self.conv2(out)\r\n        out = self.bn2(out)\r\n\r\n        if self.downsample is not None:\r\n            identity = self.downsample(x)\r\n\r\n        out += identity\r\n        out = self.relu(out)\r\n\r\n        return out\r\n\r\n\r\nclass Bottleneck(nn.Module):\r\n    expansion = 4\r\n\r\n    def __init__(self, inplanes, planes, stride=1, downsample=None):\r\n        super(Bottleneck, self).__init__()\r\n        self.conv1 = conv1x1(inplanes, planes)\r\n        self.bn1 = BatchNorm2d(planes)\r\n        self.conv2 = conv3x3(planes, planes, stride)\r\n        self.bn2 = BatchNorm2d(planes)\r\n        self.conv3 = conv1x1(planes, planes * self.expansion)\r\n        self.bn3 = BatchNorm2d(planes * self.expansion)\r\n        self.relu = nn.ReLU(inplace=True)\r\n        self.downsample = downsample\r\n        self.stride = stride\r\n\r\n    def forward(self, x):\r\n        identity = x\r\n\r\n        out = self.conv1(x)\r\n        out = self.bn1(out)\r\n        out = self.relu(out)\r\n\r\n        out = self.conv2(out)\r\n       
 out = self.bn2(out)\r\n        out = self.relu(out)\r\n\r\n        out = self.conv3(out)\r\n        out = self.bn3(out)\r\n\r\n        if self.downsample is not None:\r\n            identity = self.downsample(x)\r\n\r\n        out += identity\r\n        out = self.relu(out)\r\n\r\n        return out\r\n\r\n\r\nclass ResNet(nn.Module):\r\n\r\n    def __init__(self, block, layers, num_classes=1000, zero_init_residual=False):\r\n        super(ResNet, self).__init__()\r\n        self.inplanes = 64\r\n        self.conv1 = Conv2d(3, 64, kernel_size=7, stride=2, padding=3,\r\n                               bias=False)\r\n        self.bn1 = BatchNorm2d(64)\r\n        self.relu = nn.ReLU(inplace=True)\r\n        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)\r\n        self.layer1 = self._make_layer(block, 64, layers[0])\r\n        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)\r\n        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)\r\n        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)\r\n        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))\r\n        self.fc = nn.Linear(512 * block.expansion, num_classes)\r\n\r\n        for m in self.modules():\r\n            if isinstance(m, Conv2d):\r\n                #nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')\r\n                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels\r\n                m.weight.data.normal_(0, math.sqrt(2. / n))\r\n            elif isinstance(m,nn.BatchNorm2d):\r\n                #nn.init.constant_(m.weight, 1)\r\n                #nn.init.constant_(m.bias, 0)\r\n                m.weight.data.uniform_()\r\n                m.bias.data.zero_()\r\n\r\n\r\n        # Zero-initialize the last BN in each residual branch,\r\n        # so that the residual branch starts with zeros, and each residual block behaves like an identity.\r\n        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677\r\n        if zero_init_residual:\r\n            for m in self.modules():\r\n                if isinstance(m, Bottleneck):\r\n                    nn.init.constant_(m.bn3.weight, 0)\r\n                elif isinstance(m, BasicBlock):\r\n                    nn.init.constant_(m.bn2.weight, 0)\r\n\r\n    def _make_layer(self, block, planes, blocks, stride=1):\r\n        downsample = None\r\n        if stride != 1 or self.inplanes != planes * block.expansion:\r\n            downsample = nn.Sequential(\r\n                conv1x1(self.inplanes, planes * block.expansion, stride),\r\n                BatchNorm2d(planes * block.expansion),\r\n            )\r\n\r\n        layers = []\r\n        layers.append(block(self.inplanes, planes, stride, downsample))\r\n        self.inplanes = planes * block.expansion\r\n        for _ in range(1, blocks):\r\n            layers.append(block(self.inplanes, planes))\r\n\r\n        return nn.Sequential(*layers)\r\n\r\n    def forward(self, x):\r\n        x = self.conv1(x)\r\n        x = self.bn1(x)\r\n        x = self.relu(x)\r\n        x = self.maxpool(x)\r\n\r\n        x = self.layer1(x)\r\n        x = self.layer2(x)\r\n        x = self.layer3(x)\r\n        x = self.layer4(x)\r\n\r\n        x = self.avgpool(x)\r\n        x = x.view(x.size(0), -1)\r\n        x = self.fc(x)\r\n\r\n        return x\r\n\r\n\r\ndef l_resnet18(pretrained=False, **kwargs):\r\n    \"\"\"Constructs a ResNet-18 model.\r\n    Args:\r\n        pretrained (bool): If True, returns a model pre-trained on ImageNet\r\n    
\"\"\"\r\n    model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)\r\n    return model\r\n\r\n\r\ndef l_resnet34(pretrained=False, **kwargs):\r\n    \"\"\"Constructs a ResNet-34 model.\r\n    Args:\r\n        pretrained (bool): If True, returns a model pre-trained on ImageNet\r\n    \"\"\"\r\n    model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)\r\n    return model\r\n\r\n\r\ndef l_resnet50(pretrained=False, **kwargs):\r\n    \"\"\"Constructs a ResNet-50 model.\r\n    Args:\r\n        pretrained (bool): If True, returns a model pre-trained on ImageNet\r\n    \"\"\"\r\n    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)\r\n    return model\r\n\r\n\r\ndef l_resnet101(pretrained=False, **kwargs):\r\n    \"\"\"Constructs a ResNet-101 model.\r\n    Args:\r\n        pretrained (bool): If True, returns a model pre-trained on ImageNet\r\n    \"\"\"\r\n    model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)\r\n    return model\r\n\r\n\r\ndef l_resnet152(pretrained=False, **kwargs):\r\n    \"\"\"Constructs a ResNet-152 model.\r\n    Args:\r\n        pretrained (bool): If True, returns a model pre-trained on ImageNet\r\n    \"\"\"\r\n    model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)\r\n    return model\r\n"
  },
  {
    "path": "README.md",
    "content": "# Gradient Centralization\n\n## [Gradient Centralization: A New Optimization Technique for Deep Neural Networks](https://arxiv.org/abs/2004.01461)\n\n***\n\n## Introduction\n\n* Gradient Centralization (GC) is a simple and effective optimization technique for Deep Neural Networks (DNNs), which operates directly on gradients by centralizing the gradient vectors to have zero mean. It can both speedup training process and improve the final generalization performance of DNNs. GC is very simple to implement and can be easily embedded into existing gradient based DNN optimizers with only few lines of code. It can also be directly used to finetune the pre-trained DNNs. Please refer to the [algorithm-GC](https://github.com/Yonghongwei/Gradient-Centralization/blob/master/algorithm-GC/) to obtain the codes of more advanced optimizers.\n\n<div  align=\"center\"><img src=\"https://github.com/Yonghongwei/Gradient-Centralization/blob/master/fig/gradient.png\" height=\"45%\" width=\"45%\" alt=\"Illustration of the GC operation on gradient matrix/tensor of weights in the fully-connected layer (left) and convolutional layer (right).\"/></div>\n\n* GC can be viewed as a projected gradient descent method with a constrained loss function.  The Lipschitzness of the constrained loss function and its gradient is better so that the training process becomes more efficient and stable.   Our experiments on various applications, including `general image classification`, `fine-grained image classification`, `detection and segmentation` and `Person ReID` demonstrate that GC can consistently improve the performance of DNN learning. \n\n<div  align=\"center\"><img src=\"https://github.com/Yonghongwei/Gradient-Centralization/blob/master/fig/projected_Grad.png\" height=\"50%\" width=\"50%\" alt=\"\"/></div>\n\n* The optimizers are provided in the files: [`SGD.py`](https://github.com/Yonghongwei/Gradient-Centralization/blob/master/GC_code/CIFAR100/algorithm/SGD.py), [`Adam.py`](https://github.com/Yonghongwei/Gradient-Centralization/blob/master/GC_code/CIFAR100/algorithm/Adam.py) and [`Adagrad.py`](https://github.com/Yonghongwei/Gradient-Centralization/blob/master/GC_code/CIFAR100/algorithm/Adagrad.py), including SGD_GC, SGD_GCC, SGDW_GCC, Adam_GC, Adam_GCC, Adam_GCC2, AdamW_GCC, AdamW_GCC2  and Adagrad_GCC. The optimizers with \"_GC\" use GC for both Conv layers and FC layers, and the optimizers with \"_GCC\" use GC only for Conv layers. For adaptive learning rate methods, keeping mean of weight vector unchanged usually works better. Please refer to Adam_GCC2 and AdamW_GCC2. 
We can use the following code to import SGD_GC:\n```python\nfrom SGD import SGD_GC\n```\n\n***\n\n## Update\n* 2020/04/07: Release a PyTorch implementation of optimizers with GC, and provide some examples on classification tasks, including general image classification (Mini-ImageNet, CIFAR100 and ImageNet) and fine-grained image classification (FGVC Aircraft, Stanford Cars, Stanford Dogs and CUB-200-2011).\n\n* 2020/04/14: Release the code of GC on MMdetection and update some tables of experimental results.\n\n* 2020/05/07: Release the code of GC on Person ReID and show some results on Market1501.\n\n* 2020/08/08: Release the code of some advanced optimizers with GC.\n***\n\n## Citation\n    @inproceedings{GradientCentra,\n      title={Gradient Centralization: A New Optimization Technique for Deep Neural Networks},\n      author={Hongwei Yong and Jianqiang Huang and Xiansheng Hua and Lei Zhang},\n      booktitle={European Conference on Computer Vision},\n      year={2020}\n    }\n\n***\n## Links to other implementations of GC\n* Gradient Centralization in TensorFlow: [`https://github.com/Rishit-dagli/Gradient-Centralization-TensorFlow`](https://github.com/Rishit-dagli/Gradient-Centralization-TensorFlow)\n* Gradient Centralization in the Ranger optimizer: [`https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer`](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer)\n\n\n## Experiments\n***\n\n### General Image Classification\n* Mini-ImageNet\n\nThe codes are in [`GC_code/Mini_ImageNet`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/GC_code/Mini_ImageNet). The split dataset can be downloaded from [here](https://drive.google.com/open?id=1XWRjPzwRWChNgvemqsylYM1ocpxhGtfy) (Google drive) or [here](https://pan.baidu.com/s/1Ah6Lu8OSfAVc3PZM-mPpvw) (Baidu drive, extraction code: 1681). The following figure shows the training loss (left) and testing accuracy (right) curves vs. training epoch on Mini-ImageNet. ResNet50 is used as the DNN model. The compared optimization techniques include BN, BN+GC, BN+WS and BN+WS+GC.\n\n<div align=\"center\"><img src=\"https://github.com/Yonghongwei/Gradient-Centralization/blob/master/fig/miniIN_largeBN.png\" height=\"60%\" width=\"60%\" alt=\"\"/></div>\n\n* CIFAR100\n\nThe codes are in [`GC_code/CIFAR100`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/GC_code/CIFAR100).\n\n* ImageNet\n\nThe codes are in [`GC_code/ImageNet`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/GC_code/ImageNet). The following table shows the Top-1 error rates on ImageNet w/o GC and w/ GC:\n\n|Backbone       |  R50BN        |R50GN         | R101BN      | R101GN      |\n| :-----------: | :-----------: | :----:       |:------:     |:-------:    |\n| w/o GC        | 23.71         |24.50         |22.37        |23.34        |\n| w/ GC         | 23.21         |23.53         |21.82        |22.14        |\n\nThe following figure shows the training error (left) and validation error (right) curves vs. training epoch on ImageNet. The DNN model is ResNet50 with GN.\n<div align=\"center\"><img src=\"https://github.com/Yonghongwei/Gradient-Centralization/blob/master/fig/Imagnet_r50GN2.png\" height=\"60%\" width=\"60%\" alt=\"\"/></div>\n\n\n***\n\n### Fine-grained Image Classification\nThe codes are in [`GC_code/Fine-grained_classification`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/GC_code/Fine-grained_classification). 
The preprocessed dataset can be downloaded from [here](https://drive.google.com/open?id=1c3OnKq3EsMKK1OerWdouCG7hvN8Rv8yh). The following table shows the testing accuracies on the four fine-grained image classification datasets with ResNet50:\n\n|Datasets       | FGVC Aircraft |Stanford Cars |Stanford Dogs| CUB-200-2011|\n| :-----------: | :-----------: | :----:       |:------:     |:-------:    |\n| w/o GC        | 86.62         |88.66         |76.16        |82.07        |\n| w/ GC         | 87.77         |90.03         |78.23        |83.40        |\n\nThe following figure shows the training accuracy (solid line) and testing accuracy (dotted line) curves vs. training epoch on the four fine-grained image classification datasets:\n\n<div  align=\"center\"><img src=\"https://github.com/Yonghongwei/Gradient-Centralization/blob/master/fig/fine_grid2_c.png\" height=\"100%\" width=\"100%\" alt=\"\"/></div>\n\n***\n\n### Object Detection and Segmentation\nThe codes are in [`MMdetection`](https://github.com/Yonghongwei/mmdetection). Please put [`SGD.py`](https://github.com/Yonghongwei/mmdetection/blob/master/tools/SGD.py) in [`MMdetection/tools/`](https://github.com/Yonghongwei/mmdetection/tree/master/tools), and update [`MMdetection/tools/train.py`](https://github.com/Yonghongwei/mmdetection/blob/master/tools/train.py) accordingly. Then, if you want to use the SGD_GC optimizer, just update the optimizer setting in the corresponding [`configs`](https://github.com/Yonghongwei/mmdetection/blob/master/configs/) file. For example, if we want to use SGD_GC to optimize Faster-RCNN with a ResNet50 backbone and FPN, we update the 151st line of [`MMdetection/configs/faster_rcnn_r50_fpn_1x.py`](https://github.com/Yonghongwei/mmdetection/blob/master/configs/faster_rcnn_r50_fpn_1x.py). The following table shows the detection results on COCO using Faster-RCNN and FPN with various backbone models:\n\n| Method        | Backbone      |  AP   | AP<sub>.5</sub> | AP<sub>.75</sub> | Backbone |  AP  | AP<sub>.5</sub> | AP<sub>.75</sub> |\n| :-----------: | :-----------: | :----:|:------:|:-------: | :-----------: | :----:|:------:|:-------: |\n| w/o GC        | R50           |  36.4 |  58.4  |  39.1    | X101-32x4d    |  40.1 |  62.0  |   43.8   |\n| w/ GC         | R50           |  37.0 |  59.0  |  40.2    | X101-32x4d    |  40.7 |  62.7  |   43.9   |\n| w/o GC        | R101          |  38.5 |  60.3  |  41.6    | X101-64x4d    |  41.3 |  63.3  |   45.2   |\n| w/ GC         | R101          |  38.9 |  60.8  |  42.2    | X101-64x4d    |  41.6 |  63.8  |   45.4   |\n\nThe following table shows the detection and segmentation results on COCO using Mask-RCNN and FPN with various backbone models:\n\n| Method        | Backbone      |  AP<sup>b</sup>  | AP<sup>b</sup><sub>.5</sub>| AP<sup>b</sup><sub>.75</sub>|  AP<sup>m</sup>   | AP<sup>m</sup><sub>.5</sub>| AP<sup>m</sup><sub>.75</sub> |\n| :-----------: | :-----------: | :----:|:------:|:-------:| :----:|:------:|:-------: |\n| w/o GC        | R50           | 37.4  | 59.0   | 40.6    | 34.1  | 55.5   | 36.1     |\n| w/ GC         | R50           | 37.9  | 59.6   | 41.2    | 34.7  | 56.1   | 37.0     |\n| w/o GC        | R101          | 39.4  | 60.9   | 43.3    | 35.9  | 57.7   | 38.4     |\n| w/ GC         | R101          | 40.0  | 61.5   | 43.7    | 36.2  | 58.1   | 38.7     |\n| w/o GC        | X101-32x4d    | 41.1  | 62.8   | 45.0    | 37.1  | 59.4   | 39.8     |\n| w/ GC         | X101-32x4d    | 41.6  | 63.1   | 45.5    | 37.4  | 59.8   | 39.9     |\n| w/o GC        | X101-64x4d    | 42.1  | 63.8   | 46.3    | 38.0  | 
60.6   | 40.9     |\n| w/ GC         | X101-64x4d    | 42.8  | 64.5   | 46.8    | 38.4  | 61.0   | 41.1     |\n| w/o GC        | R50 (4c1f)    | 37.5  | 58.2   | 41.0    | 33.9  | 55.0   | 36.1     |\n| w/ GC         | R50 (4c1f)    | 38.4  | 59.5   | 41.8    | 34.6  | 55.9   | 36.7     |\n| w/o GC        | R101GN        | 41.1  | 61.7   | 44.9    | 36.9  | 58.7   | 39.3     |\n| w/ GC         | R101GN        | 41.7  | 62.3   | 45.3    | 37.4  | 59.3   | 40.3     |\n| w/o GC        | R50GN+WS      | 40.0  | 60.7   | 43.6    | 36.1  | 57.8   | 38.6     |\n| w/ GC         | R50GN+WS      | 40.6  | 61.3   | 43.9    | 36.6  | 58.2   | 39.1     |\n\n***\n\n### Person ReID\nThe codes are in [`PersonReId`](https://github.com/Yonghongwei/reid-strong-baseline). Please put [`SGD.py`](https://github.com/Yonghongwei/reid-strong-baseline/tree/master/tools/SGD.py) in [`reid-strong-baseline/tools/`](https://github.com/Yonghongwei/reid-strong-baseline/tree/master/tools), and update [`reid-strong-baseline/solver/build.py`](https://github.com/Yonghongwei/reid-strong-baseline/blob/master/solver/build.py) accordingly. For Market1501, please use the SGD_GCC algorithm with a learning rate of 0.03 or 0.02 and a weight decay of 0.002. For example, you can change the `.sh` file to the following command:\n```bash\npython3 tools/train.py --config_file='configs/softmax_triplet_with_center.yml' MODEL.DEVICE_ID \"('0')\" DATASETS.NAMES \"('market1501')\" DATASETS.ROOT_DIR \"('/home/yonghw/data/reid/')\" OUTPUT_DIR \"('out_dir/market1501/test')\" SOLVER.OPTIMIZER_NAME \"('SGD_GCC')\" SOLVER.BASE_LR \"(0.03)\" SOLVER.WEIGHT_DECAY \"(0.002)\" SOLVER.WEIGHT_DECAY_BIAS \"(0.002)\"\n```\nThe results on Market1501 without re-ranking are shown in the following table:\n\n| Method        | Backbone      |  mAP   | Top 1    |\n| :-----------: | :-----------: |:------:|:-------: |\n| Adam*         | R18           | 77.8   | 91.7     |\n| SGD_GCC       | R18           | 81.3   | 92.7     |\n| Adam*         | R50           | 85.9   | 94.5     |\n| SGD_GCC       | R50           | 86.6   | 94.8     |\n| Adam*         | R101          | 87.1   | 94.5     |\n| SGD_GCC       | R101          | 87.9   | 95.0     |\n\nThe results marked with * are reported by the authors of [reid-strong-baseline](https://github.com/michuanhaohao/reid-strong-baseline). Our reproduced results are slightly lower than those reported by the authors.\n"
  },
  {
    "path": "algorithm-GC/README.md",
    "content": "# Advanced-optimizer-with-Gradient-Centralization\nAdvanced optimizer with Gradient-Centralization\nPlease Refer to\n## [Gradient Centralization: A New Optimization Technique for Deep Neural Networks](https://arxiv.org/abs/2004.01461)\n\n## Introduction\n\nWe embed GC into some advanced DNN optimizers, including [`SGD.py`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/algorithm-GC/algorithm/SGD.py),\n[`Adam.py`](https://github.com/Yonghongwei/Advanced-optimizer-with-Gradient-Centralization/blob/master/algorithm/Adam.py), [`AdamW`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/algorithm-GC/algorithm/Adam.py), [`RAdam`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/algorithm-GC/algorithm/RAdam.py),[`Lookahead`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/algorithm-GC/algorithm/Lookahead.py)+[`SGD.py`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/algorithm-GC/algorithm/SGD.py), [`Lookahead`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/algorithm-GC/algorithm/Lookahead.py)+[`Adam.py`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/algorithm-GC/algorithm/Adam.py), [`Ranger`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/algorithm-GC/algorithm/Ranger.py).\n\nThere are three hyper-parameters `use_gc`, `gc_conv_only` and `gc_loc`. `use_gc=True` means that the algorithm adds GC operation, otherwise, not. `gc_conv_only=True` means the algorithm only adds GC operation for Conv layer, otherwise, for both Conv and FC layer. `gc_loc` controls the location of GC operation for adaptive learning rate algorithms, including Adam, Radam, Ranger and so on. There are two locations in the algorithm to add GC operation for original gradient and generalized gradient, respectively. Generalized gradient is the variable which is directly used to update the weight.  For adaptive learning rate algorithms, we suggest `gc_loc=False`.  For SGD, these two locations for GC are equivalent, so we do not introduce the hyper-parameter `gc_loc`.\n\nWe also give an example of how to use these algorithms in [`Cifar`](https://github.com/Yonghongwei/Gradient-Centralization/blob/master/algorithm-GC/cifar/main.py). \nFor example: \n\n```python\n# SGD\noptimizer = SGD(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = args.weight_decay,use_gc=True, gc_conv_only=False) \n```\n\n```python\n# Adam\noptimizer = Adam(net.parameters(), lr=args.lr, weight_decay = args.weight_decay,use_gc=True, gc_conv_only=False,gc_loc=False) \n```\n\n```python\n# RAdam\noptimizer = RAdam(net.parameters(), lr=args.lr, weight_decay = args.weight_decay,use_gc=True, gc_conv_only=False,gc_loc=False)\n```\n```python\n# lookahead+SGD\nbase_opt = SGD(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = args.weight_decay,use_gc=False, gc_conv_only=False)\noptimizer = Lookahead(base_opt, k=5, alpha=0.5)\n```\n```python\n# Ranger\noptimizer = Ranger(net.parameters(), lr=args.lr, weight_decay = args.weight_decay,use_gc=True, gc_conv_only=False,gc_loc=False)\n```\n## References:\n* Adam: https://arxiv.org/abs/1412.6980\n\n* AdamW: https://arxiv.org/abs/1711.05101\n\n* Lookahead: https://arxiv.org/abs/1907.08610\n\n* RAdam: https://arxiv.org/abs/1908.03265, https://github.com/LiyuanLucasLiu/RAdam\n\n* Ranger: https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer\n\n* Gradient Centralization: https://arxiv.org/abs/2004.01461v2\n"
  },
  {
    "path": "algorithm-GC/algorithm/Adam.py",
    "content": "import math\nimport torch\nfrom torch.optim.optimizer import Optimizer\nfrom .Centralization import centralized_gradient\n\nclass Adam(Optimizer):\n    r\"\"\"Implements Adam algorithm.\n\n    It has been proposed in `Adam: A Method for Stochastic Optimization`_.\n\n    Arguments:\n        params (iterable): iterable of parameters to optimize or dicts defining\n            parameter groups\n        lr (float, optional): learning rate (default: 1e-3)\n        betas (Tuple[float, float], optional): coefficients used for computing\n            running averages of gradient and its square (default: (0.9, 0.999))\n        eps (float, optional): term added to the denominator to improve\n            numerical stability (default: 1e-8)\n        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)\n        amsgrad (boolean, optional): whether to use the AMSGrad variant of this\n            algorithm from the paper `On the Convergence of Adam and Beyond`_\n            (default: False)\n\n    .. _Adam\\: A Method for Stochastic Optimization:\n        https://arxiv.org/abs/1412.6980\n    .. _On the Convergence of Adam and Beyond:\n        https://openreview.net/forum?id=ryQu7f-RZ\n    \"\"\"\n\n    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,\n                 weight_decay=0, amsgrad=False,use_gc=False, gc_conv_only=False,gc_loc=False):\n        if not 0.0 <= lr:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if not 0.0 <= eps:\n            raise ValueError(\"Invalid epsilon value: {}\".format(eps))\n        if not 0.0 <= betas[0] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 0: {}\".format(betas[0]))\n        if not 0.0 <= betas[1] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 1: {}\".format(betas[1]))\n        if not 0.0 <= weight_decay:\n            raise ValueError(\"Invalid weight_decay value: {}\".format(weight_decay))\n        defaults = dict(lr=lr, betas=betas, eps=eps,\n                        weight_decay=weight_decay, amsgrad=amsgrad)\n        super(Adam, self).__init__(params, defaults)\n        self.gc_loc=gc_loc\n        self.use_gc=use_gc\n        self.gc_conv_only=gc_conv_only\n\n    def __setstate__(self, state):\n        super(Adam, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('amsgrad', False)\n\n    @torch.no_grad()\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            with torch.enable_grad():\n                loss = closure()\n\n        for group in self.param_groups:\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                grad = p.grad\n                if grad.is_sparse:\n                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')\n                amsgrad = group['amsgrad']\n\n                state = self.state[p]\n\n                # State initialization\n                if len(state) == 0:\n                    state['step'] = 0\n                    # Exponential moving average of gradient values\n                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)\n                    # 
Exponential moving average of squared gradient values\n                    state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)\n                    if amsgrad:\n                        # Maintains max of all exp. moving avg. of sq. grad. values\n                        state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)\n\n                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']\n                if amsgrad:\n                    max_exp_avg_sq = state['max_exp_avg_sq']\n                beta1, beta2 = group['betas']\n\n                state['step'] += 1\n                bias_correction1 = 1 - beta1 ** state['step']\n                bias_correction2 = 1 - beta2 ** state['step']\n\n                if group['weight_decay'] != 0:\n                    grad = grad.add(p, alpha=group['weight_decay'])\n                if self.gc_loc:\n                   grad=centralized_gradient(grad,use_gc=self.use_gc,gc_conv_only=self.gc_conv_only)\n                    \n                # Decay the first and second moment running average coefficient\n                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)\n                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)\n                if amsgrad:\n                    # Maintains the maximum of all 2nd moment running avg. till now\n                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)\n                    # Use the max. for normalizing running avg. of gradient\n                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])\n                else:\n                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])\n\n                step_size = group['lr'] / bias_correction1\n                #GC operation \n                G_grad=exp_avg/denom \n                if self.gc_loc==False:       \n                    G_grad=centralized_gradient(G_grad,use_gc=self.use_gc,gc_conv_only=self.gc_conv_only)\n                \n                p.add_( G_grad, alpha=-step_size)\n\n        return loss\n\n\n\n\nclass AdamW(Optimizer):\n    r\"\"\"Implements AdamW algorithm.\n\n    The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_.\n    The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.\n\n    Arguments:\n        params (iterable): iterable of parameters to optimize or dicts defining\n            parameter groups\n        lr (float, optional): learning rate (default: 1e-3)\n        betas (Tuple[float, float], optional): coefficients used for computing\n            running averages of gradient and its square (default: (0.9, 0.999))\n        eps (float, optional): term added to the denominator to improve\n            numerical stability (default: 1e-8)\n        weight_decay (float, optional): weight decay coefficient (default: 1e-2)\n        amsgrad (boolean, optional): whether to use the AMSGrad variant of this\n            algorithm from the paper `On the Convergence of Adam and Beyond`_\n            (default: False)\n\n    .. _Adam\\: A Method for Stochastic Optimization:\n        https://arxiv.org/abs/1412.6980\n    .. _Decoupled Weight Decay Regularization:\n        https://arxiv.org/abs/1711.05101\n    .. 
_On the Convergence of Adam and Beyond:\n        https://openreview.net/forum?id=ryQu7f-RZ\n    \"\"\"\n\n    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,\n                 weight_decay=1e-2, amsgrad=False,use_gc=False, gc_conv_only=False,gc_loc=True):\n        if not 0.0 <= lr:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if not 0.0 <= eps:\n            raise ValueError(\"Invalid epsilon value: {}\".format(eps))\n        if not 0.0 <= betas[0] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 0: {}\".format(betas[0]))\n        if not 0.0 <= betas[1] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 1: {}\".format(betas[1]))\n        if not 0.0 <= weight_decay:\n            raise ValueError(\"Invalid weight_decay value: {}\".format(weight_decay))\n        defaults = dict(lr=lr, betas=betas, eps=eps,\n                        weight_decay=weight_decay, amsgrad=amsgrad)\n        super(AdamW, self).__init__(params, defaults)\n        self.gc_loc=gc_loc\n        self.use_gc=use_gc\n        self.gc_conv_only=gc_conv_only\n        \n    def __setstate__(self, state):\n        super(AdamW, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('amsgrad', False)\n\n    @torch.no_grad()\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            with torch.enable_grad():\n                loss = closure()\n\n        for group in self.param_groups:\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n\n                # Perform optimization step\n                grad = p.grad\n                if grad.is_sparse:\n                    raise RuntimeError('AdamW does not support sparse gradients')\n                amsgrad = group['amsgrad']\n\n                state = self.state[p]\n\n                # State initialization\n                if len(state) == 0:\n                    state['step'] = 0\n                    # Exponential moving average of gradient values\n                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)\n                    # Exponential moving average of squared gradient values\n                    state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)\n                    if amsgrad:\n                        # Maintains max of all exp. moving avg. of sq. grad. 
values\n                        state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)\n\n                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']\n                if amsgrad:\n                    max_exp_avg_sq = state['max_exp_avg_sq']\n                beta1, beta2 = group['betas']\n\n                state['step'] += 1\n                bias_correction1 = 1 - beta1 ** state['step']\n                bias_correction2 = 1 - beta2 ** state['step']\n                if self.gc_loc:\n                   grad=centralized_gradient(grad,use_gc=self.use_gc,gc_conv_only=self.gc_conv_only)\n\n                # Decay the first and second moment running average coefficient\n                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)\n                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)\n                if amsgrad:\n                    # Maintains the maximum of all 2nd moment running avg. till now\n                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)\n                    # Use the max. for normalizing running avg. of gradient\n                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])\n                else:\n                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])\n\n                step_size = group['lr'] / bias_correction1\n\n                #GC operation and stepweight decay\n                G_grad=(exp_avg/denom).add(p.data,alpha=group['weight_decay'])        \n                if self.gc_loc==False:       \n                    G_grad=centralized_gradient(G_grad,use_gc=self.use_gc,gc_conv_only=self.gc_conv_only)\n                \n                p.add_( G_grad, alpha=-step_size)\n\n        return loss"
  },
  {
    "path": "algorithm-GC/algorithm/Centralization.py",
    "content": "import torch\n#from torch.optim.optimizer import Optimizer, required\n\n\n\ndef centralized_gradient(x,use_gc=True,gc_conv_only=False):\n    if use_gc:\n      if gc_conv_only:\n        if len(list(x.size()))>3:\n            x.add_(-x.mean(dim = tuple(range(1,len(list(x.size())))), keepdim = True))\n      else:\n        if len(list(x.size()))>1:\n            x.add_(-x.mean(dim = tuple(range(1,len(list(x.size())))), keepdim = True))\n    return x                   \n\n"
  },
  {
    "path": "algorithm-GC/algorithm/Lookahead.py",
    "content": "from collections import defaultdict\nfrom itertools import chain\nfrom torch.optim import Optimizer\nimport torch\nimport warnings\n\nclass Lookahead(Optimizer):\n    def __init__(self, optimizer, k=5, alpha=0.5):\n        self.optimizer = optimizer\n        self.k = k\n        self.alpha = alpha\n        self.param_groups = self.optimizer.param_groups\n        self.state = defaultdict(dict)\n        self.fast_state = self.optimizer.state\n        for group in self.param_groups:\n            group[\"counter\"] = 0\n    \n    def update(self, group):\n        for fast in group[\"params\"]:\n            param_state = self.state[fast]\n            if \"slow_param\" not in param_state:\n                param_state[\"slow_param\"] = torch.zeros_like(fast.data)\n                param_state[\"slow_param\"].copy_(fast.data)\n            slow = param_state[\"slow_param\"]\n            slow += (fast.data - slow) * self.alpha\n            fast.data.copy_(slow)\n    \n    def update_lookahead(self):\n        for group in self.param_groups:\n            self.update(group)\n\n    def step(self, closure=None):\n        loss = self.optimizer.step(closure)\n        for group in self.param_groups:\n            if group[\"counter\"] == 0:\n                self.update(group)\n            group[\"counter\"] += 1\n            if group[\"counter\"] >= self.k:\n                group[\"counter\"] = 0\n        return loss\n\n    def state_dict(self):\n        fast_state_dict = self.optimizer.state_dict()\n        slow_state = {\n            (id(k) if isinstance(k, torch.Tensor) else k): v\n            for k, v in self.state.items()\n        }\n        fast_state = fast_state_dict[\"state\"]\n        param_groups = fast_state_dict[\"param_groups\"]\n        return {\n            \"fast_state\": fast_state,\n            \"slow_state\": slow_state,\n            \"param_groups\": param_groups,\n        }\n\n    def load_state_dict(self, state_dict):\n        slow_state_dict = {\n            \"state\": state_dict[\"slow_state\"],\n            \"param_groups\": state_dict[\"param_groups\"],\n        }\n        fast_state_dict = {\n            \"state\": state_dict[\"fast_state\"],\n            \"param_groups\": state_dict[\"param_groups\"],\n        }\n        super(Lookahead, self).load_state_dict(slow_state_dict)\n        self.optimizer.load_state_dict(fast_state_dict)\n        self.fast_state = self.optimizer.state\n\n    def add_param_group(self, param_group):\n        param_group[\"counter\"] = 0\n        self.optimizer.add_param_group(param_group)"
  },
  {
    "path": "algorithm-GC/algorithm/RAdam.py",
    "content": "import math\nimport torch\nfrom torch.optim.optimizer import Optimizer\nfrom .Centralization import centralized_gradient\n\n\nclass RAdam(Optimizer):\n\n    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True,use_gc=False, gc_conv_only=False,gc_loc=False):\n        if not 0.0 <= lr:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if not 0.0 <= eps:\n            raise ValueError(\"Invalid epsilon value: {}\".format(eps))\n        if not 0.0 <= betas[0] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 0: {}\".format(betas[0]))\n        if not 0.0 <= betas[1] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 1: {}\".format(betas[1]))\n\n        self.degenerated_to_sgd = degenerated_to_sgd\n        self.gc_loc=gc_loc\n        self.use_gc=use_gc\n        self.gc_conv_only=gc_conv_only\n                \n        if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict):\n            for param in params:\n                if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]):\n                    param['buffer'] = [[None, None, None] for _ in range(10)]\n        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, buffer=[[None, None, None] for _ in range(10)])\n        super(RAdam, self).__init__(params, defaults)\n\n        \n    def __setstate__(self, state):\n        super(RAdam, self).__setstate__(state)\n\n    def step(self, closure=None):\n\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                grad = p.grad.data.float()\n                if grad.is_sparse:\n                    raise RuntimeError('RAdam does not support sparse gradients')\n\n                p_data_fp32 = p.data.float()\n\n                state = self.state[p]\n\n                if len(state) == 0:\n                    state['step'] = 0\n                    state['exp_avg'] = torch.zeros_like(p_data_fp32)\n                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)\n                else:\n                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)\n                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)\n\n                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']\n                beta1, beta2 = group['betas']\n                if self.gc_loc:\n                   grad=centralized_gradient(grad,use_gc=self.use_gc,gc_conv_only=self.gc_conv_only)\n                   \n                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)\n                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)\n                \n\n\n                state['step'] += 1\n                buffered = group['buffer'][int(state['step'] % 10)]\n                if state['step'] == buffered[0]:\n                    N_sma, step_size = buffered[1], buffered[2]\n                else:\n                    buffered[0] = state['step']\n                    beta2_t = beta2 ** state['step']\n                    N_sma_max = 2 / (1 - beta2) - 1\n                    N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)\n                    buffered[1] = N_sma\n\n                    # more conservative since it's an approximated value\n                    
if N_sma >= 5:\n                        step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])\n                    elif self.degenerated_to_sgd:\n                        step_size = 1.0 / (1 - beta1 ** state['step'])\n                    else:\n                        step_size = -1\n                    buffered[2] = step_size\n\n                # more conservative since it's an approximated value\n                if N_sma >= 5:\n                    denom = exp_avg_sq.sqrt().add_(group['eps'])\n                    G_grad = exp_avg / denom\n                elif step_size > 0:\n                    G_grad = exp_avg\n                else:\n                    # step_size == -1: rectification is not applicable and the SGD\n                    # fallback is disabled, so skip the update for this parameter\n                    continue\n\n                if group['weight_decay'] != 0:\n                    G_grad.add_(p_data_fp32, alpha=group['weight_decay'])\n                # GC operation\n                if not self.gc_loc:\n                    G_grad = centralized_gradient(G_grad, use_gc=self.use_gc, gc_conv_only=self.gc_conv_only)\n                p_data_fp32.add_(G_grad, alpha=-step_size * group['lr'])\n                p.data.copy_(p_data_fp32)\n        return loss\n\nclass PlainRAdam(Optimizer):\n\n    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True, use_gc=False, gc_conv_only=False, gc_loc=False):\n        if not 0.0 <= lr:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if not 0.0 <= eps:\n            raise ValueError(\"Invalid epsilon value: {}\".format(eps))\n        if not 0.0 <= betas[0] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 0: {}\".format(betas[0]))\n        if not 0.0 <= betas[1] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 1: {}\".format(betas[1]))\n\n        self.degenerated_to_sgd = degenerated_to_sgd\n        self.gc_loc = gc_loc\n        self.use_gc = use_gc\n        self.gc_conv_only = gc_conv_only\n        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)\n\n        super(PlainRAdam, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(PlainRAdam, self).__setstate__(state)\n\n    def step(self, closure=None):\n\n        loss = None\n        if closure is not None:\n            loss = closure()\n\n        for group in self.param_groups:\n\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                grad = p.grad.data.float()\n                if grad.is_sparse:\n                    raise RuntimeError('RAdam does not support sparse gradients')\n\n                p_data_fp32 = p.data.float()\n\n                state = self.state[p]\n\n                if len(state) == 0:\n                    state['step'] = 0\n                    state['exp_avg'] = torch.zeros_like(p_data_fp32)\n                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)\n                else:\n                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)\n                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)\n\n                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']\n                beta1, beta2 = group['betas']\n                if self.gc_loc:\n                    
grad = centralized_gradient(grad, use_gc=self.use_gc, gc_conv_only=self.gc_conv_only)\n\n                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)\n                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)\n\n                state['step'] += 1\n                beta2_t = beta2 ** state['step']\n                N_sma_max = 2 / (1 - beta2) - 1\n                N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)\n\n                # more conservative since it's an approximated value\n                if N_sma >= 5:\n                    # note: unlike RAdam above, step_size already includes the learning rate here\n                    step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])\n                    denom = exp_avg_sq.sqrt().add_(group['eps'])\n                    G_grad = exp_avg / denom\n\n                elif self.degenerated_to_sgd:\n                    step_size = group['lr'] / (1 - beta1 ** state['step'])\n                    G_grad = exp_avg\n\n                else:\n                    # rectification is not applicable and the SGD fallback is\n                    # disabled, so skip the update for this parameter\n                    continue\n\n                if group['weight_decay'] != 0:\n                    G_grad.add_(p_data_fp32, alpha=group['weight_decay'])\n\n                if not self.gc_loc:\n                    G_grad = centralized_gradient(G_grad, use_gc=self.use_gc, gc_conv_only=self.gc_conv_only)\n\n                # step_size already contains group['lr'], so it is not applied again\n                p_data_fp32.add_(G_grad, alpha=-step_size)\n                p.data.copy_(p_data_fp32)\n        return loss"
  },
  {
    "path": "algorithm-GC/algorithm/Ranger.py",
    "content": "import math\nimport torch\nfrom torch.optim.optimizer import Optimizer\nfrom .Centralization import centralized_gradient\n\n\nclass Ranger(Optimizer):\n\n    def __init__(self, params, lr=1e-3,                       # lr\n                 alpha=0.5, k=6, N_sma_threshhold=5,           # Ranger options\n                 betas=(.95, 0.999), eps=1e-5, weight_decay=0,  # Adam options\n                 # Gradient centralization on or off, applied to conv layers only or conv + fc layers\n                 use_gc=False, gc_conv_only=False,gc_loc=False\n                 ):\n\n        # parameter checks\n        if not 0.0 <= alpha <= 1.0:\n            raise ValueError(f'Invalid slow update rate: {alpha}')\n        if not 1 <= k:\n            raise ValueError(f'Invalid lookahead steps: {k}')\n        if not lr > 0:\n            raise ValueError(f'Invalid Learning Rate: {lr}')\n        if not eps > 0:\n            raise ValueError(f'Invalid eps: {eps}')\n\n        # parameter comments:\n        # beta1 (momentum) of .95 seems to work better than .90...\n        # N_sma_threshold of 5 seems better in testing than 4.\n        # In both cases, worth testing on your dataset (.90 vs .95, 4 vs 5) to make sure which works best for you.\n\n        # prep defaults and init torch.optim base\n        defaults = dict(lr=lr, alpha=alpha, k=k, step_counter=0, betas=betas,\n                        N_sma_threshhold=N_sma_threshhold, eps=eps, weight_decay=weight_decay)\n        super().__init__(params, defaults)\n\n        # adjustable threshold\n        self.N_sma_threshhold = N_sma_threshhold\n\n        # look ahead params\n\n        self.alpha = alpha\n        self.k = k\n\n        # radam buffer for state\n        self.radam_buffer = [[None, None, None] for ind in range(10)]\n\n        # gc on or off\n        self.gc_loc=gc_loc\n        self.use_gc = use_gc\n        self.gc_conv_only=gc_conv_only\n        # level of gradient centralization\n        #self.gc_gradient_threshold = 3 if gc_conv_only else 1\n\n        print(\n            f\"Ranger optimizer loaded. 
\\nGradient Centralization usage = {self.use_gc}\")\n        if (self.use_gc and self.gc_conv_only == False):\n            print(f\"GC applied to both conv and fc layers\")\n        elif (self.use_gc and self.gc_conv_only == True):\n            print(f\"GC applied to conv layers only\")\n\n    def __setstate__(self, state):\n        print(\"set state called\")\n        super(Ranger, self).__setstate__(state)\n\n    def step(self, closure=None):\n        loss = None\n        # note - below is commented out b/c I have other work that passes back the loss as a float, and thus not a callable closure.\n        # Uncomment if you need to use the actual closure...\n\n        # if closure is not None:\n        #loss = closure()\n\n        # Evaluate averages and grad, update param tensors\n        for group in self.param_groups:\n\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                grad = p.grad.data.float()\n\n                if grad.is_sparse:\n                    raise RuntimeError(\n                        'Ranger optimizer does not support sparse gradients')\n\n                p_data_fp32 = p.data.float()\n\n                state = self.state[p]  # get state dict for this param\n\n                if len(state) == 0:  # if first time to run...init dictionary with our desired entries\n                    # if self.first_run_check==0:\n                    # self.first_run_check=1\n                    #print(\"Initializing slow buffer...should not see this at load from saved model!\")\n                    state['step'] = 0\n                    state['exp_avg'] = torch.zeros_like(p_data_fp32)\n                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)\n\n                    # look ahead weight storage now in state dict\n                    state['slow_buffer'] = torch.empty_like(p.data)\n                    state['slow_buffer'].copy_(p.data)\n\n                else:\n                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)\n                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(\n                        p_data_fp32)\n\n                # begin computations\n                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']\n                beta1, beta2 = group['betas']\n\n                # GC operation for Conv layers and FC layers\n                #if grad.dim() > self.gc_gradient_threshold:\n                #    grad.add_(-grad.mean(dim=tuple(range(1, grad.dim())), keepdim=True))\n                if self.gc_loc:\n                   grad=centralized_gradient(grad,use_gc=self.use_gc,gc_conv_only=self.gc_conv_only)\n                   \n                state['step'] += 1\n\n                # compute variance mov avg\n                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)\n                \n                # compute mean moving avg\n                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)\n                \n\n                buffered = self.radam_buffer[int(state['step'] % 10)]\n\n                if state['step'] == buffered[0]:\n                    N_sma, step_size = buffered[1], buffered[2]\n                else:\n                    buffered[0] = state['step']\n                    beta2_t = beta2 ** state['step']\n                    N_sma_max = 2 / (1 - beta2) - 1\n                    N_sma = N_sma_max - 2 * \\\n                        state['step'] * beta2_t / (1 - beta2_t)\n                    buffered[1] = N_sma\n                    if 
N_sma > self.N_sma_threshhold:\n                        step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (\n                            N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])\n                    else:\n                        step_size = 1.0 / (1 - beta1 ** state['step'])\n                    buffered[2] = step_size\n\n                #if group['weight_decay'] != 0:\n                #    p_data_fp32.add_(-group['weight_decay']\n                #                     * group['lr'], p_data_fp32)\n\n                # apply lr\n                if N_sma > self.N_sma_threshhold:\n                    denom = exp_avg_sq.sqrt().add_(group['eps'])                                         \n                    G_grad=exp_avg/denom                                                                                    \n                else:\n                    G_grad=exp_avg  \n\n                if group['weight_decay'] != 0:\n                       G_grad.add_(p_data_fp32,alpha=group['weight_decay']) \n                #GC operation                                                   \n                if self.gc_loc==False:       \n                    G_grad=centralized_gradient(G_grad,use_gc=self.use_gc,gc_conv_only=self.gc_conv_only)\n                                                           \n                p_data_fp32.add_( G_grad, alpha=-step_size * group['lr'])\n                p.data.copy_(p_data_fp32)\n\n                # integrated look ahead...\n                # we do it at the param level instead of group level\n                if state['step'] % group['k'] == 0:\n                    # get access to slow param tensor\n                    slow_p = state['slow_buffer']\n                    # (fast weights - slow weights) * alpha\n                    slow_p.add_( p.data - slow_p,alpha=self.alpha)\n                    # copy interpolated weights to RAdam param tensor\n                    p.data.copy_(slow_p)\n\n        return loss"
  },
  {
    "path": "algorithm-GC/algorithm/SGD.py",
    "content": "import torch\nfrom torch.optim.optimizer import Optimizer, required\n\nfrom .Centralization import centralized_gradient\n\nclass SGD(Optimizer):\n    r\"\"\"Implements stochastic gradient descent (optionally with momentum).\n\n    Nesterov momentum is based on the formula from\n    `On the importance of initialization and momentum in deep learning`__.\n\n    Args:\n        params (iterable): iterable of parameters to optimize or dicts defining\n            parameter groups\n        lr (float): learning rate\n        momentum (float, optional): momentum factor (default: 0)\n        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)\n        dampening (float, optional): dampening for momentum (default: 0)\n        nesterov (bool, optional): enables Nesterov momentum (default: False)\n\n    Example:\n        >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)\n        >>> optimizer.zero_grad()\n        >>> loss_fn(model(input), target).backward()\n        >>> optimizer.step()\n\n    __ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf\n\n    .. note::\n        The implementation of SGD with Momentum/Nesterov subtly differs from\n        Sutskever et. al. and implementations in some other frameworks.\n\n        Considering the specific case of Momentum, the update can be written as\n\n        .. math::\n            \\begin{aligned}\n                v_{t+1} & = \\mu * v_{t} + g_{t+1}, \\\\\n                p_{t+1} & = p_{t} - \\text{lr} * v_{t+1},\n            \\end{aligned}\n\n        where :math:`p`, :math:`g`, :math:`v` and :math:`\\mu` denote the \n        parameters, gradient, velocity, and momentum respectively.\n\n        This is in contrast to Sutskever et. al. and\n        other frameworks which employ an update of the form\n\n        .. 
math::\n            \\begin{aligned}\n                v_{t+1} & = \\mu * v_{t} + \\text{lr} * g_{t+1}, \\\\\n                p_{t+1} & = p_{t} - v_{t+1}.\n            \\end{aligned}\n\n        The Nesterov version is analogously modified.\n    \"\"\"\n\n    def __init__(self, params, lr=required, momentum=0, dampening=0,\n                 weight_decay=0, nesterov=False,use_gc=False, gc_conv_only=False):\n        if lr is not required and lr < 0.0:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if momentum < 0.0:\n            raise ValueError(\"Invalid momentum value: {}\".format(momentum))\n        if weight_decay < 0.0:\n            raise ValueError(\"Invalid weight_decay value: {}\".format(weight_decay))\n\n        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,\n                        weight_decay=weight_decay, nesterov=nesterov, use_gc=use_gc,gc_conv_only=gc_conv_only)\n        if nesterov and (momentum <= 0 or dampening != 0):\n            raise ValueError(\"Nesterov momentum requires a momentum and zero dampening\")\n        super(SGD, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(SGD, self).__setstate__(state)\n        for group in self.param_groups:\n            group.setdefault('nesterov', False)\n\n    @torch.no_grad()\n    def step(self, closure=None):\n        \"\"\"Performs a single optimization step.\n\n        Arguments:\n            closure (callable, optional): A closure that reevaluates the model\n                and returns the loss.\n        \"\"\"\n        loss = None\n        if closure is not None:\n            with torch.enable_grad():\n                loss = closure()\n\n        for group in self.param_groups:\n            weight_decay = group['weight_decay']\n            momentum = group['momentum']\n            dampening = group['dampening']\n            nesterov = group['nesterov']\n\n            for p in group['params']:\n                if p.grad is None:\n                    continue\n                d_p = p.grad\n                if weight_decay != 0:\n                    d_p = d_p.add(p, alpha=weight_decay)\n                    \n                #GC operation     \n                d_p =centralized_gradient(d_p ,use_gc=group['use_gc'],gc_conv_only=group['gc_conv_only'])                 \n                                        \n                if momentum != 0:\n                    param_state = self.state[p]\n                    if 'momentum_buffer' not in param_state:\n                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()\n                    else:\n                        buf = param_state['momentum_buffer']\n                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)\n                    if nesterov:\n                        d_p = d_p.add(buf, alpha=momentum)\n                    else:\n                        d_p = buf\n \n                             \n                p.add_(d_p, alpha=-group['lr'])\n\n        return loss\n"
  },
  {
    "path": "algorithm-GC/cifar/main.py",
    "content": "'''Train CIFAR100 with PyTorch.'''\nfrom __future__ import print_function\n\nimport torch\nimport torch.nn as nn\nimport torch.backends.cudnn as cudnn\n\n\nimport torch.optim as optim\nimport torch.nn.functional as F\n\nimport torchvision\nimport torchvision.transforms as transforms\n\n\nfrom torch.optim import lr_scheduler\nimport os\nimport argparse\nfrom torchvision import datasets, models\nfrom models import *\n#from utils import progress_bar\nimport numpy as np\n\n\nimport sys \nsys.path.append('../')\n \n#import optimizers with GC\nfrom algorithm.SGD import SGD\nfrom algorithm.Adam import Adam,AdamW\nfrom algorithm.RAdam import RAdam\nfrom algorithm.Lookahead import Lookahead\nfrom algorithm.Ranger import Ranger\n#from algorithm.Adam import Adam_GCC,AdamW,AdamW_GCC\n#from algorithm.Adagrad import Adagrad_GCC\n\n\nparser = argparse.ArgumentParser(description='PyTorch CIFAR100 Training')\nparser.add_argument('--lr', default=0.1, type=float, help='learning rate')\nparser.add_argument('--resume', '-r', action='store_true', help='resume from checkpoint')\nparser.add_argument('--bs', default=128, type=int, help='batchsize')\nparser.add_argument('--wd', default=0.0005, type=float, help='weight decay')\nparser.add_argument('--alg', default='sgd', type=str, help='algorithm')\nparser.add_argument('--epochs', default=200, type=int, help='epochs')\nparser.add_argument('--path', default='logout/result', type=str, help='path')\nparser.add_argument('--model', default='r50', type=str, help='model')\nparser.add_argument('--gpug', default=1, type=int, help='gpugroup')\n\nargs = parser.parse_args()\n#os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n\nif args.gpug==11:\n      os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"1\"   \nif args.gpug==12:\n      os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"2\"   \nif args.gpug==13:\n      os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"3\"   \nif args.gpug==14:\n      os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"4\"   \nif args.gpug==15:\n      os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"5\"   \nif args.gpug==16:\n      os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"6\"   \nif args.gpug==17:\n      os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"7\"   \nif args.gpug==10:\n     os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n\nepochs=args.epochs\ndevice = 'cuda' if torch.cuda.is_available() else 'cpu'\nbest_acc = 0  # best test accuracy\nstart_epoch = 0  # start from epoch 0 or last checkpoint epoch\n\n\n\n# Data\nprint('==> Preparing data..')\ntransform_train = transforms.Compose([\n    transforms.RandomCrop(32, padding=4),\n    transforms.RandomHorizontalFlip(),\n    transforms.ToTensor(),\n    transforms.Normalize((0.507, 0.487, 0.441), (0.267, 0.256, 0.276)),\n  ])\ntransform_test = transforms.Compose([\n    transforms.ToTensor(),\n    transforms.Normalize((0.507, 0.487, 0.441), (0.267, 0.256, 0.276)),\n  ])\ntrainset = torchvision.datasets.CIFAR100(root='/home/yonghw/data/cifar100/', train=True, download=True, transform=transform_train)\ntrainloader = torch.utils.data.DataLoader(trainset, batch_size=args.bs, shuffle=True, num_workers=4,drop_last=True)\ntestset = torchvision.datasets.CIFAR100(root='/home/yonghw/data/cifar100/', train=False, download=True, transform=transform_test)\ntestloader = torch.utils.data.DataLoader(testset, batch_size=128, shuffle=False, num_workers=4)\n\n\n\n\n# Model\nprint('==> Building model..')\n\nNum_classes = 100\n\nif args.model=='r18':\n    net = ResNet18(Num_classes=Num_classes)\nif args.model=='r34':\n    net = ResNet34(Num_classes=Num_classes)\nif 
args.model=='r50':\n    net = ResNet50(Num_classes=Num_classes)\nif args.model=='r101':\n    net = ResNet101(Num_classes=Num_classes)\nif args.model=='v11':\n    net = VGG('VGG11',Num_classes=Num_classes)\nif args.model=='rx29':\n    net = ResNeXt29_4x64d(Num_classes=Num_classes)\nif args.model=='d121':\n    net = DenseNet121(Num_classes=Num_classes)\n\nif device == 'cuda':\n    net = net.cuda()\n    net = torch.nn.DataParallel(net)\n    cudnn.benchmark = True\n\n\nif args.resume:\n    # Load checkpoint.\n    print('==> Resuming from checkpoint..')\n    assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!'\n    checkpoint = torch.load('./checkpoint/ckpt.t7')\n    net.load_state_dict(checkpoint['net'])\n    best_acc = checkpoint['acc']\n    start_epoch = checkpoint['epoch']\n    \ncriterion = nn.CrossEntropyLoss()\n\n#optimizer\nWD=args.wd\nprint('==> choose optimizer..')\nif args.alg=='sgd':\n    optimizer = SGD(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD,use_gc=False, gc_conv_only=False)\nif args.alg=='sgdGC':\n    optimizer = SGD(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD,use_gc=True, gc_conv_only=False)\nif args.alg=='sgdGCC':\n    optimizer = SGD(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD,use_gc=True, gc_conv_only=True)    \n    \n\n\nif args.alg=='adam':\n    optimizer = Adam(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=False, gc_conv_only=False)\nif args.alg=='adamGC':\n    optimizer = Adam(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=True, gc_conv_only=False)\nif args.alg=='adamGCC':\n    optimizer = Adam(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=True, gc_conv_only=True)\n\n\nif args.alg=='adamW':\n    optimizer = AdamW(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=False, gc_conv_only=False)\nif args.alg=='adamWGC':\n    optimizer = AdamW(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=True, gc_conv_only=False)\nif args.alg=='adamWGCC':\n    optimizer = AdamW(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=True, gc_conv_only=True)\n\n\nif args.alg=='radam':\n    optimizer = RAdam(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=False, gc_conv_only=False)\nif args.alg=='radamGC':\n    optimizer = RAdam(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=True, gc_conv_only=False)\nif args.alg=='radamGCC':\n    optimizer = RAdam(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=True, gc_conv_only=True)\n\n\n\n\nif args.alg=='Lsgd':\n    base_opt = SGD(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD,use_gc=False, gc_conv_only=False)\n    optimizer = Lookahead(base_opt, k=5, alpha=0.5)\nif args.alg=='LsgdGC':\n    base_opt = SGD(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD,use_gc=True, gc_conv_only=False)\n    optimizer = Lookahead(base_opt, k=5, alpha=0.5)\nif args.alg=='LsgdGCC':\n    base_opt = SGD(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD,use_gc=True, gc_conv_only=True)\n    optimizer = Lookahead(base_opt, k=5, alpha=0.5)\n\n\nif args.alg=='Ladam':\n     base_opt  = Adam(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=False, gc_conv_only=False)\n     optimizer = Lookahead(base_opt, k=5, alpha=0.5)     \nif args.alg=='LadamGC':\n     base_opt  = Adam(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=True, gc_conv_only=False)\n     optimizer = Lookahead(base_opt, k=5, alpha=0.5)     \nif args.alg=='LadamGCC':\n     base_opt  = 
Adam(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=True, gc_conv_only=True)\n     optimizer = Lookahead(base_opt, k=5, alpha=0.5)     \n\nif args.alg=='Lradam':\n     base_opt  = RAdam(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=False, gc_conv_only=False)\n     optimizer = Lookahead(base_opt, k=5, alpha=0.5)     \nif args.alg=='LradamGC':\n     base_opt  = RAdam(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=True, gc_conv_only=False)\n     optimizer = Lookahead(base_opt, k=5, alpha=0.5)     \nif args.alg=='LradamGCC':\n     base_opt  = RAdam(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=True, gc_conv_only=True)\n     optimizer = Lookahead(base_opt, k=5, alpha=0.5) \n\n\n\nif args.alg=='ranger':\n    optimizer = Ranger(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=False, gc_conv_only=False)\nif args.alg=='rangerGC':\n    optimizer = Ranger(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=True, gc_conv_only=False)\nif args.alg=='rangerGCC':\n    optimizer = Ranger(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=True, gc_conv_only=True)\n    \n    \n\nif args.epochs==200:\n   exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=60, gamma=0.1)\nif args.epochs==400:\n   exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=120, gamma=0.1)\n# Training\ndef train(epoch,net,optimizer):\n    print('\\nEpoch: %d' % epoch)\n    net.train()\n    train_loss = 0\n    correct = 0\n    total = 0\n    for batch_idx, (inputs, targets) in enumerate(trainloader):\n        inputs, targets = inputs.to(device), targets.to(device)\n        optimizer.zero_grad()\n        outputs = net(inputs)\n        loss = criterion(outputs, targets)\n        loss.backward()\n        optimizer.step()\n\n        train_loss += loss.item()\n        _, predicted = outputs.max(1)\n        total += targets.size(0)\n        correct += predicted.eq(targets).sum().item()\n    print('Training: Loss: {:.4f} | Acc: {:.4f}'.format(train_loss/(batch_idx+1),correct/total))\n    #        progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'\n    #            % (train_loss/(batch_idx+1), 100.*correct/total, correct, total))\n    acc=100.*correct/total\n    return acc\n    \n# Testing\ndef test(epoch,net):\n    global best_acc\n    net.eval()\n    test_loss = 0\n    correct = 0\n    total = 0\n    with torch.no_grad():\n      for batch_idx, (inputs, targets) in enumerate(testloader):\n            inputs, targets = inputs.to(device), targets.to(device)\n            outputs = net(inputs)\n            loss = criterion(outputs, targets)\n\n            test_loss += loss.item()\n            _, predicted = outputs.max(1)\n            total += targets.size(0)\n            correct += predicted.eq(targets).sum().item()\n\n            #progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'\n                #% (test_loss/(batch_idx+1), 100.*correct/total, correct, total))\n    print('Testing:Loss: {:.4f} | Acc: {:.4f}'.format(test_loss/(batch_idx+1),correct/total) )\n\n    # Save checkpoint.\n    acc = 100.*correct/total\n    if acc > best_acc:\n        print('Saving..')\n        state = {\n            'net': net.state_dict(),\n            'acc': acc,\n            'epoch': epoch,\n        }\n        if not os.path.isdir('checkpoint'):\n            os.mkdir('checkpoint')\n        torch.save(state, './checkpoint/ckpt.t7')\n        best_acc = acc\n    return acc\n\n\nfor epoch in range(start_epoch, 
start_epoch+epochs):\n    train_acc=train(epoch,net,optimizer)\n    exp_lr_scheduler.step()\n    val_acc=test(epoch,net)\n\n"
  },
  {
    "path": "algorithm-GC/cifar/models/__init__.py",
    "content": "from .vgg import *\nfrom .dpn import *\nfrom .lenet import *\nfrom .senet import *\nfrom .pnasnet import *\nfrom .densenet import *\nfrom .googlenet import *\nfrom .shufflenet import *\nfrom .resnet import *\nfrom .resnext import *\nfrom .preact_resnet import *\nfrom .mobilenet import *\nfrom .mobilenetv2 import *\n"
  },
  {
    "path": "algorithm-GC/cifar/models/densenet.py",
    "content": "'''DenseNet in PyTorch.'''\nimport math\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Bottleneck(nn.Module):\n    def __init__(self, in_planes, growth_rate):\n        super(Bottleneck, self).__init__()\n        self.bn1 = nn.BatchNorm2d(in_planes)\n        self.conv1 = nn.Conv2d(in_planes, 4*growth_rate, kernel_size=1, bias=False)\n        self.bn2 = nn.BatchNorm2d(4*growth_rate)\n        self.conv2 = nn.Conv2d(4*growth_rate, growth_rate, kernel_size=3, padding=1, bias=False)\n\n    def forward(self, x):\n        out = self.conv1(F.relu(self.bn1(x)))\n        out = self.conv2(F.relu(self.bn2(out)))\n        out = torch.cat([out,x], 1)\n        return out\n\n\nclass Transition(nn.Module):\n    def __init__(self, in_planes, out_planes):\n        super(Transition, self).__init__()\n        self.bn = nn.BatchNorm2d(in_planes)\n        self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=False)\n\n    def forward(self, x):\n        out = self.conv(F.relu(self.bn(x)))\n        out = F.avg_pool2d(out, 2)\n        return out\n\n\nclass DenseNet(nn.Module):\n    def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10):\n        super(DenseNet, self).__init__()\n        self.growth_rate = growth_rate\n\n        num_planes = 2*growth_rate\n        self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, padding=1, bias=False)\n\n        self.dense1 = self._make_dense_layers(block, num_planes, nblocks[0])\n        num_planes += nblocks[0]*growth_rate\n        out_planes = int(math.floor(num_planes*reduction))\n        self.trans1 = Transition(num_planes, out_planes)\n        num_planes = out_planes\n\n        self.dense2 = self._make_dense_layers(block, num_planes, nblocks[1])\n        num_planes += nblocks[1]*growth_rate\n        out_planes = int(math.floor(num_planes*reduction))\n        self.trans2 = Transition(num_planes, out_planes)\n        num_planes = out_planes\n\n        self.dense3 = self._make_dense_layers(block, num_planes, nblocks[2])\n        num_planes += nblocks[2]*growth_rate\n        out_planes = int(math.floor(num_planes*reduction))\n        self.trans3 = Transition(num_planes, out_planes)\n        num_planes = out_planes\n\n        self.dense4 = self._make_dense_layers(block, num_planes, nblocks[3])\n        num_planes += nblocks[3]*growth_rate\n\n        self.bn = nn.BatchNorm2d(num_planes)\n        self.linear = nn.Linear(num_planes, num_classes)\n\n    def _make_dense_layers(self, block, in_planes, nblock):\n        layers = []\n        for i in range(nblock):\n            layers.append(block(in_planes, self.growth_rate))\n            in_planes += self.growth_rate\n        return nn.Sequential(*layers)\n\n    def forward(self, x):\n        out = self.conv1(x)\n        out = self.trans1(self.dense1(out))\n        out = self.trans2(self.dense2(out))\n        out = self.trans3(self.dense3(out))\n        out = self.dense4(out)\n        out = F.avg_pool2d(F.relu(self.bn(out)), 4)\n        out = out.view(out.size(0), -1)\n        out = self.linear(out)\n        return out\n\ndef DenseNet121():\n    return DenseNet(Bottleneck, [6,12,24,16], growth_rate=32)\n\ndef DenseNet169():\n    return DenseNet(Bottleneck, [6,12,32,32], growth_rate=32)\n\ndef DenseNet201():\n    return DenseNet(Bottleneck, [6,12,48,32], growth_rate=32)\n\ndef DenseNet161():\n    return DenseNet(Bottleneck, [6,12,36,24], growth_rate=48)\n\ndef densenet_cifar():\n    return DenseNet(Bottleneck, [6,12,24,16], growth_rate=12)\n\ndef 
test():\n    net = densenet_cifar()\n    x = torch.randn(1,3,32,32)\n    y = net(x)\n    print(y)\n\n# test()\n"
  },
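  {
    "path": "algorithm-GC/cifar/notes/densenet_channels_note.py",
    "content": "# Editor's note -- hypothetical file, not part of the original repo.\n# Channel bookkeeping for densenet_cifar (growth_rate=12, nblocks=[6,12,24,16],\n# reduction=0.5), mirroring the arithmetic in DenseNet.__init__.\nimport math\n\nnum_planes = 2 * 12                              # conv1 output: 24\nnum_planes += 6 * 12                             # after dense1: 96\nnum_planes = int(math.floor(num_planes * 0.5))   # after trans1: 48\nnum_planes += 12 * 12                            # after dense2: 192\nnum_planes = int(math.floor(num_planes * 0.5))   # after trans2: 96\nnum_planes += 24 * 12                            # after dense3: 384\nnum_planes = int(math.floor(num_planes * 0.5))   # after trans3: 192\nnum_planes += 16 * 12                            # after dense4\nprint(num_planes)                                # 384 -> feeds the final BN and Linear\n"
  },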
  {
    "path": "algorithm-GC/cifar/models/dpn.py",
    "content": "'''Dual Path Networks in PyTorch.'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Bottleneck(nn.Module):\n    def __init__(self, last_planes, in_planes, out_planes, dense_depth, stride, first_layer):\n        super(Bottleneck, self).__init__()\n        self.out_planes = out_planes\n        self.dense_depth = dense_depth\n\n        self.conv1 = nn.Conv2d(last_planes, in_planes, kernel_size=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(in_planes)\n        self.conv2 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=32, bias=False)\n        self.bn2 = nn.BatchNorm2d(in_planes)\n        self.conv3 = nn.Conv2d(in_planes, out_planes+dense_depth, kernel_size=1, bias=False)\n        self.bn3 = nn.BatchNorm2d(out_planes+dense_depth)\n\n        self.shortcut = nn.Sequential()\n        if first_layer:\n            self.shortcut = nn.Sequential(\n                nn.Conv2d(last_planes, out_planes+dense_depth, kernel_size=1, stride=stride, bias=False),\n                nn.BatchNorm2d(out_planes+dense_depth)\n            )\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = F.relu(self.bn2(self.conv2(out)))\n        out = self.bn3(self.conv3(out))\n        x = self.shortcut(x)\n        d = self.out_planes\n        out = torch.cat([x[:,:d,:,:]+out[:,:d,:,:], x[:,d:,:,:], out[:,d:,:,:]], 1)\n        out = F.relu(out)\n        return out\n\n\nclass DPN(nn.Module):\n    def __init__(self, cfg):\n        super(DPN, self).__init__()\n        in_planes, out_planes = cfg['in_planes'], cfg['out_planes']\n        num_blocks, dense_depth = cfg['num_blocks'], cfg['dense_depth']\n\n        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(64)\n        self.last_planes = 64\n        self.layer1 = self._make_layer(in_planes[0], out_planes[0], num_blocks[0], dense_depth[0], stride=1)\n        self.layer2 = self._make_layer(in_planes[1], out_planes[1], num_blocks[1], dense_depth[1], stride=2)\n        self.layer3 = self._make_layer(in_planes[2], out_planes[2], num_blocks[2], dense_depth[2], stride=2)\n        self.layer4 = self._make_layer(in_planes[3], out_planes[3], num_blocks[3], dense_depth[3], stride=2)\n        self.linear = nn.Linear(out_planes[3]+(num_blocks[3]+1)*dense_depth[3], 10)\n\n    def _make_layer(self, in_planes, out_planes, num_blocks, dense_depth, stride):\n        strides = [stride] + [1]*(num_blocks-1)\n        layers = []\n        for i,stride in enumerate(strides):\n            layers.append(Bottleneck(self.last_planes, in_planes, out_planes, dense_depth, stride, i==0))\n            self.last_planes = out_planes + (i+2) * dense_depth\n        return nn.Sequential(*layers)\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = self.layer1(out)\n        out = self.layer2(out)\n        out = self.layer3(out)\n        out = self.layer4(out)\n        out = F.avg_pool2d(out, 4)\n        out = out.view(out.size(0), -1)\n        out = self.linear(out)\n        return out\n\n\ndef DPN26():\n    cfg = {\n        'in_planes': (96,192,384,768),\n        'out_planes': (256,512,1024,2048),\n        'num_blocks': (2,2,2,2),\n        'dense_depth': (16,32,24,128)\n    }\n    return DPN(cfg)\n\ndef DPN92():\n    cfg = {\n        'in_planes': (96,192,384,768),\n        'out_planes': (256,512,1024,2048),\n        'num_blocks': (3,4,20,3),\n        'dense_depth': (16,32,24,128)\n    }\n    
return DPN(cfg)\n\n\ndef test():\n    net = DPN92()\n    x = torch.randn(1,3,32,32)\n    y = net(x)\n    print(y)\n\n# test()\n"
  },
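  {
    "path": "algorithm-GC/cifar/notes/dpn_dual_path_sketch.py",
    "content": "# Editor's sketch -- hypothetical file, not part of the original repo.\n# How Bottleneck's torch.cat keeps a fixed-width residual path (first d channels,\n# summed) while the dense path grows by dense_depth channels per block; d and k\n# below are small stand-ins for out_planes and dense_depth.\nimport torch\n\nd, k = 4, 2\nx = torch.randn(1, d + 3 * k, 8, 8)      # shortcut already carrying 3 dense slices\nout = torch.randn(1, d + k, 8, 8)        # conv3 output: d residual + k new dense channels\ny = torch.cat([x[:, :d] + out[:, :d], x[:, d:], out[:, d:]], 1)\nprint(y.size(1), d + 4 * k)              # 12 12: the dense path grew by exactly k\n"
  },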
  {
    "path": "algorithm-GC/cifar/models/googlenet.py",
    "content": "'''GoogLeNet with PyTorch.'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Inception(nn.Module):\n    def __init__(self, in_planes, n1x1, n3x3red, n3x3, n5x5red, n5x5, pool_planes):\n        super(Inception, self).__init__()\n        # 1x1 conv branch\n        self.b1 = nn.Sequential(\n            nn.Conv2d(in_planes, n1x1, kernel_size=1),\n            nn.BatchNorm2d(n1x1),\n            nn.ReLU(True),\n        )\n\n        # 1x1 conv -> 3x3 conv branch\n        self.b2 = nn.Sequential(\n            nn.Conv2d(in_planes, n3x3red, kernel_size=1),\n            nn.BatchNorm2d(n3x3red),\n            nn.ReLU(True),\n            nn.Conv2d(n3x3red, n3x3, kernel_size=3, padding=1),\n            nn.BatchNorm2d(n3x3),\n            nn.ReLU(True),\n        )\n\n        # 1x1 conv -> 5x5 conv branch\n        self.b3 = nn.Sequential(\n            nn.Conv2d(in_planes, n5x5red, kernel_size=1),\n            nn.BatchNorm2d(n5x5red),\n            nn.ReLU(True),\n            nn.Conv2d(n5x5red, n5x5, kernel_size=3, padding=1),\n            nn.BatchNorm2d(n5x5),\n            nn.ReLU(True),\n            nn.Conv2d(n5x5, n5x5, kernel_size=3, padding=1),\n            nn.BatchNorm2d(n5x5),\n            nn.ReLU(True),\n        )\n\n        # 3x3 pool -> 1x1 conv branch\n        self.b4 = nn.Sequential(\n            nn.MaxPool2d(3, stride=1, padding=1),\n            nn.Conv2d(in_planes, pool_planes, kernel_size=1),\n            nn.BatchNorm2d(pool_planes),\n            nn.ReLU(True),\n        )\n\n    def forward(self, x):\n        y1 = self.b1(x)\n        y2 = self.b2(x)\n        y3 = self.b3(x)\n        y4 = self.b4(x)\n        return torch.cat([y1,y2,y3,y4], 1)\n\n\nclass GoogLeNet(nn.Module):\n    def __init__(self):\n        super(GoogLeNet, self).__init__()\n        self.pre_layers = nn.Sequential(\n            nn.Conv2d(3, 192, kernel_size=3, padding=1),\n            nn.BatchNorm2d(192),\n            nn.ReLU(True),\n        )\n\n        self.a3 = Inception(192,  64,  96, 128, 16, 32, 32)\n        self.b3 = Inception(256, 128, 128, 192, 32, 96, 64)\n\n        self.maxpool = nn.MaxPool2d(3, stride=2, padding=1)\n\n        self.a4 = Inception(480, 192,  96, 208, 16,  48,  64)\n        self.b4 = Inception(512, 160, 112, 224, 24,  64,  64)\n        self.c4 = Inception(512, 128, 128, 256, 24,  64,  64)\n        self.d4 = Inception(512, 112, 144, 288, 32,  64,  64)\n        self.e4 = Inception(528, 256, 160, 320, 32, 128, 128)\n\n        self.a5 = Inception(832, 256, 160, 320, 32, 128, 128)\n        self.b5 = Inception(832, 384, 192, 384, 48, 128, 128)\n\n        self.avgpool = nn.AvgPool2d(8, stride=1)\n        self.linear = nn.Linear(1024, 10)\n\n    def forward(self, x):\n        out = self.pre_layers(x)\n        out = self.a3(out)\n        out = self.b3(out)\n        out = self.maxpool(out)\n        out = self.a4(out)\n        out = self.b4(out)\n        out = self.c4(out)\n        out = self.d4(out)\n        out = self.e4(out)\n        out = self.maxpool(out)\n        out = self.a5(out)\n        out = self.b5(out)\n        out = self.avgpool(out)\n        out = out.view(out.size(0), -1)\n        out = self.linear(out)\n        return out\n\n\ndef test():\n    net = GoogLeNet()\n    x = torch.randn(1,3,32,32)\n    y = net(x)\n    print(y.size())\n\n# test()\n"
  },
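  {
    "path": "algorithm-GC/cifar/notes/googlenet_widths_note.py",
    "content": "# Editor's note -- hypothetical file, not part of the original repo.\n# Each Inception module's output width is n1x1 + n3x3 + n5x5 + pool_planes, and it\n# must match the next module's in_planes.\nprint(64 + 128 + 32 + 32)     # a3 -> 256, which is b3's in_planes\nprint(384 + 384 + 128 + 128)  # b5 -> 1024, the nn.Linear(1024, 10) input width\n"
  },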
  {
    "path": "algorithm-GC/cifar/models/lenet.py",
    "content": "'''LeNet in PyTorch.'''\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nclass LeNet(nn.Module):\n    def __init__(self):\n        super(LeNet, self).__init__()\n        self.conv1 = nn.Conv2d(3, 6, 5)\n        self.conv2 = nn.Conv2d(6, 16, 5)\n        self.fc1   = nn.Linear(16*5*5, 120)\n        self.fc2   = nn.Linear(120, 84)\n        self.fc3   = nn.Linear(84, 10)\n\n    def forward(self, x):\n        out = F.relu(self.conv1(x))\n        out = F.max_pool2d(out, 2)\n        out = F.relu(self.conv2(out))\n        out = F.max_pool2d(out, 2)\n        out = out.view(out.size(0), -1)\n        out = F.relu(self.fc1(out))\n        out = F.relu(self.fc2(out))\n        out = self.fc3(out)\n        return out\n"
  },
  {
    "path": "algorithm-GC/cifar/models/mobilenet.py",
    "content": "'''MobileNet in PyTorch.\n\nSee the paper \"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications\"\nfor more details.\n'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Block(nn.Module):\n    '''Depthwise conv + Pointwise conv'''\n    def __init__(self, in_planes, out_planes, stride=1):\n        super(Block, self).__init__()\n        self.conv1 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=in_planes, bias=False)\n        self.bn1 = nn.BatchNorm2d(in_planes)\n        self.conv2 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)\n        self.bn2 = nn.BatchNorm2d(out_planes)\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = F.relu(self.bn2(self.conv2(out)))\n        return out\n\n\nclass MobileNet(nn.Module):\n    # (128,2) means conv planes=128, conv stride=2, by default conv stride=1\n    cfg = [64, (128,2), 128, (256,2), 256, (512,2), 512, 512, 512, 512, 512, (1024,2), 1024]\n\n    def __init__(self, num_classes=10):\n        super(MobileNet, self).__init__()\n        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(32)\n        self.layers = self._make_layers(in_planes=32)\n        self.linear = nn.Linear(1024, num_classes)\n\n    def _make_layers(self, in_planes):\n        layers = []\n        for x in self.cfg:\n            out_planes = x if isinstance(x, int) else x[0]\n            stride = 1 if isinstance(x, int) else x[1]\n            layers.append(Block(in_planes, out_planes, stride))\n            in_planes = out_planes\n        return nn.Sequential(*layers)\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = self.layers(out)\n        out = F.avg_pool2d(out, 2)\n        out = out.view(out.size(0), -1)\n        out = self.linear(out)\n        return out\n\n\ndef test():\n    net = MobileNet()\n    x = torch.randn(1,3,32,32)\n    y = net(x)\n    print(y.size())\n\n# test()\n"
  },
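  {
    "path": "algorithm-GC/cifar/notes/mobilenet_params_note.py",
    "content": "# Editor's note -- hypothetical file, not part of the original repo.\n# Why Block's depthwise + pointwise pair is cheap next to a dense 3x3 conv,\n# e.g. for a 32 -> 64 channel layer (bias-free weight counts).\ncin, cout, k = 32, 64, 3\nstandard = cin * cout * k * k         # 18432 weights\nseparable = cin * k * k + cin * cout  # 288 depthwise + 2048 pointwise = 2336\nprint(standard, separable)            # roughly 7.9x fewer weights\n"
  },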
  {
    "path": "algorithm-GC/cifar/models/mobilenetv2.py",
    "content": "'''MobileNetV2 in PyTorch.\n\nSee the paper \"Inverted Residuals and Linear Bottlenecks:\nMobile Networks for Classification, Detection and Segmentation\" for more details.\n'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Block(nn.Module):\n    '''expand + depthwise + pointwise'''\n    def __init__(self, in_planes, out_planes, expansion, stride):\n        super(Block, self).__init__()\n        self.stride = stride\n\n        planes = expansion * in_planes\n        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, stride=1, padding=0, bias=False)\n        self.bn1 = nn.BatchNorm2d(planes)\n        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, groups=planes, bias=False)\n        self.bn2 = nn.BatchNorm2d(planes)\n        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)\n        self.bn3 = nn.BatchNorm2d(out_planes)\n\n        self.shortcut = nn.Sequential()\n        if stride == 1 and in_planes != out_planes:\n            self.shortcut = nn.Sequential(\n                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False),\n                nn.BatchNorm2d(out_planes),\n            )\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = F.relu(self.bn2(self.conv2(out)))\n        out = self.bn3(self.conv3(out))\n        out = out + self.shortcut(x) if self.stride==1 else out\n        return out\n\n\nclass MobileNetV2(nn.Module):\n    # (expansion, out_planes, num_blocks, stride)\n    cfg = [(1,  16, 1, 1),\n           (6,  24, 2, 1),  # NOTE: change stride 2 -> 1 for CIFAR10\n           (6,  32, 3, 2),\n           (6,  64, 4, 2),\n           (6,  96, 3, 1),\n           (6, 160, 3, 2),\n           (6, 320, 1, 1)]\n\n    def __init__(self, num_classes=10):\n        super(MobileNetV2, self).__init__()\n        # NOTE: change conv1 stride 2 -> 1 for CIFAR10\n        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(32)\n        self.layers = self._make_layers(in_planes=32)\n        self.conv2 = nn.Conv2d(320, 1280, kernel_size=1, stride=1, padding=0, bias=False)\n        self.bn2 = nn.BatchNorm2d(1280)\n        self.linear = nn.Linear(1280, num_classes)\n\n    def _make_layers(self, in_planes):\n        layers = []\n        for expansion, out_planes, num_blocks, stride in self.cfg:\n            strides = [stride] + [1]*(num_blocks-1)\n            for stride in strides:\n                layers.append(Block(in_planes, out_planes, expansion, stride))\n                in_planes = out_planes\n        return nn.Sequential(*layers)\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = self.layers(out)\n        out = F.relu(self.bn2(self.conv2(out)))\n        # NOTE: change pooling kernel_size 7 -> 4 for CIFAR10\n        out = F.avg_pool2d(out, 4)\n        out = out.view(out.size(0), -1)\n        out = self.linear(out)\n        return out\n\n\ndef test():\n    net = MobileNetV2()\n    x = torch.randn(2,3,32,32)\n    y = net(x)\n    print(y.size())\n\n# test()\n"
  },
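  {
    "path": "algorithm-GC/cifar/notes/mobilenetv2_bottleneck_sketch.py",
    "content": "# Editor's sketch -- hypothetical file, not part of the original repo.\n# The three convs inside Block for in_planes=32, out_planes=16, expansion=6; note\n# that bn3 has no ReLU after it (the 'linear bottleneck' of the paper's title).\nimport torch.nn as nn\n\nexpand = nn.Conv2d(32, 192, 1, bias=False)                      # 1x1 expand\ndw = nn.Conv2d(192, 192, 3, padding=1, groups=192, bias=False)  # 3x3 depthwise\nproject = nn.Conv2d(192, 16, 1, bias=False)                     # 1x1 project\nparams = sum(p.numel() for m in (expand, dw, project) for p in m.parameters())\nprint(params)  # 6144 + 1728 + 3072 = 10944\n"
  },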
  {
    "path": "algorithm-GC/cifar/models/pnasnet.py",
    "content": "'''PNASNet in PyTorch.\n\nPaper: Progressive Neural Architecture Search\n'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass SepConv(nn.Module):\n    '''Separable Convolution.'''\n    def __init__(self, in_planes, out_planes, kernel_size, stride):\n        super(SepConv, self).__init__()\n        self.conv1 = nn.Conv2d(in_planes, out_planes,\n                               kernel_size, stride,\n                               padding=(kernel_size-1)//2,\n                               bias=False, groups=in_planes)\n        self.bn1 = nn.BatchNorm2d(out_planes)\n\n    def forward(self, x):\n        return self.bn1(self.conv1(x))\n\n\nclass CellA(nn.Module):\n    def __init__(self, in_planes, out_planes, stride=1):\n        super(CellA, self).__init__()\n        self.stride = stride\n        self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride)\n        if stride==2:\n            self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)\n            self.bn1 = nn.BatchNorm2d(out_planes)\n\n    def forward(self, x):\n        y1 = self.sep_conv1(x)\n        y2 = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1)\n        if self.stride==2:\n            y2 = self.bn1(self.conv1(y2))\n        return F.relu(y1+y2)\n\nclass CellB(nn.Module):\n    def __init__(self, in_planes, out_planes, stride=1):\n        super(CellB, self).__init__()\n        self.stride = stride\n        # Left branch\n        self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride)\n        self.sep_conv2 = SepConv(in_planes, out_planes, kernel_size=3, stride=stride)\n        # Right branch\n        self.sep_conv3 = SepConv(in_planes, out_planes, kernel_size=5, stride=stride)\n        if stride==2:\n            self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)\n            self.bn1 = nn.BatchNorm2d(out_planes)\n        # Reduce channels\n        self.conv2 = nn.Conv2d(2*out_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)\n        self.bn2 = nn.BatchNorm2d(out_planes)\n\n    def forward(self, x):\n        # Left branch\n        y1 = self.sep_conv1(x)\n        y2 = self.sep_conv2(x)\n        # Right branch\n        y3 = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1)\n        if self.stride==2:\n            y3 = self.bn1(self.conv1(y3))\n        y4 = self.sep_conv3(x)\n        # Concat & reduce channels\n        b1 = F.relu(y1+y2)\n        b2 = F.relu(y3+y4)\n        y = torch.cat([b1,b2], 1)\n        return F.relu(self.bn2(self.conv2(y)))\n\nclass PNASNet(nn.Module):\n    def __init__(self, cell_type, num_cells, num_planes):\n        super(PNASNet, self).__init__()\n        self.in_planes = num_planes\n        self.cell_type = cell_type\n\n        self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, stride=1, padding=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(num_planes)\n\n        self.layer1 = self._make_layer(num_planes, num_cells=6)\n        self.layer2 = self._downsample(num_planes*2)\n        self.layer3 = self._make_layer(num_planes*2, num_cells=6)\n        self.layer4 = self._downsample(num_planes*4)\n        self.layer5 = self._make_layer(num_planes*4, num_cells=6)\n\n        self.linear = nn.Linear(num_planes*4, 10)\n\n    def _make_layer(self, planes, num_cells):\n        layers = []\n        for _ in range(num_cells):\n            layers.append(self.cell_type(self.in_planes, planes, 
stride=1))\n            self.in_planes = planes\n        return nn.Sequential(*layers)\n\n    def _downsample(self, planes):\n        layer = self.cell_type(self.in_planes, planes, stride=2)\n        self.in_planes = planes\n        return layer\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = self.layer1(out)\n        out = self.layer2(out)\n        out = self.layer3(out)\n        out = self.layer4(out)\n        out = self.layer5(out)\n        out = F.avg_pool2d(out, 8)\n        out = self.linear(out.view(out.size(0), -1))\n        return out\n\n\ndef PNASNetA():\n    return PNASNet(CellA, num_cells=6, num_planes=44)\n\ndef PNASNetB():\n    return PNASNet(CellB, num_cells=6, num_planes=32)\n\n\ndef test():\n    net = PNASNetB()\n    x = torch.randn(1,3,32,32)\n    y = net(x)\n    print(y)\n\n# test()\n"
  },
  {
    "path": "algorithm-GC/cifar/models/preact_resnet.py",
    "content": "'''Pre-activation ResNet in PyTorch.\n\nReference:\n[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun\n    Identity Mappings in Deep Residual Networks. arXiv:1603.05027\n'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass PreActBlock(nn.Module):\n    '''Pre-activation version of the BasicBlock.'''\n    expansion = 1\n\n    def __init__(self, in_planes, planes, stride=1):\n        super(PreActBlock, self).__init__()\n        self.bn1 = nn.BatchNorm2d(in_planes)\n        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)\n        self.bn2 = nn.BatchNorm2d(planes)\n        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)\n\n        if stride != 1 or in_planes != self.expansion*planes:\n            self.shortcut = nn.Sequential(\n                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False)\n            )\n\n    def forward(self, x):\n        out = F.relu(self.bn1(x))\n        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x\n        out = self.conv1(out)\n        out = self.conv2(F.relu(self.bn2(out)))\n        out += shortcut\n        return out\n\n\nclass PreActBottleneck(nn.Module):\n    '''Pre-activation version of the original Bottleneck module.'''\n    expansion = 4\n\n    def __init__(self, in_planes, planes, stride=1):\n        super(PreActBottleneck, self).__init__()\n        self.bn1 = nn.BatchNorm2d(in_planes)\n        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)\n        self.bn2 = nn.BatchNorm2d(planes)\n        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)\n        self.bn3 = nn.BatchNorm2d(planes)\n        self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False)\n\n        if stride != 1 or in_planes != self.expansion*planes:\n            self.shortcut = nn.Sequential(\n                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False)\n            )\n\n    def forward(self, x):\n        out = F.relu(self.bn1(x))\n        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x\n        out = self.conv1(out)\n        out = self.conv2(F.relu(self.bn2(out)))\n        out = self.conv3(F.relu(self.bn3(out)))\n        out += shortcut\n        return out\n\n\nclass PreActResNet(nn.Module):\n    def __init__(self, block, num_blocks, num_classes=10):\n        super(PreActResNet, self).__init__()\n        self.in_planes = 64\n\n        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)\n        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)\n        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)\n        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)\n        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)\n        self.linear = nn.Linear(512*block.expansion, num_classes)\n\n    def _make_layer(self, block, planes, num_blocks, stride):\n        strides = [stride] + [1]*(num_blocks-1)\n        layers = []\n        for stride in strides:\n            layers.append(block(self.in_planes, planes, stride))\n            self.in_planes = planes * block.expansion\n        return nn.Sequential(*layers)\n\n    def forward(self, x):\n        out = self.conv1(x)\n        out = self.layer1(out)\n        out = self.layer2(out)\n        out = self.layer3(out)\n        out 
= self.layer4(out)\n        out = F.avg_pool2d(out, 4)\n        out = out.view(out.size(0), -1)\n        out = self.linear(out)\n        return out\n\n\ndef PreActResNet18():\n    return PreActResNet(PreActBlock, [2,2,2,2])\n\ndef PreActResNet34():\n    return PreActResNet(PreActBlock, [3,4,6,3])\n\ndef PreActResNet50():\n    return PreActResNet(PreActBottleneck, [3,4,6,3])\n\ndef PreActResNet101():\n    return PreActResNet(PreActBottleneck, [3,4,23,3])\n\ndef PreActResNet152():\n    return PreActResNet(PreActBottleneck, [3,8,36,3])\n\n\ndef test():\n    net = PreActResNet18()\n    y = net((torch.randn(1,3,32,32)))\n    print(y.size())\n\n# test()\n"
  },
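  {
    "path": "algorithm-GC/cifar/notes/preact_ordering_sketch.py",
    "content": "# Editor's sketch -- hypothetical file, not part of the original repo.\n# The ordering difference against resnet.py, condensed to a single conv:\n# post-activation applies BN/ReLU after the conv and after the add, while\n# pre-activation applies them before the conv and leaves the identity path clean.\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nbn, conv = nn.BatchNorm2d(8), nn.Conv2d(8, 8, 3, padding=1, bias=False)\nx = torch.randn(1, 8, 4, 4)\npost = F.relu(bn(conv(x)) + x)   # resnet.py: conv -> BN -> add -> ReLU\npre = conv(F.relu(bn(x))) + x    # this file: BN -> ReLU -> conv -> add, no final ReLU\nprint(post.size(), pre.size())\n"
  },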
  {
    "path": "algorithm-GC/cifar/models/resnet.py",
    "content": "'''ResNet in PyTorch.\n\nFor Pre-activation ResNet, see 'preact_resnet.py'.\n\nReference:\n[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun\n    Deep Residual Learning for Image Recognition. arXiv:1512.03385\n'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass BasicBlock(nn.Module):\n    expansion = 1\n\n    def __init__(self, in_planes, planes, stride=1):\n        super(BasicBlock, self).__init__()\n        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(planes)\n        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)\n        self.bn2 = nn.BatchNorm2d(planes)\n\n        self.shortcut = nn.Sequential()\n        if stride != 1 or in_planes != self.expansion*planes:\n            self.shortcut = nn.Sequential(\n                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),\n                nn.BatchNorm2d(self.expansion*planes)\n            )\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = self.bn2(self.conv2(out))\n        out += self.shortcut(x)\n        out = F.relu(out)\n        return out\n\n\nclass Bottleneck(nn.Module):\n    expansion = 4\n\n    def __init__(self, in_planes, planes, stride=1):\n        super(Bottleneck, self).__init__()\n        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(planes)\n        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)\n        self.bn2 = nn.BatchNorm2d(planes)\n        self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False)\n        self.bn3 = nn.BatchNorm2d(self.expansion*planes)\n\n        self.shortcut = nn.Sequential()\n        if stride != 1 or in_planes != self.expansion*planes:\n            self.shortcut = nn.Sequential(\n                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),\n                nn.BatchNorm2d(self.expansion*planes)\n            )\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = F.relu(self.bn2(self.conv2(out)))\n        out = self.bn3(self.conv3(out))\n        out += self.shortcut(x)\n        out = F.relu(out)\n        return out\n\n\nclass ResNet(nn.Module):\n    def __init__(self, block, num_blocks, num_classes=10):\n        super(ResNet, self).__init__()\n        self.in_planes = 64\n\n        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(64)\n        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)\n        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)\n        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)\n        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)\n        self.linear = nn.Linear(512*block.expansion, num_classes)\n\n    def _make_layer(self, block, planes, num_blocks, stride):\n        strides = [stride] + [1]*(num_blocks-1)\n        layers = []\n        for stride in strides:\n            layers.append(block(self.in_planes, planes, stride))\n            self.in_planes = planes * block.expansion\n        return nn.Sequential(*layers)\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = self.layer1(out)\n        out = self.layer2(out)\n        out = 
self.layer3(out)\n        out = self.layer4(out)\n        out = F.avg_pool2d(out, 4)\n        out = out.view(out.size(0), -1)\n        out = self.linear(out)\n        return out\n\n\ndef ResNet18(Num_classes=10):\n    return ResNet(BasicBlock, [2,2,2,2],num_classes=Num_classes)\n\ndef ResNet34(Num_classes=10):\n    return ResNet(BasicBlock, [3,4,6,3],num_classes=Num_classes)\n\ndef ResNet50(Num_classes=10):\n    return ResNet(Bottleneck, [3,4,6,3],num_classes=Num_classes)\n\ndef ResNet101(Num_classes=10):\n    return ResNet(Bottleneck, [3,4,23,3],num_classes=Num_classes)\n\ndef ResNet152(Num_classes=10):\n    return ResNet(Bottleneck, [3,8,36,3],num_classes=Num_classes)\n\n\ndef test():\n    net = ResNet18()\n    y = net(torch.randn(1,3,32,32))\n    print(y.size())\n\n# test()\n"
  },
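  {
    "path": "algorithm-GC/cifar/notes/resnet_shortcut_sketch.py",
    "content": "# Editor's sketch -- hypothetical file, not part of the original repo.\n# The 1x1 projection shortcut used when stride != 1 or the channel count changes,\n# so that out += self.shortcut(x) stays shape-compatible with the main path.\nimport torch\nimport torch.nn as nn\n\nx = torch.randn(1, 64, 32, 32)\nproj = nn.Sequential(nn.Conv2d(64, 128, 1, stride=2, bias=False), nn.BatchNorm2d(128))\nprint(proj(x).size())  # torch.Size([1, 128, 16, 16]), matching a stride-2 block's output\n"
  },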
  {
    "path": "algorithm-GC/cifar/models/resnext.py",
    "content": "'''ResNeXt in PyTorch.\n\nSee the paper \"Aggregated Residual Transformations for Deep Neural Networks\" for more details.\n'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Block(nn.Module):\n    '''Grouped convolution block.'''\n    expansion = 2\n\n    def __init__(self, in_planes, cardinality=32, bottleneck_width=4, stride=1):\n        super(Block, self).__init__()\n        group_width = cardinality * bottleneck_width\n        self.conv1 = nn.Conv2d(in_planes, group_width, kernel_size=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(group_width)\n        self.conv2 = nn.Conv2d(group_width, group_width, kernel_size=3, stride=stride, padding=1, groups=cardinality, bias=False)\n        self.bn2 = nn.BatchNorm2d(group_width)\n        self.conv3 = nn.Conv2d(group_width, self.expansion*group_width, kernel_size=1, bias=False)\n        self.bn3 = nn.BatchNorm2d(self.expansion*group_width)\n\n        self.shortcut = nn.Sequential()\n        if stride != 1 or in_planes != self.expansion*group_width:\n            self.shortcut = nn.Sequential(\n                nn.Conv2d(in_planes, self.expansion*group_width, kernel_size=1, stride=stride, bias=False),\n                nn.BatchNorm2d(self.expansion*group_width)\n            )\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = F.relu(self.bn2(self.conv2(out)))\n        out = self.bn3(self.conv3(out))\n        out += self.shortcut(x)\n        out = F.relu(out)\n        return out\n\n\nclass ResNeXt(nn.Module):\n    def __init__(self, num_blocks, cardinality, bottleneck_width, num_classes=10):\n        super(ResNeXt, self).__init__()\n        self.cardinality = cardinality\n        self.bottleneck_width = bottleneck_width\n        self.in_planes = 64\n\n        self.conv1 = nn.Conv2d(3, 64, kernel_size=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(64)\n        self.layer1 = self._make_layer(num_blocks[0], 1)\n        self.layer2 = self._make_layer(num_blocks[1], 2)\n        self.layer3 = self._make_layer(num_blocks[2], 2)\n        # self.layer4 = self._make_layer(num_blocks[3], 2)\n        self.linear = nn.Linear(cardinality*bottleneck_width*8, num_classes)\n\n    def _make_layer(self, num_blocks, stride):\n        strides = [stride] + [1]*(num_blocks-1)\n        layers = []\n        for stride in strides:\n            layers.append(Block(self.in_planes, self.cardinality, self.bottleneck_width, stride))\n            self.in_planes = Block.expansion * self.cardinality * self.bottleneck_width\n        # Increase bottleneck_width by 2 after each stage.\n        self.bottleneck_width *= 2\n        return nn.Sequential(*layers)\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = self.layer1(out)\n        out = self.layer2(out)\n        out = self.layer3(out)\n        # out = self.layer4(out)\n        out = F.avg_pool2d(out, 8)\n        out = out.view(out.size(0), -1)\n        out = self.linear(out)\n        return out\n\n\ndef ResNeXt29_2x64d():\n    return ResNeXt(num_blocks=[3,3,3], cardinality=2, bottleneck_width=64)\n\ndef ResNeXt29_4x64d():\n    return ResNeXt(num_blocks=[3,3,3], cardinality=4, bottleneck_width=64)\n\ndef ResNeXt29_8x64d():\n    return ResNeXt(num_blocks=[3,3,3], cardinality=8, bottleneck_width=64)\n\ndef ResNeXt29_32x4d():\n    return ResNeXt(num_blocks=[3,3,3], cardinality=32, bottleneck_width=4)\n\ndef test_resnext():\n    net = ResNeXt29_2x64d()\n    x = torch.randn(1,3,32,32)\n    y = net(x)\n    
print(y.size())\n\n# test_resnext()\n"
  },
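  {
    "path": "algorithm-GC/cifar/notes/resnext_groups_note.py",
    "content": "# Editor's note -- hypothetical file, not part of the original repo.\n# Cardinality is implemented with grouped convolution: in ResNeXt29_32x4d's first\n# stage, group_width = 32 * 4 = 128, and the grouped 3x3 costs 1/groups of a dense one.\nimport torch.nn as nn\n\ngrouped = nn.Conv2d(128, 128, 3, padding=1, groups=32, bias=False)\ndense = nn.Conv2d(128, 128, 3, padding=1, bias=False)\nprint(sum(p.numel() for p in grouped.parameters()))  # 4608\nprint(sum(p.numel() for p in dense.parameters()))    # 147456\n"
  },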
  {
    "path": "algorithm-GC/cifar/models/senet.py",
    "content": "'''SENet in PyTorch.\n\nSENet is the winner of ImageNet-2017. The paper is not released yet.\n'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass BasicBlock(nn.Module):\n    def __init__(self, in_planes, planes, stride=1):\n        super(BasicBlock, self).__init__()\n        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(planes)\n        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)\n        self.bn2 = nn.BatchNorm2d(planes)\n\n        self.shortcut = nn.Sequential()\n        if stride != 1 or in_planes != planes:\n            self.shortcut = nn.Sequential(\n                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False),\n                nn.BatchNorm2d(planes)\n            )\n\n        # SE layers\n        self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1)  # Use nn.Conv2d instead of nn.Linear\n        self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1)\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = self.bn2(self.conv2(out))\n\n        # Squeeze\n        w = F.avg_pool2d(out, out.size(2))\n        w = F.relu(self.fc1(w))\n        w = F.sigmoid(self.fc2(w))\n        # Excitation\n        out = out * w  # New broadcasting feature from v0.2!\n\n        out += self.shortcut(x)\n        out = F.relu(out)\n        return out\n\n\nclass PreActBlock(nn.Module):\n    def __init__(self, in_planes, planes, stride=1):\n        super(PreActBlock, self).__init__()\n        self.bn1 = nn.BatchNorm2d(in_planes)\n        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)\n        self.bn2 = nn.BatchNorm2d(planes)\n        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)\n\n        if stride != 1 or in_planes != planes:\n            self.shortcut = nn.Sequential(\n                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False)\n            )\n\n        # SE layers\n        self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1)\n        self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1)\n\n    def forward(self, x):\n        out = F.relu(self.bn1(x))\n        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x\n        out = self.conv1(out)\n        out = self.conv2(F.relu(self.bn2(out)))\n\n        # Squeeze\n        w = F.avg_pool2d(out, out.size(2))\n        w = F.relu(self.fc1(w))\n        w = F.sigmoid(self.fc2(w))\n        # Excitation\n        out = out * w\n\n        out += shortcut\n        return out\n\n\nclass SENet(nn.Module):\n    def __init__(self, block, num_blocks, num_classes=10):\n        super(SENet, self).__init__()\n        self.in_planes = 64\n\n        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(64)\n        self.layer1 = self._make_layer(block,  64, num_blocks[0], stride=1)\n        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)\n        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)\n        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)\n        self.linear = nn.Linear(512, num_classes)\n\n    def _make_layer(self, block, planes, num_blocks, stride):\n        strides = [stride] + [1]*(num_blocks-1)\n        layers = []\n        for stride in strides:\n            
layers.append(block(self.in_planes, planes, stride))\n            self.in_planes = planes\n        return nn.Sequential(*layers)\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = self.layer1(out)\n        out = self.layer2(out)\n        out = self.layer3(out)\n        out = self.layer4(out)\n        out = F.avg_pool2d(out, 4)\n        out = out.view(out.size(0), -1)\n        out = self.linear(out)\n        return out\n\n\ndef SENet18():\n    return SENet(PreActBlock, [2,2,2,2])\n\n\ndef test():\n    net = SENet18()\n    y = net(torch.randn(1,3,32,32))\n    print(y.size())\n\n# test()\n"
  },
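  {
    "path": "algorithm-GC/cifar/notes/senet_gate_sketch.py",
    "content": "# Editor's sketch -- hypothetical file, not part of the original repo.\n# The squeeze/excite reweighting: a (N, C, 1, 1) gate broadcast over H and W;\n# fc1/fc2 in senet.py form a C -> C/16 -> C bottleneck before this gating.\nimport torch\n\nout = torch.randn(2, 64, 8, 8)               # block activations\nw = torch.sigmoid(torch.randn(2, 64, 1, 1))  # per-channel gate in (0, 1)\nprint((out * w).size())                      # torch.Size([2, 64, 8, 8])\n"
  },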
  {
    "path": "algorithm-GC/cifar/models/shufflenet.py",
    "content": "'''ShuffleNet in PyTorch.\n\nSee the paper \"ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices\" for more details.\n'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass ShuffleBlock(nn.Module):\n    def __init__(self, groups):\n        super(ShuffleBlock, self).__init__()\n        self.groups = groups\n\n    def forward(self, x):\n        '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]'''\n        N,C,H,W = x.size()\n        g = self.groups\n        return x.view(N,g,C/g,H,W).permute(0,2,1,3,4).contiguous().view(N,C,H,W)\n\n\nclass Bottleneck(nn.Module):\n    def __init__(self, in_planes, out_planes, stride, groups):\n        super(Bottleneck, self).__init__()\n        self.stride = stride\n\n        mid_planes = out_planes/4\n        g = 1 if in_planes==24 else groups\n        self.conv1 = nn.Conv2d(in_planes, mid_planes, kernel_size=1, groups=g, bias=False)\n        self.bn1 = nn.BatchNorm2d(mid_planes)\n        self.shuffle1 = ShuffleBlock(groups=g)\n        self.conv2 = nn.Conv2d(mid_planes, mid_planes, kernel_size=3, stride=stride, padding=1, groups=mid_planes, bias=False)\n        self.bn2 = nn.BatchNorm2d(mid_planes)\n        self.conv3 = nn.Conv2d(mid_planes, out_planes, kernel_size=1, groups=groups, bias=False)\n        self.bn3 = nn.BatchNorm2d(out_planes)\n\n        self.shortcut = nn.Sequential()\n        if stride == 2:\n            self.shortcut = nn.Sequential(nn.AvgPool2d(3, stride=2, padding=1))\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = self.shuffle1(out)\n        out = F.relu(self.bn2(self.conv2(out)))\n        out = self.bn3(self.conv3(out))\n        res = self.shortcut(x)\n        out = F.relu(torch.cat([out,res], 1)) if self.stride==2 else F.relu(out+res)\n        return out\n\n\nclass ShuffleNet(nn.Module):\n    def __init__(self, cfg):\n        super(ShuffleNet, self).__init__()\n        out_planes = cfg['out_planes']\n        num_blocks = cfg['num_blocks']\n        groups = cfg['groups']\n\n        self.conv1 = nn.Conv2d(3, 24, kernel_size=1, bias=False)\n        self.bn1 = nn.BatchNorm2d(24)\n        self.in_planes = 24\n        self.layer1 = self._make_layer(out_planes[0], num_blocks[0], groups)\n        self.layer2 = self._make_layer(out_planes[1], num_blocks[1], groups)\n        self.layer3 = self._make_layer(out_planes[2], num_blocks[2], groups)\n        self.linear = nn.Linear(out_planes[2], 10)\n\n    def _make_layer(self, out_planes, num_blocks, groups):\n        layers = []\n        for i in range(num_blocks):\n            stride = 2 if i == 0 else 1\n            cat_planes = self.in_planes if i == 0 else 0\n            layers.append(Bottleneck(self.in_planes, out_planes-cat_planes, stride=stride, groups=groups))\n            self.in_planes = out_planes\n        return nn.Sequential(*layers)\n\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = self.layer1(out)\n        out = self.layer2(out)\n        out = self.layer3(out)\n        out = F.avg_pool2d(out, 4)\n        out = out.view(out.size(0), -1)\n        out = self.linear(out)\n        return out\n\n\ndef ShuffleNetG2():\n    cfg = {\n        'out_planes': [200,400,800],\n        'num_blocks': [4,8,4],\n        'groups': 2\n    }\n    return ShuffleNet(cfg)\n\ndef ShuffleNetG3():\n    cfg = {\n        'out_planes': [240,480,960],\n        'num_blocks': [4,8,4],\n        'groups': 3\n    }\n    return 
ShuffleNet(cfg)\n\n\ndef test():\n    net = ShuffleNetG2()\n    x = torch.randn(1,3,32,32)\n    y = net(x)\n    print(y)\n\n# test()\n"
  },
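  {
    "path": "algorithm-GC/cifar/notes/shufflenet_shuffle_sketch.py",
    "content": "# Editor's sketch -- hypothetical file, not part of the original repo.\n# ShuffleBlock's view/permute/view on a tiny tensor, so the interleaving is visible.\nimport torch\n\nx = torch.arange(8).view(1, 8, 1, 1)  # channels 0..7; with g=2 the groups are [0..3], [4..7]\nN, C, H, W = x.size()\ng = 2\ny = x.view(N, g, C // g, H, W).permute(0, 2, 1, 3, 4).contiguous().view(N, C, H, W)\nprint(y.flatten().tolist())           # [0, 4, 1, 5, 2, 6, 3, 7]\n"
  },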
  {
    "path": "algorithm-GC/cifar/models/vgg.py",
    "content": "'''VGG11/13/16/19 in Pytorch.'''\nimport torch\nimport torch.nn as nn\n\n\ncfg = {\n    'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],\n    'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],\n    'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],\n    'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],\n}\n\n\nclass VGG(nn.Module):\n    def __init__(self, vgg_name,Num_classes=100):\n        super(VGG, self).__init__()\n        self.features = self._make_layers(cfg[vgg_name])\n        self.classifier = nn.Linear(512, Num_classes)\n\n    def forward(self, x):\n        out = self.features(x)\n        out = out.view(out.size(0), -1)\n        out = self.classifier(out)\n        return out\n\n    def _make_layers(self, cfg):\n        layers = []\n        in_channels = 3\n        for x in cfg:\n            if x == 'M':\n                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]\n            else:\n                layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1),\n                           nn.BatchNorm2d(x),\n                           nn.ReLU(inplace=True)]\n                in_channels = x\n        layers += [nn.AvgPool2d(kernel_size=1, stride=1)]\n        return nn.Sequential(*layers)\n\n\ndef test():\n    net = VGG('VGG11')\n    x = torch.randn(2,3,32,32)\n    y = net(x)\n    print(y.size())\n\n# test()\n"
  },
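  {
    "path": "algorithm-GC/cifar/notes/vgg_cfg_note.py",
    "content": "# Editor's note -- hypothetical file, not part of the original repo.\n# The cfg lists encode conv widths, with 'M' marking a 2x2 max-pool. VGG16's name\n# counts 13 convs + 3 FC layers in the original paper; this implementation keeps\n# the 13 convs and replaces the FC stack with a single Linear head.\ncfg16 = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M']\nprint(sum(1 for v in cfg16 if v != 'M'))  # 13 conv layers\n"
  },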
  {
    "path": "algorithm-GC/cifar/nohup.out",
    "content": "Traceback (most recent call last):\n  File \"main.py\", line 281, in <module>\n    train_acc=train(epoch,net,optimizer)\n  File \"main.py\", line 227, in train\n    outputs = net(inputs)\n  File \"/home/yonghw/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py\", line 550, in __call__\n    result = self.forward(*input, **kwargs)\n  File \"/home/yonghw/anaconda3/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py\", line 153, in forward\n    return self.module(*inputs[0], **kwargs[0])\n  File \"/home/yonghw/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py\", line 550, in __call__\n    result = self.forward(*input, **kwargs)\n  File \"/home/yonghw/mycode/Opt_GC/cifar/models/resnet.py\", line 90, in forward\n    out = self.layer1(out)\n  File \"/home/yonghw/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py\", line 550, in __call__\n    result = self.forward(*input, **kwargs)\n  File \"/home/yonghw/anaconda3/lib/python3.7/site-packages/torch/nn/modules/container.py\", line 100, in forward\n    input = module(input)\n  File \"/home/yonghw/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py\", line 550, in __call__\n    result = self.forward(*input, **kwargs)\n  File \"/home/yonghw/mycode/Opt_GC/cifar/models/resnet.py\", line 61, in forward\n    out = self.bn3(self.conv3(out))\n  File \"/home/yonghw/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py\", line 550, in __call__\n    result = self.forward(*input, **kwargs)\n  File \"/home/yonghw/anaconda3/lib/python3.7/site-packages/torch/nn/modules/conv.py\", line 349, in forward\n    return self._conv_forward(input, self.weight)\n  File \"/home/yonghw/anaconda3/lib/python3.7/site-packages/torch/nn/modules/conv.py\", line 346, in _conv_forward\n    self.padding, self.dilation, self.groups)\nRuntimeError: CUDA out of memory. Tried to allocate 128.00 MiB (GPU 0; 15.90 GiB total capacity; 1.06 GiB already allocated; 31.38 MiB free; 1.23 GiB reserved in total by PyTorch)\nTerminated\n"
  },
  {
    "path": "algorithm-GC/cifar/os_run.py",
    "content": "#cifar100 e200 bs128  gs  2,4,8,16\nimport os,time\n#############################\n#r18\n##############\n\n#### sgd \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_sgd_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_sgd_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_sgd_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_sgd_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_sgd_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_sgd_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_sgd_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_sgd_8.log \")\n#time.sleep(500)\n#\n#### sgdGC \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_sgdGC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_sgdGC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_sgdGC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_sgdGC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_sgdGC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_sgdGC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_sgdGC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_sgdGC_8.log \")\n#time.sleep(500)\n#\n#### sgdGCC \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_sgdGCC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_sgdGCC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_sgdGCC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_sgdGCC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_sgdGCC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_sgdGCC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r18 --gpug 16 > 
logout/r18_lr11_wd45_sgdGCC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_sgdGCC_8.log \")\n#time.sleep(500)\n#\n###############\n###############\n#\n#### adam \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_adam_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_adam_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_adam_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_adam_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_adam_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_adam_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_adam_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_adam_8.log \")\n#\n#time.sleep(500)\n#### adamGC \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_adamGC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_adamGC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_adamGC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_adamGC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_adamGC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_adamGC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_adamGC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_adamGC_8.log \")\n#time.sleep(500)\n#\n#### adamGCC \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_adamGCC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_adamGCC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_adamGCC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_adamGCC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_adamGCC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r18 
--gpug 15 > logout/r18_lr11_wd45_adamGCC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_adamGCC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_adamGCC_8.log \")\n#time.sleep(500)\n#\n###############\n###############\n#\n#### adamW \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_adamW_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_adamW_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_adamW_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_adamW_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_adamW_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_adamW_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_adamW_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_adamW_8.log \")\n#\n#time.sleep(500)\n#### adamWGC \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_adamWGC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_adamWGC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_adamWGC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_adamWGC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_adamWGC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_adamWGC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_adamWGC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_adamWGC_8.log \")\n#time.sleep(500)\n#\n#### adamWGCC \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_adamWGCC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_adamWGCC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_adamWGCC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_adamWGCC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 
0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_adamWGCC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_adamWGCC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_adamWGCC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_adamWGCC_8.log \")\n#time.sleep(500)\n#\n###############\n###############\n#\n#### radam \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_radam_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_radam_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_radam_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_radam_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_radam_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_radam_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_radam_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_radam_8.log \")\n#\n#time.sleep(500)\n#### radamGC \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_radamGC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_radamGC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_radamGC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_radamGC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_radamGC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_radamGC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_radamGC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_radamGC_8.log \")\n#time.sleep(500)\n#\n#### radamGCC \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_radamGCC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_radamGCC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r18 --gpug 12 > 
logout/r18_lr11_wd45_radamGCC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_radamGCC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_radamGCC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_radamGCC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_radamGCC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_radamGCC_8.log \")\n#time.sleep(500)\n#\n###############\n###############\n#\n#### Lsgd \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_Lsgd_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_Lsgd_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_Lsgd_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_Lsgd_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_Lsgd_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_Lsgd_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_Lsgd_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_Lsgd_8.log \")\n#time.sleep(500)\n#\n#### LsgdGC \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_LsgdGC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_LsgdGC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_LsgdGC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_LsgdGC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_LsgdGC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_LsgdGC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_LsgdGC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_LsgdGC_8.log \")\n#time.sleep(500)\n#\n#### LsgdGCC \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_LsgdGCC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 
200  --model r18 --gpug 11 > logout/r18_lr11_wd45_LsgdGCC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_LsgdGCC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_LsgdGCC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_LsgdGCC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_LsgdGCC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_LsgdGCC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_LsgdGCC_8.log \")\n#time.sleep(500)\n#\n###############\n###############\n#\n#### Ladam \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_Ladam_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_Ladam_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_Ladam_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_Ladam_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_Ladam_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_Ladam_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_Ladam_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_Ladam_8.log \")\n#\n#time.sleep(500)\n#### LadamGC \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_LadamGC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_LadamGC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_LadamGC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_LadamGC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_LadamGC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_LadamGC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_LadamGC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_LadamGC_8.log \")\n#time.sleep(500)\n#\n#### LadamGCC \n#os.system(\"nohup  python  
main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_LadamGCC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_LadamGCC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_LadamGCC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_LadamGCC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_LadamGCC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_LadamGCC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_LadamGCC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_LadamGCC_8.log \")\n#time.sleep(500)\n#\n###############\n###############\n#\n#### ranger\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger  --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_ranger_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_ranger_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_ranger_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_ranger_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_ranger_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_ranger_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_ranger_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_ranger_8.log \")\n#\n#time.sleep(500)\n#### rangerGC \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_rangerGC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_rangerGC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_rangerGC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_rangerGC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_rangerGC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_rangerGC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r18 --gpug 16 > 
logout/r18_lr11_wd45_rangerGC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_rangerGC_8.log \")\n#time.sleep(500)\n#\n#### rangerGCC \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r18 --gpug 10 > logout/r18_lr11_wd45_rangerGCC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r18 --gpug 11 > logout/r18_lr11_wd45_rangerGCC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r18 --gpug 12 > logout/r18_lr11_wd45_rangerGCC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r18 --gpug 13 > logout/r18_lr11_wd45_rangerGCC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r18 --gpug 14 > logout/r18_lr11_wd45_rangerGCC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r18 --gpug 15 > logout/r18_lr11_wd45_rangerGCC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r18 --gpug 16 > logout/r18_lr11_wd45_rangerGCC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r18 --gpug 17 > logout/r18_lr11_wd45_rangerGCC_8.log \")\n#time.sleep(500)\n#\n###############\n#\n##r50\n###############\n#\n#### sgd \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_sgd_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_sgd_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_sgd_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_sgd_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_sgd_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_sgd_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_sgd_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgd   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_sgd_8.log \")\n#time.sleep(500)\n#\n#### sgdGC \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_sgdGC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_sgdGC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_sgdGC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_sgdGC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_sgdGC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   
--epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_sgdGC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_sgdGC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_sgdGC_8.log \")\n#time.sleep(500)\n#\n#### sgdGCC \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_sgdGCC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_sgdGCC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_sgdGCC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_sgdGCC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_sgdGCC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_sgdGCC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_sgdGCC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg sgdGCC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_sgdGCC_8.log \")\n#time.sleep(500)\n#\n###############\n###############\n#\n#### adam \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_adam_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_adam_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_adam_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_adam_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_adam_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_adam_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_adam_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_adam_8.log \")\n#\n#time.sleep(500)\n#### adamGC \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_adamGC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_adamGC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_adamGC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_adamGC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   
--epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_adamGC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_adamGC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_adamGC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_adamGC_8.log \")\n#time.sleep(500)\n#\n#### adamGCC \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_adamGCC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_adamGCC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_adamGCC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_adamGCC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_adamGCC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_adamGCC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_adamGCC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_adamGCC_8.log \")\n#time.sleep(500)\n#\n###############\n###############\n#\n#### adamW \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_adamW_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_adamW_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_adamW_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_adamW_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_adamW_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_adamW_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_adamW_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_adamW_8.log \")\n#\n#time.sleep(500)\n#### adamWGC \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_adamWGC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_adamWGC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_adamWGC_3.log &\")\n#os.system(\"nohup  python  
main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_adamWGC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_adamWGC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_adamWGC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_adamWGC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_adamWGC_8.log \")\n#time.sleep(500)\n\n### adamWGCC \nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_adamWGCC_1.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_adamWGCC_2.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_adamWGCC_3.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_adamWGCC_4.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_adamWGCC_5.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_adamWGCC_6.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_adamWGCC_7.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_adamWGCC_8.log \")\ntime.sleep(500)\n\n##############\n##############\n\n### radam \nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_radam_1.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_radam_2.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_radam_3.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_radam_4.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_radam_5.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_radam_6.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_radam_7.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_radam_8.log \")\n\ntime.sleep(500)\n### radamGC \nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_radamGC_1.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r50 --gpug 11 > 
logout/r50_lr11_wd45_radamGC_2.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_radamGC_3.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_radamGC_4.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_radamGC_5.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_radamGC_6.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_radamGC_7.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_radamGC_8.log \")\ntime.sleep(500)\n\n### radamGCC \nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_radamGCC_1.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_radamGCC_2.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_radamGCC_3.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_radamGCC_4.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_radamGCC_5.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_radamGCC_6.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_radamGCC_7.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_radamGCC_8.log \")\ntime.sleep(500)\n\n##############\n##############\n\n### Lsgd \nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_Lsgd_1.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_Lsgd_2.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_Lsgd_3.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_Lsgd_4.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_Lsgd_5.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_Lsgd_6.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_Lsgd_7.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_Lsgd_8.log \")\ntime.sleep(500)\n\n### LsgdGC \nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r50 
--gpug 10 > logout/r50_lr11_wd45_LsgdGC_1.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_LsgdGC_2.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_LsgdGC_3.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_LsgdGC_4.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_LsgdGC_5.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_LsgdGC_6.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_LsgdGC_7.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_LsgdGC_8.log \")\ntime.sleep(500)\n\n### LsgdGCC \nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_LsgdGCC_1.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_LsgdGCC_2.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_LsgdGCC_3.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_LsgdGCC_4.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_LsgdGCC_5.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_LsgdGCC_6.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_LsgdGCC_7.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_LsgdGCC_8.log \")\ntime.sleep(500)\n\n##############\n##############\n\n### Ladam \nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_Ladam_1.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_Ladam_2.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_Ladam_3.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_Ladam_4.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_Ladam_5.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_Ladam_6.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_Ladam_7.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r50 --gpug 17 > 
logout/r50_lr11_wd45_Ladam_8.log \")\n\ntime.sleep(500)\n### LadamGC \nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_LadamGC_1.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_LadamGC_2.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_LadamGC_3.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_LadamGC_4.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_LadamGC_5.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_LadamGC_6.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_LadamGC_7.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_LadamGC_8.log \")\ntime.sleep(500)\n\n### LadamGCC \nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_LadamGCC_1.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_LadamGCC_2.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_LadamGCC_3.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_LadamGCC_4.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_LadamGCC_5.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_LadamGCC_6.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_LadamGCC_7.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_LadamGCC_8.log \")\ntime.sleep(500)\n\n##############\n##############\n\n### ranger\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_ranger_1.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_ranger_2.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_ranger_3.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_ranger_4.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_ranger_5.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_ranger_6.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 
--alg ranger   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_ranger_7.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_ranger_8.log \")\n\ntime.sleep(500)\n### rangerGC \nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_rangerGC_1.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_rangerGC_2.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_rangerGC_3.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_rangerGC_4.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_rangerGC_5.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_rangerGC_6.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_rangerGC_7.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_rangerGC_8.log \")\ntime.sleep(500)\n\n### rangerGCC \nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_rangerGCC_1.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_rangerGCC_2.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_rangerGCC_3.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_rangerGCC_4.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_rangerGCC_5.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_rangerGCC_6.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_rangerGCC_7.log &\")\nos.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_rangerGCC_8.log \")\ntime.sleep(500)\n\n##############\n\n\n"
  },
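The launcher scripts in this dump expand one shell template per (algorithm, GPU, run-index) combination. Below is a minimal loop-based sketch of the same launch pattern, assuming only the main.py CLI used above; run_grid, its defaults, and the fixed "lr11_wd45" log tag (the lr 0.1 / wd 0.0005 encoding these runs use) are illustrative names, not part of the repository:

import os
import time

def run_grid(algs, model="r50", lr=0.1, wd=0.0005, epochs=200,
             gpus=(10, 11, 12, 13, 14, 15, 16, 17),
             logdir="logout", tag="lr11_wd45", pause=500):
    # One background run per GPU for each algorithm; the last run of each
    # algorithm omits '&' so the script blocks on it before moving on,
    # mirroring the nohup/os.system pattern of the scripts above.
    for alg in algs:
        for i, gpu in enumerate(gpus, start=1):
            bg = "&" if i < len(gpus) else ""
            log = f"{logdir}/{model}_{tag}_{alg}_{i}.log"
            os.system(f"nohup python main.py --lr {lr} --wd {wd} --alg {alg} "
                      f"--epochs {epochs} --model {model} --gpug {gpu} > {log} {bg}")
        time.sleep(pause)

# e.g. the active r50 RAdam family from the script above:
# run_grid(["radam", "radamGC", "radamGCC"])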
  {
    "path": "algorithm-GC/cifar/os_run2.py",
    "content": "#cifar100 e200 bs128  gs  2,4,8,16\nimport os,time\n\n\n#r50\n##############\n\n\n### adam \nos.system(\"nohup  python  main.py --lr 0.01 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 10 > logout2/r50_lr21_wd45_adam_1.log &\")\nos.system(\"nohup  python  main.py --lr 0.01 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 11 > logout2/r50_lr21_wd45_adam_2.log &\")\nos.system(\"nohup  python  main.py --lr 0.01 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 12 > logout2/r50_lr21_wd45_adam_3.log &\")\nos.system(\"nohup  python  main.py --lr 0.01 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 13 > logout2/r50_lr21_wd45_adam_4.log &\")\nos.system(\"nohup  python  main.py --lr 0.01 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 14 > logout2/r50_lr21_wd45_adam_5.log &\")\nos.system(\"nohup  python  main.py --lr 0.01 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 15 > logout2/r50_lr21_wd45_adam_6.log &\")\nos.system(\"nohup  python  main.py --lr 0.01 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 16 > logout2/r50_lr21_wd45_adam_7.log &\")\nos.system(\"nohup  python  main.py --lr 0.01 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 17 > logout2/r50_lr21_wd45_adam_8.log \")\n\ntime.sleep(500)\n### adamGC \nos.system(\"nohup  python  main.py --lr 0.01 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 10 > logout2/r50_lr21_wd45_adamGC_1.log &\")\nos.system(\"nohup  python  main.py --lr 0.01 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 11 > logout2/r50_lr21_wd45_adamGC_2.log &\")\nos.system(\"nohup  python  main.py --lr 0.01 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 12 > logout2/r50_lr21_wd45_adamGC_3.log &\")\nos.system(\"nohup  python  main.py --lr 0.01 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 13 > logout2/r50_lr21_wd45_adamGC_4.log &\")\nos.system(\"nohup  python  main.py --lr 0.01 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 14 > logout2/r50_lr21_wd45_adamGC_5.log &\")\nos.system(\"nohup  python  main.py --lr 0.01 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 15 > logout2/r50_lr21_wd45_adamGC_6.log &\")\nos.system(\"nohup  python  main.py --lr 0.01 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 16 > logout2/r50_lr21_wd45_adamGC_7.log &\")\nos.system(\"nohup  python  main.py --lr 0.01 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 17 > logout2/r50_lr21_wd45_adamGC_8.log \")\ntime.sleep(500)\n\n### adamGCC \nos.system(\"nohup  python  main.py --lr 0.01 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 10 > logout2/r50_lr21_wd45_adamGCC_1.log &\")\nos.system(\"nohup  python  main.py --lr 0.01 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 11 > logout2/r50_lr21_wd45_adamGCC_2.log &\")\nos.system(\"nohup  python  main.py --lr 0.01 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 12 > logout2/r50_lr21_wd45_adamGCC_3.log &\")\nos.system(\"nohup  python  main.py --lr 0.01 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 13 > logout2/r50_lr21_wd45_adamGCC_4.log &\")\nos.system(\"nohup  python  main.py --lr 0.01 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 14 > logout2/r50_lr21_wd45_adamGCC_5.log &\")\nos.system(\"nohup  python  main.py --lr 0.01 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 15 > logout2/r50_lr21_wd45_adamGCC_6.log &\")\nos.system(\"nohup  python  main.py --lr 0.01 --wd 0.0005 --alg adamGCC   
--epochs 200  --model r50 --gpug 16 > logout2/r50_lr21_wd45_adamGCC_7.log &\")\nos.system(\"nohup  python  main.py --lr 0.01 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 17 > logout2/r50_lr21_wd45_adamGCC_8.log \")\ntime.sleep(500)\n\n##############\nos.system(\"nohup  python  main.py --lr 0.05 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 10 > logout2/r50_lr25_wd45_adam_1.log &\")\nos.system(\"nohup  python  main.py --lr 0.05 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 11 > logout2/r50_lr25_wd45_adam_2.log &\")\nos.system(\"nohup  python  main.py --lr 0.05 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 12 > logout2/r50_lr25_wd45_adam_3.log &\")\nos.system(\"nohup  python  main.py --lr 0.05 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 13 > logout2/r50_lr25_wd45_adam_4.log &\")\nos.system(\"nohup  python  main.py --lr 0.05 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 14 > logout2/r50_lr25_wd45_adam_5.log &\")\nos.system(\"nohup  python  main.py --lr 0.05 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 15 > logout2/r50_lr25_wd45_adam_6.log &\")\nos.system(\"nohup  python  main.py --lr 0.05 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 16 > logout2/r50_lr25_wd45_adam_7.log &\")\nos.system(\"nohup  python  main.py --lr 0.05 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 17 > logout2/r50_lr25_wd45_adam_8.log \")\n\ntime.sleep(500)\n### adamGC \nos.system(\"nohup  python  main.py --lr 0.05 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 10 > logout2/r50_lr25_wd45_adamGC_1.log &\")\nos.system(\"nohup  python  main.py --lr 0.05 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 11 > logout2/r50_lr25_wd45_adamGC_2.log &\")\nos.system(\"nohup  python  main.py --lr 0.05 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 12 > logout2/r50_lr25_wd45_adamGC_3.log &\")\nos.system(\"nohup  python  main.py --lr 0.05 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 13 > logout2/r50_lr25_wd45_adamGC_4.log &\")\nos.system(\"nohup  python  main.py --lr 0.05 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 14 > logout2/r50_lr25_wd45_adamGC_5.log &\")\nos.system(\"nohup  python  main.py --lr 0.05 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 15 > logout2/r50_lr25_wd45_adamGC_6.log &\")\nos.system(\"nohup  python  main.py --lr 0.05 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 16 > logout2/r50_lr25_wd45_adamGC_7.log &\")\nos.system(\"nohup  python  main.py --lr 0.05 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 17 > logout2/r50_lr25_wd45_adamGC_8.log \")\ntime.sleep(500)\n\n### adamGCC \nos.system(\"nohup  python  main.py --lr 0.05 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 10 > logout2/r50_lr25_wd45_adamGCC_1.log &\")\nos.system(\"nohup  python  main.py --lr 0.05 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 11 > logout2/r50_lr25_wd45_adamGCC_2.log &\")\nos.system(\"nohup  python  main.py --lr 0.05 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 12 > logout2/r50_lr25_wd45_adamGCC_3.log &\")\nos.system(\"nohup  python  main.py --lr 0.05 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 13 > logout2/r50_lr25_wd45_adamGCC_4.log &\")\nos.system(\"nohup  python  main.py --lr 0.05 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 14 > logout2/r50_lr25_wd45_adamGCC_5.log &\")\nos.system(\"nohup  python  main.py --lr 0.05 --wd 0.0005 --alg adamGCC   
--epochs 200  --model r50 --gpug 15 > logout2/r50_lr25_wd45_adamGCC_6.log &\")\nos.system(\"nohup  python  main.py --lr 0.05 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 16 > logout2/r50_lr25_wd45_adamGCC_7.log &\")\nos.system(\"nohup  python  main.py --lr 0.05 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 17 > logout2/r50_lr25_wd45_adamGCC_8.log \")\ntime.sleep(500)\n\n\n\n##############\nos.system(\"nohup  python  main.py --lr 0.15 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 10 > logout2/r50_lr115_wd45_adam_1.log &\")\nos.system(\"nohup  python  main.py --lr 0.15 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 11 > logout2/r50_lr115_wd45_adam_2.log &\")\nos.system(\"nohup  python  main.py --lr 0.15 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 12 > logout2/r50_lr115_wd45_adam_3.log &\")\nos.system(\"nohup  python  main.py --lr 0.15 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 13 > logout2/r50_lr115_wd45_adam_4.log &\")\nos.system(\"nohup  python  main.py --lr 0.15 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 14 > logout2/r50_lr115_wd45_adam_5.log &\")\nos.system(\"nohup  python  main.py --lr 0.15 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 15 > logout2/r50_lr115_wd45_adam_6.log &\")\nos.system(\"nohup  python  main.py --lr 0.15 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 16 > logout2/r50_lr115_wd45_adam_7.log &\")\nos.system(\"nohup  python  main.py --lr 0.15 --wd 0.0005 --alg adam   --epochs 200  --model r50 --gpug 17 > logout2/r50_lr115_wd45_adam_8.log \")\n\ntime.sleep(500)\n### adamGC \nos.system(\"nohup  python  main.py --lr 0.15 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 10 > logout2/r50_lr115_wd45_adamGC_1.log &\")\nos.system(\"nohup  python  main.py --lr 0.15 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 11 > logout2/r50_lr115_wd45_adamGC_2.log &\")\nos.system(\"nohup  python  main.py --lr 0.15 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 12 > logout2/r50_lr115_wd45_adamGC_3.log &\")\nos.system(\"nohup  python  main.py --lr 0.15 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 13 > logout2/r50_lr115_wd45_adamGC_4.log &\")\nos.system(\"nohup  python  main.py --lr 0.15 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 14 > logout2/r50_lr115_wd45_adamGC_5.log &\")\nos.system(\"nohup  python  main.py --lr 0.15 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 15 > logout2/r50_lr115_wd45_adamGC_6.log &\")\nos.system(\"nohup  python  main.py --lr 0.15 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 16 > logout2/r50_lr115_wd45_adamGC_7.log &\")\nos.system(\"nohup  python  main.py --lr 0.15 --wd 0.0005 --alg adamGC   --epochs 200  --model r50 --gpug 17 > logout2/r50_lr115_wd45_adamGC_8.log \")\ntime.sleep(500)\n\n### adamGCC \nos.system(\"nohup  python  main.py --lr 0.15 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 10 > logout2/r50_lr115_wd45_adamGCC_1.log &\")\nos.system(\"nohup  python  main.py --lr 0.15 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 11 > logout2/r50_lr115_wd45_adamGCC_2.log &\")\nos.system(\"nohup  python  main.py --lr 0.15 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 12 > logout2/r50_lr115_wd45_adamGCC_3.log &\")\nos.system(\"nohup  python  main.py --lr 0.15 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 13 > logout2/r50_lr115_wd45_adamGCC_4.log &\")\nos.system(\"nohup  python  main.py --lr 0.15 --wd 
0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 14 > logout2/r50_lr115_wd45_adamGCC_5.log &\")\nos.system(\"nohup  python  main.py --lr 0.15 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 15 > logout2/r50_lr115_wd45_adamGCC_6.log &\")\nos.system(\"nohup  python  main.py --lr 0.15 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 16 > logout2/r50_lr115_wd45_adamGCC_7.log &\")\nos.system(\"nohup  python  main.py --lr 0.15 --wd 0.0005 --alg adamGCC   --epochs 200  --model r50 --gpug 17 > logout2/r50_lr115_wd45_adamGCC_8.log \")\ntime.sleep(500)\n\n\n\n\n#\n###############\n###############\n#\n#### adamW \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_adamW_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_adamW_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_adamW_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_adamW_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_adamW_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_adamW_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_adamW_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamW   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_adamW_8.log \")\n#\n#time.sleep(500)\n#### adamWGC \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_adamWGC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_adamWGC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_adamWGC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_adamWGC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_adamWGC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_adamWGC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_adamWGC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_adamWGC_8.log \")\n#time.sleep(500)\n\n### adamWGCC \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_adamWGCC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_adamWGCC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r50 --gpug 12 > 
logout/r50_lr11_wd45_adamWGCC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_adamWGCC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_adamWGCC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_adamWGCC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_adamWGCC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg adamWGCC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_adamWGCC_8.log \")\n#time.sleep(500)\n#\n###############\n###############\n#\n#### radam \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_radam_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_radam_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_radam_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_radam_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_radam_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_radam_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_radam_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radam   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_radam_8.log \")\n#\n#time.sleep(500)\n#### radamGC \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_radamGC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_radamGC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_radamGC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_radamGC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_radamGC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_radamGC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_radamGC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_radamGC_8.log \")\n#time.sleep(500)\n#\n#### radamGCC \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_radamGCC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 
--wd 0.0005 --alg radamGCC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_radamGCC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_radamGCC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_radamGCC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_radamGCC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_radamGCC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_radamGCC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg radamGCC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_radamGCC_8.log \")\n#time.sleep(500)\n#\n###############\n###############\n#\n#### Lsgd \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_Lsgd_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_Lsgd_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_Lsgd_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_Lsgd_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_Lsgd_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_Lsgd_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_Lsgd_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Lsgd   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_Lsgd_8.log \")\n#time.sleep(500)\n#\n#### LsgdGC \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_LsgdGC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_LsgdGC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_LsgdGC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_LsgdGC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_LsgdGC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_LsgdGC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_LsgdGC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_LsgdGC_8.log \")\n#time.sleep(500)\n#\n#### LsgdGCC 
\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_LsgdGCC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_LsgdGCC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_LsgdGCC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_LsgdGCC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_LsgdGCC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_LsgdGCC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_LsgdGCC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_LsgdGCC_8.log \")\n#time.sleep(500)\n#\n###############\n###############\n#\n#### Ladam \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_Ladam_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_Ladam_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_Ladam_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_Ladam_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_Ladam_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_Ladam_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_Ladam_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg Ladam   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_Ladam_8.log \")\n#\n#time.sleep(500)\n#### LadamGC \n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_LadamGC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_LadamGC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_LadamGC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_LadamGC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_LadamGC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_LadamGC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r50 --gpug 16 > 
logout/r50_lr11_wd45_LadamGC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_LadamGC_8.log \")\n#time.sleep(500)\n#\n#### LadamGCC \n
#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_LadamGCC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_LadamGCC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_LadamGCC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_LadamGCC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_LadamGCC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_LadamGCC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_LadamGCC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg LadamGCC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_LadamGCC_8.log \")\n#time.sleep(500)\n#\n###############\n###############\n#\n#### ranger\n
#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_ranger_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_ranger_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_ranger_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_ranger_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_ranger_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_ranger_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_ranger_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg ranger   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_ranger_8.log \")\n#\n#time.sleep(500)\n#### rangerGC \n
#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_rangerGC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_rangerGC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_rangerGC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_rangerGC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_rangerGC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_rangerGC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_rangerGC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_rangerGC_8.log \")\n#time.sleep(500)\n#\n#### rangerGCC \n
#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r50 --gpug 10 > logout/r50_lr11_wd45_rangerGCC_1.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r50 --gpug 11 > logout/r50_lr11_wd45_rangerGCC_2.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r50 --gpug 12 > logout/r50_lr11_wd45_rangerGCC_3.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r50 --gpug 13 > logout/r50_lr11_wd45_rangerGCC_4.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r50 --gpug 14 > logout/r50_lr11_wd45_rangerGCC_5.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r50 --gpug 15 > logout/r50_lr11_wd45_rangerGCC_6.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r50 --gpug 16 > logout/r50_lr11_wd45_rangerGCC_7.log &\")\n#os.system(\"nohup  python  main.py --lr 0.1 --wd 0.0005 --alg rangerGCC   --epochs 200  --model r50 --gpug 17 > logout/r50_lr11_wd45_rangerGCC_8.log \")\n#time.sleep(500)\n\n##############\n\n
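# The launch grid above repeats one command template in which only the\n# optimizer name (--alg), the GPU id (--gpug), and the log suffix change.\n# A minimal loop-based sketch of the same pattern is given below;\n# launch_grid() is a hypothetical helper, and it assumes the `import os`\n# at the top of this script together with the main.py flags used above.\n
def launch_grid(alg, gpus=tuple(range(10, 18)), lr=0.1, wd=0.0005):\n    for run_id, gpu in enumerate(gpus, start=1):\n        # As in the runs above, every launch except the last is backgrounded.\n        bg = \"&\" if gpu != gpus[-1] else \"\"\n
        os.system(\"nohup python main.py --lr {} --wd {} --alg {} --epochs 200 \"\n                  \"--model r50 --gpug {} > logout/r50_lr11_wd45_{}_{}.log {}\"\n                  .format(lr, wd, alg, gpu, alg, run_id, bg))\n
# Example usage (kept commented out, like the runs above):\n#launch_grid(\"rangerGCC\")\n#time.sleep(500)\n\n"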
  }
]