class Adagrad_GCC(Optimizer):
    """Adagrad with Gradient Centralization on Conv layers (GCC variant).

    Identical to ``torch.optim.Adagrad`` except that, for parameters with
    more than 3 dimensions (conv kernels), the gradient is centralized —
    its mean over all dims except dim 0 (output channels) is subtracted —
    before the accumulator update.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts
            defining parameter groups
        lr (float, optional): learning rate (default: 1e-2)
        lr_decay (float, optional): learning rate decay (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        initial_accumulator_value (float, optional): starting value of the
            squared-gradient accumulator (default: 0)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-10)

    .. _Adaptive Subgradient Methods for Online Learning and Stochastic
        Optimization: http://jmlr.org/papers/v12/duchi11a.html
    """

    def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0,
                 initial_accumulator_value=0, eps=1e-10):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= lr_decay:
            raise ValueError("Invalid lr_decay value: {}".format(lr_decay))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        if not 0.0 <= initial_accumulator_value:
            raise ValueError("Invalid initial_accumulator_value value: {}".format(initial_accumulator_value))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))

        defaults = dict(lr=lr, lr_decay=lr_decay, eps=eps,
                        weight_decay=weight_decay,
                        initial_accumulator_value=initial_accumulator_value)
        super(Adagrad_GCC, self).__init__(params, defaults)

        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = 0
                state['sum'] = torch.full_like(p.data, initial_accumulator_value)

    def share_memory(self):
        # Move the accumulator into shared memory (multi-process training).
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['sum'].share_memory_()

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue

                grad = p.grad.data
                state = self.state[p]
                state['step'] += 1

                if group['weight_decay'] != 0:
                    if p.grad.data.is_sparse:
                        raise RuntimeError("weight_decay option is not compatible with sparse gradients")
                    # FIX: the positional `add(scalar, tensor)` overload was
                    # deprecated in PyTorch 1.5 and later removed; use the
                    # keyword `alpha` form (identical result).
                    grad = grad.add(p.data, alpha=group['weight_decay'])

                # GC operation for Conv layers (ndim > 3): subtract the mean
                # taken over every dim except the output-channel dim.
                if len(list(grad.size())) > 3:
                    grad.add_(-grad.mean(dim=tuple(range(1, len(list(grad.size())))), keepdim=True))

                # Decayed learning rate for this step.
                clr = group['lr'] / (1 + (state['step'] - 1) * group['lr_decay'])

                if grad.is_sparse:
                    grad = grad.coalesce()  # the update is non-linear so indices must be unique
                    grad_indices = grad._indices()
                    grad_values = grad._values()
                    size = grad.size()

                    def make_sparse(values):
                        constructor = grad.new
                        if grad_indices.dim() == 0 or values.dim() == 0:
                            return constructor().resize_as_(grad)
                        return constructor(grad_indices, values, size)

                    state['sum'].add_(make_sparse(grad_values.pow(2)))
                    std = state['sum'].sparse_mask(grad)
                    std_values = std._values().sqrt_().add_(group['eps'])
                    # FIX: keyword `alpha` form instead of removed positional overload.
                    p.data.add_(make_sparse(grad_values / std_values), alpha=-clr)
                else:
                    # FIX: keyword `value` forms instead of removed positional overloads.
                    state['sum'].addcmul_(grad, grad, value=1)
                    std = state['sum'].sqrt().add_(group['eps'])
                    p.data.addcdiv_(grad, std, value=-clr)

        return loss
class Adagrad_GC(Optimizer):
    """Adagrad with Gradient Centralization on Conv AND FC layers (GC variant).

    Identical to ``torch.optim.Adagrad`` except that, for any parameter with
    more than 1 dimension (conv kernels and FC weight matrices), the gradient
    is centralized — its mean over all dims except dim 0 is subtracted —
    before the accumulator update.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts
            defining parameter groups
        lr (float, optional): learning rate (default: 1e-2)
        lr_decay (float, optional): learning rate decay (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        initial_accumulator_value (float, optional): starting value of the
            squared-gradient accumulator (default: 0)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-10)

    .. _Adaptive Subgradient Methods for Online Learning and Stochastic
        Optimization: http://jmlr.org/papers/v12/duchi11a.html
    """

    def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0,
                 initial_accumulator_value=0, eps=1e-10):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= lr_decay:
            raise ValueError("Invalid lr_decay value: {}".format(lr_decay))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        if not 0.0 <= initial_accumulator_value:
            raise ValueError("Invalid initial_accumulator_value value: {}".format(initial_accumulator_value))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))

        defaults = dict(lr=lr, lr_decay=lr_decay, eps=eps,
                        weight_decay=weight_decay,
                        initial_accumulator_value=initial_accumulator_value)
        super(Adagrad_GC, self).__init__(params, defaults)

        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = 0
                state['sum'] = torch.full_like(p.data, initial_accumulator_value)

    def share_memory(self):
        # Move the accumulator into shared memory (multi-process training).
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['sum'].share_memory_()

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue

                grad = p.grad.data
                state = self.state[p]
                state['step'] += 1

                if group['weight_decay'] != 0:
                    if p.grad.data.is_sparse:
                        raise RuntimeError("weight_decay option is not compatible with sparse gradients")
                    # FIX: keyword `alpha` form; the positional `add(scalar,
                    # tensor)` overload was removed from PyTorch.
                    grad = grad.add(p.data, alpha=group['weight_decay'])

                # GC operation for Conv and FC layers (ndim > 1): subtract the
                # mean taken over every dim except dim 0.
                if len(list(grad.size())) > 1:
                    grad.add_(-grad.mean(dim=tuple(range(1, len(list(grad.size())))), keepdim=True))

                # Decayed learning rate for this step.
                clr = group['lr'] / (1 + (state['step'] - 1) * group['lr_decay'])

                if grad.is_sparse:
                    grad = grad.coalesce()  # the update is non-linear so indices must be unique
                    grad_indices = grad._indices()
                    grad_values = grad._values()
                    size = grad.size()

                    def make_sparse(values):
                        constructor = grad.new
                        if grad_indices.dim() == 0 or values.dim() == 0:
                            return constructor().resize_as_(grad)
                        return constructor(grad_indices, values, size)

                    state['sum'].add_(make_sparse(grad_values.pow(2)))
                    std = state['sum'].sparse_mask(grad)
                    std_values = std._values().sqrt_().add_(group['eps'])
                    # FIX: keyword `alpha` form instead of removed positional overload.
                    p.data.add_(make_sparse(grad_values / std_values), alpha=-clr)
                else:
                    # FIX: keyword `value` forms instead of removed positional overloads.
                    state['sum'].addcmul_(grad, grad, value=1)
                    std = state['sum'].sqrt().add_(group['eps'])
                    p.data.addcdiv_(grad, std, value=-clr)

        return loss
class Adam_GCC(Optimizer):
    """Adam with Gradient Centralization on Conv layers only (GCC variant).

    Identical to ``torch.optim.Adam`` except that, for parameters with more
    than 3 dimensions (conv kernels), the gradient is centralized — its mean
    over all dims except dim 0 is subtracted — before the moment updates.

    Arguments:
        params (iterable): parameters to optimize or dicts of param groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant
            (default: False)
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(Adam_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Adam_GCC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                if group['weight_decay'] != 0:
                    # FIX: keyword `alpha` form; the positional `add_(scalar,
                    # tensor)` overload was removed from PyTorch.
                    grad.add_(p.data, alpha=group['weight_decay'])

                # GC operation for Conv layers (ndim > 3)
                if len(list(grad.size())) > 3:
                    grad.add_(-grad.mean(dim=tuple(range(1, len(list(grad.size())))), keepdim=True))

                # Decay the first and second moment running average coefficient
                # (FIX: keyword `alpha`/`value` forms replace removed overloads).
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                step_size = group['lr'] / bias_correction1
                p.data.addcdiv_(exp_avg, denom, value=-step_size)

        return loss
class Adam_GCC2(Optimizer):
    """Adam variant that centralizes the UPDATE (not the gradient) on Conv layers.

    Identical to ``torch.optim.Adam`` except that, for parameters with more
    than 3 dimensions, the final parameter update (delta) is centralized —
    its mean over all dims except dim 0 is subtracted — before being applied.

    Arguments:
        params (iterable): parameters to optimize or dicts of param groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant
            (default: False)
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(Adam_GCC2, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Adam_GCC2, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                if group['weight_decay'] != 0:
                    # FIX: keyword `alpha` form; positional overload removed from PyTorch.
                    grad.add_(p.data, alpha=group['weight_decay'])

                # Decay the first and second moment running average coefficient
                # (FIX: keyword `alpha`/`value` forms replace removed overloads).
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                step_size = group['lr'] / bias_correction1

                # GC applied to the update itself for Conv layers (ndim > 3).
                if len(list(grad.size())) > 3:
                    # The expression already allocates a fresh tensor, so the
                    # original redundant .clone() is dropped.
                    delta = step_size * exp_avg / denom
                    delta.add_(-delta.mean(dim=tuple(range(1, len(list(grad.size())))), keepdim=True))
                    p.data.add_(-delta)
                else:
                    p.data.addcdiv_(exp_avg, denom, value=-step_size)

        return loss
class Adam_GC(Optimizer):
    r"""Adam with Gradient Centralization on Conv AND FC layers (GC variant).

    Identical to ``torch.optim.Adam`` except that, for any parameter with
    more than 1 dimension, the gradient is centralized — its mean over all
    dims except dim 0 is subtracted — before the moment updates.

    Arguments:
        params (iterable): parameters to optimize or dicts of param groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant
            (default: False)

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(Adam_GC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Adam_GC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                if group['weight_decay'] != 0:
                    # FIX: keyword `alpha` form; positional overload removed from PyTorch.
                    grad.add_(p.data, alpha=group['weight_decay'])

                # GC operation for Conv layers and FC layers (ndim > 1)
                if len(list(grad.size())) > 1:
                    grad.add_(-grad.mean(dim=tuple(range(1, len(list(grad.size())))), keepdim=True))

                # Decay the first and second moment running average coefficient
                # (FIX: keyword `alpha`/`value` forms replace removed overloads).
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                step_size = group['lr'] / bias_correction1
                p.data.addcdiv_(exp_avg, denom, value=-step_size)

        return loss
class Adam_GC2(Optimizer):
    """Adam variant that centralizes the UPDATE (not the gradient) on Conv and FC layers.

    Identical to ``torch.optim.Adam`` except that, for any parameter with
    more than 1 dimension, the final parameter update (delta) is centralized
    — its mean over all dims except dim 0 is subtracted — before being applied.

    Arguments:
        params (iterable): parameters to optimize or dicts of param groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant
            (default: False)
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(Adam_GC2, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Adam_GC2, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                if group['weight_decay'] != 0:
                    # FIX: keyword `alpha` form; positional overload removed from PyTorch.
                    grad.add_(p.data, alpha=group['weight_decay'])

                # Decay the first and second moment running average coefficient
                # (FIX: keyword `alpha`/`value` forms replace removed overloads).
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                step_size = group['lr'] / bias_correction1

                # GC applied to the update itself for Conv and FC layers (ndim > 1).
                if len(list(grad.size())) > 1:
                    # The expression already allocates a fresh tensor, so the
                    # original redundant .clone() is dropped.
                    delta = step_size * exp_avg / denom
                    delta.add_(-delta.mean(dim=tuple(range(1, len(list(grad.size())))), keepdim=True))
                    p.data.add_(-delta)
                else:
                    p.data.addcdiv_(exp_avg, denom, value=-step_size)

        return loss
class AdamW(Optimizer):
    r"""Implements AdamW: Adam with decoupled weight decay.

    Weight decay is not added to the gradient; instead ``weight_decay * p``
    is folded directly into the parameter update. Note this implementation
    applies the bias correction inside ``step_size``
    (``lr * sqrt(bc2) / bc1``) rather than inside ``denom``.

    Arguments:
        params (iterable): parameters to optimize or dicts of param groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): decoupled weight decay (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant
            (default: False)

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(AdamW, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(AdamW, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1

                # Decoupled weight decay: deliberately NOT added to the gradient.
                # Decay the first and second moment running average coefficient
                # (FIX: keyword `alpha`/`value` forms replace removed overloads).
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                # p -= step_size * (weight_decay * p + exp_avg / denom)
                # (FIX: keyword `alpha`/`value` forms replace removed overloads).
                p.data.add_(torch.mul(p.data, group['weight_decay']).addcdiv_(exp_avg, denom, value=1),
                            alpha=-step_size)

        return loss
class AdamW_GCC(Optimizer):
    """AdamW (decoupled weight decay) with Gradient Centralization on Conv layers.

    Same as the plain ``AdamW`` in this module, except that for parameters
    with more than 3 dimensions the gradient is centralized — its mean over
    all dims except dim 0 is subtracted — before the moment updates.

    Arguments:
        params (iterable): parameters to optimize or dicts of param groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): decoupled weight decay (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant
            (default: False)
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(AdamW_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(AdamW_GCC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                # GC operation for Conv layers (ndim > 3)
                if len(list(grad.size())) > 3:
                    grad.add_(-grad.mean(dim=tuple(range(1, len(list(grad.size())))), keepdim=True))

                state['step'] += 1

                # Decoupled weight decay: deliberately NOT added to the gradient.
                # Decay the first and second moment running average coefficient
                # (FIX: keyword `alpha`/`value` forms replace removed overloads).
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                # p -= step_size * (weight_decay * p + exp_avg / denom)
                p.data.add_(torch.mul(p.data, group['weight_decay']).addcdiv_(exp_avg, denom, value=1),
                            alpha=-step_size)

        return loss
class AdamW_GC(Optimizer):
    """AdamW (decoupled weight decay) with Gradient Centralization on Conv AND FC layers.

    Same as the plain ``AdamW`` in this module, except that for any parameter
    with more than 1 dimension the gradient is centralized — its mean over
    all dims except dim 0 is subtracted — before the moment updates.

    Arguments:
        params (iterable): parameters to optimize or dicts of param groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): decoupled weight decay (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant
            (default: False)
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(AdamW_GC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(AdamW_GC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                # GC operation for Conv and FC layers (ndim > 1)
                if len(list(grad.size())) > 1:
                    grad.add_(-grad.mean(dim=tuple(range(1, len(list(grad.size())))), keepdim=True))

                state['step'] += 1

                # Decoupled weight decay: deliberately NOT added to the gradient.
                # Decay the first and second moment running average coefficient
                # (FIX: keyword `alpha`/`value` forms replace removed overloads).
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                # p -= step_size * (weight_decay * p + exp_avg / denom)
                p.data.add_(torch.mul(p.data, group['weight_decay']).addcdiv_(exp_avg, denom, value=1),
                            alpha=-step_size)

        return loss
class AdamW_GCC2(Optimizer):
    """AdamW variant that centralizes the UPDATE (not the gradient) on Conv layers.

    Same as the plain ``AdamW`` in this module, except that for parameters
    with more than 3 dimensions the final update (weight-decay term plus
    ``exp_avg / denom``, scaled by ``step_size``) is centralized — its mean
    over all dims except dim 0 is subtracted — before being applied.

    Arguments:
        params (iterable): parameters to optimize or dicts of param groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): decoupled weight decay (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant
            (default: False)
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(AdamW_GCC2, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(AdamW_GCC2, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1

                # Decoupled weight decay: deliberately NOT added to the gradient.
                # Decay the first and second moment running average coefficient
                # (FIX: keyword `alpha`/`value` forms replace removed overloads).
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                # GC applied to the update itself for Conv layers (ndim > 3).
                if len(list(grad.size())) > 3:
                    # The expression already allocates a fresh tensor, so the
                    # original redundant .clone() is dropped.
                    delta = step_size * torch.mul(p.data, group['weight_decay']).addcdiv_(exp_avg, denom, value=1)
                    delta.add_(-delta.mean(dim=tuple(range(1, len(list(grad.size())))), keepdim=True))
                    p.data.add_(-delta)
                else:
                    p.data.add_(torch.mul(p.data, group['weight_decay']).addcdiv_(exp_avg, denom, value=1),
                                alpha=-step_size)

        return loss
Arguments: params (iterable): iterable of parameters to optimize or dicts defining parameter groups lr (float, optional): learning rate (default: 1e-3) betas (Tuple[float, float], optional): coefficients used for computing running averages of gradient and its square (default: (0.9, 0.999)) eps (float, optional): term added to the denominator to improve numerical stability (default: 1e-8) weight_decay (float, optional): weight decay (L2 penalty) (default: 0) amsgrad (boolean, optional): whether to use the AMSGrad variant of this algorithm from the paper `On the Convergence of Adam and Beyond`_ .. _Adam\: A Method for Stochastic Optimization: https://arxiv.org/abs/1412.6980 .. _On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ """ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False): if not 0.0 <= lr: raise ValueError("Invalid learning rate: {}".format(lr)) if not 0.0 <= eps: raise ValueError("Invalid epsilon value: {}".format(eps)) if not 0.0 <= betas[0] < 1.0: raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) if not 0.0 <= betas[1] < 1.0: raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad) super(AdamW_GC2, self).__init__(params, defaults) def __setstate__(self, state): super(AdamW_GC2, self).__setstate__(state) for group in self.param_groups: group.setdefault('amsgrad', False) def step(self, closure=None): """Performs a single optimization step. Arguments: closure (callable, optional): A closure that reevaluates the model and returns the loss. 
""" loss = None if closure is not None: loss = closure() for group in self.param_groups: for p in group['params']: if p.grad is None: continue grad = p.grad.data if grad.is_sparse: raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') amsgrad = group['amsgrad'] state = self.state[p] # State initialization if len(state) == 0: state['step'] = 0 # Exponential moving average of gradient values state['exp_avg'] = torch.zeros_like(p.data) # Exponential moving average of squared gradient values state['exp_avg_sq'] = torch.zeros_like(p.data) if amsgrad: # Maintains max of all exp. moving avg. of sq. grad. values state['max_exp_avg_sq'] = torch.zeros_like(p.data) exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] if amsgrad: max_exp_avg_sq = state['max_exp_avg_sq'] beta1, beta2 = group['betas'] state['step'] += 1 # if group['weight_decay'] != 0: # grad = grad.add(group['weight_decay'], p.data) # Decay the first and second moment running average coefficient exp_avg.mul_(beta1).add_(1 - beta1, grad) exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) if amsgrad: # Maintains the maximum of all 2nd moment running avg. till now torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) # Use the max. for normalizing running avg. 
of gradient denom = max_exp_avg_sq.sqrt().add_(group['eps']) else: denom = exp_avg_sq.sqrt().add_(group['eps']) bias_correction1 = 1 - beta1 ** state['step'] bias_correction2 = 1 - beta2 ** state['step'] step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 # GC operation for Conv and FC layers if len(list(grad.size()))>1: delta=(step_size*torch.mul(p.data, group['weight_decay']).addcdiv_(1, exp_avg, denom)).clone() delta.add_(-delta.mean(dim = tuple(range(1,len(list(grad.size())))), keepdim = True)) p.data.add_(-delta) else: p.data.add_(-step_size, torch.mul(p.data, group['weight_decay']).addcdiv_(1, exp_avg, denom) ) return loss ================================================ FILE: GC_code/CIFAR100/algorithm/SGD.py ================================================ import torch from torch.optim.optimizer import Optimizer, required class SGD_GCC(Optimizer): def __init__(self, params, lr=required, momentum=0, dampening=0, weight_decay=0, nesterov=False): if lr is not required and lr < 0.0: raise ValueError("Invalid learning rate: {}".format(lr)) if momentum < 0.0: raise ValueError("Invalid momentum value: {}".format(momentum)) if weight_decay < 0.0: raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) defaults = dict(lr=lr, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=nesterov) if nesterov and (momentum <= 0 or dampening != 0): raise ValueError("Nesterov momentum requires a momentum and zero dampening") super(SGD_GCC, self).__init__(params, defaults) def __setstate__(self, state): super(SGD_GCC, self).__setstate__(state) for group in self.param_groups: group.setdefault('nesterov', False) def step(self, closure=None): """Performs a single optimization step. Arguments: closure (callable, optional): A closure that reevaluates the model and returns the loss. 
""" loss = None if closure is not None: loss = closure() for group in self.param_groups: weight_decay = group['weight_decay'] momentum = group['momentum'] dampening = group['dampening'] nesterov = group['nesterov'] for p in group['params']: if p.grad is None: continue d_p = p.grad.data if weight_decay != 0: d_p.add_(weight_decay, p.data) #GC operation for Conv layers if len(list(d_p.size()))>3: d_p.add_(-d_p.mean(dim = tuple(range(1,len(list(d_p.size())))), keepdim = True)) if momentum != 0: param_state = self.state[p] if 'momentum_buffer' not in param_state: buf = param_state['momentum_buffer'] = torch.clone(d_p).detach() else: buf = param_state['momentum_buffer'] buf.mul_(momentum).add_(1 - dampening, d_p) if nesterov: d_p = d_p.add(momentum, buf) else: d_p = buf p.data.add_(-group['lr'], d_p) return loss class SGD_GC(Optimizer): def __init__(self, params, lr=required, momentum=0, dampening=0, weight_decay=0, nesterov=False): if lr is not required and lr < 0.0: raise ValueError("Invalid learning rate: {}".format(lr)) if momentum < 0.0: raise ValueError("Invalid momentum value: {}".format(momentum)) if weight_decay < 0.0: raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) defaults = dict(lr=lr, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=nesterov) if nesterov and (momentum <= 0 or dampening != 0): raise ValueError("Nesterov momentum requires a momentum and zero dampening") super(SGD_GC, self).__init__(params, defaults) def __setstate__(self, state): super(SGD_GC, self).__setstate__(state) for group in self.param_groups: group.setdefault('nesterov', False) def step(self, closure=None): """Performs a single optimization step. Arguments: closure (callable, optional): A closure that reevaluates the model and returns the loss. 
""" loss = None if closure is not None: loss = closure() for group in self.param_groups: weight_decay = group['weight_decay'] momentum = group['momentum'] dampening = group['dampening'] nesterov = group['nesterov'] for p in group['params']: if p.grad is None: continue d_p = p.grad.data if weight_decay != 0: d_p.add_(weight_decay, p.data) #GC operation for Conv layers and FC layers if len(list(d_p.size()))>1: d_p.add_(-d_p.mean(dim = tuple(range(1,len(list(d_p.size())))), keepdim = True)) if momentum != 0: param_state = self.state[p] if 'momentum_buffer' not in param_state: buf = param_state['momentum_buffer'] = torch.clone(d_p).detach() else: buf = param_state['momentum_buffer'] buf.mul_(momentum).add_(1 - dampening, d_p) if nesterov: d_p = d_p.add(momentum, buf) else: d_p = buf p.data.add_(-group['lr'], d_p) return loss class SGDW(Optimizer): def __init__(self, params, lr=required, momentum=0, dampening=0, weight_decay=0, nesterov=False): if lr is not required and lr < 0.0: raise ValueError("Invalid learning rate: {}".format(lr)) if momentum < 0.0: raise ValueError("Invalid momentum value: {}".format(momentum)) if weight_decay < 0.0: raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) defaults = dict(lr=lr, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=nesterov) if nesterov and (momentum <= 0 or dampening != 0): raise ValueError("Nesterov momentum requires a momentum and zero dampening") super(SGDW, self).__init__(params, defaults) def __setstate__(self, state): super(SGDW, self).__setstate__(state) for group in self.param_groups: group.setdefault('nesterov', False) def step(self, closure=None): """Performs a single optimization step. Arguments: closure (callable, optional): A closure that reevaluates the model and returns the loss. 
""" loss = None if closure is not None: loss = closure() for group in self.param_groups: weight_decay = group['weight_decay'] momentum = group['momentum'] dampening = group['dampening'] nesterov = group['nesterov'] for p in group['params']: if p.grad is None: continue d_p = p.grad.data old = torch.clone(p.data).detach() #if weight_decay != 0: # d_p.add_(weight_decay, p.data) if momentum != 0: param_state = self.state[p] if 'momentum_buffer' not in param_state: buf = param_state['momentum_buffer'] = torch.zeros_like(p.data) buf.mul_(momentum).add_(d_p) else: buf = param_state['momentum_buffer'] buf.mul_(momentum).add_(1 - dampening, d_p) if nesterov: d_p = d_p.add(momentum, buf) else: d_p = buf p.data.add_(-group['lr'], d_p) if weight_decay != 0: p.data.add_(-weight_decay*group['lr'], old) return loss class SGDW_GCC(Optimizer): def __init__(self, params, lr=required, momentum=0, dampening=0, weight_decay=0, nesterov=False): if lr is not required and lr < 0.0: raise ValueError("Invalid learning rate: {}".format(lr)) if momentum < 0.0: raise ValueError("Invalid momentum value: {}".format(momentum)) if weight_decay < 0.0: raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) defaults = dict(lr=lr, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=nesterov) if nesterov and (momentum <= 0 or dampening != 0): raise ValueError("Nesterov momentum requires a momentum and zero dampening") super(SGDW_GCC, self).__init__(params, defaults) def __setstate__(self, state): super(SGDW_GCC, self).__setstate__(state) for group in self.param_groups: group.setdefault('nesterov', False) def step(self, closure=None): """Performs a single optimization step. Arguments: closure (callable, optional): A closure that reevaluates the model and returns the loss. 
""" loss = None if closure is not None: loss = closure() for group in self.param_groups: weight_decay = group['weight_decay'] momentum = group['momentum'] dampening = group['dampening'] nesterov = group['nesterov'] for p in group['params']: if p.grad is None: continue d_p = p.grad.data old = torch.clone(p.data).detach() #if weight_decay != 0: # d_p.add_(weight_decay, p.data) #GC operation for Conv layers if len(list(d_p.size()))>3: d_p.add_(-d_p.mean(dim = tuple(range(1,len(list(d_p.size())))), keepdim = True)) if momentum != 0: param_state = self.state[p] if 'momentum_buffer' not in param_state: buf = param_state['momentum_buffer'] = torch.zeros_like(p.data) buf.mul_(momentum).add_(d_p) else: buf = param_state['momentum_buffer'] buf.mul_(momentum).add_(1 - dampening, d_p) if nesterov: d_p = d_p.add(momentum, buf) else: d_p = buf p.data.add_(-group['lr'], d_p) if weight_decay != 0: p.data.add_(-weight_decay*group['lr'], old) return loss class SGDW_GC(Optimizer): def __init__(self, params, lr=required, momentum=0, dampening=0, weight_decay=0, nesterov=False): if lr is not required and lr < 0.0: raise ValueError("Invalid learning rate: {}".format(lr)) if momentum < 0.0: raise ValueError("Invalid momentum value: {}".format(momentum)) if weight_decay < 0.0: raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) defaults = dict(lr=lr, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=nesterov) if nesterov and (momentum <= 0 or dampening != 0): raise ValueError("Nesterov momentum requires a momentum and zero dampening") super(SGDW_GC, self).__init__(params, defaults) def __setstate__(self, state): super(SGDW_GC, self).__setstate__(state) for group in self.param_groups: group.setdefault('nesterov', False) def step(self, closure=None): """Performs a single optimization step. Arguments: closure (callable, optional): A closure that reevaluates the model and returns the loss. 
""" loss = None if closure is not None: loss = closure() for group in self.param_groups: weight_decay = group['weight_decay'] momentum = group['momentum'] dampening = group['dampening'] nesterov = group['nesterov'] for p in group['params']: if p.grad is None: continue d_p = p.grad.data old = torch.clone(p.data).detach() #if weight_decay != 0: # d_p.add_(weight_decay, p.data) #GC operation for Conv and FC layers if len(list(d_p.size()))>1: d_p.add_(-d_p.mean(dim = tuple(range(1,len(list(d_p.size())))), keepdim = True)) if momentum != 0: param_state = self.state[p] if 'momentum_buffer' not in param_state: buf = param_state['momentum_buffer'] = torch.zeros_like(p.data) buf.mul_(momentum).add_(d_p) else: buf = param_state['momentum_buffer'] buf.mul_(momentum).add_(1 - dampening, d_p) if nesterov: d_p = d_p.add(momentum, buf) else: d_p = buf p.data.add_(-group['lr'], d_p) if weight_decay != 0: p.data.add_(-weight_decay*group['lr'], old) return loss ================================================ FILE: GC_code/CIFAR100/main.py ================================================ '''Train CIFAR100 with PyTorch.''' from __future__ import print_function import torch import torch.nn as nn import torch.backends.cudnn as cudnn import torch.optim as optim import torch.nn.functional as F import torchvision import torchvision.transforms as transforms from torch.optim import lr_scheduler import os import argparse from torchvision import datasets, models from models import * #from utils import progress_bar import numpy as np #import optimizers with GC from algorithm.SGD import * from algorithm.Adam import * from algorithm.Adagrad import * parser = argparse.ArgumentParser(description='PyTorch CIFAR100 Training') parser.add_argument('--lr', default=0.1, type=float, help='learning rate') parser.add_argument('--resume', '-r', action='store_true', help='resume from checkpoint') parser.add_argument('--bs', default=128, type=int, help='batchsize') parser.add_argument('--wd', default=0.0005, 
type=float, help='weight decay') parser.add_argument('--alg', default='sgd', type=str, help='algorithm') parser.add_argument('--epochs', default=200, type=int, help='epochs') parser.add_argument('--path', default='logout/result', type=str, help='path') parser.add_argument('--model', default='r50', type=str, help='model') args = parser.parse_args() os.environ["CUDA_VISIBLE_DEVICES"]="0" epochs=args.epochs device = 'cuda' if torch.cuda.is_available() else 'cpu' best_acc = 0 # best test accuracy start_epoch = 0 # start from epoch 0 or last checkpoint epoch # Data print('==> Preparing data..') transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.507, 0.487, 0.441), (0.267, 0.256, 0.276)), ]) transform_test = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.507, 0.487, 0.441), (0.267, 0.256, 0.276)), ]) trainset = torchvision.datasets.CIFAR100(root='/home/yonghw/data/cifar100/', train=True, download=True, transform=transform_train) trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.bs, shuffle=True, num_workers=4,drop_last=True) testset = torchvision.datasets.CIFAR100(root='/home/yonghw/data/cifar100/', train=False, download=True, transform=transform_test) testloader = torch.utils.data.DataLoader(testset, batch_size=128, shuffle=False, num_workers=4) # Model print('==> Building model..') Num_classes = 100 if args.model=='r18': net = ResNet18(Num_classes=Num_classes) if args.model=='r34': net = ResNet34(Num_classes=Num_classes) if args.model=='r50': net = ResNet50(Num_classes=Num_classes) if args.model=='r101': net = ResNet101(Num_classes=Num_classes) if args.model=='v11': net = VGG('VGG11',Num_classes=Num_classes) if args.model=='rx29': net = ResNeXt29_4x64d(Num_classes=Num_classes) if args.model=='d121': net = DenseNet121(Num_classes=Num_classes) if device == 'cuda': net = net.cuda() net = torch.nn.DataParallel(net) 
cudnn.benchmark = True if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..') assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!' checkpoint = torch.load('./checkpoint/ckpt.t7') net.load_state_dict(checkpoint['net']) best_acc = checkpoint['acc'] start_epoch = checkpoint['epoch'] criterion = nn.CrossEntropyLoss() #optimizer WD=args.wd print('==> choose optimizer..') if args.alg=='sgd': optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD) if args.alg=='sgdGC': optimizer = SGD_GC(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD) if args.alg=='sgdGCC': optimizer = SGD_GCC(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD) if args.alg=='adam': optimizer = optim.Adam(net.parameters(), lr=args.lr*0.01, weight_decay = WD) if args.alg=='adamGC': optimizer = Adam_GC(net.parameters(), lr=args.lr*0.01, weight_decay = WD) if args.alg=='adamGCC': optimizer = Adam_GCC(net.parameters(), lr=args.lr*0.01, weight_decay = WD) if args.alg=='adamGC2': optimizer = Adam_GC2(net.parameters(), lr=args.lr*0.01, weight_decay = WD) if args.alg=='adamGCC2': optimizer = Adam_GCC2(net.parameters(), lr=args.lr*0.01, weight_decay = WD) if args.alg=='adagrad': optimizer = optim.Adagrad(net.parameters(), lr=args.lr*0.1,weight_decay = WD) if args.alg=='adagradGC': optimizer = Adagrad_GC(net.parameters(), lr=args.lr*0.1,weight_decay = WD) if args.alg=='adagradGCC': optimizer = Adagrad_GCC(net.parameters(), lr=args.lr*0.1,weight_decay = WD) if args.alg=='adagradGC2': optimizer = Adagrad_GC2(net.parameters(), lr=args.lr*0.1,weight_decay = WD) if args.alg=='adagradGCC2': optimizer = Adagrad_GCC2(net.parameters(), lr=args.lr*0.1,weight_decay = WD) if args.alg=='sgdW': optimizer = SGDW(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD) if args.alg=='sgdWGC': optimizer = SGDW_GC(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD) if args.alg=='sgdWGCC': optimizer = SGDW_GCC(net.parameters(), 
lr=args.lr, momentum=0.9,weight_decay = WD) if args.alg=='adamW': optimizer = AdamW(net.parameters(), lr=args.lr*0.01, weight_decay = WD) if args.alg=='adamWGC': optimizer = Adam_GC(net.parameters(), lr=args.lr*0.01, weight_decay = WD) if args.alg=='adamWGCC': optimizer = Adam_GCC(net.parameters(), lr=args.lr*0.01, weight_decay = WD) if args.alg=='adamWGC2': optimizer = Adam_GC2(net.parameters(), lr=args.lr*0.01, weight_decay = WD) if args.alg=='adamWGCC2': optimizer = Adam_GCC2(net.parameters(), lr=args.lr*0.01, weight_decay = WD) exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=60, gamma=0.1) # Training def train(epoch,net,optimizer): print('\nEpoch: %d' % epoch) net.train() train_loss = 0 correct = 0 total = 0 for batch_idx, (inputs, targets) in enumerate(trainloader): inputs, targets = inputs.to(device), targets.to(device) optimizer.zero_grad() outputs = net(inputs) loss = criterion(outputs, targets) loss.backward() optimizer.step() train_loss += loss.item() _, predicted = outputs.max(1) total += targets.size(0) correct += predicted.eq(targets).sum().item() print('Training: Loss: {:.4f} | Acc: {:.4f}'.format(train_loss/(batch_idx+1),correct/total)) # progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' # % (train_loss/(batch_idx+1), 100.*correct/total, correct, total)) acc=100.*correct/total return acc # Testing def test(epoch,net): global best_acc net.eval() test_loss = 0 correct = 0 total = 0 with torch.no_grad(): for batch_idx, (inputs, targets) in enumerate(testloader): inputs, targets = inputs.to(device), targets.to(device) outputs = net(inputs) loss = criterion(outputs, targets) test_loss += loss.item() _, predicted = outputs.max(1) total += targets.size(0) correct += predicted.eq(targets).sum().item() #progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' #% (test_loss/(batch_idx+1), 100.*correct/total, correct, total)) print('Testing:Loss: {:.4f} | Acc: 
{:.4f}'.format(test_loss/(batch_idx+1),correct/total) ) # Save checkpoint. acc = 100.*correct/total if acc > best_acc: print('Saving..') state = { 'net': net.state_dict(), 'acc': acc, 'epoch': epoch, } if not os.path.isdir('checkpoint'): os.mkdir('checkpoint') torch.save(state, './checkpoint/ckpt.t7') best_acc = acc return acc for epoch in range(start_epoch, start_epoch+epochs): train_acc=train(epoch,net,optimizer) exp_lr_scheduler.step() val_acc=test(epoch,net) ================================================ FILE: GC_code/CIFAR100/models/__init__.py ================================================ from .vgg import * from .dpn import * from .lenet import * from .senet import * from .pnasnet import * from .densenet import * from .googlenet import * from .shufflenet import * from .resnet import * from .resnext import * from .preact_resnet import * from .mobilenet import * from .mobilenetv2 import * ================================================ FILE: GC_code/CIFAR100/models/densenet.py ================================================ '''DenseNet in PyTorch.''' import math import torch import torch.nn as nn import torch.nn.functional as F class Bottleneck(nn.Module): def __init__(self, in_planes, growth_rate): super(Bottleneck, self).__init__() self.bn1 = nn.BatchNorm2d(in_planes) self.conv1 = nn.Conv2d(in_planes, 4*growth_rate, kernel_size=1, bias=False) self.bn2 = nn.BatchNorm2d(4*growth_rate) self.conv2 = nn.Conv2d(4*growth_rate, growth_rate, kernel_size=3, padding=1, bias=False) def forward(self, x): out = self.conv1(F.relu(self.bn1(x))) out = self.conv2(F.relu(self.bn2(out))) out = torch.cat([out,x], 1) return out class Transition(nn.Module): def __init__(self, in_planes, out_planes): super(Transition, self).__init__() self.bn = nn.BatchNorm2d(in_planes) self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=False) def forward(self, x): out = self.conv(F.relu(self.bn(x))) out = F.avg_pool2d(out, 2) return out class DenseNet(nn.Module): def 
__init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10): super(DenseNet, self).__init__() self.growth_rate = growth_rate num_planes = 2*growth_rate self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, padding=1, bias=False) self.dense1 = self._make_dense_layers(block, num_planes, nblocks[0]) num_planes += nblocks[0]*growth_rate out_planes = int(math.floor(num_planes*reduction)) self.trans1 = Transition(num_planes, out_planes) num_planes = out_planes self.dense2 = self._make_dense_layers(block, num_planes, nblocks[1]) num_planes += nblocks[1]*growth_rate out_planes = int(math.floor(num_planes*reduction)) self.trans2 = Transition(num_planes, out_planes) num_planes = out_planes self.dense3 = self._make_dense_layers(block, num_planes, nblocks[2]) num_planes += nblocks[2]*growth_rate out_planes = int(math.floor(num_planes*reduction)) self.trans3 = Transition(num_planes, out_planes) num_planes = out_planes self.dense4 = self._make_dense_layers(block, num_planes, nblocks[3]) num_planes += nblocks[3]*growth_rate self.bn = nn.BatchNorm2d(num_planes) self.linear = nn.Linear(num_planes, num_classes) def _make_dense_layers(self, block, in_planes, nblock): layers = [] for i in range(nblock): layers.append(block(in_planes, self.growth_rate)) in_planes += self.growth_rate return nn.Sequential(*layers) def forward(self, x): out = self.conv1(x) out = self.trans1(self.dense1(out)) out = self.trans2(self.dense2(out)) out = self.trans3(self.dense3(out)) out = self.dense4(out) out = F.avg_pool2d(F.relu(self.bn(out)), 4) out = out.view(out.size(0), -1) out = self.linear(out) return out def DenseNet121(Num_classes=10): return DenseNet(Bottleneck, [6,12,24,16], growth_rate=32, num_classes=Num_classes) def DenseNet169(Num_classes=10): return DenseNet(Bottleneck, [6,12,32,32], growth_rate=32, num_classes=Num_classes) def DenseNet201(Num_classes=10): return DenseNet(Bottleneck, [6,12,48,32], growth_rate=32, num_classes=Num_classes) def DenseNet161(Num_classes=10): 
return DenseNet(Bottleneck, [6,12,36,24], growth_rate=48, num_classes=Num_classes) def densenet_cifar(Num_classes=10): return DenseNet(Bottleneck, [6,12,24,16], growth_rate=12, num_classes=Num_classes) def test(): net = densenet_cifar() x = torch.randn(1,3,32,32) y = net(x) print(y) # test() ================================================ FILE: GC_code/CIFAR100/models/dpn.py ================================================ '''Dual Path Networks in PyTorch.''' import torch import torch.nn as nn import torch.nn.functional as F class Bottleneck(nn.Module): def __init__(self, last_planes, in_planes, out_planes, dense_depth, stride, first_layer): super(Bottleneck, self).__init__() self.out_planes = out_planes self.dense_depth = dense_depth self.conv1 = nn.Conv2d(last_planes, in_planes, kernel_size=1, bias=False) self.bn1 = nn.BatchNorm2d(in_planes) self.conv2 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=32, bias=False) self.bn2 = nn.BatchNorm2d(in_planes) self.conv3 = nn.Conv2d(in_planes, out_planes+dense_depth, kernel_size=1, bias=False) self.bn3 = nn.BatchNorm2d(out_planes+dense_depth) self.shortcut = nn.Sequential() if first_layer: self.shortcut = nn.Sequential( nn.Conv2d(last_planes, out_planes+dense_depth, kernel_size=1, stride=stride, bias=False), nn.BatchNorm2d(out_planes+dense_depth) ) def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = F.relu(self.bn2(self.conv2(out))) out = self.bn3(self.conv3(out)) x = self.shortcut(x) d = self.out_planes out = torch.cat([x[:,:d,:,:]+out[:,:d,:,:], x[:,d:,:,:], out[:,d:,:,:]], 1) out = F.relu(out) return out class DPN(nn.Module): def __init__(self, cfg): super(DPN, self).__init__() in_planes, out_planes = cfg['in_planes'], cfg['out_planes'] num_blocks, dense_depth = cfg['num_blocks'], cfg['dense_depth'] self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) self.bn1 = nn.BatchNorm2d(64) self.last_planes = 64 self.layer1 = self._make_layer(in_planes[0], 
out_planes[0], num_blocks[0], dense_depth[0], stride=1) self.layer2 = self._make_layer(in_planes[1], out_planes[1], num_blocks[1], dense_depth[1], stride=2) self.layer3 = self._make_layer(in_planes[2], out_planes[2], num_blocks[2], dense_depth[2], stride=2) self.layer4 = self._make_layer(in_planes[3], out_planes[3], num_blocks[3], dense_depth[3], stride=2) self.linear = nn.Linear(out_planes[3]+(num_blocks[3]+1)*dense_depth[3], 10) def _make_layer(self, in_planes, out_planes, num_blocks, dense_depth, stride): strides = [stride] + [1]*(num_blocks-1) layers = [] for i,stride in enumerate(strides): layers.append(Bottleneck(self.last_planes, in_planes, out_planes, dense_depth, stride, i==0)) self.last_planes = out_planes + (i+2) * dense_depth return nn.Sequential(*layers) def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = self.layer1(out) out = self.layer2(out) out = self.layer3(out) out = self.layer4(out) out = F.avg_pool2d(out, 4) out = out.view(out.size(0), -1) out = self.linear(out) return out def DPN26(): cfg = { 'in_planes': (96,192,384,768), 'out_planes': (256,512,1024,2048), 'num_blocks': (2,2,2,2), 'dense_depth': (16,32,24,128) } return DPN(cfg) def DPN92(): cfg = { 'in_planes': (96,192,384,768), 'out_planes': (256,512,1024,2048), 'num_blocks': (3,4,20,3), 'dense_depth': (16,32,24,128) } return DPN(cfg) def test(): net = DPN92() x = torch.randn(1,3,32,32) y = net(x) print(y) # test() ================================================ FILE: GC_code/CIFAR100/models/googlenet.py ================================================ '''GoogLeNet with PyTorch.''' import torch import torch.nn as nn import torch.nn.functional as F class Inception(nn.Module): def __init__(self, in_planes, n1x1, n3x3red, n3x3, n5x5red, n5x5, pool_planes): super(Inception, self).__init__() # 1x1 conv branch self.b1 = nn.Sequential( nn.Conv2d(in_planes, n1x1, kernel_size=1), nn.BatchNorm2d(n1x1), nn.ReLU(True), ) # 1x1 conv -> 3x3 conv branch self.b2 = nn.Sequential( 
nn.Conv2d(in_planes, n3x3red, kernel_size=1), nn.BatchNorm2d(n3x3red), nn.ReLU(True), nn.Conv2d(n3x3red, n3x3, kernel_size=3, padding=1), nn.BatchNorm2d(n3x3), nn.ReLU(True), ) # 1x1 conv -> 5x5 conv branch self.b3 = nn.Sequential( nn.Conv2d(in_planes, n5x5red, kernel_size=1), nn.BatchNorm2d(n5x5red), nn.ReLU(True), nn.Conv2d(n5x5red, n5x5, kernel_size=3, padding=1), nn.BatchNorm2d(n5x5), nn.ReLU(True), nn.Conv2d(n5x5, n5x5, kernel_size=3, padding=1), nn.BatchNorm2d(n5x5), nn.ReLU(True), ) # 3x3 pool -> 1x1 conv branch self.b4 = nn.Sequential( nn.MaxPool2d(3, stride=1, padding=1), nn.Conv2d(in_planes, pool_planes, kernel_size=1), nn.BatchNorm2d(pool_planes), nn.ReLU(True), ) def forward(self, x): y1 = self.b1(x) y2 = self.b2(x) y3 = self.b3(x) y4 = self.b4(x) return torch.cat([y1,y2,y3,y4], 1) class GoogLeNet(nn.Module): def __init__(self): super(GoogLeNet, self).__init__() self.pre_layers = nn.Sequential( nn.Conv2d(3, 192, kernel_size=3, padding=1), nn.BatchNorm2d(192), nn.ReLU(True), ) self.a3 = Inception(192, 64, 96, 128, 16, 32, 32) self.b3 = Inception(256, 128, 128, 192, 32, 96, 64) self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) self.a4 = Inception(480, 192, 96, 208, 16, 48, 64) self.b4 = Inception(512, 160, 112, 224, 24, 64, 64) self.c4 = Inception(512, 128, 128, 256, 24, 64, 64) self.d4 = Inception(512, 112, 144, 288, 32, 64, 64) self.e4 = Inception(528, 256, 160, 320, 32, 128, 128) self.a5 = Inception(832, 256, 160, 320, 32, 128, 128) self.b5 = Inception(832, 384, 192, 384, 48, 128, 128) self.avgpool = nn.AvgPool2d(8, stride=1) self.linear = nn.Linear(1024, 10) def forward(self, x): out = self.pre_layers(x) out = self.a3(out) out = self.b3(out) out = self.maxpool(out) out = self.a4(out) out = self.b4(out) out = self.c4(out) out = self.d4(out) out = self.e4(out) out = self.maxpool(out) out = self.a5(out) out = self.b5(out) out = self.avgpool(out) out = out.view(out.size(0), -1) out = self.linear(out) return out def test(): net = GoogLeNet() x = 
torch.randn(1,3,32,32) y = net(x) print(y.size()) # test() ================================================ FILE: GC_code/CIFAR100/models/lenet.py ================================================ '''LeNet in PyTorch.''' import torch.nn as nn import torch.nn.functional as F class LeNet(nn.Module): def __init__(self): super(LeNet, self).__init__() self.conv1 = nn.Conv2d(3, 6, 5) self.conv2 = nn.Conv2d(6, 16, 5) self.fc1 = nn.Linear(16*5*5, 120) self.fc2 = nn.Linear(120, 84) self.fc3 = nn.Linear(84, 10) def forward(self, x): out = F.relu(self.conv1(x)) out = F.max_pool2d(out, 2) out = F.relu(self.conv2(out)) out = F.max_pool2d(out, 2) out = out.view(out.size(0), -1) out = F.relu(self.fc1(out)) out = F.relu(self.fc2(out)) out = self.fc3(out) return out ================================================ FILE: GC_code/CIFAR100/models/mobilenet.py ================================================ '''MobileNet in PyTorch. See the paper "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" for more details. 
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class Block(nn.Module):
    '''Depthwise conv + Pointwise conv'''
    def __init__(self, in_planes, out_planes, stride=1):
        super(Block, self).__init__()
        # Depthwise 3x3: groups=in_planes applies one filter per input channel,
        # so spatial filtering and channel mixing are factorized.
        self.conv1 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=in_planes, bias=False)
        self.bn1 = nn.BatchNorm2d(in_planes)
        # Pointwise 1x1: mixes channels and changes the width to out_planes.
        self.conv2 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        return out


class MobileNet(nn.Module):
    # (128,2) means conv planes=128, conv stride=2, by default conv stride=1
    cfg = [64, (128,2), 128, (256,2), 256, (512,2), 512, 512, 512, 512, 512, (1024,2), 1024]

    def __init__(self, num_classes=10):
        # NOTE(review): default num_classes=10 although this file lives under
        # CIFAR100/ -- presumably the caller overrides it; verify against main.py.
        super(MobileNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.layers = self._make_layers(in_planes=32)
        self.linear = nn.Linear(1024, num_classes)

    def _make_layers(self, in_planes):
        # Each cfg entry is either `planes` (stride 1) or a `(planes, stride)` tuple.
        layers = []
        for x in self.cfg:
            out_planes = x if isinstance(x, int) else x[0]
            stride = 1 if isinstance(x, int) else x[1]
            layers.append(Block(in_planes, out_planes, stride))
            in_planes = out_planes
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layers(out)
        # 2x2 average pool; assumes a 32x32 input has been reduced to 2x2 by the
        # five stride-2 blocks above -- TODO confirm for other input sizes.
        out = F.avg_pool2d(out, 2)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def test():
    net = MobileNet()
    x = torch.randn(1,3,32,32)
    y = net(x)
    print(y.size())

# test()
================================================ FILE: GC_code/CIFAR100/models/mobilenetv2.py ================================================ '''MobileNetV2 in PyTorch. See the paper "Inverted Residuals and Linear Bottlenecks: Mobile Networks for Classification, Detection and Segmentation" for more details.
''' import torch import torch.nn as nn import torch.nn.functional as F class Block(nn.Module): '''expand + depthwise + pointwise''' def __init__(self, in_planes, out_planes, expansion, stride): super(Block, self).__init__() self.stride = stride planes = expansion * in_planes self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, stride=1, padding=0, bias=False) self.bn1 = nn.BatchNorm2d(planes) self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, groups=planes, bias=False) self.bn2 = nn.BatchNorm2d(planes) self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False) self.bn3 = nn.BatchNorm2d(out_planes) self.shortcut = nn.Sequential() if stride == 1 and in_planes != out_planes: self.shortcut = nn.Sequential( nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False), nn.BatchNorm2d(out_planes), ) def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = F.relu(self.bn2(self.conv2(out))) out = self.bn3(self.conv3(out)) out = out + self.shortcut(x) if self.stride==1 else out return out class MobileNetV2(nn.Module): # (expansion, out_planes, num_blocks, stride) cfg = [(1, 16, 1, 1), (6, 24, 2, 1), # NOTE: change stride 2 -> 1 for CIFAR10 (6, 32, 3, 2), (6, 64, 4, 2), (6, 96, 3, 1), (6, 160, 3, 2), (6, 320, 1, 1)] def __init__(self, num_classes=10): super(MobileNetV2, self).__init__() # NOTE: change conv1 stride 2 -> 1 for CIFAR10 self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False) self.bn1 = nn.BatchNorm2d(32) self.layers = self._make_layers(in_planes=32) self.conv2 = nn.Conv2d(320, 1280, kernel_size=1, stride=1, padding=0, bias=False) self.bn2 = nn.BatchNorm2d(1280) self.linear = nn.Linear(1280, num_classes) def _make_layers(self, in_planes): layers = [] for expansion, out_planes, num_blocks, stride in self.cfg: strides = [stride] + [1]*(num_blocks-1) for stride in strides: layers.append(Block(in_planes, out_planes, expansion, stride)) in_planes = 
out_planes return nn.Sequential(*layers) def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = self.layers(out) out = F.relu(self.bn2(self.conv2(out))) # NOTE: change pooling kernel_size 7 -> 4 for CIFAR10 out = F.avg_pool2d(out, 4) out = out.view(out.size(0), -1) out = self.linear(out) return out def test(): net = MobileNetV2() x = torch.randn(2,3,32,32) y = net(x) print(y.size()) # test() ================================================ FILE: GC_code/CIFAR100/models/pnasnet.py ================================================ '''PNASNet in PyTorch. Paper: Progressive Neural Architecture Search ''' import torch import torch.nn as nn import torch.nn.functional as F class SepConv(nn.Module): '''Separable Convolution.''' def __init__(self, in_planes, out_planes, kernel_size, stride): super(SepConv, self).__init__() self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding=(kernel_size-1)//2, bias=False, groups=in_planes) self.bn1 = nn.BatchNorm2d(out_planes) def forward(self, x): return self.bn1(self.conv1(x)) class CellA(nn.Module): def __init__(self, in_planes, out_planes, stride=1): super(CellA, self).__init__() self.stride = stride self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride) if stride==2: self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False) self.bn1 = nn.BatchNorm2d(out_planes) def forward(self, x): y1 = self.sep_conv1(x) y2 = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1) if self.stride==2: y2 = self.bn1(self.conv1(y2)) return F.relu(y1+y2) class CellB(nn.Module): def __init__(self, in_planes, out_planes, stride=1): super(CellB, self).__init__() self.stride = stride # Left branch self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride) self.sep_conv2 = SepConv(in_planes, out_planes, kernel_size=3, stride=stride) # Right branch self.sep_conv3 = SepConv(in_planes, out_planes, kernel_size=5, stride=stride) if stride==2: 
self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False) self.bn1 = nn.BatchNorm2d(out_planes) # Reduce channels self.conv2 = nn.Conv2d(2*out_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False) self.bn2 = nn.BatchNorm2d(out_planes) def forward(self, x): # Left branch y1 = self.sep_conv1(x) y2 = self.sep_conv2(x) # Right branch y3 = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1) if self.stride==2: y3 = self.bn1(self.conv1(y3)) y4 = self.sep_conv3(x) # Concat & reduce channels b1 = F.relu(y1+y2) b2 = F.relu(y3+y4) y = torch.cat([b1,b2], 1) return F.relu(self.bn2(self.conv2(y))) class PNASNet(nn.Module): def __init__(self, cell_type, num_cells, num_planes): super(PNASNet, self).__init__() self.in_planes = num_planes self.cell_type = cell_type self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, stride=1, padding=1, bias=False) self.bn1 = nn.BatchNorm2d(num_planes) self.layer1 = self._make_layer(num_planes, num_cells=6) self.layer2 = self._downsample(num_planes*2) self.layer3 = self._make_layer(num_planes*2, num_cells=6) self.layer4 = self._downsample(num_planes*4) self.layer5 = self._make_layer(num_planes*4, num_cells=6) self.linear = nn.Linear(num_planes*4, 10) def _make_layer(self, planes, num_cells): layers = [] for _ in range(num_cells): layers.append(self.cell_type(self.in_planes, planes, stride=1)) self.in_planes = planes return nn.Sequential(*layers) def _downsample(self, planes): layer = self.cell_type(self.in_planes, planes, stride=2) self.in_planes = planes return layer def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = self.layer1(out) out = self.layer2(out) out = self.layer3(out) out = self.layer4(out) out = self.layer5(out) out = F.avg_pool2d(out, 8) out = self.linear(out.view(out.size(0), -1)) return out def PNASNetA(): return PNASNet(CellA, num_cells=6, num_planes=44) def PNASNetB(): return PNASNet(CellB, num_cells=6, num_planes=32) def test(): net = PNASNetB() x = 
torch.randn(1,3,32,32) y = net(x) print(y) # test() ================================================ FILE: GC_code/CIFAR100/models/preact_resnet.py ================================================ '''Pre-activation ResNet in PyTorch. Reference: [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun Identity Mappings in Deep Residual Networks. arXiv:1603.05027 ''' import torch import torch.nn as nn import torch.nn.functional as F class PreActBlock(nn.Module): '''Pre-activation version of the BasicBlock.''' expansion = 1 def __init__(self, in_planes, planes, stride=1): super(PreActBlock, self).__init__() self.bn1 = nn.BatchNorm2d(in_planes) self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) self.bn2 = nn.BatchNorm2d(planes) self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) if stride != 1 or in_planes != self.expansion*planes: self.shortcut = nn.Sequential( nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False) ) def forward(self, x): out = F.relu(self.bn1(x)) shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x out = self.conv1(out) out = self.conv2(F.relu(self.bn2(out))) out += shortcut return out class PreActBottleneck(nn.Module): '''Pre-activation version of the original Bottleneck module.''' expansion = 4 def __init__(self, in_planes, planes, stride=1): super(PreActBottleneck, self).__init__() self.bn1 = nn.BatchNorm2d(in_planes) self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) self.bn2 = nn.BatchNorm2d(planes) self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) self.bn3 = nn.BatchNorm2d(planes) self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False) if stride != 1 or in_planes != self.expansion*planes: self.shortcut = nn.Sequential( nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False) ) def forward(self, x): out = 
F.relu(self.bn1(x)) shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x out = self.conv1(out) out = self.conv2(F.relu(self.bn2(out))) out = self.conv3(F.relu(self.bn3(out))) out += shortcut return out class PreActResNet(nn.Module): def __init__(self, block, num_blocks, num_classes=10): super(PreActResNet, self).__init__() self.in_planes = 64 self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) self.linear = nn.Linear(512*block.expansion, num_classes) def _make_layer(self, block, planes, num_blocks, stride): strides = [stride] + [1]*(num_blocks-1) layers = [] for stride in strides: layers.append(block(self.in_planes, planes, stride)) self.in_planes = planes * block.expansion return nn.Sequential(*layers) def forward(self, x): out = self.conv1(x) out = self.layer1(out) out = self.layer2(out) out = self.layer3(out) out = self.layer4(out) out = F.avg_pool2d(out, 4) out = out.view(out.size(0), -1) out = self.linear(out) return out def PreActResNet18(): return PreActResNet(PreActBlock, [2,2,2,2]) def PreActResNet34(): return PreActResNet(PreActBlock, [3,4,6,3]) def PreActResNet50(): return PreActResNet(PreActBottleneck, [3,4,6,3]) def PreActResNet101(): return PreActResNet(PreActBottleneck, [3,4,23,3]) def PreActResNet152(): return PreActResNet(PreActBottleneck, [3,8,36,3]) def test(): net = PreActResNet18() y = net((torch.randn(1,3,32,32))) print(y.size()) # test() ================================================ FILE: GC_code/CIFAR100/models/resnet.py ================================================ '''ResNet in PyTorch. For Pre-activation ResNet, see 'preact_resnet.py'. 
Reference: [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun Deep Residual Learning for Image Recognition. arXiv:1512.03385 ''' import torch import torch.nn as nn import torch.nn.functional as F class BasicBlock(nn.Module): expansion = 1 def __init__(self, in_planes, planes, stride=1): super(BasicBlock, self).__init__() self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) self.bn1 = nn.BatchNorm2d(planes) self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) self.bn2 = nn.BatchNorm2d(planes) self.shortcut = nn.Sequential() if stride != 1 or in_planes != self.expansion*planes: self.shortcut = nn.Sequential( nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False), nn.BatchNorm2d(self.expansion*planes) ) def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = self.bn2(self.conv2(out)) out += self.shortcut(x) out = F.relu(out) return out class Bottleneck(nn.Module): expansion = 4 def __init__(self, in_planes, planes, stride=1): super(Bottleneck, self).__init__() self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) self.bn1 = nn.BatchNorm2d(planes) self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) self.bn2 = nn.BatchNorm2d(planes) self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False) self.bn3 = nn.BatchNorm2d(self.expansion*planes) self.shortcut = nn.Sequential() if stride != 1 or in_planes != self.expansion*planes: self.shortcut = nn.Sequential( nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False), nn.BatchNorm2d(self.expansion*planes) ) def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = F.relu(self.bn2(self.conv2(out))) out = self.bn3(self.conv3(out)) out += self.shortcut(x) out = F.relu(out) return out class ResNet(nn.Module): def __init__(self, block, num_blocks, num_classes=10): super(ResNet, self).__init__() 
self.in_planes = 64 self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) self.bn1 = nn.BatchNorm2d(64) self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) self.linear = nn.Linear(512*block.expansion, num_classes) def _make_layer(self, block, planes, num_blocks, stride): strides = [stride] + [1]*(num_blocks-1) layers = [] for stride in strides: layers.append(block(self.in_planes, planes, stride)) self.in_planes = planes * block.expansion return nn.Sequential(*layers) def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = self.layer1(out) out = self.layer2(out) out = self.layer3(out) out = self.layer4(out) out = F.avg_pool2d(out, 4) out = out.view(out.size(0), -1) out = self.linear(out) return out def ResNet18(Num_classes=10): return ResNet(BasicBlock, [2,2,2,2],num_classes=Num_classes) def ResNet34(Num_classes=10): return ResNet(BasicBlock, [3,4,6,3],num_classes=Num_classes) def ResNet50(Num_classes=10): return ResNet(Bottleneck, [3,4,6,3],num_classes=Num_classes) def ResNet101(Num_classes=10): return ResNet(Bottleneck, [3,4,23,3],num_classes=Num_classes) def ResNet152(Num_classes=10): return ResNet(Bottleneck, [3,8,36,3],num_classes=Num_classes) def test(): net = ResNet18() y = net(torch.randn(1,3,32,32)) print(y.size()) # test() ================================================ FILE: GC_code/CIFAR100/models/resnext.py ================================================ '''ResNeXt in PyTorch. See the paper "Aggregated Residual Transformations for Deep Neural Networks" for more details. 
''' import torch import torch.nn as nn import torch.nn.functional as F class Block(nn.Module): '''Grouped convolution block.''' expansion = 2 def __init__(self, in_planes, cardinality=32, bottleneck_width=4, stride=1): super(Block, self).__init__() group_width = cardinality * bottleneck_width self.conv1 = nn.Conv2d(in_planes, group_width, kernel_size=1, bias=False) self.bn1 = nn.BatchNorm2d(group_width) self.conv2 = nn.Conv2d(group_width, group_width, kernel_size=3, stride=stride, padding=1, groups=cardinality, bias=False) self.bn2 = nn.BatchNorm2d(group_width) self.conv3 = nn.Conv2d(group_width, self.expansion*group_width, kernel_size=1, bias=False) self.bn3 = nn.BatchNorm2d(self.expansion*group_width) self.shortcut = nn.Sequential() if stride != 1 or in_planes != self.expansion*group_width: self.shortcut = nn.Sequential( nn.Conv2d(in_planes, self.expansion*group_width, kernel_size=1, stride=stride, bias=False), nn.BatchNorm2d(self.expansion*group_width) ) def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = F.relu(self.bn2(self.conv2(out))) out = self.bn3(self.conv3(out)) out += self.shortcut(x) out = F.relu(out) return out class ResNeXt(nn.Module): def __init__(self, num_blocks, cardinality, bottleneck_width, num_classes=10): super(ResNeXt, self).__init__() self.cardinality = cardinality self.bottleneck_width = bottleneck_width self.in_planes = 64 self.conv1 = nn.Conv2d(3, 64, kernel_size=1, bias=False) self.bn1 = nn.BatchNorm2d(64) self.layer1 = self._make_layer(num_blocks[0], 1) self.layer2 = self._make_layer(num_blocks[1], 2) self.layer3 = self._make_layer(num_blocks[2], 2) # self.layer4 = self._make_layer(num_blocks[3], 2) self.linear = nn.Linear(cardinality*bottleneck_width*8, num_classes) def _make_layer(self, num_blocks, stride): strides = [stride] + [1]*(num_blocks-1) layers = [] for stride in strides: layers.append(Block(self.in_planes, self.cardinality, self.bottleneck_width, stride)) self.in_planes = Block.expansion * self.cardinality * 
self.bottleneck_width # Increase bottleneck_width by 2 after each stage. self.bottleneck_width *= 2 return nn.Sequential(*layers) def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = self.layer1(out) out = self.layer2(out) out = self.layer3(out) # out = self.layer4(out) out = F.avg_pool2d(out, 8) out = out.view(out.size(0), -1) out = self.linear(out) return out def ResNeXt29_2x64d(Num_classes=10): return ResNeXt(num_blocks=[3,3,3], cardinality=2, bottleneck_width=64,num_classes=Num_classes) def ResNeXt29_4x64d(Num_classes=10): return ResNeXt(num_blocks=[3,3,3], cardinality=4, bottleneck_width=64,num_classes=Num_classes) def ResNeXt29_8x64d(Num_classes=10): return ResNeXt(num_blocks=[3,3,3], cardinality=8, bottleneck_width=64,num_classes=Num_classes) def ResNeXt29_32x4d(Num_classes=10): return ResNeXt(num_blocks=[3,3,3], cardinality=32, bottleneck_width=4,num_classes=Num_classes) def test_resnext(): net = ResNeXt29_2x64d() x = torch.randn(1,3,32,32) y = net(x) print(y.size()) # test_resnext() ================================================ FILE: GC_code/CIFAR100/models/senet.py ================================================ '''SENet in PyTorch. SENet is the winner of ImageNet-2017. The paper is not released yet. 
''' import torch import torch.nn as nn import torch.nn.functional as F class BasicBlock(nn.Module): def __init__(self, in_planes, planes, stride=1): super(BasicBlock, self).__init__() self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) self.bn1 = nn.BatchNorm2d(planes) self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) self.bn2 = nn.BatchNorm2d(planes) self.shortcut = nn.Sequential() if stride != 1 or in_planes != planes: self.shortcut = nn.Sequential( nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False), nn.BatchNorm2d(planes) ) # SE layers self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1) # Use nn.Conv2d instead of nn.Linear self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1) def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = self.bn2(self.conv2(out)) # Squeeze w = F.avg_pool2d(out, out.size(2)) w = F.relu(self.fc1(w)) w = F.sigmoid(self.fc2(w)) # Excitation out = out * w # New broadcasting feature from v0.2! 
out += self.shortcut(x) out = F.relu(out) return out class PreActBlock(nn.Module): def __init__(self, in_planes, planes, stride=1): super(PreActBlock, self).__init__() self.bn1 = nn.BatchNorm2d(in_planes) self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) self.bn2 = nn.BatchNorm2d(planes) self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) if stride != 1 or in_planes != planes: self.shortcut = nn.Sequential( nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False) ) # SE layers self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1) self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1) def forward(self, x): out = F.relu(self.bn1(x)) shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x out = self.conv1(out) out = self.conv2(F.relu(self.bn2(out))) # Squeeze w = F.avg_pool2d(out, out.size(2)) w = F.relu(self.fc1(w)) w = F.sigmoid(self.fc2(w)) # Excitation out = out * w out += shortcut return out class SENet(nn.Module): def __init__(self, block, num_blocks, num_classes=10): super(SENet, self).__init__() self.in_planes = 64 self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) self.bn1 = nn.BatchNorm2d(64) self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) self.linear = nn.Linear(512, num_classes) def _make_layer(self, block, planes, num_blocks, stride): strides = [stride] + [1]*(num_blocks-1) layers = [] for stride in strides: layers.append(block(self.in_planes, planes, stride)) self.in_planes = planes return nn.Sequential(*layers) def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = self.layer1(out) out = self.layer2(out) out = self.layer3(out) out = self.layer4(out) out = F.avg_pool2d(out, 4) out = 
out.view(out.size(0), -1) out = self.linear(out) return out def SENet18(): return SENet(PreActBlock, [2,2,2,2]) def test(): net = SENet18() y = net(torch.randn(1,3,32,32)) print(y.size()) # test() ================================================ FILE: GC_code/CIFAR100/models/shufflenet.py ================================================ '''ShuffleNet in PyTorch. See the paper "ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices" for more details. ''' import torch import torch.nn as nn import torch.nn.functional as F class ShuffleBlock(nn.Module): def __init__(self, groups): super(ShuffleBlock, self).__init__() self.groups = groups def forward(self, x): '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]''' N,C,H,W = x.size() g = self.groups return x.view(N,g,C/g,H,W).permute(0,2,1,3,4).contiguous().view(N,C,H,W) class Bottleneck(nn.Module): def __init__(self, in_planes, out_planes, stride, groups): super(Bottleneck, self).__init__() self.stride = stride mid_planes = out_planes/4 g = 1 if in_planes==24 else groups self.conv1 = nn.Conv2d(in_planes, mid_planes, kernel_size=1, groups=g, bias=False) self.bn1 = nn.BatchNorm2d(mid_planes) self.shuffle1 = ShuffleBlock(groups=g) self.conv2 = nn.Conv2d(mid_planes, mid_planes, kernel_size=3, stride=stride, padding=1, groups=mid_planes, bias=False) self.bn2 = nn.BatchNorm2d(mid_planes) self.conv3 = nn.Conv2d(mid_planes, out_planes, kernel_size=1, groups=groups, bias=False) self.bn3 = nn.BatchNorm2d(out_planes) self.shortcut = nn.Sequential() if stride == 2: self.shortcut = nn.Sequential(nn.AvgPool2d(3, stride=2, padding=1)) def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = self.shuffle1(out) out = F.relu(self.bn2(self.conv2(out))) out = self.bn3(self.conv3(out)) res = self.shortcut(x) out = F.relu(torch.cat([out,res], 1)) if self.stride==2 else F.relu(out+res) return out class ShuffleNet(nn.Module): def __init__(self, cfg): super(ShuffleNet, 
self).__init__() out_planes = cfg['out_planes'] num_blocks = cfg['num_blocks'] groups = cfg['groups'] self.conv1 = nn.Conv2d(3, 24, kernel_size=1, bias=False) self.bn1 = nn.BatchNorm2d(24) self.in_planes = 24 self.layer1 = self._make_layer(out_planes[0], num_blocks[0], groups) self.layer2 = self._make_layer(out_planes[1], num_blocks[1], groups) self.layer3 = self._make_layer(out_planes[2], num_blocks[2], groups) self.linear = nn.Linear(out_planes[2], 10) def _make_layer(self, out_planes, num_blocks, groups): layers = [] for i in range(num_blocks): stride = 2 if i == 0 else 1 cat_planes = self.in_planes if i == 0 else 0 layers.append(Bottleneck(self.in_planes, out_planes-cat_planes, stride=stride, groups=groups)) self.in_planes = out_planes return nn.Sequential(*layers) def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = self.layer1(out) out = self.layer2(out) out = self.layer3(out) out = F.avg_pool2d(out, 4) out = out.view(out.size(0), -1) out = self.linear(out) return out def ShuffleNetG2(): cfg = { 'out_planes': [200,400,800], 'num_blocks': [4,8,4], 'groups': 2 } return ShuffleNet(cfg) def ShuffleNetG3(): cfg = { 'out_planes': [240,480,960], 'num_blocks': [4,8,4], 'groups': 3 } return ShuffleNet(cfg) def test(): net = ShuffleNetG2() x = torch.randn(1,3,32,32) y = net(x) print(y) # test() ================================================ FILE: GC_code/CIFAR100/models/vgg.py ================================================ '''VGG11/13/16/19 in Pytorch.''' import torch import torch.nn as nn cfg = { 'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], 'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], 'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'], 'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'], } class VGG(nn.Module): def __init__(self, vgg_name,Num_classes=100): super(VGG, 
self).__init__() self.features = self._make_layers(cfg[vgg_name]) self.classifier = nn.Linear(512, Num_classes) def forward(self, x): out = self.features(x) out = out.view(out.size(0), -1) out = self.classifier(out) return out def _make_layers(self, cfg): layers = [] in_channels = 3 for x in cfg: if x == 'M': layers += [nn.MaxPool2d(kernel_size=2, stride=2)] else: layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1), nn.BatchNorm2d(x), nn.ReLU(inplace=True)] in_channels = x layers += [nn.AvgPool2d(kernel_size=1, stride=1)] return nn.Sequential(*layers) def test(): net = VGG('VGG11') x = torch.randn(2,3,32,32) y = net(x) print(y.size()) # test() ================================================ FILE: GC_code/CIFAR100/os_run.py ================================================ import os,time #cifar100 sgd & sgdGCC os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgd --epochs 200 --model r50 > logout/r50_lr11_wd45_sgd.log ") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGC --epochs 200 --model r50 > logout/r50_lr11_wd45_sgdGC.log ") ================================================ FILE: GC_code/Fine-grained_classification/SGD.py ================================================ import torch from torch.optim.optimizer import Optimizer, required class SGD_GCC(Optimizer): def __init__(self, params, lr=required, momentum=0, dampening=0, weight_decay=0, nesterov=False): if lr is not required and lr < 0.0: raise ValueError("Invalid learning rate: {}".format(lr)) if momentum < 0.0: raise ValueError("Invalid momentum value: {}".format(momentum)) if weight_decay < 0.0: raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) defaults = dict(lr=lr, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=nesterov) if nesterov and (momentum <= 0 or dampening != 0): raise ValueError("Nesterov momentum requires a momentum and zero dampening") super(SGD_GCC, self).__init__(params, defaults) def __setstate__(self, 
state): super(SGD_GCC, self).__setstate__(state) for group in self.param_groups: group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                if weight_decay != 0:
                    # NOTE(review): old-style Tensor.add_(scalar, tensor) call --
                    # assumes a pre-1.5 PyTorch; newer releases expect
                    # add_(tensor, alpha=scalar). Verify the pinned torch version.
                    d_p.add_(weight_decay, p.data)
                #GC operation for Conv layers
                # Gradient Centralization: for rank>3 tensors (conv kernels)
                # subtract the per-output-channel mean, i.e. the mean over every
                # dim except dim 0, so each filter's gradient has zero mean.
                if len(list(d_p.size()))>3:
                    d_p.add_(-d_p.mean(dim = tuple(range(1,len(list(d_p.size())))), keepdim = True))
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: seed the buffer with the (centralized) gradient.
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(1 - dampening, d_p)
                    if nesterov:
                        d_p = d_p.add(momentum, buf)
                    else:
                        d_p = buf
                p.data.add_(-group['lr'], d_p)
        return loss


class SGD_GC(Optimizer):
    """SGD with Gradient Centralization applied to Conv AND FC layers
    (any parameter tensor of rank > 1), unlike SGD_GCC which is Conv-only."""

    def __init__(self, params, lr=required, momentum=0, dampening=0, weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGD_GC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.
# NOTE(review): this module only does `from torch.optim.optimizer import
# Optimizer` at its top, yet the classes below use the `required` sentinel.
# Import it explicitly here so the module does not raise NameError on import.
from torch.optim.optimizer import required


class SGDW(Optimizer):
    """SGD with *decoupled* weight decay (SGDW).

    Unlike vanilla SGD, the weight-decay term is not folded into the
    gradient; instead the pre-update weights are shrunk directly after the
    gradient step (cf. "Decoupled Weight Decay Regularization",
    Loshchilov & Hutter).

    Arguments:
        params (iterable): parameters to optimize or dicts defining groups
        lr (float): learning rate
        momentum (float, optional): momentum factor (default: 0)
        dampening (float, optional): dampening for momentum (default: 0)
        weight_decay (float, optional): decoupled weight decay (default: 0)
        nesterov (bool, optional): enables Nesterov momentum (default: False)
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGDW, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                # Snapshot of the weights before the update; the decoupled
                # weight decay below is applied to these old values.
                old = torch.clone(p.data).detach()

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: buffer starts at zero, so it ends equal
                        # to the raw gradient (no dampening applied yet).
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        # BUGFIX: the legacy add_(scalar, tensor) overload is
                        # deprecated/removed in current PyTorch; use the
                        # supported add_(tensor, alpha=scalar) spelling.
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])
                # Decoupled weight decay: shrink the pre-update weights.
                if weight_decay != 0:
                    p.data.add_(old, alpha=-weight_decay * group['lr'])

        return loss


class SGDW_GCC(Optimizer):
    """SGDW with Gradient Centralization applied to Conv layers only.

    Gradients of parameters with more than 3 dimensions (convolution
    kernels) are centralized by subtracting the per-output-channel mean
    before the momentum update; weight decay stays decoupled as in SGDW.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGDW_GCC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                old = torch.clone(p.data).detach()

                # GC operation for Conv layers: subtract the mean over every
                # dimension except dim 0 (the output-channel dimension).
                if len(list(d_p.size())) > 3:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, len(list(d_p.size())))), keepdim=True))

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        # BUGFIX: modern add_(tensor, alpha=scalar) form.
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])
                # Decoupled weight decay applied to the pre-update weights.
                if weight_decay != 0:
                    p.data.add_(old, alpha=-weight_decay * group['lr'])

        return loss
# --- Command-line interface (fine-grained classification training) ---------
# NOTE(review): the '-b/--batch-size' argument spans a chunk boundary in the
# extracted source; it is reproduced here in full.
parser.add_argument('-b', '--batch-size', default=256, type=int,
                    metavar='N',
                    help='mini-batch size (default: 256), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr', '--learning-rate', default=0.1*128/128, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50',
                    choices=model_names,
                    help='model architecture: ' +
                         ' | '.join(model_names) +
                         ' (default: resnet18)')
parser.add_argument('data', metavar='DIR', help='path to dataset')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                    help='number of data loading workers (default: 4)')
parser.add_argument('--epochs', default=100, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)',
                    dest='weight_decay')
parser.add_argument('-p', '--print-freq', default=100, type=int,
                    metavar='N', help='print frequency (default: 10)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
                    help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')
parser.add_argument('--model', default='r50p', type=str, help='model')
parser.add_argument('--path', default='test', type=str, help='model')
parser.add_argument('--alg', default='sgd', type=str, help='algorithm')
parser.add_argument('--dataset', default='cub', type=str, help='model')

# Best top-1 validation accuracy seen so far; updated in main_worker().
best_acc1 = 0


def main():
    """Entry point: parse CLI args, then launch single- or multi-process training."""
    args = parser.parse_args()
    # Restrict this script to the first four visible GPUs.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    # Distributed mode when either several nodes are used or one process per
    # GPU is requested.
    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)
def main_worker(gpu, ngpus_per_node, args):
    """Build the model, optimizers and data loaders, then run the epoch loop.

    Arguments:
        gpu: GPU index for this process (None -> use all visible GPUs).
        ngpus_per_node: number of GPUs on this node.
        args: parsed command-line arguments.
    """
    global best_acc1
    args.gpu = gpu
    # Number of classes for each supported fine-grained dataset.
    class_num = {'cub': 200, 'cars': 196, 'dogs': 120, 'fgvc': 100}

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    # create model, replacing the 1000-way ImageNet head with a dataset head
    if args.model == 'r18p':
        model = models.resnet18(pretrained=True)
        # BUGFIX: resnet18's final feature dimension is 512 (2048 belongs to
        # resnet50); in_features=2048 would fail at the first forward pass.
        model.fc = nn.Linear(in_features=512, out_features=class_num[args.dataset], bias=True)
    if args.model == 'r18':
        model = models.resnet18()
        model.fc = nn.Linear(in_features=512, out_features=class_num[args.dataset], bias=True)
    if args.model == 'r50p':
        model = models.resnet50(pretrained=True)
        model.fc = nn.Linear(in_features=2048, out_features=class_num[args.dataset], bias=True)
    if args.model == 'r50':
        model = models.resnet50()
        model.fc = nn.Linear(in_features=2048, out_features=class_num[args.dataset], bias=True)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model)
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion)
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    # Two parameter groups: the backbone gets a 0.1 lr multiplier, the fresh
    # classifier head a multiplier of 1.0.
    if args.model == 'r50p' or args.model == 'r50':
        new_param_ids = set(map(id, model.module.fc.parameters()))
        base_params = [p for p in model.parameters() if id(p) not in new_param_ids]
        param_groups_base = [{'params': base_params, 'lr_mult': 0.1}]
    if args.model == 'r50p' or args.model == 'r50':
        param_groups_new = [{'params': model.module.fc.parameters(), 'lr_mult': 1.0}]

    # choose optimizer: plain SGD or SGD with Gradient Centralization
    if args.alg == 'sgd':
        optimizer_base = torch.optim.SGD(param_groups_base, args.lr,
                                         momentum=args.momentum, weight_decay=args.weight_decay)
        optimizer_new = torch.optim.SGD(param_groups_new, args.lr,
                                        momentum=args.momentum, weight_decay=args.weight_decay)
    if args.alg == 'sgdGC':
        optimizer_base = SGD_GC(param_groups_base, args.lr,
                                momentum=args.momentum, weight_decay=args.weight_decay)
        optimizer_new = SGD_GC(param_groups_new, args.lr,
                               momentum=args.momentum, weight_decay=args.weight_decay)

    exp_lr_scheduler_new = lr_scheduler.MultiStepLR(optimizer_new, milestones=[50, 80], gamma=0.1)
    exp_lr_scheduler_base = lr_scheduler.MultiStepLR(optimizer_base, milestones=[50, 80], gamma=0.1)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            # BUGFIX: the original called optimizer.load_state_dict(...), but
            # no variable named `optimizer` exists in this function (it uses
            # optimizer_base/optimizer_new), so resuming raised a NameError.
            # The checkpoints written below store no optimizer state, so
            # restore it only when the keys are actually present.
            if 'optimizer_base' in checkpoint:
                optimizer_base.load_state_dict(checkpoint['optimizer_base'])
            if 'optimizer_new' in checkpoint:
                optimizer_new.load_state_dict(checkpoint['optimizer_new'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.Resize(512),
            transforms.RandomHorizontalFlip(),
            transforms.CenterCrop(448),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(512),
            transforms.CenterCrop(448),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True, drop_last=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch, then advance both LR schedules
        train(train_loader, model, criterion, optimizer_base, optimizer_new, epoch, args)
        exp_lr_scheduler_new.step()
        exp_lr_scheduler_base.step()

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                    and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
            }, is_best)


# train
def train(train_loader, model, criterion, optimizer_base, optimizer_new, epoch, args):
    """Run one training epoch, stepping both optimizers on every batch."""
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    total = 0
    train_loss = 0
    correct = 0

    # switch to train mode
    model.train()
    print('\nEpoch: %d' % epoch)
    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        input, target = input.to('cuda'), target.to('cuda')

        # compute output
        output = model(input)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), input.size(0))
        top1.update(acc1[0], input.size(0))
        top5.update(acc5[0], input.size(0))
        _, predicted = output.max(1)
        correct += predicted.eq(target).sum().item()
        train_loss += loss.item()
        total += target.size(0)

        # compute gradient and do SGD step with both parameter groups
        optimizer_new.zero_grad()
        optimizer_base.zero_grad()
        loss.backward()
        optimizer_new.step()
        optimizer_base.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
    print('Training: Top1: {top1.avg:.4f}|Top5:{top5.avg:.4f}'.format(top1=top1, top5=top5))
# test
def validate(val_loader, model, criterion, args):
    """Evaluate `model` on `val_loader`; returns the average top-1 accuracy."""
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    val_loss = 0
    total = 0
    correct = 0

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (input, target) in enumerate(val_loader):
            if args.gpu is not None:
                input = input.cuda(args.gpu, non_blocking=True)
                target = target.cuda(args.gpu, non_blocking=True)

            # compute output
            output = model(input)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), input.size(0))
            top1.update(acc1[0], input.size(0))
            top5.update(acc5[0], input.size(0))
            _, predicted = output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()
            # BUGFIX: the original accumulated loss.item() into val_loss a
            # second, redundant time after the timing update; val_loss is
            # purely informational, so count each batch exactly once.
            val_loss += loss.item()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

    print('Testing: Top1: {top1.avg:.4f}|Top5:{top5.avg:.4f}'.format(top1=top1, top5=top5))

    return top1.avg


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Serialize `state` to `filename`; copy to model_best.pth.tar if best."""
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')


class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        # val: most recent value; sum/count/avg: running statistics
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        # `n` is the number of samples `val` was averaged over (batch size).
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def adjust_learning_rate(optimizer, epoch, args):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
    lr = args.lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
"""Computes the accuracy over the k top predictions for the specified values of k""" with torch.no_grad(): maxk = max(topk) batch_size = target.size(0) _, pred = output.topk(maxk, 1, True, True) pred = pred.t() correct = pred.eq(target.view(1, -1).expand_as(pred)) res = [] for k in topk: correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) res.append(correct_k.mul_(1.0 / batch_size)) return res if __name__ == '__main__': main() ================================================ FILE: GC_code/Fine-grained_classification/os_run.py ================================================ import os,time os.system("nohup python -W ignore main.py /home/yonghw/data/data/CUB_200_2011/ --model r50p -b 128 --alg sgd --dataset cub > logout/Cub_r50p_sgd_b128_g4.log ") os.system("nohup python -W ignore main.py /home/yonghw/data/data/CUB_200_2011/ --model r50p -b 128 --alg sgdGC --dataset cub > logout/Cub_r50p_sgdGC_b128_g4.log ") os.system("nohup python -W ignore main.py /home/yonghw/data/data/Car196/ --model r50p -b 128 --alg sgd --dataset cars > logout/Car_r50p_sgd_b128_g4.log ") os.system("nohup python -W ignore main.py /home/yonghw/data/data/Car196/ --model r50p -b 128 --alg sgdGC --dataset cars> logout/Car_r50p_sgdGC_b128_g4.log ") os.system("nohup python -W ignore main.py /home/yonghw/data/data/fgvc_aricraft/ --model r50p -b 128 --alg sgd --dataset fgvc > logout/Ari_r50p_sgd_b128_g4.log ") os.system("nohup python -W ignore main.py /home/yonghw/data/data/fgvc_aricraft/ --model r50p -b 128 --alg sgdGC --dataset fgvc > logout/Ari_r50p_sgdGC_b128_g4.log ") os.system("nohup python -W ignore main.py /home/yonghw/data/data/StanfordDogs/ --model r50p -b 128 --alg sgd --dataset dogs > logout/Dog_r50p_sgd_b128_g4.log ") os.system("nohup python -W ignore main.py /home/yonghw/data/data/StanfordDogs/ --model r50p -b 128 --alg sgdGC --dataset dogs > logout/Dog_r50p_sgdGC_b128_g4.log ") ================================================ FILE: GC_code/ImageNet/SGD.py 
class SGD_GCC(Optimizer):
    """SGD with momentum, applying Gradient Centralization to Conv layers only.

    Gradients of parameters with more than 3 dimensions (convolution
    kernels) are centralized by subtracting the per-output-channel mean
    before the weight-decay/momentum update ("Gradient Centralization",
    Yong et al., 2020).

    Arguments:
        params (iterable): parameters to optimize or dicts defining groups
        lr (float): learning rate
        momentum (float, optional): momentum factor (default: 0)
        dampening (float, optional): dampening for momentum (default: 0)
        weight_decay (float, optional): coupled L2 penalty (default: 0)
        nesterov (bool, optional): enables Nesterov momentum (default: False)
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGD_GCC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                # BUGFIX: the legacy add_(scalar, tensor) overload is
                # deprecated/removed in current PyTorch; use
                # add_(tensor, alpha=scalar) throughout.
                if weight_decay != 0:
                    d_p.add_(p.data, alpha=weight_decay)

                # GC operation for Conv layers: subtract the mean over every
                # dimension except dim 0 (the output-channel dimension).
                if len(list(d_p.size())) > 3:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, len(list(d_p.size())))), keepdim=True))

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.data.add_(d_p, alpha=-group['lr'])

        return loss
""" loss = None if closure is not None: loss = closure() for group in self.param_groups: weight_decay = group['weight_decay'] momentum = group['momentum'] dampening = group['dampening'] nesterov = group['nesterov'] for p in group['params']: if p.grad is None: continue d_p = p.grad.data if weight_decay != 0: d_p.add_(weight_decay, p.data) #GC operation for Conv layers and FC layers if len(list(d_p.size()))>1: d_p.add_(-d_p.mean(dim = tuple(range(1,len(list(d_p.size())))), keepdim = True)) if momentum != 0: param_state = self.state[p] if 'momentum_buffer' not in param_state: buf = param_state['momentum_buffer'] = torch.clone(d_p).detach() else: buf = param_state['momentum_buffer'] buf.mul_(momentum).add_(1 - dampening, d_p) if nesterov: d_p = d_p.add(momentum, buf) else: d_p = buf p.data.add_(-group['lr'], d_p) return loss class SGDW(Optimizer): def __init__(self, params, lr=required, momentum=0, dampening=0, weight_decay=0, nesterov=False): if lr is not required and lr < 0.0: raise ValueError("Invalid learning rate: {}".format(lr)) if momentum < 0.0: raise ValueError("Invalid momentum value: {}".format(momentum)) if weight_decay < 0.0: raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) defaults = dict(lr=lr, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=nesterov) if nesterov and (momentum <= 0 or dampening != 0): raise ValueError("Nesterov momentum requires a momentum and zero dampening") super(SGDW, self).__init__(params, defaults) def __setstate__(self, state): super(SGDW, self).__setstate__(state) for group in self.param_groups: group.setdefault('nesterov', False) def step(self, closure=None): """Performs a single optimization step. Arguments: closure (callable, optional): A closure that reevaluates the model and returns the loss. 
""" loss = None if closure is not None: loss = closure() for group in self.param_groups: weight_decay = group['weight_decay'] momentum = group['momentum'] dampening = group['dampening'] nesterov = group['nesterov'] for p in group['params']: if p.grad is None: continue d_p = p.grad.data old = torch.clone(p.data).detach() #if weight_decay != 0: # d_p.add_(weight_decay, p.data) if momentum != 0: param_state = self.state[p] if 'momentum_buffer' not in param_state: buf = param_state['momentum_buffer'] = torch.zeros_like(p.data) buf.mul_(momentum).add_(d_p) else: buf = param_state['momentum_buffer'] buf.mul_(momentum).add_(1 - dampening, d_p) if nesterov: d_p = d_p.add(momentum, buf) else: d_p = buf p.data.add_(-group['lr'], d_p) if weight_decay != 0: p.data.add_(-weight_decay*group['lr'], old) return loss class SGDW_GCC(Optimizer): def __init__(self, params, lr=required, momentum=0, dampening=0, weight_decay=0, nesterov=False): if lr is not required and lr < 0.0: raise ValueError("Invalid learning rate: {}".format(lr)) if momentum < 0.0: raise ValueError("Invalid momentum value: {}".format(momentum)) if weight_decay < 0.0: raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) defaults = dict(lr=lr, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=nesterov) if nesterov and (momentum <= 0 or dampening != 0): raise ValueError("Nesterov momentum requires a momentum and zero dampening") super(SGDW_GCC, self).__init__(params, defaults) def __setstate__(self, state): super(SGDW_GCC, self).__setstate__(state) for group in self.param_groups: group.setdefault('nesterov', False) def step(self, closure=None): """Performs a single optimization step. Arguments: closure (callable, optional): A closure that reevaluates the model and returns the loss. 
""" loss = None if closure is not None: loss = closure() for group in self.param_groups: weight_decay = group['weight_decay'] momentum = group['momentum'] dampening = group['dampening'] nesterov = group['nesterov'] for p in group['params']: if p.grad is None: continue d_p = p.grad.data old = torch.clone(p.data).detach() #if weight_decay != 0: # d_p.add_(weight_decay, p.data) #GC operation for Conv layers if len(list(d_p.size()))>3: d_p.add_(-d_p.mean(dim = tuple(range(1,len(list(d_p.size())))), keepdim = True)) if momentum != 0: param_state = self.state[p] if 'momentum_buffer' not in param_state: buf = param_state['momentum_buffer'] = torch.zeros_like(p.data) buf.mul_(momentum).add_(d_p) else: buf = param_state['momentum_buffer'] buf.mul_(momentum).add_(1 - dampening, d_p) if nesterov: d_p = d_p.add(momentum, buf) else: d_p = buf p.data.add_(-group['lr'], d_p) if weight_decay != 0: p.data.add_(-weight_decay*group['lr'], old) return loss ================================================ FILE: GC_code/ImageNet/main.py ================================================ import argparse import os import random import shutil import time import warnings import sys #nohup python -W ignore main.py /mnt/v0/ --model r50bn --alg sgd1 -b 256 --gpug 1 --path r50bn_sgd1_b256_g4 > logout/r50bn_sgd1_b256_g4.log import torch import torch.nn as nn import torch.nn.parallel import torch.backends.cudnn as cudnn import torch.distributed as dist import torch.optim import torch.multiprocessing as mp import torch.utils.data import torch.utils.data.distributed import torchvision.transforms as transforms import torchvision.datasets as datasets import torchvision.models as models #from myresnet_nbn import resnet18_nbn, resnet101_nbn,resnet50_nbn from myresnet import resnet50, resnet101 from myresnetgn import resnet50gn, resnet101gn from torch.optim import lr_scheduler from SGD import SGD_GCC #import SGD with GC for Conv layer model_names = sorted(name for name in models.__dict__ if name.islower() 
# --- Command-line interface (ImageNet training) ----------------------------
# NOTE(review): the sorted(...) building model_names spans a chunk boundary in
# the extracted source; it is reproduced here in full.
model_names = sorted(name for name in models.__dict__
                     if name.islower() and not name.startswith("__")
                     and callable(models.__dict__[name]))

parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
parser.add_argument('-b', '--batch-size', default=256, type=int,
                    metavar='N',
                    help='mini-batch size (default: 256), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr', '--learning-rate', default=0.1*128/128, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50',
                    choices=model_names,
                    help='model architecture: ' +
                         ' | '.join(model_names) +
                         ' (default: resnet18)')
parser.add_argument('data', metavar='DIR', help='path to dataset')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                    help='number of data loading workers (default: 4)')
parser.add_argument('--epochs', default=100, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('--bgn', default=1, type=int, help='bn group number')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)',
                    dest='weight_decay')
parser.add_argument('-p', '--print-freq', default=100, type=int,
                    metavar='N', help='print frequency (default: 10)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
                    help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')
parser.add_argument('--model', default='r50bn', type=str, help='model')
parser.add_argument('--path', default='test', type=str, help='model')
parser.add_argument('--alg', default='sgd', type=str, help='algorithm')

# Best top-1 validation accuracy seen so far; updated in main_worker().
best_acc1 = 0
# Candidate GPU ids for this node (up to 8 devices).
device_ids = [0, 1, 2, 3, 4, 5, 6, 7]


def main():
    """Entry point: parse CLI args, then launch single- or multi-process training."""
    args = parser.parse_args()
    # Restrict this script to the first four visible GPUs.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    # Distributed mode when either several nodes are used or one process per
    # GPU is requested.
    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)
def main_worker(gpu, ngpus_per_node, args):
    """Worker process: build the model, optimizer and data loaders, then run
    the train/validate loop for one (possibly distributed) process.

    Arguments:
        gpu: GPU index for this process (None -> use all visible GPUs).
        ngpus_per_node: number of GPUs on this node.
        args: parsed command-line arguments.
    """
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    # create model: ResNet-50/101 with BatchNorm ('bn') or GroupNorm ('gn')
    if args.model == 'r50bn':
        model = resnet50()
    if args.model == 'r50gn':
        model = resnet50gn()
    if args.model == 'r101bn':
        model = resnet101()
    if args.model == 'r101gn':
        model = resnet101gn()

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model)
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    # choose optimizer: plain SGD or SGD with Gradient Centralization (conv)
    if args.alg == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                    momentum=args.momentum, weight_decay=args.weight_decay)
    if args.alg == 'sgdGC':
        optimizer = SGD_GCC(model.parameters(), args.lr,
                            momentum=args.momentum, weight_decay=args.weight_decay)

    # Step decay: multiply the LR by 0.1 every 30 epochs.
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True, drop_last=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch, then advance the LR schedule
        train(train_loader, model, criterion, optimizer, epoch, args)
        exp_lr_scheduler.step()

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                    and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }, is_best)
            # NOTE(review): indentation at this point is ambiguous in the
            # extracted source; kept inside the rank-0 guard so only one
            # process writes the model file -- confirm against the original.
            torch.save(model.module, './result_model/' + args.path + '.pth')
total = 0 train_loss = 0 correct = 0 # switch to train mode model.train() print('\nEpoch: %d' % epoch) end = time.time() for i, (input, target) in enumerate(train_loader): # measure data loading time data_time.update(time.time() - end) #if args.gpu is not None: #input = input.cuda(args.gpu, non_blocking=True) #target = target.cuda(args.gpu, non_blocking=True) input, target = input.to('cuda'), target.to('cuda') # compute output output = model(input) loss = criterion(output, target) # measure accuracy and record loss acc1, acc5 = accuracy(output, target, topk=(1, 5)) losses.update(loss.item(), input.size(0)) top1.update(acc1[0], input.size(0)) top5.update(acc5[0], input.size(0)) _, predicted = output.max(1) correct += predicted.eq(target).sum().item() train_loss += loss.item() #correct +=acc1[0] total += target.size(0) # compute gradient and do SGD step optimizer.zero_grad() loss.backward() optimizer.step() # measure elapsed time batch_time.update(time.time() - end) end = time.time() print('Training: Top1: {top1.avg:.4f}|Top5:{top5.avg:.4f}'.format(top1=top1, top5=top5)) #print('Training: Loss: {:.3f} | Acc: {:.3f}'.format(train_loss/(i+1),correct/total)) # validate def validate(val_loader, model, criterion, args): batch_time = AverageMeter() losses = AverageMeter() top1 = AverageMeter() top5 = AverageMeter() val_loss = 0 total = 0 correct = 0 # switch to evaluate mode model.eval() with torch.no_grad(): end = time.time() for i, (input, target) in enumerate(val_loader): if args.gpu is not None: input = input.cuda(args.gpu, non_blocking=True) target = target.cuda(args.gpu, non_blocking=True) # compute output output = model(input) loss = criterion(output, target) # measure accuracy and record loss acc1, acc5 = accuracy(output, target, topk=(1, 5)) losses.update(loss.item(), input.size(0)) top1.update(acc1[0], input.size(0)) top5.update(acc5[0], input.size(0)) _, predicted = output.max(1) total += target.size(0) correct += predicted.eq(target).sum().item() val_loss 
+=loss.item() # measure elapsed time batch_time.update(time.time() - end) end = time.time() val_loss += loss.item() print('Testing: Top1: {top1.avg:.4f}|Top5:{top5.avg:.4f}'.format(top1=top1, top5=top5)) return top1.avg def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): torch.save(state, filename) if is_best: shutil.copyfile(filename, 'model_best.pth.tar') class AverageMeter(object): """Computes and stores the average and current value""" def __init__(self): self.reset() def reset(self): self.val = 0 self.avg = 0 self.sum = 0 self.count = 0 def update(self, val, n=1): self.val = val self.sum += val * n self.count += n self.avg = self.sum / self.count def adjust_learning_rate(optimizer, epoch, args): """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" lr = args.lr * (0.1 ** (epoch // 30)) for param_group in optimizer.param_groups: param_group['lr'] = lr def accuracy(output, target, topk=(1,)): """Computes the accuracy over the k top predictions for the specified values of k""" with torch.no_grad(): maxk = max(topk) batch_size = target.size(0) _, pred = output.topk(maxk, 1, True, True) pred = pred.t() correct = pred.eq(target.view(1, -1).expand_as(pred)) res = [] for k in topk: correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) res.append(correct_k.mul_(1.0 / batch_size)) return res if __name__ == '__main__': main() ================================================ FILE: GC_code/ImageNet/myresnet.py ================================================ from __future__ import print_function, division, absolute_import import torch.nn as nn import math import torch.utils.model_zoo as model_zoo __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152'] model_urls = { 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', 'resnet101': 
# --- GC_code/ImageNet/myresnet.py (continued): BatchNorm-based ResNet used for the GC ImageNet experiments ---
# NOTE(review): every conv here is created with bias=True even though each is immediately followed by BatchNorm2d;
# the bias is redundant with BN's learned shift — presumably deliberate for the gradient-centralization study
# (GC also centralizes bias-carrying layers differently). Confirm intent before "fixing" to bias=False.
'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', } def conv3x3(in_planes, out_planes, stride=1): "3x3 convolution with padding" return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=True) class BasicBlock(nn.Module): expansion = 1 def __init__(self, inplanes, planes, stride=1, downsample=None): super(BasicBlock, self).__init__() self.conv1 = conv3x3(inplanes, planes, stride) self.bn1 = nn.BatchNorm2d(planes) self.relu = nn.ReLU(inplace=True) self.conv2 = conv3x3(planes, planes) self.bn2 = nn.BatchNorm2d(planes) self.downsample = downsample self.stride = stride def forward(self, x): residual = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) if self.downsample is not None: residual = self.downsample(x) out += residual out = self.relu(out) return out class Bottleneck(nn.Module): expansion = 4 def __init__(self, inplanes, planes, stride=1, downsample=None): super(Bottleneck, self).__init__() self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=True) self.bn1 = nn.BatchNorm2d(planes) self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=True) self.bn2 = nn.BatchNorm2d(planes) self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=True) self.bn3 = nn.BatchNorm2d(planes * 4) self.relu = nn.ReLU(inplace=True) self.downsample = downsample self.stride = stride def forward(self, x): residual = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) out = self.relu(out) out = self.conv3(out) out = self.bn3(out) if self.downsample is not None: residual = self.downsample(x) out += residual out = self.relu(out) return out #from torch.legacy import nn as nnl class ResNet(nn.Module): def __init__(self, block, layers, num_classes=1000): self.inplanes = 64 super(ResNet, self).__init__() self.conv1 = nn.Conv2d(3, 64, 
# ResNet.__init__ continues: standard 7x7 stem + 4 stages, He-style normal init for convs, BN weight=1/bias=0.
# forward() also stashes self.conv1_input (a clone of the stem conv output) — looks like a debugging/analysis hook;
# it retains an activation tensor on the module after every forward, so memory cost is nonzero. Verify it is still needed.
kernel_size=7, stride=2, padding=3, bias=True) #self.conv1 = nnl.SpatialConvolution(3, 64, 7, 7, 2, 2, 3, 3) self.bn1 = nn.BatchNorm2d(64) self.relu = nn.ReLU(inplace=True) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) self.layer1 = self._make_layer(block, 64, layers[0]) self.layer2 = self._make_layer(block, 128, layers[1], stride=2) self.layer3 = self._make_layer(block, 256, layers[2], stride=2) self.layer4 = self._make_layer(block, 512, layers[3], stride=2) self.avgpool = nn.AvgPool2d(7) self.fc = nn.Linear(512 * block.expansion, num_classes) for m in self.modules(): if isinstance(m, nn.Conv2d): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels m.weight.data.normal_(0, math.sqrt(2. / n)) elif isinstance(m, nn.BatchNorm2d): m.weight.data.fill_(1) m.bias.data.zero_() def _make_layer(self, block, planes, blocks, stride=1): downsample = None if stride != 1 or self.inplanes != planes * block.expansion: downsample = nn.Sequential( nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=True), nn.BatchNorm2d(planes * block.expansion), ) layers = [] layers.append(block(self.inplanes, planes, stride, downsample)) self.inplanes = planes * block.expansion for i in range(1, blocks): layers.append(block(self.inplanes, planes)) return nn.Sequential(*layers) def forward(self, x): x = self.conv1(x) self.conv1_input = x.clone() x = self.bn1(x) x = self.relu(x) x = self.maxpool(x) x = self.layer1(x) x = self.layer2(x) x = self.layer3(x) x = self.layer4(x) x = self.avgpool(x) x = x.view(x.size(0), -1) x = self.fc(x) return x def resnet18(pretrained=False, **kwargs): """Constructs a ResNet-18 model. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet """ model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) if pretrained: model.load_state_dict(model_zoo.load_url(model_urls['resnet18'])) return model def resnet34(pretrained=False, **kwargs): """Constructs a ResNet-34 model. 
# Constructor helpers resnet50/101/152 build Bottleneck variants, optionally loading model-zoo weights.
# NOTE(review): test() references Variable, but torch.autograd.Variable (and torch itself) is never imported in this
# file, so calling test() would raise NameError — it is only invoked via the commented-out #test() line.
Args: pretrained (bool): If True, returns a model pre-trained on ImageNet """ model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs) if pretrained: model.load_state_dict(model_zoo.load_url(model_urls['resnet34'])) return model def resnet50(pretrained=False, **kwargs): """Constructs a ResNet-50 model. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet """ model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) if pretrained: model.load_state_dict(model_zoo.load_url(model_urls['resnet50'])) return model def resnet101(pretrained=False, **kwargs): """Constructs a ResNet-101 model. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet """ model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) if pretrained: model.load_state_dict(model_zoo.load_url(model_urls['resnet101'])) return model def resnet152(pretrained=False, **kwargs): """Constructs a ResNet-152 model. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet """ model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) if pretrained: model.load_state_dict(model_zoo.load_url(model_urls['resnet152'])) return model def test(): net = resnet18() net.eval() x=Variable(torch.randn(2,3,224,224)) y = net(x) print(y.size()) print(net) #test() ================================================ FILE: GC_code/ImageNet/myresnetgn.py ================================================ from __future__ import print_function, division, absolute_import import torch.nn as nn import math import torch.utils.model_zoo as model_zoo __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152'] model_urls = { 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', } def 
# --- GC_code/ImageNet/myresnetgn.py (continued): identical ResNet topology, but every BatchNorm2d is replaced by
# nn.GroupNorm(32, C) — the GroupNorm experiment arm of the gradient-centralization comparison. ---
conv3x3(in_planes, out_planes, stride=1): "3x3 convolution with padding" return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=True) class BasicBlock(nn.Module): expansion = 1 def __init__(self, inplanes, planes, stride=1, downsample=None): super(BasicBlock, self).__init__() self.conv1 = conv3x3(inplanes, planes, stride) self.bn1 = nn.GroupNorm(32,planes) self.relu = nn.ReLU(inplace=True) self.conv2 = conv3x3(planes, planes) self.bn2 = nn.GroupNorm(32,planes) self.downsample = downsample self.stride = stride def forward(self, x): residual = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) if self.downsample is not None: residual = self.downsample(x) out += residual out = self.relu(out) return out class Bottleneck(nn.Module): expansion = 4 def __init__(self, inplanes, planes, stride=1, downsample=None): super(Bottleneck, self).__init__() self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=True) self.bn1 = nn.GroupNorm(32,planes) self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=True) self.bn2 = nn.GroupNorm(32,planes) self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=True) self.bn3 = nn.GroupNorm(32,planes * 4) self.relu = nn.ReLU(inplace=True) self.downsample = downsample self.stride = stride def forward(self, x): residual = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) out = self.relu(out) out = self.conv3(out) out = self.bn3(out) if self.downsample is not None: residual = self.downsample(x) out += residual out = self.relu(out) return out #from torch.legacy import nn as nnl class ResNet(nn.Module): def __init__(self, block, layers, num_classes=1000): self.inplanes = 64 super(ResNet, self).__init__() self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=True) #self.conv1 = nnl.SpatialConvolution(3, 64, 7, 7, 2, 2, 3, 3) self.bn1 = 
# GN stem + 4 stages; init: He-normal conv weights, GroupNorm weight=1 / bias=0; forward() keeps the same
# self.conv1_input debug clone as the BN variant.
nn.GroupNorm(32,64) self.relu = nn.ReLU(inplace=True) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) self.layer1 = self._make_layer(block, 64, layers[0]) self.layer2 = self._make_layer(block, 128, layers[1], stride=2) self.layer3 = self._make_layer(block, 256, layers[2], stride=2) self.layer4 = self._make_layer(block, 512, layers[3], stride=2) self.avgpool = nn.AvgPool2d(7) self.fc = nn.Linear(512 * block.expansion, num_classes) for m in self.modules(): if isinstance(m, nn.Conv2d): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels m.weight.data.normal_(0, math.sqrt(2. / n)) elif isinstance(m, nn.GroupNorm): m.weight.data.fill_(1) m.bias.data.zero_() def _make_layer(self, block, planes, blocks, stride=1): downsample = None if stride != 1 or self.inplanes != planes * block.expansion: downsample = nn.Sequential( nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=True), nn.GroupNorm(32,planes * block.expansion), ) layers = [] layers.append(block(self.inplanes, planes, stride, downsample)) self.inplanes = planes * block.expansion for i in range(1, blocks): layers.append(block(self.inplanes, planes)) return nn.Sequential(*layers) def forward(self, x): x = self.conv1(x) self.conv1_input = x.clone() x = self.bn1(x) x = self.relu(x) x = self.maxpool(x) x = self.layer1(x) x = self.layer2(x) x = self.layer3(x) x = self.layer4(x) x = self.avgpool(x) x = x.view(x.size(0), -1) x = self.fc(x) return x def resnet18gn(pretrained=False, **kwargs): """Constructs a ResNet-18 model. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet """ model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) if pretrained: model.load_state_dict(model_zoo.load_url(model_urls['resnet18'])) return model def resnet34gn(pretrained=False, **kwargs): """Constructs a ResNet-34 model. 
# NOTE(review): the resnet*gn constructors load the BatchNorm-pretrained model-zoo checkpoints when pretrained=True;
# GroupNorm's affine weight/bias share BN's per-channel shape, so load_state_dict may not fail loudly even though the
# running-stats semantics differ — verify pretrained=True is actually exercised for the GN variant.
# NOTE(review): os_run.py (below) passes command strings that START with '#nohup' to os.system; the shell treats the
# entire command as a comment, so nothing runs — presumably a deliberate way to keep the launch lines disabled.
Args: pretrained (bool): If True, returns a model pre-trained on ImageNet """ model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs) if pretrained: model.load_state_dict(model_zoo.load_url(model_urls['resnet34'])) return model def resnet50gn(pretrained=False, **kwargs): """Constructs a ResNet-50 model. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet """ model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) if pretrained: model.load_state_dict(model_zoo.load_url(model_urls['resnet50'])) return model def resnet101gn(pretrained=False, **kwargs): """Constructs a ResNet-101 model. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet """ model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) if pretrained: model.load_state_dict(model_zoo.load_url(model_urls['resnet101'])) return model def resnet152gn(pretrained=False, **kwargs): """Constructs a ResNet-152 model. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet """ model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) if pretrained: model.load_state_dict(model_zoo.load_url(model_urls['resnet152'])) return model def test(): net = resnet18gn() net.eval() x=torch.randn(2,3,224,224) y = net(x) print(y.size()) print(net) #test() ================================================ FILE: GC_code/ImageNet/os_run.py ================================================ import os,time os.system("#nohup python -W ignore main.py /ssd/data/yonghw/Imagenet/v0/ --model r50bn --alg sgd -b 256 --path r50bn_sgd_b256_g4 > logout/r50bn_sgd_b256_g4.log &") os.system("#nohup python -W ignore main.py /ssd/data/yonghw/Imagenet/v0/ --model r50bn --alg sgdGC -b 256 --path r50bn_sgdGC_b256_g4 > logout/r50bn_sgdGC_b256_g4.log &") ================================================ FILE: GC_code/Mini_ImageNet/SGD.py ================================================ import torch from torch.optim.optimizer import Optimizer, required class SGD_GCC(Optimizer): def __init__(self, params, lr=required, 
# --- GC_code/Mini_ImageNet/SGD.py: SGD variants implementing Gradient Centralization (GC). The GC step subtracts,
# from each gradient tensor, its mean over all dims except dim 0 (per-filter for conv weights, per-row for FC).
# SGD_GCC applies GC only to conv-like tensors (ndim > 3); SGD_GC below applies it to both conv and FC (ndim > 1).
# NOTE(review): the positional overloads add_(scalar, tensor) and add_(1 - dampening, d_p) were deprecated and later
# removed in PyTorch (modern API is add_(tensor, alpha=scalar)); this works on the older torch this repo targets —
# confirm the pinned version before upgrading.
momentum=0, dampening=0, weight_decay=0, nesterov=False): if lr is not required and lr < 0.0: raise ValueError("Invalid learning rate: {}".format(lr)) if momentum < 0.0: raise ValueError("Invalid momentum value: {}".format(momentum)) if weight_decay < 0.0: raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) defaults = dict(lr=lr, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=nesterov) if nesterov and (momentum <= 0 or dampening != 0): raise ValueError("Nesterov momentum requires a momentum and zero dampening") super(SGD_GCC, self).__init__(params, defaults) def __setstate__(self, state): super(SGD_GCC, self).__setstate__(state) for group in self.param_groups: group.setdefault('nesterov', False) def step(self, closure=None): """Performs a single optimization step. Arguments: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ loss = None if closure is not None: loss = closure() for group in self.param_groups: weight_decay = group['weight_decay'] momentum = group['momentum'] dampening = group['dampening'] nesterov = group['nesterov'] for p in group['params']: if p.grad is None: continue d_p = p.grad.data if weight_decay != 0: d_p.add_(weight_decay, p.data) #GC operation for Conv layers if len(list(d_p.size()))>3: d_p.add_(-d_p.mean(dim = tuple(range(1,len(list(d_p.size())))), keepdim = True)) if momentum != 0: param_state = self.state[p] if 'momentum_buffer' not in param_state: buf = param_state['momentum_buffer'] = torch.clone(d_p).detach() else: buf = param_state['momentum_buffer'] buf.mul_(momentum).add_(1 - dampening, d_p) if nesterov: d_p = d_p.add(momentum, buf) else: d_p = buf p.data.add_(-group['lr'], d_p) return loss class SGD_GC(Optimizer): def __init__(self, params, lr=required, momentum=0, dampening=0, weight_decay=0, nesterov=False): if lr is not required and lr < 0.0: raise ValueError("Invalid learning rate: {}".format(lr)) if momentum < 0.0: raise 
# SGD_GC: same loop as SGD_GCC, except the GC condition is ndim > 1, i.e. FC weight matrices are centralized too.
ValueError("Invalid momentum value: {}".format(momentum)) if weight_decay < 0.0: raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) defaults = dict(lr=lr, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=nesterov) if nesterov and (momentum <= 0 or dampening != 0): raise ValueError("Nesterov momentum requires a momentum and zero dampening") super(SGD_GC, self).__init__(params, defaults) def __setstate__(self, state): super(SGD_GC, self).__setstate__(state) for group in self.param_groups: group.setdefault('nesterov', False) def step(self, closure=None): """Performs a single optimization step. Arguments: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ loss = None if closure is not None: loss = closure() for group in self.param_groups: weight_decay = group['weight_decay'] momentum = group['momentum'] dampening = group['dampening'] nesterov = group['nesterov'] for p in group['params']: if p.grad is None: continue d_p = p.grad.data if weight_decay != 0: d_p.add_(weight_decay, p.data) #GC operation for Conv layers and FC layers if len(list(d_p.size()))>1: d_p.add_(-d_p.mean(dim = tuple(range(1,len(list(d_p.size())))), keepdim = True)) if momentum != 0: param_state = self.state[p] if 'momentum_buffer' not in param_state: buf = param_state['momentum_buffer'] = torch.clone(d_p).detach() else: buf = param_state['momentum_buffer'] buf.mul_(momentum).add_(1 - dampening, d_p) if nesterov: d_p = d_p.add(momentum, buf) else: d_p = buf p.data.add_(-group['lr'], d_p) return loss class SGDW(Optimizer): def __init__(self, params, lr=required, momentum=0, dampening=0, weight_decay=0, nesterov=False): if lr is not required and lr < 0.0: raise ValueError("Invalid learning rate: {}".format(lr)) if momentum < 0.0: raise ValueError("Invalid momentum value: {}".format(momentum)) if weight_decay < 0.0: raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) defaults = dict(lr=lr, 
# SGDW / SGDW_GCC: decoupled weight decay (SGDW-style) — the decay is NOT folded into the gradient (that code is
# commented out); instead a snapshot `old` of the parameters is taken before the step and -weight_decay*lr*old is
# applied after it. On first use of a parameter the momentum buffer starts from zeros and absorbs d_p undampened.
momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=nesterov) if nesterov and (momentum <= 0 or dampening != 0): raise ValueError("Nesterov momentum requires a momentum and zero dampening") super(SGDW, self).__init__(params, defaults) def __setstate__(self, state): super(SGDW, self).__setstate__(state) for group in self.param_groups: group.setdefault('nesterov', False) def step(self, closure=None): """Performs a single optimization step. Arguments: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ loss = None if closure is not None: loss = closure() for group in self.param_groups: weight_decay = group['weight_decay'] momentum = group['momentum'] dampening = group['dampening'] nesterov = group['nesterov'] for p in group['params']: if p.grad is None: continue d_p = p.grad.data old = torch.clone(p.data).detach() #if weight_decay != 0: # d_p.add_(weight_decay, p.data) if momentum != 0: param_state = self.state[p] if 'momentum_buffer' not in param_state: buf = param_state['momentum_buffer'] = torch.zeros_like(p.data) buf.mul_(momentum).add_(d_p) else: buf = param_state['momentum_buffer'] buf.mul_(momentum).add_(1 - dampening, d_p) if nesterov: d_p = d_p.add(momentum, buf) else: d_p = buf p.data.add_(-group['lr'], d_p) if weight_decay != 0: p.data.add_(-weight_decay*group['lr'], old) return loss class SGDW_GCC(Optimizer): def __init__(self, params, lr=required, momentum=0, dampening=0, weight_decay=0, nesterov=False): if lr is not required and lr < 0.0: raise ValueError("Invalid learning rate: {}".format(lr)) if momentum < 0.0: raise ValueError("Invalid momentum value: {}".format(momentum)) if weight_decay < 0.0: raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) defaults = dict(lr=lr, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=nesterov) if nesterov and (momentum <= 0 or dampening != 0): raise ValueError("Nesterov momentum requires a momentum and zero 
# SGDW_GCC = SGDW + the conv-only GC step (ndim > 3). Tail of this line: start of Mini_ImageNet/main.py imports.
# NOTE(review): `import torchvision.models as models` appears twice in that import block — a harmless duplicate.
dampening") super(SGDW_GCC, self).__init__(params, defaults) def __setstate__(self, state): super(SGDW_GCC, self).__setstate__(state) for group in self.param_groups: group.setdefault('nesterov', False) def step(self, closure=None): """Performs a single optimization step. Arguments: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ loss = None if closure is not None: loss = closure() for group in self.param_groups: weight_decay = group['weight_decay'] momentum = group['momentum'] dampening = group['dampening'] nesterov = group['nesterov'] for p in group['params']: if p.grad is None: continue d_p = p.grad.data old = torch.clone(p.data).detach() #if weight_decay != 0: # d_p.add_(weight_decay, p.data) #GC operation for Conv layers if len(list(d_p.size()))>3: d_p.add_(-d_p.mean(dim = tuple(range(1,len(list(d_p.size())))), keepdim = True)) if momentum != 0: param_state = self.state[p] if 'momentum_buffer' not in param_state: buf = param_state['momentum_buffer'] = torch.zeros_like(p.data) buf.mul_(momentum).add_(d_p) else: buf = param_state['momentum_buffer'] buf.mul_(momentum).add_(1 - dampening, d_p) if nesterov: d_p = d_p.add(momentum, buf) else: d_p = buf p.data.add_(-group['lr'], d_p) if weight_decay != 0: p.data.add_(-weight_decay*group['lr'], old) return loss ================================================ FILE: GC_code/Mini_ImageNet/main.py ================================================ import argparse import os import random import shutil import time import warnings import sys import torch import torch.nn as nn import torch.nn.parallel import torch.backends.cudnn as cudnn import torch.distributed as dist import torch.optim import torch.multiprocessing as mp import torch.utils.data import torch.utils.data.distributed import torchvision.transforms as transforms import torchvision.datasets as datasets import torchvision.models as models from resnet_ws import l_resnet50 import torchvision.models as models import math 
# --- GC_code/Mini_ImageNet/main.py (continued): CLI flags + training driver comparing torch.optim.SGD vs SGD_GC ---
# NOTE(review): the --lr default is written 0.1*32/32, which is simply 0.1 — a batch-size scaling expression left in
# from experiments, presumably; verify before changing.
import numpy as np from torch.optim import lr_scheduler from SGD import SGD_GC #import SGD with GC model_names = sorted(name for name in models.__dict__ if name.islower() and not name.startswith("__") and callable(models.__dict__[name])) parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') parser.add_argument('-b', '--batch_size', default=256, type=int, metavar='N', help='mini-batch size (default: 256), this is the total ' 'batch size of all GPUs on the current node when ' 'using Data Parallel or Distributed Data Parallel') parser.add_argument('--lr', '--learning-rate', default=0.1*32/32, type=float, metavar='LR', help='initial learning rate', dest='lr') parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50', choices=model_names, help='model architecture: ' + ' | '.join(model_names) + ' (default: resnet18)') parser.add_argument('data', metavar='DIR', help='path to dataset') parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', help='number of data loading workers (default: 4)') parser.add_argument('--epochs', default=100, type=int, metavar='N', help='number of total epochs to run') parser.add_argument('--start-epoch', default=0, type=int, metavar='N', help='manual epoch number (useful on restarts)') parser.add_argument('--momentum', default=0.9, type=float, metavar='M', help='momentum') parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, metavar='W', help='weight decay (default: 1e-4)', dest='weight_decay') parser.add_argument('-p', '--print-freq', default=100, type=int, metavar='N', help='print frequency (default: 10)') parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)') parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', help='evaluate model on validation set') parser.add_argument('--pretrained', dest='pretrained', action='store_true', help='use pre-trained model') 
# Distributed/launch flags; --model selects 'r50' (torchvision resnet50 with fc re-headed to 100 classes) or
# 'r50ws' (weight-standardized l_resnet50 from resnet_ws). main() hardcodes CUDA_VISIBLE_DEVICES="0,1,2,3".
parser.add_argument('--world-size', default=-1, type=int, help='number of nodes for distributed training') parser.add_argument('--rank', default=-1, type=int, help='node rank for distributed training') parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, help='url used to set up distributed training') parser.add_argument('--dist-backend', default='nccl', type=str, help='distributed backend') parser.add_argument('--seed', default=None, type=int, help='seed for initializing training. ') parser.add_argument('--gpu', default=None, type=int, help='GPU id to use.') parser.add_argument('--multiprocessing-distributed', action='store_true', help='Use multi-processing distributed training to launch ' 'N processes per node, which has N GPUs. This is the ' 'fastest way to use PyTorch for either single node or ' 'multi node data parallel training') parser.add_argument('--model', default='r18', type=str, help='model') parser.add_argument('--path', default='test', type=str, help='model') parser.add_argument('--alg', default='sgd', type=str, help='model') best_acc1 = 0 device_ids=[0,1,2,3,4,5,6,7] def main(): args = parser.parse_args() os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3" if args.seed is not None: random.seed(args.seed) torch.manual_seed(args.seed) cudnn.deterministic = True warnings.warn('You have chosen to seed training. ' 'This will turn on the CUDNN deterministic setting, ' 'which can slow down your training considerably! ' 'You may see unexpected behavior when restarting ' 'from checkpoints.') if args.gpu is not None: warnings.warn('You have chosen a specific GPU. 
# main() tail: resolves distributed mode and spawns main_worker per GPU (or calls it directly).
# main_worker(): builds the model, then re-initializes conv weights (He normal) and norm layers below.
# NOTE(review): the default --model value is 'r18', but only 'r50' and 'r50ws' are handled here — any other value
# leaves `model` unbound and the next use raises NameError; confirm how the script is launched (see os_run.py).
This will completely ' 'disable data parallelism.') if args.dist_url == "env://" and args.world_size == -1: args.world_size = int(os.environ["WORLD_SIZE"]) args.distributed = args.world_size > 1 or args.multiprocessing_distributed ngpus_per_node = torch.cuda.device_count() if args.multiprocessing_distributed: # Since we have ngpus_per_node processes per node, the total world_size # needs to be adjusted accordingly args.world_size = ngpus_per_node * args.world_size # Use torch.multiprocessing.spawn to launch distributed processes: the # main_worker process function mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) else: # Simply call main_worker function main_worker(args.gpu, ngpus_per_node, args) def main_worker(gpu, ngpus_per_node, args): global best_acc1 args.gpu = gpu #momentum=pow(math.e,math.log(0.9)/64*args.batch_size/ngpus_per_node/args.bgn) if args.gpu is not None: print("Use GPU: {} for training".format(args.gpu)) if args.distributed: if args.dist_url == "env://" and args.rank == -1: args.rank = int(os.environ["RANK"]) if args.multiprocessing_distributed: # For multiprocessing distributed training, rank needs to be the # global rank among all the processes args.rank = args.rank * ngpus_per_node + gpu dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) # create model num_classes=100 if args.model=='r50': model = models.resnet50() model.fc= nn.Linear(in_features=2048, out_features=num_classes, bias=True) if args.model=='r50ws': model =l_resnet50(num_classes=num_classes) for m in model.modules(): if isinstance(m, nn.Conv2d): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels m.weight.data.normal_(0, math.sqrt(2. 
# Init continues: BatchNorm/GroupNorm weights drawn uniform, biases zeroed (note: uniform_, not fill_(1), here).
# Then DDP/DataParallel wrapping, CrossEntropyLoss, SGD vs SGD_GC by args.alg, StepLR(step_size=30, gamma=0.1).
/ n)) elif isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.GroupNorm): m.weight.data.uniform_() m.bias.data.zero_() if args.distributed: # For multiprocessing distributed, DistributedDataParallel constructor # should always set the single device scope, otherwise, # DistributedDataParallel will use all available devices. if args.gpu is not None: torch.cuda.set_device(args.gpu) model.cuda(args.gpu) # When using a single GPU per process and per # DistributedDataParallel, we need to divide the batch size # ourselves based on the total number of GPUs we have args.batch_size = int(args.batch_size / ngpus_per_node) args.workers = int(args.workers / ngpus_per_node) model = torch.nn.parallel.DistributedDataParallel(model) else: model.cuda() # DistributedDataParallel will divide and allocate batch_size to all # available GPUs if device_ids are not set model = torch.nn.parallel.DistributedDataParallel(model) else: # DataParallel will divide and allocate batch_size to all available GPUs if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): model.features = torch.nn.DataParallel(model.features) model.cuda() else: model = torch.nn.DataParallel(model).cuda() # define loss function (criterion) criterion = nn.CrossEntropyLoss().cuda(args.gpu) # choose optimizer if args.alg=='sgd': optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,momentum=args.momentum, weight_decay = args.weight_decay) if args.alg=='sgdGC': optimizer = SGD_GC(model.parameters(), lr=args.lr,momentum=args.momentum, weight_decay = args.weight_decay) exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1) cudnn.benchmark = True # Data loading code traindir = os.path.join(args.data, 'train') valdir = os.path.join(args.data, 'val') normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) train_dataset = datasets.ImageFolder( traindir, transforms.Compose([ transforms.RandomResizedCrop(224), transforms.RandomHorizontalFlip(), transforms.ToTensor(), 
# Data pipeline and epoch loop; checkpoint saving is entirely commented out in this variant of the script.
# The train() definition that begins at the end of this line continues beyond this chunk of the dump.
normalize, ])) if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) else: train_sampler = None train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), num_workers=args.workers, pin_memory=True, sampler=train_sampler,drop_last=True) val_loader = torch.utils.data.DataLoader( datasets.ImageFolder(valdir, transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize, ])), batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True) if args.evaluate: validate(val_loader, model, criterion, args) return for epoch in range(args.start_epoch, args.epochs): if args.distributed: train_sampler.set_epoch(epoch) #adjust_learning_rate(optimizer, epoch, args) # train for one epoch train(train_loader, model, criterion, optimizer, epoch, args) exp_lr_scheduler.step() # evaluate on validation set acc1 = validate(val_loader, model, criterion, args) # remember best acc@1 and save checkpoint is_best = acc1 > best_acc1 best_acc1 = max(acc1, best_acc1) # if not args.multiprocessing_distributed or (args.multiprocessing_distributed # and args.rank % ngpus_per_node == 0): # save_checkpoint({ # 'epoch': epoch + 1, # 'arch': args.arch, # 'state_dict': model.state_dict(), # 'best_acc1': best_acc1, # 'optimizer' : optimizer.state_dict(), # }, is_best) #torch.save(model.module, './result_model/'+args.path+'.pth') #train def train(train_loader, model, criterion, optimizer, epoch, args): batch_time = AverageMeter() data_time = AverageMeter() losses = AverageMeter() top1 = AverageMeter() top5 = AverageMeter() total = 0 train_loss = 0 correct = 0 # switch to train mode model.train() print('\nEpoch: %d' % epoch) end = time.time() for i, (input, target) in enumerate(train_loader): # measure data loading time data_time.update(time.time() - end) #if args.gpu is not None: #input = input.cuda(args.gpu, non_blocking=True) #target = 
def validate(val_loader, model, criterion, args):
    """Run one full evaluation pass over `val_loader`.

    Updates running loss / top-1 / top-5 meters, prints a summary line,
    and returns the average top-1 accuracy (used by the caller to track
    the best checkpoint).

    Args:
        val_loader: iterable of (input, target) batches.
        model: network to evaluate (switched to eval mode here).
        criterion: loss function (e.g. CrossEntropyLoss).
        args: parsed CLI namespace; only `args.gpu` is read here.

    Returns:
        Average top-1 accuracy over the validation set.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    val_loss = 0
    total = 0
    correct = 0

    # switch to evaluate mode (disables dropout, freezes BN statistics)
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (input, target) in enumerate(val_loader):
            if args.gpu is not None:
                input = input.cuda(args.gpu, non_blocking=True)
            target = target.cuda(args.gpu, non_blocking=True)

            # compute output
            output = model(input)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), input.size(0))
            top1.update(acc1[0], input.size(0))
            top5.update(acc5[0], input.size(0))

            _, predicted = output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()
            val_loss += loss.item()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

        # BUG FIX: removed a stray post-loop `val_loss += loss.item()` that
        # double-counted the final batch's loss (val_loss is informational
        # only, but the accumulator was wrong).

    print('Testing: Top1: {top1.avg:.4f}|loss:{losses.avg:.4f}'.format(top1=top1, losses=losses))
    return top1.avg
def accuracy(output, target, topk=(1,)):
    """Compute the fraction of samples whose true label is in the top-k
    predictions, for each k in `topk`.

    Args:
        output: (batch, num_classes) score/logit tensor.
        target: (batch,) tensor of ground-truth class indices.
        topk: tuple of k values to evaluate.

    Returns:
        List of one-element tensors, one per k, each holding the accuracy
        as a fraction in [0, 1] (note: NOT a percentage).
    """
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        # pred: (batch, maxk) indices of the top-k classes, then transposed
        # to (maxk, batch) so row r holds the rank-r prediction per sample.
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # BUG FIX: use reshape(-1) instead of view(-1). `correct[:k]` is
            # a slice of a transposed (non-contiguous) tensor, and .view()
            # raises a RuntimeError on non-contiguous input in PyTorch >= 0.4.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(1.0 / batch_size))
        return res
class Conv2d(nn.Conv2d):
    """2-D convolution with Weight Standardization (WS).

    Before each forward pass the kernel is normalized per output filter:
    the mean over (in_channels, kH, kW) is subtracted and the result is
    divided by the per-filter standard deviation (+1e-5 for stability).
    The stored parameter `self.weight` itself is left untouched; only the
    value fed to F.conv2d is standardized.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1, bias=True):
        super(Conv2d, self).__init__(in_channels, out_channels, kernel_size,
                                     stride, padding, dilation, groups, bias)

    def forward(self, x):
        w = self.weight
        # Per-filter mean over input channels and both spatial dims
        # (chained single-dim means == mean over dims (1, 2, 3)).
        mu = w.mean(dim=1, keepdim=True).mean(dim=2, keepdim=True).mean(dim=3, keepdim=True)
        w = w - mu
        # Per-filter (unbiased) std of the flattened centered kernel,
        # broadcast back to 4-D; epsilon guards against division by zero.
        sigma = w.view(w.size(0), -1).std(dim=1).view(-1, 1, 1, 1) + 1e-5
        w = w / sigma.expand_as(w)
        return F.conv2d(x, w, self.bias, self.stride,
                        self.padding, self.dilation, self.groups)
class Bottleneck(nn.Module):
    """ResNet bottleneck block: 1x1 reduce -> 3x3 -> 1x1 expand (x4).

    conv1x1/conv3x3 here are the weight-standardized Conv2d variants
    defined in this file, and BatchNorm2d is the factory above (currently
    plain nn.BatchNorm2d). Output channels = planes * expansion.
    """
    # Channel expansion factor of the final 1x1 conv.
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = conv1x1(inplanes, planes)
        self.bn1 = BatchNorm2d(planes)
        # Spatial downsampling (if any) happens in the middle 3x3 conv.
        self.conv2 = conv3x3(planes, planes, stride)
        self.bn2 = BatchNorm2d(planes)
        self.conv3 = conv1x1(planes, planes * self.expansion)
        self.bn3 = BatchNorm2d(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        # Optional projection applied to the identity branch so shapes match.
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        # Residual addition, then final activation.
        out += identity
        out = self.relu(out)

        return out
    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one ResNet stage of `blocks` residual blocks.

        The first block may downsample (stride) and/or change channel
        count, in which case a 1x1-conv + norm projection is attached to
        its identity branch. `self.inplanes` is advanced to the stage's
        output width for the remaining blocks.
        """
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            # Shape mismatch between input and block output: project the
            # shortcut with a strided 1x1 conv followed by normalization.
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                BatchNorm2d(planes * block.expansion),
            )

        layers = []
        # Only the first block carries the stride and the projection.
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)
def l_resnet152(pretrained=False, **kwargs):
    """Construct a ResNet-152 model (bottleneck blocks, [3, 8, 36, 3]).

    Args:
        pretrained (bool): kept for API symmetry with torchvision;
            note it is currently ignored by this implementation.
        **kwargs: forwarded to the ResNet constructor (e.g. num_classes).
    """
    return ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
Illustration of the GC operation on gradient matrix/tensor of weights in the fully-connected layer (left) and convolutional layer (right).
* GC can be viewed as a projected gradient descent method with a constrained loss function. The Lipschitzness of the constrained loss function and its gradient is better so that the training process becomes more efficient and stable. Our experiments on various applications, including `general image classification`, `fine-grained image classification`, `detection and segmentation` and `Person ReID` demonstrate that GC can consistently improve the performance of DNN learning.
* The optimizers are provided in the files: [`SGD.py`](https://github.com/Yonghongwei/Gradient-Centralization/blob/master/GC_code/CIFAR100/algorithm/SGD.py), [`Adam.py`](https://github.com/Yonghongwei/Gradient-Centralization/blob/master/GC_code/CIFAR100/algorithm/Adam.py) and [`Adagrad.py`](https://github.com/Yonghongwei/Gradient-Centralization/blob/master/GC_code/CIFAR100/algorithm/Adagrad.py), including SGD_GC, SGD_GCC, SGDW_GCC, Adam_GC, Adam_GCC, Adam_GCC2, AdamW_GCC, AdamW_GCC2 and Adagrad_GCC. The optimizers with "_GC" use GC for both Conv layers and FC layers, and the optimizers with "_GCC" use GC only for Conv layers. For adaptive learning rate methods, keeping mean of weight vector unchanged usually works better. Please refer to Adam_GCC2 and AdamW_GCC2. We can use the following codes to import SGD_GC: ```python from SGD import SGD_GC ``` *** ## Update * 2020/04/07:Release a pytorch implementation of optimizers with GC, and provide some examples on classification task, including general image classification (Mini-ImageNet, CIFAR100 and ImageNet) and Fine-grained image classification (FGVC Aircraft, Stanford Cars, Stanford Dogs and CUB-200-2011). * 2020/04/14:Release the code of GC on MMdetection and update some tables of experimental results. * 2020/05/07:Release the code of GC on Person ReID and show some results on Market1501. * 2020/08/08:Release the code of some advanced optimizers with GC. 
*** ## Citation @article{GradientCentra, title={Gradient-Centralization: A New Optimization Technique for Deep Neural Networks}, author={Hongwei Yong and Jianqiang Huang and Xiansheng Hua and Lei Zhang}, booktitle={the European Conference on Conputer Vision}, year={2020} } *** ## Link to the other implementation of GC * Gradient Centralization in TensorFlow [`https://github.com/Rishit-dagli/Gradient-Centralization-TensorFlow`](https://github.com/Rishit-dagli/Gradient-Centralization-TensorFlow) * Gradient Centralization in Ranger optimizer [`https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer`](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) ## Experiments *** ### General Image Classification * Mini-ImageNet The codes are in [`GC_code/Mini_ImageNet`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/GC_code/Mini_ImageNet). The split dataset can be downloaded from [here](https://drive.google.com/open?id=1XWRjPzwRWChNgvemqsylYM1ocpxhGtfy) (Google drive) or [here](https://pan.baidu.com/s/1Ah6Lu8OSfAVc3PZM-mPpvw) (Baidu drive, safe code: 1681). The following figure is training loss (left) and testing accuracy (right) curves vs. training epoch on the Mini-ImageNet. The ResNet50 is used as the DNN model. The compared optimization techniques include BN, BN+GC, BN+WS and BN+WS+GC.
* CIFAR100 The codes are in [`GC_code/CIFAR100`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/GC_code/CIFAR100). * ImageNet The codes are in [`GC_code/ImageNet`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/GC_code/ImageNet). The following table is the Top-1 error rates on ImageNet w/o GC and w/ GC: |Backbone | R50BN |R50GN | R101BN | R101GN | | :-----------: | :-----------: | :----: |:------: |:-------: | | w/o GC | 23.71 |24.50 |22.37 |23.34 | | w/ GC | 23.21 |23.53 |21.82 |22.14 | The following figure is the training error (left) and validation error (right) curves vs. training epoch on ImageNet. The DNN model is ResNet50 with GN.
*** ### Fine-grained Image Classification The codes are in [`GC_code/Fine-grained_classification`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/GC_code/Fine-grained_classification). The preprocessed dataset can be downloaded from [here](https://drive.google.com/open?id=1c3OnKq3EsMKK1OerWdouCG7hvN8Rv8yh). The following table is the testing accuracies on the four fine-grained image classification datasets with ResNet50: |Datesets | FGVC Aircraft |Stanford Cars |Stanford Dogs| CUB-200-2011| | :-----------: | :-----------: | :----: |:------: |:-------: | | w/o GC | 86.62 |88.66 |76.16 |82.07 | | w/ GC | 87.77 |90.03 |78.23 |83.40 | The following figure is the training accuracy (solid line) and testing accuracy (dotted line) curves vs. training epoch on four fine-grained image classification datasets:
*** ### Object Detection and Segmentation
X101-32x4d | 41.6 | 63.1 | 45.5 | 37.4 | 59.8 | 39.9 | | w/o GC | X101-64x4d | 42.1 | 63.8 | 46.3 | 38.0 | 60.6 | 40.9 | | w/ GC | X101-64x4d | 42.8 | 64.5 | 46.8 | 38.4 | 61.0 | 41.1 | | w/o GC | R50 (4c1f) | 37.5 | 58.2 | 41.0 | 33.9 | 55.0 | 36.1 | | w/ GC | R50 (4c1f) | 38.4 | 59.5 | 41.8 | 34.6 | 55.9 | 36.7 | | w/o GC | R101GN | 41.1 | 61.7 | 44.9 | 36.9 | 58.7 | 39.3 | | w/ GC | R101GN | 41.7 | 62.3 | 45.3 | 37.4 | 59.3 | 40.3 | | w/o GC | R50GN+WS | 40.0 | 60.7 | 43.6 | 36.1 | 57.8 | 38.6 | | w/ GC | R50GN+WS | 40.6 | 61.3 | 43.9 | 36.6 | 58.2 | 39.1 | *** ### Person ReId The codes are in [`PersonReId`](https://github.com/Yonghongwei/reid-strong-baseline). Please let [`SGD.py`](https://github.com/Yonghongwei/reid-strong-baseline/tree/master/tools/SGD.py) in [`reid-strong-baseline\tools\`](https://github.com/Yonghongwei/reid-strong-baseline/tree/master/tools), and update [`reid-strong-baseline\solver\build.py`](https://github.com/Yonghongwei/reid-strong-baseline/blob/master/solver/build.py). For Market1501, please use SGD_GCC algorithm with learning rate 0.03 or 0.02 and weight decay 0.002. 
For example, you can change the '.sh' file with the following codes: ```python python3 tools/train.py --config_file='configs/softmax_triplet_with_center.yml' MODEL.DEVICE_ID "('0')" DATASETS.NAMES "('market1501')" DATASETS.ROOT_DIR "('/home/yonghw/data/reid/')" OUTPUT_DIR "('out_dir/market1501/test')" SOLVER.OPTIMIZER_NAME "('SGD_GCC')" SOLVER.BASE_LR "(0.03)" SOLVER.WEIGHT_DECAY "(0.002)" SOLVER.WEIGHT_DECAY_BIAS "(0.002)" ``` The results of Market1501 without reranking are shown in the following table: | Method | Backbone | MAP | Top 1 | | :-----------: | :-----------: |:------:|:-------: | | Adam* | R18 | 77.8 | 91.7 | | SGD_GCC | R18 | 81.3 | 92.7 | | Adam* | R50 | 85.9 | 94.5 | | SGD_GCC | R50 | 86.6 | 94.8 | | Adam* | R101 | 87.1 | 94.5 | | SGD_GCC | R101 | 87.9 | 95.0 | The results with * are reported by the authors in [reid-strong-baseline](https://github.com/michuanhaohao/reid-strong-baseline). Our reproduced results are slightly lower than the results provided by the authors. ================================================ FILE: algorithm-GC/README.md ================================================ # Advanced-optimizer-with-Gradient-Centralization Advanced optimizer with Gradient-Centralization Please Refer to ## [Gradient Centralization: A New Optimization Technique for Deep Neural Networks](https://arxiv.org/abs/2004.01461) ## Introduction We embed GC into some advanced DNN optimizers, including [`SGD.py`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/algorithm-GC/algorithm/SGD.py), [`Adam.py`](https://github.com/Yonghongwei/Advanced-optimizer-with-Gradient-Centralization/blob/master/algorithm/Adam.py), [`AdamW`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/algorithm-GC/algorithm/Adam.py), 
[`RAdam`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/algorithm-GC/algorithm/RAdam.py),[`Lookahead`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/algorithm-GC/algorithm/Lookahead.py)+[`SGD.py`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/algorithm-GC/algorithm/SGD.py), [`Lookahead`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/algorithm-GC/algorithm/Lookahead.py)+[`Adam.py`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/algorithm-GC/algorithm/Adam.py), [`Ranger`](https://github.com/Yonghongwei/Gradient-Centralization/tree/master/algorithm-GC/algorithm/Ranger.py). There are three hyper-parameters `use_gc`, `gc_conv_only` and `gc_loc`. `use_gc=True` means that the algorithm adds GC operation, otherwise, not. `gc_conv_only=True` means the algorithm only adds GC operation for Conv layer, otherwise, for both Conv and FC layer. `gc_loc` controls the location of GC operation for adaptive learning rate algorithms, including Adam, Radam, Ranger and so on. There are two locations in the algorithm to add GC operation for original gradient and generalized gradient, respectively. Generalized gradient is the variable which is directly used to update the weight. For adaptive learning rate algorithms, we suggest `gc_loc=False`. For SGD, these two locations for GC are equivalent, so we do not introduce the hyper-parameter `gc_loc`. We also give an example of how to use these algorithms in [`Cifar`](https://github.com/Yonghongwei/Gradient-Centralization/blob/master/algorithm-GC/cifar/main.py). 
For example: ```python # SGD optimizer = SGD(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = args.weight_decay,use_gc=True, gc_conv_only=False) ``` ```python # Adam optimizer = Adam(net.parameters(), lr=args.lr, weight_decay = args.weight_decay,use_gc=True, gc_conv_only=False,gc_loc=False) ``` ```python # RAdam optimizer = RAdam(net.parameters(), lr=args.lr, weight_decay = args.weight_decay,use_gc=True, gc_conv_only=False,gc_loc=False) ``` ```python # lookahead+SGD base_opt = SGD(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = args.weight_decay,use_gc=False, gc_conv_only=False) optimizer = Lookahead(base_opt, k=5, alpha=0.5) ``` ```python # Ranger optimizer = Ranger(net.parameters(), lr=args.lr, weight_decay = args.weight_decay,use_gc=True, gc_conv_only=False,gc_loc=False) ``` ## References: * Adam: https://arxiv.org/abs/1412.6980 * AdamW: https://arxiv.org/abs/1711.05101 * Lookahead: https://arxiv.org/abs/1907.08610 * RAdam: https://arxiv.org/abs/1908.03265, https://github.com/LiyuanLucasLiu/RAdam * Ranger: https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer * Gradient Centralization: https://arxiv.org/abs/2004.01461v2 ================================================ FILE: algorithm-GC/algorithm/Adam.py ================================================ import math import torch from torch.optim.optimizer import Optimizer from .Centralization import centralized_gradient class Adam(Optimizer): r"""Implements Adam algorithm. It has been proposed in `Adam: A Method for Stochastic Optimization`_. 
    @torch.no_grad()
    def step(self, closure=None):
        """Perform a single Adam optimization step with optional
        Gradient Centralization (GC).

        GC placement is controlled by `self.gc_loc`:
          * gc_loc=True  -> centralize the raw gradient before the moment
            updates (note: `centralized_gradient` operates in place, so
            with weight_decay == 0 this also modifies p.grad).
          * gc_loc=False -> centralize the final update direction
            exp_avg/denom instead (recommended for adaptive-lr methods
            per the repo README).

        Args:
            closure (callable, optional): re-evaluates the model and
                returns the loss.

        Returns:
            The loss from `closure`, or None.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization (lazy, on first step for this param).
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                # Classic (coupled) L2 regularization: decay folded into grad.
                if group['weight_decay'] != 0:
                    grad = grad.add(p, alpha=group['weight_decay'])

                # GC on the raw gradient (location 1).
                if self.gc_loc:
                    grad = centralized_gradient(grad, use_gc=self.use_gc, gc_conv_only=self.gc_conv_only)

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                step_size = group['lr'] / bias_correction1

                # GC on the generalized gradient (location 2), i.e. the
                # quantity actually used to update the weights.
                G_grad = exp_avg / denom
                if self.gc_loc == False:
                    G_grad = centralized_gradient(G_grad, use_gc=self.use_gc, gc_conv_only=self.gc_conv_only)

                p.add_(G_grad, alpha=-step_size)

        return loss
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=1e-2, amsgrad=False, use_gc=False,
                 gc_conv_only=False, gc_loc=True):
        """AdamW with optional Gradient Centralization.

        Args:
            params: iterable of parameters or param-group dicts.
            lr: learning rate.
            betas: EMA coefficients for first/second gradient moments.
            eps: denominator epsilon for numerical stability.
            weight_decay: decoupled weight-decay coefficient (AdamW style).
            amsgrad: use the AMSGrad variant.
            use_gc: enable Gradient Centralization.
            gc_conv_only: apply GC only to conv-shaped (>3-D) tensors.
            gc_loc: True -> centralize the raw gradient; False ->
                centralize the final update direction.
        """
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(AdamW, self).__init__(params, defaults)
        # GC flags live on the optimizer, not in param groups, so they are
        # NOT persisted by state_dict() — re-pass them when reconstructing.
        self.gc_loc = gc_loc
        self.use_gc = use_gc
        self.gc_conv_only = gc_conv_only
def centralized_gradient(x, use_gc=True, gc_conv_only=False):
    """Centralize ``x`` in place by removing its mean over all dims except 0.

    When ``gc_conv_only`` is True only conv-shaped tensors (more than 3 dims)
    are touched; otherwise any tensor with more than 1 dim is centralized.
    1-D tensors (biases, norm weights) are always left unchanged.
    Returns ``x`` (the same tensor object, possibly mutated).
    """
    if not use_gc:
        return x
    ndim = x.dim()
    min_ndim = 3 if gc_conv_only else 1
    if ndim > min_ndim:
        reduce_dims = tuple(range(1, ndim))
        x.sub_(x.mean(dim=reduce_dims, keepdim=True))
    return x
class Lookahead(Optimizer):
    """Lookahead optimizer wrapper: keeps a slow copy of the weights and, every
    ``k`` steps of the wrapped (fast) optimizer, moves the slow weights toward
    the fast ones by factor ``alpha`` and resets the fast weights to them.

    NOTE(review): ``super().__init__`` is never called — ``param_groups`` and
    ``state`` are wired up manually from the inner optimizer; confirm this stays
    compatible with the installed torch version's ``Optimizer`` internals.
    """

    def __init__(self, optimizer, k=5, alpha=0.5):
        # optimizer: the wrapped fast optimizer (its param_groups are shared).
        self.optimizer = optimizer
        self.k = k
        self.alpha = alpha
        self.param_groups = self.optimizer.param_groups
        # self.state holds per-parameter slow copies; the inner optimizer's
        # state is exposed separately as fast_state.
        self.state = defaultdict(dict)
        self.fast_state = self.optimizer.state
        for group in self.param_groups:
            # counts fast steps since the last slow-weight sync
            group["counter"] = 0

    def update(self, group):
        """Interpolate slow weights toward fast weights and copy them back."""
        for fast in group["params"]:
            param_state = self.state[fast]
            if "slow_param" not in param_state:
                # first sync: slow copy starts at the current fast weights
                param_state["slow_param"] = torch.zeros_like(fast.data)
                param_state["slow_param"].copy_(fast.data)
            slow = param_state["slow_param"]
            # slow <- slow + alpha * (fast - slow)
            slow += (fast.data - slow) * self.alpha
            fast.data.copy_(slow)

    def update_lookahead(self):
        """Force a slow-weight sync for every param group."""
        for group in self.param_groups:
            self.update(group)

    def step(self, closure=None):
        """Run one fast step; every k-th step also sync the slow weights."""
        loss = self.optimizer.step(closure)
        for group in self.param_groups:
            # counter==0 means k steps have elapsed (or this is the first step)
            if group["counter"] == 0:
                self.update(group)
            group["counter"] += 1
            if group["counter"] >= self.k:
                group["counter"] = 0
        return loss

    def state_dict(self):
        """Serialize both the fast optimizer state and the slow copies.

        Tensor keys in self.state are replaced by their id() so the dict is
        picklable in the usual optimizer state_dict format.
        """
        fast_state_dict = self.optimizer.state_dict()
        slow_state = {
            (id(k) if isinstance(k, torch.Tensor) else k): v
            for k, v in self.state.items()
        }
        fast_state = fast_state_dict["state"]
        param_groups = fast_state_dict["param_groups"]
        return {
            "fast_state": fast_state,
            "slow_state": slow_state,
            "param_groups": param_groups,
        }

    def load_state_dict(self, state_dict):
        """Restore slow copies via Optimizer.load_state_dict, then the fast
        optimizer's own state."""
        slow_state_dict = {
            "state": state_dict["slow_state"],
            "param_groups": state_dict["param_groups"],
        }
        fast_state_dict = {
            "state": state_dict["fast_state"],
            "param_groups": state_dict["param_groups"],
        }
        super(Lookahead, self).load_state_dict(slow_state_dict)
        self.optimizer.load_state_dict(fast_state_dict)
        # re-point fast_state at the freshly loaded inner state
        self.fast_state = self.optimizer.state

    def add_param_group(self, param_group):
        # new groups start with a fresh sync counter
        param_group["counter"] = 0
        self.optimizer.add_param_group(param_group)
class RAdam(Optimizer):
    """Rectified Adam (RAdam) with optional gradient centralization (GC).

    Arguments:
        params: iterable of parameters or dicts defining parameter groups.
        lr (float): learning rate (default: 1e-3).
        betas (Tuple[float, float]): moment coefficients (default: (0.9, 0.999)).
        eps (float): denominator stabilizer (default: 1e-8).
        weight_decay (float): L2 penalty added to the update (default: 0).
        degenerated_to_sgd (bool): when the variance rectification term is not
            yet trustworthy (N_sma < 5), fall back to an SGD-style step instead
            of skipping the update (default: True).
        use_gc / gc_conv_only: gradient-centralization switches.
        gc_loc (bool): centralize the raw gradient (True) or the final update
            direction (False, default).
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0,
                 degenerated_to_sgd=True, use_gc=False, gc_conv_only=False, gc_loc=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        self.degenerated_to_sgd = degenerated_to_sgd
        self.gc_loc = gc_loc
        self.use_gc = use_gc
        self.gc_conv_only = gc_conv_only
        # Groups with their own betas need their own rectification buffer,
        # since the cached step sizes depend on (beta1, beta2).
        if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict):
            for param in params:
                if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]):
                    param['buffer'] = [[None, None, None] for _ in range(10)]
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
                        buffer=[[None, None, None] for _ in range(10)])
        super(RAdam, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(RAdam, self).__setstate__(state)

    def step(self, closure=None):
        """Perform a single optimization step; returns closure() if given."""
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                # Work in fp32 even for half-precision parameters.
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError('RAdam does not support sparse gradients')

                p_data_fp32 = p.data.float()

                state = self.state[p]
                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                # GC on the raw gradient when gc_loc is set.
                if self.gc_loc:
                    grad = centralized_gradient(grad, use_gc=self.use_gc, gc_conv_only=self.gc_conv_only)

                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                state['step'] += 1
                # Cache N_sma/step_size per (step mod 10) — they depend only on
                # the step count and betas, not on the parameter.
                buffered = group['buffer'][int(state['step'] % 10)]
                if state['step'] == buffered[0]:
                    N_sma, step_size = buffered[1], buffered[2]
                else:
                    buffered[0] = state['step']
                    beta2_t = beta2 ** state['step']
                    N_sma_max = 2 / (1 - beta2) - 1
                    N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
                    buffered[1] = N_sma

                    # more conservative since it's an approximated value
                    if N_sma >= 5:
                        step_size = math.sqrt(
                            (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2)
                            / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
                    elif self.degenerated_to_sgd:
                        step_size = 1.0 / (1 - beta1 ** state['step'])
                    else:
                        # rectification unavailable and SGD fallback disabled
                        step_size = -1
                    buffered[2] = step_size

                # BUG FIX: when step_size < 0 (N_sma < 5 and
                # degenerated_to_sgd=False) the original code fell through with
                # G_grad unbound and raised NameError; the reference RAdam
                # skips the update entirely in this case.
                if step_size < 0:
                    continue

                if N_sma >= 5:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    G_grad = exp_avg / denom
                else:
                    # SGD-style fallback: use the first moment directly.
                    G_grad = exp_avg

                if group['weight_decay'] != 0:
                    G_grad.add_(p_data_fp32, alpha=group['weight_decay'])

                # GC on the final update direction when gc_loc is off.
                if self.gc_loc == False:
                    G_grad = centralized_gradient(G_grad, use_gc=self.use_gc, gc_conv_only=self.gc_conv_only)

                p_data_fp32.add_(G_grad, alpha=-step_size * group['lr'])
                p.data.copy_(p_data_fp32)

        return loss
class PlainRAdam(Optimizer):
    """RAdam without the per-step caching buffer, with optional GC.

    Same algorithm as :class:`RAdam`; the rectification term is recomputed
    every step instead of being cached.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0,
                 degenerated_to_sgd=True, use_gc=False, gc_conv_only=False, gc_loc=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        self.degenerated_to_sgd = degenerated_to_sgd
        self.gc_loc = gc_loc
        self.use_gc = use_gc
        self.gc_conv_only = gc_conv_only
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        super(PlainRAdam, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(PlainRAdam, self).__setstate__(state)

    def step(self, closure=None):
        """Perform a single optimization step; returns closure() if given."""
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                # Work in fp32 even for half-precision parameters.
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError('RAdam does not support sparse gradients')

                p_data_fp32 = p.data.float()

                state = self.state[p]
                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                if self.gc_loc:
                    grad = centralized_gradient(grad, use_gc=self.use_gc, gc_conv_only=self.gc_conv_only)

                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                state['step'] += 1
                beta2_t = beta2 ** state['step']
                N_sma_max = 2 / (1 - beta2) - 1
                N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)

                # more conservative since it's an approximated value
                if N_sma >= 5:
                    # step_size already includes group['lr'].
                    step_size = group['lr'] * math.sqrt(
                        (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2)
                        / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    G_grad = exp_avg / denom
                elif self.degenerated_to_sgd:
                    step_size = group['lr'] / (1 - beta1 ** state['step'])
                    G_grad = exp_avg
                else:
                    # BUG FIX: the original fell through with step_size and
                    # G_grad unbound (NameError); reference RAdam skips the
                    # update when rectification is unavailable and the SGD
                    # fallback is disabled.
                    continue

                if group['weight_decay'] != 0:
                    # BUG FIX: decay from the fp32 copy (the original used
                    # p.data, which may be half precision).
                    G_grad.add_(p_data_fp32, alpha=group['weight_decay'])

                if self.gc_loc == False:
                    G_grad = centralized_gradient(G_grad, use_gc=self.use_gc, gc_conv_only=self.gc_conv_only)

                # BUG FIX: step_size already contains group['lr'] (see above);
                # the original multiplied by group['lr'] a second time, making
                # the effective learning rate lr**2. The sibling RAdam class
                # applies lr exactly once.
                p_data_fp32.add_(G_grad, alpha=-step_size)
                p.data.copy_(p_data_fp32)

        return loss
class Ranger(Optimizer):
    """RAdam + Lookahead ("Ranger") with optional gradient centralization."""

    def __init__(self, params, lr=1e-3,                        # lr
                 alpha=0.5, k=6, N_sma_threshhold=5,           # Ranger options
                 betas=(.95, 0.999), eps=1e-5, weight_decay=0, # Adam options
                 # Gradient centralization on or off, applied to conv layers only or conv + fc layers
                 use_gc=False, gc_conv_only=False, gc_loc=False):

        # parameter checks
        if not 0.0 <= alpha <= 1.0:
            raise ValueError(f'Invalid slow update rate: {alpha}')
        if not 1 <= k:
            raise ValueError(f'Invalid lookahead steps: {k}')
        if not lr > 0:
            raise ValueError(f'Invalid Learning Rate: {lr}')
        if not eps > 0:
            raise ValueError(f'Invalid eps: {eps}')

        # parameter comments:
        # beta1 (momentum) of .95 seems to work better than .90...
        # N_sma_threshold of 5 seems better in testing than 4.
        # In both cases, worth testing on your dataset (.90 vs .95, 4 vs 5) to make sure which works best for you.

        # prep defaults and init torch.optim base
        defaults = dict(lr=lr, alpha=alpha, k=k, step_counter=0, betas=betas,
                        N_sma_threshhold=N_sma_threshhold, eps=eps, weight_decay=weight_decay)
        super().__init__(params, defaults)

        # adjustable threshold
        self.N_sma_threshhold = N_sma_threshhold

        # look ahead params
        self.alpha = alpha
        self.k = k

        # radam buffer for state (keyed by step mod 10)
        self.radam_buffer = [[None, None, None] for ind in range(10)]

        # gc on or off
        self.gc_loc = gc_loc
        self.use_gc = use_gc
        self.gc_conv_only = gc_conv_only
        # level of gradient centralization
        #self.gc_gradient_threshold = 3 if gc_conv_only else 1

        print(f"Ranger optimizer loaded. \nGradient Centralization usage = {self.use_gc}")
        if (self.use_gc and self.gc_conv_only == False):
            print(f"GC applied to both conv and fc layers")
        elif (self.use_gc and self.gc_conv_only == True):
            print(f"GC applied to conv layers only")

    def __setstate__(self, state):
        print("set state called")
        super(Ranger, self).__setstate__(state)

    def step(self, closure=None):
        """One RAdam step per parameter plus an integrated Lookahead sync
        every ``k`` steps. Always returns None (closure is ignored, see below).
        """
        loss = None

        # note - below is commented out b/c I have other work that passes back
        # the loss as a float, and thus not a callable closure.
        # Uncomment if you need to use the actual closure...

        # if closure is not None:
        #loss = closure()

        # Evaluate averages and grad, update param tensors
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                # fp32 working copies, even for half-precision params
                grad = p.grad.data.float()

                if grad.is_sparse:
                    raise RuntimeError(
                        'Ranger optimizer does not support sparse gradients')

                p_data_fp32 = p.data.float()

                state = self.state[p]  # get state dict for this param

                if len(state) == 0:  # if first time to run...init dictionary with our desired entries
                    # if self.first_run_check==0:
                    # self.first_run_check=1
                    #print("Initializing slow buffer...should not see this at load from saved model!")
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)

                    # look ahead weight storage now in state dict
                    state['slow_buffer'] = torch.empty_like(p.data)
                    state['slow_buffer'].copy_(p.data)

                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)

                # begin computations
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                # GC operation for Conv layers and FC layers
                #if grad.dim() > self.gc_gradient_threshold:
                #    grad.add_(-grad.mean(dim=tuple(range(1, grad.dim())), keepdim=True))
                if self.gc_loc:
                    grad = centralized_gradient(grad, use_gc=self.use_gc, gc_conv_only=self.gc_conv_only)

                state['step'] += 1

                # compute variance mov avg
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                # compute mean moving avg
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)

                # N_sma/step_size depend only on step count — cached mod 10
                buffered = self.radam_buffer[int(state['step'] % 10)]

                if state['step'] == buffered[0]:
                    N_sma, step_size = buffered[1], buffered[2]
                else:
                    buffered[0] = state['step']
                    beta2_t = beta2 ** state['step']
                    N_sma_max = 2 / (1 - beta2) - 1
                    N_sma = N_sma_max - 2 * \
                        state['step'] * beta2_t / (1 - beta2_t)
                    buffered[1] = N_sma
                    if N_sma > self.N_sma_threshhold:
                        # variance-rectified Adam step size
                        step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (
                            N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
                    else:
                        # SGD-style fallback while rectification is unreliable
                        step_size = 1.0 / (1 - beta1 ** state['step'])
                    buffered[2] = step_size

                #if group['weight_decay'] != 0:
                #    p_data_fp32.add_(-group['weight_decay']
                #                     * group['lr'], p_data_fp32)

                # apply lr
                if N_sma > self.N_sma_threshhold:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    G_grad = exp_avg / denom
                else:
                    G_grad = exp_avg
                if group['weight_decay'] != 0:
                    G_grad.add_(p_data_fp32, alpha=group['weight_decay'])
                # GC operation (on the final update direction when gc_loc off)
                if self.gc_loc == False:
                    G_grad = centralized_gradient(G_grad, use_gc=self.use_gc, gc_conv_only=self.gc_conv_only)
                p_data_fp32.add_(G_grad, alpha=-step_size * group['lr'])
                p.data.copy_(p_data_fp32)

                # integrated look ahead...
                # we do it at the param level instead of group level
                if state['step'] % group['k'] == 0:
                    # get access to slow param tensor
                    slow_p = state['slow_buffer']
                    # (fast weights - slow weights) * alpha
                    slow_p.add_(p.data - slow_p, alpha=self.alpha)
                    # copy interpolated weights to RAdam param tensor
                    p.data.copy_(slow_p)

        return loss
class SGD(Optimizer):
    r"""Stochastic gradient descent with momentum, Nesterov acceleration and
    optional gradient centralization (GC).

    The update follows torch.optim.SGD:

    .. math::
        \begin{aligned}
            v_{t+1} & = \mu * v_{t} + g_{t+1}, \\
            p_{t+1} & = p_{t} - \text{lr} * v_{t+1},
        \end{aligned}

    where the (weight-decayed) gradient is centralized before entering the
    momentum buffer.

    Args:
        params (iterable): parameters to optimize or dicts of param groups
        lr (float): learning rate
        momentum (float, optional): momentum factor (default: 0)
        dampening (float, optional): dampening for momentum (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        nesterov (bool, optional): enables Nesterov momentum (default: False)
        use_gc (bool, optional): enables gradient centralization (default: False)
        gc_conv_only (bool, optional): restrict GC to conv-shaped (>3-dim)
            gradients (default: False)
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False, use_gc=False, gc_conv_only=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov,
                        use_gc=use_gc, gc_conv_only=gc_conv_only)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGD, self).__setstate__(state)
        for group in self.param_groups:
            # checkpoints written before nesterov existed lack the key
            group.setdefault('nesterov', False)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            decay = group['weight_decay']
            mu = group['momentum']
            damp = group['dampening']
            use_nesterov = group['nesterov']

            for param in group['params']:
                if param.grad is None:
                    continue

                update = param.grad
                if decay != 0:
                    update = update.add(param, alpha=decay)

                # GC operation: centralize before the momentum accumulation
                update = centralized_gradient(update, use_gc=group['use_gc'],
                                              gc_conv_only=group['gc_conv_only'])

                if mu != 0:
                    pstate = self.state[param]
                    buf = pstate.get('momentum_buffer')
                    if buf is None:
                        # first step: buffer seeded with the raw update
                        # (no dampening applied), as in torch.optim.SGD
                        buf = pstate['momentum_buffer'] = torch.clone(update).detach()
                    else:
                        buf.mul_(mu).add_(update, alpha=1 - damp)
                    update = update.add(buf, alpha=mu) if use_nesterov else buf

                param.add_(update, alpha=-group['lr'])

        return loss
# ---- command-line options (continues the parser created above) ----
parser.add_argument('--bs', default=128, type=int, help='batchsize')
parser.add_argument('--wd', default=0.0005, type=float, help='weight decay')
parser.add_argument('--alg', default='sgd', type=str, help='algorithm')
parser.add_argument('--epochs', default=200, type=int, help='epochs')
parser.add_argument('--path', default='logout/result', type=str, help='path')
parser.add_argument('--model', default='r50', type=str, help='model')
parser.add_argument('--gpug', default=1, type=int, help='gpugroup')
args = parser.parse_args()

# ---- GPU selection: gpug 10..17 maps to a single visible CUDA device 0..7 ----
#os.environ["CUDA_VISIBLE_DEVICES"]="0"
if args.gpug==11:
    os.environ["CUDA_VISIBLE_DEVICES"]="1"
if args.gpug==12:
    os.environ["CUDA_VISIBLE_DEVICES"]="2"
if args.gpug==13:
    os.environ["CUDA_VISIBLE_DEVICES"]="3"
if args.gpug==14:
    os.environ["CUDA_VISIBLE_DEVICES"]="4"
if args.gpug==15:
    os.environ["CUDA_VISIBLE_DEVICES"]="5"
if args.gpug==16:
    os.environ["CUDA_VISIBLE_DEVICES"]="6"
if args.gpug==17:
    os.environ["CUDA_VISIBLE_DEVICES"]="7"
if args.gpug==10:
    os.environ["CUDA_VISIBLE_DEVICES"]="0"

epochs=args.epochs
device = 'cuda' if torch.cuda.is_available() else 'cpu'
best_acc = 0  # best test accuracy
start_epoch = 0  # start from epoch 0 or last checkpoint epoch

# ---- Data: CIFAR-100 with standard crop/flip augmentation ----
print('==> Preparing data..')
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.507, 0.487, 0.441), (0.267, 0.256, 0.276)),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.507, 0.487, 0.441), (0.267, 0.256, 0.276)),
])

# NOTE(review): dataset root is a hard-coded user path — adjust for your machine.
trainset = torchvision.datasets.CIFAR100(root='/home/yonghw/data/cifar100/', train=True, download=True, transform=transform_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.bs, shuffle=True, num_workers=4,drop_last=True)

testset = torchvision.datasets.CIFAR100(root='/home/yonghw/data/cifar100/', train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=128, shuffle=False, num_workers=4)

# ---- Model selection (no fallback: unknown --model leaves `net` undefined) ----
print('==> Building model..')
Num_classes = 100
if args.model=='r18':
    net = ResNet18(Num_classes=Num_classes)
if args.model=='r34':
    net = ResNet34(Num_classes=Num_classes)
if args.model=='r50':
    net = ResNet50(Num_classes=Num_classes)
if args.model=='r101':
    net = ResNet101(Num_classes=Num_classes)
if args.model=='v11':
    net = VGG('VGG11',Num_classes=Num_classes)
if args.model=='rx29':
    net = ResNeXt29_4x64d(Num_classes=Num_classes)
if args.model=='d121':
    net = DenseNet121(Num_classes=Num_classes)

if device == 'cuda':
    net = net.cuda()
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True

if args.resume:
    # Load checkpoint.
    print('==> Resuming from checkpoint..')
    assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!'
    checkpoint = torch.load('./checkpoint/ckpt.t7')
    net.load_state_dict(checkpoint['net'])
    best_acc = checkpoint['acc']
    start_epoch = checkpoint['epoch']

criterion = nn.CrossEntropyLoss()

# ---- Optimizer selection ----
# Naming: <alg>[GC|GCC] — GC = centralize conv+fc gradients, GCC = conv only;
# L-prefixed variants wrap the base optimizer in Lookahead(k=5, alpha=0.5).
# Adaptive optimizers use lr*0.01 (e.g. default 0.1 -> 1e-3).
#optimizer
WD=args.wd
print('==> choose optimizer..')
if args.alg=='sgd':
    optimizer = SGD(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD,use_gc=False, gc_conv_only=False)
if args.alg=='sgdGC':
    optimizer = SGD(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD,use_gc=True, gc_conv_only=False)
if args.alg=='sgdGCC':
    optimizer = SGD(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD,use_gc=True, gc_conv_only=True)
if args.alg=='adam':
    optimizer = Adam(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=False, gc_conv_only=False)
if args.alg=='adamGC':
    optimizer = Adam(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=True, gc_conv_only=False)
if args.alg=='adamGCC':
    optimizer = Adam(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=True, gc_conv_only=True)
if args.alg=='adamW':
    optimizer = AdamW(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=False, gc_conv_only=False)
if args.alg=='adamWGC':
    optimizer = AdamW(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=True, gc_conv_only=False)
if args.alg=='adamWGCC':
    optimizer = AdamW(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=True, gc_conv_only=True)
if args.alg=='radam':
    optimizer = RAdam(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=False, gc_conv_only=False)
if args.alg=='radamGC':
    optimizer = RAdam(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=True, gc_conv_only=False)
if args.alg=='radamGCC':
    optimizer = RAdam(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=True, gc_conv_only=True)
if args.alg=='Lsgd':
    base_opt = SGD(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD,use_gc=False, gc_conv_only=False)
    optimizer = Lookahead(base_opt, k=5, alpha=0.5)
if args.alg=='LsgdGC':
    base_opt = SGD(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD,use_gc=True, gc_conv_only=False)
    optimizer = Lookahead(base_opt, k=5, alpha=0.5)
if args.alg=='LsgdGCC':
    base_opt = SGD(net.parameters(), lr=args.lr, momentum=0.9,weight_decay = WD,use_gc=True, gc_conv_only=True)
    optimizer = Lookahead(base_opt, k=5, alpha=0.5)
if args.alg=='Ladam':
    base_opt = Adam(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=False, gc_conv_only=False)
    optimizer = Lookahead(base_opt, k=5, alpha=0.5)
if args.alg=='LadamGC':
    base_opt = Adam(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=True, gc_conv_only=False)
    optimizer = Lookahead(base_opt, k=5, alpha=0.5)
if args.alg=='LadamGCC':
    base_opt = Adam(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=True, gc_conv_only=True)
    optimizer = Lookahead(base_opt, k=5, alpha=0.5)
if args.alg=='Lradam':
    base_opt = RAdam(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=False, gc_conv_only=False)
    optimizer = Lookahead(base_opt, k=5, alpha=0.5)
if args.alg=='LradamGC':
    base_opt = RAdam(net.parameters(), lr=args.lr*0.01, weight_decay = WD,use_gc=True, gc_conv_only=False)
    optimizer = Lookahead(base_opt, k=5, alpha=0.5)
# Training
def train(epoch,net,optimizer):
    """Run one training epoch over `trainloader`; returns train accuracy (%).

    Uses module-level globals: trainloader, device, criterion.
    """
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
        # running loss/accuracy printed per batch, replacing the old
        # progress_bar (NOTE(review): per-batch placement inferred from the
        # commented progress_bar call below — confirm against upstream)
        print('Training: Loss: {:.4f} | Acc: {:.4f}'.format(train_loss/(batch_idx+1),correct/total))
        # progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
        #     % (train_loss/(batch_idx+1), 100.*correct/total, correct, total))
    acc=100.*correct/total
    return acc
# Testing
def test(epoch,net):
    """Evaluate `net` on `testloader`; checkpoint when best accuracy improves.

    Returns test accuracy (%). Reads/updates module-level global `best_acc`;
    also uses testloader, device, criterion.
    """
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            #progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
            #% (test_loss/(batch_idx+1), 100.*correct/total, correct, total))
            # NOTE(review): per-batch placement inferred from the commented
            # progress_bar above — confirm against upstream
            print('Testing:Loss: {:.4f} | Acc: {:.4f}'.format(test_loss/(batch_idx+1),correct/total) )

    # Save checkpoint.
    acc = 100.*correct/total
    if acc > best_acc:
        print('Saving..')
        state = {
            'net': net.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        if not os.path.isdir('checkpoint'):
            os.mkdir('checkpoint')
        torch.save(state, './checkpoint/ckpt.t7')
        best_acc = acc
    return acc
class DenseNet(nn.Module):
    """DenseNet for 32x32 inputs (CIFAR-style stem, 4 dense blocks).

    Args:
        block: layer class, called as block(in_planes, growth_rate); each layer
            concatenates growth_rate new channels onto its input.
        nblocks: list of 4 ints — layers per dense block.
        growth_rate: channels added by every layer.
        reduction: channel compression factor at each transition.
        num_classes: size of the final linear classifier.
    """

    def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10):
        super(DenseNet, self).__init__()
        self.growth_rate = growth_rate

        num_planes = 2*growth_rate
        self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, padding=1, bias=False)

        # Each dense block adds nblocks[i]*growth_rate channels; each
        # transition compresses channels by `reduction` and halves resolution.
        self.dense1 = self._make_dense_layers(block, num_planes, nblocks[0])
        num_planes += nblocks[0]*growth_rate
        out_planes = int(math.floor(num_planes*reduction))
        self.trans1 = Transition(num_planes, out_planes)
        num_planes = out_planes

        self.dense2 = self._make_dense_layers(block, num_planes, nblocks[1])
        num_planes += nblocks[1]*growth_rate
        out_planes = int(math.floor(num_planes*reduction))
        self.trans2 = Transition(num_planes, out_planes)
        num_planes = out_planes

        self.dense3 = self._make_dense_layers(block, num_planes, nblocks[2])
        num_planes += nblocks[2]*growth_rate
        out_planes = int(math.floor(num_planes*reduction))
        self.trans3 = Transition(num_planes, out_planes)
        num_planes = out_planes

        # last dense block has no transition after it
        self.dense4 = self._make_dense_layers(block, num_planes, nblocks[3])
        num_planes += nblocks[3]*growth_rate

        self.bn = nn.BatchNorm2d(num_planes)
        self.linear = nn.Linear(num_planes, num_classes)

    def _make_dense_layers(self, block, in_planes, nblock):
        """Stack nblock layers; channel count grows by growth_rate per layer."""
        layers = []
        for i in range(nblock):
            layers.append(block(in_planes, self.growth_rate))
            in_planes += self.growth_rate
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv1(x)
        out = self.trans1(self.dense1(out))
        out = self.trans2(self.dense2(out))
        out = self.trans3(self.dense3(out))
        out = self.dense4(out)
        out = F.avg_pool2d(F.relu(self.bn(out)), 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


# BUG FIX: the factory functions took no arguments, but main.py calls them as
# DenseNet121(Num_classes=Num_classes) — that raised TypeError. A keyword
# parameter with the previous default (10) keeps old callers working.
def DenseNet121(Num_classes=10):
    return DenseNet(Bottleneck, [6,12,24,16], growth_rate=32, num_classes=Num_classes)

def DenseNet169(Num_classes=10):
    return DenseNet(Bottleneck, [6,12,32,32], growth_rate=32, num_classes=Num_classes)

def DenseNet201(Num_classes=10):
    return DenseNet(Bottleneck, [6,12,48,32], growth_rate=32, num_classes=Num_classes)
'''Dual Path Networks in PyTorch.'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class Bottleneck(nn.Module):
    """DPN bottleneck combining a residual path and a densely connected path.

    The conv output has `out_planes + dense_depth` channels: the first
    `out_planes` are added to the shortcut (residual), the remaining
    `dense_depth` are concatenated (dense).
    """

    def __init__(self, last_planes, in_planes, out_planes, dense_depth, stride, first_layer):
        super(Bottleneck, self).__init__()
        self.out_planes = out_planes
        self.dense_depth = dense_depth

        self.conv1 = nn.Conv2d(last_planes, in_planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(in_planes)
        # Grouped 3x3 conv (cardinality 32), as in ResNeXt.
        self.conv2 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride,
                               padding=1, groups=32, bias=False)
        self.bn2 = nn.BatchNorm2d(in_planes)
        self.conv3 = nn.Conv2d(in_planes, out_planes + dense_depth, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes + dense_depth)

        # Only the first block of a stage needs to project the shortcut.
        self.shortcut = nn.Sequential()
        if first_layer:
            self.shortcut = nn.Sequential(
                nn.Conv2d(last_planes, out_planes + dense_depth,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes + dense_depth),
            )

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)))
        y = F.relu(self.bn2(self.conv2(y)))
        y = self.bn3(self.conv3(y))
        sc = self.shortcut(x)
        d = self.out_planes
        # Residual add on the first d channels; dense concat on the rest.
        y = torch.cat([sc[:, :d, :, :] + y[:, :d, :, :],
                       sc[:, d:, :, :],
                       y[:, d:, :, :]], 1)
        return F.relu(y)


class DPN(nn.Module):
    """Dual Path Network for 32x32 inputs; `cfg` supplies per-stage widths."""

    def __init__(self, cfg):
        super(DPN, self).__init__()
        in_planes, out_planes = cfg['in_planes'], cfg['out_planes']
        num_blocks, dense_depth = cfg['num_blocks'], cfg['dense_depth']

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.last_planes = 64
        self.layer1 = self._make_layer(in_planes[0], out_planes[0], num_blocks[0], dense_depth[0], stride=1)
        self.layer2 = self._make_layer(in_planes[1], out_planes[1], num_blocks[1], dense_depth[1], stride=2)
        self.layer3 = self._make_layer(in_planes[2], out_planes[2], num_blocks[2], dense_depth[2], stride=2)
        self.layer4 = self._make_layer(in_planes[3], out_planes[3], num_blocks[3], dense_depth[3], stride=2)
        # Final width = residual planes + all dense features accumulated in stage 4.
        self.linear = nn.Linear(out_planes[3] + (num_blocks[3] + 1) * dense_depth[3], 10)

    def _make_layer(self, in_planes, out_planes, num_blocks, dense_depth, stride):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for i, s in enumerate(strides):
            layers.append(Bottleneck(self.last_planes, in_planes, out_planes,
                                     dense_depth, s, i == 0))
            # Dense path grows by dense_depth per block.
            self.last_planes = out_planes + (i + 2) * dense_depth
        return nn.Sequential(*layers)

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)))
        y = self.layer1(y)
        y = self.layer2(y)
        y = self.layer3(y)
        y = self.layer4(y)
        y = F.avg_pool2d(y, 4)
        y = y.view(y.size(0), -1)
        return self.linear(y)


def DPN26():
    cfg = {
        'in_planes': (96, 192, 384, 768),
        'out_planes': (256, 512, 1024, 2048),
        'num_blocks': (2, 2, 2, 2),
        'dense_depth': (16, 32, 24, 128)
    }
    return DPN(cfg)

def DPN92():
    cfg = {
        'in_planes': (96, 192, 384, 768),
        'out_planes': (256, 512, 1024, 2048),
        'num_blocks': (3, 4, 20, 3),
        'dense_depth': (16, 32, 24, 128)
    }
    return DPN(cfg)

def test():
    net = DPN92()
    x = torch.randn(1, 3, 32, 32)
    y = net(x)
    print(y)

# test()
'''GoogLeNet with PyTorch.'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class Inception(nn.Module):
    """Inception module: four parallel branches concatenated on channels.

    Output channels = n1x1 + n3x3 + n5x5 + pool_planes.
    Note: the "5x5" branch is implemented as two stacked 3x3 convs.
    """

    def __init__(self, in_planes, n1x1, n3x3red, n3x3, n5x5red, n5x5, pool_planes):
        super(Inception, self).__init__()
        # 1x1 conv branch
        self.b1 = nn.Sequential(
            nn.Conv2d(in_planes, n1x1, kernel_size=1),
            nn.BatchNorm2d(n1x1),
            nn.ReLU(True),
        )
        # 1x1 reduce -> 3x3 conv branch
        self.b2 = nn.Sequential(
            nn.Conv2d(in_planes, n3x3red, kernel_size=1),
            nn.BatchNorm2d(n3x3red),
            nn.ReLU(True),
            nn.Conv2d(n3x3red, n3x3, kernel_size=3, padding=1),
            nn.BatchNorm2d(n3x3),
            nn.ReLU(True),
        )
        # 1x1 reduce -> "5x5" (two 3x3) conv branch
        self.b3 = nn.Sequential(
            nn.Conv2d(in_planes, n5x5red, kernel_size=1),
            nn.BatchNorm2d(n5x5red),
            nn.ReLU(True),
            nn.Conv2d(n5x5red, n5x5, kernel_size=3, padding=1),
            nn.BatchNorm2d(n5x5),
            nn.ReLU(True),
            nn.Conv2d(n5x5, n5x5, kernel_size=3, padding=1),
            nn.BatchNorm2d(n5x5),
            nn.ReLU(True),
        )
        # 3x3 max-pool -> 1x1 conv branch
        self.b4 = nn.Sequential(
            nn.MaxPool2d(3, stride=1, padding=1),
            nn.Conv2d(in_planes, pool_planes, kernel_size=1),
            nn.BatchNorm2d(pool_planes),
            nn.ReLU(True),
        )

    def forward(self, x):
        branches = (self.b1(x), self.b2(x), self.b3(x), self.b4(x))
        return torch.cat(list(branches), 1)


class GoogLeNet(nn.Module):
    """GoogLeNet adapted for 32x32 CIFAR inputs (no stem downsampling)."""

    def __init__(self):
        super(GoogLeNet, self).__init__()
        self.pre_layers = nn.Sequential(
            nn.Conv2d(3, 192, kernel_size=3, padding=1),
            nn.BatchNorm2d(192),
            nn.ReLU(True),
        )
        self.a3 = Inception(192, 64, 96, 128, 16, 32, 32)
        self.b3 = Inception(256, 128, 128, 192, 32, 96, 64)
        self.maxpool = nn.MaxPool2d(3, stride=2, padding=1)
        self.a4 = Inception(480, 192, 96, 208, 16, 48, 64)
        self.b4 = Inception(512, 160, 112, 224, 24, 64, 64)
        self.c4 = Inception(512, 128, 128, 256, 24, 64, 64)
        self.d4 = Inception(512, 112, 144, 288, 32, 64, 64)
        self.e4 = Inception(528, 256, 160, 320, 32, 128, 128)
        self.a5 = Inception(832, 256, 160, 320, 32, 128, 128)
        self.b5 = Inception(832, 384, 192, 384, 48, 128, 128)
        self.avgpool = nn.AvgPool2d(8, stride=1)
        self.linear = nn.Linear(1024, 10)

    def forward(self, x):
        y = self.pre_layers(x)
        y = self.a3(y)
        y = self.b3(y)
        y = self.maxpool(y)
        y = self.a4(y)
        y = self.b4(y)
        y = self.c4(y)
        y = self.d4(y)
        y = self.e4(y)
        y = self.maxpool(y)
        y = self.a5(y)
        y = self.b5(y)
        y = self.avgpool(y)
        y = y.view(y.size(0), -1)
        return self.linear(y)


def test():
    net = GoogLeNet()
    x = torch.randn(1, 3, 32, 32)
    y = net(x)
    print(y.size())

# test()
'''LeNet in PyTorch.'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class LeNet(nn.Module):
    """Classic LeNet-5-style CNN for 32x32 RGB inputs, 10 classes."""

    def __init__(self):
        super(LeNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        y = F.max_pool2d(F.relu(self.conv1(x)), 2)
        y = F.max_pool2d(F.relu(self.conv2(y)), 2)
        y = y.view(y.size(0), -1)
        y = F.relu(self.fc1(y))
        y = F.relu(self.fc2(y))
        return self.fc3(y)


'''MobileNet in PyTorch.

See the paper "MobileNets: Efficient Convolutional Neural Networks for
Mobile Vision Applications" for more details.
'''


class Block(nn.Module):
    '''Depthwise conv + Pointwise conv'''

    def __init__(self, in_planes, out_planes, stride=1):
        super(Block, self).__init__()
        # Depthwise: one filter per input channel (groups == in_planes).
        self.conv1 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride,
                               padding=1, groups=in_planes, bias=False)
        self.bn1 = nn.BatchNorm2d(in_planes)
        # Pointwise: 1x1 conv mixes channels.
        self.conv2 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1,
                               padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)))
        return F.relu(self.bn2(self.conv2(y)))


class MobileNet(nn.Module):
    # (128,2) means conv planes=128, conv stride=2; a bare int means stride 1.
    cfg = [64, (128, 2), 128, (256, 2), 256, (512, 2),
           512, 512, 512, 512, 512, (1024, 2), 1024]

    def __init__(self, num_classes=10):
        super(MobileNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.layers = self._make_layers(in_planes=32)
        self.linear = nn.Linear(1024, num_classes)

    def _make_layers(self, in_planes):
        layers = []
        for spec in self.cfg:
            out_planes = spec if isinstance(spec, int) else spec[0]
            stride = 1 if isinstance(spec, int) else spec[1]
            layers.append(Block(in_planes, out_planes, stride))
            in_planes = out_planes
        return nn.Sequential(*layers)

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)))
        y = self.layers(y)
        y = F.avg_pool2d(y, 2)
        y = y.view(y.size(0), -1)
        return self.linear(y)


def test():
    net = MobileNet()
    x = torch.randn(1, 3, 32, 32)
    y = net(x)
    print(y.size())

# test()
'''MobileNetV2 in PyTorch.

See the paper "Inverted Residuals and Linear Bottlenecks: Mobile Networks for
Classification, Detection and Segmentation" for more details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class Block(nn.Module):
    '''expand + depthwise + pointwise'''

    def __init__(self, in_planes, out_planes, expansion, stride):
        super(Block, self).__init__()
        self.stride = stride
        planes = expansion * in_planes

        # 1x1 expansion
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        # 3x3 depthwise
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, groups=planes, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        # 1x1 linear projection (no activation afterwards)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        # Shortcut only for stride-1 blocks; project if channel counts differ.
        self.shortcut = nn.Sequential()
        if stride == 1 and in_planes != out_planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False),
                nn.BatchNorm2d(out_planes),
            )

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)))
        y = F.relu(self.bn2(self.conv2(y)))
        y = self.bn3(self.conv3(y))
        if self.stride == 1:
            y = y + self.shortcut(x)
        return y


class MobileNetV2(nn.Module):
    # (expansion, out_planes, num_blocks, stride)
    cfg = [(1,  16, 1, 1),
           (6,  24, 2, 1),  # NOTE: change stride 2 -> 1 for CIFAR10
           (6,  32, 3, 2),
           (6,  64, 4, 2),
           (6,  96, 3, 1),
           (6, 160, 3, 2),
           (6, 320, 1, 1)]

    def __init__(self, num_classes=10):
        super(MobileNetV2, self).__init__()
        # NOTE: change conv1 stride 2 -> 1 for CIFAR10
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.layers = self._make_layers(in_planes=32)
        self.conv2 = nn.Conv2d(320, 1280, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(1280)
        self.linear = nn.Linear(1280, num_classes)

    def _make_layers(self, in_planes):
        layers = []
        for expansion, out_planes, num_blocks, stride in self.cfg:
            # First block of a stage may downsample; the rest are stride 1.
            for s in [stride] + [1] * (num_blocks - 1):
                layers.append(Block(in_planes, out_planes, expansion, s))
                in_planes = out_planes
        return nn.Sequential(*layers)

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)))
        y = self.layers(y)
        y = F.relu(self.bn2(self.conv2(y)))
        # NOTE: change pooling kernel_size 7 -> 4 for CIFAR10
        y = F.avg_pool2d(y, 4)
        y = y.view(y.size(0), -1)
        return self.linear(y)


def test():
    net = MobileNetV2()
    x = torch.randn(2, 3, 32, 32)
    y = net(x)
    print(y.size())

# test()
'''PNASNet in PyTorch.

Paper: Progressive Neural Architecture Search
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class SepConv(nn.Module):
    '''Separable Convolution.'''

    def __init__(self, in_planes, out_planes, kernel_size, stride):
        super(SepConv, self).__init__()
        # Depthwise-grouped conv (groups == in_planes) followed by BN.
        self.conv1 = nn.Conv2d(in_planes, out_planes,
                               kernel_size, stride,
                               padding=(kernel_size - 1) // 2,
                               bias=False, groups=in_planes)
        self.bn1 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        return self.bn1(self.conv1(x))


class CellA(nn.Module):
    """Cell with a separable-conv branch and a max-pool branch, summed."""

    def __init__(self, in_planes, out_planes, stride=1):
        super(CellA, self).__init__()
        self.stride = stride
        self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride)
        if stride == 2:
            # Pool branch must also be projected to out_planes when downsampling.
            self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
            self.bn1 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        y1 = self.sep_conv1(x)
        y2 = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1)
        if self.stride == 2:
            y2 = self.bn1(self.conv1(y2))
        return F.relu(y1 + y2)


class CellB(nn.Module):
    """Cell with two summed branches, concatenated then reduced by 1x1 conv."""

    def __init__(self, in_planes, out_planes, stride=1):
        super(CellB, self).__init__()
        self.stride = stride
        # Left branch
        self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride)
        self.sep_conv2 = SepConv(in_planes, out_planes, kernel_size=3, stride=stride)
        # Right branch
        self.sep_conv3 = SepConv(in_planes, out_planes, kernel_size=5, stride=stride)
        if stride == 2:
            self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
            self.bn1 = nn.BatchNorm2d(out_planes)
        # Reduce channels back to out_planes after concatenation.
        self.conv2 = nn.Conv2d(2 * out_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        # Left branch
        y1 = self.sep_conv1(x)
        y2 = self.sep_conv2(x)
        # Right branch
        y3 = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1)
        if self.stride == 2:
            y3 = self.bn1(self.conv1(y3))
        y4 = self.sep_conv3(x)
        # Concat & reduce channels
        b1 = F.relu(y1 + y2)
        b2 = F.relu(y3 + y4)
        y = torch.cat([b1, b2], 1)
        return F.relu(self.bn2(self.conv2(y)))


class PNASNet(nn.Module):
    """Stack of cells in three stages with downsampling cells in between.

    Args:
        cell_type: cell class to instantiate (CellA or CellB).
        num_cells: number of cells per stage.
        num_planes: base channel count (doubled at each downsample).
    """

    def __init__(self, cell_type, num_cells, num_planes):
        super(PNASNet, self).__init__()
        self.in_planes = num_planes
        self.cell_type = cell_type

        self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(num_planes)

        # BUGFIX: the original hard-coded num_cells=6 here, silently ignoring
        # the constructor argument; pass it through instead (all existing
        # callers pass 6, so behavior is unchanged for them).
        self.layer1 = self._make_layer(num_planes, num_cells=num_cells)
        self.layer2 = self._downsample(num_planes * 2)
        self.layer3 = self._make_layer(num_planes * 2, num_cells=num_cells)
        self.layer4 = self._downsample(num_planes * 4)
        self.layer5 = self._make_layer(num_planes * 4, num_cells=num_cells)

        self.linear = nn.Linear(num_planes * 4, 10)

    def _make_layer(self, planes, num_cells):
        # Stage of stride-1 cells at a fixed width.
        layers = []
        for _ in range(num_cells):
            layers.append(self.cell_type(self.in_planes, planes, stride=1))
            self.in_planes = planes
        return nn.Sequential(*layers)

    def _downsample(self, planes):
        # Single stride-2 cell that doubles the width.
        layer = self.cell_type(self.in_planes, planes, stride=2)
        self.in_planes = planes
        return layer

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.layer5(out)
        out = F.avg_pool2d(out, 8)
        out = self.linear(out.view(out.size(0), -1))
        return out


def PNASNetA():
    return PNASNet(CellA, num_cells=6, num_planes=44)

def PNASNetB():
    return PNASNet(CellB, num_cells=6, num_planes=32)


def test():
    net = PNASNetB()
    x = torch.randn(1, 3, 32, 32)
    y = net(x)
    print(y)

# test()
'''Pre-activation ResNet in PyTorch.

Reference:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
    Identity Mappings in Deep Residual Networks. arXiv:1603.05027
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class PreActBlock(nn.Module):
    '''Pre-activation version of the BasicBlock.'''
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(PreActBlock, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)

        # Shortcut attribute exists only when a projection is needed; the
        # forward pass checks for it with hasattr.
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False)
            )

    def forward(self, x):
        pre = F.relu(self.bn1(x))
        # Projection shortcut uses the pre-activated input; identity uses x.
        sc = self.shortcut(pre) if hasattr(self, 'shortcut') else x
        y = self.conv1(pre)
        y = self.conv2(F.relu(self.bn2(y)))
        y += sc
        return y


class PreActBottleneck(nn.Module):
    '''Pre-activation version of the original Bottleneck module.'''
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(PreActBottleneck, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False)

        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False)
            )

    def forward(self, x):
        pre = F.relu(self.bn1(x))
        sc = self.shortcut(pre) if hasattr(self, 'shortcut') else x
        y = self.conv1(pre)
        y = self.conv2(F.relu(self.bn2(y)))
        y = self.conv3(F.relu(self.bn3(y)))
        y += sc
        return y


class PreActResNet(nn.Module):
    """Pre-activation ResNet for 32x32 inputs."""

    def __init__(self, block, num_blocks, num_classes=10):
        super(PreActResNet, self).__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        layers = []
        for s in [stride] + [1] * (num_blocks - 1):
            layers.append(block(self.in_planes, planes, s))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        y = self.conv1(x)
        y = self.layer1(y)
        y = self.layer2(y)
        y = self.layer3(y)
        y = self.layer4(y)
        y = F.avg_pool2d(y, 4)
        y = y.view(y.size(0), -1)
        return self.linear(y)


def PreActResNet18():
    return PreActResNet(PreActBlock, [2, 2, 2, 2])

def PreActResNet34():
    return PreActResNet(PreActBlock, [3, 4, 6, 3])

def PreActResNet50():
    return PreActResNet(PreActBottleneck, [3, 4, 6, 3])

def PreActResNet101():
    return PreActResNet(PreActBottleneck, [3, 4, 23, 3])

def PreActResNet152():
    return PreActResNet(PreActBottleneck, [3, 8, 36, 3])


def test():
    net = PreActResNet18()
    y = net((torch.randn(1, 3, 32, 32)))
    print(y.size())

# test()
'''ResNet in PyTorch.

For Pre-activation ResNet, see 'preact_resnet.py'.

Reference:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
    Deep Residual Learning for Image Recognition. arXiv:1512.03385
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicBlock(nn.Module):
    """Two 3x3 convs with a residual connection (post-activation)."""
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # Projection shortcut when spatial size or channels change.
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion * planes),
            )

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)))
        y = self.bn2(self.conv2(y))
        y += self.shortcut(x)
        return F.relu(y)


class Bottleneck(nn.Module):
    """1x1 reduce, 3x3, 1x1 expand (x4) with a residual connection."""
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.expansion * planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion * planes),
            )

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)))
        y = F.relu(self.bn2(self.conv2(y)))
        y = self.bn3(self.conv3(y))
        y += self.shortcut(x)
        return F.relu(y)


class ResNet(nn.Module):
    """ResNet for 32x32 inputs: four stages of `num_blocks` residual blocks."""

    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        layers = []
        for s in [stride] + [1] * (num_blocks - 1):
            layers.append(block(self.in_planes, planes, s))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)))
        y = self.layer1(y)
        y = self.layer2(y)
        y = self.layer3(y)
        y = self.layer4(y)
        y = F.avg_pool2d(y, 4)
        y = y.view(y.size(0), -1)
        return self.linear(y)


def ResNet18(Num_classes=10):
    return ResNet(BasicBlock, [2, 2, 2, 2], num_classes=Num_classes)

def ResNet34(Num_classes=10):
    return ResNet(BasicBlock, [3, 4, 6, 3], num_classes=Num_classes)

def ResNet50(Num_classes=10):
    return ResNet(Bottleneck, [3, 4, 6, 3], num_classes=Num_classes)

def ResNet101(Num_classes=10):
    return ResNet(Bottleneck, [3, 4, 23, 3], num_classes=Num_classes)

def ResNet152(Num_classes=10):
    return ResNet(Bottleneck, [3, 8, 36, 3], num_classes=Num_classes)


def test():
    net = ResNet18()
    y = net(torch.randn(1, 3, 32, 32))
    print(y.size())

# test()
'''ResNeXt in PyTorch.

See the paper "Aggregated Residual Transformations for Deep Neural Networks"
for more details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class Block(nn.Module):
    '''Grouped convolution block.'''
    expansion = 2

    def __init__(self, in_planes, cardinality=32, bottleneck_width=4, stride=1):
        super(Block, self).__init__()
        group_width = cardinality * bottleneck_width
        self.conv1 = nn.Conv2d(in_planes, group_width, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(group_width)
        # The aggregated transformation: grouped 3x3 conv.
        self.conv2 = nn.Conv2d(group_width, group_width, kernel_size=3, stride=stride,
                               padding=1, groups=cardinality, bias=False)
        self.bn2 = nn.BatchNorm2d(group_width)
        self.conv3 = nn.Conv2d(group_width, self.expansion * group_width, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.expansion * group_width)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * group_width:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * group_width, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion * group_width),
            )

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)))
        y = F.relu(self.bn2(self.conv2(y)))
        y = self.bn3(self.conv3(y))
        y += self.shortcut(x)
        return F.relu(y)


class ResNeXt(nn.Module):
    """Three-stage ResNeXt for 32x32 inputs (layer4 intentionally disabled)."""

    def __init__(self, num_blocks, cardinality, bottleneck_width, num_classes=10):
        super(ResNeXt, self).__init__()
        self.cardinality = cardinality
        self.bottleneck_width = bottleneck_width
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(num_blocks[0], 1)
        self.layer2 = self._make_layer(num_blocks[1], 2)
        self.layer3 = self._make_layer(num_blocks[2], 2)
        # self.layer4 = self._make_layer(num_blocks[3], 2)
        self.linear = nn.Linear(cardinality * bottleneck_width * 8, num_classes)

    def _make_layer(self, num_blocks, stride):
        layers = []
        for s in [stride] + [1] * (num_blocks - 1):
            layers.append(Block(self.in_planes, self.cardinality, self.bottleneck_width, s))
            self.in_planes = Block.expansion * self.cardinality * self.bottleneck_width
        # Increase bottleneck_width by 2 after each stage.
        self.bottleneck_width *= 2
        return nn.Sequential(*layers)

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)))
        y = self.layer1(y)
        y = self.layer2(y)
        y = self.layer3(y)
        # y = self.layer4(y)
        y = F.avg_pool2d(y, 8)
        y = y.view(y.size(0), -1)
        return self.linear(y)


def ResNeXt29_2x64d():
    return ResNeXt(num_blocks=[3, 3, 3], cardinality=2, bottleneck_width=64)

def ResNeXt29_4x64d():
    return ResNeXt(num_blocks=[3, 3, 3], cardinality=4, bottleneck_width=64)

def ResNeXt29_8x64d():
    return ResNeXt(num_blocks=[3, 3, 3], cardinality=8, bottleneck_width=64)

def ResNeXt29_32x4d():
    return ResNeXt(num_blocks=[3, 3, 3], cardinality=32, bottleneck_width=4)


def test_resnext():
    net = ResNeXt29_2x64d()
    x = torch.randn(1, 3, 32, 32)
    y = net(x)
    print(y.size())

# test_resnext()
'''SENet in PyTorch.

SENet is the winner of ImageNet-2017. The paper is not released yet.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicBlock(nn.Module):
    """ResNet basic block with a squeeze-and-excitation (SE) gate.

    The SE path global-average-pools the block output, passes it through a
    two-layer bottleneck (reduction 16), and rescales channels by a sigmoid
    gate before the residual addition.
    """

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes)
            )

        # SE layers (1x1 convs act as per-channel fully connected layers).
        self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1)  # Use nn.Conv2d instead of nn.Linear
        self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # Squeeze: global average pool to 1x1 per channel.
        w = F.avg_pool2d(out, out.size(2))
        w = F.relu(self.fc1(w))
        # FIX: F.sigmoid is deprecated; torch.sigmoid is numerically identical.
        w = torch.sigmoid(self.fc2(w))
        # Excitation: channel-wise rescaling (broadcast over H, W).
        out = out * w

        out += self.shortcut(x)
        out = F.relu(out)
        return out


class PreActBlock(nn.Module):
    """Pre-activation residual block with an SE gate."""

    def __init__(self, in_planes, planes, stride=1):
        super(PreActBlock, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)

        # Shortcut attribute only exists when projection is needed (forward
        # checks with hasattr).
        if stride != 1 or in_planes != planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False)
            )

        # SE layers
        self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1)
        self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1)

    def forward(self, x):
        out = F.relu(self.bn1(x))
        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
        out = self.conv1(out)
        out = self.conv2(F.relu(self.bn2(out)))

        # Squeeze
        w = F.avg_pool2d(out, out.size(2))
        w = F.relu(self.fc1(w))
        # FIX: F.sigmoid is deprecated; torch.sigmoid is numerically identical.
        w = torch.sigmoid(self.fc2(w))
        # Excitation
        out = out * w

        out += shortcut
        return out


class SENet(nn.Module):
    """SE-ResNet for 32x32 inputs: four stages of SE residual blocks."""

    def __init__(self, block, num_blocks, num_classes=10):
        super(SENet, self).__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block,  64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def SENet18():
    return SENet(PreActBlock, [2,2,2,2])


def test():
    net = SENet18()
    y = net(torch.randn(1,3,32,32))
    print(y.size())

# test()
'''ShuffleNet in PyTorch.

See the paper "ShuffleNet: An Extremely Efficient Convolutional Neural Network
for Mobile Devices" for more details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class ShuffleBlock(nn.Module):
    """Channel shuffle for grouped convolutions."""

    def __init__(self, groups):
        super(ShuffleBlock, self).__init__()
        self.groups = groups

    def forward(self, x):
        '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]'''
        N, C, H, W = x.size()
        g = self.groups
        # BUGFIX: under Python 3, C/g is a float and tensor.view raises a
        # TypeError; use floor division (C is always a multiple of g here).
        return x.view(N, g, C // g, H, W).permute(0, 2, 1, 3, 4).contiguous().view(N, C, H, W)


class Bottleneck(nn.Module):
    """ShuffleNet unit: grouped 1x1, shuffle, depthwise 3x3, grouped 1x1.

    Stride-2 units concatenate an average-pooled shortcut instead of adding.
    """

    def __init__(self, in_planes, out_planes, stride, groups):
        super(Bottleneck, self).__init__()
        self.stride = stride

        # BUGFIX: out_planes/4 is a float in Python 3 and nn.Conv2d rejects
        # non-integer channel counts; use floor division.
        mid_planes = out_planes // 4
        # The very first stage input (24 channels) is too small to group.
        g = 1 if in_planes == 24 else groups
        self.conv1 = nn.Conv2d(in_planes, mid_planes, kernel_size=1, groups=g, bias=False)
        self.bn1 = nn.BatchNorm2d(mid_planes)
        self.shuffle1 = ShuffleBlock(groups=g)
        self.conv2 = nn.Conv2d(mid_planes, mid_planes, kernel_size=3, stride=stride,
                               padding=1, groups=mid_planes, bias=False)
        self.bn2 = nn.BatchNorm2d(mid_planes)
        self.conv3 = nn.Conv2d(mid_planes, out_planes, kernel_size=1, groups=groups, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        self.shortcut = nn.Sequential()
        if stride == 2:
            self.shortcut = nn.Sequential(nn.AvgPool2d(3, stride=2, padding=1))

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.shuffle1(out)
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        res = self.shortcut(x)
        # Downsampling units concatenate the shortcut; others add it.
        out = F.relu(torch.cat([out, res], 1)) if self.stride == 2 else F.relu(out + res)
        return out


class ShuffleNet(nn.Module):
    """ShuffleNet for 32x32 inputs; `cfg` supplies widths, depths and groups."""

    def __init__(self, cfg):
        super(ShuffleNet, self).__init__()
        out_planes = cfg['out_planes']
        num_blocks = cfg['num_blocks']
        groups = cfg['groups']

        self.conv1 = nn.Conv2d(3, 24, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(24)
        self.in_planes = 24
        self.layer1 = self._make_layer(out_planes[0], num_blocks[0], groups)
        self.layer2 = self._make_layer(out_planes[1], num_blocks[1], groups)
        self.layer3 = self._make_layer(out_planes[2], num_blocks[2], groups)
        self.linear = nn.Linear(out_planes[2], 10)

    def _make_layer(self, out_planes, num_blocks, groups):
        layers = []
        for i in range(num_blocks):
            stride = 2 if i == 0 else 1
            # First (stride-2) unit concatenates the shortcut, so its conv
            # path only needs to produce out_planes - in_planes channels.
            cat_planes = self.in_planes if i == 0 else 0
            layers.append(Bottleneck(self.in_planes, out_planes - cat_planes,
                                     stride=stride, groups=groups))
            self.in_planes = out_planes
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def ShuffleNetG2():
    cfg = {
        'out_planes': [200,400,800],
        'num_blocks': [4,8,4],
        'groups': 2
    }
    return ShuffleNet(cfg)

def ShuffleNetG3():
    cfg = {
        'out_planes': [240,480,960],
        'num_blocks': [4,8,4],
        'groups': 3
    }
    return ShuffleNet(cfg)


def test():
    net = ShuffleNetG2()
    x = torch.randn(1,3,32,32)
    y = net(x)
    print(y)

# test()
'''VGG11/13/16/19 in Pytorch.'''
import torch
import torch.nn as nn


# Per-variant layer plans: ints are conv output widths, 'M' is a 2x2 max-pool.
cfg = {
    'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}


class VGG(nn.Module):
    """VGG network (BatchNorm variant) for 32x32 inputs.

    Args:
        vgg_name (str): one of 'VGG11', 'VGG13', 'VGG16', 'VGG19'.
        Num_classes (int): size of the final linear classifier (default 100).
    """

    def __init__(self, vgg_name, Num_classes=100):
        super(VGG, self).__init__()
        self.features = self._make_layers(cfg[vgg_name])
        self.classifier = nn.Linear(512, Num_classes)

    def forward(self, x):
        feats = self.features(x)
        flat = feats.view(feats.size(0), -1)
        return self.classifier(flat)

    def _make_layers(self, layer_cfg):
        """Build the convolutional feature extractor from a layer plan."""
        layers = []
        in_channels = 3
        for spec in layer_cfg:
            if spec == 'M':
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
            else:
                layers.append(nn.Conv2d(in_channels, spec, kernel_size=3, padding=1))
                layers.append(nn.BatchNorm2d(spec))
                layers.append(nn.ReLU(inplace=True))
                in_channels = spec
        # Trailing 1x1 average pool is an identity op, kept for state-dict
        # compatibility with the original definition.
        layers.append(nn.AvgPool2d(kernel_size=1, stride=1))
        return nn.Sequential(*layers)


def test():
    net = VGG('VGG11')
    x = torch.randn(2, 3, 32, 32)
    y = net(x)
    print(y.size())

# test()
line 61, in forward out = self.bn3(self.conv3(out)) File "/home/yonghw/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__ result = self.forward(*input, **kwargs) File "/home/yonghw/anaconda3/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 349, in forward return self._conv_forward(input, self.weight) File "/home/yonghw/anaconda3/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 346, in _conv_forward self.padding, self.dilation, self.groups) RuntimeError: CUDA out of memory. Tried to allocate 128.00 MiB (GPU 0; 15.90 GiB total capacity; 1.06 GiB already allocated; 31.38 MiB free; 1.23 GiB reserved in total by PyTorch) Terminated ================================================ FILE: algorithm-GC/cifar/os_run.py ================================================ #cifar100 e200 bs128 gs 2,4,8,16 import os,time ############################# #r18 ############## #### sgd #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgd --epochs 200 --model r18 --gpug 10 > logout/r18_lr11_wd45_sgd_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgd --epochs 200 --model r18 --gpug 11 > logout/r18_lr11_wd45_sgd_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgd --epochs 200 --model r18 --gpug 12 > logout/r18_lr11_wd45_sgd_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgd --epochs 200 --model r18 --gpug 13 > logout/r18_lr11_wd45_sgd_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgd --epochs 200 --model r18 --gpug 14 > logout/r18_lr11_wd45_sgd_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgd --epochs 200 --model r18 --gpug 15 > logout/r18_lr11_wd45_sgd_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgd --epochs 200 --model r18 --gpug 16 > logout/r18_lr11_wd45_sgd_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgd --epochs 200 --model r18 --gpug 17 > 
logout/r18_lr11_wd45_sgd_8.log ") #time.sleep(500) # #### sgdGC #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGC --epochs 200 --model r18 --gpug 10 > logout/r18_lr11_wd45_sgdGC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGC --epochs 200 --model r18 --gpug 11 > logout/r18_lr11_wd45_sgdGC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGC --epochs 200 --model r18 --gpug 12 > logout/r18_lr11_wd45_sgdGC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGC --epochs 200 --model r18 --gpug 13 > logout/r18_lr11_wd45_sgdGC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGC --epochs 200 --model r18 --gpug 14 > logout/r18_lr11_wd45_sgdGC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGC --epochs 200 --model r18 --gpug 15 > logout/r18_lr11_wd45_sgdGC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGC --epochs 200 --model r18 --gpug 16 > logout/r18_lr11_wd45_sgdGC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGC --epochs 200 --model r18 --gpug 17 > logout/r18_lr11_wd45_sgdGC_8.log ") #time.sleep(500) # #### sgdGCC #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGCC --epochs 200 --model r18 --gpug 10 > logout/r18_lr11_wd45_sgdGCC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGCC --epochs 200 --model r18 --gpug 11 > logout/r18_lr11_wd45_sgdGCC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGCC --epochs 200 --model r18 --gpug 12 > logout/r18_lr11_wd45_sgdGCC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGCC --epochs 200 --model r18 --gpug 13 > logout/r18_lr11_wd45_sgdGCC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGCC --epochs 200 --model r18 --gpug 14 > logout/r18_lr11_wd45_sgdGCC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGCC --epochs 
200 --model r18 --gpug 15 > logout/r18_lr11_wd45_sgdGCC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGCC --epochs 200 --model r18 --gpug 16 > logout/r18_lr11_wd45_sgdGCC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGCC --epochs 200 --model r18 --gpug 17 > logout/r18_lr11_wd45_sgdGCC_8.log ") #time.sleep(500) # ############### ############### # #### adam #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adam --epochs 200 --model r18 --gpug 10 > logout/r18_lr11_wd45_adam_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adam --epochs 200 --model r18 --gpug 11 > logout/r18_lr11_wd45_adam_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adam --epochs 200 --model r18 --gpug 12 > logout/r18_lr11_wd45_adam_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adam --epochs 200 --model r18 --gpug 13 > logout/r18_lr11_wd45_adam_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adam --epochs 200 --model r18 --gpug 14 > logout/r18_lr11_wd45_adam_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adam --epochs 200 --model r18 --gpug 15 > logout/r18_lr11_wd45_adam_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adam --epochs 200 --model r18 --gpug 16 > logout/r18_lr11_wd45_adam_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adam --epochs 200 --model r18 --gpug 17 > logout/r18_lr11_wd45_adam_8.log ") # #time.sleep(500) #### adamGC #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGC --epochs 200 --model r18 --gpug 10 > logout/r18_lr11_wd45_adamGC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGC --epochs 200 --model r18 --gpug 11 > logout/r18_lr11_wd45_adamGC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGC --epochs 200 --model r18 --gpug 12 > logout/r18_lr11_wd45_adamGC_3.log &") #os.system("nohup python 
main.py --lr 0.1 --wd 0.0005 --alg adamGC --epochs 200 --model r18 --gpug 13 > logout/r18_lr11_wd45_adamGC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGC --epochs 200 --model r18 --gpug 14 > logout/r18_lr11_wd45_adamGC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGC --epochs 200 --model r18 --gpug 15 > logout/r18_lr11_wd45_adamGC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGC --epochs 200 --model r18 --gpug 16 > logout/r18_lr11_wd45_adamGC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGC --epochs 200 --model r18 --gpug 17 > logout/r18_lr11_wd45_adamGC_8.log ") #time.sleep(500) # #### adamGCC #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGCC --epochs 200 --model r18 --gpug 10 > logout/r18_lr11_wd45_adamGCC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGCC --epochs 200 --model r18 --gpug 11 > logout/r18_lr11_wd45_adamGCC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGCC --epochs 200 --model r18 --gpug 12 > logout/r18_lr11_wd45_adamGCC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGCC --epochs 200 --model r18 --gpug 13 > logout/r18_lr11_wd45_adamGCC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGCC --epochs 200 --model r18 --gpug 14 > logout/r18_lr11_wd45_adamGCC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGCC --epochs 200 --model r18 --gpug 15 > logout/r18_lr11_wd45_adamGCC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGCC --epochs 200 --model r18 --gpug 16 > logout/r18_lr11_wd45_adamGCC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGCC --epochs 200 --model r18 --gpug 17 > logout/r18_lr11_wd45_adamGCC_8.log ") #time.sleep(500) # ############### ############### # #### adamW #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamW 
--epochs 200 --model r18 --gpug 10 > logout/r18_lr11_wd45_adamW_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamW --epochs 200 --model r18 --gpug 11 > logout/r18_lr11_wd45_adamW_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamW --epochs 200 --model r18 --gpug 12 > logout/r18_lr11_wd45_adamW_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamW --epochs 200 --model r18 --gpug 13 > logout/r18_lr11_wd45_adamW_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamW --epochs 200 --model r18 --gpug 14 > logout/r18_lr11_wd45_adamW_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamW --epochs 200 --model r18 --gpug 15 > logout/r18_lr11_wd45_adamW_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamW --epochs 200 --model r18 --gpug 16 > logout/r18_lr11_wd45_adamW_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamW --epochs 200 --model r18 --gpug 17 > logout/r18_lr11_wd45_adamW_8.log ") # #time.sleep(500) #### adamWGC #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGC --epochs 200 --model r18 --gpug 10 > logout/r18_lr11_wd45_adamWGC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGC --epochs 200 --model r18 --gpug 11 > logout/r18_lr11_wd45_adamWGC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGC --epochs 200 --model r18 --gpug 12 > logout/r18_lr11_wd45_adamWGC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGC --epochs 200 --model r18 --gpug 13 > logout/r18_lr11_wd45_adamWGC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGC --epochs 200 --model r18 --gpug 14 > logout/r18_lr11_wd45_adamWGC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGC --epochs 200 --model r18 --gpug 15 > logout/r18_lr11_wd45_adamWGC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 
0.0005 --alg adamWGC --epochs 200 --model r18 --gpug 16 > logout/r18_lr11_wd45_adamWGC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGC --epochs 200 --model r18 --gpug 17 > logout/r18_lr11_wd45_adamWGC_8.log ") #time.sleep(500) # #### adamWGCC #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGCC --epochs 200 --model r18 --gpug 10 > logout/r18_lr11_wd45_adamWGCC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGCC --epochs 200 --model r18 --gpug 11 > logout/r18_lr11_wd45_adamWGCC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGCC --epochs 200 --model r18 --gpug 12 > logout/r18_lr11_wd45_adamWGCC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGCC --epochs 200 --model r18 --gpug 13 > logout/r18_lr11_wd45_adamWGCC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGCC --epochs 200 --model r18 --gpug 14 > logout/r18_lr11_wd45_adamWGCC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGCC --epochs 200 --model r18 --gpug 15 > logout/r18_lr11_wd45_adamWGCC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGCC --epochs 200 --model r18 --gpug 16 > logout/r18_lr11_wd45_adamWGCC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGCC --epochs 200 --model r18 --gpug 17 > logout/r18_lr11_wd45_adamWGCC_8.log ") #time.sleep(500) # ############### ############### # #### radam #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radam --epochs 200 --model r18 --gpug 10 > logout/r18_lr11_wd45_radam_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radam --epochs 200 --model r18 --gpug 11 > logout/r18_lr11_wd45_radam_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radam --epochs 200 --model r18 --gpug 12 > logout/r18_lr11_wd45_radam_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radam --epochs 
200 --model r18 --gpug 13 > logout/r18_lr11_wd45_radam_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radam --epochs 200 --model r18 --gpug 14 > logout/r18_lr11_wd45_radam_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radam --epochs 200 --model r18 --gpug 15 > logout/r18_lr11_wd45_radam_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radam --epochs 200 --model r18 --gpug 16 > logout/r18_lr11_wd45_radam_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radam --epochs 200 --model r18 --gpug 17 > logout/r18_lr11_wd45_radam_8.log ") # #time.sleep(500) #### radamGC #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGC --epochs 200 --model r18 --gpug 10 > logout/r18_lr11_wd45_radamGC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGC --epochs 200 --model r18 --gpug 11 > logout/r18_lr11_wd45_radamGC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGC --epochs 200 --model r18 --gpug 12 > logout/r18_lr11_wd45_radamGC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGC --epochs 200 --model r18 --gpug 13 > logout/r18_lr11_wd45_radamGC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGC --epochs 200 --model r18 --gpug 14 > logout/r18_lr11_wd45_radamGC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGC --epochs 200 --model r18 --gpug 15 > logout/r18_lr11_wd45_radamGC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGC --epochs 200 --model r18 --gpug 16 > logout/r18_lr11_wd45_radamGC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGC --epochs 200 --model r18 --gpug 17 > logout/r18_lr11_wd45_radamGC_8.log ") #time.sleep(500) # #### radamGCC #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGCC --epochs 200 --model r18 --gpug 10 > logout/r18_lr11_wd45_radamGCC_1.log &") 
#os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGCC --epochs 200 --model r18 --gpug 11 > logout/r18_lr11_wd45_radamGCC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGCC --epochs 200 --model r18 --gpug 12 > logout/r18_lr11_wd45_radamGCC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGCC --epochs 200 --model r18 --gpug 13 > logout/r18_lr11_wd45_radamGCC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGCC --epochs 200 --model r18 --gpug 14 > logout/r18_lr11_wd45_radamGCC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGCC --epochs 200 --model r18 --gpug 15 > logout/r18_lr11_wd45_radamGCC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGCC --epochs 200 --model r18 --gpug 16 > logout/r18_lr11_wd45_radamGCC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGCC --epochs 200 --model r18 --gpug 17 > logout/r18_lr11_wd45_radamGCC_8.log ") #time.sleep(500) # ############### ############### # #### Lsgd #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Lsgd --epochs 200 --model r18 --gpug 10 > logout/r18_lr11_wd45_Lsgd_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Lsgd --epochs 200 --model r18 --gpug 11 > logout/r18_lr11_wd45_Lsgd_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Lsgd --epochs 200 --model r18 --gpug 12 > logout/r18_lr11_wd45_Lsgd_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Lsgd --epochs 200 --model r18 --gpug 13 > logout/r18_lr11_wd45_Lsgd_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Lsgd --epochs 200 --model r18 --gpug 14 > logout/r18_lr11_wd45_Lsgd_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Lsgd --epochs 200 --model r18 --gpug 15 > logout/r18_lr11_wd45_Lsgd_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Lsgd --epochs 200 --model 
r18 --gpug 16 > logout/r18_lr11_wd45_Lsgd_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Lsgd --epochs 200 --model r18 --gpug 17 > logout/r18_lr11_wd45_Lsgd_8.log ") #time.sleep(500) # #### LsgdGC #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGC --epochs 200 --model r18 --gpug 10 > logout/r18_lr11_wd45_LsgdGC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGC --epochs 200 --model r18 --gpug 11 > logout/r18_lr11_wd45_LsgdGC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGC --epochs 200 --model r18 --gpug 12 > logout/r18_lr11_wd45_LsgdGC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGC --epochs 200 --model r18 --gpug 13 > logout/r18_lr11_wd45_LsgdGC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGC --epochs 200 --model r18 --gpug 14 > logout/r18_lr11_wd45_LsgdGC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGC --epochs 200 --model r18 --gpug 15 > logout/r18_lr11_wd45_LsgdGC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGC --epochs 200 --model r18 --gpug 16 > logout/r18_lr11_wd45_LsgdGC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGC --epochs 200 --model r18 --gpug 17 > logout/r18_lr11_wd45_LsgdGC_8.log ") #time.sleep(500) # #### LsgdGCC #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC --epochs 200 --model r18 --gpug 10 > logout/r18_lr11_wd45_LsgdGCC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC --epochs 200 --model r18 --gpug 11 > logout/r18_lr11_wd45_LsgdGCC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC --epochs 200 --model r18 --gpug 12 > logout/r18_lr11_wd45_LsgdGCC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC --epochs 200 --model r18 --gpug 13 > logout/r18_lr11_wd45_LsgdGCC_4.log &") #os.system("nohup python main.py 
--lr 0.1 --wd 0.0005 --alg LsgdGCC --epochs 200 --model r18 --gpug 14 > logout/r18_lr11_wd45_LsgdGCC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC --epochs 200 --model r18 --gpug 15 > logout/r18_lr11_wd45_LsgdGCC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC --epochs 200 --model r18 --gpug 16 > logout/r18_lr11_wd45_LsgdGCC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC --epochs 200 --model r18 --gpug 17 > logout/r18_lr11_wd45_LsgdGCC_8.log ") #time.sleep(500) # ############### ############### # #### Ladam #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Ladam --epochs 200 --model r18 --gpug 10 > logout/r18_lr11_wd45_Ladam_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Ladam --epochs 200 --model r18 --gpug 11 > logout/r18_lr11_wd45_Ladam_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Ladam --epochs 200 --model r18 --gpug 12 > logout/r18_lr11_wd45_Ladam_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Ladam --epochs 200 --model r18 --gpug 13 > logout/r18_lr11_wd45_Ladam_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Ladam --epochs 200 --model r18 --gpug 14 > logout/r18_lr11_wd45_Ladam_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Ladam --epochs 200 --model r18 --gpug 15 > logout/r18_lr11_wd45_Ladam_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Ladam --epochs 200 --model r18 --gpug 16 > logout/r18_lr11_wd45_Ladam_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Ladam --epochs 200 --model r18 --gpug 17 > logout/r18_lr11_wd45_Ladam_8.log ") # #time.sleep(500) #### LadamGC #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGC --epochs 200 --model r18 --gpug 10 > logout/r18_lr11_wd45_LadamGC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGC --epochs 200 --model r18 
--gpug 11 > logout/r18_lr11_wd45_LadamGC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGC --epochs 200 --model r18 --gpug 12 > logout/r18_lr11_wd45_LadamGC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGC --epochs 200 --model r18 --gpug 13 > logout/r18_lr11_wd45_LadamGC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGC --epochs 200 --model r18 --gpug 14 > logout/r18_lr11_wd45_LadamGC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGC --epochs 200 --model r18 --gpug 15 > logout/r18_lr11_wd45_LadamGC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGC --epochs 200 --model r18 --gpug 16 > logout/r18_lr11_wd45_LadamGC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGC --epochs 200 --model r18 --gpug 17 > logout/r18_lr11_wd45_LadamGC_8.log ") #time.sleep(500) # #### LadamGCC #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGCC --epochs 200 --model r18 --gpug 10 > logout/r18_lr11_wd45_LadamGCC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGCC --epochs 200 --model r18 --gpug 11 > logout/r18_lr11_wd45_LadamGCC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGCC --epochs 200 --model r18 --gpug 12 > logout/r18_lr11_wd45_LadamGCC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGCC --epochs 200 --model r18 --gpug 13 > logout/r18_lr11_wd45_LadamGCC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGCC --epochs 200 --model r18 --gpug 14 > logout/r18_lr11_wd45_LadamGCC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGCC --epochs 200 --model r18 --gpug 15 > logout/r18_lr11_wd45_LadamGCC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGCC --epochs 200 --model r18 --gpug 16 > logout/r18_lr11_wd45_LadamGCC_7.log &") #os.system("nohup python 
main.py --lr 0.1 --wd 0.0005 --alg LadamGCC --epochs 200 --model r18 --gpug 17 > logout/r18_lr11_wd45_LadamGCC_8.log ") #time.sleep(500) # ############### ############### # #### ranger #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg ranger --epochs 200 --model r18 --gpug 10 > logout/r18_lr11_wd45_ranger_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg ranger --epochs 200 --model r18 --gpug 11 > logout/r18_lr11_wd45_ranger_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg ranger --epochs 200 --model r18 --gpug 12 > logout/r18_lr11_wd45_ranger_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg ranger --epochs 200 --model r18 --gpug 13 > logout/r18_lr11_wd45_ranger_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg ranger --epochs 200 --model r18 --gpug 14 > logout/r18_lr11_wd45_ranger_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg ranger --epochs 200 --model r18 --gpug 15 > logout/r18_lr11_wd45_ranger_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg ranger --epochs 200 --model r18 --gpug 16 > logout/r18_lr11_wd45_ranger_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg ranger --epochs 200 --model r18 --gpug 17 > logout/r18_lr11_wd45_ranger_8.log ") # #time.sleep(500) #### ranger #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGC --epochs 200 --model r18 --gpug 10 > logout/r18_lr11_wd45_rangerGC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGC --epochs 200 --model r18 --gpug 11 > logout/r18_lr11_wd45_rangerGC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGC --epochs 200 --model r18 --gpug 12 > logout/r18_lr11_wd45_rangerGC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGC --epochs 200 --model r18 --gpug 13 > logout/r18_lr11_wd45_rangerGC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg 
rangerGC --epochs 200 --model r18 --gpug 14 > logout/r18_lr11_wd45_rangerGC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGC --epochs 200 --model r18 --gpug 15 > logout/r18_lr11_wd45_rangerGC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGC --epochs 200 --model r18 --gpug 16 > logout/r18_lr11_wd45_rangerGC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGC --epochs 200 --model r18 --gpug 17 > logout/r18_lr11_wd45_rangerGC_8.log ") #time.sleep(500) # #### ranger #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGCC --epochs 200 --model r18 --gpug 10 > logout/r18_lr11_wd45_rangerGCC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGCC --epochs 200 --model r18 --gpug 11 > logout/r18_lr11_wd45_rangerGCC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGCC --epochs 200 --model r18 --gpug 12 > logout/r18_lr11_wd45_rangerGCC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGCC --epochs 200 --model r18 --gpug 13 > logout/r18_lr11_wd45_rangerGCC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGCC --epochs 200 --model r18 --gpug 14 > logout/r18_lr11_wd45_rangerGCC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGCC --epochs 200 --model r18 --gpug 15 > logout/r18_lr11_wd45_rangerGCC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGCC --epochs 200 --model r18 --gpug 16 > logout/r18_lr11_wd45_rangerGCC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGCC --epochs 200 --model r18 --gpug 17 > logout/r18_lr11_wd45_rangerGCC_8.log ") #time.sleep(500) # ############### # ##r50 ############### # #### sgd #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgd --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_sgd_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 
--alg sgd --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_sgd_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgd --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_sgd_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgd --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_sgd_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgd --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_sgd_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgd --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_sgd_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgd --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_sgd_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgd --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_sgd_8.log ") #time.sleep(500) # #### sgdGC #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGC --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_sgdGC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGC --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_sgdGC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGC --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_sgdGC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGC --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_sgdGC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGC --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_sgdGC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGC --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_sgdGC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGC --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_sgdGC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGC --epochs 200 --model r50 
--gpug 17 > logout/r50_lr11_wd45_sgdGC_8.log ") #time.sleep(500) # #### sgdGCC #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGCC --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_sgdGCC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGCC --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_sgdGCC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGCC --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_sgdGCC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGCC --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_sgdGCC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGCC --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_sgdGCC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGCC --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_sgdGCC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGCC --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_sgdGCC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGCC --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_sgdGCC_8.log ") #time.sleep(500) # ############### ############### # #### adam #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_adam_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_adam_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_adam_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_adam_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_adam_5.log &") #os.system("nohup python main.py 
--lr 0.1 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_adam_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_adam_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_adam_8.log ") # #time.sleep(500) #### adamGC #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_adamGC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_adamGC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_adamGC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_adamGC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_adamGC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_adamGC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_adamGC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_adamGC_8.log ") #time.sleep(500) # #### adamGCC #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_adamGCC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_adamGCC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 12 > 
logout/r50_lr11_wd45_adamGCC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_adamGCC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_adamGCC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_adamGCC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_adamGCC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_adamGCC_8.log ") #time.sleep(500) # ############### ############### # #### adamW #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamW --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_adamW_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamW --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_adamW_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamW --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_adamW_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamW --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_adamW_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamW --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_adamW_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamW --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_adamW_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamW --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_adamW_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamW --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_adamW_8.log ") # #time.sleep(500) #### adamWGC #os.system("nohup python 
main.py --lr 0.1 --wd 0.0005 --alg adamWGC --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_adamWGC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGC --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_adamWGC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGC --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_adamWGC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGC --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_adamWGC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGC --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_adamWGC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGC --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_adamWGC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGC --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_adamWGC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGC --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_adamWGC_8.log ") #time.sleep(500) ### adamWGCC os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGCC --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_adamWGCC_1.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGCC --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_adamWGCC_2.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGCC --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_adamWGCC_3.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGCC --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_adamWGCC_4.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGCC --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_adamWGCC_5.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGCC --epochs 200 --model r50 --gpug 15 > 
logout/r50_lr11_wd45_adamWGCC_6.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGCC --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_adamWGCC_7.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGCC --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_adamWGCC_8.log ") time.sleep(500) ############## ############## ### radam os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radam --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_radam_1.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radam --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_radam_2.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radam --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_radam_3.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radam --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_radam_4.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radam --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_radam_5.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radam --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_radam_6.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radam --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_radam_7.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radam --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_radam_8.log ") time.sleep(500) ### radamGC os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGC --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_radamGC_1.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGC --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_radamGC_2.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGC --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_radamGC_3.log &") os.system("nohup python main.py --lr 0.1 --wd 
0.0005 --alg radamGC --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_radamGC_4.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGC --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_radamGC_5.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGC --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_radamGC_6.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGC --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_radamGC_7.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGC --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_radamGC_8.log ") time.sleep(500) ### radamGCC os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGCC --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_radamGCC_1.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGCC --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_radamGCC_2.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGCC --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_radamGCC_3.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGCC --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_radamGCC_4.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGCC --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_radamGCC_5.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGCC --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_radamGCC_6.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGCC --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_radamGCC_7.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGCC --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_radamGCC_8.log ") time.sleep(500) ############## ############## ### Lsgd os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Lsgd --epochs 200 --model r50 
--gpug 10 > logout/r50_lr11_wd45_Lsgd_1.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Lsgd --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_Lsgd_2.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Lsgd --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_Lsgd_3.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Lsgd --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_Lsgd_4.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Lsgd --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_Lsgd_5.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Lsgd --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_Lsgd_6.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Lsgd --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_Lsgd_7.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Lsgd --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_Lsgd_8.log ") time.sleep(500) ### LsgdGC os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGC --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_LsgdGC_1.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGC --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_LsgdGC_2.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGC --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_LsgdGC_3.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGC --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_LsgdGC_4.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGC --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_LsgdGC_5.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGC --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_LsgdGC_6.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGC --epochs 200 --model r50 --gpug 16 > 
logout/r50_lr11_wd45_LsgdGC_7.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGC --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_LsgdGC_8.log ") time.sleep(500) ### LsgdGCC os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_LsgdGCC_1.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_LsgdGCC_2.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_LsgdGCC_3.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_LsgdGCC_4.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_LsgdGCC_5.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_LsgdGCC_6.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_LsgdGCC_7.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_LsgdGCC_8.log ") time.sleep(500) ############## ############## ### Ladam os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Ladam --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_Ladam_1.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Ladam --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_Ladam_2.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Ladam --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_Ladam_3.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Ladam --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_Ladam_4.log &") os.system("nohup python main.py --lr 
0.1 --wd 0.0005 --alg Ladam --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_Ladam_5.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Ladam --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_Ladam_6.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Ladam --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_Ladam_7.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Ladam --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_Ladam_8.log ") time.sleep(500) ### LadamGC os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGC --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_LadamGC_1.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGC --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_LadamGC_2.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGC --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_LadamGC_3.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGC --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_LadamGC_4.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGC --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_LadamGC_5.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGC --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_LadamGC_6.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGC --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_LadamGC_7.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGC --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_LadamGC_8.log ") time.sleep(500) ### LadamGCC os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGCC --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_LadamGCC_1.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGCC --epochs 200 --model r50 --gpug 11 > 
logout/r50_lr11_wd45_LadamGCC_2.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGCC --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_LadamGCC_3.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGCC --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_LadamGCC_4.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGCC --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_LadamGCC_5.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGCC --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_LadamGCC_6.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGCC --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_LadamGCC_7.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGCC --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_LadamGCC_8.log ") time.sleep(500) ############## ############## ### ranger os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg ranger --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_ranger_1.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg ranger --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_ranger_2.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg ranger --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_ranger_3.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg ranger --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_ranger_4.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg ranger --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_ranger_5.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg ranger --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_ranger_6.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg ranger --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_ranger_7.log &") os.system("nohup python main.py --lr 0.1 --wd 
0.0005 --alg ranger --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_ranger_8.log ") time.sleep(500) ### ranger os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGC --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_rangerGC_1.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGC --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_rangerGC_2.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGC --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_rangerGC_3.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGC --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_rangerGC_4.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGC --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_rangerGC_5.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGC --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_rangerGC_6.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGC --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_rangerGC_7.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGC --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_rangerGC_8.log ") time.sleep(500) ### ranger os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGCC --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_rangerGCC_1.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGCC --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_rangerGCC_2.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGCC --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_rangerGCC_3.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGCC --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_rangerGCC_4.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGCC --epochs 200 --model r50 --gpug 14 
> logout/r50_lr11_wd45_rangerGCC_5.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGCC --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_rangerGCC_6.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGCC --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_rangerGCC_7.log &") os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGCC --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_rangerGCC_8.log ") time.sleep(500) ############## ================================================ FILE: algorithm-GC/cifar/os_run2.py ================================================ #cifar100 e200 bs128 gs 2,4,8,16 import os,time #r50 ############## ### adam os.system("nohup python main.py --lr 0.01 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 10 > logout2/r50_lr21_wd45_adam_1.log &") os.system("nohup python main.py --lr 0.01 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 11 > logout2/r50_lr21_wd45_adam_2.log &") os.system("nohup python main.py --lr 0.01 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 12 > logout2/r50_lr21_wd45_adam_3.log &") os.system("nohup python main.py --lr 0.01 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 13 > logout2/r50_lr21_wd45_adam_4.log &") os.system("nohup python main.py --lr 0.01 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 14 > logout2/r50_lr21_wd45_adam_5.log &") os.system("nohup python main.py --lr 0.01 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 15 > logout2/r50_lr21_wd45_adam_6.log &") os.system("nohup python main.py --lr 0.01 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 16 > logout2/r50_lr21_wd45_adam_7.log &") os.system("nohup python main.py --lr 0.01 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 17 > logout2/r50_lr21_wd45_adam_8.log ") time.sleep(500) ### adamGC os.system("nohup python main.py --lr 0.01 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 10 > 
logout2/r50_lr21_wd45_adamGC_1.log &") os.system("nohup python main.py --lr 0.01 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 11 > logout2/r50_lr21_wd45_adamGC_2.log &") os.system("nohup python main.py --lr 0.01 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 12 > logout2/r50_lr21_wd45_adamGC_3.log &") os.system("nohup python main.py --lr 0.01 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 13 > logout2/r50_lr21_wd45_adamGC_4.log &") os.system("nohup python main.py --lr 0.01 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 14 > logout2/r50_lr21_wd45_adamGC_5.log &") os.system("nohup python main.py --lr 0.01 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 15 > logout2/r50_lr21_wd45_adamGC_6.log &") os.system("nohup python main.py --lr 0.01 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 16 > logout2/r50_lr21_wd45_adamGC_7.log &") os.system("nohup python main.py --lr 0.01 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 17 > logout2/r50_lr21_wd45_adamGC_8.log ") time.sleep(500) ### adamGCC os.system("nohup python main.py --lr 0.01 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 10 > logout2/r50_lr21_wd45_adamGCC_1.log &") os.system("nohup python main.py --lr 0.01 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 11 > logout2/r50_lr21_wd45_adamGCC_2.log &") os.system("nohup python main.py --lr 0.01 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 12 > logout2/r50_lr21_wd45_adamGCC_3.log &") os.system("nohup python main.py --lr 0.01 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 13 > logout2/r50_lr21_wd45_adamGCC_4.log &") os.system("nohup python main.py --lr 0.01 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 14 > logout2/r50_lr21_wd45_adamGCC_5.log &") os.system("nohup python main.py --lr 0.01 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 15 > logout2/r50_lr21_wd45_adamGCC_6.log &") os.system("nohup python main.py --lr 0.01 --wd 0.0005 --alg 
adamGCC --epochs 200 --model r50 --gpug 16 > logout2/r50_lr21_wd45_adamGCC_7.log &") os.system("nohup python main.py --lr 0.01 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 17 > logout2/r50_lr21_wd45_adamGCC_8.log ") time.sleep(500) ############## os.system("nohup python main.py --lr 0.05 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 10 > logout2/r50_lr25_wd45_adam_1.log &") os.system("nohup python main.py --lr 0.05 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 11 > logout2/r50_lr25_wd45_adam_2.log &") os.system("nohup python main.py --lr 0.05 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 12 > logout2/r50_lr25_wd45_adam_3.log &") os.system("nohup python main.py --lr 0.05 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 13 > logout2/r50_lr25_wd45_adam_4.log &") os.system("nohup python main.py --lr 0.05 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 14 > logout2/r50_lr25_wd45_adam_5.log &") os.system("nohup python main.py --lr 0.05 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 15 > logout2/r50_lr25_wd45_adam_6.log &") os.system("nohup python main.py --lr 0.05 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 16 > logout2/r50_lr25_wd45_adam_7.log &") os.system("nohup python main.py --lr 0.05 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 17 > logout2/r50_lr25_wd45_adam_8.log ") time.sleep(500) ### adamGC os.system("nohup python main.py --lr 0.05 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 10 > logout2/r50_lr25_wd45_adamGC_1.log &") os.system("nohup python main.py --lr 0.05 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 11 > logout2/r50_lr25_wd45_adamGC_2.log &") os.system("nohup python main.py --lr 0.05 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 12 > logout2/r50_lr25_wd45_adamGC_3.log &") os.system("nohup python main.py --lr 0.05 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 13 > logout2/r50_lr25_wd45_adamGC_4.log &") os.system("nohup python 
main.py --lr 0.05 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 14 > logout2/r50_lr25_wd45_adamGC_5.log &") os.system("nohup python main.py --lr 0.05 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 15 > logout2/r50_lr25_wd45_adamGC_6.log &") os.system("nohup python main.py --lr 0.05 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 16 > logout2/r50_lr25_wd45_adamGC_7.log &") os.system("nohup python main.py --lr 0.05 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 17 > logout2/r50_lr25_wd45_adamGC_8.log ") time.sleep(500) ### adamGCC os.system("nohup python main.py --lr 0.05 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 10 > logout2/r50_lr25_wd45_adamGCC_1.log &") os.system("nohup python main.py --lr 0.05 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 11 > logout2/r50_lr25_wd45_adamGCC_2.log &") os.system("nohup python main.py --lr 0.05 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 12 > logout2/r50_lr25_wd45_adamGCC_3.log &") os.system("nohup python main.py --lr 0.05 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 13 > logout2/r50_lr25_wd45_adamGCC_4.log &") os.system("nohup python main.py --lr 0.05 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 14 > logout2/r50_lr25_wd45_adamGCC_5.log &") os.system("nohup python main.py --lr 0.05 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 15 > logout2/r50_lr25_wd45_adamGCC_6.log &") os.system("nohup python main.py --lr 0.05 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 16 > logout2/r50_lr25_wd45_adamGCC_7.log &") os.system("nohup python main.py --lr 0.05 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 17 > logout2/r50_lr25_wd45_adamGCC_8.log ") time.sleep(500) ############## os.system("nohup python main.py --lr 0.15 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 10 > logout2/r50_lr115_wd45_adam_1.log &") os.system("nohup python main.py --lr 0.15 --wd 0.0005 --alg adam --epochs 200 --model r50 
--gpug 11 > logout2/r50_lr115_wd45_adam_2.log &") os.system("nohup python main.py --lr 0.15 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 12 > logout2/r50_lr115_wd45_adam_3.log &") os.system("nohup python main.py --lr 0.15 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 13 > logout2/r50_lr115_wd45_adam_4.log &") os.system("nohup python main.py --lr 0.15 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 14 > logout2/r50_lr115_wd45_adam_5.log &") os.system("nohup python main.py --lr 0.15 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 15 > logout2/r50_lr115_wd45_adam_6.log &") os.system("nohup python main.py --lr 0.15 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 16 > logout2/r50_lr115_wd45_adam_7.log &") os.system("nohup python main.py --lr 0.15 --wd 0.0005 --alg adam --epochs 200 --model r50 --gpug 17 > logout2/r50_lr115_wd45_adam_8.log ") time.sleep(500) ### adamGC os.system("nohup python main.py --lr 0.15 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 10 > logout2/r50_lr115_wd45_adamGC_1.log &") os.system("nohup python main.py --lr 0.15 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 11 > logout2/r50_lr115_wd45_adamGC_2.log &") os.system("nohup python main.py --lr 0.15 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 12 > logout2/r50_lr115_wd45_adamGC_3.log &") os.system("nohup python main.py --lr 0.15 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 13 > logout2/r50_lr115_wd45_adamGC_4.log &") os.system("nohup python main.py --lr 0.15 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 14 > logout2/r50_lr115_wd45_adamGC_5.log &") os.system("nohup python main.py --lr 0.15 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 15 > logout2/r50_lr115_wd45_adamGC_6.log &") os.system("nohup python main.py --lr 0.15 --wd 0.0005 --alg adamGC --epochs 200 --model r50 --gpug 16 > logout2/r50_lr115_wd45_adamGC_7.log &") os.system("nohup python main.py --lr 0.15 --wd 0.0005 --alg adamGC --epochs 
200 --model r50 --gpug 17 > logout2/r50_lr115_wd45_adamGC_8.log ") time.sleep(500) ### adamGCC os.system("nohup python main.py --lr 0.15 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 10 > logout2/r50_lr115_wd45_adamGCC_1.log &") os.system("nohup python main.py --lr 0.15 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 11 > logout2/r50_lr115_wd45_adamGCC_2.log &") os.system("nohup python main.py --lr 0.15 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 12 > logout2/r50_lr115_wd45_adamGCC_3.log &") os.system("nohup python main.py --lr 0.15 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 13 > logout2/r50_lr115_wd45_adamGCC_4.log &") os.system("nohup python main.py --lr 0.15 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 14 > logout2/r50_lr115_wd45_adamGCC_5.log &") os.system("nohup python main.py --lr 0.15 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 15 > logout2/r50_lr115_wd45_adamGCC_6.log &") os.system("nohup python main.py --lr 0.15 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 16 > logout2/r50_lr115_wd45_adamGCC_7.log &") os.system("nohup python main.py --lr 0.15 --wd 0.0005 --alg adamGCC --epochs 200 --model r50 --gpug 17 > logout2/r50_lr115_wd45_adamGCC_8.log ") time.sleep(500) # ############### ############### # #### adamW #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamW --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_adamW_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamW --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_adamW_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamW --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_adamW_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamW --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_adamW_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamW --epochs 200 --model r50 --gpug 14 > 
logout/r50_lr11_wd45_adamW_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamW --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_adamW_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamW --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_adamW_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamW --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_adamW_8.log ") # #time.sleep(500) #### adamWGC #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGC --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_adamWGC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGC --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_adamWGC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGC --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_adamWGC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGC --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_adamWGC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGC --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_adamWGC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGC --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_adamWGC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGC --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_adamWGC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGC --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_adamWGC_8.log ") #time.sleep(500) ### adamWGCC #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGCC --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_adamWGCC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGCC --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_adamWGCC_2.log &") #os.system("nohup python main.py --lr 
0.1 --wd 0.0005 --alg adamWGCC --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_adamWGCC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGCC --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_adamWGCC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGCC --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_adamWGCC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGCC --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_adamWGCC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGCC --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_adamWGCC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg adamWGCC --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_adamWGCC_8.log ") #time.sleep(500) # ############### ############### # #### radam #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radam --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_radam_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radam --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_radam_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radam --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_radam_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radam --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_radam_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radam --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_radam_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radam --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_radam_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radam --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_radam_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radam --epochs 200 --model r50 --gpug 17 > 
logout/r50_lr11_wd45_radam_8.log ") # #time.sleep(500) #### radamGC #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGC --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_radamGC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGC --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_radamGC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGC --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_radamGC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGC --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_radamGC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGC --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_radamGC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGC --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_radamGC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGC --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_radamGC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGC --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_radamGC_8.log ") #time.sleep(500) # #### radamGCC #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGCC --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_radamGCC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGCC --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_radamGCC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGCC --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_radamGCC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGCC --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_radamGCC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGCC --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_radamGCC_5.log &") #os.system("nohup 
python main.py --lr 0.1 --wd 0.0005 --alg radamGCC --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_radamGCC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGCC --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_radamGCC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg radamGCC --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_radamGCC_8.log ") #time.sleep(500) # ############### ############### # #### Lsgd #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Lsgd --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_Lsgd_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Lsgd --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_Lsgd_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Lsgd --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_Lsgd_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Lsgd --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_Lsgd_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Lsgd --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_Lsgd_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Lsgd --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_Lsgd_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Lsgd --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_Lsgd_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Lsgd --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_Lsgd_8.log ") #time.sleep(500) # #### LsgdGC #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGC --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_LsgdGC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGC --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_LsgdGC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGC --epochs 200 --model r50 --gpug 
12 > logout/r50_lr11_wd45_LsgdGC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGC --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_LsgdGC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGC --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_LsgdGC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGC --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_LsgdGC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGC --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_LsgdGC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGC --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_LsgdGC_8.log ") #time.sleep(500) # #### LsgdGCC #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_LsgdGCC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_LsgdGCC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_LsgdGCC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_LsgdGCC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_LsgdGCC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_LsgdGCC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_LsgdGCC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LsgdGCC --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_LsgdGCC_8.log ") #time.sleep(500) # ############### ############### # #### Ladam 
#os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Ladam --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_Ladam_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Ladam --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_Ladam_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Ladam --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_Ladam_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Ladam --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_Ladam_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Ladam --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_Ladam_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Ladam --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_Ladam_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Ladam --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_Ladam_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg Ladam --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_Ladam_8.log ") # #time.sleep(500) #### LadamGC #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGC --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_LadamGC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGC --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_LadamGC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGC --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_LadamGC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGC --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_LadamGC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGC --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_LadamGC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGC --epochs 200 --model r50 --gpug 15 > 
logout/r50_lr11_wd45_LadamGC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGC --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_LadamGC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGC --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_LadamGC_8.log ") #time.sleep(500) # #### LadamGCC #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGCC --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_LadamGCC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGCC --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_LadamGCC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGCC --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_LadamGCC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGCC --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_LadamGCC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGCC --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_LadamGCC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGCC --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_LadamGCC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGCC --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_LadamGCC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg LadamGCC --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_LadamGCC_8.log ") #time.sleep(500) # ############### ############### # #### ranger #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg ranger --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_ranger_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg ranger --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_ranger_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg ranger --epochs 200 --model r50 --gpug 12 > 
logout/r50_lr11_wd45_ranger_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg ranger --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_ranger_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg ranger --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_ranger_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg ranger --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_ranger_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg ranger --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_ranger_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg ranger --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_ranger_8.log ") # #time.sleep(500) #### rangerGC #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGC --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_rangerGC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGC --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_rangerGC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGC --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_rangerGC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGC --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_rangerGC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGC --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_rangerGC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGC --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_rangerGC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGC --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_rangerGC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGC --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_rangerGC_8.log ") #time.sleep(500) # #### rangerGCC #os.system("nohup
python main.py --lr 0.1 --wd 0.0005 --alg rangerGCC --epochs 200 --model r50 --gpug 10 > logout/r50_lr11_wd45_rangerGCC_1.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGCC --epochs 200 --model r50 --gpug 11 > logout/r50_lr11_wd45_rangerGCC_2.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGCC --epochs 200 --model r50 --gpug 12 > logout/r50_lr11_wd45_rangerGCC_3.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGCC --epochs 200 --model r50 --gpug 13 > logout/r50_lr11_wd45_rangerGCC_4.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGCC --epochs 200 --model r50 --gpug 14 > logout/r50_lr11_wd45_rangerGCC_5.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGCC --epochs 200 --model r50 --gpug 15 > logout/r50_lr11_wd45_rangerGCC_6.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGCC --epochs 200 --model r50 --gpug 16 > logout/r50_lr11_wd45_rangerGCC_7.log &") #os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg rangerGCC --epochs 200 --model r50 --gpug 17 > logout/r50_lr11_wd45_rangerGCC_8.log ") #time.sleep(500) ##############