Showing preview only (414K chars total). Download the full file or copy to clipboard to get everything.
Repository: Yonghongwei/Gradient-Centralization
Branch: master
Commit: ed2a608ccdbb
Files: 57
Total size: 394.5 KB
Directory structure:
gitextract_l162dn_3/
├── GC_code/
│ ├── CIFAR100/
│ │ ├── algorithm/
│ │ │ ├── Adagrad.py
│ │ │ ├── Adam.py
│ │ │ └── SGD.py
│ │ ├── main.py
│ │ ├── models/
│ │ │ ├── __init__.py
│ │ │ ├── densenet.py
│ │ │ ├── dpn.py
│ │ │ ├── googlenet.py
│ │ │ ├── lenet.py
│ │ │ ├── mobilenet.py
│ │ │ ├── mobilenetv2.py
│ │ │ ├── pnasnet.py
│ │ │ ├── preact_resnet.py
│ │ │ ├── resnet.py
│ │ │ ├── resnext.py
│ │ │ ├── senet.py
│ │ │ ├── shufflenet.py
│ │ │ └── vgg.py
│ │ └── os_run.py
│ ├── Fine-grained_classification/
│ │ ├── SGD.py
│ │ ├── main.py
│ │ └── os_run.py
│ ├── ImageNet/
│ │ ├── SGD.py
│ │ ├── main.py
│ │ ├── myresnet.py
│ │ ├── myresnetgn.py
│ │ └── os_run.py
│ └── Mini_ImageNet/
│ ├── SGD.py
│ ├── main.py
│ ├── os_run.py
│ └── resnet_ws.py
├── README.md
└── algorithm-GC/
├── README.md
├── algorithm/
│ ├── Adam.py
│ ├── Centralization.py
│ ├── Lookahead.py
│ ├── RAdam.py
│ ├── Ranger.py
│ └── SGD.py
└── cifar/
├── main.py
├── models/
│ ├── __init__.py
│ ├── densenet.py
│ ├── dpn.py
│ ├── googlenet.py
│ ├── lenet.py
│ ├── mobilenet.py
│ ├── mobilenetv2.py
│ ├── pnasnet.py
│ ├── preact_resnet.py
│ ├── resnet.py
│ ├── resnext.py
│ ├── senet.py
│ ├── shufflenet.py
│ └── vgg.py
├── nohup.out
├── os_run.py
└── os_run2.py
================================================
FILE CONTENTS
================================================
================================================
FILE: GC_code/CIFAR100/algorithm/Adagrad.py
================================================
import torch
from torch.optim.optimizer import Optimizer
class Adagrad_GCC(Optimizer):
    """Adagrad with Gradient Centralization on Conv layers (GCC variant).

    Standard Adagrad (`Adaptive Subgradient Methods for Online Learning
    and Stochastic Optimization`_), except that for parameters with more
    than 3 dimensions (convolution kernels) the gradient is centralized
    first: its mean over every dimension except dim 0 (the output-channel
    axis) is subtracted before the update.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-2)
        lr_decay (float, optional): learning rate decay (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        initial_accumulator_value (float, optional): starting value of the
            per-parameter squared-gradient accumulator (default: 0)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-10)

    .. _Adaptive Subgradient Methods for Online Learning and Stochastic
        Optimization: http://jmlr.org/papers/v12/duchi11a.html
    """

    def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0, initial_accumulator_value=0, eps=1e-10):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= lr_decay:
            raise ValueError("Invalid lr_decay value: {}".format(lr_decay))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        if not 0.0 <= initial_accumulator_value:
            raise ValueError("Invalid initial_accumulator_value value: {}".format(initial_accumulator_value))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        defaults = dict(lr=lr, lr_decay=lr_decay, eps=eps, weight_decay=weight_decay,
                        initial_accumulator_value=initial_accumulator_value)
        super(Adagrad_GCC, self).__init__(params, defaults)
        # Eagerly create per-parameter state so share_memory() can be called
        # before the first step.
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = 0
                state['sum'] = torch.full_like(p.data, initial_accumulator_value)

    def share_memory(self):
        """Move the accumulator state into shared memory (for multiprocessing)."""
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['sum'].share_memory_()

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]
                state['step'] += 1
                if group['weight_decay'] != 0:
                    if p.grad.data.is_sparse:
                        raise RuntimeError("weight_decay option is not compatible with sparse gradients")
                    # Out-of-place L2 penalty: rebinds `grad` to a new tensor.
                    grad = grad.add(p.data, alpha=group['weight_decay'])
                # GC operation for Conv layers: subtract the per-output-channel
                # mean (mean over all dims except dim 0) from the gradient.
                if len(list(grad.size())) > 3:
                    grad.add_(-grad.mean(dim=tuple(range(1, len(list(grad.size())))), keepdim=True))
                clr = group['lr'] / (1 + (state['step'] - 1) * group['lr_decay'])
                if grad.is_sparse:
                    grad = grad.coalesce()  # the update is non-linear so indices must be unique
                    grad_indices = grad._indices()
                    grad_values = grad._values()
                    size = grad.size()

                    def make_sparse(values):
                        # Rebuild a sparse tensor with grad's indices/layout.
                        constructor = grad.new
                        if grad_indices.dim() == 0 or values.dim() == 0:
                            return constructor().resize_as_(grad)
                        return constructor(grad_indices, values, size)
                    state['sum'].add_(make_sparse(grad_values.pow(2)))
                    std = state['sum'].sparse_mask(grad)
                    std_values = std._values().sqrt_().add_(group['eps'])
                    p.data.add_(make_sparse(grad_values / std_values), alpha=-clr)
                else:
                    state['sum'].addcmul_(grad, grad, value=1)
                    std = state['sum'].sqrt().add_(group['eps'])
                    p.data.addcdiv_(grad, std, value=-clr)
        return loss
class Adagrad_GC(Optimizer):
    """Adagrad with Gradient Centralization on Conv and FC layers (GC variant).

    Standard Adagrad (`Adaptive Subgradient Methods for Online Learning
    and Stochastic Optimization`_), except that for any parameter with
    more than 1 dimension (convolution kernels and fully-connected
    weight matrices) the gradient is centralized first: its mean over
    every dimension except dim 0 is subtracted before the update.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-2)
        lr_decay (float, optional): learning rate decay (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        initial_accumulator_value (float, optional): starting value of the
            per-parameter squared-gradient accumulator (default: 0)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-10)

    .. _Adaptive Subgradient Methods for Online Learning and Stochastic
        Optimization: http://jmlr.org/papers/v12/duchi11a.html
    """

    def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0, initial_accumulator_value=0, eps=1e-10):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= lr_decay:
            raise ValueError("Invalid lr_decay value: {}".format(lr_decay))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        if not 0.0 <= initial_accumulator_value:
            raise ValueError("Invalid initial_accumulator_value value: {}".format(initial_accumulator_value))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        defaults = dict(lr=lr, lr_decay=lr_decay, eps=eps, weight_decay=weight_decay,
                        initial_accumulator_value=initial_accumulator_value)
        super(Adagrad_GC, self).__init__(params, defaults)
        # Eagerly create per-parameter state so share_memory() can be called
        # before the first step.
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = 0
                state['sum'] = torch.full_like(p.data, initial_accumulator_value)

    def share_memory(self):
        """Move the accumulator state into shared memory (for multiprocessing)."""
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['sum'].share_memory_()

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]
                state['step'] += 1
                if group['weight_decay'] != 0:
                    if p.grad.data.is_sparse:
                        raise RuntimeError("weight_decay option is not compatible with sparse gradients")
                    # Out-of-place L2 penalty: rebinds `grad` to a new tensor.
                    grad = grad.add(p.data, alpha=group['weight_decay'])
                # GC operation for Conv and FC layers: subtract the per-row /
                # per-output-channel mean (mean over all dims except dim 0).
                if len(list(grad.size())) > 1:
                    grad.add_(-grad.mean(dim=tuple(range(1, len(list(grad.size())))), keepdim=True))
                clr = group['lr'] / (1 + (state['step'] - 1) * group['lr_decay'])
                if grad.is_sparse:
                    grad = grad.coalesce()  # the update is non-linear so indices must be unique
                    grad_indices = grad._indices()
                    grad_values = grad._values()
                    size = grad.size()

                    def make_sparse(values):
                        # Rebuild a sparse tensor with grad's indices/layout.
                        constructor = grad.new
                        if grad_indices.dim() == 0 or values.dim() == 0:
                            return constructor().resize_as_(grad)
                        return constructor(grad_indices, values, size)
                    state['sum'].add_(make_sparse(grad_values.pow(2)))
                    std = state['sum'].sparse_mask(grad)
                    std_values = std._values().sqrt_().add_(group['eps'])
                    p.data.add_(make_sparse(grad_values / std_values), alpha=-clr)
                else:
                    state['sum'].addcmul_(grad, grad, value=1)
                    std = state['sum'].sqrt().add_(group['eps'])
                    p.data.addcdiv_(grad, std, value=-clr)
        return loss
================================================
FILE: GC_code/CIFAR100/algorithm/Adam.py
================================================
import math
import torch
from torch.optim.optimizer import Optimizer
class Adam_GCC(Optimizer):
    """Adam with Gradient Centralization on Conv layers (GCC variant).

    Standard Adam, except that gradients of parameters with more than 3
    dimensions (convolution kernels) are centralized — the mean over all
    dimensions except dim 0 is subtracted — before the moment updates.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant
            (default: False)
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(Adam_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restore state; default `amsgrad` for checkpoints from older versions."""
        super(Adam_GCC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']
                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                if group['weight_decay'] != 0:
                    # In-place L2 penalty (mutates p.grad.data).
                    grad.add_(p.data, alpha=group['weight_decay'])
                # GC operation for Conv layers: subtract the per-output-channel
                # mean (mean over all dims except dim 0) from the gradient.
                if len(list(grad.size())) > 3:
                    grad.add_(-grad.mean(dim=tuple(range(1, len(list(grad.size())))), keepdim=True))
                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                step_size = group['lr'] / bias_correction1
                p.data.addcdiv_(exp_avg, denom, value=-step_size)
        return loss
class Adam_GCC2(Optimizer):
    """Adam with centralization applied to the UPDATE of Conv layers (GCC2).

    Standard Adam, except that for parameters with more than 3 dimensions
    (convolution kernels) the centralization is applied to the final
    update step `step_size * exp_avg / denom` (its mean over all dims
    except dim 0 is subtracted), rather than to the raw gradient.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant
            (default: False)
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(Adam_GCC2, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restore state; default `amsgrad` for checkpoints from older versions."""
        super(Adam_GCC2, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']
                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                if group['weight_decay'] != 0:
                    # In-place L2 penalty (mutates p.grad.data).
                    grad.add_(p.data, alpha=group['weight_decay'])
                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                step_size = group['lr'] / bias_correction1
                # GC operation for Conv layers: centralize the update itself,
                # not the gradient.
                if len(list(grad.size())) > 3:
                    delta = (step_size * exp_avg / denom).clone()
                    delta.add_(-delta.mean(dim=tuple(range(1, len(list(grad.size())))), keepdim=True))
                    p.data.add_(-delta)
                else:
                    p.data.addcdiv_(exp_avg, denom, value=-step_size)
        return loss
class Adam_GC(Optimizer):
    r"""Adam with Gradient Centralization on Conv and FC layers (GC variant).

    Standard Adam (`Adam: A Method for Stochastic Optimization`_), except
    that gradients of any parameter with more than 1 dimension
    (convolution kernels and fully-connected weight matrices) are
    centralized — the mean over all dimensions except dim 0 is
    subtracted — before the moment updates.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(Adam_GC, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restore state; default `amsgrad` for checkpoints from older versions."""
        super(Adam_GC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']
                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                if group['weight_decay'] != 0:
                    # In-place L2 penalty (mutates p.grad.data).
                    grad.add_(p.data, alpha=group['weight_decay'])
                # GC operation for Conv layers and FC layers: subtract the
                # per-row / per-output-channel mean from the gradient.
                if len(list(grad.size())) > 1:
                    grad.add_(-grad.mean(dim=tuple(range(1, len(list(grad.size())))), keepdim=True))
                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                step_size = group['lr'] / bias_correction1
                p.data.addcdiv_(exp_avg, denom, value=-step_size)
        return loss
class Adam_GC2(Optimizer):
    """Adam with centralization applied to the UPDATE of Conv/FC layers (GC2).

    Standard Adam, except that for any parameter with more than 1
    dimension the centralization is applied to the final update step
    `step_size * exp_avg / denom` (its mean over all dims except dim 0
    is subtracted), rather than to the raw gradient.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant
            (default: False)
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(Adam_GC2, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restore state; default `amsgrad` for checkpoints from older versions."""
        super(Adam_GC2, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']
                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                if group['weight_decay'] != 0:
                    # In-place L2 penalty (mutates p.grad.data).
                    grad.add_(p.data, alpha=group['weight_decay'])
                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                step_size = group['lr'] / bias_correction1
                # GC operation for Conv layers and FC layers: centralize the
                # update itself, not the gradient.
                if len(list(grad.size())) > 1:
                    delta = (step_size * exp_avg / denom).clone()
                    delta.add_(-delta.mean(dim=tuple(range(1, len(list(grad.size())))), keepdim=True))
                    p.data.add_(-delta)
                else:
                    p.data.addcdiv_(exp_avg, denom, value=-step_size)
        return loss
class AdamW(Optimizer):
    r"""Implements AdamW: Adam with decoupled weight decay.

    Based on `Adam: A Method for Stochastic Optimization`_, but the
    weight decay is NOT added to the gradient; it is applied directly to
    the parameter in the update step (`Decoupled Weight Decay
    Regularization`-style), scaled by the same step size as the Adam
    update.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): decoupled weight decay (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(AdamW, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restore state; default `amsgrad` for checkpoints from older versions."""
        super(AdamW, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']
                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                # NOTE: no L2 term in the gradient — weight decay is decoupled
                # and applied in the parameter update below.
                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                # denom is not bias-corrected, so the correction is folded
                # into the step size.
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
                # p <- p - step_size * (weight_decay * p + exp_avg / denom)
                p.data.add_(torch.mul(p.data, group['weight_decay']).addcdiv_(exp_avg, denom), alpha=-step_size)
        return loss
class AdamW_GCC(Optimizer):
    r"""AdamW with Gradient Centralization on Conv layers (GCC variant).

    AdamW (decoupled weight decay), except that gradients of parameters
    with more than 3 dimensions (convolution kernels) are centralized —
    the mean over all dimensions except dim 0 is subtracted — before the
    moment updates.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): decoupled weight decay (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(AdamW_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restore state; default `amsgrad` for checkpoints from older versions."""
        super(AdamW_GCC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']
                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']
                # GC operation for Conv layers: subtract the per-output-channel
                # mean (mean over all dims except dim 0) from the gradient.
                if len(list(grad.size())) > 3:
                    grad.add_(-grad.mean(dim=tuple(range(1, len(list(grad.size())))), keepdim=True))
                state['step'] += 1
                # Weight decay is decoupled — applied in the parameter update.
                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                # denom is not bias-corrected, so the correction is folded
                # into the step size.
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
                # p <- p - step_size * (weight_decay * p + exp_avg / denom)
                p.data.add_(torch.mul(p.data, group['weight_decay']).addcdiv_(exp_avg, denom), alpha=-step_size)
        return loss
class AdamW_GC(Optimizer):
    r"""AdamW with Gradient Centralization on Conv and FC layers (GC variant).

    AdamW (decoupled weight decay), except that gradients of any
    parameter with more than 1 dimension (convolution kernels and
    fully-connected weight matrices) are centralized — the mean over all
    dimensions except dim 0 is subtracted — before the moment updates.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): decoupled weight decay (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(AdamW_GC, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restore state; default `amsgrad` for checkpoints from older versions."""
        super(AdamW_GC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']
                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']
                # GC operation for Conv and FC layers: subtract the per-row /
                # per-output-channel mean from the gradient.
                if len(list(grad.size())) > 1:
                    grad.add_(-grad.mean(dim=tuple(range(1, len(list(grad.size())))), keepdim=True))
                state['step'] += 1
                # Weight decay is decoupled — applied in the parameter update.
                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                # denom is not bias-corrected, so the correction is folded
                # into the step size.
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
                # p <- p - step_size * (weight_decay * p + exp_avg / denom)
                p.data.add_(torch.mul(p.data, group['weight_decay']).addcdiv_(exp_avg, denom), alpha=-step_size)
        return loss
class AdamW_GCC2(Optimizer):
    """AdamW (Adam with decoupled weight decay) with Gradient Centralization
    applied to the final update of convolutional layers.

    For parameters with more than 3 dimensions (Conv kernels), the computed
    update (decoupled weight-decay term plus Adam step) has its mean over all
    dimensions except dim 0 subtracted before it is applied to the weights.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): decoupled weight decay coefficient
            (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(AdamW_GCC2, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(AdamW_GCC2, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']
                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                # Decay the first and second moment running average coefficient.
                # Keyword alpha/value forms are used because the positional
                # add_(Number, Tensor) / addcmul_(Number, ...) overloads are
                # deprecated and removed in recent PyTorch releases.
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
                # Decoupled (AdamW-style) weight decay folded into the update.
                update = torch.mul(p.data, group['weight_decay']).addcdiv_(exp_avg, denom)
                if len(grad.size()) > 3:
                    # GC operation for Conv layers: center the update over all
                    # dimensions except the output-channel dimension (dim 0).
                    delta = (step_size * update).clone()
                    delta.add_(-delta.mean(dim=tuple(range(1, len(grad.size()))), keepdim=True))
                    p.data.add_(-delta)
                else:
                    p.data.add_(update, alpha=-step_size)
        return loss
class AdamW_GC2(Optimizer):
    """AdamW (Adam with decoupled weight decay) with Gradient Centralization
    applied to the final update of both convolutional and fully-connected
    layers (any parameter with more than 1 dimension).

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): decoupled weight decay coefficient
            (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(AdamW_GC2, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(AdamW_GC2, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']
                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                # Decay the first and second moment running average coefficient.
                # Keyword alpha/value forms replace the positional overloads
                # that are deprecated/removed in recent PyTorch releases.
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
                # Decoupled (AdamW-style) weight decay folded into the update.
                update = torch.mul(p.data, group['weight_decay']).addcdiv_(exp_avg, denom)
                if len(grad.size()) > 1:
                    # GC operation for Conv and FC layers: center the update
                    # over all dimensions except dim 0.
                    delta = (step_size * update).clone()
                    delta.add_(-delta.mean(dim=tuple(range(1, len(grad.size()))), keepdim=True))
                    p.data.add_(-delta)
                else:
                    p.data.add_(update, alpha=-step_size)
        return loss
================================================
FILE: GC_code/CIFAR100/algorithm/SGD.py
================================================
import torch
from torch.optim.optimizer import Optimizer, required
class SGD_GCC(Optimizer):
    """SGD with momentum where Gradient Centralization is applied to the
    gradients of convolutional layers (parameters with more than 3 dims).

    Arguments:
        params (iterable): iterable of parameters or parameter-group dicts.
        lr (float): learning rate (required).
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): L2 penalty (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGD_GCC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                if weight_decay != 0:
                    # Coupled L2 regularization added to the gradient. The
                    # keyword alpha form replaces the positional
                    # add_(Number, Tensor) overload removed in recent PyTorch.
                    d_p.add_(p.data, alpha=weight_decay)
                # GC operation for Conv layers: subtract the per-output-channel
                # mean of the gradient.
                if len(d_p.size()) > 3:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, len(d_p.size()))), keepdim=True))
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf
                p.data.add_(d_p, alpha=-group['lr'])
        return loss
class SGD_GC(Optimizer):
    """SGD with momentum where Gradient Centralization is applied to the
    gradients of both convolutional and fully-connected layers (any
    parameter with more than 1 dimension).

    Arguments:
        params (iterable): iterable of parameters or parameter-group dicts.
        lr (float): learning rate (required).
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): L2 penalty (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGD_GC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                if weight_decay != 0:
                    # Coupled L2 regularization; keyword alpha form replaces
                    # the deprecated positional overload.
                    d_p.add_(p.data, alpha=weight_decay)
                # GC operation for Conv and FC layers: subtract the mean over
                # all dimensions except dim 0.
                if len(d_p.size()) > 1:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, len(d_p.size()))), keepdim=True))
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf
                p.data.add_(d_p, alpha=-group['lr'])
        return loss
class SGDW(Optimizer):
    """SGD with momentum and decoupled weight decay (SGDW).

    Weight decay is NOT added to the gradient. Instead, after the gradient
    step the weights are shrunk directly using the parameter values
    snapshotted before the step (Loshchilov & Hutter, "Decoupled Weight
    Decay Regularization").

    Arguments:
        params (iterable): iterable of parameters or parameter-group dicts.
        lr (float): learning rate (required).
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): decoupled weight decay (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGDW, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                # Snapshot of the weights before the update; the decoupled
                # weight decay below is computed from this value.
                old = torch.clone(p.data).detach()
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        # Keyword alpha form replaces the deprecated
                        # positional add_(Number, Tensor) overload.
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf
                p.data.add_(d_p, alpha=-group['lr'])
                if weight_decay != 0:
                    # Decoupled weight decay, applied after the gradient step.
                    p.data.add_(old, alpha=-weight_decay * group['lr'])
        return loss
class SGDW_GCC(Optimizer):
    """SGDW (decoupled weight decay) with Gradient Centralization applied to
    the gradients of convolutional layers (parameters with more than 3 dims).

    Arguments:
        params (iterable): iterable of parameters or parameter-group dicts.
        lr (float): learning rate (required).
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): decoupled weight decay (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGDW_GCC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                # Snapshot for the decoupled weight decay applied after the step.
                old = torch.clone(p.data).detach()
                # GC operation for Conv layers: subtract the per-output-channel
                # mean of the gradient.
                if len(d_p.size()) > 3:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, len(d_p.size()))), keepdim=True))
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        # Keyword alpha form replaces the deprecated
                        # positional add_(Number, Tensor) overload.
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf
                p.data.add_(d_p, alpha=-group['lr'])
                if weight_decay != 0:
                    # Decoupled weight decay, applied after the gradient step.
                    p.data.add_(old, alpha=-weight_decay * group['lr'])
        return loss
class SGDW_GC(Optimizer):
    """SGDW (decoupled weight decay) with Gradient Centralization applied to
    the gradients of both convolutional and fully-connected layers (any
    parameter with more than 1 dimension).

    Arguments:
        params (iterable): iterable of parameters or parameter-group dicts.
        lr (float): learning rate (required).
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): decoupled weight decay (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW_GC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGDW_GC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                # Snapshot for the decoupled weight decay applied after the step.
                old = torch.clone(p.data).detach()
                # GC operation for Conv and FC layers: subtract the mean over
                # all dimensions except dim 0.
                if len(d_p.size()) > 1:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, len(d_p.size()))), keepdim=True))
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        # Keyword alpha form replaces the deprecated
                        # positional add_(Number, Tensor) overload.
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf
                p.data.add_(d_p, alpha=-group['lr'])
                if weight_decay != 0:
                    # Decoupled weight decay, applied after the gradient step.
                    p.data.add_(old, alpha=-weight_decay * group['lr'])
        return loss
================================================
FILE: GC_code/CIFAR100/main.py
================================================
'''Train CIFAR100 with PyTorch.'''
from __future__ import print_function
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.optim import lr_scheduler
import os
import argparse
from torchvision import datasets, models
from models import *
#from utils import progress_bar
import numpy as np
#import optimizers with GC
from algorithm.SGD import *
from algorithm.Adam import *
from algorithm.Adagrad import *
# ---- Command-line configuration --------------------------------------------
parser = argparse.ArgumentParser(description='PyTorch CIFAR100 Training')
parser.add_argument('--lr', default=0.1, type=float, help='learning rate')
parser.add_argument('--resume', '-r', action='store_true', help='resume from checkpoint')
parser.add_argument('--bs', default=128, type=int, help='batchsize')
parser.add_argument('--wd', default=0.0005, type=float, help='weight decay')
parser.add_argument('--alg', default='sgd', type=str, help='algorithm')
parser.add_argument('--epochs', default=200, type=int, help='epochs')
parser.add_argument('--path', default='logout/result', type=str, help='path')
parser.add_argument('--model', default='r50', type=str, help='model')
args = parser.parse_args()
# Restrict training to the first visible GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
epochs = args.epochs
device = 'cuda' if torch.cuda.is_available() else 'cpu'
best_acc = 0  # best test accuracy seen so far (updated by test())
start_epoch = 0  # start from epoch 0 or last checkpoint epoch

# ---- Data: CIFAR100 with standard augmentation ------------------------------
print('==> Preparing data..')
# Random crop + horizontal flip for training; Normalize uses CIFAR100
# per-channel mean/std constants.
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.507, 0.487, 0.441), (0.267, 0.256, 0.276)),
])
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.507, 0.487, 0.441), (0.267, 0.256, 0.276)),
])
# NOTE(review): hard-coded local dataset path — adjust for your machine.
trainset = torchvision.datasets.CIFAR100(root='/home/yonghw/data/cifar100/', train=True, download=True, transform=transform_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.bs, shuffle=True, num_workers=4, drop_last=True)
testset = torchvision.datasets.CIFAR100(root='/home/yonghw/data/cifar100/', train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=128, shuffle=False, num_workers=4)

# ---- Model selection ---------------------------------------------------------
print('==> Building model..')
Num_classes = 100
# NOTE(review): these are independent `if`s, not elif — an unrecognized
# --model value leaves `net` undefined and raises NameError below.
if args.model == 'r18':
    net = ResNet18(Num_classes=Num_classes)
if args.model == 'r34':
    net = ResNet34(Num_classes=Num_classes)
if args.model == 'r50':
    net = ResNet50(Num_classes=Num_classes)
if args.model == 'r101':
    net = ResNet101(Num_classes=Num_classes)
if args.model == 'v11':
    net = VGG('VGG11', Num_classes=Num_classes)
if args.model == 'rx29':
    net = ResNeXt29_4x64d(Num_classes=Num_classes)
if args.model == 'd121':
    net = DenseNet121(Num_classes=Num_classes)
if device == 'cuda':
    net = net.cuda()
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True
if args.resume:
    # Load checkpoint.
    print('==> Resuming from checkpoint..')
    assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!'
    checkpoint = torch.load('./checkpoint/ckpt.t7')
    net.load_state_dict(checkpoint['net'])
    best_acc = checkpoint['acc']
    start_epoch = checkpoint['epoch']
criterion = nn.CrossEntropyLoss()

# ---- Optimizer selection -----------------------------------------------------
# The base --lr is rescaled per optimizer family: x1 for SGD variants,
# x0.01 for Adam variants, x0.1 for Adagrad variants.
WD = args.wd
print('==> choose optimizer..')
if args.alg == 'sgd':
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=WD)
if args.alg == 'sgdGC':
    optimizer = SGD_GC(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=WD)
if args.alg == 'sgdGCC':
    optimizer = SGD_GCC(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=WD)
if args.alg == 'adam':
    optimizer = optim.Adam(net.parameters(), lr=args.lr * 0.01, weight_decay=WD)
if args.alg == 'adamGC':
    optimizer = Adam_GC(net.parameters(), lr=args.lr * 0.01, weight_decay=WD)
if args.alg == 'adamGCC':
    optimizer = Adam_GCC(net.parameters(), lr=args.lr * 0.01, weight_decay=WD)
if args.alg == 'adamGC2':
    optimizer = Adam_GC2(net.parameters(), lr=args.lr * 0.01, weight_decay=WD)
if args.alg == 'adamGCC2':
    optimizer = Adam_GCC2(net.parameters(), lr=args.lr * 0.01, weight_decay=WD)
if args.alg == 'adagrad':
    optimizer = optim.Adagrad(net.parameters(), lr=args.lr * 0.1, weight_decay=WD)
if args.alg == 'adagradGC':
    optimizer = Adagrad_GC(net.parameters(), lr=args.lr * 0.1, weight_decay=WD)
if args.alg == 'adagradGCC':
    optimizer = Adagrad_GCC(net.parameters(), lr=args.lr * 0.1, weight_decay=WD)
if args.alg == 'adagradGC2':
    optimizer = Adagrad_GC2(net.parameters(), lr=args.lr * 0.1, weight_decay=WD)
if args.alg == 'adagradGCC2':
    optimizer = Adagrad_GCC2(net.parameters(), lr=args.lr * 0.1, weight_decay=WD)
if args.alg == 'sgdW':
    optimizer = SGDW(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=WD)
if args.alg == 'sgdWGC':
    optimizer = SGDW_GC(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=WD)
if args.alg == 'sgdWGCC':
    optimizer = SGDW_GCC(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=WD)
if args.alg == 'adamW':
    # NOTE(review): AdamW is expected to come from the `algorithm` star
    # imports above — verify it is actually defined there.
    optimizer = AdamW(net.parameters(), lr=args.lr * 0.01, weight_decay=WD)
# NOTE(review): the adamW* GC options below reuse the Adam_GC*/Adam_GCC*
# classes (not the AdamW_* ones) — confirm this aliasing is intentional.
if args.alg == 'adamWGC':
    optimizer = Adam_GC(net.parameters(), lr=args.lr * 0.01, weight_decay=WD)
if args.alg == 'adamWGCC':
    optimizer = Adam_GCC(net.parameters(), lr=args.lr * 0.01, weight_decay=WD)
if args.alg == 'adamWGC2':
    optimizer = Adam_GC2(net.parameters(), lr=args.lr * 0.01, weight_decay=WD)
if args.alg == 'adamWGCC2':
    optimizer = Adam_GCC2(net.parameters(), lr=args.lr * 0.01, weight_decay=WD)
# Step decay: multiply the learning rate by 0.1 every 60 epochs.
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=60, gamma=0.1)
# Training
def train(epoch, net, optimizer):
    """Train `net` for one epoch over `trainloader`; return accuracy in %.

    Uses the module-level `trainloader`, `device` and `criterion` globals.
    """
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        # Accumulate running loss and top-1 accuracy statistics.
        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
    # NOTE(review): original indentation was lost in extraction; this summary
    # print is placed after the loop (once per epoch) — confirm upstream.
    print('Training: Loss: {:.4f} | Acc: {:.4f}'.format(train_loss / (batch_idx + 1), correct / total))
    # progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
    #              % (train_loss/(batch_idx+1), 100.*correct/total, correct, total))
    acc = 100. * correct / total
    return acc
# Testing
def test(epoch, net):
    """Evaluate `net` on `testloader`; checkpoint and return accuracy in %.

    Saves ./checkpoint/ckpt.t7 and updates the global `best_acc` whenever
    the accuracy improves on the best value seen so far.
    """
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            # progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
            #              % (test_loss/(batch_idx+1), 100.*correct/total, correct, total))
    # NOTE(review): original indentation was lost in extraction; this summary
    # print is placed after the loop (once per epoch) — confirm upstream.
    print('Testing:Loss: {:.4f} | Acc: {:.4f}'.format(test_loss / (batch_idx + 1), correct / total))
    # Save checkpoint when the accuracy improves on the best seen so far.
    acc = 100. * correct / total
    if acc > best_acc:
        print('Saving..')
        state = {
            'net': net.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        if not os.path.isdir('checkpoint'):
            os.mkdir('checkpoint')
        torch.save(state, './checkpoint/ckpt.t7')
        best_acc = acc
    return acc
# Main loop: train one epoch, step the LR schedule, then evaluate.
for epoch in range(start_epoch, start_epoch + epochs):
    train_acc = train(epoch, net, optimizer)
    exp_lr_scheduler.step()
    val_acc = test(epoch, net)
================================================
FILE: GC_code/CIFAR100/models/__init__.py
================================================
from .vgg import *
from .dpn import *
from .lenet import *
from .senet import *
from .pnasnet import *
from .densenet import *
from .googlenet import *
from .shufflenet import *
from .resnet import *
from .resnext import *
from .preact_resnet import *
from .mobilenet import *
from .mobilenetv2 import *
================================================
FILE: GC_code/CIFAR100/models/densenet.py
================================================
'''DenseNet in PyTorch.'''
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
class Bottleneck(nn.Module):
    """DenseNet bottleneck: BN-ReLU-Conv(1x1) then BN-ReLU-Conv(3x3), whose
    output is concatenated in front of the input along the channel axis."""

    def __init__(self, in_planes, growth_rate):
        super(Bottleneck, self).__init__()
        inter_planes = 4 * growth_rate  # width of the 1x1 "bottleneck"
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, inter_planes, kernel_size=1, bias=False)
        self.bn2 = nn.BatchNorm2d(inter_planes)
        self.conv2 = nn.Conv2d(inter_planes, growth_rate, kernel_size=3, padding=1, bias=False)

    def forward(self, x):
        y = self.conv1(F.relu(self.bn1(x)))
        y = self.conv2(F.relu(self.bn2(y)))
        # Dense connectivity: new features first, then the untouched input.
        return torch.cat([y, x], 1)
class Transition(nn.Module):
    """Between dense stages: compress channels with a 1x1 conv (after
    BN-ReLU) and halve the spatial resolution with 2x2 average pooling."""

    def __init__(self, in_planes, out_planes):
        super(Transition, self).__init__()
        self.bn = nn.BatchNorm2d(in_planes)
        self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=False)

    def forward(self, x):
        h = F.relu(self.bn(x))
        h = self.conv(h)
        return F.avg_pool2d(h, 2)
class DenseNet(nn.Module):
    """Densely connected CNN for 32x32 inputs.

    Four dense stages built from `block`, separated by Transition layers
    that compress channels by `reduction` and halve the resolution.
    """

    def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10):
        super(DenseNet, self).__init__()
        self.growth_rate = growth_rate

        channels = 2 * growth_rate
        self.conv1 = nn.Conv2d(3, channels, kernel_size=3, padding=1, bias=False)

        self.dense1 = self._make_dense_layers(block, channels, nblocks[0])
        channels += nblocks[0] * growth_rate
        squeezed = int(math.floor(channels * reduction))
        self.trans1 = Transition(channels, squeezed)
        channels = squeezed

        self.dense2 = self._make_dense_layers(block, channels, nblocks[1])
        channels += nblocks[1] * growth_rate
        squeezed = int(math.floor(channels * reduction))
        self.trans2 = Transition(channels, squeezed)
        channels = squeezed

        self.dense3 = self._make_dense_layers(block, channels, nblocks[2])
        channels += nblocks[2] * growth_rate
        squeezed = int(math.floor(channels * reduction))
        self.trans3 = Transition(channels, squeezed)
        channels = squeezed

        self.dense4 = self._make_dense_layers(block, channels, nblocks[3])
        channels += nblocks[3] * growth_rate

        self.bn = nn.BatchNorm2d(channels)
        self.linear = nn.Linear(channels, num_classes)

    def _make_dense_layers(self, block, in_planes, nblock):
        # Each block consumes every feature map produced so far and emits
        # growth_rate new channels.
        layers = []
        for _ in range(nblock):
            layers.append(block(in_planes, self.growth_rate))
            in_planes += self.growth_rate
        return nn.Sequential(*layers)

    def forward(self, x):
        h = self.conv1(x)
        h = self.trans1(self.dense1(h))
        h = self.trans2(self.dense2(h))
        h = self.trans3(self.dense3(h))
        h = self.dense4(h)
        h = F.avg_pool2d(F.relu(self.bn(h)), 4)
        return self.linear(h.view(h.size(0), -1))
def _densenet(nblocks, growth_rate, Num_classes):
    # Shared constructor for all standard DenseNet configurations below.
    return DenseNet(Bottleneck, nblocks, growth_rate=growth_rate, num_classes=Num_classes)

def DenseNet121(Num_classes=10):
    """DenseNet-121 (growth rate 32)."""
    return _densenet([6, 12, 24, 16], 32, Num_classes)

def DenseNet169(Num_classes=10):
    """DenseNet-169 (growth rate 32)."""
    return _densenet([6, 12, 32, 32], 32, Num_classes)

def DenseNet201(Num_classes=10):
    """DenseNet-201 (growth rate 32)."""
    return _densenet([6, 12, 48, 32], 32, Num_classes)

def DenseNet161(Num_classes=10):
    """DenseNet-161 (growth rate 48)."""
    return _densenet([6, 12, 36, 24], 48, Num_classes)

def densenet_cifar(Num_classes=10):
    """Small DenseNet (growth rate 12) sized for CIFAR experiments."""
    return _densenet([6, 12, 24, 16], 12, Num_classes)
def test():
    """Smoke test: forward one random CIFAR-sized image through densenet_cifar."""
    model = densenet_cifar()
    sample = torch.randn(1, 3, 32, 32)
    print(model(sample))
# test()
================================================
FILE: GC_code/CIFAR100/models/dpn.py
================================================
'''Dual Path Networks in PyTorch.'''
import torch
import torch.nn as nn
import torch.nn.functional as F
class Bottleneck(nn.Module):
    """Dual-path bottleneck: grouped 3x3 conv between two 1x1 convs, with a
    residual sum over the first `out_planes` channels and a densely
    concatenated remainder of `dense_depth` channels."""

    def __init__(self, last_planes, in_planes, out_planes, dense_depth, stride, first_layer):
        super(Bottleneck, self).__init__()
        self.out_planes = out_planes
        self.dense_depth = dense_depth
        self.conv1 = nn.Conv2d(last_planes, in_planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv2 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=32, bias=False)
        self.bn2 = nn.BatchNorm2d(in_planes)
        self.conv3 = nn.Conv2d(in_planes, out_planes + dense_depth, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes + dense_depth)
        # The first block of each stage projects the input so both paths
        # line up in channel count and spatial size.
        if first_layer:
            self.shortcut = nn.Sequential(
                nn.Conv2d(last_planes, out_planes + dense_depth, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes + dense_depth),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        h = F.relu(self.bn1(self.conv1(x)))
        h = F.relu(self.bn2(self.conv2(h)))
        h = self.bn3(self.conv3(h))
        skip = self.shortcut(x)
        d = self.out_planes
        # Residual path on the first d channels, dense path on the rest.
        merged = torch.cat([skip[:, :d, :, :] + h[:, :d, :, :], skip[:, d:, :, :], h[:, d:, :, :]], 1)
        return F.relu(merged)
class DPN(nn.Module):
    """Dual Path Network for 32x32 inputs (10 classes); `cfg` gives the
    per-stage widths, block counts and dense growth depths."""

    def __init__(self, cfg):
        super(DPN, self).__init__()
        in_planes, out_planes = cfg['in_planes'], cfg['out_planes']
        num_blocks, dense_depth = cfg['num_blocks'], cfg['dense_depth']
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.last_planes = 64
        # Stage 1 keeps the resolution; stages 2-4 each downsample by 2.
        self.layer1 = self._make_layer(in_planes[0], out_planes[0], num_blocks[0], dense_depth[0], stride=1)
        self.layer2 = self._make_layer(in_planes[1], out_planes[1], num_blocks[1], dense_depth[1], stride=2)
        self.layer3 = self._make_layer(in_planes[2], out_planes[2], num_blocks[2], dense_depth[2], stride=2)
        self.layer4 = self._make_layer(in_planes[3], out_planes[3], num_blocks[3], dense_depth[3], stride=2)
        self.linear = nn.Linear(out_planes[3] + (num_blocks[3] + 1) * dense_depth[3], 10)

    def _make_layer(self, in_planes, out_planes, num_blocks, dense_depth, stride):
        # Only the first block of a stage downsamples / projects the shortcut.
        blocks = []
        for idx, s in enumerate([stride] + [1] * (num_blocks - 1)):
            blocks.append(Bottleneck(self.last_planes, in_planes, out_planes, dense_depth, s, idx == 0))
            # The dense path keeps growing with each block.
            self.last_planes = out_planes + (idx + 2) * dense_depth
        return nn.Sequential(*blocks)

    def forward(self, x):
        h = F.relu(self.bn1(self.conv1(x)))
        for layer in (self.layer1, self.layer2, self.layer3, self.layer4):
            h = layer(h)
        h = F.avg_pool2d(h, 4)
        return self.linear(h.view(h.size(0), -1))
def DPN26():
    """DPN-26: two bottleneck blocks per stage."""
    return DPN({
        'in_planes': (96, 192, 384, 768),
        'out_planes': (256, 512, 1024, 2048),
        'num_blocks': (2, 2, 2, 2),
        'dense_depth': (16, 32, 24, 128),
    })
def DPN92():
    """DPN-92: 3/4/20/3 bottleneck blocks per stage."""
    return DPN({
        'in_planes': (96, 192, 384, 768),
        'out_planes': (256, 512, 1024, 2048),
        'num_blocks': (3, 4, 20, 3),
        'dense_depth': (16, 32, 24, 128),
    })
def test():
    """Smoke test: forward one random 32x32 image through DPN-92."""
    model = DPN92()
    sample = torch.randn(1, 3, 32, 32)
    print(model(sample))
# test()
================================================
FILE: GC_code/CIFAR100/models/googlenet.py
================================================
'''GoogLeNet with PyTorch.'''
import torch
import torch.nn as nn
import torch.nn.functional as F
class Inception(nn.Module):
    """Inception v1 module: four parallel branches (1x1; 1x1->3x3;
    1x1->3x3->3x3 standing in for 5x5; 3x3-pool->1x1) concatenated on the
    channel axis. Every conv is followed by BatchNorm and ReLU."""

    def __init__(self, in_planes, n1x1, n3x3red, n3x3, n5x5red, n5x5, pool_planes):
        super(Inception, self).__init__()

        def conv_bn_relu(c_in, c_out, **kw):
            # One conv unit: Conv2d -> BatchNorm2d -> ReLU(inplace).
            return [nn.Conv2d(c_in, c_out, **kw), nn.BatchNorm2d(c_out), nn.ReLU(True)]

        # 1x1 conv branch
        self.b1 = nn.Sequential(*conv_bn_relu(in_planes, n1x1, kernel_size=1))
        # 1x1 reduce -> 3x3 conv branch
        self.b2 = nn.Sequential(*(conv_bn_relu(in_planes, n3x3red, kernel_size=1)
                                  + conv_bn_relu(n3x3red, n3x3, kernel_size=3, padding=1)))
        # 1x1 reduce -> two stacked 3x3 convs ("5x5" branch)
        self.b3 = nn.Sequential(*(conv_bn_relu(in_planes, n5x5red, kernel_size=1)
                                  + conv_bn_relu(n5x5red, n5x5, kernel_size=3, padding=1)
                                  + conv_bn_relu(n5x5, n5x5, kernel_size=3, padding=1)))
        # 3x3 max-pool -> 1x1 projection branch
        self.b4 = nn.Sequential(nn.MaxPool2d(3, stride=1, padding=1),
                                *conv_bn_relu(in_planes, pool_planes, kernel_size=1))

    def forward(self, x):
        branches = [self.b1(x), self.b2(x), self.b3(x), self.b4(x)]
        return torch.cat(branches, 1)
class GoogLeNet(nn.Module):
    """GoogLeNet (Inception v1) adapted to 32x32 inputs, 10 output classes."""

    def __init__(self):
        super(GoogLeNet, self).__init__()
        # Stem: one 3x3 conv replaces the original 7x7 stride-2 stem.
        self.pre_layers = nn.Sequential(
            nn.Conv2d(3, 192, kernel_size=3, padding=1),
            nn.BatchNorm2d(192),
            nn.ReLU(True),
        )
        # Inception(in, 1x1, 3x3red, 3x3, 5x5red, 5x5, pool-proj)
        self.a3 = Inception(192, 64, 96, 128, 16, 32, 32)
        self.b3 = Inception(256, 128, 128, 192, 32, 96, 64)
        self.maxpool = nn.MaxPool2d(3, stride=2, padding=1)
        self.a4 = Inception(480, 192, 96, 208, 16, 48, 64)
        self.b4 = Inception(512, 160, 112, 224, 24, 64, 64)
        self.c4 = Inception(512, 128, 128, 256, 24, 64, 64)
        self.d4 = Inception(512, 112, 144, 288, 32, 64, 64)
        self.e4 = Inception(528, 256, 160, 320, 32, 128, 128)
        self.a5 = Inception(832, 256, 160, 320, 32, 128, 128)
        self.b5 = Inception(832, 384, 192, 384, 48, 128, 128)
        self.avgpool = nn.AvgPool2d(8, stride=1)
        self.linear = nn.Linear(1024, 10)

    def forward(self, x):
        h = self.pre_layers(x)
        h = self.b3(self.a3(h))
        h = self.maxpool(h)
        for stage in (self.a4, self.b4, self.c4, self.d4, self.e4):
            h = stage(h)
        h = self.maxpool(h)
        h = self.b5(self.a5(h))
        h = self.avgpool(h)
        return self.linear(h.view(h.size(0), -1))
def test():
    """Smoke test: forward one random CIFAR-sized image and print the shape."""
    model = GoogLeNet()
    out = model(torch.randn(1, 3, 32, 32))
    print(out.size())
# test()
================================================
FILE: GC_code/CIFAR100/models/lenet.py
================================================
'''LeNet in PyTorch.'''
import torch.nn as nn
import torch.nn.functional as F
class LeNet(nn.Module):
    """Classic LeNet-5 for 32x32 RGB inputs, 10-way output."""

    def __init__(self):
        super(LeNet, self).__init__()
        # Two 5x5 conv stages followed by three fully-connected layers.
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        # conv -> relu -> 2x2 max-pool, twice: 32 -> 14 -> 5 spatial.
        h = F.max_pool2d(F.relu(self.conv1(x)), 2)
        h = F.max_pool2d(F.relu(self.conv2(h)), 2)
        h = h.view(h.size(0), -1)
        h = F.relu(self.fc1(h))
        h = F.relu(self.fc2(h))
        # Raw logits; the loss applies softmax.
        return self.fc3(h)
================================================
FILE: GC_code/CIFAR100/models/mobilenet.py
================================================
'''MobileNet in PyTorch.
See the paper "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications"
for more details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F
class Block(nn.Module):
    """MobileNet unit: depthwise 3x3 conv then pointwise 1x1 conv,
    each followed by BatchNorm and ReLU."""

    def __init__(self, in_planes, out_planes, stride=1):
        super(Block, self).__init__()
        # groups=in_planes makes conv1 depthwise (one filter per channel).
        self.conv1 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=in_planes, bias=False)
        self.bn1 = nn.BatchNorm2d(in_planes)
        # 1x1 pointwise conv mixes channels and changes the width.
        self.conv2 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        depthwise = F.relu(self.bn1(self.conv1(x)))
        return F.relu(self.bn2(self.conv2(depthwise)))
class MobileNet(nn.Module):
    """MobileNet v1 for 32x32 inputs.

    cfg entries are either an int (output width, stride 1) or a
    (width, stride) tuple; e.g. (128, 2) halves the spatial size.
    """
    cfg = [64, (128,2), 128, (256,2), 256, (512,2), 512, 512, 512, 512, 512, (1024,2), 1024]

    def __init__(self, num_classes=10):
        super(MobileNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.layers = self._make_layers(in_planes=32)
        self.linear = nn.Linear(1024, num_classes)

    def _make_layers(self, in_planes):
        # Expand each cfg entry into a depthwise-separable Block.
        blocks = []
        for spec in self.cfg:
            if isinstance(spec, int):
                planes, stride = spec, 1
            else:
                planes, stride = spec
            blocks.append(Block(in_planes, planes, stride))
            in_planes = planes
        return nn.Sequential(*blocks)

    def forward(self, x):
        h = F.relu(self.bn1(self.conv1(x)))
        h = self.layers(h)
        h = F.avg_pool2d(h, 2)
        h = h.view(h.size(0), -1)
        return self.linear(h)
def test():
    """Smoke test: forward one random CIFAR-sized image and print the shape."""
    model = MobileNet()
    out = model(torch.randn(1, 3, 32, 32))
    print(out.size())
# test()
================================================
FILE: GC_code/CIFAR100/models/mobilenetv2.py
================================================
'''MobileNetV2 in PyTorch.
See the paper "Inverted Residuals and Linear Bottlenecks:
Mobile Networks for Classification, Detection and Segmentation" for more details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F
class Block(nn.Module):
    """MobileNetV2 inverted residual: 1x1 expand -> 3x3 depthwise -> 1x1
    project (linear), with an identity/projected shortcut at stride 1."""

    def __init__(self, in_planes, out_planes, expansion, stride):
        super(Block, self).__init__()
        self.stride = stride
        hidden = expansion * in_planes
        # Expansion 1x1 conv.
        self.conv1 = nn.Conv2d(in_planes, hidden, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn1 = nn.BatchNorm2d(hidden)
        # Depthwise 3x3 conv carries the stride.
        self.conv2 = nn.Conv2d(hidden, hidden, kernel_size=3, stride=stride, padding=1, groups=hidden, bias=False)
        self.bn2 = nn.BatchNorm2d(hidden)
        # Linear projection back down (no ReLU afterwards).
        self.conv3 = nn.Conv2d(hidden, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        self.shortcut = nn.Sequential()
        if stride == 1 and in_planes != out_planes:
            # Match channel counts so the residual add is valid.
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False),
                nn.BatchNorm2d(out_planes),
            )

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)))
        y = F.relu(self.bn2(self.conv2(y)))
        y = self.bn3(self.conv3(y))
        # Residual connection only when the spatial size is unchanged.
        if self.stride == 1:
            y = y + self.shortcut(x)
        return y
class MobileNetV2(nn.Module):
    """MobileNetV2 for 32x32 inputs (early strides reduced vs. ImageNet)."""
    # Each tuple: (expansion, out_planes, num_blocks, stride-of-first-block).
    cfg = [(1, 16, 1, 1),
           (6, 24, 2, 1),  # NOTE: change stride 2 -> 1 for CIFAR10
           (6, 32, 3, 2),
           (6, 64, 4, 2),
           (6, 96, 3, 1),
           (6, 160, 3, 2),
           (6, 320, 1, 1)]

    def __init__(self, num_classes=10):
        super(MobileNetV2, self).__init__()
        # NOTE: change conv1 stride 2 -> 1 for CIFAR10
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.layers = self._make_layers(in_planes=32)
        # Final 1x1 expansion before the classifier.
        self.conv2 = nn.Conv2d(320, 1280, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(1280)
        self.linear = nn.Linear(1280, num_classes)

    def _make_layers(self, in_planes):
        blocks = []
        for t, c, n, s in self.cfg:
            # Only the first block of a stage may downsample.
            for stride in [s] + [1] * (n - 1):
                blocks.append(Block(in_planes, c, t, stride))
                in_planes = c
        return nn.Sequential(*blocks)

    def forward(self, x):
        h = F.relu(self.bn1(self.conv1(x)))
        h = self.layers(h)
        h = F.relu(self.bn2(self.conv2(h)))
        # NOTE: change pooling kernel_size 7 -> 4 for CIFAR10
        h = F.avg_pool2d(h, 4)
        h = h.view(h.size(0), -1)
        return self.linear(h)
def test():
    """Smoke test: forward a batch of two random images and print the shape."""
    model = MobileNetV2()
    out = model(torch.randn(2, 3, 32, 32))
    print(out.size())
# test()
================================================
FILE: GC_code/CIFAR100/models/pnasnet.py
================================================
'''PNASNet in PyTorch.
Paper: Progressive Neural Architecture Search
'''
import torch
import torch.nn as nn
import torch.nn.functional as F
class SepConv(nn.Module):
    """Separable convolution: grouped (per-input-channel) conv + BatchNorm.

    NOTE(review): out_planes must be divisible by in_planes because
    groups=in_planes.
    """

    def __init__(self, in_planes, out_planes, kernel_size, stride):
        super(SepConv, self).__init__()
        # 'same'-style padding for odd kernel sizes.
        self.conv1 = nn.Conv2d(in_planes, out_planes,
                               kernel_size, stride,
                               padding=(kernel_size - 1) // 2,
                               bias=False, groups=in_planes)
        self.bn1 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        return self.bn1(self.conv1(x))
class CellA(nn.Module):
    """PNASNet cell A: sum of a 7x7 separable-conv branch and a max-pool
    branch (the pool branch gets a 1x1 projection when downsampling)."""

    def __init__(self, in_planes, out_planes, stride=1):
        super(CellA, self).__init__()
        self.stride = stride
        self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride)
        if stride == 2:
            # Project the pooled branch so channel counts match.
            self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
            self.bn1 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        left = self.sep_conv1(x)
        right = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1)
        if self.stride == 2:
            right = self.bn1(self.conv1(right))
        return F.relu(left + right)
class CellB(nn.Module):
    """PNASNet cell B: two branch pairs (7x7+3x3 sep-convs; max-pool+5x5
    sep-conv) concatenated, then reduced back with a 1x1 conv."""

    def __init__(self, in_planes, out_planes, stride=1):
        super(CellB, self).__init__()
        self.stride = stride
        # Left branch pair.
        self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride)
        self.sep_conv2 = SepConv(in_planes, out_planes, kernel_size=3, stride=stride)
        # Right branch pair.
        self.sep_conv3 = SepConv(in_planes, out_planes, kernel_size=5, stride=stride)
        if stride == 2:
            # Project the pooled branch when downsampling.
            self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
            self.bn1 = nn.BatchNorm2d(out_planes)
        # 1x1 conv that halves the concatenated channel count.
        self.conv2 = nn.Conv2d(2 * out_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        left_a = self.sep_conv1(x)
        left_b = self.sep_conv2(x)
        pooled = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1)
        if self.stride == 2:
            pooled = self.bn1(self.conv1(pooled))
        right_b = self.sep_conv3(x)
        merged = torch.cat([F.relu(left_a + left_b), F.relu(pooled + right_b)], 1)
        return F.relu(self.bn2(self.conv2(merged)))
class PNASNet(nn.Module):
    """PNASNet backbone: three stages of cells separated by downsampling
    cells, for 32x32 inputs with a 10-way classifier.

    NOTE(review): _make_layer hard-codes num_cells=6, so the constructor's
    num_cells argument is effectively unused (behavior kept as-is).
    """

    def __init__(self, cell_type, num_cells, num_planes):
        super(PNASNet, self).__init__()
        self.in_planes = num_planes
        self.cell_type = cell_type
        self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(num_planes)
        self.layer1 = self._make_layer(num_planes, num_cells=6)
        self.layer2 = self._downsample(num_planes * 2)
        self.layer3 = self._make_layer(num_planes * 2, num_cells=6)
        self.layer4 = self._downsample(num_planes * 4)
        self.layer5 = self._make_layer(num_planes * 4, num_cells=6)
        self.linear = nn.Linear(num_planes * 4, 10)

    def _make_layer(self, planes, num_cells):
        # A run of stride-1 cells at constant width.
        cells = []
        for _ in range(num_cells):
            cells.append(self.cell_type(self.in_planes, planes, stride=1))
            self.in_planes = planes
        return nn.Sequential(*cells)

    def _downsample(self, planes):
        # A single stride-2 cell that also widens the features.
        cell = self.cell_type(self.in_planes, planes, stride=2)
        self.in_planes = planes
        return cell

    def forward(self, x):
        h = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4, self.layer5):
            h = stage(h)
        h = F.avg_pool2d(h, 8)
        return self.linear(h.view(h.size(0), -1))
def PNASNetA():
    """PNASNet built from CellA, 44 base planes."""
    return PNASNet(CellA, num_cells=6, num_planes=44)


def PNASNetB():
    """PNASNet built from CellB, 32 base planes."""
    return PNASNet(CellB, num_cells=6, num_planes=32)


def test():
    """Smoke test: forward one random CIFAR-sized image through PNASNet-B."""
    model = PNASNetB()
    print(model(torch.randn(1, 3, 32, 32)))
# test()
================================================
FILE: GC_code/CIFAR100/models/preact_resnet.py
================================================
'''Pre-activation ResNet in PyTorch.
Reference:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
Identity Mappings in Deep Residual Networks. arXiv:1603.05027
'''
import torch
import torch.nn as nn
import torch.nn.functional as F
class PreActBlock(nn.Module):
    """Pre-activation BasicBlock: BN-ReLU precedes each conv, and the
    shortcut branches off the pre-activated tensor."""
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(PreActBlock, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        # Projection shortcut only when the shape changes; otherwise the
        # attribute is simply absent (checked with hasattr in forward).
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False)
            )

    def forward(self, x):
        pre = F.relu(self.bn1(x))
        # The projection (if any) consumes the pre-activated tensor;
        # the identity path uses the raw input.
        identity = self.shortcut(pre) if hasattr(self, 'shortcut') else x
        y = self.conv1(pre)
        y = self.conv2(F.relu(self.bn2(y)))
        return y + identity
class PreActBottleneck(nn.Module):
    """Pre-activation bottleneck: 1x1 reduce -> 3x3 -> 1x1 expand (x4),
    with BN-ReLU before every conv."""
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(PreActBottleneck, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False)
        # Projection shortcut only on shape change (absent otherwise).
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False)
            )

    def forward(self, x):
        pre = F.relu(self.bn1(x))
        identity = self.shortcut(pre) if hasattr(self, 'shortcut') else x
        y = self.conv1(pre)
        y = self.conv2(F.relu(self.bn2(y)))
        y = self.conv3(F.relu(self.bn3(y)))
        return y + identity
class PreActResNet(nn.Module):
    """Pre-activation ResNet for 32x32 inputs (stem has no BN/ReLU since
    every block pre-activates its own input)."""

    def __init__(self, block, num_blocks, num_classes=10):
        super(PreActResNet, self).__init__()
        self.in_planes = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        # Only the first block of a stage downsamples.
        blocks = []
        for s in [stride] + [1] * (num_blocks - 1):
            blocks.append(block(self.in_planes, planes, s))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*blocks)

    def forward(self, x):
        h = self.conv1(x)
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            h = stage(h)
        h = F.avg_pool2d(h, 4)
        return self.linear(h.view(h.size(0), -1))
def PreActResNet18():
    """18-layer pre-activation ResNet (BasicBlock, 2-2-2-2)."""
    return PreActResNet(PreActBlock, [2, 2, 2, 2])


def PreActResNet34():
    """34-layer pre-activation ResNet (BasicBlock, 3-4-6-3)."""
    return PreActResNet(PreActBlock, [3, 4, 6, 3])


def PreActResNet50():
    """50-layer pre-activation ResNet (Bottleneck, 3-4-6-3)."""
    return PreActResNet(PreActBottleneck, [3, 4, 6, 3])


def PreActResNet101():
    """101-layer pre-activation ResNet (Bottleneck, 3-4-23-3)."""
    return PreActResNet(PreActBottleneck, [3, 4, 23, 3])


def PreActResNet152():
    """152-layer pre-activation ResNet (Bottleneck, 3-8-36-3)."""
    return PreActResNet(PreActBottleneck, [3, 8, 36, 3])


def test():
    """Smoke test: forward one random CIFAR-sized image and print the shape."""
    out = PreActResNet18()(torch.randn(1, 3, 32, 32))
    print(out.size())
# test()
================================================
FILE: GC_code/CIFAR100/models/resnet.py
================================================
'''ResNet in PyTorch.
For Pre-activation ResNet, see 'preact_resnet.py'.
Reference:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
Deep Residual Learning for Image Recognition. arXiv:1512.03385
'''
import torch
import torch.nn as nn
import torch.nn.functional as F
class BasicBlock(nn.Module):
    """Post-activation ResNet basic block: two 3x3 convs plus a shortcut,
    ReLU applied after the residual add."""
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # Identity shortcut by default; 1x1 projection on shape change.
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion * planes)
            )

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)))
        y = self.bn2(self.conv2(y))
        y = y + self.shortcut(x)
        return F.relu(y)
class Bottleneck(nn.Module):
    """Post-activation ResNet bottleneck: 1x1 reduce -> 3x3 -> 1x1 expand
    (x4), ReLU after the residual add."""
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.expansion * planes)

        # Identity shortcut by default; 1x1 projection on shape change.
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion * planes)
            )

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)))
        y = F.relu(self.bn2(self.conv2(y)))
        y = self.bn3(self.conv3(y))
        y = y + self.shortcut(x)
        return F.relu(y)
class ResNet(nn.Module):
    """ResNet backbone for 32x32 inputs (3x3 stem, 4x4 global pooling)."""

    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        self.in_planes = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        # First block of each stage carries the (possible) downsampling.
        blocks = []
        for s in [stride] + [1] * (num_blocks - 1):
            blocks.append(block(self.in_planes, planes, s))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*blocks)

    def forward(self, x):
        h = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            h = stage(h)
        h = F.avg_pool2d(h, 4)
        return self.linear(h.view(h.size(0), -1))
def ResNet18(Num_classes=10):
    """ResNet-18 (BasicBlock, 2-2-2-2)."""
    return ResNet(BasicBlock, [2, 2, 2, 2], num_classes=Num_classes)


def ResNet34(Num_classes=10):
    """ResNet-34 (BasicBlock, 3-4-6-3)."""
    return ResNet(BasicBlock, [3, 4, 6, 3], num_classes=Num_classes)


def ResNet50(Num_classes=10):
    """ResNet-50 (Bottleneck, 3-4-6-3)."""
    return ResNet(Bottleneck, [3, 4, 6, 3], num_classes=Num_classes)


def ResNet101(Num_classes=10):
    """ResNet-101 (Bottleneck, 3-4-23-3)."""
    return ResNet(Bottleneck, [3, 4, 23, 3], num_classes=Num_classes)


def ResNet152(Num_classes=10):
    """ResNet-152 (Bottleneck, 3-8-36-3)."""
    return ResNet(Bottleneck, [3, 8, 36, 3], num_classes=Num_classes)


def test():
    """Smoke test: forward one random CIFAR-sized image and print the shape."""
    print(ResNet18()(torch.randn(1, 3, 32, 32)).size())
# test()
================================================
FILE: GC_code/CIFAR100/models/resnext.py
================================================
'''ResNeXt in PyTorch.
See the paper "Aggregated Residual Transformations for Deep Neural Networks" for more details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F
class Block(nn.Module):
    """ResNeXt block: 1x1 -> grouped 3x3 (cardinality groups) -> 1x1 expand,
    ReLU after the residual add."""
    expansion = 2

    def __init__(self, in_planes, cardinality=32, bottleneck_width=4, stride=1):
        super(Block, self).__init__()
        width = cardinality * bottleneck_width
        self.conv1 = nn.Conv2d(in_planes, width, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(width)
        # The grouped conv is what distinguishes ResNeXt from ResNet.
        self.conv2 = nn.Conv2d(width, width, kernel_size=3, stride=stride, padding=1, groups=cardinality, bias=False)
        self.bn2 = nn.BatchNorm2d(width)
        self.conv3 = nn.Conv2d(width, self.expansion * width, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.expansion * width)

        # Identity shortcut by default; 1x1 projection on shape change.
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * width:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * width, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion * width)
            )

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)))
        y = F.relu(self.bn2(self.conv2(y)))
        y = self.bn3(self.conv3(y))
        y = y + self.shortcut(x)
        return F.relu(y)
class ResNeXt(nn.Module):
    """Three-stage ResNeXt for 32x32 inputs (layer4 of the ImageNet version
    is omitted; final features are pooled over 8x8)."""

    def __init__(self, num_blocks, cardinality, bottleneck_width, num_classes=10):
        super(ResNeXt, self).__init__()
        self.cardinality = cardinality
        self.bottleneck_width = bottleneck_width
        self.in_planes = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(num_blocks[0], 1)
        self.layer2 = self._make_layer(num_blocks[1], 2)
        self.layer3 = self._make_layer(num_blocks[2], 2)
        # self.layer4 = self._make_layer(num_blocks[3], 2)
        # After three stages bottleneck_width has doubled twice, so the
        # final width is cardinality * bottleneck_width * 8.
        self.linear = nn.Linear(cardinality * bottleneck_width * 8, num_classes)

    def _make_layer(self, num_blocks, stride):
        blocks = []
        for s in [stride] + [1] * (num_blocks - 1):
            blocks.append(Block(self.in_planes, self.cardinality, self.bottleneck_width, s))
            self.in_planes = Block.expansion * self.cardinality * self.bottleneck_width
        # Increase bottleneck_width by 2 after each stage.
        self.bottleneck_width *= 2
        return nn.Sequential(*blocks)

    def forward(self, x):
        h = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3):
            h = stage(h)
        # h = self.layer4(h)
        h = F.avg_pool2d(h, 8)
        return self.linear(h.view(h.size(0), -1))
def ResNeXt29_2x64d(Num_classes=10):
    """ResNeXt-29 with cardinality 2, bottleneck width 64."""
    return ResNeXt(num_blocks=[3, 3, 3], cardinality=2, bottleneck_width=64, num_classes=Num_classes)


def ResNeXt29_4x64d(Num_classes=10):
    """ResNeXt-29 with cardinality 4, bottleneck width 64."""
    return ResNeXt(num_blocks=[3, 3, 3], cardinality=4, bottleneck_width=64, num_classes=Num_classes)


def ResNeXt29_8x64d(Num_classes=10):
    """ResNeXt-29 with cardinality 8, bottleneck width 64."""
    return ResNeXt(num_blocks=[3, 3, 3], cardinality=8, bottleneck_width=64, num_classes=Num_classes)


def ResNeXt29_32x4d(Num_classes=10):
    """ResNeXt-29 with cardinality 32, bottleneck width 4."""
    return ResNeXt(num_blocks=[3, 3, 3], cardinality=32, bottleneck_width=4, num_classes=Num_classes)


def test_resnext():
    """Smoke test: forward one random CIFAR-sized image and print the shape."""
    out = ResNeXt29_2x64d()(torch.randn(1, 3, 32, 32))
    print(out.size())
# test_resnext()
================================================
FILE: GC_code/CIFAR100/models/senet.py
================================================
'''SENet in PyTorch.
SENet is the winner of ImageNet-2017. The paper is not released yet.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F
class BasicBlock(nn.Module):
    """ResNet basic block with a squeeze-and-excitation (SE) channel gate.

    Fix: use torch.sigmoid instead of the long-deprecated F.sigmoid
    (removed from torch.nn.functional in recent PyTorch releases);
    numerically identical.
    """

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes)
            )
        # SE layers: 1x1 convs act as per-channel fully-connected layers.
        # NOTE(review): planes//16 is 0 for planes < 16; fine for the widths
        # used here (>= 64) but would break for very narrow blocks.
        self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1)  # Use nn.Conv2d instead of nn.Linear
        self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        # Squeeze: global average pool (assumes square feature maps, since
        # the kernel is taken from out.size(2) only).
        w = F.avg_pool2d(out, out.size(2))
        w = F.relu(self.fc1(w))
        w = torch.sigmoid(self.fc2(w))  # was F.sigmoid (deprecated)
        # Excitation: per-channel re-weighting, broadcast over H and W.
        out = out * w
        out += self.shortcut(x)
        out = F.relu(out)
        return out
class PreActBlock(nn.Module):
    """Pre-activation basic block with a squeeze-and-excitation gate.

    Fix: use torch.sigmoid instead of the long-deprecated F.sigmoid
    (removed from torch.nn.functional in recent PyTorch releases);
    numerically identical.
    """

    def __init__(self, in_planes, planes, stride=1):
        super(PreActBlock, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        # Projection shortcut only on shape change; absent otherwise
        # (forward tests with hasattr).
        if stride != 1 or in_planes != planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False)
            )
        # SE layers (1x1 convs as channel-wise fully-connected layers).
        self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1)
        self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1)

    def forward(self, x):
        out = F.relu(self.bn1(x))
        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
        out = self.conv1(out)
        out = self.conv2(F.relu(self.bn2(out)))
        # Squeeze: global average pool (assumes square feature maps).
        w = F.avg_pool2d(out, out.size(2))
        w = F.relu(self.fc1(w))
        w = torch.sigmoid(self.fc2(w))  # was F.sigmoid (deprecated)
        # Excitation: channel-wise re-weighting.
        out = out * w
        out += shortcut
        return out
class SENet(nn.Module):
    """SE-ResNet backbone for 32x32 inputs.

    NOTE(review): the classifier is fixed at 512 inputs, so blocks are
    assumed to have expansion 1 (true for the SE blocks in this file).
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super(SENet, self).__init__()
        self.in_planes = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        # Only the first block of each stage downsamples.
        blocks = []
        for s in [stride] + [1] * (num_blocks - 1):
            blocks.append(block(self.in_planes, planes, s))
            self.in_planes = planes
        return nn.Sequential(*blocks)

    def forward(self, x):
        h = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            h = stage(h)
        h = F.avg_pool2d(h, 4)
        return self.linear(h.view(h.size(0), -1))
def SENet18():
    """SE-ResNet-18 built from pre-activation SE blocks (2-2-2-2)."""
    return SENet(PreActBlock, [2, 2, 2, 2])


def test():
    """Smoke test: forward one random CIFAR-sized image and print the shape."""
    print(SENet18()(torch.randn(1, 3, 32, 32)).size())
# test()
================================================
FILE: GC_code/CIFAR100/models/shufflenet.py
================================================
'''ShuffleNet in PyTorch.
See the paper "ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices" for more details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F
class ShuffleBlock(nn.Module):
    """Channel shuffle for grouped convolutions.

    Fix: C // g instead of C / g — true division returns a float on
    Python 3, which makes Tensor.view() raise a TypeError. C must be a
    multiple of g, so integer division is exact.
    """

    def __init__(self, groups):
        super(ShuffleBlock, self).__init__()
        self.groups = groups

    def forward(self, x):
        '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,W] -> [N,C,H,W]'''
        N, C, H, W = x.size()
        g = self.groups
        # Reshape into (groups, channels-per-group), swap the two axes,
        # then flatten back so channels from different groups interleave.
        return x.view(N, g, C // g, H, W).permute(0, 2, 1, 3, 4).contiguous().view(N, C, H, W)
class Bottleneck(nn.Module):
def __init__(self, in_planes, out_planes, stride, groups):
super(Bottleneck, self).__init__()
self.stride = stride
mid_planes = out_planes/4
g = 1 if in_planes==24 else groups
self.conv1 = nn.Conv2d(in_planes, mid_planes, kernel_size=1, groups=g, bias=False)
self.bn1 = nn.BatchNorm2d(mid_planes)
self.shuffle1 = ShuffleBlock(groups=g)
self.conv2 = nn.Conv2d(mid_planes, mid_planes, kernel_size=3, stride=stride, padding=1, groups=mid_planes, bias=False)
self.bn2 = nn.BatchNorm2d(mid_planes)
self.conv3 = nn.Conv2d(mid_planes, out_planes, kernel_size=1, groups=groups, bias=False)
self.bn3 = nn.BatchNorm2d(out_planes)
self.shortcut = nn.Sequential()
if stride == 2:
self.shortcut = nn.Sequential(nn.AvgPool2d(3, stride=2, padding=1))
def forward(self, x):
out = F.relu(self.bn1(self.conv1(x)))
out = self.shuffle1(out)
out = F.relu(self.bn2(self.conv2(out)))
out = self.bn3(self.conv3(out))
res = self.shortcut(x)
out = F.relu(torch.cat([out,res], 1)) if self.stride==2 else F.relu(out+res)
return out
class ShuffleNet(nn.Module):
    """ShuffleNet for 32x32 inputs, configured by a dict with keys
    'out_planes', 'num_blocks' and 'groups'."""

    def __init__(self, cfg):
        super(ShuffleNet, self).__init__()
        out_planes = cfg['out_planes']
        num_blocks = cfg['num_blocks']
        groups = cfg['groups']
        self.conv1 = nn.Conv2d(3, 24, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(24)
        self.in_planes = 24
        self.layer1 = self._make_layer(out_planes[0], num_blocks[0], groups)
        self.layer2 = self._make_layer(out_planes[1], num_blocks[1], groups)
        self.layer3 = self._make_layer(out_planes[2], num_blocks[2], groups)
        self.linear = nn.Linear(out_planes[2], 10)

    def _make_layer(self, out_planes, num_blocks, groups):
        blocks = []
        for idx in range(num_blocks):
            first = idx == 0
            # The stride-2 first unit concatenates its shortcut, so its conv
            # path only produces out_planes minus the shortcut channels.
            blocks.append(Bottleneck(self.in_planes,
                                     out_planes - (self.in_planes if first else 0),
                                     stride=2 if first else 1,
                                     groups=groups))
            self.in_planes = out_planes
        return nn.Sequential(*blocks)

    def forward(self, x):
        h = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3):
            h = stage(h)
        h = F.avg_pool2d(h, 4)
        return self.linear(h.view(h.size(0), -1))
def ShuffleNetG2():
    """ShuffleNet with 2 convolution groups."""
    settings = {
        'out_planes': [200, 400, 800],
        'num_blocks': [4, 8, 4],
        'groups': 2
    }
    return ShuffleNet(settings)


def ShuffleNetG3():
    """ShuffleNet with 3 convolution groups."""
    settings = {
        'out_planes': [240, 480, 960],
        'num_blocks': [4, 8, 4],
        'groups': 3
    }
    return ShuffleNet(settings)


def test():
    """Smoke test: forward one random CIFAR-sized image through G2."""
    model = ShuffleNetG2()
    print(model(torch.randn(1, 3, 32, 32)))
# test()
================================================
FILE: GC_code/CIFAR100/models/vgg.py
================================================
'''VGG11/13/16/19 in Pytorch.'''
import torch
import torch.nn as nn
# Per-architecture layer specifications: integers are 3x3 conv output
# channel counts, 'M' marks a 2x2 stride-2 max-pooling layer
# (consumed by VGG._make_layers below).
cfg = {
'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}
class VGG(nn.Module):
    """VGG with BatchNorm for 32x32 inputs; variant chosen by name from the
    module-level cfg table ('VGG11'/'VGG13'/'VGG16'/'VGG19')."""

    def __init__(self, vgg_name, Num_classes=100):
        super(VGG, self).__init__()
        self.features = self._make_layers(cfg[vgg_name])
        # Five 'M' pools reduce 32x32 to 1x1, leaving 512 features.
        self.classifier = nn.Linear(512, Num_classes)

    def forward(self, x):
        feats = self.features(x)
        feats = feats.view(feats.size(0), -1)
        return self.classifier(feats)

    def _make_layers(self, cfg):
        layers = []
        channels = 3
        for spec in cfg:
            if spec == 'M':
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
            else:
                layers.extend([nn.Conv2d(channels, spec, kernel_size=3, padding=1),
                               nn.BatchNorm2d(spec),
                               nn.ReLU(inplace=True)])
                channels = spec
        # Trailing 1x1 average pool is a no-op kept for compatibility.
        layers.append(nn.AvgPool2d(kernel_size=1, stride=1))
        return nn.Sequential(*layers)
def test():
    """Smoke test: forward two random CIFAR-sized images through VGG11."""
    model = VGG('VGG11')
    out = model(torch.randn(2, 3, 32, 32))
    print(out.size())
# test()
================================================
FILE: GC_code/CIFAR100/os_run.py
================================================
# Launcher: trains ResNet-50 on CIFAR-100 with plain SGD and with
# SGD + Gradient Centralization, redirecting output to logout/*.log.
# NOTE(review): assumes a logout/ directory already exists next to main.py,
# and that `time` (imported below) is intentionally unused here.
import os,time
#cifar100 sgd & sgdGCC
os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgd --epochs 200 --model r50 > logout/r50_lr11_wd45_sgd.log ")
os.system("nohup python main.py --lr 0.1 --wd 0.0005 --alg sgdGC --epochs 200 --model r50 > logout/r50_lr11_wd45_sgdGC.log ")
================================================
FILE: GC_code/Fine-grained_classification/SGD.py
================================================
import torch
from torch.optim.optimizer import Optimizer, required
class SGD_GCC(Optimizer):
    """SGD with momentum plus Gradient Centralization applied to
    convolutional layers only (parameters with more than 3 dimensions).

    GC subtracts, from each output filter's gradient, the mean of that
    gradient over all remaining dimensions, before the momentum update.

    Fix: the deprecated ``Tensor.add_(scalar, tensor)`` /
    ``Tensor.add(scalar, tensor)`` positional-alpha overloads (removed in
    modern PyTorch) are replaced with the ``alpha=`` keyword form;
    the arithmetic is unchanged.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGD_GCC, self).__setstate__(state)
        for group in self.param_groups:
            # Checkpoints saved before 'nesterov' existed lack the key.
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                if weight_decay != 0:
                    d_p.add_(p.data, alpha=weight_decay)
                # GC operation for Conv layers: centralize per output filter.
                if d_p.dim() > 3:
                    d_p.sub_(d_p.mean(dim=tuple(range(1, d_p.dim())), keepdim=True))
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: initialize buffer to the raw gradient.
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf
                p.data.add_(d_p, alpha=-group['lr'])
        return loss
class SGD_GC(Optimizer):
    """SGD with momentum plus Gradient Centralization applied to both
    convolutional AND fully-connected layers (any parameter with more
    than 1 dimension).

    Fix: the deprecated ``Tensor.add_(scalar, tensor)`` /
    ``Tensor.add(scalar, tensor)`` positional-alpha overloads (removed in
    modern PyTorch) are replaced with the ``alpha=`` keyword form;
    the arithmetic is unchanged.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGD_GC, self).__setstate__(state)
        for group in self.param_groups:
            # Checkpoints saved before 'nesterov' existed lack the key.
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                if weight_decay != 0:
                    d_p.add_(p.data, alpha=weight_decay)
                # GC operation for Conv layers and FC layers: centralize the
                # gradient of every weight matrix/tensor per output unit.
                if d_p.dim() > 1:
                    d_p.sub_(d_p.mean(dim=tuple(range(1, d_p.dim())), keepdim=True))
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: initialize buffer to the raw gradient.
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf
                p.data.add_(d_p, alpha=-group['lr'])
        return loss
class SGDW(Optimizer):
    """SGD with decoupled weight decay (SGDW).

    Unlike vanilla SGD, weight decay is NOT folded into the gradient; after
    the gradient step, the pre-step weights are shrunk directly by
    ``lr * weight_decay`` (decoupled weight decay regularization).

    Arguments:
        params (iterable): parameters to optimize or dicts defining groups.
        lr (float): learning rate.
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): decoupled decay factor (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restore state; backfill 'nesterov' for old checkpoints."""
        super(SGDW, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                # snapshot of pre-step weights: decoupled decay below must
                # shrink the weights as they were before the gradient step
                old = torch.clone(p.data).detach()
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # first step: buffer starts at zero, no dampening applied
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        # keyword `alpha=` form: the positional overload is
                        # deprecated and rejected by recent PyTorch
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf
                p.data.add_(d_p, alpha=-group['lr'])
                # decoupled weight decay applied to the pre-step weights
                if weight_decay != 0:
                    p.data.add_(old, alpha=-weight_decay * group['lr'])
        return loss
class SGDW_GCC(Optimizer):
    """SGDW (decoupled weight decay) plus Gradient Centralization on Conv layers.

    Combines the two variants: gradients with more than 3 dims (conv kernels)
    are centralized (mean over all dims except dim 0 subtracted), and weight
    decay is applied directly to the pre-step weights after the gradient step.

    Arguments:
        params (iterable): parameters to optimize or dicts defining groups.
        lr (float): learning rate.
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): decoupled decay factor (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restore state; backfill 'nesterov' for old checkpoints."""
        super(SGDW_GCC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                # snapshot of pre-step weights for the decoupled decay below
                old = torch.clone(p.data).detach()
                # GC operation for Conv layers (>3 dims only)
                if len(list(d_p.size())) > 3:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, len(list(d_p.size())))), keepdim=True))
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # first step: buffer starts at zero, no dampening applied
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        # keyword `alpha=` form: the positional overload is
                        # deprecated and rejected by recent PyTorch
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf
                p.data.add_(d_p, alpha=-group['lr'])
                # decoupled weight decay applied to the pre-step weights
                if weight_decay != 0:
                    p.data.add_(old, alpha=-weight_decay * group['lr'])
        return loss
================================================
FILE: GC_code/Fine-grained_classification/main.py
================================================
import argparse
import os
import random
import shutil
import time
import warnings
import sys
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
from torch.optim import lr_scheduler
from SGD import SGD_GC #import SGD with GC
# All lowercase, callable entries of torchvision.models: the legal values
# for --arch.
model_names = sorted(name for name in models.__dict__
    if name.islower() and not name.startswith("__")
    and callable(models.__dict__[name]))

# Command-line interface for fine-grained classification training.
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
parser.add_argument('-b', '--batch-size', default=256, type=int,
    metavar='N',
    help='mini-batch size (default: 256), this is the total '
    'batch size of all GPUs on the current node when '
    'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr', '--learning-rate', default=0.1*128/128, type=float,
    metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50',
    choices=model_names,
    help='model architecture: ' +
    ' | '.join(model_names) +
    ' (default: resnet18)')
parser.add_argument('data', metavar='DIR',
    help='path to dataset')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
    help='number of data loading workers (default: 4)')
parser.add_argument('--epochs', default=100, type=int, metavar='N',
    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
    help='manual epoch number (useful on restarts)')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
    metavar='W', help='weight decay (default: 1e-4)',
    dest='weight_decay')
parser.add_argument('-p', '--print-freq', default=100, type=int,
    metavar='N', help='print frequency (default: 10)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
    help='use pre-trained model')
# Distributed-training options (world size / rank / rendezvous URL / backend).
parser.add_argument('--world-size', default=-1, type=int,
    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
    help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
    help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
    help='Use multi-processing distributed training to launch '
    'N processes per node, which has N GPUs. This is the '
    'fastest way to use PyTorch for either single node or '
    'multi node data parallel training')
# Experiment selection: backbone variant, output path tag, optimizer, dataset.
parser.add_argument('--model', default='r50p', type=str, help='model')
parser.add_argument('--path', default='test', type=str, help='model')
parser.add_argument('--alg', default='sgd', type=str, help='algorithm')
parser.add_argument('--dataset', default='cub', type=str, help='model')

# Best top-1 validation accuracy seen so far (updated in main_worker).
best_acc1 = 0
def main():
    """Parse the CLI arguments and launch the training worker(s)."""
    args = parser.parse_args()
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')
    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')
    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])
    args.distributed = args.world_size > 1 or args.multiprocessing_distributed
    gpus_on_node = torch.cuda.device_count()
    if not args.multiprocessing_distributed:
        # Single process drives all visible GPUs (or just the requested one).
        main_worker(args.gpu, gpus_on_node, args)
        return
    # One process per GPU: the total world size grows accordingly, then
    # torch.multiprocessing forks one main_worker per local GPU.
    args.world_size = gpus_on_node * args.world_size
    mp.spawn(main_worker, nprocs=gpus_on_node, args=(gpus_on_node, args))
def main_worker(gpu, ngpus_per_node, args):
    """Build model, optimizers and data loaders on one worker, then train/evaluate.

    Arguments:
        gpu: GPU index assigned to this worker, or None to use all visible GPUs.
        ngpus_per_node: number of GPUs on this node (rank / batch-size math).
        args: parsed command-line namespace produced by `parser`.
    """
    global best_acc1
    args.gpu = gpu
    # number of classes for each supported fine-grained dataset
    class_num = {'cub': 200, 'cars': 196, 'dogs': 120, 'fgvc': 100}
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
    # create model and swap the ImageNet head for a dataset-sized classifier.
    # BUGFIX: resnet18's final feature dim is 512, not 2048 (resnet50's);
    # the original 2048 head would fail at the first forward pass.
    if args.model == 'r18p':
        model = models.resnet18(pretrained=True)
        model.fc = nn.Linear(in_features=512, out_features=class_num[args.dataset], bias=True)
    if args.model == 'r18':
        model = models.resnet18()
        model.fc = nn.Linear(in_features=512, out_features=class_num[args.dataset], bias=True)
    if args.model == 'r50p':
        model = models.resnet50(pretrained=True)
        model.fc = nn.Linear(in_features=2048, out_features=class_num[args.dataset], bias=True)
    if args.model == 'r50':
        model = models.resnet50()
        model.fc = nn.Linear(in_features=2048, out_features=class_num[args.dataset], bias=True)
    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model)
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    # Split the parameters into the pretrained backbone and the new FC head so
    # each gets its own optimizer/lr multiplier.
    # BUGFIX: the original built these groups only for the r50 variants,
    # leaving param_groups_* undefined (NameError) for r18/r18p runs.
    if args.model in ('r18p', 'r18', 'r50p', 'r50'):
        new_param_ids = set(map(id, model.module.fc.parameters()))
        base_params = [p for p in model.parameters() if
                       id(p) not in new_param_ids]
        param_groups_base = [{'params': base_params, 'lr_mult': 0.1}]
        param_groups_new = [{'params': model.module.fc.parameters(), 'lr_mult': 1.0}]
    # choose optimizer: plain SGD or SGD with Gradient Centralization
    if args.alg == 'sgd':
        optimizer_base = torch.optim.SGD(param_groups_base, args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
        optimizer_new = torch.optim.SGD(param_groups_new, args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
    if args.alg == 'sgdGC':
        optimizer_base = SGD_GC(param_groups_base, args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
        optimizer_new = SGD_GC(param_groups_new, args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
    exp_lr_scheduler_new = lr_scheduler.MultiStepLR(optimizer_new, milestones=[50, 80], gamma=0.1)
    exp_lr_scheduler_base = lr_scheduler.MultiStepLR(optimizer_base, milestones=[50, 80], gamma=0.1)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            # BUGFIX: the original referenced an undefined name `optimizer`;
            # restore both optimizers when their states were saved.
            if 'optimizer_base' in checkpoint:
                optimizer_base.load_state_dict(checkpoint['optimizer_base'])
            if 'optimizer_new' in checkpoint:
                optimizer_new.load_state_dict(checkpoint['optimizer_new'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    cudnn.benchmark = True
    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.Resize(512),
            transforms.RandomHorizontalFlip(),
            transforms.CenterCrop(448),
            transforms.ToTensor(),
            normalize,
        ]))
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True)
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(512),
            transforms.CenterCrop(448),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True, drop_last=True)
    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        # train for one epoch, then step both LR schedulers
        train(train_loader, model, criterion, optimizer_base, optimizer_new, epoch, args)
        exp_lr_scheduler_new.step()
        exp_lr_scheduler_base.step()
        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)
        # remember best acc@1 and save checkpoint (only rank 0 per node saves)
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                    and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
            }, is_best)
# train
def train(train_loader, model, criterion, optimizer_base, optimizer_new, epoch, args):
    """Run one training epoch, stepping both optimizers on every batch.

    Arguments:
        train_loader: iterable of (input, target) batches.
        model: network to train (switched to train mode here).
        criterion: loss function.
        optimizer_base: optimizer driving the backbone parameters.
        optimizer_new: optimizer driving the new classifier-head parameters.
        epoch: current epoch index (used for logging only).
        args: parsed command-line namespace.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    total = 0
    train_loss = 0
    correct = 0
    # switch to train mode
    model.train()
    print('\nEpoch: %d' % epoch)
    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        #if args.gpu is not None:
        #input = input.cuda(args.gpu, non_blocking=True)
        #target = target.cuda(args.gpu, non_blocking=True)
        input, target = input.to('cuda'), target.to('cuda')
        # compute output
        output = model(input)
        loss = criterion(output, target)
        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), input.size(0))
        top1.update(acc1[0], input.size(0))
        top5.update(acc5[0], input.size(0))
        _, predicted = output.max(1)
        correct += predicted.eq(target).sum().item()
        train_loss += loss.item()
        #correct +=acc1[0]
        total += target.size(0)
        # compute gradient and do SGD step: both optimizers share one backward
        # pass, so zero both before and step both after
        optimizer_new.zero_grad()
        optimizer_base.zero_grad()
        loss.backward()
        optimizer_new.step()
        optimizer_base.step()
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
    print('Training: Top1: {top1.avg:.4f}|Top5:{top5.avg:.4f}'.format(top1=top1, top5=top5))
    #print('Training: Loss: {:.3f} | Acc: {:.3f}'.format(train_loss/(i+1),correct/total))
# test
def validate(val_loader, model, criterion, args):
    """Evaluate the model on the validation set.

    Arguments:
        val_loader: iterable of (input, target) batches.
        model: network to evaluate (switched to eval mode here).
        criterion: loss function, used for bookkeeping only.
        args: parsed namespace; args.gpu selects the target device.

    Returns:
        Average top-1 accuracy over all validation batches.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    val_loss = 0
    total = 0
    correct = 0
    # switch to evaluate mode
    model.eval()
    with torch.no_grad():
        end = time.time()
        for i, (input, target) in enumerate(val_loader):
            if args.gpu is not None:
                input = input.cuda(args.gpu, non_blocking=True)
                target = target.cuda(args.gpu, non_blocking=True)
            # compute output
            output = model(input)
            loss = criterion(output, target)
            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), input.size(0))
            top1.update(acc1[0], input.size(0))
            top5.update(acc5[0], input.size(0))
            _, predicted = output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()
            # BUGFIX: accumulate the loss exactly once per batch; the original
            # also re-added the final batch's loss after the loop
            val_loss += loss.item()
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
    print('Testing: Top1: {top1.avg:.4f}|Top5:{top5.avg:.4f}'.format(top1=top1, top5=top5))
    return top1.avg
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Serialize ``state`` to ``filename``; when it is the best checkpoint so
    far, also mirror it to 'model_best.pth.tar' in the working directory."""
    torch.save(state, filename)
    if not is_best:
        return
    shutil.copyfile(filename, 'model_best.pth.tar')
class AverageMeter(object):
    """Tracks the most recent value plus a running sum/count/average."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear every statistic back to zero."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times and refresh the running average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count
def adjust_learning_rate(optimizer, epoch, args):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
    decayed = args.lr * 0.1 ** (epoch // 30)
    for group in optimizer.param_groups:
        group['lr'] = decayed
def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k.

    Arguments:
        output: logits of shape (batch, num_classes).
        target: ground-truth class indices of shape (batch,).
        topk: tuple of k values to compute accuracy for.

    Returns:
        List of 1-element tensors, one fraction in [0, 1] per k in ``topk``.
    """
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            # BUGFIX: use reshape, not view — view raises on the
            # non-contiguous slices this produces in modern PyTorch
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(1.0 / batch_size))
        return res
# Script entry point: run training when executed directly.
if __name__ == '__main__':
    main()
================================================
FILE: GC_code/Fine-grained_classification/os_run.py
================================================
import os,time

# One training job per (dataset, optimizer) pair, executed sequentially;
# each run's output is redirected into its own log file under logout/.
_JOBS = (
    "nohup python -W ignore main.py /home/yonghw/data/data/CUB_200_2011/ --model r50p -b 128 --alg sgd --dataset cub > logout/Cub_r50p_sgd_b128_g4.log ",
    "nohup python -W ignore main.py /home/yonghw/data/data/CUB_200_2011/ --model r50p -b 128 --alg sgdGC --dataset cub > logout/Cub_r50p_sgdGC_b128_g4.log ",
    "nohup python -W ignore main.py /home/yonghw/data/data/Car196/ --model r50p -b 128 --alg sgd --dataset cars > logout/Car_r50p_sgd_b128_g4.log ",
    "nohup python -W ignore main.py /home/yonghw/data/data/Car196/ --model r50p -b 128 --alg sgdGC --dataset cars> logout/Car_r50p_sgdGC_b128_g4.log ",
    "nohup python -W ignore main.py /home/yonghw/data/data/fgvc_aricraft/ --model r50p -b 128 --alg sgd --dataset fgvc > logout/Ari_r50p_sgd_b128_g4.log ",
    "nohup python -W ignore main.py /home/yonghw/data/data/fgvc_aricraft/ --model r50p -b 128 --alg sgdGC --dataset fgvc > logout/Ari_r50p_sgdGC_b128_g4.log ",
    "nohup python -W ignore main.py /home/yonghw/data/data/StanfordDogs/ --model r50p -b 128 --alg sgd --dataset dogs > logout/Dog_r50p_sgd_b128_g4.log ",
    "nohup python -W ignore main.py /home/yonghw/data/data/StanfordDogs/ --model r50p -b 128 --alg sgdGC --dataset dogs > logout/Dog_r50p_sgdGC_b128_g4.log ",
)
for _job in _JOBS:
    os.system(_job)
================================================
FILE: GC_code/ImageNet/SGD.py
================================================
import torch
from torch.optim.optimizer import Optimizer, required
class SGD_GCC(Optimizer):
    """SGD with Gradient Centralization applied to Conv layers only (GC-C).

    Identical to ``torch.optim.SGD`` except that, for every parameter whose
    gradient has more than 3 dimensions (conv kernels), the gradient's mean
    over all dims except dim 0 (output channels) is subtracted before the
    update.

    Arguments:
        params (iterable): parameters to optimize or dicts defining groups.
        lr (float): learning rate.
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): L2 penalty (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).

    Raises:
        ValueError: on negative lr/momentum/weight_decay, or Nesterov with
            zero momentum / nonzero dampening.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restore state; backfill 'nesterov' for old checkpoints."""
        super(SGD_GCC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                if weight_decay != 0:
                    # keyword `alpha=` form: the positional (alpha, tensor)
                    # overload is deprecated and rejected by recent PyTorch
                    d_p.add_(p.data, alpha=weight_decay)
                # GC operation for Conv layers (>3 dims only)
                if len(list(d_p.size())) > 3:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, len(list(d_p.size())))), keepdim=True))
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf
                p.data.add_(d_p, alpha=-group['lr'])
        return loss
class SGD_GC(Optimizer):
    """SGD with Gradient Centralization on Conv AND FC layers (GC).

    Identical to ``torch.optim.SGD`` except that, for every parameter whose
    gradient has more than one dimension (conv kernels and FC weight
    matrices), the gradient's mean over all dims except dim 0 is subtracted
    before the update.

    Arguments:
        params (iterable): parameters to optimize or dicts defining groups.
        lr (float): learning rate.
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): L2 penalty (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).

    Raises:
        ValueError: on negative lr/momentum/weight_decay, or Nesterov with
            zero momentum / nonzero dampening.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GC, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restore state; backfill 'nesterov' for old checkpoints."""
        super(SGD_GC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                if weight_decay != 0:
                    # keyword `alpha=` form: the positional (alpha, tensor)
                    # overload is deprecated and rejected by recent PyTorch
                    d_p.add_(p.data, alpha=weight_decay)
                # GC operation for Conv layers and FC layers
                if len(list(d_p.size())) > 1:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, len(list(d_p.size())))), keepdim=True))
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf
                p.data.add_(d_p, alpha=-group['lr'])
        return loss
class SGDW(Optimizer):
    """SGD with decoupled weight decay (SGDW).

    Unlike vanilla SGD, weight decay is NOT folded into the gradient; after
    the gradient step, the pre-step weights are shrunk directly by
    ``lr * weight_decay`` (decoupled weight decay regularization).

    Arguments:
        params (iterable): parameters to optimize or dicts defining groups.
        lr (float): learning rate.
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): decoupled decay factor (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restore state; backfill 'nesterov' for old checkpoints."""
        super(SGDW, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                # snapshot of pre-step weights: decoupled decay below must
                # shrink the weights as they were before the gradient step
                old = torch.clone(p.data).detach()
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # first step: buffer starts at zero, no dampening applied
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        # keyword `alpha=` form: the positional overload is
                        # deprecated and rejected by recent PyTorch
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf
                p.data.add_(d_p, alpha=-group['lr'])
                # decoupled weight decay applied to the pre-step weights
                if weight_decay != 0:
                    p.data.add_(old, alpha=-weight_decay * group['lr'])
        return loss
class SGDW_GCC(Optimizer):
    """SGDW (decoupled weight decay) plus Gradient Centralization on Conv layers.

    Combines the two variants: gradients with more than 3 dims (conv kernels)
    are centralized (mean over all dims except dim 0 subtracted), and weight
    decay is applied directly to the pre-step weights after the gradient step.

    Arguments:
        params (iterable): parameters to optimize or dicts defining groups.
        lr (float): learning rate.
        momentum (float, optional): momentum factor (default: 0).
        dampening (float, optional): dampening for momentum (default: 0).
        weight_decay (float, optional): decoupled decay factor (default: 0).
        nesterov (bool, optional): enables Nesterov momentum (default: False).
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restore state; backfill 'nesterov' for old checkpoints."""
        super(SGDW_GCC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                # snapshot of pre-step weights for the decoupled decay below
                old = torch.clone(p.data).detach()
                # GC operation for Conv layers (>3 dims only)
                if len(list(d_p.size())) > 3:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, len(list(d_p.size())))), keepdim=True))
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # first step: buffer starts at zero, no dampening applied
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        # keyword `alpha=` form: the positional overload is
                        # deprecated and rejected by recent PyTorch
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf
                p.data.add_(d_p, alpha=-group['lr'])
                # decoupled weight decay applied to the pre-step weights
                if weight_decay != 0:
                    p.data.add_(old, alpha=-weight_decay * group['lr'])
        return loss
================================================
FILE: GC_code/ImageNet/main.py
================================================
import argparse
import os
import random
import shutil
import time
import warnings
import sys
#nohup python -W ignore main.py /mnt/v0/ --model r50bn --alg sgd1 -b 256 --gpug 1 --path r50bn_sgd1_b256_g4 > logout/r50bn_sgd1_b256_g4.log
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
#from myresnet_nbn import resnet18_nbn, resnet101_nbn,resnet50_nbn
from myresnet import resnet50, resnet101
from myresnetgn import resnet50gn, resnet101gn
from torch.optim import lr_scheduler
from SGD import SGD_GCC #import SGD with GC for Conv layer
# All lowercase, callable entries of torchvision.models: the legal values
# for --arch.
model_names = sorted(name for name in models.__dict__
    if name.islower() and not name.startswith("__")
    and callable(models.__dict__[name]))

# Command-line interface for ImageNet training.
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
parser.add_argument('-b', '--batch-size', default=256, type=int,
    metavar='N',
    help='mini-batch size (default: 256), this is the total '
    'batch size of all GPUs on the current node when '
    'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr', '--learning-rate', default=0.1*128/128, type=float,
    metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50',
    choices=model_names,
    help='model architecture: ' +
    ' | '.join(model_names) +
    ' (default: resnet18)')
parser.add_argument('data', metavar='DIR',
    help='path to dataset')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
    help='number of data loading workers (default: 4)')
parser.add_argument('--epochs', default=100, type=int, metavar='N',
    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
    help='manual epoch number (useful on restarts)')
parser.add_argument('--bgn', default=1, type=int, help='bn group number')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
    metavar='W', help='weight decay (default: 1e-4)',
    dest='weight_decay')
parser.add_argument('-p', '--print-freq', default=100, type=int,
    metavar='N', help='print frequency (default: 10)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
    help='use pre-trained model')
# Distributed-training options (world size / rank / rendezvous URL / backend).
parser.add_argument('--world-size', default=-1, type=int,
    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
    help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
    help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
    help='Use multi-processing distributed training to launch '
    'N processes per node, which has N GPUs. This is the '
    'fastest way to use PyTorch for either single node or '
    'multi node data parallel training')
# Experiment selection: backbone/normalization variant, output tag, optimizer.
parser.add_argument('--model', default='r50bn', type=str, help='model')
parser.add_argument('--path', default='test', type=str, help='model')
parser.add_argument('--alg', default='sgd', type=str, help='algorithm')

# Best top-1 validation accuracy seen so far (updated in main_worker).
best_acc1 = 0
# Default GPU ids for DataParallel on an 8-GPU node.
device_ids=[0,1,2,3,4,5,6,7]
def main():
    """Entry point: parse CLI args, configure seeding/GPU visibility, then
    launch either a single worker or one worker process per GPU."""
    args = parser.parse_args()
    # Restrict this job to the first four physical GPUs.
    os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3"
    if args.seed is not None:
        # Deterministic mode: reproducible but slower (cuDNN autotune is off).
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')
    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')
    if args.dist_url == "env://" and args.world_size == -1:
        # World size supplied by the launcher environment (torchrun-style).
        args.world_size = int(os.environ["WORLD_SIZE"])
    # Distributed when multi-node, or when spawning one process per GPU.
    args.distributed = args.world_size > 1 or args.multiprocessing_distributed
    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)
def main_worker(gpu, ngpus_per_node, args):
    """Per-process worker: build model/optimizer/data loaders, optionally
    resume from a checkpoint, then run the train/validate epoch loop.

    Args:
        gpu: local GPU index when spawned by mp.spawn, else args.gpu (may be None).
        ngpus_per_node: number of GPUs visible on this node.
        args: parsed command-line namespace (mutated in place).
    """
    global best_acc1
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
    # create model: ResNet-50/101 with BatchNorm or GroupNorm variants.
    if args.model=='r50bn':
        model = resnet50()
    if args.model=='r50gn':
        model = resnet50gn()
    if args.model=='r101bn':
        model = resnet101()
    if args.model=='r101gn':
        model = resnet101gn()
    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            # NOTE(review): the reference example passes device_ids=[args.gpu]
            # here; without it DDP may use all visible devices -- confirm.
            model = torch.nn.parallel.DistributedDataParallel(model)
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        # NOTE(review): args.arch is not among the options visible in this
        # file (--model is used instead); confirm --arch is defined earlier,
        # otherwise this branch raises AttributeError.
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    # choose optimizer: plain SGD or SGD with Gradient Centralization.
    if args.alg=='sgd':
        optimizer =torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum,weight_decay=args.weight_decay)
    if args.alg=='sgdGC':
        optimizer = SGD_GCC(model.parameters(), args.lr, momentum=args.momentum,weight_decay=args.weight_decay)
    # Step decay: lr *= 0.1 every 30 epochs.
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    # Autotune conv algorithms (safe: inputs have fixed 224x224 shape).
    cudnn.benchmark = True
    # Data loading code: standard ImageNet folder layout with train/ and val/.
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler,drop_last=True)
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True,drop_last=True)
    if args.evaluate:
        # Evaluation-only mode: one validation pass, then exit.
        validate(val_loader, model, criterion, args)
        return
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            # Re-shuffle shards deterministically per epoch.
            train_sampler.set_epoch(epoch)
        #adjust_learning_rate(optimizer, epoch, args)
        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)
        exp_lr_scheduler.step()
        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)
        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        # Only rank 0 on each node writes checkpoints.
        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer' : optimizer.state_dict(),
            }, is_best)
            # Also persist the full (unwrapped) module for later analysis.
            torch.save(model.module, './result_model/'+args.path+'.pth')
# train
def train(train_loader, model, criterion, optimizer, epoch, args):
    """Run one training epoch over *train_loader*, updating *model* in place.

    Prints the epoch's average top-1/top-5 training accuracy at the end.
    Assumes CUDA is available (inputs are moved to 'cuda' unconditionally).
    """
    batch_time = AverageMeter()   # wall time per batch (compute + data)
    data_time = AverageMeter()    # time spent waiting on the data loader
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    # train_loss / correct / total feed only the commented-out print below.
    total = 0
    train_loss = 0
    correct = 0
    # switch to train mode
    model.train()
    print('\nEpoch: %d' % epoch)
    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        #if args.gpu is not None:
        #input = input.cuda(args.gpu, non_blocking=True)
        #target = target.cuda(args.gpu, non_blocking=True)
        input, target = input.to('cuda'), target.to('cuda')
        # compute output
        output = model(input)
        loss = criterion(output, target)
        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), input.size(0))
        top1.update(acc1[0], input.size(0))
        top5.update(acc5[0], input.size(0))
        _, predicted = output.max(1)
        correct += predicted.eq(target).sum().item()
        train_loss += loss.item()
        #correct +=acc1[0]
        total += target.size(0)
        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
    print('Training: Top1: {top1.avg:.4f}|Top5:{top5.avg:.4f}'.format(top1=top1, top5=top5))
    #print('Training: Loss: {:.3f} | Acc: {:.3f}'.format(train_loss/(i+1),correct/total))
# validate
def validate(val_loader, model, criterion, args):
    """Evaluate *model* on *val_loader* and return the average top-1 accuracy.

    Runs in eval mode under torch.no_grad(); prints average top-1/top-5.

    Bug fix: the original repeated ``val_loss += loss.item()`` once more after
    the loop, double-counting the last batch and raising NameError when the
    loader is empty. The stray line is removed (val_loss is an informal
    accumulator and is never returned or printed).
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    val_loss = 0
    total = 0
    correct = 0
    # switch to evaluate mode
    model.eval()
    with torch.no_grad():
        end = time.time()
        for i, (input, target) in enumerate(val_loader):
            if args.gpu is not None:
                input = input.cuda(args.gpu, non_blocking=True)
                target = target.cuda(args.gpu, non_blocking=True)
            # compute output
            output = model(input)
            loss = criterion(output, target)
            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), input.size(0))
            top1.update(acc1[0], input.size(0))
            top5.update(acc5[0], input.size(0))
            _, predicted = output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()
            val_loss += loss.item()
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
    print('Testing: Top1: {top1.avg:.4f}|Top5:{top5.avg:.4f}'.format(top1=top1, top5=top5))
    return top1.avg
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Serialize *state* to *filename*; mirror it to 'model_best.pth.tar' when best."""
    torch.save(state, filename)
    if not is_best:
        return
    shutil.copyfile(filename, 'model_best.pth.tar')
class AverageMeter(object):
    """Tracks the most recent value and a count-weighted running average."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero out all statistics."""
        self.val, self.avg, self.sum, self.count = 0, 0, 0, 0

    def update(self, val, n=1):
        """Record *val* observed *n* times and refresh the running average."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
def adjust_learning_rate(optimizer, epoch, args):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
    decayed = args.lr * (0.1 ** (epoch // 30))
    for group in optimizer.param_groups:
        group['lr'] = decayed
def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k.

    Args:
        output: (batch, num_classes) score/logit tensor.
        target: (batch,) ground-truth class indices.
        topk: iterable of k values to evaluate.

    Returns:
        List of one-element tensors, one per k, holding the top-k accuracy
        as a fraction in [0, 1].
    """
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            # Bug fix: correct[:k] is non-contiguous for k > 1, and .view(-1)
            # raises a RuntimeError on recent PyTorch; .reshape(-1) copies
            # when needed and is always valid.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(1.0 / batch_size))
        return res
# Standard script entry point: run only when executed directly.
if __name__ == '__main__':
    main()
================================================
FILE: GC_code/ImageNet/myresnet.py
================================================
from __future__ import print_function, division, absolute_import
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo
# Public API of this module.
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
           'resnet152']
# Download locations of the official torchvision pretrained checkpoints.
model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
def conv3x3(in_planes, out_planes, stride=1):
    """Return a 3x3 convolution with padding 1 (spatial size kept at stride 1)."""
    return nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias=True,
    )
class BasicBlock(nn.Module):
    """Residual block: two 3x3 convs with BatchNorm and a shortcut connection."""
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # Main branch: conv-bn-relu-conv-bn.
        branch = self.relu(self.bn1(self.conv1(x)))
        branch = self.bn2(self.conv2(branch))
        # Shortcut: projected when shape changes, otherwise identity.
        shortcut = x if self.downsample is None else self.downsample(x)
        return self.relu(branch + shortcut)
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=True)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=1, bias=True)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=True)
self.bn3 = nn.BatchNorm2d(planes * 4)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
#from torch.legacy import nn as nnl
class ResNet(nn.Module):
    """ImageNet ResNet backbone: 7x7 stem -> 4 residual stages -> avgpool -> fc.

    NOTE(review): unlike torchvision, all convolutions here use bias=True even
    though BatchNorm follows -- presumably kept for checkpoint compatibility.
    """

    def __init__(self, block, layers, num_classes=1000):
        # Channel count fed into the first residual stage.
        self.inplanes = 64
        super(ResNet, self).__init__()
        # Stem: 7x7/2 conv + BN + ReLU + 3x3/2 max-pool (4x spatial downsample).
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=True)
        #self.conv1 = nnl.SpatialConvolution(3, 64, 7, 7, 2, 2, 3, 3)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # Four residual stages; stages 2-4 halve the spatial size.
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        # Fixed 7x7 average pool: assumes 224x224 input -- TODO confirm.
        self.avgpool = nn.AvgPool2d(7)
        self.fc = nn.Linear(512 * block.expansion, num_classes)
        # He-style initialization for convs; BN scale=1, shift=0.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        """Stack *blocks* residual blocks; the first may downsample/project."""
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            # 1x1 conv projection so the shortcut matches the branch shape.
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=True),
                nn.BatchNorm2d(planes * block.expansion),
            )
        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        # Stash the stem conv output on the module (for analysis elsewhere,
        # presumably -- no consumer is visible in this file).
        self.conv1_input = x.clone()
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x
def resnet18(pretrained=False, **kwargs):
    """Constructs a ResNet-18 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
    return net
def resnet34(pretrained=False, **kwargs):
    """Constructs a ResNet-34 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
    return net
def resnet50(pretrained=False, **kwargs):
    """Constructs a ResNet-50 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
    return net
def resnet101(pretrained=False, **kwargs):
    """Constructs a ResNet-101 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
    return net
def resnet152(pretrained=False, **kwargs):
    """Constructs a ResNet-152 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
    return net
def test():
    """Smoke test: forward a random batch through ResNet-18 and print shapes.

    Bug fix: the original referenced ``Variable`` without importing it, so
    calling test() raised NameError; ``Variable`` is also long deprecated.
    Plain tensors work directly with autograd-era PyTorch.
    """
    import torch  # local import: this module only imports torch.nn at top level
    net = resnet18()
    net.eval()
    x = torch.randn(2, 3, 224, 224)
    y = net(x)
    print(y.size())
    print(net)
#test()
================================================
FILE: GC_code/ImageNet/myresnetgn.py
================================================
from __future__ import print_function, division, absolute_import
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo
# Public API of this module (GroupNorm ResNet variants).
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
           'resnet152']
# URLs of the official (BatchNorm) torchvision checkpoints.
model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution, padding 1, with bias."""
    conv_kwargs = dict(kernel_size=3, stride=stride, padding=1, bias=True)
    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)
class BasicBlock(nn.Module):
    """Residual block: two 3x3 convs, each followed by 32-group GroupNorm."""
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.GroupNorm(32,planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.GroupNorm(32,planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # Shortcut: projected when shape changes, otherwise identity.
        identity = x if self.downsample is None else self.downsample(x)
        # Main branch: conv-gn-relu-conv-gn.
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        return self.relu(out + identity)
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=True)
self.bn1 = nn.GroupNorm(32,planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=1, bias=True)
self.bn2 = nn.GroupNorm(32,planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=True)
self.bn3 = nn.GroupNorm(32,planes * 4)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
#from torch.legacy import nn as nnl
class ResNet(nn.Module):
    """ImageNet ResNet backbone with GroupNorm (32 groups) instead of BatchNorm.

    Structure: 7x7 stem -> 4 residual stages -> avgpool -> fc.
    """

    def __init__(self, block, layers, num_classes=1000):
        # Channel count fed into the first residual stage.
        self.inplanes = 64
        super(ResNet, self).__init__()
        # Stem: 7x7/2 conv + GN + ReLU + 3x3/2 max-pool (4x spatial downsample).
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=True)
        #self.conv1 = nnl.SpatialConvolution(3, 64, 7, 7, 2, 2, 3, 3)
        self.bn1 = nn.GroupNorm(32,64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # Four residual stages; stages 2-4 halve the spatial size.
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        # Fixed 7x7 average pool: assumes 224x224 input -- TODO confirm.
        self.avgpool = nn.AvgPool2d(7)
        self.fc = nn.Linear(512 * block.expansion, num_classes)
        # He-style initialization for convs; GroupNorm scale=1, shift=0.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.GroupNorm):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        """Stack *blocks* residual blocks; the first may downsample/project."""
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            # 1x1 conv projection so the shortcut matches the branch shape.
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=True),
                nn.GroupNorm(32,planes * block.expansion),
            )
        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        # Stash the stem conv output on the module (for analysis elsewhere,
        # presumably -- no consumer is visible in this file).
        self.conv1_input = x.clone()
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x
def resnet18gn(pretrained=False, **kwargs):
    """Constructs a ResNet-18 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
    if not pretrained:
        return net
    # NOTE(review): the URL points at BatchNorm weights; loading them into a
    # GroupNorm model likely fails on running-stat keys -- confirm.
    net.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
    return net
def resnet34gn(pretrained=False, **kwargs):
    """Constructs a ResNet-34 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
    return net
def resnet50gn(pretrained=False, **kwargs):
    """Constructs a ResNet-50 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
    return net
def resnet101gn(pretrained=False, **kwargs):
    """Constructs a ResNet-101 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
    return net
def resnet152gn(pretrained=False, **kwargs):
    """Constructs a ResNet-152 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
    return net
def test():
    """Smoke test: forward a random batch through GroupNorm ResNet-18, print shapes.

    Bug fix: this module never imports ``torch`` (only torch.nn / math /
    model_zoo at the top), so ``torch.randn`` raised NameError when test()
    was called. Import it locally here.
    """
    import torch  # local import: not available at module level in this file
    net = resnet18gn()
    net.eval()
    x = torch.randn(2, 3, 224, 224)
    y = net(x)
    print(y.size())
    print(net)
#test()
================================================
FILE: GC_code/ImageNet/os_run.py
================================================
# Launcher script for the two ImageNet runs (baseline SGD vs SGD with GC).
# NOTE(review): each command string starts with '#', so the shell treats the
# entire command as a comment and os.system does nothing -- these look like
# deliberately disabled templates; remove the leading '#' (keeping 'nohup')
# to actually launch a run.
import os,time
os.system("#nohup python -W ignore main.py /ssd/data/yonghw/Imagenet/v0/ --model r50bn --alg sgd -b 256 --path r50bn_sgd_b256_g4 > logout/r50bn_sgd_b256_g4.log &")
os.system("#nohup python -W ignore main.py /ssd/data/yonghw/Imagenet/v0/ --model r50bn --alg sgdGC -b 256 --path r50bn_sgdGC_b256_g4 > logout/r50bn_sgdGC_b256_g4.log &")
================================================
FILE: GC_code/Mini_ImageNet/SGD.py
================================================
import torch
from torch.optim.optimizer import Optimizer, required
class SGD_GCC(Optimizer):
    """SGD with momentum where gradients of Conv-layer weights (>= 4-D tensors)
    are mean-centered per filter before the update (Gradient Centralization).

    Interface matches torch.optim.SGD: lr, momentum, dampening, weight_decay,
    nesterov. Fix: the deprecated positional ``Tensor.add_(alpha, tensor)``
    overload (removed in recent PyTorch) is replaced by the ``alpha=`` keyword
    form; numerics are unchanged.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGD_GCC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                if weight_decay != 0:
                    # L2 penalty folded into the gradient.
                    d_p.add_(p.data, alpha=weight_decay)
                # GC operation for Conv layers: subtract the mean over all
                # dims except dim 0 (i.e. per output filter).
                if len(list(d_p.size())) > 3:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, len(list(d_p.size())))), keepdim=True))
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: buffer starts as the raw (centered) gradient.
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf
                p.data.add_(d_p, alpha=-group['lr'])
        return loss
class SGD_GC(Optimizer):
    """SGD with momentum where gradients of Conv AND fully-connected weights
    (any tensor with >= 2 dims) are mean-centered per output unit (GC).

    Interface matches torch.optim.SGD. Fix: the deprecated positional
    ``Tensor.add_(alpha, tensor)`` overload (removed in recent PyTorch) is
    replaced by the ``alpha=`` keyword form; numerics are unchanged.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD_GC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGD_GC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                if weight_decay != 0:
                    # L2 penalty folded into the gradient.
                    d_p.add_(p.data, alpha=weight_decay)
                # GC operation for Conv layers and FC layers: subtract the
                # mean over all dims except dim 0.
                if len(list(d_p.size())) > 1:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, len(list(d_p.size())))), keepdim=True))
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: buffer starts as the raw (centered) gradient.
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf
                p.data.add_(d_p, alpha=-group['lr'])
        return loss
class SGDW(Optimizer):
    """SGD with momentum and *decoupled* weight decay (SGDW): the decay is
    applied directly to the pre-step weights instead of being folded into the
    gradient.

    Interface matches torch.optim.SGD. Fix: the deprecated positional
    ``Tensor.add_(alpha, tensor)`` overload (removed in recent PyTorch) is
    replaced by the ``alpha=`` keyword form; numerics are unchanged.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGDW, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                # Snapshot of pre-step weights; decoupled decay uses these.
                old = torch.clone(p.data).detach()
                #if weight_decay != 0:
                #    d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: zero buffer, then fold in the gradient
                        # (no dampening applied on the very first update).
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf
                p.data.add_(d_p, alpha=-group['lr'])
                if weight_decay != 0:
                    # Decoupled decay: p -= lr * weight_decay * old_weights.
                    p.data.add_(old, alpha=-weight_decay * group['lr'])
        return loss
class SGDW_GCC(Optimizer):
    """SGDW (decoupled weight decay) plus Gradient Centralization on
    Conv-layer weights (>= 4-D tensors): gradients are mean-centered per
    filter before the momentum/update step.

    Interface matches torch.optim.SGD. Fix: the deprecated positional
    ``Tensor.add_(alpha, tensor)`` overload (removed in recent PyTorch) is
    replaced by the ``alpha=`` keyword form; numerics are unchanged.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGDW_GCC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGDW_GCC, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                # Snapshot of pre-step weights; decoupled decay uses these.
                old = torch.clone(p.data).detach()
                #if weight_decay != 0:
                #    d_p.add_(weight_decay, p.data)
                # GC operation for Conv layers: subtract the mean over all
                # dims except dim 0 (i.e. per output filter).
                if len(list(d_p.size())) > 3:
                    d_p.add_(-d_p.mean(dim=tuple(range(1, len(list(d_p.size())))), keepdim=True))
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: zero buffer, then fold in the gradient
                        # (no dampening applied on the very first update).
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf
                p.data.add_(d_p, alpha=-group['lr'])
                if weight_decay != 0:
                    # Decoupled decay: p -= lr * weight_decay * old_weights.
                    p.data.add_(old, alpha=-weight_decay * group['lr'])
        return loss
================================================
FILE: GC_code/Mini_ImageNet/main.py
================================================
import argparse
import os
import random
import shutil
import time
import warnings
import sys
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
from resnet_ws import l_resnet50
import torchvision.models as models
import math
import numpy as np
from torch.optim import lr_scheduler
from SGD import SGD_GC #import SGD with GC
# All lowercase, callable constructors exported by torchvision.models --
# used as the set of valid architecture names for the CLI.
model_names = sorted(name for name in models.__dict__
                     if name.islower() and not name.startswith("__")
                     and callable(models.__dict__[name]))
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
parser.add_argument('-b', '--batch_size', default=256, type=int,
metavar='N',
help='mini-batch size (default: 256), this is the total '
'batch s
gitextract_l162dn_3/
├── GC_code/
│ ├── CIFAR100/
│ │ ├── algorithm/
│ │ │ ├── Adagrad.py
│ │ │ ├── Adam.py
│ │ │ └── SGD.py
│ │ ├── main.py
│ │ ├── models/
│ │ │ ├── __init__.py
│ │ │ ├── densenet.py
│ │ │ ├── dpn.py
│ │ │ ├── googlenet.py
│ │ │ ├── lenet.py
│ │ │ ├── mobilenet.py
│ │ │ ├── mobilenetv2.py
│ │ │ ├── pnasnet.py
│ │ │ ├── preact_resnet.py
│ │ │ ├── resnet.py
│ │ │ ├── resnext.py
│ │ │ ├── senet.py
│ │ │ ├── shufflenet.py
│ │ │ └── vgg.py
│ │ └── os_run.py
│ ├── Fine-grained_classification/
│ │ ├── SGD.py
│ │ ├── main.py
│ │ └── os_run.py
│ ├── ImageNet/
│ │ ├── SGD.py
│ │ ├── main.py
│ │ ├── myresnet.py
│ │ ├── myresnetgn.py
│ │ └── os_run.py
│ └── Mini_ImageNet/
│ ├── SGD.py
│ ├── main.py
│ ├── os_run.py
│ └── resnet_ws.py
├── README.md
└── algorithm-GC/
├── README.md
├── algorithm/
│ ├── Adam.py
│ ├── Centralization.py
│ ├── Lookahead.py
│ ├── RAdam.py
│ ├── Ranger.py
│ └── SGD.py
└── cifar/
├── main.py
├── models/
│ ├── __init__.py
│ ├── densenet.py
│ ├── dpn.py
│ ├── googlenet.py
│ ├── lenet.py
│ ├── mobilenet.py
│ ├── mobilenetv2.py
│ ├── pnasnet.py
│ ├── preact_resnet.py
│ ├── resnet.py
│ ├── resnext.py
│ ├── senet.py
│ ├── shufflenet.py
│ └── vgg.py
├── nohup.out
├── os_run.py
└── os_run2.py
SYMBOL INDEX (523 symbols across 46 files)
FILE: GC_code/CIFAR100/algorithm/Adagrad.py
class Adagrad_GCC (line 5) | class Adagrad_GCC(Optimizer):
method __init__ (line 24) | def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0, initia...
method share_memory (line 46) | def share_memory(self):
method step (line 52) | def step(self, closure=None):
class Adagrad_GC (line 106) | class Adagrad_GC(Optimizer):
method __init__ (line 125) | def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0, initia...
method share_memory (line 147) | def share_memory(self):
method step (line 153) | def step(self, closure=None):
FILE: GC_code/CIFAR100/algorithm/Adam.py
class Adam_GCC (line 5) | class Adam_GCC(Optimizer):
method __init__ (line 6) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
method __setstate__ (line 20) | def __setstate__(self, state):
method step (line 25) | def step(self, closure=None):
class Adam_GCC2 (line 91) | class Adam_GCC2(Optimizer):
method __init__ (line 92) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
method __setstate__ (line 106) | def __setstate__(self, state):
method step (line 111) | def step(self, closure=None):
class Adam_GC (line 176) | class Adam_GC(Optimizer):
method __init__ (line 200) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
method __setstate__ (line 214) | def __setstate__(self, state):
method step (line 219) | def step(self, closure=None):
class Adam_GC2 (line 286) | class Adam_GC2(Optimizer):
method __init__ (line 287) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
method __setstate__ (line 301) | def __setstate__(self, state):
method step (line 306) | def step(self, closure=None):
class AdamW (line 371) | class AdamW(Optimizer):
method __init__ (line 391) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
method __setstate__ (line 405) | def __setstate__(self, state):
method step (line 410) | def step(self, closure=None):
class AdamW_GCC (line 474) | class AdamW_GCC(Optimizer):
method __init__ (line 494) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
method __setstate__ (line 508) | def __setstate__(self, state):
method step (line 513) | def step(self, closure=None):
class AdamW_GC (line 579) | class AdamW_GC(Optimizer):
method __init__ (line 599) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
method __setstate__ (line 613) | def __setstate__(self, state):
method step (line 618) | def step(self, closure=None):
class AdamW_GCC2 (line 684) | class AdamW_GCC2(Optimizer):
method __init__ (line 704) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
method __setstate__ (line 718) | def __setstate__(self, state):
method step (line 723) | def step(self, closure=None):
class AdamW_GC2 (line 790) | class AdamW_GC2(Optimizer):
method __init__ (line 810) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
method __setstate__ (line 824) | def __setstate__(self, state):
method step (line 829) | def step(self, closure=None):
FILE: GC_code/CIFAR100/algorithm/SGD.py
class SGD_GCC (line 6) | class SGD_GCC(Optimizer):
method __init__ (line 8) | def __init__(self, params, lr=required, momentum=0, dampening=0,
method __setstate__ (line 23) | def __setstate__(self, state):
method step (line 28) | def step(self, closure=None):
class SGD_GC (line 73) | class SGD_GC(Optimizer):
method __init__ (line 75) | def __init__(self, params, lr=required, momentum=0, dampening=0,
method __setstate__ (line 90) | def __setstate__(self, state):
method step (line 95) | def step(self, closure=None):
class SGDW (line 141) | class SGDW(Optimizer):
method __init__ (line 143) | def __init__(self, params, lr=required, momentum=0, dampening=0,
method __setstate__ (line 158) | def __setstate__(self, state):
method step (line 163) | def step(self, closure=None):
class SGDW_GCC (line 209) | class SGDW_GCC(Optimizer):
method __init__ (line 210) | def __init__(self, params, lr=required, momentum=0, dampening=0,
method __setstate__ (line 225) | def __setstate__(self, state):
method step (line 230) | def step(self, closure=None):
class SGDW_GC (line 281) | class SGDW_GC(Optimizer):
method __init__ (line 282) | def __init__(self, params, lr=required, momentum=0, dampening=0,
method __setstate__ (line 297) | def __setstate__(self, state):
method step (line 302) | def step(self, closure=None):
FILE: GC_code/CIFAR100/main.py
function train (line 165) | def train(epoch,net,optimizer):
function test (line 190) | def test(epoch,net):
FILE: GC_code/CIFAR100/models/densenet.py
class Bottleneck (line 9) | class Bottleneck(nn.Module):
method __init__ (line 10) | def __init__(self, in_planes, growth_rate):
method forward (line 17) | def forward(self, x):
class Transition (line 24) | class Transition(nn.Module):
method __init__ (line 25) | def __init__(self, in_planes, out_planes):
method forward (line 30) | def forward(self, x):
class DenseNet (line 36) | class DenseNet(nn.Module):
method __init__ (line 37) | def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_...
method _make_dense_layers (line 68) | def _make_dense_layers(self, block, in_planes, nblock):
method forward (line 75) | def forward(self, x):
function DenseNet121 (line 86) | def DenseNet121(Num_classes=10):
function DenseNet169 (line 89) | def DenseNet169(Num_classes=10):
function DenseNet201 (line 92) | def DenseNet201(Num_classes=10):
function DenseNet161 (line 95) | def DenseNet161(Num_classes=10):
function densenet_cifar (line 98) | def densenet_cifar(Num_classes=10):
function test (line 101) | def test():
FILE: GC_code/CIFAR100/models/dpn.py
class Bottleneck (line 7) | class Bottleneck(nn.Module):
method __init__ (line 8) | def __init__(self, last_planes, in_planes, out_planes, dense_depth, st...
method forward (line 27) | def forward(self, x):
class DPN (line 38) | class DPN(nn.Module):
method __init__ (line 39) | def __init__(self, cfg):
method _make_layer (line 53) | def _make_layer(self, in_planes, out_planes, num_blocks, dense_depth, ...
method forward (line 61) | def forward(self, x):
function DPN26 (line 73) | def DPN26():
function DPN92 (line 82) | def DPN92():
function test (line 92) | def test():
FILE: GC_code/CIFAR100/models/googlenet.py
class Inception (line 7) | class Inception(nn.Module):
method __init__ (line 8) | def __init__(self, in_planes, n1x1, n3x3red, n3x3, n5x5red, n5x5, pool...
method forward (line 48) | def forward(self, x):
class GoogLeNet (line 56) | class GoogLeNet(nn.Module):
method __init__ (line 57) | def __init__(self):
method forward (line 82) | def forward(self, x):
function test (line 101) | def test():
FILE: GC_code/CIFAR100/models/lenet.py
class LeNet (line 5) | class LeNet(nn.Module):
method __init__ (line 6) | def __init__(self):
method forward (line 14) | def forward(self, x):
FILE: GC_code/CIFAR100/models/mobilenet.py
class Block (line 11) | class Block(nn.Module):
method __init__ (line 13) | def __init__(self, in_planes, out_planes, stride=1):
method forward (line 20) | def forward(self, x):
class MobileNet (line 26) | class MobileNet(nn.Module):
method __init__ (line 30) | def __init__(self, num_classes=10):
method _make_layers (line 37) | def _make_layers(self, in_planes):
method forward (line 46) | def forward(self, x):
function test (line 55) | def test():
FILE: GC_code/CIFAR100/models/mobilenetv2.py
class Block (line 11) | class Block(nn.Module):
method __init__ (line 13) | def __init__(self, in_planes, out_planes, expansion, stride):
method forward (line 32) | def forward(self, x):
class MobileNetV2 (line 40) | class MobileNetV2(nn.Module):
method __init__ (line 50) | def __init__(self, num_classes=10):
method _make_layers (line 60) | def _make_layers(self, in_planes):
method forward (line 69) | def forward(self, x):
function test (line 80) | def test():
FILE: GC_code/CIFAR100/models/pnasnet.py
class SepConv (line 10) | class SepConv(nn.Module):
method __init__ (line 12) | def __init__(self, in_planes, out_planes, kernel_size, stride):
method forward (line 20) | def forward(self, x):
class CellA (line 24) | class CellA(nn.Module):
method __init__ (line 25) | def __init__(self, in_planes, out_planes, stride=1):
method forward (line 33) | def forward(self, x):
class CellB (line 40) | class CellB(nn.Module):
method __init__ (line 41) | def __init__(self, in_planes, out_planes, stride=1):
method forward (line 56) | def forward(self, x):
class PNASNet (line 71) | class PNASNet(nn.Module):
method __init__ (line 72) | def __init__(self, cell_type, num_cells, num_planes):
method _make_layer (line 88) | def _make_layer(self, planes, num_cells):
method _downsample (line 95) | def _downsample(self, planes):
method forward (line 100) | def forward(self, x):
function PNASNetA (line 112) | def PNASNetA():
function PNASNetB (line 115) | def PNASNetB():
function test (line 119) | def test():
FILE: GC_code/CIFAR100/models/preact_resnet.py
class PreActBlock (line 12) | class PreActBlock(nn.Module):
method __init__ (line 16) | def __init__(self, in_planes, planes, stride=1):
method forward (line 28) | def forward(self, x):
class PreActBottleneck (line 37) | class PreActBottleneck(nn.Module):
method __init__ (line 41) | def __init__(self, in_planes, planes, stride=1):
method forward (line 55) | def forward(self, x):
class PreActResNet (line 65) | class PreActResNet(nn.Module):
method __init__ (line 66) | def __init__(self, block, num_blocks, num_classes=10):
method _make_layer (line 77) | def _make_layer(self, block, planes, num_blocks, stride):
method forward (line 85) | def forward(self, x):
function PreActResNet18 (line 97) | def PreActResNet18():
function PreActResNet34 (line 100) | def PreActResNet34():
function PreActResNet50 (line 103) | def PreActResNet50():
function PreActResNet101 (line 106) | def PreActResNet101():
function PreActResNet152 (line 109) | def PreActResNet152():
function test (line 113) | def test():
FILE: GC_code/CIFAR100/models/resnet.py
class BasicBlock (line 14) | class BasicBlock(nn.Module):
method __init__ (line 17) | def __init__(self, in_planes, planes, stride=1):
method forward (line 31) | def forward(self, x):
class Bottleneck (line 39) | class Bottleneck(nn.Module):
method __init__ (line 42) | def __init__(self, in_planes, planes, stride=1):
method forward (line 58) | def forward(self, x):
class ResNet (line 67) | class ResNet(nn.Module):
method __init__ (line 68) | def __init__(self, block, num_blocks, num_classes=10):
method _make_layer (line 80) | def _make_layer(self, block, planes, num_blocks, stride):
method forward (line 88) | def forward(self, x):
function ResNet18 (line 100) | def ResNet18(Num_classes=10):
function ResNet34 (line 103) | def ResNet34(Num_classes=10):
function ResNet50 (line 106) | def ResNet50(Num_classes=10):
function ResNet101 (line 109) | def ResNet101(Num_classes=10):
function ResNet152 (line 112) | def ResNet152(Num_classes=10):
function test (line 116) | def test():
FILE: GC_code/CIFAR100/models/resnext.py
class Block (line 10) | class Block(nn.Module):
method __init__ (line 14) | def __init__(self, in_planes, cardinality=32, bottleneck_width=4, stri...
method forward (line 31) | def forward(self, x):
class ResNeXt (line 40) | class ResNeXt(nn.Module):
method __init__ (line 41) | def __init__(self, num_blocks, cardinality, bottleneck_width, num_clas...
method _make_layer (line 55) | def _make_layer(self, num_blocks, stride):
method forward (line 65) | def forward(self, x):
function ResNeXt29_2x64d (line 77) | def ResNeXt29_2x64d(Num_classes=10):
function ResNeXt29_4x64d (line 80) | def ResNeXt29_4x64d(Num_classes=10):
function ResNeXt29_8x64d (line 83) | def ResNeXt29_8x64d(Num_classes=10):
function ResNeXt29_32x4d (line 86) | def ResNeXt29_32x4d(Num_classes=10):
function test_resnext (line 89) | def test_resnext():
FILE: GC_code/CIFAR100/models/senet.py
class BasicBlock (line 10) | class BasicBlock(nn.Module):
method __init__ (line 11) | def __init__(self, in_planes, planes, stride=1):
method forward (line 29) | def forward(self, x):
class PreActBlock (line 45) | class PreActBlock(nn.Module):
method __init__ (line 46) | def __init__(self, in_planes, planes, stride=1):
method forward (line 62) | def forward(self, x):
class SENet (line 79) | class SENet(nn.Module):
method __init__ (line 80) | def __init__(self, block, num_blocks, num_classes=10):
method _make_layer (line 92) | def _make_layer(self, block, planes, num_blocks, stride):
method forward (line 100) | def forward(self, x):
function SENet18 (line 112) | def SENet18():
function test (line 116) | def test():
FILE: GC_code/CIFAR100/models/shufflenet.py
class ShuffleBlock (line 10) | class ShuffleBlock(nn.Module):
method __init__ (line 11) | def __init__(self, groups):
method forward (line 15) | def forward(self, x):
class Bottleneck (line 22) | class Bottleneck(nn.Module):
method __init__ (line 23) | def __init__(self, in_planes, out_planes, stride, groups):
method forward (line 41) | def forward(self, x):
class ShuffleNet (line 51) | class ShuffleNet(nn.Module):
method __init__ (line 52) | def __init__(self, cfg):
method _make_layer (line 66) | def _make_layer(self, out_planes, num_blocks, groups):
method forward (line 75) | def forward(self, x):
function ShuffleNetG2 (line 86) | def ShuffleNetG2():
function ShuffleNetG3 (line 94) | def ShuffleNetG3():
function test (line 103) | def test():
FILE: GC_code/CIFAR100/models/vgg.py
class VGG (line 14) | class VGG(nn.Module):
method __init__ (line 15) | def __init__(self, vgg_name,Num_classes=100):
method forward (line 20) | def forward(self, x):
method _make_layers (line 26) | def _make_layers(self, cfg):
function test (line 41) | def test():
FILE: GC_code/Fine-grained_classification/SGD.py
class SGD_GCC (line 6) | class SGD_GCC(Optimizer):
method __init__ (line 8) | def __init__(self, params, lr=required, momentum=0, dampening=0,
method __setstate__ (line 23) | def __setstate__(self, state):
method step (line 28) | def step(self, closure=None):
class SGD_GC (line 73) | class SGD_GC(Optimizer):
method __init__ (line 75) | def __init__(self, params, lr=required, momentum=0, dampening=0,
method __setstate__ (line 90) | def __setstate__(self, state):
method step (line 95) | def step(self, closure=None):
class SGDW (line 141) | class SGDW(Optimizer):
method __init__ (line 143) | def __init__(self, params, lr=required, momentum=0, dampening=0,
method __setstate__ (line 158) | def __setstate__(self, state):
method step (line 163) | def step(self, closure=None):
class SGDW_GCC (line 209) | class SGDW_GCC(Optimizer):
method __init__ (line 210) | def __init__(self, params, lr=required, momentum=0, dampening=0,
method __setstate__ (line 225) | def __setstate__(self, state):
method step (line 230) | def step(self, closure=None):
FILE: GC_code/Fine-grained_classification/main.py
function main (line 97) | def main():
function main_worker (line 133) | def main_worker(gpu, ngpus_per_node, args):
function train (line 302) | def train(train_loader, model, criterion, optimizer_base, optimizer_new,...
function validate (line 355) | def validate(val_loader, model, criterion, args):
function save_checkpoint (line 399) | def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
class AverageMeter (line 405) | class AverageMeter(object):
method __init__ (line 407) | def __init__(self):
method reset (line 410) | def reset(self):
method update (line 416) | def update(self, val, n=1):
function adjust_learning_rate (line 423) | def adjust_learning_rate(optimizer, epoch, args):
function accuracy (line 430) | def accuracy(output, target, topk=(1,)):
FILE: GC_code/ImageNet/SGD.py
class SGD_GCC (line 6) | class SGD_GCC(Optimizer):
method __init__ (line 8) | def __init__(self, params, lr=required, momentum=0, dampening=0,
method __setstate__ (line 23) | def __setstate__(self, state):
method step (line 28) | def step(self, closure=None):
class SGD_GC (line 73) | class SGD_GC(Optimizer):
method __init__ (line 75) | def __init__(self, params, lr=required, momentum=0, dampening=0,
method __setstate__ (line 90) | def __setstate__(self, state):
method step (line 95) | def step(self, closure=None):
class SGDW (line 141) | class SGDW(Optimizer):
method __init__ (line 143) | def __init__(self, params, lr=required, momentum=0, dampening=0,
method __setstate__ (line 158) | def __setstate__(self, state):
method step (line 163) | def step(self, closure=None):
class SGDW_GCC (line 209) | class SGDW_GCC(Optimizer):
method __init__ (line 210) | def __init__(self, params, lr=required, momentum=0, dampening=0,
method __setstate__ (line 225) | def __setstate__(self, state):
method step (line 230) | def step(self, closure=None):
FILE: GC_code/ImageNet/main.py
function main (line 103) | def main():
function main_worker (line 140) | def main_worker(gpu, ngpus_per_node, args):
function train (line 289) | def train(train_loader, model, criterion, optimizer, epoch, args):
function validate (line 338) | def validate(val_loader, model, criterion, args):
function save_checkpoint (line 380) | def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
class AverageMeter (line 386) | class AverageMeter(object):
method __init__ (line 388) | def __init__(self):
method reset (line 391) | def reset(self):
method update (line 397) | def update(self, val, n=1):
function adjust_learning_rate (line 404) | def adjust_learning_rate(optimizer, epoch, args):
function accuracy (line 411) | def accuracy(output, target, topk=(1,)):
FILE: GC_code/ImageNet/myresnet.py
function conv3x3 (line 20) | def conv3x3(in_planes, out_planes, stride=1):
class BasicBlock (line 26) | class BasicBlock(nn.Module):
method __init__ (line 29) | def __init__(self, inplanes, planes, stride=1, downsample=None):
method forward (line 39) | def forward(self, x):
class Bottleneck (line 58) | class Bottleneck(nn.Module):
method __init__ (line 61) | def __init__(self, inplanes, planes, stride=1, downsample=None):
method forward (line 74) | def forward(self, x):
class ResNet (line 98) | class ResNet(nn.Module):
method __init__ (line 100) | def __init__(self, block, layers, num_classes=1000):
method _make_layer (line 124) | def _make_layer(self, block, planes, blocks, stride=1):
method forward (line 141) | def forward(self, x):
function resnet18 (line 160) | def resnet18(pretrained=False, **kwargs):
function resnet34 (line 171) | def resnet34(pretrained=False, **kwargs):
function resnet50 (line 182) | def resnet50(pretrained=False, **kwargs):
function resnet101 (line 193) | def resnet101(pretrained=False, **kwargs):
function resnet152 (line 204) | def resnet152(pretrained=False, **kwargs):
function test (line 215) | def test():
FILE: GC_code/ImageNet/myresnetgn.py
function conv3x3 (line 20) | def conv3x3(in_planes, out_planes, stride=1):
class BasicBlock (line 26) | class BasicBlock(nn.Module):
method __init__ (line 29) | def __init__(self, inplanes, planes, stride=1, downsample=None):
method forward (line 39) | def forward(self, x):
class Bottleneck (line 58) | class Bottleneck(nn.Module):
method __init__ (line 61) | def __init__(self, inplanes, planes, stride=1, downsample=None):
method forward (line 74) | def forward(self, x):
class ResNet (line 98) | class ResNet(nn.Module):
method __init__ (line 100) | def __init__(self, block, layers, num_classes=1000):
method _make_layer (line 124) | def _make_layer(self, block, planes, blocks, stride=1):
method forward (line 141) | def forward(self, x):
function resnet18gn (line 160) | def resnet18gn(pretrained=False, **kwargs):
function resnet34gn (line 171) | def resnet34gn(pretrained=False, **kwargs):
function resnet50gn (line 182) | def resnet50gn(pretrained=False, **kwargs):
function resnet101gn (line 193) | def resnet101gn(pretrained=False, **kwargs):
function resnet152gn (line 204) | def resnet152gn(pretrained=False, **kwargs):
function test (line 215) | def test():
FILE: GC_code/Mini_ImageNet/SGD.py
class SGD_GCC (line 6) | class SGD_GCC(Optimizer):
method __init__ (line 8) | def __init__(self, params, lr=required, momentum=0, dampening=0,
method __setstate__ (line 23) | def __setstate__(self, state):
method step (line 28) | def step(self, closure=None):
class SGD_GC (line 73) | class SGD_GC(Optimizer):
method __init__ (line 75) | def __init__(self, params, lr=required, momentum=0, dampening=0,
method __setstate__ (line 90) | def __setstate__(self, state):
method step (line 95) | def step(self, closure=None):
class SGDW (line 141) | class SGDW(Optimizer):
method __init__ (line 143) | def __init__(self, params, lr=required, momentum=0, dampening=0,
method __setstate__ (line 158) | def __setstate__(self, state):
method step (line 163) | def step(self, closure=None):
class SGDW_GCC (line 209) | class SGDW_GCC(Optimizer):
method __init__ (line 210) | def __init__(self, params, lr=required, momentum=0, dampening=0,
method __setstate__ (line 225) | def __setstate__(self, state):
method step (line 230) | def step(self, closure=None):
FILE: GC_code/Mini_ImageNet/main.py
function main (line 99) | def main():
function main_worker (line 135) | def main_worker(gpu, ngpus_per_node, args):
function train (line 279) | def train(train_loader, model, criterion, optimizer, epoch, args):
function validate (line 333) | def validate(val_loader, model, criterion, args):
function save_checkpoint (line 376) | def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
class AverageMeter (line 382) | class AverageMeter(object):
method __init__ (line 384) | def __init__(self):
method reset (line 387) | def reset(self):
method update (line 393) | def update(self, val, n=1):
function adjust_learning_rate (line 400) | def adjust_learning_rate(optimizer, epoch, args):
function accuracy (line 407) | def accuracy(output, target, topk=(1,)):
FILE: GC_code/Mini_ImageNet/resnet_ws.py
class Conv2d (line 16) | class Conv2d(nn.Conv2d):
method __init__ (line 18) | def __init__(self, in_channels, out_channels, kernel_size, stride=1,
method forward (line 23) | def forward(self, x):
function BatchNorm2d (line 35) | def BatchNorm2d(num_features):
function conv3x3 (line 41) | def conv3x3(in_planes, out_planes, stride=1):
function conv1x1 (line 47) | def conv1x1(in_planes, out_planes, stride=1):
class BasicBlock (line 52) | class BasicBlock(nn.Module):
method __init__ (line 55) | def __init__(self, inplanes, planes, stride=1, downsample=None):
method forward (line 65) | def forward(self, x):
class Bottleneck (line 84) | class Bottleneck(nn.Module):
method __init__ (line 87) | def __init__(self, inplanes, planes, stride=1, downsample=None):
method forward (line 99) | def forward(self, x):
class ResNet (line 122) | class ResNet(nn.Module):
method __init__ (line 124) | def __init__(self, block, layers, num_classes=1000, zero_init_residual...
method _make_layer (line 161) | def _make_layer(self, block, planes, blocks, stride=1):
method forward (line 177) | def forward(self, x):
function l_resnet18 (line 195) | def l_resnet18(pretrained=False, **kwargs):
function l_resnet34 (line 204) | def l_resnet34(pretrained=False, **kwargs):
function l_resnet50 (line 213) | def l_resnet50(pretrained=False, **kwargs):
function l_resnet101 (line 222) | def l_resnet101(pretrained=False, **kwargs):
function l_resnet152 (line 231) | def l_resnet152(pretrained=False, **kwargs):
FILE: algorithm-GC/algorithm/Adam.py
class Adam (line 6) | class Adam(Optimizer):
method __init__ (line 30) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
method __setstate__ (line 49) | def __setstate__(self, state):
method step (line 55) | def step(self, closure=None):
class AdamW (line 127) | class AdamW(Optimizer):
method __init__ (line 154) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
method __setstate__ (line 173) | def __setstate__(self, state):
method step (line 179) | def step(self, closure=None):
FILE: algorithm-GC/algorithm/Centralization.py
function centralized_gradient (line 6) | def centralized_gradient(x,use_gc=True,gc_conv_only=False):
FILE: algorithm-GC/algorithm/Lookahead.py
class Lookahead (line 7) | class Lookahead(Optimizer):
method __init__ (line 8) | def __init__(self, optimizer, k=5, alpha=0.5):
method update (line 18) | def update(self, group):
method update_lookahead (line 28) | def update_lookahead(self):
method step (line 32) | def step(self, closure=None):
method state_dict (line 42) | def state_dict(self):
method load_state_dict (line 56) | def load_state_dict(self, state_dict):
method add_param_group (line 69) | def add_param_group(self, param_group):
FILE: algorithm-GC/algorithm/RAdam.py
class RAdam (line 7) | class RAdam(Optimizer):
method __init__ (line 9) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weig...
method __setstate__ (line 32) | def __setstate__(self, state):
method step (line 35) | def step(self, closure=None):
class PlainRAdam (line 109) | class PlainRAdam(Optimizer):
method __init__ (line 111) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weig...
method __setstate__ (line 129) | def __setstate__(self, state):
method step (line 132) | def step(self, closure=None):
FILE: algorithm-GC/algorithm/Ranger.py
class Ranger (line 7) | class Ranger(Optimizer):
method __init__ (line 9) | def __init__(self, params, lr=1e-3, # lr
method __setstate__ (line 61) | def __setstate__(self, state):
method step (line 65) | def step(self, closure=None):
FILE: algorithm-GC/algorithm/SGD.py
class SGD (line 6) | class SGD(Optimizer):
method __init__ (line 56) | def __init__(self, params, lr=required, momentum=0, dampening=0,
method __setstate__ (line 71) | def __setstate__(self, state):
method step (line 77) | def step(self, closure=None):
FILE: algorithm-GC/cifar/main.py
function train (line 218) | def train(epoch,net,optimizer):
function test (line 243) | def test(epoch,net):
FILE: algorithm-GC/cifar/models/densenet.py
class Bottleneck (line 9) | class Bottleneck(nn.Module):
method __init__ (line 10) | def __init__(self, in_planes, growth_rate):
method forward (line 17) | def forward(self, x):
class Transition (line 24) | class Transition(nn.Module):
method __init__ (line 25) | def __init__(self, in_planes, out_planes):
method forward (line 30) | def forward(self, x):
class DenseNet (line 36) | class DenseNet(nn.Module):
method __init__ (line 37) | def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_...
method _make_dense_layers (line 68) | def _make_dense_layers(self, block, in_planes, nblock):
method forward (line 75) | def forward(self, x):
function DenseNet121 (line 86) | def DenseNet121():
function DenseNet169 (line 89) | def DenseNet169():
function DenseNet201 (line 92) | def DenseNet201():
function DenseNet161 (line 95) | def DenseNet161():
function densenet_cifar (line 98) | def densenet_cifar():
function test (line 101) | def test():
FILE: algorithm-GC/cifar/models/dpn.py
class Bottleneck (line 7) | class Bottleneck(nn.Module):
method __init__ (line 8) | def __init__(self, last_planes, in_planes, out_planes, dense_depth, st...
method forward (line 27) | def forward(self, x):
class DPN (line 38) | class DPN(nn.Module):
method __init__ (line 39) | def __init__(self, cfg):
method _make_layer (line 53) | def _make_layer(self, in_planes, out_planes, num_blocks, dense_depth, ...
method forward (line 61) | def forward(self, x):
function DPN26 (line 73) | def DPN26():
function DPN92 (line 82) | def DPN92():
function test (line 92) | def test():
FILE: algorithm-GC/cifar/models/googlenet.py
class Inception (line 7) | class Inception(nn.Module):
method __init__ (line 8) | def __init__(self, in_planes, n1x1, n3x3red, n3x3, n5x5red, n5x5, pool...
method forward (line 48) | def forward(self, x):
class GoogLeNet (line 56) | class GoogLeNet(nn.Module):
method __init__ (line 57) | def __init__(self):
method forward (line 82) | def forward(self, x):
function test (line 101) | def test():
FILE: algorithm-GC/cifar/models/lenet.py
class LeNet (line 5) | class LeNet(nn.Module):
method __init__ (line 6) | def __init__(self):
method forward (line 14) | def forward(self, x):
FILE: algorithm-GC/cifar/models/mobilenet.py
class Block (line 11) | class Block(nn.Module):
method __init__ (line 13) | def __init__(self, in_planes, out_planes, stride=1):
method forward (line 20) | def forward(self, x):
class MobileNet (line 26) | class MobileNet(nn.Module):
method __init__ (line 30) | def __init__(self, num_classes=10):
method _make_layers (line 37) | def _make_layers(self, in_planes):
method forward (line 46) | def forward(self, x):
function test (line 55) | def test():
FILE: algorithm-GC/cifar/models/mobilenetv2.py
class Block (line 11) | class Block(nn.Module):
method __init__ (line 13) | def __init__(self, in_planes, out_planes, expansion, stride):
method forward (line 32) | def forward(self, x):
class MobileNetV2 (line 40) | class MobileNetV2(nn.Module):
method __init__ (line 50) | def __init__(self, num_classes=10):
method _make_layers (line 60) | def _make_layers(self, in_planes):
method forward (line 69) | def forward(self, x):
function test (line 80) | def test():
FILE: algorithm-GC/cifar/models/pnasnet.py
class SepConv (line 10) | class SepConv(nn.Module):
method __init__ (line 12) | def __init__(self, in_planes, out_planes, kernel_size, stride):
method forward (line 20) | def forward(self, x):
class CellA (line 24) | class CellA(nn.Module):
method __init__ (line 25) | def __init__(self, in_planes, out_planes, stride=1):
method forward (line 33) | def forward(self, x):
class CellB (line 40) | class CellB(nn.Module):
method __init__ (line 41) | def __init__(self, in_planes, out_planes, stride=1):
method forward (line 56) | def forward(self, x):
class PNASNet (line 71) | class PNASNet(nn.Module):
method __init__ (line 72) | def __init__(self, cell_type, num_cells, num_planes):
method _make_layer (line 88) | def _make_layer(self, planes, num_cells):
method _downsample (line 95) | def _downsample(self, planes):
method forward (line 100) | def forward(self, x):
function PNASNetA (line 112) | def PNASNetA():
function PNASNetB (line 115) | def PNASNetB():
function test (line 119) | def test():
FILE: algorithm-GC/cifar/models/preact_resnet.py
class PreActBlock (line 12) | class PreActBlock(nn.Module):
method __init__ (line 16) | def __init__(self, in_planes, planes, stride=1):
method forward (line 28) | def forward(self, x):
class PreActBottleneck (line 37) | class PreActBottleneck(nn.Module):
method __init__ (line 41) | def __init__(self, in_planes, planes, stride=1):
method forward (line 55) | def forward(self, x):
class PreActResNet (line 65) | class PreActResNet(nn.Module):
method __init__ (line 66) | def __init__(self, block, num_blocks, num_classes=10):
method _make_layer (line 77) | def _make_layer(self, block, planes, num_blocks, stride):
method forward (line 85) | def forward(self, x):
function PreActResNet18 (line 97) | def PreActResNet18():
function PreActResNet34 (line 100) | def PreActResNet34():
function PreActResNet50 (line 103) | def PreActResNet50():
function PreActResNet101 (line 106) | def PreActResNet101():
function PreActResNet152 (line 109) | def PreActResNet152():
function test (line 113) | def test():
FILE: algorithm-GC/cifar/models/resnet.py
class BasicBlock (line 14) | class BasicBlock(nn.Module):
method __init__ (line 17) | def __init__(self, in_planes, planes, stride=1):
method forward (line 31) | def forward(self, x):
class Bottleneck (line 39) | class Bottleneck(nn.Module):
method __init__ (line 42) | def __init__(self, in_planes, planes, stride=1):
method forward (line 58) | def forward(self, x):
class ResNet (line 67) | class ResNet(nn.Module):
method __init__ (line 68) | def __init__(self, block, num_blocks, num_classes=10):
method _make_layer (line 80) | def _make_layer(self, block, planes, num_blocks, stride):
method forward (line 88) | def forward(self, x):
function ResNet18 (line 100) | def ResNet18(Num_classes=10):
function ResNet34 (line 103) | def ResNet34(Num_classes=10):
function ResNet50 (line 106) | def ResNet50(Num_classes=10):
function ResNet101 (line 109) | def ResNet101(Num_classes=10):
function ResNet152 (line 112) | def ResNet152(Num_classes=10):
function test (line 116) | def test():
FILE: algorithm-GC/cifar/models/resnext.py
class Block (line 10) | class Block(nn.Module):
method __init__ (line 14) | def __init__(self, in_planes, cardinality=32, bottleneck_width=4, stri...
method forward (line 31) | def forward(self, x):
class ResNeXt (line 40) | class ResNeXt(nn.Module):
method __init__ (line 41) | def __init__(self, num_blocks, cardinality, bottleneck_width, num_clas...
method _make_layer (line 55) | def _make_layer(self, num_blocks, stride):
method forward (line 65) | def forward(self, x):
function ResNeXt29_2x64d (line 77) | def ResNeXt29_2x64d():
function ResNeXt29_4x64d (line 80) | def ResNeXt29_4x64d():
function ResNeXt29_8x64d (line 83) | def ResNeXt29_8x64d():
function ResNeXt29_32x4d (line 86) | def ResNeXt29_32x4d():
function test_resnext (line 89) | def test_resnext():
FILE: algorithm-GC/cifar/models/senet.py
class BasicBlock (line 10) | class BasicBlock(nn.Module):
method __init__ (line 11) | def __init__(self, in_planes, planes, stride=1):
method forward (line 29) | def forward(self, x):
class PreActBlock (line 45) | class PreActBlock(nn.Module):
method __init__ (line 46) | def __init__(self, in_planes, planes, stride=1):
method forward (line 62) | def forward(self, x):
class SENet (line 79) | class SENet(nn.Module):
method __init__ (line 80) | def __init__(self, block, num_blocks, num_classes=10):
method _make_layer (line 92) | def _make_layer(self, block, planes, num_blocks, stride):
method forward (line 100) | def forward(self, x):
function SENet18 (line 112) | def SENet18():
function test (line 116) | def test():
FILE: algorithm-GC/cifar/models/shufflenet.py
class ShuffleBlock (line 10) | class ShuffleBlock(nn.Module):
method __init__ (line 11) | def __init__(self, groups):
method forward (line 15) | def forward(self, x):
class Bottleneck (line 22) | class Bottleneck(nn.Module):
method __init__ (line 23) | def __init__(self, in_planes, out_planes, stride, groups):
method forward (line 41) | def forward(self, x):
class ShuffleNet (line 51) | class ShuffleNet(nn.Module):
method __init__ (line 52) | def __init__(self, cfg):
method _make_layer (line 66) | def _make_layer(self, out_planes, num_blocks, groups):
method forward (line 75) | def forward(self, x):
function ShuffleNetG2 (line 86) | def ShuffleNetG2():
function ShuffleNetG3 (line 94) | def ShuffleNetG3():
function test (line 103) | def test():
FILE: algorithm-GC/cifar/models/vgg.py
class VGG (line 14) | class VGG(nn.Module):
method __init__ (line 15) | def __init__(self, vgg_name,Num_classes=100):
method forward (line 20) | def forward(self, x):
method _make_layers (line 26) | def _make_layers(self, cfg):
function test (line 41) | def test():
Condensed preview — 57 files, each showing its path, character count, and a content snippet. Download the .json file or copy it to your clipboard to get the full structured content (420K chars).
[
{
"path": "GC_code/CIFAR100/algorithm/Adagrad.py",
"chars": 8949,
"preview": "import torch\r\nfrom torch.optim.optimizer import Optimizer\r\n\r\n\r\nclass Adagrad_GCC(Optimizer):\r\n \"\"\"Implements Adagrad "
},
{
"path": "GC_code/CIFAR100/algorithm/Adam.py",
"chars": 40981,
"preview": "import math\nimport torch\nfrom torch.optim.optimizer import Optimizer\n\nclass Adam_GCC(Optimizer):\n def __init__(self, "
},
{
"path": "GC_code/CIFAR100/algorithm/SGD.py",
"chars": 13299,
"preview": "import torch\nfrom torch.optim.optimizer import Optimizer, required\n\n\n\nclass SGD_GCC(Optimizer):\n\n def __init__(self, "
},
{
"path": "GC_code/CIFAR100/main.py",
"chars": 7991,
"preview": "'''Train CIFAR100 with PyTorch.'''\nfrom __future__ import print_function\n\nimport torch\nimport torch.nn as nn\nimport torc"
},
{
"path": "GC_code/CIFAR100/models/__init__.py",
"chars": 304,
"preview": "from .vgg import *\nfrom .dpn import *\nfrom .lenet import *\nfrom .senet import *\nfrom .pnasnet import *\nfrom .densenet im"
},
{
"path": "GC_code/CIFAR100/models/densenet.py",
"chars": 3737,
"preview": "'''DenseNet in PyTorch.'''\nimport math\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Bottl"
},
{
"path": "GC_code/CIFAR100/models/dpn.py",
"chars": 3562,
"preview": "'''Dual Path Networks in PyTorch.'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Bottlene"
},
{
"path": "GC_code/CIFAR100/models/googlenet.py",
"chars": 3221,
"preview": "'''GoogLeNet with PyTorch.'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Inception(nn.Mo"
},
{
"path": "GC_code/CIFAR100/models/lenet.py",
"chars": 699,
"preview": "'''LeNet in PyTorch.'''\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nclass LeNet(nn.Module):\n def __init__("
},
{
"path": "GC_code/CIFAR100/models/mobilenet.py",
"chars": 2025,
"preview": "'''MobileNet in PyTorch.\n\nSee the paper \"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applicati"
},
{
"path": "GC_code/CIFAR100/models/mobilenetv2.py",
"chars": 3092,
"preview": "'''MobileNetV2 in PyTorch.\n\nSee the paper \"Inverted Residuals and Linear Bottlenecks:\nMobile Networks for Classification"
},
{
"path": "GC_code/CIFAR100/models/pnasnet.py",
"chars": 4258,
"preview": "'''PNASNet in PyTorch.\n\nPaper: Progressive Neural Architecture Search\n'''\nimport torch\nimport torch.nn as nn\nimport torc"
},
{
"path": "GC_code/CIFAR100/models/preact_resnet.py",
"chars": 4078,
"preview": "'''Pre-activation ResNet in PyTorch.\n\nReference:\n[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun\n Identity Mapp"
},
{
"path": "GC_code/CIFAR100/models/resnet.py",
"chars": 4195,
"preview": "'''ResNet in PyTorch.\n\nFor Pre-activation ResNet, see 'preact_resnet.py'.\n\nReference:\n[1] Kaiming He, Xiangyu Zhang, Sha"
},
{
"path": "GC_code/CIFAR100/models/resnext.py",
"chars": 3630,
"preview": "'''ResNeXt in PyTorch.\n\nSee the paper \"Aggregated Residual Transformations for Deep Neural Networks\" for more details.\n'"
},
{
"path": "GC_code/CIFAR100/models/senet.py",
"chars": 4027,
"preview": "'''SENet in PyTorch.\n\nSENet is the winner of ImageNet-2017. The paper is not released yet.\n'''\nimport torch\nimport torch"
},
{
"path": "GC_code/CIFAR100/models/shufflenet.py",
"chars": 3551,
"preview": "'''ShuffleNet in PyTorch.\n\nSee the paper \"ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Dev"
},
{
"path": "GC_code/CIFAR100/models/vgg.py",
"chars": 1467,
"preview": "'''VGG11/13/16/19 in Pytorch.'''\nimport torch\nimport torch.nn as nn\n\n\ncfg = {\n 'VGG11': [64, 'M', 128, 'M', 256, 256,"
},
{
"path": "GC_code/CIFAR100/os_run.py",
"chars": 296,
"preview": "\nimport os,time\n\n#cifar100 sgd & sgdGCC\n\nos.system(\"nohup python main.py --lr 0.1 --wd 0.0005 --alg sgd --epochs 200 "
},
{
"path": "GC_code/Fine-grained_classification/SGD.py",
"chars": 10518,
"preview": "import torch\nfrom torch.optim.optimizer import Optimizer, required\n\n\n\nclass SGD_GCC(Optimizer):\n\n def __init__(self, "
},
{
"path": "GC_code/Fine-grained_classification/main.py",
"chars": 17597,
"preview": "import argparse\nimport os\nimport random\nimport shutil\nimport time\nimport warnings\nimport sys\n\nimport torch\nimport torch."
},
{
"path": "GC_code/Fine-grained_classification/os_run.py",
"chars": 1322,
"preview": "\nimport os,time\n\n\n\n\nos.system(\"nohup python -W ignore main.py /home/yonghw/data/data/CUB_200_2011/ --model r50p -b 128 -"
},
{
"path": "GC_code/ImageNet/SGD.py",
"chars": 10518,
"preview": "import torch\nfrom torch.optim.optimizer import Optimizer, required\n\n\n\nclass SGD_GCC(Optimizer):\n\n def __init__(self, "
},
{
"path": "GC_code/ImageNet/main.py",
"chars": 16397,
"preview": "import argparse\nimport os\nimport random\nimport shutil\nimport time\nimport warnings\nimport sys\n#nohup python -W ignore mai"
},
{
"path": "GC_code/ImageNet/myresnet.py",
"chars": 7140,
"preview": "from __future__ import print_function, division, absolute_import\r\nimport torch.nn as nn\r\nimport math\r\nimport torch.utils"
},
{
"path": "GC_code/ImageNet/myresnetgn.py",
"chars": 7147,
"preview": "from __future__ import print_function, division, absolute_import\r\nimport torch.nn as nn\r\nimport math\r\nimport torch.utils"
},
{
"path": "GC_code/ImageNet/os_run.py",
"chars": 353,
"preview": "\nimport os,time\n\n\nos.system(\"#nohup python -W ignore main.py /ssd/data/yonghw/Imagenet/v0/ --model r50bn --alg sgd -b 25"
},
{
"path": "GC_code/Mini_ImageNet/SGD.py",
"chars": 10518,
"preview": "import torch\nfrom torch.optim.optimizer import Optimizer, required\n\n\n\nclass SGD_GCC(Optimizer):\n\n def __init__(self, "
},
{
"path": "GC_code/Mini_ImageNet/main.py",
"chars": 15800,
"preview": "import argparse\nimport os\nimport random\nimport shutil\nimport time\nimport warnings\nimport sys\nimport torch\nimport torch.n"
},
{
"path": "GC_code/Mini_ImageNet/os_run.py",
"chars": 702,
"preview": "#cifar100 e200 bs128 gs 2,4,8,16\nimport os,time\n\n#print('runing mini_imagenet.py')\n\n\nos.system(\"nohup python -W ignor"
},
{
"path": "GC_code/Mini_ImageNet/resnet_ws.py",
"chars": 7797,
"preview": "import torch.nn as nn\r\nimport torch.utils.model_zoo as model_zoo\r\n\r\nimport torch\r\nimport torch.nn as nn\r\nfrom torch.nn.p"
},
{
"path": "README.md",
"chars": 11780,
"preview": "# Gradient Centralization\n\n## [Gradient Centralization: A New Optimization Technique for Deep Neural Networks](https://a"
},
{
"path": "algorithm-GC/README.md",
"chars": 3444,
"preview": "# Advanced-optimizer-with-Gradient-Centralization\nAdvanced optimizer with Gradient-Centralization\nPlease Refer to\n## [Gr"
},
{
"path": "algorithm-GC/algorithm/Adam.py",
"chars": 11078,
"preview": "import math\nimport torch\nfrom torch.optim.optimizer import Optimizer\nfrom .Centralization import centralized_gradient\n\nc"
},
{
"path": "algorithm-GC/algorithm/Centralization.py",
"chars": 456,
"preview": "import torch\n#from torch.optim.optimizer import Optimizer, required\n\n\n\ndef centralized_gradient(x,use_gc=True,gc_conv_on"
},
{
"path": "algorithm-GC/algorithm/Lookahead.py",
"chars": 2449,
"preview": "from collections import defaultdict\nfrom itertools import chain\nfrom torch.optim import Optimizer\nimport torch\nimport wa"
},
{
"path": "algorithm-GC/algorithm/RAdam.py",
"chars": 8846,
"preview": "import math\nimport torch\nfrom torch.optim.optimizer import Optimizer\nfrom .Centralization import centralized_gradient\n\n\n"
},
{
"path": "algorithm-GC/algorithm/Ranger.py",
"chars": 7457,
"preview": "import math\nimport torch\nfrom torch.optim.optimizer import Optimizer\nfrom .Centralization import centralized_gradient\n\n\n"
},
{
"path": "algorithm-GC/algorithm/SGD.py",
"chars": 4718,
"preview": "import torch\nfrom torch.optim.optimizer import Optimizer, required\n\nfrom .Centralization import centralized_gradient\n\ncl"
},
{
"path": "algorithm-GC/cifar/main.py",
"chars": 10414,
"preview": "'''Train CIFAR100 with PyTorch.'''\nfrom __future__ import print_function\n\nimport torch\nimport torch.nn as nn\nimport torc"
},
{
"path": "algorithm-GC/cifar/models/__init__.py",
"chars": 304,
"preview": "from .vgg import *\nfrom .dpn import *\nfrom .lenet import *\nfrom .senet import *\nfrom .pnasnet import *\nfrom .densenet im"
},
{
"path": "algorithm-GC/cifar/models/densenet.py",
"chars": 3542,
"preview": "'''DenseNet in PyTorch.'''\nimport math\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Bottl"
},
{
"path": "algorithm-GC/cifar/models/dpn.py",
"chars": 3562,
"preview": "'''Dual Path Networks in PyTorch.'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Bottlene"
},
{
"path": "algorithm-GC/cifar/models/googlenet.py",
"chars": 3221,
"preview": "'''GoogLeNet with PyTorch.'''\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Inception(nn.Mo"
},
{
"path": "algorithm-GC/cifar/models/lenet.py",
"chars": 699,
"preview": "'''LeNet in PyTorch.'''\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nclass LeNet(nn.Module):\n def __init__("
},
{
"path": "algorithm-GC/cifar/models/mobilenet.py",
"chars": 2025,
"preview": "'''MobileNet in PyTorch.\n\nSee the paper \"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applicati"
},
{
"path": "algorithm-GC/cifar/models/mobilenetv2.py",
"chars": 3092,
"preview": "'''MobileNetV2 in PyTorch.\n\nSee the paper \"Inverted Residuals and Linear Bottlenecks:\nMobile Networks for Classification"
},
{
"path": "algorithm-GC/cifar/models/pnasnet.py",
"chars": 4258,
"preview": "'''PNASNet in PyTorch.\n\nPaper: Progressive Neural Architecture Search\n'''\nimport torch\nimport torch.nn as nn\nimport torc"
},
{
"path": "algorithm-GC/cifar/models/preact_resnet.py",
"chars": 4078,
"preview": "'''Pre-activation ResNet in PyTorch.\n\nReference:\n[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun\n Identity Mapp"
},
{
"path": "algorithm-GC/cifar/models/resnet.py",
"chars": 4195,
"preview": "'''ResNet in PyTorch.\n\nFor Pre-activation ResNet, see 'preact_resnet.py'.\n\nReference:\n[1] Kaiming He, Xiangyu Zhang, Sha"
},
{
"path": "algorithm-GC/cifar/models/resnext.py",
"chars": 3478,
"preview": "'''ResNeXt in PyTorch.\n\nSee the paper \"Aggregated Residual Transformations for Deep Neural Networks\" for more details.\n'"
},
{
"path": "algorithm-GC/cifar/models/senet.py",
"chars": 4027,
"preview": "'''SENet in PyTorch.\n\nSENet is the winner of ImageNet-2017. The paper is not released yet.\n'''\nimport torch\nimport torch"
},
{
"path": "algorithm-GC/cifar/models/shufflenet.py",
"chars": 3551,
"preview": "'''ShuffleNet in PyTorch.\n\nSee the paper \"ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Dev"
},
{
"path": "algorithm-GC/cifar/models/vgg.py",
"chars": 1467,
"preview": "'''VGG11/13/16/19 in Pytorch.'''\nimport torch\nimport torch.nn as nn\n\n\ncfg = {\n 'VGG11': [64, 'M', 128, 'M', 256, 256,"
},
{
"path": "algorithm-GC/cifar/nohup.out",
"chars": 1982,
"preview": "Traceback (most recent call last):\n File \"main.py\", line 281, in <module>\n train_acc=train(epoch,net,optimizer)\n Fi"
},
{
"path": "algorithm-GC/cifar/os_run.py",
"chars": 51263,
"preview": "#cifar100 e200 bs128 gs 2,4,8,16\nimport os,time\n#############################\n#r18\n##############\n\n#### sgd \n#os.syste"
},
{
"path": "algorithm-GC/cifar/os_run2.py",
"chars": 29425,
"preview": "#cifar100 e200 bs128 gs 2,4,8,16\nimport os,time\n\n\n#r50\n##############\n\n\n### adam \nos.system(\"nohup python main.py --"
}
]
About this extraction
This page contains the full source code of the Yonghongwei/Gradient-Centralization GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 57 files (394.5 KB), approximately 115.9k tokens, and a symbol index with 523 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.