Repository: itayhubara/BinaryNet
Branch: master
Commit: c23b86285cd1
Files: 18
Total size: 98.9 KB

Directory structure:
gitextract_kj09dah_/

├── Data.lua
├── Dockerfile/
│   └── binarynet-torch-gpu-cuda-8.0
├── Main_BinaryNet_Cifar10.lua
├── Main_BinaryNet_MNIST.lua
├── Main_BinaryNet_SVHN.lua
├── Models/
│   ├── BatchNormalizationShiftPow2.lua
│   ├── BinarizedNeurons.lua
│   ├── BinaryLinear.lua
│   ├── BinaryNet_Cifar10_Model.lua
│   ├── BinaryNet_MNIST_Model.lua
│   ├── BinaryNet_SVHN_Model.lua
│   ├── BinarySpatialConvolution.lua
│   ├── SpatialBatchNormalizationShiftPow2.lua
│   └── cudnnBinarySpatialConvolution.lua
├── README.md
├── SqrHingeEmbeddingCriterion.lua
├── adaMax_binary_clip_shift.lua
└── adam_binary_clip_b.lua

================================================
FILE CONTENTS
================================================

================================================
FILE: Data.lua
================================================
--[[
This code create the training test and validation datasets and preform diffrent kinds of preprocessing
This code is based on elad hoffer Data.lua file from ConvNet-torch library (https://github.com/eladhoffer/ConvNet-torch.git) and uses:
  - Elad Hoffer DataProvidor.torch library: https://github.com/eladhoffer/DataProvider.torch.git
  - Nicholas Leonard dp library: https://github.com/nicholas-leonard/dp.git
  - Koray Kavukcuoglu dp library: https://github.com/koraykv/unsup.git
]]
require 'dp'
local DataProvider = require 'DataProvider'
local opt = opt or {}
local Dataset = opt.dataset or 'Cifar10'
local PreProcDir = opt.preProcDir or './PreProcData/'
local Whiten = opt.whiten or false
local NormelizeWhiten = opt.NormelizeWhiten or false
local DataPath = opt.datapath or '/home/itayh/Datasets/'
local normalization = opt.normalization or 'simple'
local format = opt.format or 'rgb'
local TestData
local TrainData
local ValidData
local Classes

if Dataset =='Cifar100' then
  local file_valid = paths.concat(PreProcDir, format .. 'whiten_valid.t7')
  local file_train = paths.concat(PreProcDir, format .. 'whiten_train.t7')
  local file_test = paths.concat(PreProcDir, format .. 'whiten_test.t7')
  if (paths.filep(file_valid) and paths.filep(file_train) and paths.filep(file_test)) then
    ValidData=torch.load(file_valid)
    TrainData=torch.load(file_train)
    TestData=torch.load(file_test)
  else
    if paths.dirp(PreProcDir)==false then
     sys.execute('mkdir PreProcData/Cifar100')
    end
    input_preprocess = {}
    table.insert(input_preprocess, dp.ZCA())
    ds = dp.Cifar100{scale={0,1}, valid_ratio=0.1,input_preprocess = input_preprocess}
    ValidData = {data=ds:validSet():inputs():input():clone():float(), label=ds:validSet():targets():input():clone():byte() }
    TrainData = {data=ds:trainSet():inputs():input():float(), label=ds:trainSet():targets():input():byte() }
    TestData  = {data=ds:testSet():inputs():input():float() , label=ds:testSet():targets():input():byte()  }
    collectgarbage()
    torch.save(file_valid,ValidData)
    torch.save(file_train,TrainData)
    torch.save(file_test,TestData)
  end
elseif Dataset == 'Cifar10' then
    local file_valid = paths.concat(PreProcDir, format .. 'whiten_valid.t7')
    local file_train = paths.concat(PreProcDir, format .. 'whiten_train.t7')
    local file_test = paths.concat(PreProcDir, format .. 'whiten_test.t7')
    if (paths.filep(file_valid) and paths.filep(file_train) and paths.filep(file_test)) then
      ValidData=torch.load(file_valid)
      TrainData=torch.load(file_train)
      TestData=torch.load(file_test)
    else
      if paths.dirp(PreProcDir)==false then
       sys.execute('mkdir PreProcData/Cifar10')
      end
      input_preprocess = {}
      table.insert(input_preprocess, dp.ZCA())
      ds = dp.Cifar10{scale={0,1},valid_ratio=0.1,input_preprocess = input_preprocess} --,input_preprocess = input_preprocess}  scale={0,1},
      ValidData = {data=ds:validSet():inputs():input():float(), label=ds:validSet():targets():input():clone():byte() }
      TrainData = {data=ds:trainSet():inputs():input():float(), label=ds:trainSet():targets():input():byte() }
      TestData  = {data=ds:testSet():inputs():input():float(), label=ds:testSet():targets():input():byte()  }
      collectgarbage()
      torch.save(file_valid,ValidData)
      torch.save(file_train,TrainData)
      torch.save(file_test,TestData)
    end
    Classes = {'airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'}
elseif Dataset == 'MNIST' then
  local file_valid = paths.concat(PreProcDir, format .. '_valid.t7')
  local file_train = paths.concat(PreProcDir, format .. '_train.t7')
  local file_test = paths.concat(PreProcDir, format .. '_test.t7')
  if (paths.filep(file_valid) and paths.filep(file_train) and paths.filep(file_test)) then
    ValidData=torch.load(file_valid)
    TrainData=torch.load(file_train)
    TestData=torch.load(file_test)
  else
    if paths.dirp(PreProcDir)==false then
     sys.execute('mkdir PreProcData/MNIST')
    end
    ds = dp.Mnist{scale={0,1}}
    ValidData = {data=ds:validSet():inputs():input():clone():float(), label=ds:validSet():targets():input():clone():byte() }
    TrainData = {data=ds:trainSet():inputs():input():float(), label=ds:trainSet():targets():input():byte() }
    TestData  = {data=ds:testSet():inputs():input():float() , label=ds:testSet():targets():input():byte()  }
    collectgarbage()
    torch.save(file_valid,ValidData)
    torch.save(file_train,TrainData)
    torch.save(file_test,TestData)
  end
  Classes = {1,2,3,4,5,6,7,8,9,0}
elseif Dataset == 'SVHN' then
    local LCNfile_valid = paths.concat(PreProcDir, format .. 'GCN_LCN_valid.t7')
    local LCNfile_train = paths.concat(PreProcDir, format .. 'GCN_LCN_train.t7')
    local LCNfile_test = paths.concat(PreProcDir, format .. 'GCN_LCN_test.t7')
    print(LCNfile_valid)
    if (paths.filep(LCNfile_valid) and paths.filep(LCNfile_train) and paths.filep(LCNfile_test)) then
      ValidData=torch.load(LCNfile_valid)
      TrainData=torch.load(LCNfile_train)
      TestData=torch.load(LCNfile_test)
    else
      if paths.dirp(PreProcDir)==false then
       sys.execute('mkdir PreProcData/SVHN')
      end
      local input_preprocess = {}
      table.insert(input_preprocess, dp.GCN{batch_size=5000,use_std=true,sqrt_bias=10})
      table.insert(input_preprocess, dp.LeCunLCN{kernel_size=9,divide_by_std=true,batch_size=5000,progress=true}) --,kernel_size=31,kernel_std=32})
      ds = dp.Svhn{scale={0,1}, input_preprocess = input_preprocess}
      ValidData = {data=ds:validSet():inputs():input():float(), label=ds:validSet():targets():input():byte() }; ValidData.data:div( ValidData.data:max())
      TrainData = {data=ds:trainSet():inputs():input():float(), label=ds:trainSet():targets():input():byte() }; TrainData.data:div( TrainData.data:max())
      TestData  = {data=ds:testSet():inputs():input():float(), label=ds:testSet():targets():input():byte() };  TestData.data:div( TestData.data:max())

      collectgarbage()
      torch.save(LCNfile_valid,ValidData)
      torch.save(LCNfile_train,TrainData)
      torch.save(LCNfile_test,TestData)
    end
    Classes = {1,2,3,4,5,6,7,8,9,0}
end

TrainData.data = TrainData.data:float()
TestData.data = TestData.data:float()

local TrainDataProvider = DataProvider.Container{
  Name = 'TrainingData',
  CachePrefix = nil,
  CacheFiles = false,
  Source = {TrainData.data,TrainData.label},
  MaxNumItems = 1e6,
  CopyData = false,
  TensorType = 'torch.FloatTensor',
}
local TestDataProvider = DataProvider.Container{
  Name = 'TestData',
  CachePrefix = nil,
  CacheFiles = false,
  Source = {TestData.data, TestData.label},
  MaxNumItems = 1e6,
  CopyData = false,
  TensorType = 'torch.FloatTensor',

}
local ValidDataProvider = DataProvider.Container{
  Name = 'ValidData',
  CachePrefix = nil,
  CacheFiles = false,
  Source = {ValidData.data, ValidData.label},
  MaxNumItems = 1e6,
  CopyData = false,
  TensorType = 'torch.FloatTensor',

}

--Preprocesss

  if format == 'yuv' then
    require 'image'
    TrainDataProvider:apply(image.rgb2yuv)
    TestDataProvider:apply(image.rgb2yuv)
  end
  if Whiten then
    require 'unsup'
    local meanfile = paths.concat(PreProcDir, format .. 'imageMean.t7')
    local mean, P, invP
    local Pfile = paths.concat(PreProcDir,format .. 'P.t7')
    local invPfile = paths.concat(PreProcDir,format .. 'invP.t7')

    if (paths.filep(Pfile) and paths.filep(invPfile) and paths.filep(meanfile)) then
      P = torch.load(Pfile)
      invP = torch.load(invPfile)
      mean = torch.load(meanfile)
      TrainDataProvider.Data = unsup.zca_whiten(TrainDataProvider.Data, mean, P, invP)
    else
      TrainDataProvider.Data, mean, P, invP = unsup.zca_whiten(TrainDataProvider.Data)
      torch.save(Pfile,P)
      torch.save(invPfile,invP)
      torch.save(meanfile,mean)
    end
      TestDataProvider.Data = unsup.zca_whiten(TestDataProvider.Data, mean, P, invP)
      ValidDataProvider.Data = unsup.zca_whiten(ValidDataProvider.Data, mean, P, invP)
  elseif dp_prepro then
        -- Do nothing since we use dp lib for GCN and LCN
  else
      local meanfile = paths.concat(PreProcDir, format .. normalization .. 'Mean.t7')
      local stdfile = paths.concat(PreProcDir,format .. normalization .. 'Std.t7')
      local mean, std
      local loaded = false

      if paths.filep(meanfile) and paths.filep(stdfile) then
        mean = torch.load(meanfile)
        std = torch.load(stdfile)
        loaded = true
      end

      mean, std = TrainDataProvider:normalize(normalization, mean, std)
      TestDataProvider:normalize(normalization, mean, std)
      ValidDataProvider:normalize(normalization, mean, std)
      if not loaded then
        torch.save(meanfile,mean)
        torch.save(stdfile,std)
      end
    end


return{
    TrainData = TrainDataProvider,
    TestData = TestDataProvider,
    ValidData = ValidDataProvider,
    Classes = Classes
}


================================================
FILE: Dockerfile/binarynet-torch-gpu-cuda-8.0
================================================
FROM nvidia/cuda:8.0-cudnn5-devel
WORKDIR /workspace

# Install dependencies
RUN apt-get update \
 && apt-get install -y \
    build-essential git gfortran \
    python3 python3-setuptools python3-dev \
    cmake curl wget unzip libreadline-dev libjpeg-dev libpng-dev ncurses-dev \
    imagemagick gnuplot gnuplot-x11 libssl-dev libzmq3-dev graphviz vim sudo tmux

# Install OpenBLAS
RUN apt-get -y install libopenblas-dev

# Install Torch commit no: 0219027e6c4644a0ba5c5bf137c989a0a8c9e01b
RUN git clone https://github.com/torch/distro.git torch --recursive
RUN cd torch \
 && /bin/bash install-deps \
 && ./install.sh

# get torch tutorials. comment out this line if no need
RUN git clone https://github.com/torch/tutorials.git

# Install dependency for [BinaryNet](https://github.com/itayhubara/BinaryNet)
RUN /workspace/torch/install/bin/luarocks install https://raw.githubusercontent.com/eladhoffer/DataProvider.torch/master/dataprovider-scm-1.rockspec
RUN /workspace/torch/install/bin/luarocks install cudnn 
RUN /workspace/torch/install/bin/luarocks install dp
RUN /workspace/torch/install/bin/luarocks install unsup

# copy BinaryNet into the image
ADD . BinaryNet

================================================
FILE: Main_BinaryNet_Cifar10.lua
================================================
require 'torch'
require 'xlua'
require 'optim'
require 'gnuplot'
require 'pl'
require 'trepl'
require 'adaMax_binary_clip_shift'
require 'adam_binary_clip_b'
require 'nn'
require 'SqrHingeEmbeddingCriterion'
----------------------------------------------------------------------

cmd = torch.CmdLine()
cmd:addTime()
cmd:text()
cmd:text('Training a convolutional network for visual classification')
cmd:text()
cmd:text('==>Options')

cmd:text('===>Model And Training Regime')
cmd:option('-modelsFolder',       './Models/',            'Models Folder')
cmd:option('-network',            'Model.lua',            'Model file - must return valid network.')
cmd:option('-LR',                 2^-6,                   'learning rate')
cmd:option('-LRDecay',            0,                      'learning rate decay (in # samples)')
cmd:option('-weightDecay',        0.0,                    'L2 penalty on the weights')
cmd:option('-momentum',           0.0,                    'momentum')
cmd:option('-batchSize',          200,                    'batch size')
cmd:option('-stcNeurons',         true,                   'use stochastic binarization for the neurons')
cmd:option('-stcWeights',         false,                  'use stochastic binarization for the weights')
cmd:option('-optimization',       'adam',                 'optimization method')
cmd:option('-SBN',                true,                   'shift based batch-normalization')
cmd:option('-runningVal',         false,                  'use running mean and std')
cmd:option('-epoch',              -1,                     'number of epochs to train, -1 for unbounded')

cmd:text('===>Platform Optimization')
cmd:option('-threads',            8,                      'number of threads')
cmd:option('-type',               'cuda',                 'float or cuda')
cmd:option('-devid',              1,                      'device ID (if using CUDA)')
cmd:option('-nGPU',               1,                      'num of gpu devices used')
cmd:option('-constBatchSize',     false,                  'do not allow varying batch sizes - e.g for ccn2 kernel')


cmd:text('===>Save/Load Options')
cmd:option('-load',               '',                     'load existing net weights')
cmd:option('-save',               os.date():gsub(' ',''), 'save directory')

cmd:text('===>Data Options')
cmd:option('-dataset',            'Cifar10',              'Dataset - Cifar10, Cifar100, STL10, SVHN, MNIST')
cmd:option('-normalization',      'simple',               'simple - whole sample, channel - by image channel, image - mean and std images')
cmd:option('-format',             'rgb',                  'rgb or yuv')
cmd:option('-whiten',             true,                   'whiten data')
cmd:option('-dp_prepro',          false,                   'preprocessing using dp lib')
cmd:option('-augment',            false,                  'Augment training data')
cmd:option('-preProcDir',         './PreProcData/',       'Data for pre-processing (means,P,invP)')
cmd:text('===>Misc')
cmd:option('-visualize',          0,                      'visualizing results')

torch.manualSeed(432)
opt = cmd:parse(arg or {})
opt.network = opt.modelsFolder .. paths.basename(opt.network, '.lua')
opt.save = paths.concat('./Results', opt.save)
opt.preProcDir = paths.concat(opt.preProcDir, opt.dataset .. '/')

-- If you choose to use exponentialy decaying learning rate use uncomment this line
--opt.LRDecay=torch.pow((2e-6/opt.LR),(1./500));
--
os.execute('mkdir -p ' .. opt.preProcDir)
torch.setnumthreads(opt.threads)

torch.setdefaulttensortype('torch.FloatTensor')
if opt.augment then
    require 'image'
end
----------------------------------------------------------------------
-- Model + Loss:
local modelAll = require(opt.network)
model=modelAll.model
GLRvec=modelAll.lrs
clipV=modelAll.clipV

local loss = SqrtHingeEmbeddingCriterion(1)


local data = require 'Data'
local classes = data.Classes

----------------------------------------------------------------------

-- This matrix records the current confusion across classes
local confusion = optim.ConfusionMatrix(classes)

local AllowVarBatch = not opt.constBatchSize


----------------------------------------------------------------------


-- Output files configuration
os.execute('mkdir -p ' .. opt.save)
cmd:log(opt.save .. '/Log.txt', opt)
local netFilename = paths.concat(opt.save, 'Net')
local logFilename = paths.concat(opt.save,'ErrorRate.log')
local optStateFilename = paths.concat(opt.save,'optState')
local Log = optim.Logger(logFilename)
----------------------------------------------------------------------

local TensorType = 'torch.FloatTensor'
if paths.filep(opt.load) then
    model = torch.load(opt.load)
    print('==>Loaded model from: ' .. opt.load)
    print(model)
end
if opt.type =='cuda' then
    require 'cutorch'
    cutorch.setDevice(opt.devid)
    cutorch.setHeapTracking(true)
    model:cuda()
    GLRvec=GLRvec:cuda()
    clipV=clipV:cuda()
    loss = loss:cuda()
    TensorType = 'torch.CudaTensor'
end


---Support for multiple GPUs - currently data parallel scheme
if opt.nGPU > 1 then
    local net = model
    model = nn.DataParallelTable(1)
    for i = 1, opt.nGPU do
        cutorch.setDevice(i)
        model:add(net:clone():cuda(), i)  -- Use the ith GPU
    end
    cutorch.setDevice(opt.devid)
end

-- Optimization configuration
local Weights,Gradients = model:getParameters()


----------------------------------------------------------------------
print '==> Network'
print(model)
print('==>' .. Weights:nElement() ..  ' Parameters')

print '==> Loss'
print(loss)


------------------Optimization Configuration--------------------------
local optimState = {
    learningRate = opt.LR,
    momentum = opt.momentum,
    weightDecay = opt.weightDecay,
    learningRateDecay = opt.LRDecay,
    GLRvec=GLRvec,
    clipV=clipV
}
----------------------------------------------------------------------

local function SampleImages(images,labels)
    if not opt.augment then
        return images,labels
    else

        local sampled_imgs = images:clone()
        for i=1,images:size(1) do
            local sz = math.random(9) - 1
            local hflip = math.random(2)==1

            local startx = math.random(sz)
            local starty = math.random(sz)
            local img = images[i]:narrow(2,starty,32-sz):narrow(3,startx,32-sz)
            if hflip then
                img = image.hflip(img)
            end
            img = image.scale(img,32,32)
            sampled_imgs[i]:copy(img)
        end
        return sampled_imgs,labels
    end
end


------------------------------
local function Forward(Data, train)


  local MiniBatch = DataProvider.Container{
    Name = 'GPU_Batch',
    MaxNumItems = opt.batchSize,
    Source = Data,
    ExtractFunction = SampleImages,
    TensorType = TensorType
  }

  local yt = MiniBatch.Labels
  local x = MiniBatch.Data
  local SizeData = Data:size()
  if not AllowVarBatch then SizeData = math.floor(SizeData/opt.batchSize)*opt.batchSize end

  local NumSamples = 0
  local NumBatches = 0
  local lossVal = 0

  while NumSamples < SizeData do
    MiniBatch:getNextBatch()
    local y, currLoss
    NumSamples = NumSamples + x:size(1)
    NumBatches = NumBatches + 1
    if opt.nGPU > 1 then
      model:syncParameters()
    end
    y = model:forward(x)
    one_hot_yt=torch.zeros(yt:size(1),10)
    one_hot_yt:scatter(2, yt:long():view(-1,1), 1)
    one_hot_yt=one_hot_yt:mul(2):float():add(-1)
    if opt.type == 'cuda' then
      one_hot_yt=one_hot_yt:cuda()
    end

    currLoss = loss:forward(y,one_hot_yt)
    if train then
      function feval()
        model:zeroGradParameters()
        local dE_dy = loss:backward(y, one_hot_yt)
        model:backward(x, dE_dy)
        return currLoss, Gradients
      end
       --_G.optim[opt.optimization](feval, Weights, optimState) -- If you choose to use different optimization remember to clip the weights
       adaMax_binary_clip_shift(feval, Weights, optimState)
    end

    lossVal = currLoss + lossVal

    if type(y) == 'table' then --table results - always take first prediction
      y = y[1]
    end

    confusion:batchAdd(y,one_hot_yt)
    xlua.progress(NumSamples, SizeData)
    if math.fmod(NumBatches,100)==0 then
      collectgarbage()
    end
  end
  return(lossVal/math.ceil(SizeData/opt.batchSize))
end

------------------------------
local function Train(Data)
  model:training()
  return Forward(Data, true)
end

local function Test(Data)
  model:evaluate()
  return Forward(Data, false)
end
------------------------------

local epoch = 1
print '\n==> Starting Training\n'


while epoch ~= opt.epoch do
    data.TrainData:shuffleItems()
    print('Epoch ' .. epoch)
    --Train
    confusion:zero()
    local LossTrain = Train(data.TrainData)
    if epoch%10==0 then
      torch.save(netFilename, model)
    end
    confusion:updateValids()
    local ErrTrain = (1-confusion.totalValid)
    if #classes <= 10 then
        print(confusion)
    end
    print('Training Error = ' .. ErrTrain)
    print('Training Loss = ' .. LossTrain)

    --validation
    confusion:zero()
    local LossValid = Test(data.ValidData)
    confusion:updateValids()
    local ErrValid = (1-confusion.totalValid)
    if #classes <= 10 then
        print(confusion)
    end
    print('Valid Error = ' .. ErrValid)
    print('Valid Loss = ' .. LossValid)

    --Test
    confusion:zero()
    local LossTest = Test(data.TestData)
    confusion:updateValids()
    local ErrTest = (1-confusion.totalValid)
    if #classes <= 10 then
        print(confusion)
    end

    print('Test Error = ' .. ErrTest)
    print('Test Loss = ' .. LossTest)

    Log:add{['Training Error']= ErrTrain, ['Valid Error'] = ErrValid, ['Test Error'] = ErrTest}
    -- the training stops at epoch 3 if visualize is set to 1
    if opt.visualize == 1 then
        Log:style{['Training Error'] = '-',['Validation Error'] = '-', ['Test Error'] = '-'}
        Log:plot()
    end
    --optimState.learningRate=optimState.learningRate*opt.LRDecay
    if epoch%50==0  then
      optimState.learningRate=optimState.learningRate*0.5
    else
      optimState.learningRate=optimState.learningRate --*opt.LRDecay
    end
    print('-------------------LR-------------------')
    print(optimState.learningRate)
    epoch = epoch + 1
end


================================================
FILE: Main_BinaryNet_MNIST.lua
================================================
require 'torch'
require 'xlua'
require 'optim'
require 'gnuplot'
require 'pl'
require 'trepl'
require 'adaMax_binary_clip_shift'
require 'nn'
require 'SqrHingeEmbeddingCriterion'
----------------------------------------------

cmd = torch.CmdLine()
cmd:addTime()
cmd:text()
cmd:text('Training a convolutional network for visual classification')
cmd:text()
cmd:text('==>Options')

cmd:text('===>Model And Training Regime')
cmd:option('-modelsFolder',       './Models/',            'Models Folder')
cmd:option('-network',            'Model.lua',            'Model file - must return valid network.')
cmd:option('-LR',                 2^-6,                    'learning rate')
cmd:option('-LRDecay',            0,                     'learning rate decay (in # samples)')
cmd:option('-weightDecay',        0.0,                   'L2 penalty on the weights')
cmd:option('-momentum',           0.0,                    'momentum')
cmd:option('-batchSize',          100,                    'batch size')
cmd:option('-stcNeurons',         true,                    'batch size')
cmd:option('-stcWeights',         false,                    'batch size')
cmd:option('-optimization',       'adam',                  'optimization method')
cmd:option('-SBN',                true,                   'shift based batch-normalization')
cmd:option('-runningVal',         true,                    'use running mean and std')
cmd:option('-epoch',              -1,                     'number of epochs to train, -1 for unbounded')

cmd:text('===>Platform Optimization')
cmd:option('-threads',            8,                      'number of threads')
cmd:option('-type',               'cuda',                 'float or cuda')
cmd:option('-devid',              1,                      'device ID (if using CUDA)')
cmd:option('-nGPU',               1,                      'num of gpu devices used')
cmd:option('-constBatchSize',     false,                    'do not allow varying batch sizes - e.g for ccn2 kernel')

cmd:text('===>Save/Load Options')
cmd:option('-load',               '',                  'load existing net weights')
cmd:option('-save',               os.date():gsub(' ',''), 'save directory')

cmd:text('===>Data Options')
cmd:option('-dataset',            'MNIST',              'Dataset - Cifar10, Cifar100, STL10, SVHN, MNIST')
cmd:option('-normalization',      'simple',               'simple - whole sample, channel - by image channel, image - mean and std images')
cmd:option('-format',             'rgb',                  'rgb or yuv')
cmd:option('-whiten',             false,                  'whiten data')
cmd:option('-dp_prepro',          false,                  'preprocessing using dp lib')
cmd:option('-augment',            false,                  'Augment training data')
cmd:option('-preProcDir',         './PreProcData/',       'Data for pre-processing (means,P,invP)')

cmd:text('===>Misc')
cmd:option('-visualize',          1,                      'visualizing results')

torch.manualSeed(432)
opt = cmd:parse(arg or {})
opt.network = opt.modelsFolder .. paths.basename(opt.network, '.lua')
opt.save = paths.concat('./Results', opt.save)
opt.preProcDir = paths.concat(opt.preProcDir, opt.dataset .. '/')


-- If you choose to use exponentialy decaying learning rate use uncomment this line
--opt.LRDecay=torch.pow((2e-6/opt.LR),(1./500));
--


os.execute('mk1ir -p ' .. opt.preProcDir)
torch.setnumthreads(opt.threads)

torch.setdefaulttensortype('torch.FloatTensor')
if opt.augment then
    require 'image'
end
----------------------------------------------------------------------

local modelAll = require(opt.network)
model=modelAll.model
GLRvec=modelAll.lrs
clipV=modelAll.clipV
local loss = SqrtHingeEmbeddingCriterion(1)

local data = require 'Data'
local classes = data.Classes

----------------------------------------------------------------------

-- This matrix records the current confusion across classes
local confusion = optim.ConfusionMatrix(classes)

local AllowVarBatch = not opt.constBatchSize


----------------------------------------------------------------------


-- Output files configuration
os.execute('mkdir -p ' .. opt.save)
cmd:log(opt.save .. '/Log.txt', opt)
local netFilename = paths.concat(opt.save, 'Net')
local logFilename = paths.concat(opt.save,'ErrorRate.log')
local optStateFilename = paths.concat(opt.save,'optState')
local Log = optim.Logger(logFilename)
----------------------------------------------------------------------

local TensorType = 'torch.FloatTensor'
if paths.filep(opt.load) then
    model = torch.load(opt.load)
    print('==>Loaded model from: ' .. opt.load)
    print(model)
end
if opt.type =='cuda' then
    require 'cutorch'
    cutorch.setDevice(opt.devid)
    cutorch.setHeapTracking(true)
    model:cuda()
    GLRvec=GLRvec:cuda()
    clipV=clipV:cuda()
    loss = loss:cuda()
    TensorType = 'torch.CudaTensor'
end


---Support for multiple GPUs - currently data parallel scheme
if opt.nGPU > 1 then
    local net = model
    model = nn.DataParallelTable(1)
    for i = 1, opt.nGPU do
        cutorch.setDevice(i)
        model:add(net:clone():cuda(), i)  -- Use the ith GPU
    end
    cutorch.setDevice(opt.devid)
end

-- Optimization configuration
local Weights,Gradients = model:getParameters()


----------------------------------------------------------------------
print '==> Network'
print(model)
print('==>' .. Weights:nElement() ..  ' Parameters')

print '==> Loss'
print(loss)


------------------Optimization Configuration--------------------------
local optimState = {
    learningRate = opt.LR,
    momentum = opt.momentum,
    weightDecay = opt.weightDecay,
    learningRateDecay = opt.LRDecay,
    GLRvec=GLRvec,
    clipV=clipV
}
----------------------------------------------------------------------

local function SampleImages(images,labels)
    if not opt.augment then
        return images,labels
    else

        local sampled_imgs = images:clone()
        for i=1,images:size(1) do
            local sz = math.random(9) - 1
            local hflip = math.random(2)==1

            local startx = math.random(sz)
            local starty = math.random(sz)
            local img = images[i]:narrow(2,starty,32-sz):narrow(3,startx,32-sz)
            if hflip then
                img = image.hflip(img)
            end
            img = image.scale(img,32,32)
            sampled_imgs[i]:copy(img)
        end
        return sampled_imgs,labels
    end
end


------------------------------
local function Forward(Data, train)


  local MiniBatch = DataProvider.Container{
    Name = 'GPU_Batch',
    MaxNumItems = opt.batchSize,
    Source = Data,
    ExtractFunction = SampleImages,
    TensorType = TensorType
  }

  local yt = MiniBatch.Labels
  local x = MiniBatch.Data
  local SizeData = Data:size()
  if not AllowVarBatch then SizeData = math.floor(SizeData/opt.batchSize)*opt.batchSize end

  local NumSamples = 0
  local NumBatches = 0
  local lossVal = 0

  while NumSamples < SizeData do
    MiniBatch:getNextBatch()
    local y, currLoss
    NumSamples = NumSamples + x:size(1)
    NumBatches = NumBatches + 1
    if opt.nGPU > 1 then
      model:syncParameters()
    end

    y = model:forward(x)

    one_hot_yt=torch.zeros(yt:size(1),10)
    one_hot_yt:scatter(2, yt:long():view(-1,1), 1)
    one_hot_yt=one_hot_yt:mul(2):float():add(-1):cuda()


    currLoss = loss:forward(y,one_hot_yt)
    if train then
      function feval()
        model:zeroGradParameters()
        local dE_dy = loss:backward(y, one_hot_yt)
        model:backward(x, dE_dy)
        return currLoss, Gradients
      end


       adaMax_binary_clip_shift(feval, Weights, optimState)

      local indLayer=0
      for i, layer in ipairs(model.modules) do
          indLayer=indLayer+1;
          if layer.__typename == 'cudnnBinarySpatialConvolution' then
            model.modules[indLayer].weight:clamp(-1,1)
          elseif layer.__typename == 'BinaryLinear' then
            --print(indLayer)
            model.modules[indLayer].weight:clamp(-1,1)
        end
      end
    end

    lossVal = currLoss + lossVal

    if type(y) == 'table' then --table results - always take first prediction
      y = y[1]
    end


    confusion:batchAdd(y,one_hot_yt)
    xlua.progress(NumSamples, SizeData)
    if math.fmod(NumBatches,100)==0 then
      collectgarbage()
    end
  end
  return(lossVal/math.ceil(SizeData/opt.batchSize))
end

------------------------------
local function Train(Data)
  model:training()
  return Forward(Data, true)
end

local function Test(Data)
  model:evaluate()
  return Forward(Data, false)
end
------------------------------

local epoch = 1
print '\n==> Starting Training\n'

local epoch = 1
print '\n==> Starting Training\n'

while epoch ~= opt.epoch do
    data.TrainData:shuffleItems()
    print('Epoch ' .. epoch)
    --Train
    confusion:zero()
    local LossTrain = Train(data.TrainData)
    if epoch%10==0 then
      torch.save(netFilename, model)
    end
    confusion:updateValids()
    local ErrTrain = (1-confusion.totalValid)
    if #classes <= 10 then
        print(confusion)
    end
    print('Training Error = ' .. ErrTrain)
    print('Training Loss = ' .. LossTrain)

    --validation
    confusion:zero()
    local LossValid = Test(data.ValidData)
    confusion:updateValids()
    local ErrValid = (1-confusion.totalValid)
    if #classes <= 10 then
        print(confusion)
    end
    print('Valid Error = ' .. ErrValid)
    print('Valid Loss = ' .. LossValid)

    --Test
    confusion:zero()
    local LossTest = Test(data.TestData)
    confusion:updateValids()
    local ErrTest = (1-confusion.totalValid)
    if #classes <= 10 then
        print(confusion)
    end

    print('Test Error = ' .. ErrTest)
    print('Test Loss = ' .. LossTest)

    Log:add{['Training Error']= ErrTrain, ['Valid Error'] = ErrValid, ['Test Error'] = ErrTest}
    if opt.visualize == 1 then
        Log:style{['Training Error'] = '-',['Validation Error'] = '-', ['Test Error'] = '-'}
        Log:plot()
    end
    if epoch%20==0  then
      optimState.learningRate=optimState.learningRate*0.5
    else
      optimState.learningRate=optimState.learningRate --*opt.LRDecay
    end
    print('-------------------LR-------------------')
    print(optimState.learningRate)
    epoch = epoch + 1
end


================================================
FILE: Main_BinaryNet_SVHN.lua
================================================
require 'torch'
require 'xlua'
require 'optim'
require 'gnuplot'
require 'pl'
require 'trepl'
require 'adaMax_binary_clip_shift'
require 'nn'
require 'SqrHingeEmbeddingCriterion'
----------------------------------------------------------------------

cmd = torch.CmdLine()
cmd:addTime()
cmd:text()
cmd:text('Training a convolutional network for visual classification')
cmd:text()
cmd:text('==>Options')

cmd:text('===>Model And Training Regime')
cmd:option('-modelsFolder',       './Models/',            'Models Folder')
cmd:option('-network',            'Model.lua',            'Model file - must return valid network.')
cmd:option('-LR',                 2^-7,                    'learning rate')
cmd:option('-LRDecay',            0,                     'learning rate decay (in # samples)')
cmd:option('-weightDecay',        0.0,                   'L2 penalty on the weights')
cmd:option('-momentum',           0.0,                    'momentum')
cmd:option('-batchSize',          200,                    'batch size')
cmd:option('-stcNeurons',         true,                    'batch size')
cmd:option('-stcWeights',         false,                    'batch size')
cmd:option('-optimization',       'adam',                  'optimization method')
cmd:option('-SBN',                true,                   'shift based batch-normalization')
cmd:option('-runningVal',         true,                    'use running mean and std')
cmd:option('-epoch',              -1,                     'number of epochs to train, -1 for unbounded')

cmd:text('===>Platform Optimization')
cmd:option('-threads',            8,                      'number of threads')
cmd:option('-type',               'cuda',                 'float or cuda')
cmd:option('-devid',              1,                      'device ID (if using CUDA)')
cmd:option('-nGPU',               1,                      'num of gpu devices used')
cmd:option('-constBatchSize',     false,                    'do not allow varying batch sizes - e.g for ccn2 kernel')

cmd:text('===>Save/Load Options')
cmd:option('-load',               '',                  'load existing net weights')
cmd:option('-save',               os.date():gsub(' ',''), 'save directory')

cmd:text('===>Data Options')
cmd:option('-dataset',            'SVHN',              'Dataset - Cifar10, Cifar100, STL10, SVHN, MNIST')
cmd:option('-normalization',      'simple',               'simple - whole sample, channel - by image channel, image - mean and std images')
cmd:option('-format',             'rgb',                  'rgb or yuv')
cmd:option('-whiten',             false,                  'whiten data')
cmd:option('-dp_prepro',          true,                   'preprocessing using dp lib')
cmd:option('-augment',            false,                  'Augment training data')
cmd:option('-preProcDir',         './PreProcData/',       'Data for pre-processing (means,P,invP)')

cmd:text('===>Misc')
cmd:option('-visualize',          1,                      'visualizing results')

torch.manualSeed(432)
opt = cmd:parse(arg or {})
opt.network = opt.modelsFolder .. paths.basename(opt.network, '.lua')
opt.save = paths.concat('./Results', opt.save)
opt.preProcDir = paths.concat(opt.preProcDir, opt.dataset .. '/')


-- If you choose to use exponentialy decaying learning rate use uncomment this line
--opt.LRDecay=torch.pow((2e-6/opt.LR),(1./500));
--

os.execute('mk1ir -p ' .. opt.preProcDir)
torch.setnumthreads(opt.threads)

torch.setdefaulttensortype('torch.FloatTensor')
if opt.augment then
    require 'image'
end
----------------------------------------------------------------------
-- Model + Loss:
local modelAll = require(opt.network)
model=modelAll.model
GLRvec=modelAll.lrs
clipV=modelAll.clipV

local loss = SqrtHingeEmbeddingCriterion(1) --nn.ClassNLLCriterion()
local data = require 'Data'
local classes = data.Classes

----------------------------------------------------------------------

-- This matrix records the current confusion across classes
local confusion = optim.ConfusionMatrix(classes)

local AllowVarBatch = not opt.constBatchSize


----------------------------------------------------------------------


-- Output files configuration
os.execute('mkdir -p ' .. opt.save)
cmd:log(opt.save .. '/Log.txt', opt)
local netFilename = paths.concat(opt.save, 'Net')
local logFilename = paths.concat(opt.save,'ErrorRate.log')
local optStateFilename = paths.concat(opt.save,'optState')
local Log = optim.Logger(logFilename)
----------------------------------------------------------------------

local TensorType = 'torch.FloatTensor'

if opt.type =='cuda' then
    require 'cutorch'
    cutorch.setDevice(opt.devid)
    cutorch.setHeapTracking(true)
    model:cuda()
    GLRvec=GLRvec:cuda()
    clipV=clipV:cuda()
    loss = loss:cuda()
    TensorType = 'torch.CudaTensor'
end
if paths.filep(opt.load) then
    model = torch.load(opt.load)
    print('==>Loaded model from: ' .. opt.load)
    print(model)
end


---Support for multiple GPUs - currently data parallel scheme
if opt.nGPU > 1 then
    local net = model
    model = nn.DataParallelTable(1)
    for i = 1, opt.nGPU do
        cutorch.setDevice(i)
        model:add(net:clone():cuda(), i)  -- Use the ith GPU
    end
    cutorch.setDevice(opt.devid)
end

-- Optimization configuration
local Weights,Gradients = model:getParameters()


----------------------------------------------------------------------
print '==> Network'
print(model)
print('==>' .. Weights:nElement() ..  ' Parameters')

print '==> Loss'
print(loss)


------------------Optimization Configuration--------------------------
local optimState = {
    learningRate = opt.LR,
    momentum = opt.momentum,
    weightDecay = opt.weightDecay,
    learningRateDecay = opt.LRDecay,
    GLRvec=GLRvec,
    clipV=clipV
}
----------------------------------------------------------------------

local function SampleImages(images,labels)
    if not opt.augment then
        return images,labels
    else

        local sampled_imgs = images:clone()
        for i=1,images:size(1) do
            local sz = math.random(9) - 1
            local hflip = math.random(2)==1

            local startx = math.random(sz)
            local starty = math.random(sz)
            local img = images[i]:narrow(2,starty,32-sz):narrow(3,startx,32-sz)
            if hflip then
                img = image.hflip(img)
            end
            img = image.scale(img,32,32)
            sampled_imgs[i]:copy(img)
        end
        return sampled_imgs,labels
    end
end


------------------------------
local function Forward(Data, train)


  local MiniBatch = DataProvider.Container{
    Name = 'GPU_Batch',
    MaxNumItems = opt.batchSize,
    Source = Data,
    ExtractFunction = SampleImages,
    TensorType = TensorType
  }

  local yt = MiniBatch.Labels
  local x = MiniBatch.Data
  local SizeData = Data:size()
  if not AllowVarBatch then SizeData = math.floor(SizeData/opt.batchSize)*opt.batchSize end

  local NumSamples = 0
  local NumBatches = 0
  local lossVal = 0

  while NumSamples < SizeData do
    MiniBatch:getNextBatch()
    local y, currLoss
    NumSamples = NumSamples + x:size(1)
    NumBatches = NumBatches + 1
    if opt.nGPU > 1 then
      model:syncParameters()
    end

    y = model:forward(x)
    one_hot_yt=torch.zeros(yt:size(1),10)
    one_hot_yt:scatter(2, yt:long():view(-1,1), 1)
    one_hot_yt=one_hot_yt:mul(2):float():add(-1):cuda()

    currLoss = loss:forward(y,one_hot_yt)
    if train then
      function feval()
        model:zeroGradParameters()
        local dE_dy = loss:backward(y, one_hot_yt)
        model:backward(x, dE_dy)
        return currLoss, Gradients
      end

       adaMax_binary_clip_shift(feval, Weights, optimState)

      local indLayer=0
      for i, layer in ipairs(model.modules) do
          indLayer=indLayer+1;
          if layer.__typename == 'cudnnBinarySpatialConvolution' then
            model.modules[indLayer].weight:copy(model.modules[indLayer].weight:clamp(-1,1))
          elseif layer.__typename == 'BinaryLinear' then
            model.modules[indLayer].weight:copy(model.modules[indLayer].weight:clamp(-1,1))
        end
      end
    end

    lossVal = currLoss + lossVal

    if type(y) == 'table' then --table results - always take first prediction
      y = y[1]
    end


    confusion:batchAdd(y,one_hot_yt)
    xlua.progress(NumSamples, SizeData)
    if math.fmod(NumBatches,100)==0 then
      collectgarbage()
    end
  end
  return(lossVal/math.ceil(SizeData/opt.batchSize))
end

------------------------------
local function Train(Data)
  model:training()
  return Forward(Data, true)
end

local function Test(Data)
  model:evaluate()
  return Forward(Data, false)
end
------------------------------

local epoch = 1
print '\n==> Starting Training\n'


while epoch ~= opt.epoch do
    data.TrainData:shuffleItems()
    print('Epoch ' .. epoch)
    --Train
    confusion:zero()
    local LossTrain = Train(data.TrainData)
    if epoch%10==0 then
      torch.save(netFilename, model)
    end
    confusion:updateValids()
    local ErrTrain = (1-confusion.totalValid)
    if #classes <= 10 then
        print(confusion)
    end
    print('Training Error = ' .. ErrTrain)
    print('Training Loss = ' .. LossTrain)
    --validation
    confusion:zero()
    local LossValid = Test(data.ValidData)
    confusion:updateValids()
    local ErrValid = (1-confusion.totalValid)
    if #classes <= 10 then
        print(confusion)
    end
    print('Valid Error = ' .. ErrValid)
    print('Valid Loss = ' .. LossValid)
    --Test
    confusion:zero()
    local LossTest = Test(data.TestData)
    confusion:updateValids()
    local ErrTest = (1-confusion.totalValid)
    if #classes <= 10 then
        print(confusion)
    end

    print('Test Error = ' .. ErrTest)
    print('Test Loss = ' .. LossTest)

    Log:add{['Training Error']= ErrTrain, ['Valid Error'] = ErrValid, ['Test Error'] = ErrTest}
    if opt.visualize == 1 then
        Log:style{['Training Error'] = '-',['Validation Error'] = '-', ['Test Error'] = '-'}
        Log:plot()
    end
    if epoch%20==0 then
      optimState.learningRate=optimState.learningRate*0.5
    else
      optimState.learningRate=optimState.learningRate
    end
    print('-------------------LR-------------------')
    print(optimState.learningRate)


    epoch = epoch + 1
end


================================================
FILE: Models/BatchNormalizationShiftPow2.lua
================================================
--[[
   This file implements Shift based Batch Normalization based a variant of the vanilla BN as described in the paper:
   "Binarized Neural Networks: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Matthieu Courbariaux, Itay Hubara, Daniel Soudry, Ran El-Yaniv, Yoshua Bengio'

   The code is based on nn library
   --]]


local BatchNormalizationShiftPow2,parent = torch.class('BatchNormalizationShiftPow2', 'nn.Module')

function BatchNormalizationShiftPow2:__init(nOutput, runningVal, eps, momentum, affine)
   parent.__init(self)
   assert(nOutput and type(nOutput) == 'number',
          'Missing argument #1: dimensionality of input. ')
   assert(nOutput ~= 0, 'To set affine=false call BatchNormalization'
     .. '(nOutput,  eps, momentum, false) ')
   if affine ~= nil then
      assert(type(affine) == 'boolean', 'affine has to be true/false')
      self.affine = affine
   else
      self.affine = true
   end
   self.eps = eps or 1e-5
   self.train = true
   self.momentum = momentum or 0.125
   self.runningVal = runningVal or true
   self.running_mean = torch.zeros(nOutput)
   self.running_std = torch.ones(nOutput)
   self.running_std_ap2 = torch.ones(nOutput)
   if self.affine then
      self.weight = torch.Tensor(nOutput)
      self.weightSign = torch.Tensor(nOutput)
      self.weight_ap2 = torch.Tensor(nOutput)
      self.bias = torch.Tensor(nOutput)
      self.gradWeight = torch.Tensor(nOutput)
      self.gradBias = torch.Tensor(nOutput)
      self:reset()
   end
end

function BatchNormalizationShiftPow2:reset()
   self.weight:fill(1)
   self.bias:zero()
   self.running_mean:zero()
   self.running_std:fill(1)
end

function BatchNormalizationShiftPow2:updateOutput(input)
   assert(input:dim() == 2, 'only mini-batch supported (2D tensor), got '
             .. input:dim() .. 'D tensor instead')
   local nBatch = input:size(1)
   -- buffers that are reused
   self.buffer = self.buffer or input.new()
   self.buffer2 = self.buffer2 or input.new()
   self.centered = self.centered or input.new()
   self.centered:resizeAs(input)
   self.centerSign = self.centerSign or input.new()
   self.centerSign:resizeAs(input)
   self.centeredOrg = self.centeredOrg or input.new()
   self.centeredOrg:resizeAs(input)
   self.std = self.std or input.new()
   self.normalized = self.normalized or input.new()
   self.normalized:resizeAs(input)
   self.normalizedSign = self.normalizedSign or input.new()
   self.normalizedSign:resizeAs(input)
   self.output:resizeAs(input)
   self.gradInput:resizeAs(input)
   if self.train == false and self.runningVal == true then
     self.output:copy(input)
     self.buffer:repeatTensor(self.running_mean, nBatch, 1)
     self.output:add(-1, self.buffer)
     self.running_std_ap2:copy(torch.pow(2,torch.round(torch.log(self.running_std):div(math.log(2)))))
     self.buffer:repeatTensor(self.running_std_ap2, nBatch, 1)
     self.output:cmul(self.buffer)
   else -- training mode
      -- calculate mean over mini-batch
      self.buffer:mean(input, 1)                        -- E(x) = expectation of x.
      self.running_mean:mul(1 - self.momentum):add(self.momentum, self.buffer) -- add to running mean
      self.buffer:repeatTensor(self.buffer, nBatch, 1)

      -- subtract mean
      self.centered:add(input, -1, self.buffer)         -- x - E(x)
      self.centeredOrg:copy(self.centered)
      self.centerSign:copy(self.centered)
      self.centerSign:sign()
      self.centered:copy(torch.pow(2,torch.round(torch.log(self.centered:abs()):div(math.log(2))))):cmul(self.centerSign)
      -- calculate standard deviation over mini-batch
      self.buffer:copy(self.centered):cmul(self.centeredOrg) -- [x - E(x)]^2
      -- 1 / E([x - E(x)]^2)
      self.std:mean(self.buffer, 1):add(self.eps):sqrt():pow(-1)
      self.running_std:mul(1 - self.momentum):add(self.momentum, self.std) -- add to running stdv
      self.std:copy(torch.pow(2,torch.round(torch.log(self.std):div(math.log(2)))))
      self.buffer:repeatTensor(self.std, nBatch, 1)

      -- divide standard-deviation + eps

      self.output:cmul(self.centeredOrg, self.buffer)
      self.normalized:copy(self.output)
      self.normalizedSign:copy(self.normalized)
      self.normalizedSign:sign()

      self.normalized:copy(torch.pow(2,torch.round(torch.log(self.normalized:abs()):div(math.log(2)))):cmul(self.normalizedSign))
      --self.normalized[self.normalized:lt(0)]=1; -- Can improve results
   end

   if self.affine then
      -- multiply with gamma and add beta
      self.weightSign:copy(self.weight)
      self.weightSign:sign()
      self.weight_ap2:copy(torch.pow(2,torch.round(torch.log(self.weight:clone():abs()):div(math.log(2))))):cmul(self.weightSign)
      --self.weight:fill(1) --Almost similar results
      self.buffer:repeatTensor(self.weight_ap2, nBatch, 1)
      self.output:cmul(self.buffer)
      self.buffer:repeatTensor(self.bias, nBatch, 1)
      self.output:add(self.buffer)
   end
   return self.output
end

function BatchNormalizationShiftPow2:updateGradInput(input, gradOutput)
   assert(input:dim() == 2, 'only mini-batch supported')
   assert(gradOutput:dim() == 2, 'only mini-batch supported')
   assert(self.train == true, 'should be in training mode when self.train is true')
   local nBatch = input:size(1)

   self.gradInput:cmul(self.centered, gradOutput)
   self.buffer:mean(self.gradInput, 1)
   self.gradInput:repeatTensor(self.buffer, nBatch, 1)
   self.gradInput:cmul(self.centered):mul(-1)
   self.buffer:repeatTensor(self.std, nBatch, 1)
   self.gradInput:cmul(self.buffer):cmul(self.buffer)

   self.buffer:mean(gradOutput, 1)
   self.buffer:repeatTensor(self.buffer, nBatch, 1)
   self.gradInput:add(gradOutput):add(-1, self.buffer)
   self.buffer:repeatTensor(self.std, nBatch, 1)
   self.gradInput:cmul(self.buffer)

   if self.affine then
      self.buffer:repeatTensor(self.weight_ap2, nBatch, 1)
      self.gradInput:cmul(self.buffer)
   end

   return self.gradInput
end

function BatchNormalizationShiftPow2:accGradParameters(input, gradOutput, scale)
   if self.affine then
      scale = scale or 1.0
      self.buffer2:resizeAs(self.normalized):copy(self.normalized)
      self.buffer2:cmul(gradOutput)
      self.buffer:sum(self.buffer2, 1) -- sum over mini-batch
      self.gradWeight:add(scale, self.buffer)
      self.buffer:sum(gradOutput, 1) -- sum over mini-batch
      self.gradBias:add(scale, self.buffer)
   end
end


================================================
FILE: Models/BinarizedNeurons.lua
================================================
local BinarizedNeurons,parent = torch.class('BinarizedNeurons', 'nn.Module')


function BinarizedNeurons:__init(stcFlag)
   parent.__init(self)
   self.stcFlag = stcFlag
   self.randmat=torch.Tensor();
   self.outputR=torch.Tensor();
 end
function BinarizedNeurons:updateOutput(input)
    self.randmat:resizeAs(input);
    self.outputR:resizeAs(input);
    self.output:resizeAs(input);
    self.outputR:copy(input):add(1):div(2)
     if self.train and self.stcFlag then
       local mask=self.outputR-self.randmat:rand(self.randmat:size())
       self.output=mask:sign()
     else
       self.output:copy(self.outputR):add(-0.5):sign()
     end
   return self.output
end

function BinarizedNeurons:updateGradInput(input, gradOutput)
        self.gradInput:resizeAs(gradOutput)
        self.gradInput:copy(gradOutput) --:mul(0.5)
   return self.gradInput
end


================================================
FILE: Models/BinaryLinear.lua
================================================
--require 'randomkit'

local BinaryLinear, parent = torch.class('BinaryLinear', 'nn.Linear')

function BinaryLinear:__init(inputSize, outputSize,stcWeights)
   local delayedReset = self.reset
   self.reset = function() end
   parent.__init(self, inputSize, outputSize)
   self.reset = delayedReset

   self.weight = torch.Tensor(outputSize, inputSize)
   self.weightB = torch.Tensor(outputSize, inputSize)
   self.weightOrg = torch.Tensor(outputSize, inputSize)
   self.maskStc = torch.Tensor(outputSize, inputSize)
   self.randmat = torch.Tensor(outputSize, inputSize)
   self.bias = torch.Tensor(outputSize)
   self.gradWeight = torch.Tensor(outputSize, inputSize)
   self.gradBias = torch.Tensor(outputSize)
   self.stcWeights=stcWeights
   self:reset()
   -- should nil for serialization, the reset will still work
   self.reset = nil
end

function BinaryLinear:reset(stdv)
   if stdv then
      stdv = stdv * math.sqrt(3)
   else
      stdv = 1./math.sqrt(self.weight:size(2))
   end
   if nn.oldSeed then
      for i=1,self.weight:size(1) do
         self.weight:select(1, i):apply(function()
            return torch.uniform(-1, 1)
         end)
         self.bias[i] = torch.uniform(-stdv, stdv)
      end
   else
      self.weight:uniform(-1, 1)
      self.bias:uniform(-stdv, stdv)
   end

   return self
end

function BinaryLinear:binarized(trainFlag)
  self.weightOrg:copy(self.weight)
  self.binaryFlag = true
  if not self.binaryFlag then
    self.weight:copy(self.weightOrg)
  else
    self.weightB:copy(self.weight):add(1):div(2):clamp(0,1)

    if not self.stcWeights or not trainFlag then
      self.weightB:round():mul(2):add(-1)
    else
      self.maskStc=self.weightB-self.randmat:rand(self.randmat:size())
      self.weightB:copy(self.maskStc)

    end
  end

  return  self.weightB
end

function BinaryLinear:updateOutput(input)

  self.weightB = self:binarized(self.train)
  self.weight:copy(self.weightB)
   parent.updateOutput(self,input)
   self.weight:copy(self.weightOrg);
   return self.output
end

function BinaryLinear:updateGradInput(input, gradOutput)

   if self.gradInput then
      self.weight:copy(self.weightB)
      parent.updateGradInput(self,input, gradOutput)
      self.weight:copy(self.weightOrg);
      return self.gradInput
   end

end

function BinaryLinear:accGradParameters(input, gradOutput, scale)
  parent.accGradParameters(self,input, gradOutput, scale)
end

-- we do not need to accumulate parameters when sharing
BinaryLinear.sharedAccUpdateGradParameters = BinaryLinear.accUpdateGradParameters


function BinaryLinear:__tostring__()
  return torch.type(self) ..
      string.format('(%d -> %d)', self.weight:size(2), self.weight:size(1))
end


================================================
FILE: Models/BinaryNet_Cifar10_Model.lua
================================================
--[[This code specify the model for CIFAR 10 dataset. This model uses the Shift based batch-normalization algorithm.
In this file we also secify the Glorot learning parameter and the which of the learnable parameter we clip ]]
require 'nn'
require './BinaryLinear.lua'
require './BinarizedNeurons'

local SpatialConvolution
local SpatialMaxPooling
if opt.type =='cuda' then
  require 'cunn'
  require 'cudnn'
  require './cudnnBinarySpatialConvolution.lua'
  SpatialConvolution = cudnnBinarySpatialConvolution
  SpatialMaxPooling = cudnn.SpatialMaxPooling
else
  require './BinarySpatialConvolution.lua'
  SpatialConvolution = BinarySpatialConvolution
  SpatialMaxPooling = nn.SpatialMaxPooling
end
if opt.SBN == true then
  require './BatchNormalizationShiftPow2.lua'
  require './SpatialBatchNormalizationShiftPow2.lua'
  BatchNormalization = BatchNormalizationShiftPow2
  SpatialBatchNormalization = SpatialBatchNormalizationShiftPow2
else
  BatchNormalization = nn.BatchNormalization
  SpatialBatchNormalization = nn.SpatialBatchNormalization
end
numHid=1024;
local model = nn.Sequential()

-- Convolution Layers
model:add(SpatialConvolution(3, 128, 3, 3 ,1,1,1,1,opt.stcWeights ))
model:add(SpatialBatchNormalization(128, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))

model:add(SpatialConvolution(128, 128, 3, 3,1,1,1,1,opt.stcWeights ))
model:add(SpatialMaxPooling(2, 2))
model:add(SpatialBatchNormalization(128, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))

model:add(SpatialConvolution(128, 256, 3, 3 ,1,1,1,1,opt.stcWeights ))
model:add(SpatialBatchNormalization(256, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))

model:add(SpatialConvolution(256, 256, 3, 3 ,1,1,1,1,opt.stcWeights ))
model:add(SpatialMaxPooling(2, 2))
model:add(SpatialBatchNormalization(256, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))

model:add(SpatialConvolution(256, 512, 3, 3,1,1,1,1,opt.stcWeights ))
model:add(SpatialBatchNormalization(512, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))

model:add(SpatialConvolution(512, 512, 3, 3,1,1,1,1,opt.stcWeights ))
model:add(SpatialMaxPooling(2, 2))
model:add(SpatialBatchNormalization(512, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))

model:add(nn.View(512*4*4))
model:add(BinaryLinear(512*4*4,numHid,opt.stcWeights))
model:add(BatchNormalization(numHid))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))

model:add(BinaryLinear(numHid,numHid,opt.stcWeights))
model:add(BatchNormalization(numHid, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))

model:add(BinaryLinear(numHid,10,opt.stcWeights))
model:add(nn.BatchNormalization(10))

local dE, param = model:getParameters()
local weight_size = dE:size(1)
local learningRates = torch.Tensor(weight_size):fill(0)
local clipvector = torch.Tensor(weight_size):fill(1)
local counter = 0
for i, layer in ipairs(model.modules) do
   if layer.__typename == 'BinaryLinear' then
      local weight_size = layer.weight:size(1)*layer.weight:size(2)
      local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]+size_w[2]))
      GLR=(math.pow(2,torch.round(math.log(GLR)/(math.log(2)))))
      learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)
      clipvector[{{counter+1, counter+weight_size}}]:fill(1)
      counter = counter+weight_size
      local bias_size = layer.bias:size(1)
      learningRates[{{counter+1, counter+bias_size}}]:fill(GLR)
      clipvector[{{counter+1, counter+bias_size}}]:fill(0)
      counter = counter+bias_size
    elseif layer.__typename == 'BatchNormalizationShiftPow2' then
        local weight_size = layer.weight:size(1)
        local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]))
        learningRates[{{counter+1, counter+weight_size}}]:fill(1)
        clipvector[{{counter+1, counter+weight_size}}]:fill(0)
        counter = counter+weight_size
        local bias_size = layer.bias:size(1)
        learningRates[{{counter+1, counter+bias_size}}]:fill(1)
        clipvector[{{counter+1, counter+bias_size}}]:fill(0)
        counter = counter+bias_size
    elseif layer.__typename == 'nn.BatchNormalization' then
      local weight_size = layer.weight:size(1)
      learningRates[{{counter+1, counter+weight_size}}]:fill(1)
      clipvector[{{counter+1, counter+weight_size}}]:fill(0)
      counter = counter+weight_size
      local bias_size = layer.bias:size(1)
      learningRates[{{counter+1, counter+bias_size}}]:fill(1)
      clipvector[{{counter+1, counter+bias_size}}]:fill(0)
      counter = counter+bias_size
    elseif layer.__typename == 'SpatialBatchNormalizationShiftPow2' then
        local weight_size = layer.weight:size(1)
        local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]))
        learningRates[{{counter+1, counter+weight_size}}]:fill(1)
        clipvector[{{counter+1, counter+weight_size}}]:fill(0)
        counter = counter+weight_size
        local bias_size = layer.bias:size(1)
        learningRates[{{counter+1, counter+bias_size}}]:fill(1)
        clipvector[{{counter+1, counter+bias_size}}]:fill(0)
        counter = counter+bias_size
    elseif layer.__typename == 'nn.SpatialBatchNormalization' then
            local weight_size = layer.weight:size(1)
            local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]))
            learningRates[{{counter+1, counter+weight_size}}]:fill(1)
            clipvector[{{counter+1, counter+weight_size}}]:fill(0)
            counter = counter+weight_size
            local bias_size = layer.bias:size(1)
            learningRates[{{counter+1, counter+bias_size}}]:fill(1)
            clipvector[{{counter+1, counter+bias_size}}]:fill(0)
            counter = counter+bias_size
    elseif layer.__typename == 'cudnnBinarySpatialConvolution' then
      local size_w=layer.weight:size();
      local weight_size = size_w[1]*size_w[2]*size_w[3]*size_w[4]

      local filter_size=size_w[3]*size_w[4]
      GLR=1/torch.sqrt(1.5/(size_w[1]*filter_size+size_w[2]*filter_size))
      GLR=(math.pow(2,torch.round(math.log(GLR)/(math.log(2)))))
      learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)
      clipvector[{{counter+1, counter+weight_size}}]:fill(1)
      counter = counter+weight_size
      local bias_size = layer.bias:size(1)
      learningRates[{{counter+1, counter+bias_size}}]:fill(GLR)
      clipvector[{{counter+1, counter+bias_size}}]:fill(0)
      counter = counter+bias_size
      elseif layer.__typename == 'BinarySpatialConvolution' then
        local size_w=layer.weight:size();
        local weight_size = size_w[1]*size_w[2]*size_w[3]*size_w[4]

        local filter_size=size_w[3]*size_w[4]
        GLR=1/torch.sqrt(1.5/(size_w[1]*filter_size+size_w[2]*filter_size))
        GLR=(math.pow(2,torch.round(math.log(GLR)/(math.log(2)))))
        learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)
        clipvector[{{counter+1, counter+weight_size}}]:fill(1)
        counter = counter+weight_size
        local bias_size = layer.bias:size(1)
        learningRates[{{counter+1, counter+bias_size}}]:fill(GLR)
        clipvector[{{counter+1, counter+bias_size}}]:fill(0)
        counter = counter+bias_size

  end
end
-- clip all parameter
clipvector:fill(1)
--
print(learningRates:eq(0):sum())
print(learningRates:ne(0):sum())
print(clipvector:ne(0):sum())
print(counter)
return {
     model = model,
     lrs = learningRates,
     clipV =clipvector,
  }


================================================
FILE: Models/BinaryNet_MNIST_Model.lua
================================================
--[[This code specify the model for MNIST dataset. This model uses the Shift based batch-normalization algorithm.
In this file we also secify the Glorot learning parameter and which of the learnable parameter we clip ]]
require 'nn'
require './BinaryLinear.lua'

require './BinarizedNeurons'
if opt.type=='cuda' then
  require 'cunn'
  require 'cudnn'
end

local BatchNormalization;
if opt.SBN == true then
  require './BatchNormalizationShiftPow2'
  BatchNormalization = BatchNormalizationShiftPow2
else
  BatchNormalization = nn.BatchNormalization
end

local model = nn.Sequential()
local numHid =2048
-- Convolution Layers
model:add(nn.View(-1,784))

model:add(BinaryLinear(784,numHid))
model:add(BatchNormalization(numHid, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))
model:add(BinaryLinear(numHid,numHid,opt.stcWeights))
model:add(BatchNormalization(numHid, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))
model:add(BinaryLinear(numHid,numHid,opt.stcWeights))
model:add(BatchNormalization(numHid, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))
model:add(BinaryLinear(numHid,10,opt.stcWeights))
model:add(nn.BatchNormalization(10))


local dE, param = model:getParameters()
local weight_size = dE:size(1)
local learningRates = torch.Tensor(weight_size):fill(0)
local clipvector = torch.Tensor(weight_size):fill(0)

local counter = 0
for i, layer in ipairs(model.modules) do
   if layer.__typename == 'BinaryLinear' then
      local weight_size = layer.weight:size(1)*layer.weight:size(2)
      local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]+size_w[2]))
      learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)
      clipvector[{{counter+1, counter+weight_size}}]:fill(1)
      counter = counter+weight_size
      local bias_size = layer.bias:size(1)
      learningRates[{{counter+1, counter+bias_size}}]:fill(GLR)
      clipvector[{{counter+1, counter+bias_size}}]:fill(0)
      counter = counter+bias_size
    elseif layer.__typename == 'BatchNormalizationShiftPow2' then
        local weight_size = layer.weight:size(1)
        local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]))
        learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)
        clipvector[{{counter+1, counter+weight_size}}]:fill(0)
        counter = counter+weight_size
        local bias_size = layer.bias:size(1)
        learningRates[{{counter+1, counter+bias_size}}]:fill(1)
        clipvector[{{counter+1, counter+bias_size}}]:fill(0)
        counter = counter+bias_size
    elseif layer.__typename == 'nn.BatchNormalization' then
      local weight_size = layer.weight:size(1)
      local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]))
      learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)
      clipvector[{{counter+1, counter+weight_size}}]:fill(0)
      counter = counter+weight_size
      local bias_size = layer.bias:size(1)
      learningRates[{{counter+1, counter+bias_size}}]:fill(1)
      clipvector[{{counter+1, counter+bias_size}}]:fill(0)
      counter = counter+bias_size
  end
end
print(learningRates:eq(0):sum())
print(learningRates:ne(0):sum())
print(counter)

return {
   model = model,
   lrs = learningRates,
   clipV =clipvector,
}


================================================
FILE: Models/BinaryNet_SVHN_Model.lua
================================================
--[[This code specify the model for SVHN dataset. This model uses the Shift based batch-normalization algorithm.
In this file we also secify the Glorot learning parameter and which of the learnable parameter we clip ]]
require 'nn'
require './BinaryLinear.lua'
require './BinarizedNeurons'

local SpatialConvolution
if opt.type =='cuda' then
  require 'cunn'
  require 'cudnn'
  require './cudnnBinarySpatialConvolution.lua'
  SpatialConvolution = cudnnBinarySpatialConvolution
else
  require './BinarySpatialConvolution.lua'
  SpatialConvolution = BinarySpatialConvolution
end
if opt.SBN == true then
  require './BatchNormalizationShiftPow2.lua'
  require './SpatialBatchNormalizationShiftPow2.lua'
  BatchNormalization = BatchNormalizationShiftPow2
  SpatialBatchNormalization = SpatialBatchNormalizationShiftPow2
else
  BatchNormalization = nn.BatchNormalization
  SpatialBatchNormalization = nn.SpatialBatchNormalization
end


numHid=1024;
local model = nn.Sequential()

-- Convolution Layers
model:add(SpatialConvolution(3, 64, 3, 3 ,1,1,1,1,opt.stcWeights ))
model:add(SpatialBatchNormalization(64, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))

model:add(SpatialConvolution(64, 64, 3, 3,1,1,1,1,opt.stcWeights ))
model:add(cudnn.SpatialMaxPooling(2, 2))
model:add(SpatialBatchNormalization(64, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))


model:add(SpatialConvolution(64, 128, 3, 3 ,1,1,1,1,opt.stcWeights ))
model:add(SpatialBatchNormalization(128, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))

model:add(SpatialConvolution(128, 128, 3, 3 ,1,1,1,1,opt.stcWeights ))
model:add(cudnn.SpatialMaxPooling(2, 2))
model:add(SpatialBatchNormalization(128, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))


model:add(SpatialConvolution(128, 256, 3, 3,1,1,1,1,opt.stcWeights ))
model:add(SpatialBatchNormalization(256, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))

model:add(SpatialConvolution(256, 256, 3, 3,1,1,1,1,opt.stcWeights ))
model:add(cudnn.SpatialMaxPooling(2, 2))
model:add(SpatialBatchNormalization(256, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))

model:add(nn.View(256*4*4))

model:add(BinaryLinear(256*4*4,numHid,opt.stcWeights))
model:add(BatchNormalization(numHid, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))

model:add(BinaryLinear(numHid,numHid,opt.stcWeights))
model:add(BatchNormalization(numHid, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))


model:add(BinaryLinear(numHid,10,opt.stcWeights))
model:add(nn.BatchNormalization(10))

local dE, param = model:getParameters()
local weight_size = dE:size(1)
local learningRates = torch.Tensor(weight_size):fill(0)
local clipvector = torch.Tensor(weight_size):fill(0)
local counter = 0
for i, layer in ipairs(model.modules) do
   if layer.__typename == 'BinaryLinear' then
      local weight_size = layer.weight:size(1)*layer.weight:size(2)
      local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]+size_w[2]))
      GLR=(math.pow(2,torch.round(math.log(GLR)/(math.log(2)))))
      learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)
      clipvector[{{counter+1, counter+weight_size}}]:fill(1)
      counter = counter+weight_size
      local bias_size = layer.bias:size(1)
      learningRates[{{counter+1, counter+bias_size}}]:fill(GLR)
      clipvector[{{counter+1, counter+bias_size}}]:fill(0)
      counter = counter+bias_size
    elseif layer.__typename == 'BatchNormalizationShiftPow2' then
        local weight_size = layer.weight:size(1)
        local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]))
        learningRates[{{counter+1, counter+weight_size}}]:fill(1)
        clipvector[{{counter+1, counter+weight_size}}]:fill(0)
        counter = counter+weight_size
        local bias_size = layer.bias:size(1)
        learningRates[{{counter+1, counter+bias_size}}]:fill(1)
        clipvector[{{counter+1, counter+bias_size}}]:fill(0)
        counter = counter+bias_size
    elseif layer.__typename == 'nn.BatchNormalization' then
      local weight_size = layer.weight:size(1)
      local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]))
      learningRates[{{counter+1, counter+weight_size}}]:fill(1)
      clipvector[{{counter+1, counter+weight_size}}]:fill(0)
      counter = counter+weight_size
      local bias_size = layer.bias:size(1)
      learningRates[{{counter+1, counter+bias_size}}]:fill(1)
      clipvector[{{counter+1, counter+bias_size}}]:fill(0)
      counter = counter+bias_size
    elseif layer.__typename == 'nn.SpatialBatchNormalization' then
        local weight_size = layer.weight:size(1)
        local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]))
        learningRates[{{counter+1, counter+weight_size}}]:fill(1)
        clipvector[{{counter+1, counter+weight_size}}]:fill(0)
        counter = counter+weight_size
        local bias_size = layer.bias:size(1)
        learningRates[{{counter+1, counter+bias_size}}]:fill(1)
        clipvector[{{counter+1, counter+bias_size}}]:fill(0)
        counter = counter+bias_size
    elseif layer.__typename == 'nn.SpatialBatchNormalization' then
                local weight_size = layer.weight:size(1)
                local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]))
                learningRates[{{counter+1, counter+weight_size}}]:fill(1)
                clipvector[{{counter+1, counter+weight_size}}]:fill(0)
                counter = counter+weight_size
                local bias_size = layer.bias:size(1)
                learningRates[{{counter+1, counter+bias_size}}]:fill(1)
                clipvector[{{counter+1, counter+bias_size}}]:fill(0)
                counter = counter+bias_size
    elseif layer.__typename == 'SpatialBatchNormalizationShiftPow2' then
        local weight_size = layer.weight:size(1)
        local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]))
        learningRates[{{counter+1, counter+weight_size}}]:fill(1)
        clipvector[{{counter+1, counter+weight_size}}]:fill(0)
        counter = counter+weight_size
        local bias_size = layer.bias:size(1)
        learningRates[{{counter+1, counter+bias_size}}]:fill(1)
        clipvector[{{counter+1, counter+bias_size}}]:fill(0)
        counter = counter+bias_size
    elseif layer.__typename == 'cudnnBinarySpatialConvolution' then
      local size_w=layer.weight:size();
      local weight_size = size_w[1]*size_w[2]*size_w[3]*size_w[4]

      local filter_size=size_w[3]*size_w[4]
      GLR=1/torch.sqrt(1.5/(size_w[1]*filter_size+size_w[2]*filter_size))
      GLR=(math.pow(2,torch.round(math.log(GLR)/(math.log(2)))))
      learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)
      clipvector[{{counter+1, counter+weight_size}}]:fill(1)
      counter = counter+weight_size
      local bias_size = layer.bias:size(1)
      learningRates[{{counter+1, counter+bias_size}}]:fill(GLR)
      clipvector[{{counter+1, counter+bias_size}}]:fill(0)
      counter = counter+bias_size
    elseif layer.__typename == 'BinarySpatialConvolution' then
        local size_w=layer.weight:size();
        local weight_size = size_w[1]*size_w[2]*size_w[3]*size_w[4]

        local filter_size=size_w[3]*size_w[4]
        GLR=1/torch.sqrt(1.5/(size_w[1]*filter_size+size_w[2]*filter_size))
        GLR=(math.pow(2,torch.round(math.log(GLR)/(math.log(2)))))
        learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)
        clipvector[{{counter+1, counter+weight_size}}]:fill(1)
        counter = counter+weight_size
        local bias_size = layer.bias:size(1)
        learningRates[{{counter+1, counter+bias_size}}]:fill(GLR)
        clipvector[{{counter+1, counter+bias_size}}]:fill(0)
        counter = counter+bias_size

  end
end

print(learningRates:eq(0):sum())
print(learningRates:ne(0):sum())
print(clipvector:ne(0):sum())
print(counter)
return {
     model = model,
     lrs = learningRates,
     clipV =clipvector,
  }


================================================
FILE: Models/BinarySpatialConvolution.lua
================================================
local BinarySpatialConvolution, parent = torch.class('BinarySpatialConvolution', 'nn.SpatialConvolution')

function BinarySpatialConvolution:__init(nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH)
  local delayedReset = self.reset
  self.reset = function() end
  parent.__init(self, nInputPlane, nOutputPlane, kW, kH, dW, dH)
  self.reset = delayedReset
  self.padW = padW or 0
  self.padH = padH or 0
  self.stcWeights = stcWeights or false
  self.groups = groups or 1
  assert(nInputPlane % self.groups == 0,
         'nInputPlane should be divisible by nGroups')
  assert(nOutputPlane % self.groups == 0,
         'nOutputPlane should be divisible by nGroups')
  self.weight = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)
  self.weightB = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)
  self.weightOrg = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)
  self.randmat = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)
  self.maskStc = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)
  self:reset()
  -- should nil for serialization, the reset will still work
  self.reset = nil
  self.iSize = torch.LongStorage(4):fill(0)


end

function BinarySpatialConvolution:reset(stdv)
  if stdv then
     stdv = stdv * math.sqrt(3)
  else
     stdv = 1/math.sqrt(self.kW*self.kH*self.nInputPlane)
  end
  if nn.oldSeed then
     self.weight:apply(function()
        return torch.uniform(-1, 1)
     end)
     if self.bias then
        self.bias:apply(function()
        return torch.uniform(-stdv, stdv)
        end)
     end
  else
     self.weight:uniform(-1, 1)
     if self.bias then
        self.bias:uniform(-stdv, stdv)
     end
  end
end

function BinarySpatialConvolution:binarized(trainFlag)
  self.weightOrg:copy(self.weight)
  self.binaryFlag = true
  if not self.binaryFlag then
    self.weight:copy(self.weightOrg)
  else
    self.weightB:copy(self.weight):add(1):div(2):clamp(0,1)

    if not self.stcWeights or not trainFlag then
      self.weightB:round():mul(2):add(-1)
    else
      self.maskStc=self.weightB-self.randmat:rand(self.randmat:size())
      self.weightB:copy(self.maskStc)

    end
  end

  return  self.weightB
end

local function backCompatibility(self)
   self.finput = self.finput or self.weight.new()
   self.fgradInput = self.fgradInput or self.weight.new()
   if self.padding then
      self.padW = self.padding
      self.padH = self.padding
      self.padding = nil
   else
      self.padW = self.padW or 0
      self.padH = self.padH or 0
   end
   if self.weight:dim() == 2 then
      self.weight = self.weight:view(self.nOutputPlane, self.nInputPlane, self.kH, self.kW)
   end
   if self.gradWeight and self.gradWeight:dim() == 2 then
      self.gradWeight = self.gradWeight:view(self.nOutputPlane, self.nInputPlane, self.kH, self.kW)
   end
end

local function makeContiguous(self, input, gradOutput)
  if not input:isContiguous() then
    self._input = self._input or input.new()
    self._input:resizeAs(input):copy(input)
    input = self._input
 end
 if gradOutput then
    if not gradOutput:isContiguous() then
 self._gradOutput = self._gradOutput or gradOutput.new()
 self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
 gradOutput = self._gradOutput
    end
 end
 return input, gradOutput
end

-- function to re-view the weight layout in a way that would make the MM ops happy
local function viewWeight(self)
   self.weight = self.weight:view(self.nOutputPlane, self.nInputPlane * self.kH * self.kW)
   if self.gradWeight and self.gradWeight:dim() > 0 then
      self.gradWeight = self.gradWeight:view(self.nOutputPlane, self.nInputPlane * self.kH * self.kW)
   end
end

local function unviewWeight(self)
   self.weight = self.weight:view(self.nOutputPlane, self.nInputPlane, self.kH, self.kW)
   if self.gradWeight and self.gradWeight:dim() > 0 then
      self.gradWeight = self.gradWeight:view(self.nOutputPlane, self.nInputPlane, self.kH, self.kW)
   end
end

function BinarySpatialConvolution:updateOutput(input)
   backCompatibility(self)
   viewWeight(self)
   input = makeContiguous(self, input)
   self.weightB = self:binarized(self.train)
   self.weight:copy(self.weightB)
   input.THNN.SpatialConvolutionMM_updateOutput(
      input:cdata(),
      self.output:cdata(),
      self.weight:cdata(),
      self.bias:cdata(),
      self.finput:cdata(),
      self.fgradInput:cdata(),
      self.kW, self.kH,
      self.dW, self.dH,
      self.padW, self.padH
   )
   self.weight:copy(self.weightOrg)
   unviewWeight(self)
   return self.output
end

function BinarySpatialConvolution:updateGradInput(input, gradOutput)
   if self.gradInput then
      backCompatibility(self)
      viewWeight(self)
      input, gradOutput = makeContiguous(self, input, gradOutput)
      self.weight:copy(self.weightB)
      input.THNN.SpatialConvolutionMM_updateGradInput(
         input:cdata(),
         gradOutput:cdata(),
         self.gradInput:cdata(),
         self.weight:cdata(),
         -- self.bias:cdata(), -- removed from this commit https://github.com/torch/nn/commit/651103f3aabc2dd154d6bd95ad565d14009255e6
         self.finput:cdata(),
         self.fgradInput:cdata(),
         self.kW, self.kH,
         self.dW, self.dH,
         self.padW, self.padH
      )
      self.weight:copy(self.weightOrg)
      unviewWeight(self)
      return self.gradInput
   end
end

function BinarySpatialConvolution:accGradParameters(input, gradOutput, scale)
  scale = scale or 1
  backCompatibility(self)
  input, gradOutput = makeContiguous(self, input, gradOutput)
  viewWeight(self)
  input.THNN.SpatialConvolutionMM_accGradParameters(
     input:cdata(),
     gradOutput:cdata(),
     self.gradWeight:cdata(),
     self.gradBias:cdata(),
     self.finput:cdata(),
     self.fgradInput:cdata(),
     self.kW, self.kH,
     self.dW, self.dH,
     self.padW, self.padH,
     scale
  )
  unviewWeight(self)
end

function BinarySpatialConvolution:type(type,tensorCache)
   self.finput = self.finput and torch.Tensor()
   self.fgradInput = self.fgradInput and torch.Tensor()
   return parent.type(self,type,tensorCache)
end

function BinarySpatialConvolution:__tostring__()
   return parent.__tostring__(self)
end

function BinarySpatialConvolution:clearState()
   nn.utils.clear(self, 'finput', 'fgradInput', '_input', '_gradOutput')
   return parent.clearState(self)
end


================================================
FILE: Models/SpatialBatchNormalizationShiftPow2.lua
================================================
--[[
   This file implements Shift based Batch Normalization based a variant of the vanilla BN as described in the paper:
   "Binarized Neural Networks: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Matthieu Courbariaux, Itay Hubara, Daniel Soudry, Ran El-Yaniv, Yoshua Bengio'

   The code is based on nn library
   --]]
local SpatialBatchNormalizationShiftPow2,parent = torch.class('SpatialBatchNormalizationShiftPow2', 'nn.Module')

function SpatialBatchNormalizationShiftPow2:__init(nFeature, runningVal, eps, momentum)
   parent.__init(self)
   assert(nFeature and type(nFeature) == 'number',
          'Missing argument #1: Number of feature planes. ' ..
          'Give 0 for no affine transform')
   self.eps = eps or 1e-5
   self.train = true
   self.momentum = momentum or 0.125
   self.runningVal = runningVal or true
   self.running_mean = torch.Tensor()
   self.running_std = torch.Tensor()
   self.running_std_ap2 = torch.Tensor()
   if nFeature > 0 then self.affine = true end

   if self.affine then
      self.weight = torch.Tensor(nFeature)
      self.weightSign = torch.Tensor(nFeature)
      self.weight_ap2 = torch.Tensor(nFeature)
      self.bias = torch.Tensor(nFeature)
      self.gradWeight = torch.Tensor(nFeature)
      self.gradBias = torch.Tensor(nFeature)
      self:reset()
   end
end

function SpatialBatchNormalizationShiftPow2:reset()
   self.weight:fill(1)
   self.bias:zero()
end

function SpatialBatchNormalizationShiftPow2:updateOutput(input)
   assert(input:dim() == 4, 'only mini-batch supported (4D tensor), got '
             .. input:dim() .. 'D tensor instead')
   local nBatch = input:size(1)
   local nFeature = input:size(2)
   local iH = input:size(3)
   local iW = input:size(4)

   -- buffers that are reused
   self.buffer = self.buffer or input.new()
   self.buffer2 = self.buffer2 or input.new()
   self.centered = self.centered or input.new()
   self.centered:resizeAs(input)
   self.centeredOrg = self.centeredOrg or input.new()
   self.centeredOrg:resizeAs(input)
   self.centeredSign = self.centeredSign or input.new()
   self.centeredSign:resizeAs(input)
   self.std = self.std or input.new()
   self.normalized = self.normalized or input.new()
   self.normalized:resizeAs(input)
   self.normalizedSign = self.normalizedSign or input.new()
   self.normalizedSign:resizeAs(input)
   self.output:resizeAs(input)
   self.gradInput:resizeAs(input)
   if self.train == false and self.runningVal == true then
      assert(self.running_mean:nDimension() ~= 0,
             'Module never run on training data. First run on some training data before evaluating.')
      self.output:copy(input)
      self.buffer:repeatTensor(self.running_mean:view(1, nFeature, 1, 1), nBatch, 1, iH, iW)
      self.output:add(-1, self.buffer)
      self.running_std_ap2:copy(torch.pow(2,torch.round(torch.log(self.running_std):div(math.log(2)))))
      self.buffer:repeatTensor(self.running_std_ap2:view(1, nFeature, 1, 1), nBatch, 1, iH, iW)
      self.output:cmul(self.buffer)
   else -- training mode
      if self.running_mean:nDimension() == 0 then
         self.running_mean:resize(nFeature):zero()
      end
      if self.running_std:nDimension() == 0 then
         self.running_std:resize(nFeature):zero()
         self.running_std_ap2:resize(nFeature):zero()
      end
      -- calculate mean over mini-batch, over feature-maps
      local in_folded = input:view(nBatch, nFeature, iH * iW)
      self.buffer:mean(in_folded, 1)
      self.buffer2:mean(self.buffer, 3)
      self.running_mean:mul(1 - self.momentum):add(self.momentum, self.buffer2) -- add to running mean
      self.buffer:repeatTensor(self.buffer2:view(1, nFeature, 1, 1),
                               nBatch, 1, iH, iW)

      -- subtract mean
      self.centered:add(input, -1, self.buffer)                  -- x - E(x)
      self.centeredOrg:copy(self.centered)
      self.centeredSign:copy(self.centered)

      self.centeredSign:sign()
      self.centered:copy(torch.pow(2,torch.round(torch.log(self.centered:abs()):div(math.log(2))))):cmul(self.centeredSign)
      -- calculate standard deviation over mini-batch

      self.buffer:copy(self.centered):cmul(self.centeredOrg) --:abs()
      -- calculate standard deviation over mini-batch

      local buf_folded = self.buffer:view(nBatch,nFeature,iH*iW)
      self.std:mean(self.buffer2:mean(buf_folded, 1), 3)
      self.std:add(self.eps):sqrt():pow(-1)      -- 1 / E([x - E(x)]^2)
      self.running_std:mul(1 - self.momentum):add(self.momentum, self.std) -- add to running stdv
      self.std:copy(torch.pow(2,torch.round(torch.log(self.std):div(math.log(2)))))


      self.buffer:repeatTensor(self.std:view(1, nFeature, 1, 1),
                               nBatch, 1, iH, iW)

      -- divide standard-deviation + eps
      self.output:cmul(self.centeredOrg, self.buffer)
      self.normalized:copy(self.output)
      self.normalizedSign:copy(self.normalized)
      self.normalizedSign:sign()
      self.normalized:copy(torch.pow(2,torch.round(torch.log(self.normalized:abs()):div(math.log(2)))):cmul(self.normalizedSign))
    --  self.normalized[self.normalized:lt(0)]=1; -- Can improve results
   end

   if self.affine then
      -- multiply with gamma and add beta
      self.weight_ap2:copy(self.weight)
      self.weightSign:copy(self.weight):sign()
      self.weight_ap2:copy(torch.pow(2,torch.round(torch.log(self.weight:clone():abs()):div(math.log(2))))):cmul(self.weightSign)
      --self.weight:fill(1) --Almost similar results
      self.buffer:repeatTensor(self.weight_ap2:view(1, nFeature, 1, 1),nBatch, 1, iH, iW)
      self.output:cmul(self.buffer)
      self.buffer:repeatTensor(self.bias:view(1, nFeature, 1, 1),
                               nBatch, 1, iH, iW)
      self.output:add(self.buffer)
   end

   return self.output
end

function SpatialBatchNormalizationShiftPow2:updateGradInput(input, gradOutput)
   assert(input:dim() == 4, 'only mini-batch supported')
   assert(gradOutput:dim() == 4, 'only mini-batch supported')
   assert(self.train == true, 'should be in training mode when self.train is true')
   local nBatch = input:size(1)
   local nFeature = input:size(2)
   local iH = input:size(3)
   local iW = input:size(4)

   self.gradInput:cmul(self.centered, gradOutput)
   local gi_folded = self.gradInput:view(nBatch, nFeature, iH * iW)
   self.buffer2:mean(self.buffer:mean(gi_folded, 1), 3)
   self.gradInput:repeatTensor(self.buffer2:view(1, nFeature, 1, 1),
                               nBatch, 1, iH, iW)
   self.gradInput:cmul(self.centered):mul(-1)
   self.buffer:repeatTensor(self.std:view(1, nFeature, 1, 1),
                            nBatch, 1, iH, iW)
   self.gradInput:cmul(self.buffer):cmul(self.buffer)

   self.buffer:mean(gradOutput:view(nBatch, nFeature, iH*iW), 1)
   self.buffer2:mean(self.buffer, 3)
   self.buffer:repeatTensor(self.buffer2:view(1, nFeature, 1, 1),
                            nBatch, 1, iH, iW)
   self.gradInput:add(gradOutput):add(-1, self.buffer)
   self.buffer:repeatTensor(self.std:view(1, nFeature, 1, 1),
                            nBatch, 1, iH, iW)
   self.gradInput:cmul(self.buffer)

   if self.affine then
      self.buffer:repeatTensor(self.weight_ap2:view(1, nFeature, 1, 1),
                               nBatch, 1, iH, iW)
      self.gradInput:cmul(self.buffer)
   end

   return self.gradInput
end

function SpatialBatchNormalizationShiftPow2:accGradParameters(input, gradOutput, scale)
   if self.affine then
      scale = scale or 1.0
      local nBatch = input:size(1)
      local nFeature = input:size(2)
      local iH = input:size(3)
      local iW = input:size(4)
      self.buffer2:resizeAs(self.normalized):copy(self.normalized)
      self.buffer2 = self.buffer2:cmul(gradOutput):view(nBatch, nFeature, iH*iW)
      self.buffer:sum(self.buffer2, 1) -- sum over mini-batch
      self.buffer2:sum(self.buffer, 3) -- sum over pixels
      self.gradWeight:add(scale, self.buffer2)

      self.buffer:sum(gradOutput:view(nBatch, nFeature, iH*iW), 1)
      self.buffer2:sum(self.buffer, 3)
      self.gradBias:add(scale, self.buffer2) -- sum over mini-batch
   end
end


================================================
FILE: Models/cudnnBinarySpatialConvolution.lua
================================================
local cudnnBinarySpatialConvolution, parent =
    torch.class('cudnnBinarySpatialConvolution', 'cudnn.SpatialConvolution')
local ffi = require 'ffi'
local errcheck = cudnn.errcheck

local autotunerCache = {}
autotunerCache[1] = {} -- forward
autotunerCache[2] = {} -- backwardFilter
autotunerCache[3] = {} -- backwardData

function cudnnBinarySpatialConvolution:__init(nInputPlane, nOutputPlane,
                            kW, kH, dW, dH, padW, padH,stcWeights, groups)
    local delayedReset = self.reset
    self.reset = function() end
    parent.__init(self, nInputPlane, nOutputPlane, kW, kH, dW, dH)
    self.reset = delayedReset
    self.padW = padW or 0
    self.padH = padH or 0
    self.groups = groups or 1
    self.stcWeights = stcWeights or false
    assert(nInputPlane % self.groups == 0,
           'nInputPlane should be divisible by nGroups')
    assert(nOutputPlane % self.groups == 0,
           'nOutputPlane should be divisible by nGroups')
    self.weight = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kH, kW)
    self.weightB = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)
    self.weightOrg = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)
    self.randmat = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)
    self.maskStc = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)
    self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kH, kW)
    self:reset()
    -- should nil for serialization, the reset will still work
    self.reset = nil
end

function cudnnBinarySpatialConvolution:binarized(trainFlag)
  self.weightOrg:copy(self.weight)
  self.binaryFlag = true
  if not self.binaryFlag then
    self.weight:copy(self.weightOrg)
  else
    self.weightB:copy(self.weight):add(1):div(2):clamp(0,1)

    if not self.stcWeights or not trainFlag then
      self.weightB:round():mul(2):add(-1)
      --print(self.weightB)
    else
      self.maskStc=self.weightB-self.randmat:rand(self.randmat:size())
      self.weightB:copy(self.maskStc)

    end
  end

  return  self.weightB
end

-- if you change the configuration of the module manually, call this
function cudnnBinarySpatialConvolution:resetWeightDescriptors()
    assert(torch.typename(self.weight) == 'torch.CudaTensor',
           'Only Cuda supported duh!')
    assert(torch.typename(self.bias) == 'torch.CudaTensor' or not self.bias,
           'Only Cuda supported duh!')
    -- for compatibility
    self.groups = self.groups or 1
    -- create filterDescriptor for weight
    self.weightDesc = ffi.new('struct cudnnFilterStruct*[1]')
    errcheck('cudnnCreateFilterDescriptor', self.weightDesc)
    local desc = torch.IntTensor({self.nOutputPlane/self.groups,
                              self.nInputPlane/self.groups,
                              self.kH, self.kW})
    errcheck('cudnnSetFilterNdDescriptor', self.weightDesc[0],
             'CUDNN_DATA_FLOAT', 'CUDNN_TENSOR_NCHW', 4,
             desc:data());
    local function destroyWDesc(d)
        errcheck('cudnnDestroyFilterDescriptor', d[0]);
    end
    ffi.gc(self.weightDesc, destroyWDesc)

    -- create descriptor for bias
    if self.bias then
        self.biasDesc = cudnn.toDescriptor(self.bias:view(1, self.nOutputPlane,1,1))
    end
end

function cudnnBinarySpatialConvolution:fastest(mode)
    if mode == nil then mode = true end
    self.fastest_mode = mode
    self.iSize = self.iSize or torch.LongStorage(4)
    self.iSize:fill(0)
    return self
end

function cudnnBinarySpatialConvolution:setMode(fmode, bdmode, bwmode)
    if fmode ~= nil then
        self.fmode = fmode
    end
    if bdmode ~= nil then
        self.bdmode = bdmode
    end
    if bwmode ~= nil then
        self.bwmode = bwmode
    end
    self.iSize = self.iSize or torch.LongStorage(4)
    self.iSize:fill(0)
    return self
end

function cudnnBinarySpatialConvolution:resetMode()
    self.fmode = nil
    self.bdmode = nil
    self.bwmode = nil
    return self
end

function cudnnBinarySpatialConvolution:noBias()
   self.bias = nil
   self.gradBias = nil
   return self
end

function cudnnBinarySpatialConvolution:createIODescriptors(input)
    parent.createIODescriptors(self,input)
end

local one = torch.FloatTensor({1});
local zero = torch.FloatTensor({0});

local function makeContiguous(self, input, gradOutput)
   if not input:isContiguous() then
      self._input = self._input or input.new()
      self._input:typeAs(input):resizeAs(input):copy(input)
      input = self._input
   end
   if gradOutput and not gradOutput:isContiguous() then
      self._gradOutput = self._gradOutput or gradOutput.new()
      self._gradOutput:typeAs(gradOutput):resizeAs(gradOutput):copy(gradOutput)
      gradOutput = self._gradOutput
   end
   return input, gradOutput
end

function cudnnBinarySpatialConvolution:updateOutput(input)
    self.weightOrg:copy(self.weight)
    self.weightB = self:binarized(self.train)
    self.weight:copy(self.weightB)
    parent.updateOutput(self,input)
    self.weight:copy(self.weightOrg)
    return self.output
end

function cudnnBinarySpatialConvolution:updateGradInput(input, gradOutput)
    if not self.gradInput then return end
    self.weight:copy(self.weightB)
    parent.updateGradInput(self, input, gradOutput:contiguous(), scale)
    self.weight:copy(self.weightOrg)
    return self.gradInput
end

function cudnnBinarySpatialConvolution:accGradParameters(input, gradOutput, scale)
    parent.accGradParameters(self, input, gradOutput:contiguous(), scale)
end

function cudnnBinarySpatialConvolution:clearDesc()
    self.weightDesc = nil
    self.biasDesc = nil
    self.convDesc = nil
    self.iDesc = nil
    self.oDesc = nil
    self.oDescForBias = nil
    self.algType = nil
    self.fwdAlgType = nil
    self.bwdDataAlgType = nil
    self.bwdFilterAlgType = nil
    self.extraBuffer = nil
    self.extraBufferSizeInBytes = nil
    self.scaleT = nil
end

function cudnnBinarySpatialConvolution:write(f)
    self:clearDesc()
    local var = {}
    for k,v in pairs(self) do
        var[k] = v
    end
    f:writeObject(var)
end

function cudnnBinarySpatialConvolution:clearState()
   self:clearDesc()
   return nn.Module.clearState(self)
end


================================================
FILE: README.md
================================================
Deep Networks on classification tasks using Torch
=================================================
This is a complete training example for BinaryNets using Binary-Backpropagation algorithm as explained in
"Binarized Neural Networks: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Matthieu Courbariaux, Itay Hubara, Daniel Soudry, Ran El-Yaniv, Yoshua Bengio'
on following datasets: Cifar10/100, SVHN, MNIST

## Data
We use dp library to extract all the data please view installation section

## Dependencies
* Torch (http://torch.ch)
* "DataProvider.torch" (https://github.com/eladhoffer/DataProvider.torch) for DataProvider class.
* "cudnn.torch" (https://github.com/soumith/cudnn.torch) for faster training. Can be avoided by changing "cudnn" to "nn" in models.
* "dp" (https://github.com/nicholas-leonard/dp.git) for data extraction
* "unsup" (https://github.com/koraykv/unsup.git) for data pre-processing

To install all dependencies (assuming torch is installed) use:
```bash
luarocks install https://raw.githubusercontent.com/eladhoffer/DataProvider.torch/master/dataprovider-scm-1.rockspec
luarocks install cudnn
luarocks install dp
luarocks install unsup
```

## Training
Create pre-processing folder:
```lua
cd BinaryNet
mkdir PreProcData
```

Start training using:
```lua
th Main_BinaryNet_Cifar10.lua -network BinaryNet_Cifar10_Model
```

or,

```lua
th Main_BinaryNet_MNIST.lua -network BinaryNet_MNIST_Model
```

## Run with Docker
The Docker is built from `nvidia/cuda:8.0-cudnn5-devel` with Torch commit `0219027e6c4644a0ba5c5bf137c989a0a8c9e01b`

- To build image, run: `docker build -t binarynet:torch-gpu-cuda-8.0 -f Dockerfile/binarynet-torch-gpu-cuda-8.0 .` or to pull docker image: `docker pull hychiang/binarynet:torch-gpu-cuda-8.0`

- To launch image with gpu, run: `docker run -it --gpus all binarynet:torch-gpu-cuda-8.0`

- To train BNN with Cifar10: `th Main_BinaryNet_Cifar10.lua -network BinaryNet_Cifar10_Model`


## Additional flags
|Flag             | Default Value        |Description
|:----------------|:--------------------:|:----------------------------------------------
|modelsFolder     |  ./Models/           | Models Folder
|network          |  Model.lua           | Model file - must return valid network.
|LR               |  0.1                 | learning rate
|LRDecay          |  0                   | learning rate decay (in # samples
|weightDecay      |  1e-4                | L2 penalty on the weights
|momentum         |  0.9                 | momentum
|batchSize        |  128                 | batch size
|stcNeurons       |  true                | using stochastic binarization for the neurons or not
|stcWeights       |  false               | using stochastic binarization for the weights or not
|optimization     |  adam                | optimization method
|SBN              |  true                | use shift based batch-normalization or not
|runningVal       |  true                | use running mean and std or not
|epoch            |  -1                  | number of epochs to train (-1 for unbounded)
|threads          |  8                   | number of threads
|type             |  cuda                | float or cuda
|devid            |  1                   | device ID (if using CUDA)
|load             |  none                |  load existing net weights
|save             |  time-identifier     | save directory
|dataset          |  Cifar10             | Dataset - Cifar10, Cifar100, STL10, SVHN, MNIST
|dp_prepro        |  false               | preprocessing using dp lib
|whiten           |  false               | whiten data
|augment          |  false               | Augment training data
|preProcDir       |  ./PreProcData/      | Data for pre-processing (means,Pinv,P)


================================================
FILE: SqrHingeEmbeddingCriterion.lua
================================================
--[[
This Function implement the squared hinge loss criterion
]]
local SqrtHingeEmbeddingCriterion, parent = torch.class('SqrtHingeEmbeddingCriterion', 'nn.Criterion')

function SqrtHingeEmbeddingCriterion:__init(margin)
   parent.__init(self)
   self.margin = margin or 1
   self.sizeAverage = true
end

function SqrtHingeEmbeddingCriterion:updateOutput(input,y)
   self.buffer = self.buffer or input.new()
   if not torch.isTensor(y) then
      self.ty = self.ty or input.new():resize(1)
      self.ty[1]=y
      y=self.ty
   end

   self.buffer:resizeAs(input):copy(input)
   self.buffer:cmul(y):mul(-1):add(self.margin)
   self.buffer[torch.le(self.buffer ,0)]=0
   self.output=self.buffer:clone():pow(2):sum()

   if (self.sizeAverage == nil or self.sizeAverage == true) then
      self.output = self.output / input:nElement()
   end

   return self.output
end

function SqrtHingeEmbeddingCriterion:updateGradInput(input, y)
   if not torch.isTensor(y) then self.ty[1]=y; y=self.ty end
   self.gradInput:resizeAs(input):copy(y):mul(-2):cmul(self.buffer)
   self.gradInput[torch.cmul(y,input):gt(self.margin)] = 0
   if (self.sizeAverage == nil or self.sizeAverage == true) then
      self.gradInput:mul(1 / input:nElement())
   end
   return self.gradInput
end


================================================
FILE: adaMax_binary_clip_shift.lua
================================================
--[[ An implementation of Shift based AdaMax based on  http://arxiv.org/pdf/1412.6980.pdf as described the paper:
   "Binarized Neural Networks: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Matthieu Courbariaux, Itay Hubara, Daniel Soudry, Ran El-Yaniv, Yoshua Bengio'

Note that this function perform the weight cliping as well

ARGS:

- 'opfunc' : a function that takes a single input (X), the point
             of a evaluation, and returns f(X) and df/dX
- 'x'      : the initial point
- 'config` : a table with configuration parameters for the optimizer
- 'config.learningRate'      : learning rate
- 'config.beta1'             : first moment coefficient
- 'config.beta2'             : second moment coefficient
- 'config.epsilon'           : for numerical stability
- 'state'                    : a table describing the state of the optimizer; after each
                              call the state is modified

RETURN:
- `x`     : the new x vector
- `f(x)`  : the function, evaluated before the update

]]

function adaMax_binary_clip_shift(opfunc, x, config, state)
    -- (0) get/update state
    local config = config or {}
    local state = state or config
    local lr = config.learningRate or 0.002
    local GLRvec = config.GLRvec or 1
    local clipV = config.clipV or 0

    local beta1 = config.beta1 or 0.9
    local beta2 = config.beta2 or 0.999
    local epsilon = config.epsilon or 2^-27

    -- (1) evaluate f(x) and df/dx
    local fx, dfdx = opfunc(x)
    -- Initialization
    state.t = state.t or 0
    -- Exponential moving average of gradient values
    state.m = state.m or x.new(dfdx:size()):zero()
    -- Exponential moving average of squared gradient values
    state.v = state.v or x.new(dfdx:size()):zero()
    -- A tmp tensor to hold the sqrt(v) + epsilon
    state.denom = state.denom or x.new(dfdx:size()):zero()

    state.t = state.t + 1

    -- Decay the first and second moment running average coefficient
    state.m:mul(beta1):add(1-beta1, dfdx)
    state.v:copy( torch.cmax(state.v:mul(beta2),dfdx:abs()) )
    local biasCorrection1 = 1 - beta1^state.t

    local stepSize = lr/biasCorrection1 --math.sqrt(biasCorrection2)/biasCorrection1

    stepSize=math.pow(2,torch.round(math.log(stepSize)/(math.log(2))))
    -- (2) update x
    local tmp=torch.zeros(x:size())
    if opt.type == 'cuda' then
      tmp=tmp:cuda()
    end


    state.v:copy(torch.pow(2,torch.round(torch.log(state.v):div(math.log(2)))))
    state.v:add(epsilon)
    tmp:addcdiv(1, state.m, state.v)
    -- Multiply by Glorot learning rate vector
    x:addcmul(-stepSize, tmp, GLRvec)
    -- Clip to [-1,1]
    x[clipV:eq(1)]=x[clipV:eq(1)]:clamp(-1,1)
    -- return x*, f(x) before optimization
    return x, {fx}
end


================================================
FILE: adam_binary_clip_b.lua
================================================
--[[ An implementation of Adam http://arxiv.org/pdf/1412.6980.pdf

Note that this function perform the weight cliping as well

ARGS:

- 'opfunc' : a function that takes a single input (X), the point
             of a evaluation, and returns f(X) and df/dX
- 'x'      : the initial point
- 'config` : a table with configuration parameters for the optimizer
- 'config.learningRate'      : learning rate
- 'config.beta1'             : first moment coefficient
- 'config.beta2'             : second moment coefficient
- 'config.epsilon'           : for numerical stability
- 'state'                    : a table describing the state of the optimizer; after each
                              call the state is modified

RETURN:
- `x`     : the new x vector
- `f(x)`  : the function, evaluated before the update

]]

function adam_binary_clip_b(opfunc, x, config, state)
    -- (0) get/update state
    local config = config or {}
    local state = state or config
    local lr = config.learningRate or 0.001
    local GLRvec = config.GLRvec or 1

    local beta1 = config.beta1 or 0.9
    local beta2 = config.beta2 or 0.999
    local epsilon = config.epsilon or 1e-8

    -- (1) evaluate f(x) and df/dx
    local fx, dfdx = opfunc(x)
    --print(lr,dfdx:size())
    -- Initialization
    state.t = state.t or 0
    -- Exponential moving average of gradient values
    state.m = state.m or x.new(dfdx:size()):zero()
    -- Exponential moving average of squared gradient values
    state.v = state.v or x.new(dfdx:size()):zero()
    -- A tmp tensor to hold the sqrt(v) + epsilon
    state.denom = state.denom or x.new(dfdx:size()):zero()

    state.t = state.t + 1

    -- Decay the first and second moment running average coefficient
    state.m:mul(beta1):add(1-beta1, dfdx)
    state.v:mul(beta2):addcmul(1-beta2, dfdx, dfdx)

    state.denom:copy(state.v):sqrt():add(epsilon)

    local biasCorrection1 = 1 - beta1^state.t
    local biasCorrection2 = 1 - beta2^state.t
    local stepSize = lr * math.sqrt(biasCorrection2)/biasCorrection1
    -- (2) update x
    local tmp=torch.zeros(x:size())
    if opt.type == 'cuda' then
      tmp=tmp:cuda()
    end

    tmp:addcdiv(1, state.m, state.denom)
    x:addcmul(-stepSize, tmp, GLRvec)
    x[clipV:eq(1)]=x[clipV:eq(1)]:clamp(-1,1)

    return x, {fx}
end