[
  {
    "path": "Data.lua",
    "content": "--[[\nThis code create the training test and validation datasets and preform diffrent kinds of preprocessing\nThis code is based on elad hoffer Data.lua file from ConvNet-torch library (https://github.com/eladhoffer/ConvNet-torch.git) and uses:\n  - Elad Hoffer DataProvidor.torch library: https://github.com/eladhoffer/DataProvider.torch.git\n  - Nicholas Leonard dp library: https://github.com/nicholas-leonard/dp.git\n  - Koray Kavukcuoglu dp library: https://github.com/koraykv/unsup.git\n]]\nrequire 'dp'\nlocal DataProvider = require 'DataProvider'\nlocal opt = opt or {}\nlocal Dataset = opt.dataset or 'Cifar10'\nlocal PreProcDir = opt.preProcDir or './PreProcData/'\nlocal Whiten = opt.whiten or false\nlocal NormelizeWhiten = opt.NormelizeWhiten or false\nlocal DataPath = opt.datapath or '/home/itayh/Datasets/'\nlocal normalization = opt.normalization or 'simple'\nlocal format = opt.format or 'rgb'\nlocal TestData\nlocal TrainData\nlocal ValidData\nlocal Classes\n\nif Dataset =='Cifar100' then\n  local file_valid = paths.concat(PreProcDir, format .. 'whiten_valid.t7')\n  local file_train = paths.concat(PreProcDir, format .. 'whiten_train.t7')\n  local file_test = paths.concat(PreProcDir, format .. 
'whiten_test.t7')\n  if (paths.filep(file_valid) and paths.filep(file_train) and paths.filep(file_test)) then\n    ValidData=torch.load(file_valid)\n    TrainData=torch.load(file_train)\n    TestData=torch.load(file_test)\n  else\n    if paths.dirp(PreProcDir)==false then\n     sys.execute('mkdir PreProcData/Cifar100')\n    end\n    input_preprocess = {}\n    table.insert(input_preprocess, dp.ZCA())\n    ds = dp.Cifar100{scale={0,1}, valid_ratio=0.1,input_preprocess = input_preprocess}\n    ValidData = {data=ds:validSet():inputs():input():clone():float(), label=ds:validSet():targets():input():clone():byte() }\n    TrainData = {data=ds:trainSet():inputs():input():float(), label=ds:trainSet():targets():input():byte() }\n    TestData  = {data=ds:testSet():inputs():input():float() , label=ds:testSet():targets():input():byte()  }\n    collectgarbage()\n    torch.save(file_valid,ValidData)\n    torch.save(file_train,TrainData)\n    torch.save(file_test,TestData)\n  end\nelseif Dataset == 'Cifar10' then\n    local file_valid = paths.concat(PreProcDir, format .. 'whiten_valid.t7')\n    local file_train = paths.concat(PreProcDir, format .. 'whiten_train.t7')\n    local file_test = paths.concat(PreProcDir, format .. 
'whiten_test.t7')\n    if (paths.filep(file_valid) and paths.filep(file_train) and paths.filep(file_test)) then\n      ValidData=torch.load(file_valid)\n      TrainData=torch.load(file_train)\n      TestData=torch.load(file_test)\n    else\n      if paths.dirp(PreProcDir)==false then\n       sys.execute('mkdir PreProcData/Cifar10')\n      end\n      input_preprocess = {}\n      table.insert(input_preprocess, dp.ZCA())\n      ds = dp.Cifar10{scale={0,1},valid_ratio=0.1,input_preprocess = input_preprocess} --,input_preprocess = input_preprocess}  scale={0,1},\n      ValidData = {data=ds:validSet():inputs():input():float(), label=ds:validSet():targets():input():clone():byte() }\n      TrainData = {data=ds:trainSet():inputs():input():float(), label=ds:trainSet():targets():input():byte() }\n      TestData  = {data=ds:testSet():inputs():input():float(), label=ds:testSet():targets():input():byte()  }\n      collectgarbage()\n      torch.save(file_valid,ValidData)\n      torch.save(file_train,TrainData)\n      torch.save(file_test,TestData)\n    end\n    Classes = {'airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'}\nelseif Dataset == 'MNIST' then\n  local file_valid = paths.concat(PreProcDir, format .. '_valid.t7')\n  local file_train = paths.concat(PreProcDir, format .. '_train.t7')\n  local file_test = paths.concat(PreProcDir, format .. 
'_test.t7')\n  if (paths.filep(file_valid) and paths.filep(file_train) and paths.filep(file_test)) then\n    ValidData=torch.load(file_valid)\n    TrainData=torch.load(file_train)\n    TestData=torch.load(file_test)\n  else\n    if paths.dirp(PreProcDir)==false then\n     sys.execute('mkdir PreProcData/MNIST')\n    end\n    ds = dp.Mnist{scale={0,1}}\n    ValidData = {data=ds:validSet():inputs():input():clone():float(), label=ds:validSet():targets():input():clone():byte() }\n    TrainData = {data=ds:trainSet():inputs():input():float(), label=ds:trainSet():targets():input():byte() }\n    TestData  = {data=ds:testSet():inputs():input():float() , label=ds:testSet():targets():input():byte()  }\n    collectgarbage()\n    torch.save(file_valid,ValidData)\n    torch.save(file_train,TrainData)\n    torch.save(file_test,TestData)\n  end\n  Classes = {1,2,3,4,5,6,7,8,9,0}\nelseif Dataset == 'SVHN' then\n    local LCNfile_valid = paths.concat(PreProcDir, format .. 'GCN_LCN_valid.t7')\n    local LCNfile_train = paths.concat(PreProcDir, format .. 'GCN_LCN_train.t7')\n    local LCNfile_test = paths.concat(PreProcDir, format .. 
'GCN_LCN_test.t7')\n    print(LCNfile_valid)\n    if (paths.filep(LCNfile_valid) and paths.filep(LCNfile_train) and paths.filep(LCNfile_test)) then\n      ValidData=torch.load(LCNfile_valid)\n      TrainData=torch.load(LCNfile_train)\n      TestData=torch.load(LCNfile_test)\n    else\n      if paths.dirp(PreProcDir)==false then\n       sys.execute('mkdir PreProcData/SVHN')\n      end\n      local input_preprocess = {}\n      table.insert(input_preprocess, dp.GCN{batch_size=5000,use_std=true,sqrt_bias=10})\n      table.insert(input_preprocess, dp.LeCunLCN{kernel_size=9,divide_by_std=true,batch_size=5000,progress=true}) --,kernel_size=31,kernel_std=32})\n      ds = dp.Svhn{scale={0,1}, input_preprocess = input_preprocess}\n      ValidData = {data=ds:validSet():inputs():input():float(), label=ds:validSet():targets():input():byte() }; ValidData.data:div( ValidData.data:max())\n      TrainData = {data=ds:trainSet():inputs():input():float(), label=ds:trainSet():targets():input():byte() }; TrainData.data:div( TrainData.data:max())\n      TestData  = {data=ds:testSet():inputs():input():float(), label=ds:testSet():targets():input():byte() };  TestData.data:div( TestData.data:max())\n\n      collectgarbage()\n      torch.save(LCNfile_valid,ValidData)\n      torch.save(LCNfile_train,TrainData)\n      torch.save(LCNfile_test,TestData)\n    end\n    Classes = {1,2,3,4,5,6,7,8,9,0}\nend\n\nTrainData.data = TrainData.data:float()\nTestData.data = TestData.data:float()\n\nlocal TrainDataProvider = DataProvider.Container{\n  Name = 'TrainingData',\n  CachePrefix = nil,\n  CacheFiles = false,\n  Source = {TrainData.data,TrainData.label},\n  MaxNumItems = 1e6,\n  CopyData = false,\n  TensorType = 'torch.FloatTensor',\n}\nlocal TestDataProvider = DataProvider.Container{\n  Name = 'TestData',\n  CachePrefix = nil,\n  CacheFiles = false,\n  Source = {TestData.data, TestData.label},\n  MaxNumItems = 1e6,\n  CopyData = false,\n  TensorType = 'torch.FloatTensor',\n\n}\nlocal 
ValidDataProvider = DataProvider.Container{\n  Name = 'ValidData',\n  CachePrefix = nil,\n  CacheFiles = false,\n  Source = {ValidData.data, ValidData.label},\n  MaxNumItems = 1e6,\n  CopyData = false,\n  TensorType = 'torch.FloatTensor',\n\n}\n\n--Preprocesss\n\n  if format == 'yuv' then\n    require 'image'\n    TrainDataProvider:apply(image.rgb2yuv)\n    TestDataProvider:apply(image.rgb2yuv)\n  end\n  if Whiten then\n    require 'unsup'\n    local meanfile = paths.concat(PreProcDir, format .. 'imageMean.t7')\n    local mean, P, invP\n    local Pfile = paths.concat(PreProcDir,format .. 'P.t7')\n    local invPfile = paths.concat(PreProcDir,format .. 'invP.t7')\n\n    if (paths.filep(Pfile) and paths.filep(invPfile) and paths.filep(meanfile)) then\n      P = torch.load(Pfile)\n      invP = torch.load(invPfile)\n      mean = torch.load(meanfile)\n      TrainDataProvider.Data = unsup.zca_whiten(TrainDataProvider.Data, mean, P, invP)\n    else\n      TrainDataProvider.Data, mean, P, invP = unsup.zca_whiten(TrainDataProvider.Data)\n      torch.save(Pfile,P)\n      torch.save(invPfile,invP)\n      torch.save(meanfile,mean)\n    end\n      TestDataProvider.Data = unsup.zca_whiten(TestDataProvider.Data, mean, P, invP)\n      ValidDataProvider.Data = unsup.zca_whiten(ValidDataProvider.Data, mean, P, invP)\n  elseif dp_prepro then\n        -- Do nothing since we use dp lib for GCN and LCN\n  else\n      local meanfile = paths.concat(PreProcDir, format .. normalization .. 'Mean.t7')\n      local stdfile = paths.concat(PreProcDir,format .. normalization .. 
'Std.t7')\n      local mean, std\n      local loaded = false\n\n      if paths.filep(meanfile) and paths.filep(stdfile) then\n        mean = torch.load(meanfile)\n        std = torch.load(stdfile)\n        loaded = true\n      end\n\n      mean, std = TrainDataProvider:normalize(normalization, mean, std)\n      TestDataProvider:normalize(normalization, mean, std)\n      ValidDataProvider:normalize(normalization, mean, std)\n      if not loaded then\n        torch.save(meanfile,mean)\n        torch.save(stdfile,std)\n      end\n    end\n\n\n\nreturn{\n    TrainData = TrainDataProvider,\n    TestData = TestDataProvider,\n    ValidData = ValidDataProvider,\n    Classes = Classes\n}\n"
  },
  {
    "path": "Dockerfile/binarynet-torch-gpu-cuda-8.0",
    "content": "FROM nvidia/cuda:8.0-cudnn5-devel\nWORKDIR /workspace\n\n# Install dependencies\nRUN apt-get update \\\n && apt-get install -y \\\n    build-essential git gfortran \\\n    python3 python3-setuptools python3-dev \\\n    cmake curl wget unzip libreadline-dev libjpeg-dev libpng-dev ncurses-dev \\\n    imagemagick gnuplot gnuplot-x11 libssl-dev libzmq3-dev graphviz vim sudo tmux\n\n# Install OpenBLAS\nRUN apt-get -y install libopenblas-dev\n\n# Install Torch commit no: 0219027e6c4644a0ba5c5bf137c989a0a8c9e01b\nRUN git clone https://github.com/torch/distro.git torch --recursive\nRUN cd torch \\\n && /bin/bash install-deps \\\n && ./install.sh\n\n# get torch tutorials. comment out this line if no need\nRUN git clone https://github.com/torch/tutorials.git\n\n# Install dependency for [BinaryNet](https://github.com/itayhubara/BinaryNet)\nRUN /workspace/torch/install/bin/luarocks install https://raw.githubusercontent.com/eladhoffer/DataProvider.torch/master/dataprovider-scm-1.rockspec\nRUN /workspace/torch/install/bin/luarocks install cudnn \nRUN /workspace/torch/install/bin/luarocks install dp\nRUN /workspace/torch/install/bin/luarocks install unsup\n\n# copy BinaryNet into the image\nADD . BinaryNet"
  },
  {
    "path": "Main_BinaryNet_Cifar10.lua",
    "content": "require 'torch'\nrequire 'xlua'\nrequire 'optim'\nrequire 'gnuplot'\nrequire 'pl'\nrequire 'trepl'\nrequire 'adaMax_binary_clip_shift'\nrequire 'adam_binary_clip_b'\nrequire 'nn'\nrequire 'SqrHingeEmbeddingCriterion'\n----------------------------------------------------------------------\n\ncmd = torch.CmdLine()\ncmd:addTime()\ncmd:text()\ncmd:text('Training a convolutional network for visual classification')\ncmd:text()\ncmd:text('==>Options')\n\ncmd:text('===>Model And Training Regime')\ncmd:option('-modelsFolder',       './Models/',            'Models Folder')\ncmd:option('-network',            'Model.lua',            'Model file - must return valid network.')\ncmd:option('-LR',                 2^-6,                   'learning rate')\ncmd:option('-LRDecay',            0,                      'learning rate decay (in # samples)')\ncmd:option('-weightDecay',        0.0,                    'L2 penalty on the weights')\ncmd:option('-momentum',           0.0,                    'momentum')\ncmd:option('-batchSize',          200,                    'batch size')\ncmd:option('-stcNeurons',         true,                   'use stochastic binarization for the neurons')\ncmd:option('-stcWeights',         false,                  'use stochastic binarization for the weights')\ncmd:option('-optimization',       'adam',                 'optimization method')\ncmd:option('-SBN',                true,                   'shift based batch-normalization')\ncmd:option('-runningVal',         false,                  'use running mean and std')\ncmd:option('-epoch',              -1,                     'number of epochs to train, -1 for unbounded')\n\ncmd:text('===>Platform Optimization')\ncmd:option('-threads',            8,                      'number of threads')\ncmd:option('-type',               'cuda',                 'float or cuda')\ncmd:option('-devid',              1,                      'device ID (if using CUDA)')\ncmd:option('-nGPU',               1,    
                  'num of gpu devices used')\ncmd:option('-constBatchSize',     false,                  'do not allow varying batch sizes - e.g for ccn2 kernel')\n\n\ncmd:text('===>Save/Load Options')\ncmd:option('-load',               '',                     'load existing net weights')\ncmd:option('-save',               os.date():gsub(' ',''), 'save directory')\n\ncmd:text('===>Data Options')\ncmd:option('-dataset',            'Cifar10',              'Dataset - Cifar10, Cifar100, STL10, SVHN, MNIST')\ncmd:option('-normalization',      'simple',               'simple - whole sample, channel - by image channel, image - mean and std images')\ncmd:option('-format',             'rgb',                  'rgb or yuv')\ncmd:option('-whiten',             true,                   'whiten data')\ncmd:option('-dp_prepro',          false,                   'preprocessing using dp lib')\ncmd:option('-augment',            false,                  'Augment training data')\ncmd:option('-preProcDir',         './PreProcData/',       'Data for pre-processing (means,P,invP)')\ncmd:text('===>Misc')\ncmd:option('-visualize',          0,                      'visualizing results')\n\ntorch.manualSeed(432)\nopt = cmd:parse(arg or {})\nopt.network = opt.modelsFolder .. paths.basename(opt.network, '.lua')\nopt.save = paths.concat('./Results', opt.save)\nopt.preProcDir = paths.concat(opt.preProcDir, opt.dataset .. '/')\n\n-- If you choose to use exponentialy decaying learning rate use uncomment this line\n--opt.LRDecay=torch.pow((2e-6/opt.LR),(1./500));\n--\nos.execute('mkdir -p ' .. 
opt.preProcDir)\ntorch.setnumthreads(opt.threads)\n\ntorch.setdefaulttensortype('torch.FloatTensor')\nif opt.augment then\n    require 'image'\nend\n----------------------------------------------------------------------\n-- Model + Loss:\nlocal modelAll = require(opt.network)\nmodel=modelAll.model\nGLRvec=modelAll.lrs\nclipV=modelAll.clipV\n\nlocal loss = SqrtHingeEmbeddingCriterion(1)\n\n\nlocal data = require 'Data'\nlocal classes = data.Classes\n\n----------------------------------------------------------------------\n\n-- This matrix records the current confusion across classes\nlocal confusion = optim.ConfusionMatrix(classes)\n\nlocal AllowVarBatch = not opt.constBatchSize\n\n\n----------------------------------------------------------------------\n\n\n-- Output files configuration\nos.execute('mkdir -p ' .. opt.save)\ncmd:log(opt.save .. '/Log.txt', opt)\nlocal netFilename = paths.concat(opt.save, 'Net')\nlocal logFilename = paths.concat(opt.save,'ErrorRate.log')\nlocal optStateFilename = paths.concat(opt.save,'optState')\nlocal Log = optim.Logger(logFilename)\n----------------------------------------------------------------------\n\nlocal TensorType = 'torch.FloatTensor'\nif paths.filep(opt.load) then\n    model = torch.load(opt.load)\n    print('==>Loaded model from: ' .. 
opt.load)\n    print(model)\nend\nif opt.type =='cuda' then\n    require 'cutorch'\n    cutorch.setDevice(opt.devid)\n    cutorch.setHeapTracking(true)\n    model:cuda()\n    GLRvec=GLRvec:cuda()\n    clipV=clipV:cuda()\n    loss = loss:cuda()\n    TensorType = 'torch.CudaTensor'\nend\n\n\n\n---Support for multiple GPUs - currently data parallel scheme\nif opt.nGPU > 1 then\n    local net = model\n    model = nn.DataParallelTable(1)\n    for i = 1, opt.nGPU do\n        cutorch.setDevice(i)\n        model:add(net:clone():cuda(), i)  -- Use the ith GPU\n    end\n    cutorch.setDevice(opt.devid)\nend\n\n-- Optimization configuration\nlocal Weights,Gradients = model:getParameters()\n\n\n----------------------------------------------------------------------\nprint '==> Network'\nprint(model)\nprint('==>' .. Weights:nElement() ..  ' Parameters')\n\nprint '==> Loss'\nprint(loss)\n\n\n------------------Optimization Configuration--------------------------\nlocal optimState = {\n    learningRate = opt.LR,\n    momentum = opt.momentum,\n    weightDecay = opt.weightDecay,\n    learningRateDecay = opt.LRDecay,\n    GLRvec=GLRvec,\n    clipV=clipV\n}\n----------------------------------------------------------------------\n\nlocal function SampleImages(images,labels)\n    if not opt.augment then\n        return images,labels\n    else\n\n        local sampled_imgs = images:clone()\n        for i=1,images:size(1) do\n            local sz = math.random(9) - 1\n            local hflip = math.random(2)==1\n\n            local startx = math.random(sz)\n            local starty = math.random(sz)\n            local img = images[i]:narrow(2,starty,32-sz):narrow(3,startx,32-sz)\n            if hflip then\n                img = image.hflip(img)\n            end\n            img = image.scale(img,32,32)\n            sampled_imgs[i]:copy(img)\n        end\n        return sampled_imgs,labels\n    end\nend\n\n\n------------------------------\nlocal function Forward(Data, train)\n\n\n  local 
MiniBatch = DataProvider.Container{\n    Name = 'GPU_Batch',\n    MaxNumItems = opt.batchSize,\n    Source = Data,\n    ExtractFunction = SampleImages,\n    TensorType = TensorType\n  }\n\n  local yt = MiniBatch.Labels\n  local x = MiniBatch.Data\n  local SizeData = Data:size()\n  if not AllowVarBatch then SizeData = math.floor(SizeData/opt.batchSize)*opt.batchSize end\n\n  local NumSamples = 0\n  local NumBatches = 0\n  local lossVal = 0\n\n  while NumSamples < SizeData do\n    MiniBatch:getNextBatch()\n    local y, currLoss\n    NumSamples = NumSamples + x:size(1)\n    NumBatches = NumBatches + 1\n    if opt.nGPU > 1 then\n      model:syncParameters()\n    end\n    y = model:forward(x)\n    one_hot_yt=torch.zeros(yt:size(1),10)\n    one_hot_yt:scatter(2, yt:long():view(-1,1), 1)\n    one_hot_yt=one_hot_yt:mul(2):float():add(-1)\n    if opt.type == 'cuda' then\n      one_hot_yt=one_hot_yt:cuda()\n    end\n\n    currLoss = loss:forward(y,one_hot_yt)\n    if train then\n      function feval()\n        model:zeroGradParameters()\n        local dE_dy = loss:backward(y, one_hot_yt)\n        model:backward(x, dE_dy)\n        return currLoss, Gradients\n      end\n       --_G.optim[opt.optimization](feval, Weights, optimState) -- If you choose to use different optimization remember to clip the weights\n       adaMax_binary_clip_shift(feval, Weights, optimState)\n    end\n\n    lossVal = currLoss + lossVal\n\n    if type(y) == 'table' then --table results - always take first prediction\n      y = y[1]\n    end\n\n    confusion:batchAdd(y,one_hot_yt)\n    xlua.progress(NumSamples, SizeData)\n    if math.fmod(NumBatches,100)==0 then\n      collectgarbage()\n    end\n  end\n  return(lossVal/math.ceil(SizeData/opt.batchSize))\nend\n\n------------------------------\nlocal function Train(Data)\n  model:training()\n  return Forward(Data, true)\nend\n\nlocal function Test(Data)\n  model:evaluate()\n  return Forward(Data, false)\nend\n------------------------------\n\nlocal epoch 
= 1\nprint '\\n==> Starting Training\\n'\n\n\nwhile epoch ~= opt.epoch do\n    data.TrainData:shuffleItems()\n    print('Epoch ' .. epoch)\n    --Train\n    confusion:zero()\n    local LossTrain = Train(data.TrainData)\n    if epoch%10==0 then\n      torch.save(netFilename, model)\n    end\n    confusion:updateValids()\n    local ErrTrain = (1-confusion.totalValid)\n    if #classes <= 10 then\n        print(confusion)\n    end\n    print('Training Error = ' .. ErrTrain)\n    print('Training Loss = ' .. LossTrain)\n\n    --validation\n    confusion:zero()\n    local LossValid = Test(data.ValidData)\n    confusion:updateValids()\n    local ErrValid = (1-confusion.totalValid)\n    if #classes <= 10 then\n        print(confusion)\n    end\n    print('Valid Error = ' .. ErrValid)\n    print('Valid Loss = ' .. LossValid)\n\n    --Test\n    confusion:zero()\n    local LossTest = Test(data.TestData)\n    confusion:updateValids()\n    local ErrTest = (1-confusion.totalValid)\n    if #classes <= 10 then\n        print(confusion)\n    end\n\n    print('Test Error = ' .. ErrTest)\n    print('Test Loss = ' .. LossTest)\n\n    Log:add{['Training Error']= ErrTrain, ['Valid Error'] = ErrValid, ['Test Error'] = ErrTest}\n    -- the training stops at epoch 3 if visualize is set to 1\n    if opt.visualize == 1 then\n        Log:style{['Training Error'] = '-',['Validation Error'] = '-', ['Test Error'] = '-'}\n        Log:plot()\n    end\n    --optimState.learningRate=optimState.learningRate*opt.LRDecay\n    if epoch%50==0  then\n      optimState.learningRate=optimState.learningRate*0.5\n    else\n      optimState.learningRate=optimState.learningRate --*opt.LRDecay\n    end\n    print('-------------------LR-------------------')\n    print(optimState.learningRate)\n    epoch = epoch + 1\nend\n"
  },
  {
    "path": "Main_BinaryNet_MNIST.lua",
    "content": "require 'torch'\nrequire 'xlua'\nrequire 'optim'\nrequire 'gnuplot'\nrequire 'pl'\nrequire 'trepl'\nrequire 'adaMax_binary_clip_shift'\nrequire 'nn'\nrequire 'SqrHingeEmbeddingCriterion'\n----------------------------------------------\n\ncmd = torch.CmdLine()\ncmd:addTime()\ncmd:text()\ncmd:text('Training a convolutional network for visual classification')\ncmd:text()\ncmd:text('==>Options')\n\ncmd:text('===>Model And Training Regime')\ncmd:option('-modelsFolder',       './Models/',            'Models Folder')\ncmd:option('-network',            'Model.lua',            'Model file - must return valid network.')\ncmd:option('-LR',                 2^-6,                    'learning rate')\ncmd:option('-LRDecay',            0,                     'learning rate decay (in # samples)')\ncmd:option('-weightDecay',        0.0,                   'L2 penalty on the weights')\ncmd:option('-momentum',           0.0,                    'momentum')\ncmd:option('-batchSize',          100,                    'batch size')\ncmd:option('-stcNeurons',         true,                    'batch size')\ncmd:option('-stcWeights',         false,                    'batch size')\ncmd:option('-optimization',       'adam',                  'optimization method')\ncmd:option('-SBN',                true,                   'shift based batch-normalization')\ncmd:option('-runningVal',         true,                    'use running mean and std')\ncmd:option('-epoch',              -1,                     'number of epochs to train, -1 for unbounded')\n\ncmd:text('===>Platform Optimization')\ncmd:option('-threads',            8,                      'number of threads')\ncmd:option('-type',               'cuda',                 'float or cuda')\ncmd:option('-devid',              1,                      'device ID (if using CUDA)')\ncmd:option('-nGPU',               1,                      'num of gpu devices used')\ncmd:option('-constBatchSize',     false,                    'do not 
allow varying batch sizes - e.g for ccn2 kernel')\n\ncmd:text('===>Save/Load Options')\ncmd:option('-load',               '',                  'load existing net weights')\ncmd:option('-save',               os.date():gsub(' ',''), 'save directory')\n\ncmd:text('===>Data Options')\ncmd:option('-dataset',            'MNIST',              'Dataset - Cifar10, Cifar100, STL10, SVHN, MNIST')\ncmd:option('-normalization',      'simple',               'simple - whole sample, channel - by image channel, image - mean and std images')\ncmd:option('-format',             'rgb',                  'rgb or yuv')\ncmd:option('-whiten',             false,                  'whiten data')\ncmd:option('-dp_prepro',          false,                  'preprocessing using dp lib')\ncmd:option('-augment',            false,                  'Augment training data')\ncmd:option('-preProcDir',         './PreProcData/',       'Data for pre-processing (means,P,invP)')\n\ncmd:text('===>Misc')\ncmd:option('-visualize',          1,                      'visualizing results')\n\ntorch.manualSeed(432)\nopt = cmd:parse(arg or {})\nopt.network = opt.modelsFolder .. paths.basename(opt.network, '.lua')\nopt.save = paths.concat('./Results', opt.save)\nopt.preProcDir = paths.concat(opt.preProcDir, opt.dataset .. '/')\n\n\n-- If you choose to use exponentialy decaying learning rate use uncomment this line\n--opt.LRDecay=torch.pow((2e-6/opt.LR),(1./500));\n--\n\n\n\nos.execute('mkdir -p ' .. 
opt.preProcDir)\ntorch.setnumthreads(opt.threads)\n\ntorch.setdefaulttensortype('torch.FloatTensor')\nif opt.augment then\n    require 'image'\nend\n----------------------------------------------------------------------\n\nlocal modelAll = require(opt.network)\nmodel=modelAll.model\nGLRvec=modelAll.lrs\nclipV=modelAll.clipV\nlocal loss = SqrtHingeEmbeddingCriterion(1)\n\nlocal data = require 'Data'\nlocal classes = data.Classes\n\n----------------------------------------------------------------------\n\n-- This matrix records the current confusion across classes\nlocal confusion = optim.ConfusionMatrix(classes)\n\nlocal AllowVarBatch = not opt.constBatchSize\n\n\n----------------------------------------------------------------------\n\n\n-- Output files configuration\nos.execute('mkdir -p ' .. opt.save)\ncmd:log(opt.save .. '/Log.txt', opt)\nlocal netFilename = paths.concat(opt.save, 'Net')\nlocal logFilename = paths.concat(opt.save,'ErrorRate.log')\nlocal optStateFilename = paths.concat(opt.save,'optState')\nlocal Log = optim.Logger(logFilename)\n----------------------------------------------------------------------\n\nlocal TensorType = 'torch.FloatTensor'\nif paths.filep(opt.load) then\n    model = torch.load(opt.load)\n    print('==>Loaded model from: ' .. 
opt.load)\n    print(model)\nend\nif opt.type =='cuda' then\n    require 'cutorch'\n    cutorch.setDevice(opt.devid)\n    cutorch.setHeapTracking(true)\n    model:cuda()\n    GLRvec=GLRvec:cuda()\n    clipV=clipV:cuda()\n    loss = loss:cuda()\n    TensorType = 'torch.CudaTensor'\nend\n\n\n\n---Support for multiple GPUs - currently data parallel scheme\nif opt.nGPU > 1 then\n    local net = model\n    model = nn.DataParallelTable(1)\n    for i = 1, opt.nGPU do\n        cutorch.setDevice(i)\n        model:add(net:clone():cuda(), i)  -- Use the ith GPU\n    end\n    cutorch.setDevice(opt.devid)\nend\n\n-- Optimization configuration\nlocal Weights,Gradients = model:getParameters()\n\n\n----------------------------------------------------------------------\nprint '==> Network'\nprint(model)\nprint('==>' .. Weights:nElement() ..  ' Parameters')\n\nprint '==> Loss'\nprint(loss)\n\n\n------------------Optimization Configuration--------------------------\nlocal optimState = {\n    learningRate = opt.LR,\n    momentum = opt.momentum,\n    weightDecay = opt.weightDecay,\n    learningRateDecay = opt.LRDecay,\n    GLRvec=GLRvec,\n    clipV=clipV\n}\n----------------------------------------------------------------------\n\nlocal function SampleImages(images,labels)\n    if not opt.augment then\n        return images,labels\n    else\n\n        local sampled_imgs = images:clone()\n        for i=1,images:size(1) do\n            local sz = math.random(9) - 1\n            local hflip = math.random(2)==1\n\n            local startx = math.random(sz)\n            local starty = math.random(sz)\n            local img = images[i]:narrow(2,starty,32-sz):narrow(3,startx,32-sz)\n            if hflip then\n                img = image.hflip(img)\n            end\n            img = image.scale(img,32,32)\n            sampled_imgs[i]:copy(img)\n        end\n        return sampled_imgs,labels\n    end\nend\n\n\n------------------------------\nlocal function Forward(Data, train)\n\n\n  local 
MiniBatch = DataProvider.Container{\n    Name = 'GPU_Batch',\n    MaxNumItems = opt.batchSize,\n    Source = Data,\n    ExtractFunction = SampleImages,\n    TensorType = TensorType\n  }\n\n  local yt = MiniBatch.Labels\n  local x = MiniBatch.Data\n  local SizeData = Data:size()\n  if not AllowVarBatch then SizeData = math.floor(SizeData/opt.batchSize)*opt.batchSize end\n\n  local NumSamples = 0\n  local NumBatches = 0\n  local lossVal = 0\n\n  while NumSamples < SizeData do\n    MiniBatch:getNextBatch()\n    local y, currLoss\n    NumSamples = NumSamples + x:size(1)\n    NumBatches = NumBatches + 1\n    if opt.nGPU > 1 then\n      model:syncParameters()\n    end\n\n    y = model:forward(x)\n\n    one_hot_yt=torch.zeros(yt:size(1),10)\n    one_hot_yt:scatter(2, yt:long():view(-1,1), 1)\n    one_hot_yt=one_hot_yt:mul(2):float():add(-1):cuda()\n\n\n    currLoss = loss:forward(y,one_hot_yt)\n    if train then\n      function feval()\n        model:zeroGradParameters()\n        local dE_dy = loss:backward(y, one_hot_yt)\n        model:backward(x, dE_dy)\n        return currLoss, Gradients\n      end\n\n\n       adaMax_binary_clip_shift(feval, Weights, optimState)\n\n      local indLayer=0\n      for i, layer in ipairs(model.modules) do\n          indLayer=indLayer+1;\n          if layer.__typename == 'cudnnBinarySpatialConvolution' then\n            model.modules[indLayer].weight:clamp(-1,1)\n          elseif layer.__typename == 'BinaryLinear' then\n            --print(indLayer)\n            model.modules[indLayer].weight:clamp(-1,1)\n        end\n      end\n    end\n\n    lossVal = currLoss + lossVal\n\n    if type(y) == 'table' then --table results - always take first prediction\n      y = y[1]\n    end\n\n\n    confusion:batchAdd(y,one_hot_yt)\n    xlua.progress(NumSamples, SizeData)\n    if math.fmod(NumBatches,100)==0 then\n      collectgarbage()\n    end\n  end\n  return(lossVal/math.ceil(SizeData/opt.batchSize))\nend\n\n------------------------------\nlocal 
function Train(Data)\n  model:training()\n  return Forward(Data, true)\nend\n\nlocal function Test(Data)\n  model:evaluate()\n  return Forward(Data, false)\nend\n------------------------------\n\nlocal epoch = 1\nprint '\\n==> Starting Training\\n'\n\nwhile epoch ~= opt.epoch do\n    data.TrainData:shuffleItems()\n    print('Epoch ' .. epoch)\n    --Train\n    confusion:zero()\n    local LossTrain = Train(data.TrainData)\n    if epoch%10==0 then\n      torch.save(netFilename, model)\n    end\n    confusion:updateValids()\n    local ErrTrain = (1-confusion.totalValid)\n    if #classes <= 10 then\n        print(confusion)\n    end\n    print('Training Error = ' .. ErrTrain)\n    print('Training Loss = ' .. LossTrain)\n\n    --validation\n    confusion:zero()\n    local LossValid = Test(data.ValidData)\n    confusion:updateValids()\n    local ErrValid = (1-confusion.totalValid)\n    if #classes <= 10 then\n        print(confusion)\n    end\n    print('Valid Error = ' .. ErrValid)\n    print('Valid Loss = ' .. LossValid)\n\n    --Test\n    confusion:zero()\n    local LossTest = Test(data.TestData)\n    confusion:updateValids()\n    local ErrTest = (1-confusion.totalValid)\n    if #classes <= 10 then\n        print(confusion)\n    end\n\n    print('Test Error = ' .. ErrTest)\n    print('Test Loss = ' .. LossTest)\n\n    Log:add{['Training Error']= ErrTrain, ['Valid Error'] = ErrValid, ['Test Error'] = ErrTest}\n    if opt.visualize == 1 then\n        Log:style{['Training Error'] = '-',['Valid Error'] = '-', ['Test Error'] = '-'}\n        Log:plot()\n    end\n    if epoch%20==0  then\n      optimState.learningRate=optimState.learningRate*0.5\n    else\n      optimState.learningRate=optimState.learningRate --*opt.LRDecay\n    end\n    print('-------------------LR-------------------')\n    print(optimState.learningRate)\n    epoch = epoch + 1\nend\n"
  },
  {
    "path": "Main_BinaryNet_SVHN.lua",
    "content": "require 'torch'\nrequire 'xlua'\nrequire 'optim'\nrequire 'gnuplot'\nrequire 'pl'\nrequire 'trepl'\nrequire 'adaMax_binary_clip_shift'\nrequire 'nn'\nrequire 'SqrHingeEmbeddingCriterion'\n----------------------------------------------------------------------\n\ncmd = torch.CmdLine()\ncmd:addTime()\ncmd:text()\ncmd:text('Training a convolutional network for visual classification')\ncmd:text()\ncmd:text('==>Options')\n\ncmd:text('===>Model And Training Regime')\ncmd:option('-modelsFolder',       './Models/',            'Models Folder')\ncmd:option('-network',            'Model.lua',            'Model file - must return valid network.')\ncmd:option('-LR',                 2^-7,                    'learning rate')\ncmd:option('-LRDecay',            0,                     'learning rate decay (in # samples)')\ncmd:option('-weightDecay',        0.0,                   'L2 penalty on the weights')\ncmd:option('-momentum',           0.0,                    'momentum')\ncmd:option('-batchSize',          200,                    'batch size')\ncmd:option('-stcNeurons',         true,                    'use stochastic binarization for neurons')\ncmd:option('-stcWeights',         false,                    'use stochastic binarization for weights')\ncmd:option('-optimization',       'adam',                  'optimization method')\ncmd:option('-SBN',                true,                   'shift based batch-normalization')\ncmd:option('-runningVal',         true,                    'use running mean and std')\ncmd:option('-epoch',              -1,                     'number of epochs to train, -1 for unbounded')\n\ncmd:text('===>Platform Optimization')\ncmd:option('-threads',            8,                      'number of threads')\ncmd:option('-type',               'cuda',                 'float or cuda')\ncmd:option('-devid',              1,                      'device ID (if using CUDA)')\ncmd:option('-nGPU',               1,                      'num of gpu devices used')\ncmd:option('-constBatchSize',     false,      
              'do not allow varying batch sizes - e.g for ccn2 kernel')\n\ncmd:text('===>Save/Load Options')\ncmd:option('-load',               '',                  'load existing net weights')\ncmd:option('-save',               os.date():gsub(' ',''), 'save directory')\n\ncmd:text('===>Data Options')\ncmd:option('-dataset',            'SVHN',              'Dataset - Cifar10, Cifar100, STL10, SVHN, MNIST')\ncmd:option('-normalization',      'simple',               'simple - whole sample, channel - by image channel, image - mean and std images')\ncmd:option('-format',             'rgb',                  'rgb or yuv')\ncmd:option('-whiten',             false,                  'whiten data')\ncmd:option('-dp_prepro',          true,                   'preprocessing using dp lib')\ncmd:option('-augment',            false,                  'Augment training data')\ncmd:option('-preProcDir',         './PreProcData/',       'Data for pre-processing (means,P,invP)')\n\ncmd:text('===>Misc')\ncmd:option('-visualize',          1,                      'visualizing results')\n\ntorch.manualSeed(432)\nopt = cmd:parse(arg or {})\nopt.network = opt.modelsFolder .. paths.basename(opt.network, '.lua')\nopt.save = paths.concat('./Results', opt.save)\nopt.preProcDir = paths.concat(opt.preProcDir, opt.dataset .. '/')\n\n\n-- If you choose to use an exponentially decaying learning rate, uncomment this line\n--opt.LRDecay=torch.pow((2e-6/opt.LR),(1./500));\n--\n\nos.execute('mkdir -p ' .. 
opt.preProcDir)\ntorch.setnumthreads(opt.threads)\n\ntorch.setdefaulttensortype('torch.FloatTensor')\nif opt.augment then\n    require 'image'\nend\n----------------------------------------------------------------------\n-- Model + Loss:\nlocal modelAll = require(opt.network)\nmodel=modelAll.model\nGLRvec=modelAll.lrs\nclipV=modelAll.clipV\n\nlocal loss = SqrtHingeEmbeddingCriterion(1) --nn.ClassNLLCriterion()\nlocal data = require 'Data'\nlocal classes = data.Classes\n\n----------------------------------------------------------------------\n\n-- This matrix records the current confusion across classes\nlocal confusion = optim.ConfusionMatrix(classes)\n\nlocal AllowVarBatch = not opt.constBatchSize\n\n\n----------------------------------------------------------------------\n\n\n-- Output files configuration\nos.execute('mkdir -p ' .. opt.save)\ncmd:log(opt.save .. '/Log.txt', opt)\nlocal netFilename = paths.concat(opt.save, 'Net')\nlocal logFilename = paths.concat(opt.save,'ErrorRate.log')\nlocal optStateFilename = paths.concat(opt.save,'optState')\nlocal Log = optim.Logger(logFilename)\n----------------------------------------------------------------------\n\nlocal TensorType = 'torch.FloatTensor'\n\nif opt.type =='cuda' then\n    require 'cutorch'\n    cutorch.setDevice(opt.devid)\n    cutorch.setHeapTracking(true)\n    model:cuda()\n    GLRvec=GLRvec:cuda()\n    clipV=clipV:cuda()\n    loss = loss:cuda()\n    TensorType = 'torch.CudaTensor'\nend\nif paths.filep(opt.load) then\n    model = torch.load(opt.load)\n    print('==>Loaded model from: ' .. 
opt.load)\n    print(model)\nend\n\n\n---Support for multiple GPUs - currently data parallel scheme\nif opt.nGPU > 1 then\n    local net = model\n    model = nn.DataParallelTable(1)\n    for i = 1, opt.nGPU do\n        cutorch.setDevice(i)\n        model:add(net:clone():cuda(), i)  -- Use the ith GPU\n    end\n    cutorch.setDevice(opt.devid)\nend\n\n-- Optimization configuration\nlocal Weights,Gradients = model:getParameters()\n\n\n\n----------------------------------------------------------------------\nprint '==> Network'\nprint(model)\nprint('==>' .. Weights:nElement() ..  ' Parameters')\n\nprint '==> Loss'\nprint(loss)\n\n\n------------------Optimization Configuration--------------------------\nlocal optimState = {\n    learningRate = opt.LR,\n    momentum = opt.momentum,\n    weightDecay = opt.weightDecay,\n    learningRateDecay = opt.LRDecay,\n    GLRvec=GLRvec,\n    clipV=clipV\n}\n----------------------------------------------------------------------\n\nlocal function SampleImages(images,labels)\n    if not opt.augment then\n        return images,labels\n    else\n\n        local sampled_imgs = images:clone()\n        for i=1,images:size(1) do\n            local sz = math.random(9) - 1\n            local hflip = math.random(2)==1\n\n            local startx = math.random(sz)\n            local starty = math.random(sz)\n            local img = images[i]:narrow(2,starty,32-sz):narrow(3,startx,32-sz)\n            if hflip then\n                img = image.hflip(img)\n            end\n            img = image.scale(img,32,32)\n            sampled_imgs[i]:copy(img)\n        end\n        return sampled_imgs,labels\n    end\nend\n\n\n------------------------------\nlocal function Forward(Data, train)\n\n\n  local MiniBatch = DataProvider.Container{\n    Name = 'GPU_Batch',\n    MaxNumItems = opt.batchSize,\n    Source = Data,\n    ExtractFunction = SampleImages,\n    TensorType = TensorType\n  }\n\n  local yt = MiniBatch.Labels\n  local x = MiniBatch.Data\n  local 
SizeData = Data:size()\n  if not AllowVarBatch then SizeData = math.floor(SizeData/opt.batchSize)*opt.batchSize end\n\n  local NumSamples = 0\n  local NumBatches = 0\n  local lossVal = 0\n\n  while NumSamples < SizeData do\n    MiniBatch:getNextBatch()\n    local y, currLoss\n    NumSamples = NumSamples + x:size(1)\n    NumBatches = NumBatches + 1\n    if opt.nGPU > 1 then\n      model:syncParameters()\n    end\n\n    y = model:forward(x)\n    one_hot_yt=torch.zeros(yt:size(1),10)\n    one_hot_yt:scatter(2, yt:long():view(-1,1), 1)\n    one_hot_yt=one_hot_yt:mul(2):float():add(-1):cuda()\n\n    currLoss = loss:forward(y,one_hot_yt)\n    if train then\n      function feval()\n        model:zeroGradParameters()\n        local dE_dy = loss:backward(y, one_hot_yt)\n        model:backward(x, dE_dy)\n        return currLoss, Gradients\n      end\n\n       adaMax_binary_clip_shift(feval, Weights, optimState)\n\n      local indLayer=0\n      for i, layer in ipairs(model.modules) do\n          indLayer=indLayer+1;\n          if layer.__typename == 'cudnnBinarySpatialConvolution' then\n            model.modules[indLayer].weight:copy(model.modules[indLayer].weight:clamp(-1,1))\n          elseif layer.__typename == 'BinaryLinear' then\n            model.modules[indLayer].weight:copy(model.modules[indLayer].weight:clamp(-1,1))\n        end\n      end\n    end\n\n    lossVal = currLoss + lossVal\n\n    if type(y) == 'table' then --table results - always take first prediction\n      y = y[1]\n    end\n\n\n    confusion:batchAdd(y,one_hot_yt)\n    xlua.progress(NumSamples, SizeData)\n    if math.fmod(NumBatches,100)==0 then\n      collectgarbage()\n    end\n  end\n  return(lossVal/math.ceil(SizeData/opt.batchSize))\nend\n\n------------------------------\nlocal function Train(Data)\n  model:training()\n  return Forward(Data, true)\nend\n\nlocal function Test(Data)\n  model:evaluate()\n  return Forward(Data, false)\nend\n------------------------------\n\nlocal epoch = 1\nprint 
'\\n==> Starting Training\\n'\n\n\nwhile epoch ~= opt.epoch do\n    data.TrainData:shuffleItems()\n    print('Epoch ' .. epoch)\n    --Train\n    confusion:zero()\n    local LossTrain = Train(data.TrainData)\n    if epoch%10==0 then\n      torch.save(netFilename, model)\n    end\n    confusion:updateValids()\n    local ErrTrain = (1-confusion.totalValid)\n    if #classes <= 10 then\n        print(confusion)\n    end\n    print('Training Error = ' .. ErrTrain)\n    print('Training Loss = ' .. LossTrain)\n    --validation\n    confusion:zero()\n    local LossValid = Test(data.ValidData)\n    confusion:updateValids()\n    local ErrValid = (1-confusion.totalValid)\n    if #classes <= 10 then\n        print(confusion)\n    end\n    print('Valid Error = ' .. ErrValid)\n    print('Valid Loss = ' .. LossValid)\n    --Test\n    confusion:zero()\n    local LossTest = Test(data.TestData)\n    confusion:updateValids()\n    local ErrTest = (1-confusion.totalValid)\n    if #classes <= 10 then\n        print(confusion)\n    end\n\n    print('Test Error = ' .. ErrTest)\n    print('Test Loss = ' .. LossTest)\n\n    Log:add{['Training Error']= ErrTrain, ['Valid Error'] = ErrValid, ['Test Error'] = ErrTest}\n    if opt.visualize == 1 then\n        Log:style{['Training Error'] = '-',['Valid Error'] = '-', ['Test Error'] = '-'}\n        Log:plot()\n    end\n    if epoch%20==0 then\n      optimState.learningRate=optimState.learningRate*0.5\n    else\n      optimState.learningRate=optimState.learningRate\n    end\n    print('-------------------LR-------------------')\n    print(optimState.learningRate)\n\n\n    epoch = epoch + 1\nend\n"
  },
  {
    "path": "Models/BatchNormalizationShiftPow2.lua",
    "content": "--[[\n   This file implements Shift based Batch Normalization based a variant of the vanilla BN as described in the paper:\n   \"Binarized Neural Networks: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Matthieu Courbariaux, Itay Hubara, Daniel Soudry, Ran El-Yaniv, Yoshua Bengio'\n\n   The code is based on nn library\n   --]]\n\n\nlocal BatchNormalizationShiftPow2,parent = torch.class('BatchNormalizationShiftPow2', 'nn.Module')\n\nfunction BatchNormalizationShiftPow2:__init(nOutput, runningVal, eps, momentum, affine)\n   parent.__init(self)\n   assert(nOutput and type(nOutput) == 'number',\n          'Missing argument #1: dimensionality of input. ')\n   assert(nOutput ~= 0, 'To set affine=false call BatchNormalization'\n     .. '(nOutput,  eps, momentum, false) ')\n   if affine ~= nil then\n      assert(type(affine) == 'boolean', 'affine has to be true/false')\n      self.affine = affine\n   else\n      self.affine = true\n   end\n   self.eps = eps or 1e-5\n   self.train = true\n   self.momentum = momentum or 0.125\n   self.runningVal = runningVal or true\n   self.running_mean = torch.zeros(nOutput)\n   self.running_std = torch.ones(nOutput)\n   self.running_std_ap2 = torch.ones(nOutput)\n   if self.affine then\n      self.weight = torch.Tensor(nOutput)\n      self.weightSign = torch.Tensor(nOutput)\n      self.weight_ap2 = torch.Tensor(nOutput)\n      self.bias = torch.Tensor(nOutput)\n      self.gradWeight = torch.Tensor(nOutput)\n      self.gradBias = torch.Tensor(nOutput)\n      self:reset()\n   end\nend\n\nfunction BatchNormalizationShiftPow2:reset()\n   self.weight:fill(1)\n   self.bias:zero()\n   self.running_mean:zero()\n   self.running_std:fill(1)\nend\n\nfunction BatchNormalizationShiftPow2:updateOutput(input)\n   assert(input:dim() == 2, 'only mini-batch supported (2D tensor), got '\n             .. input:dim() .. 
'D tensor instead')\n   local nBatch = input:size(1)\n   -- buffers that are reused\n   self.buffer = self.buffer or input.new()\n   self.buffer2 = self.buffer2 or input.new()\n   self.centered = self.centered or input.new()\n   self.centered:resizeAs(input)\n   self.centerSign = self.centerSign or input.new()\n   self.centerSign:resizeAs(input)\n   self.centeredOrg = self.centeredOrg or input.new()\n   self.centeredOrg:resizeAs(input)\n   self.std = self.std or input.new()\n   self.normalized = self.normalized or input.new()\n   self.normalized:resizeAs(input)\n   self.normalizedSign = self.normalizedSign or input.new()\n   self.normalizedSign:resizeAs(input)\n   self.output:resizeAs(input)\n   self.gradInput:resizeAs(input)\n   if self.train == false and self.runningVal == true then\n     self.output:copy(input)\n     self.buffer:repeatTensor(self.running_mean, nBatch, 1)\n     self.output:add(-1, self.buffer)\n     self.running_std_ap2:copy(torch.pow(2,torch.round(torch.log(self.running_std):div(math.log(2)))))\n     self.buffer:repeatTensor(self.running_std_ap2, nBatch, 1)\n     self.output:cmul(self.buffer)\n   else -- training mode\n      -- calculate mean over mini-batch\n      self.buffer:mean(input, 1)                        -- E(x) = expectation of x.\n      self.running_mean:mul(1 - self.momentum):add(self.momentum, self.buffer) -- add to running mean\n      self.buffer:repeatTensor(self.buffer, nBatch, 1)\n\n      -- subtract mean\n      self.centered:add(input, -1, self.buffer)         -- x - E(x)\n      self.centeredOrg:copy(self.centered)\n      self.centerSign:copy(self.centered)\n      self.centerSign:sign()\n      self.centered:copy(torch.pow(2,torch.round(torch.log(self.centered:abs()):div(math.log(2))))):cmul(self.centerSign)\n      -- calculate standard deviation over mini-batch\n      self.buffer:copy(self.centered):cmul(self.centeredOrg) -- [x - E(x)]^2\n      -- 1 / E([x - E(x)]^2)\n      self.std:mean(self.buffer, 
1):add(self.eps):sqrt():pow(-1)\n      self.running_std:mul(1 - self.momentum):add(self.momentum, self.std) -- add to running stdv\n      self.std:copy(torch.pow(2,torch.round(torch.log(self.std):div(math.log(2)))))\n      self.buffer:repeatTensor(self.std, nBatch, 1)\n\n      -- divide standard-deviation + eps\n\n      self.output:cmul(self.centeredOrg, self.buffer)\n      self.normalized:copy(self.output)\n      self.normalizedSign:copy(self.normalized)\n      self.normalizedSign:sign()\n\n      self.normalized:copy(torch.pow(2,torch.round(torch.log(self.normalized:abs()):div(math.log(2)))):cmul(self.normalizedSign))\n      --self.normalized[self.normalized:lt(0)]=1; -- Can improve results\n   end\n\n   if self.affine then\n      -- multiply with gamma and add beta\n      self.weightSign:copy(self.weight)\n      self.weightSign:sign()\n      self.weight_ap2:copy(torch.pow(2,torch.round(torch.log(self.weight:clone():abs()):div(math.log(2))))):cmul(self.weightSign)\n      --self.weight:fill(1) --Almost similar results\n      self.buffer:repeatTensor(self.weight_ap2, nBatch, 1)\n      self.output:cmul(self.buffer)\n      self.buffer:repeatTensor(self.bias, nBatch, 1)\n      self.output:add(self.buffer)\n   end\n   return self.output\nend\n\nfunction BatchNormalizationShiftPow2:updateGradInput(input, gradOutput)\n   assert(input:dim() == 2, 'only mini-batch supported')\n   assert(gradOutput:dim() == 2, 'only mini-batch supported')\n   assert(self.train == true, 'should be in training mode when self.train is true')\n   local nBatch = input:size(1)\n\n   self.gradInput:cmul(self.centered, gradOutput)\n   self.buffer:mean(self.gradInput, 1)\n   self.gradInput:repeatTensor(self.buffer, nBatch, 1)\n   self.gradInput:cmul(self.centered):mul(-1)\n   self.buffer:repeatTensor(self.std, nBatch, 1)\n   self.gradInput:cmul(self.buffer):cmul(self.buffer)\n\n   self.buffer:mean(gradOutput, 1)\n   self.buffer:repeatTensor(self.buffer, nBatch, 1)\n   
self.gradInput:add(gradOutput):add(-1, self.buffer)\n   self.buffer:repeatTensor(self.std, nBatch, 1)\n   self.gradInput:cmul(self.buffer)\n\n   if self.affine then\n      self.buffer:repeatTensor(self.weight_ap2, nBatch, 1)\n      self.gradInput:cmul(self.buffer)\n   end\n\n   return self.gradInput\nend\n\nfunction BatchNormalizationShiftPow2:accGradParameters(input, gradOutput, scale)\n   if self.affine then\n      scale = scale or 1.0\n      self.buffer2:resizeAs(self.normalized):copy(self.normalized)\n      self.buffer2:cmul(gradOutput)\n      self.buffer:sum(self.buffer2, 1) -- sum over mini-batch\n      self.gradWeight:add(scale, self.buffer)\n      self.buffer:sum(gradOutput, 1) -- sum over mini-batch\n      self.gradBias:add(scale, self.buffer)\n   end\nend\n"
  },
  {
    "path": "Models/BinarizedNeurons.lua",
    "content": "local BinarizedNeurons,parent = torch.class('BinarizedNeurons', 'nn.Module')\n\n\nfunction BinarizedNeurons:__init(stcFlag)\n   parent.__init(self)\n   self.stcFlag = stcFlag\n   self.randmat=torch.Tensor();\n   self.outputR=torch.Tensor();\n end\nfunction BinarizedNeurons:updateOutput(input)\n    self.randmat:resizeAs(input);\n    self.outputR:resizeAs(input);\n    self.output:resizeAs(input);\n    self.outputR:copy(input):add(1):div(2)\n     if self.train and self.stcFlag then\n       local mask=self.outputR-self.randmat:rand(self.randmat:size())\n       self.output=mask:sign()\n     else\n       self.output:copy(self.outputR):add(-0.5):sign()\n     end\n   return self.output\nend\n\nfunction BinarizedNeurons:updateGradInput(input, gradOutput)\n        self.gradInput:resizeAs(gradOutput)\n        self.gradInput:copy(gradOutput) --:mul(0.5)\n   return self.gradInput\nend\n"
  },
  {
    "path": "Models/BinaryLinear.lua",
    "content": "--require 'randomkit'\n\nlocal BinaryLinear, parent = torch.class('BinaryLinear', 'nn.Linear')\n\nfunction BinaryLinear:__init(inputSize, outputSize,stcWeights)\n   local delayedReset = self.reset\n   self.reset = function() end\n   parent.__init(self, inputSize, outputSize)\n   self.reset = delayedReset\n\n   self.weight = torch.Tensor(outputSize, inputSize)\n   self.weightB = torch.Tensor(outputSize, inputSize)\n   self.weightOrg = torch.Tensor(outputSize, inputSize)\n   self.maskStc = torch.Tensor(outputSize, inputSize)\n   self.randmat = torch.Tensor(outputSize, inputSize)\n   self.bias = torch.Tensor(outputSize)\n   self.gradWeight = torch.Tensor(outputSize, inputSize)\n   self.gradBias = torch.Tensor(outputSize)\n   self.stcWeights=stcWeights\n   self:reset()\n   -- should nil for serialization, the reset will still work\n   self.reset = nil\nend\n\nfunction BinaryLinear:reset(stdv)\n   if stdv then\n      stdv = stdv * math.sqrt(3)\n   else\n      stdv = 1./math.sqrt(self.weight:size(2))\n   end\n   if nn.oldSeed then\n      for i=1,self.weight:size(1) do\n         self.weight:select(1, i):apply(function()\n            return torch.uniform(-1, 1)\n         end)\n         self.bias[i] = torch.uniform(-stdv, stdv)\n      end\n   else\n      self.weight:uniform(-1, 1)\n      self.bias:uniform(-stdv, stdv)\n   end\n\n   return self\nend\n\nfunction BinaryLinear:binarized(trainFlag)\n  self.weightOrg:copy(self.weight)\n  self.binaryFlag = true\n  if not self.binaryFlag then\n    self.weight:copy(self.weightOrg)\n  else\n    self.weightB:copy(self.weight):add(1):div(2):clamp(0,1)\n\n    if not self.stcWeights or not trainFlag then\n      self.weightB:round():mul(2):add(-1)\n    else\n      self.maskStc=self.weightB-self.randmat:rand(self.randmat:size())\n      self.weightB:copy(self.maskStc)\n\n    end\n  end\n\n  return  self.weightB\nend\n\nfunction BinaryLinear:updateOutput(input)\n\n  self.weightB = self:binarized(self.train)\n  
self.weight:copy(self.weightB)\n   parent.updateOutput(self,input)\n   self.weight:copy(self.weightOrg);\n   return self.output\nend\n\nfunction BinaryLinear:updateGradInput(input, gradOutput)\n\n   if self.gradInput then\n      self.weight:copy(self.weightB)\n      parent.updateGradInput(self,input, gradOutput)\n      self.weight:copy(self.weightOrg);\n      return self.gradInput\n   end\n\nend\n\nfunction BinaryLinear:accGradParameters(input, gradOutput, scale)\n  parent.accGradParameters(self,input, gradOutput, scale)\nend\n\n-- we do not need to accumulate parameters when sharing\nBinaryLinear.sharedAccUpdateGradParameters = BinaryLinear.accUpdateGradParameters\n\n\nfunction BinaryLinear:__tostring__()\n  return torch.type(self) ..\n      string.format('(%d -> %d)', self.weight:size(2), self.weight:size(1))\nend\n"
  },
  {
    "path": "Models/BinaryNet_Cifar10_Model.lua",
    "content": "--[[This code specify the model for CIFAR 10 dataset. This model uses the Shift based batch-normalization algorithm.\nIn this file we also secify the Glorot learning parameter and the which of the learnable parameter we clip ]]\nrequire 'nn'\nrequire './BinaryLinear.lua'\nrequire './BinarizedNeurons'\n\nlocal SpatialConvolution\nlocal SpatialMaxPooling\nif opt.type =='cuda' then\n  require 'cunn'\n  require 'cudnn'\n  require './cudnnBinarySpatialConvolution.lua'\n  SpatialConvolution = cudnnBinarySpatialConvolution\n  SpatialMaxPooling = cudnn.SpatialMaxPooling\nelse\n  require './BinarySpatialConvolution.lua'\n  SpatialConvolution = BinarySpatialConvolution\n  SpatialMaxPooling = nn.SpatialMaxPooling\nend\nif opt.SBN == true then\n  require './BatchNormalizationShiftPow2.lua'\n  require './SpatialBatchNormalizationShiftPow2.lua'\n  BatchNormalization = BatchNormalizationShiftPow2\n  SpatialBatchNormalization = SpatialBatchNormalizationShiftPow2\nelse\n  BatchNormalization = nn.BatchNormalization\n  SpatialBatchNormalization = nn.SpatialBatchNormalization\nend\nnumHid=1024;\nlocal model = nn.Sequential()\n\n-- Convolution Layers\nmodel:add(SpatialConvolution(3, 128, 3, 3 ,1,1,1,1,opt.stcWeights ))\nmodel:add(SpatialBatchNormalization(128, opt.runningVal))\nmodel:add(nn.HardTanh())\nmodel:add(BinarizedNeurons(opt.stcNeurons))\n\nmodel:add(SpatialConvolution(128, 128, 3, 3,1,1,1,1,opt.stcWeights ))\nmodel:add(SpatialMaxPooling(2, 2))\nmodel:add(SpatialBatchNormalization(128, opt.runningVal))\nmodel:add(nn.HardTanh())\nmodel:add(BinarizedNeurons(opt.stcNeurons))\n\nmodel:add(SpatialConvolution(128, 256, 3, 3 ,1,1,1,1,opt.stcWeights ))\nmodel:add(SpatialBatchNormalization(256, opt.runningVal))\nmodel:add(nn.HardTanh())\nmodel:add(BinarizedNeurons(opt.stcNeurons))\n\nmodel:add(SpatialConvolution(256, 256, 3, 3 ,1,1,1,1,opt.stcWeights ))\nmodel:add(SpatialMaxPooling(2, 2))\nmodel:add(SpatialBatchNormalization(256, 
opt.runningVal))\nmodel:add(nn.HardTanh())\nmodel:add(BinarizedNeurons(opt.stcNeurons))\n\nmodel:add(SpatialConvolution(256, 512, 3, 3,1,1,1,1,opt.stcWeights ))\nmodel:add(SpatialBatchNormalization(512, opt.runningVal))\nmodel:add(nn.HardTanh())\nmodel:add(BinarizedNeurons(opt.stcNeurons))\n\nmodel:add(SpatialConvolution(512, 512, 3, 3,1,1,1,1,opt.stcWeights ))\nmodel:add(SpatialMaxPooling(2, 2))\nmodel:add(SpatialBatchNormalization(512, opt.runningVal))\nmodel:add(nn.HardTanh())\nmodel:add(BinarizedNeurons(opt.stcNeurons))\n\nmodel:add(nn.View(512*4*4))\nmodel:add(BinaryLinear(512*4*4,numHid,opt.stcWeights))\nmodel:add(BatchNormalization(numHid))\nmodel:add(nn.HardTanh())\nmodel:add(BinarizedNeurons(opt.stcNeurons))\n\nmodel:add(BinaryLinear(numHid,numHid,opt.stcWeights))\nmodel:add(BatchNormalization(numHid, opt.runningVal))\nmodel:add(nn.HardTanh())\nmodel:add(BinarizedNeurons(opt.stcNeurons))\n\nmodel:add(BinaryLinear(numHid,10,opt.stcWeights))\nmodel:add(nn.BatchNormalization(10))\n\nlocal dE, param = model:getParameters()\nlocal weight_size = dE:size(1)\nlocal learningRates = torch.Tensor(weight_size):fill(0)\nlocal clipvector = torch.Tensor(weight_size):fill(1)\nlocal counter = 0\nfor i, layer in ipairs(model.modules) do\n   if layer.__typename == 'BinaryLinear' then\n      local weight_size = layer.weight:size(1)*layer.weight:size(2)\n      local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]+size_w[2]))\n      GLR=(math.pow(2,torch.round(math.log(GLR)/(math.log(2)))))\n      learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)\n      clipvector[{{counter+1, counter+weight_size}}]:fill(1)\n      counter = counter+weight_size\n      local bias_size = layer.bias:size(1)\n      learningRates[{{counter+1, counter+bias_size}}]:fill(GLR)\n      clipvector[{{counter+1, counter+bias_size}}]:fill(0)\n      counter = counter+bias_size\n    elseif layer.__typename == 'BatchNormalizationShiftPow2' then\n        local weight_size = 
layer.weight:size(1)\n        local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]))\n        learningRates[{{counter+1, counter+weight_size}}]:fill(1)\n        clipvector[{{counter+1, counter+weight_size}}]:fill(0)\n        counter = counter+weight_size\n        local bias_size = layer.bias:size(1)\n        learningRates[{{counter+1, counter+bias_size}}]:fill(1)\n        clipvector[{{counter+1, counter+bias_size}}]:fill(0)\n        counter = counter+bias_size\n    elseif layer.__typename == 'nn.BatchNormalization' then\n      local weight_size = layer.weight:size(1)\n      learningRates[{{counter+1, counter+weight_size}}]:fill(1)\n      clipvector[{{counter+1, counter+weight_size}}]:fill(0)\n      counter = counter+weight_size\n      local bias_size = layer.bias:size(1)\n      learningRates[{{counter+1, counter+bias_size}}]:fill(1)\n      clipvector[{{counter+1, counter+bias_size}}]:fill(0)\n      counter = counter+bias_size\n    elseif layer.__typename == 'SpatialBatchNormalizationShiftPow2' then\n        local weight_size = layer.weight:size(1)\n        local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]))\n        learningRates[{{counter+1, counter+weight_size}}]:fill(1)\n        clipvector[{{counter+1, counter+weight_size}}]:fill(0)\n        counter = counter+weight_size\n        local bias_size = layer.bias:size(1)\n        learningRates[{{counter+1, counter+bias_size}}]:fill(1)\n        clipvector[{{counter+1, counter+bias_size}}]:fill(0)\n        counter = counter+bias_size\n    elseif layer.__typename == 'nn.SpatialBatchNormalization' then\n            local weight_size = layer.weight:size(1)\n            local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]))\n            learningRates[{{counter+1, counter+weight_size}}]:fill(1)\n            clipvector[{{counter+1, counter+weight_size}}]:fill(0)\n            counter = counter+weight_size\n            local bias_size = layer.bias:size(1)\n            
learningRates[{{counter+1, counter+bias_size}}]:fill(1)\n            clipvector[{{counter+1, counter+bias_size}}]:fill(0)\n            counter = counter+bias_size\n    elseif layer.__typename == 'cudnnBinarySpatialConvolution' then\n      local size_w=layer.weight:size();\n      local weight_size = size_w[1]*size_w[2]*size_w[3]*size_w[4]\n\n      local filter_size=size_w[3]*size_w[4]\n      GLR=1/torch.sqrt(1.5/(size_w[1]*filter_size+size_w[2]*filter_size))\n      GLR=(math.pow(2,torch.round(math.log(GLR)/(math.log(2)))))\n      learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)\n      clipvector[{{counter+1, counter+weight_size}}]:fill(1)\n      counter = counter+weight_size\n      local bias_size = layer.bias:size(1)\n      learningRates[{{counter+1, counter+bias_size}}]:fill(GLR)\n      clipvector[{{counter+1, counter+bias_size}}]:fill(0)\n      counter = counter+bias_size\n      elseif layer.__typename == 'BinarySpatialConvolution' then\n        local size_w=layer.weight:size();\n        local weight_size = size_w[1]*size_w[2]*size_w[3]*size_w[4]\n\n        local filter_size=size_w[3]*size_w[4]\n        GLR=1/torch.sqrt(1.5/(size_w[1]*filter_size+size_w[2]*filter_size))\n        GLR=(math.pow(2,torch.round(math.log(GLR)/(math.log(2)))))\n        learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)\n        clipvector[{{counter+1, counter+weight_size}}]:fill(1)\n        counter = counter+weight_size\n        local bias_size = layer.bias:size(1)\n        learningRates[{{counter+1, counter+bias_size}}]:fill(GLR)\n        clipvector[{{counter+1, counter+bias_size}}]:fill(0)\n        counter = counter+bias_size\n\n  end\nend\n-- clip all parameter\nclipvector:fill(1)\n--\nprint(learningRates:eq(0):sum())\nprint(learningRates:ne(0):sum())\nprint(clipvector:ne(0):sum())\nprint(counter)\nreturn {\n     model = model,\n     lrs = learningRates,\n     clipV =clipvector,\n  }\n"
  },
  {
    "path": "Models/BinaryNet_MNIST_Model.lua",
    "content": "--[[This code specify the model for MNIST dataset. This model uses the Shift based batch-normalization algorithm.\nIn this file we also secify the Glorot learning parameter and which of the learnable parameter we clip ]]\nrequire 'nn'\nrequire './BinaryLinear.lua'\n\nrequire './BinarizedNeurons'\nif opt.type=='cuda' then\n  require 'cunn'\n  require 'cudnn'\nend\n\nlocal BatchNormalization;\nif opt.SBN == true then\n  require './BatchNormalizationShiftPow2'\n  BatchNormalization = BatchNormalizationShiftPow2\nelse\n  BatchNormalization = nn.BatchNormalization\nend\n\nlocal model = nn.Sequential()\nlocal numHid =2048\n-- Convolution Layers\nmodel:add(nn.View(-1,784))\n\nmodel:add(BinaryLinear(784,numHid))\nmodel:add(BatchNormalization(numHid, opt.runningVal))\nmodel:add(nn.HardTanh())\nmodel:add(BinarizedNeurons(opt.stcNeurons))\nmodel:add(BinaryLinear(numHid,numHid,opt.stcWeights))\nmodel:add(BatchNormalization(numHid, opt.runningVal))\nmodel:add(nn.HardTanh())\nmodel:add(BinarizedNeurons(opt.stcNeurons))\nmodel:add(BinaryLinear(numHid,numHid,opt.stcWeights))\nmodel:add(BatchNormalization(numHid, opt.runningVal))\nmodel:add(nn.HardTanh())\nmodel:add(BinarizedNeurons(opt.stcNeurons))\nmodel:add(BinaryLinear(numHid,10,opt.stcWeights))\nmodel:add(nn.BatchNormalization(10))\n\n\n\nlocal dE, param = model:getParameters()\nlocal weight_size = dE:size(1)\nlocal learningRates = torch.Tensor(weight_size):fill(0)\nlocal clipvector = torch.Tensor(weight_size):fill(0)\n\nlocal counter = 0\nfor i, layer in ipairs(model.modules) do\n   if layer.__typename == 'BinaryLinear' then\n      local weight_size = layer.weight:size(1)*layer.weight:size(2)\n      local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]+size_w[2]))\n      learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)\n      clipvector[{{counter+1, counter+weight_size}}]:fill(1)\n      counter = counter+weight_size\n      local bias_size = layer.bias:size(1)\n      
learningRates[{{counter+1, counter+bias_size}}]:fill(GLR)\n      clipvector[{{counter+1, counter+bias_size}}]:fill(0)\n      counter = counter+bias_size\n    elseif layer.__typename == 'BatchNormalizationShiftPow2' then\n        local weight_size = layer.weight:size(1)\n        local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]))\n        learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)\n        clipvector[{{counter+1, counter+weight_size}}]:fill(0)\n        counter = counter+weight_size\n        local bias_size = layer.bias:size(1)\n        learningRates[{{counter+1, counter+bias_size}}]:fill(1)\n        clipvector[{{counter+1, counter+bias_size}}]:fill(0)\n        counter = counter+bias_size\n    elseif layer.__typename == 'nn.BatchNormalization' then\n      local weight_size = layer.weight:size(1)\n      local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]))\n      learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)\n      clipvector[{{counter+1, counter+weight_size}}]:fill(0)\n      counter = counter+weight_size\n      local bias_size = layer.bias:size(1)\n      learningRates[{{counter+1, counter+bias_size}}]:fill(1)\n      clipvector[{{counter+1, counter+bias_size}}]:fill(0)\n      counter = counter+bias_size\n  end\nend\nprint(learningRates:eq(0):sum())\nprint(learningRates:ne(0):sum())\nprint(counter)\n\nreturn {\n   model = model,\n   lrs = learningRates,\n   clipV =clipvector,\n}\n"
  },
  {
    "path": "Models/BinaryNet_SVHN_Model.lua",
    "content": "--[[This code specify the model for SVHN dataset. This model uses the Shift based batch-normalization algorithm.\nIn this file we also secify the Glorot learning parameter and which of the learnable parameter we clip ]]\nrequire 'nn'\nrequire './BinaryLinear.lua'\nrequire './BinarizedNeurons'\n\nlocal SpatialConvolution\nif opt.type =='cuda' then\n  require 'cunn'\n  require 'cudnn'\n  require './cudnnBinarySpatialConvolution.lua'\n  SpatialConvolution = cudnnBinarySpatialConvolution\nelse\n  require './BinarySpatialConvolution.lua'\n  SpatialConvolution = BinarySpatialConvolution\nend\nif opt.SBN == true then\n  require './BatchNormalizationShiftPow2.lua'\n  require './SpatialBatchNormalizationShiftPow2.lua'\n  BatchNormalization = BatchNormalizationShiftPow2\n  SpatialBatchNormalization = SpatialBatchNormalizationShiftPow2\nelse\n  BatchNormalization = nn.BatchNormalization\n  SpatialBatchNormalization = nn.SpatialBatchNormalization\nend\n\n\nnumHid=1024;\nlocal model = nn.Sequential()\n\n-- Convolution Layers\nmodel:add(SpatialConvolution(3, 64, 3, 3 ,1,1,1,1,opt.stcWeights ))\nmodel:add(SpatialBatchNormalization(64, opt.runningVal))\nmodel:add(nn.HardTanh())\nmodel:add(BinarizedNeurons(opt.stcNeurons))\n\nmodel:add(SpatialConvolution(64, 64, 3, 3,1,1,1,1,opt.stcWeights ))\nmodel:add(cudnn.SpatialMaxPooling(2, 2))\nmodel:add(SpatialBatchNormalization(64, opt.runningVal))\nmodel:add(nn.HardTanh())\nmodel:add(BinarizedNeurons(opt.stcNeurons))\n\n\nmodel:add(SpatialConvolution(64, 128, 3, 3 ,1,1,1,1,opt.stcWeights ))\nmodel:add(SpatialBatchNormalization(128, opt.runningVal))\nmodel:add(nn.HardTanh())\nmodel:add(BinarizedNeurons(opt.stcNeurons))\n\nmodel:add(SpatialConvolution(128, 128, 3, 3 ,1,1,1,1,opt.stcWeights ))\nmodel:add(cudnn.SpatialMaxPooling(2, 2))\nmodel:add(SpatialBatchNormalization(128, opt.runningVal))\nmodel:add(nn.HardTanh())\nmodel:add(BinarizedNeurons(opt.stcNeurons))\n\n\nmodel:add(SpatialConvolution(128, 256, 3, 
3,1,1,1,1,opt.stcWeights ))\nmodel:add(SpatialBatchNormalization(256, opt.runningVal))\nmodel:add(nn.HardTanh())\nmodel:add(BinarizedNeurons(opt.stcNeurons))\n\nmodel:add(SpatialConvolution(256, 256, 3, 3,1,1,1,1,opt.stcWeights ))\nmodel:add(cudnn.SpatialMaxPooling(2, 2))\nmodel:add(SpatialBatchNormalization(256, opt.runningVal))\nmodel:add(nn.HardTanh())\nmodel:add(BinarizedNeurons(opt.stcNeurons))\n\nmodel:add(nn.View(256*4*4))\n\nmodel:add(BinaryLinear(256*4*4,numHid,opt.stcWeights))\nmodel:add(BatchNormalization(numHid, opt.runningVal))\nmodel:add(nn.HardTanh())\nmodel:add(BinarizedNeurons(opt.stcNeurons))\n\nmodel:add(BinaryLinear(numHid,numHid,opt.stcWeights))\nmodel:add(BatchNormalization(numHid, opt.runningVal))\nmodel:add(nn.HardTanh())\nmodel:add(BinarizedNeurons(opt.stcNeurons))\n\n\nmodel:add(BinaryLinear(numHid,10,opt.stcWeights))\nmodel:add(nn.BatchNormalization(10))\n\nlocal dE, param = model:getParameters()\nlocal weight_size = dE:size(1)\nlocal learningRates = torch.Tensor(weight_size):fill(0)\nlocal clipvector = torch.Tensor(weight_size):fill(0)\nlocal counter = 0\nfor i, layer in ipairs(model.modules) do\n   if layer.__typename == 'BinaryLinear' then\n      local weight_size = layer.weight:size(1)*layer.weight:size(2)\n      local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]+size_w[2]))\n      GLR=(math.pow(2,torch.round(math.log(GLR)/(math.log(2)))))\n      learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)\n      clipvector[{{counter+1, counter+weight_size}}]:fill(1)\n      counter = counter+weight_size\n      local bias_size = layer.bias:size(1)\n      learningRates[{{counter+1, counter+bias_size}}]:fill(GLR)\n      clipvector[{{counter+1, counter+bias_size}}]:fill(0)\n      counter = counter+bias_size\n    elseif layer.__typename == 'BatchNormalizationShiftPow2' then\n        local weight_size = layer.weight:size(1)\n        local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]))\n        
learningRates[{{counter+1, counter+weight_size}}]:fill(1)\n        clipvector[{{counter+1, counter+weight_size}}]:fill(0)\n        counter = counter+weight_size\n        local bias_size = layer.bias:size(1)\n        learningRates[{{counter+1, counter+bias_size}}]:fill(1)\n        clipvector[{{counter+1, counter+bias_size}}]:fill(0)\n        counter = counter+bias_size\n    elseif layer.__typename == 'nn.BatchNormalization' then\n      local weight_size = layer.weight:size(1)\n      local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]))\n      learningRates[{{counter+1, counter+weight_size}}]:fill(1)\n      clipvector[{{counter+1, counter+weight_size}}]:fill(0)\n      counter = counter+weight_size\n      local bias_size = layer.bias:size(1)\n      learningRates[{{counter+1, counter+bias_size}}]:fill(1)\n      clipvector[{{counter+1, counter+bias_size}}]:fill(0)\n      counter = counter+bias_size\n    elseif layer.__typename == 'nn.SpatialBatchNormalization' then\n        local weight_size = layer.weight:size(1)\n        local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]))\n        learningRates[{{counter+1, counter+weight_size}}]:fill(1)\n        clipvector[{{counter+1, counter+weight_size}}]:fill(0)\n        counter = counter+weight_size\n        local bias_size = layer.bias:size(1)\n        learningRates[{{counter+1, counter+bias_size}}]:fill(1)\n        clipvector[{{counter+1, counter+bias_size}}]:fill(0)\n        counter = counter+bias_size\n    elseif layer.__typename == 'nn.SpatialBatchNormalization' then\n                local weight_size = layer.weight:size(1)\n                local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]))\n                learningRates[{{counter+1, counter+weight_size}}]:fill(1)\n                clipvector[{{counter+1, counter+weight_size}}]:fill(0)\n                counter = counter+weight_size\n                local bias_size = layer.bias:size(1)\n                
learningRates[{{counter+1, counter+bias_size}}]:fill(1)\n                clipvector[{{counter+1, counter+bias_size}}]:fill(0)\n                counter = counter+bias_size\n    elseif layer.__typename == 'SpatialBatchNormalizationShiftPow2' then\n        local weight_size = layer.weight:size(1)\n        local size_w=layer.weight:size();   GLR=1/torch.sqrt(1.5/(size_w[1]))\n        learningRates[{{counter+1, counter+weight_size}}]:fill(1)\n        clipvector[{{counter+1, counter+weight_size}}]:fill(0)\n        counter = counter+weight_size\n        local bias_size = layer.bias:size(1)\n        learningRates[{{counter+1, counter+bias_size}}]:fill(1)\n        clipvector[{{counter+1, counter+bias_size}}]:fill(0)\n        counter = counter+bias_size\n    elseif layer.__typename == 'cudnnBinarySpatialConvolution' then\n      local size_w=layer.weight:size();\n      local weight_size = size_w[1]*size_w[2]*size_w[3]*size_w[4]\n\n      local filter_size=size_w[3]*size_w[4]\n      GLR=1/torch.sqrt(1.5/(size_w[1]*filter_size+size_w[2]*filter_size))\n      GLR=(math.pow(2,torch.round(math.log(GLR)/(math.log(2)))))\n      learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)\n      clipvector[{{counter+1, counter+weight_size}}]:fill(1)\n      counter = counter+weight_size\n      local bias_size = layer.bias:size(1)\n      learningRates[{{counter+1, counter+bias_size}}]:fill(GLR)\n      clipvector[{{counter+1, counter+bias_size}}]:fill(0)\n      counter = counter+bias_size\n    elseif layer.__typename == 'BinarySpatialConvolution' then\n        local size_w=layer.weight:size();\n        local weight_size = size_w[1]*size_w[2]*size_w[3]*size_w[4]\n\n        local filter_size=size_w[3]*size_w[4]\n        GLR=1/torch.sqrt(1.5/(size_w[1]*filter_size+size_w[2]*filter_size))\n        GLR=(math.pow(2,torch.round(math.log(GLR)/(math.log(2)))))\n        learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)\n        clipvector[{{counter+1, counter+weight_size}}]:fill(1)\n       
 counter = counter+weight_size\n        local bias_size = layer.bias:size(1)\n        learningRates[{{counter+1, counter+bias_size}}]:fill(GLR)\n        clipvector[{{counter+1, counter+bias_size}}]:fill(0)\n        counter = counter+bias_size\n\n  end\nend\n\nprint(learningRates:eq(0):sum())\nprint(learningRates:ne(0):sum())\nprint(clipvector:ne(0):sum())\nprint(counter)\nreturn {\n     model = model,\n     lrs = learningRates,\n     clipV =clipvector,\n  }\n"
  },
  {
    "path": "Models/BinarySpatialConvolution.lua",
    "content": "local BinarySpatialConvolution, parent = torch.class('BinarySpatialConvolution', 'nn.SpatialConvolution')\n\nfunction BinarySpatialConvolution:__init(nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH)\n  local delayedReset = self.reset\n  self.reset = function() end\n  parent.__init(self, nInputPlane, nOutputPlane, kW, kH, dW, dH)\n  self.reset = delayedReset\n  self.padW = padW or 0\n  self.padH = padH or 0\n  self.stcWeights = stcWeights or false\n  self.groups = groups or 1\n  assert(nInputPlane % self.groups == 0,\n         'nInputPlane should be divisible by nGroups')\n  assert(nOutputPlane % self.groups == 0,\n         'nOutputPlane should be divisible by nGroups')\n  self.weight = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)\n  self.weightB = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)\n  self.weightOrg = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)\n  self.randmat = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)\n  self.maskStc = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)\n  self:reset()\n  -- should nil for serialization, the reset will still work\n  self.reset = nil\n  self.iSize = torch.LongStorage(4):fill(0)\n\n\nend\n\nfunction BinarySpatialConvolution:reset(stdv)\n  if stdv then\n     stdv = stdv * math.sqrt(3)\n  else\n     stdv = 1/math.sqrt(self.kW*self.kH*self.nInputPlane)\n  end\n  if nn.oldSeed then\n     self.weight:apply(function()\n        return torch.uniform(-1, 1)\n     end)\n     if self.bias then\n        self.bias:apply(function()\n        return torch.uniform(-stdv, stdv)\n        end)\n     end\n  else\n     self.weight:uniform(-1, 1)\n     if self.bias then\n        self.bias:uniform(-stdv, stdv)\n     end\n  end\nend\n\nfunction BinarySpatialConvolution:binarized(trainFlag)\n  self.weightOrg:copy(self.weight)\n  self.binaryFlag = true\n  if not self.binaryFlag then\n    self.weight:copy(self.weightOrg)\n  else\n    
self.weightB:copy(self.weight):add(1):div(2):clamp(0,1)\n\n    if not self.stcWeights or not trainFlag then\n      self.weightB:round():mul(2):add(-1)\n    else\n      self.maskStc=self.weightB-self.randmat:rand(self.randmat:size())\n      self.weightB:copy(self.maskStc)\n\n    end\n  end\n\n  return  self.weightB\nend\n\nlocal function backCompatibility(self)\n   self.finput = self.finput or self.weight.new()\n   self.fgradInput = self.fgradInput or self.weight.new()\n   if self.padding then\n      self.padW = self.padding\n      self.padH = self.padding\n      self.padding = nil\n   else\n      self.padW = self.padW or 0\n      self.padH = self.padH or 0\n   end\n   if self.weight:dim() == 2 then\n      self.weight = self.weight:view(self.nOutputPlane, self.nInputPlane, self.kH, self.kW)\n   end\n   if self.gradWeight and self.gradWeight:dim() == 2 then\n      self.gradWeight = self.gradWeight:view(self.nOutputPlane, self.nInputPlane, self.kH, self.kW)\n   end\nend\n\nlocal function makeContiguous(self, input, gradOutput)\n  if not input:isContiguous() then\n    self._input = self._input or input.new()\n    self._input:resizeAs(input):copy(input)\n    input = self._input\n end\n if gradOutput then\n    if not gradOutput:isContiguous() then\n self._gradOutput = self._gradOutput or gradOutput.new()\n self._gradOutput:resizeAs(gradOutput):copy(gradOutput)\n gradOutput = self._gradOutput\n    end\n end\n return input, gradOutput\nend\n\n-- function to re-view the weight layout in a way that would make the MM ops happy\nlocal function viewWeight(self)\n   self.weight = self.weight:view(self.nOutputPlane, self.nInputPlane * self.kH * self.kW)\n   if self.gradWeight and self.gradWeight:dim() > 0 then\n      self.gradWeight = self.gradWeight:view(self.nOutputPlane, self.nInputPlane * self.kH * self.kW)\n   end\nend\n\nlocal function unviewWeight(self)\n   self.weight = self.weight:view(self.nOutputPlane, self.nInputPlane, self.kH, self.kW)\n   if self.gradWeight and 
self.gradWeight:dim() > 0 then\n      self.gradWeight = self.gradWeight:view(self.nOutputPlane, self.nInputPlane, self.kH, self.kW)\n   end\nend\n\nfunction BinarySpatialConvolution:updateOutput(input)\n   backCompatibility(self)\n   viewWeight(self)\n   input = makeContiguous(self, input)\n   self.weightB = self:binarized(self.train)\n   self.weight:copy(self.weightB)\n   input.THNN.SpatialConvolutionMM_updateOutput(\n      input:cdata(),\n      self.output:cdata(),\n      self.weight:cdata(),\n      self.bias:cdata(),\n      self.finput:cdata(),\n      self.fgradInput:cdata(),\n      self.kW, self.kH,\n      self.dW, self.dH,\n      self.padW, self.padH\n   )\n   self.weight:copy(self.weightOrg)\n   unviewWeight(self)\n   return self.output\nend\n\nfunction BinarySpatialConvolution:updateGradInput(input, gradOutput)\n   if self.gradInput then\n      backCompatibility(self)\n      viewWeight(self)\n      input, gradOutput = makeContiguous(self, input, gradOutput)\n      self.weight:copy(self.weightB)\n      input.THNN.SpatialConvolutionMM_updateGradInput(\n         input:cdata(),\n         gradOutput:cdata(),\n         self.gradInput:cdata(),\n         self.weight:cdata(),\n         -- self.bias:cdata(), -- removed from this commit https://github.com/torch/nn/commit/651103f3aabc2dd154d6bd95ad565d14009255e6\n         self.finput:cdata(),\n         self.fgradInput:cdata(),\n         self.kW, self.kH,\n         self.dW, self.dH,\n         self.padW, self.padH\n      )\n      self.weight:copy(self.weightOrg)\n      unviewWeight(self)\n      return self.gradInput\n   end\nend\n\nfunction BinarySpatialConvolution:accGradParameters(input, gradOutput, scale)\n  scale = scale or 1\n  backCompatibility(self)\n  input, gradOutput = makeContiguous(self, input, gradOutput)\n  viewWeight(self)\n  input.THNN.SpatialConvolutionMM_accGradParameters(\n     input:cdata(),\n     gradOutput:cdata(),\n     self.gradWeight:cdata(),\n     self.gradBias:cdata(),\n     
self.finput:cdata(),\n     self.fgradInput:cdata(),\n     self.kW, self.kH,\n     self.dW, self.dH,\n     self.padW, self.padH,\n     scale\n  )\n  unviewWeight(self)\nend\n\nfunction BinarySpatialConvolution:type(type,tensorCache)\n   self.finput = self.finput and torch.Tensor()\n   self.fgradInput = self.fgradInput and torch.Tensor()\n   return parent.type(self,type,tensorCache)\nend\n\nfunction BinarySpatialConvolution:__tostring__()\n   return parent.__tostring__(self)\nend\n\nfunction BinarySpatialConvolution:clearState()\n   nn.utils.clear(self, 'finput', 'fgradInput', '_input', '_gradOutput')\n   return parent.clearState(self)\nend\n"
  },
  {
    "path": "Models/SpatialBatchNormalizationShiftPow2.lua",
    "content": "--[[\n   This file implements Shift based Batch Normalization based a variant of the vanilla BN as described in the paper:\n   \"Binarized Neural Networks: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Matthieu Courbariaux, Itay Hubara, Daniel Soudry, Ran El-Yaniv, Yoshua Bengio'\n\n   The code is based on nn library\n   --]]\nlocal SpatialBatchNormalizationShiftPow2,parent = torch.class('SpatialBatchNormalizationShiftPow2', 'nn.Module')\n\nfunction SpatialBatchNormalizationShiftPow2:__init(nFeature, runningVal, eps, momentum)\n   parent.__init(self)\n   assert(nFeature and type(nFeature) == 'number',\n          'Missing argument #1: Number of feature planes. ' ..\n          'Give 0 for no affine transform')\n   self.eps = eps or 1e-5\n   self.train = true\n   self.momentum = momentum or 0.125\n   self.runningVal = runningVal or true\n   self.running_mean = torch.Tensor()\n   self.running_std = torch.Tensor()\n   self.running_std_ap2 = torch.Tensor()\n   if nFeature > 0 then self.affine = true end\n\n   if self.affine then\n      self.weight = torch.Tensor(nFeature)\n      self.weightSign = torch.Tensor(nFeature)\n      self.weight_ap2 = torch.Tensor(nFeature)\n      self.bias = torch.Tensor(nFeature)\n      self.gradWeight = torch.Tensor(nFeature)\n      self.gradBias = torch.Tensor(nFeature)\n      self:reset()\n   end\nend\n\nfunction SpatialBatchNormalizationShiftPow2:reset()\n   self.weight:fill(1)\n   self.bias:zero()\nend\n\nfunction SpatialBatchNormalizationShiftPow2:updateOutput(input)\n   assert(input:dim() == 4, 'only mini-batch supported (4D tensor), got '\n             .. input:dim() .. 
'D tensor instead')\n   local nBatch = input:size(1)\n   local nFeature = input:size(2)\n   local iH = input:size(3)\n   local iW = input:size(4)\n\n   -- buffers that are reused\n   self.buffer = self.buffer or input.new()\n   self.buffer2 = self.buffer2 or input.new()\n   self.centered = self.centered or input.new()\n   self.centered:resizeAs(input)\n   self.centeredOrg = self.centeredOrg or input.new()\n   self.centeredOrg:resizeAs(input)\n   self.centeredSign = self.centeredSign or input.new()\n   self.centeredSign:resizeAs(input)\n   self.std = self.std or input.new()\n   self.normalized = self.normalized or input.new()\n   self.normalized:resizeAs(input)\n   self.normalizedSign = self.normalizedSign or input.new()\n   self.normalizedSign:resizeAs(input)\n   self.output:resizeAs(input)\n   self.gradInput:resizeAs(input)\n   if self.train == false and self.runningVal == true then\n      assert(self.running_mean:nDimension() ~= 0,\n             'Module never run on training data. First run on some training data before evaluating.')\n      self.output:copy(input)\n      self.buffer:repeatTensor(self.running_mean:view(1, nFeature, 1, 1), nBatch, 1, iH, iW)\n      self.output:add(-1, self.buffer)\n      self.running_std_ap2:copy(torch.pow(2,torch.round(torch.log(self.running_std):div(math.log(2)))))\n      self.buffer:repeatTensor(self.running_std_ap2:view(1, nFeature, 1, 1), nBatch, 1, iH, iW)\n      self.output:cmul(self.buffer)\n   else -- training mode\n      if self.running_mean:nDimension() == 0 then\n         self.running_mean:resize(nFeature):zero()\n      end\n      if self.running_std:nDimension() == 0 then\n         self.running_std:resize(nFeature):zero()\n         self.running_std_ap2:resize(nFeature):zero()\n      end\n      -- calculate mean over mini-batch, over feature-maps\n      local in_folded = input:view(nBatch, nFeature, iH * iW)\n      self.buffer:mean(in_folded, 1)\n      self.buffer2:mean(self.buffer, 3)\n      self.running_mean:mul(1 - 
self.momentum):add(self.momentum, self.buffer2) -- add to running mean\n      self.buffer:repeatTensor(self.buffer2:view(1, nFeature, 1, 1),\n                               nBatch, 1, iH, iW)\n\n      -- subtract mean\n      self.centered:add(input, -1, self.buffer)                  -- x - E(x)\n      self.centeredOrg:copy(self.centered)\n      self.centeredSign:copy(self.centered)\n\n      self.centeredSign:sign()\n      self.centered:copy(torch.pow(2,torch.round(torch.log(self.centered:abs()):div(math.log(2))))):cmul(self.centeredSign)\n      -- calculate standard deviation over mini-batch\n\n      self.buffer:copy(self.centered):cmul(self.centeredOrg) --:abs()\n      -- calculate standard deviation over mini-batch\n\n      local buf_folded = self.buffer:view(nBatch,nFeature,iH*iW)\n      self.std:mean(self.buffer2:mean(buf_folded, 1), 3)\n      self.std:add(self.eps):sqrt():pow(-1)      -- 1 / E([x - E(x)]^2)\n      self.running_std:mul(1 - self.momentum):add(self.momentum, self.std) -- add to running stdv\n      self.std:copy(torch.pow(2,torch.round(torch.log(self.std):div(math.log(2)))))\n\n\n      self.buffer:repeatTensor(self.std:view(1, nFeature, 1, 1),\n                               nBatch, 1, iH, iW)\n\n      -- divide standard-deviation + eps\n      self.output:cmul(self.centeredOrg, self.buffer)\n      self.normalized:copy(self.output)\n      self.normalizedSign:copy(self.normalized)\n      self.normalizedSign:sign()\n      self.normalized:copy(torch.pow(2,torch.round(torch.log(self.normalized:abs()):div(math.log(2)))):cmul(self.normalizedSign))\n    --  self.normalized[self.normalized:lt(0)]=1; -- Can improve results\n   end\n\n   if self.affine then\n      -- multiply with gamma and add beta\n      self.weight_ap2:copy(self.weight)\n      self.weightSign:copy(self.weight):sign()\n      self.weight_ap2:copy(torch.pow(2,torch.round(torch.log(self.weight:clone():abs()):div(math.log(2))))):cmul(self.weightSign)\n      --self.weight:fill(1) --Almost 
similar results\n      self.buffer:repeatTensor(self.weight_ap2:view(1, nFeature, 1, 1),nBatch, 1, iH, iW)\n      self.output:cmul(self.buffer)\n      self.buffer:repeatTensor(self.bias:view(1, nFeature, 1, 1),\n                               nBatch, 1, iH, iW)\n      self.output:add(self.buffer)\n   end\n\n   return self.output\nend\n\nfunction SpatialBatchNormalizationShiftPow2:updateGradInput(input, gradOutput)\n   assert(input:dim() == 4, 'only mini-batch supported')\n   assert(gradOutput:dim() == 4, 'only mini-batch supported')\n   assert(self.train == true, 'should be in training mode when self.train is true')\n   local nBatch = input:size(1)\n   local nFeature = input:size(2)\n   local iH = input:size(3)\n   local iW = input:size(4)\n\n   self.gradInput:cmul(self.centered, gradOutput)\n   local gi_folded = self.gradInput:view(nBatch, nFeature, iH * iW)\n   self.buffer2:mean(self.buffer:mean(gi_folded, 1), 3)\n   self.gradInput:repeatTensor(self.buffer2:view(1, nFeature, 1, 1),\n                               nBatch, 1, iH, iW)\n   self.gradInput:cmul(self.centered):mul(-1)\n   self.buffer:repeatTensor(self.std:view(1, nFeature, 1, 1),\n                            nBatch, 1, iH, iW)\n   self.gradInput:cmul(self.buffer):cmul(self.buffer)\n\n   self.buffer:mean(gradOutput:view(nBatch, nFeature, iH*iW), 1)\n   self.buffer2:mean(self.buffer, 3)\n   self.buffer:repeatTensor(self.buffer2:view(1, nFeature, 1, 1),\n                            nBatch, 1, iH, iW)\n   self.gradInput:add(gradOutput):add(-1, self.buffer)\n   self.buffer:repeatTensor(self.std:view(1, nFeature, 1, 1),\n                            nBatch, 1, iH, iW)\n   self.gradInput:cmul(self.buffer)\n\n   if self.affine then\n      self.buffer:repeatTensor(self.weight_ap2:view(1, nFeature, 1, 1),\n                               nBatch, 1, iH, iW)\n      self.gradInput:cmul(self.buffer)\n   end\n\n   return self.gradInput\nend\n\nfunction SpatialBatchNormalizationShiftPow2:accGradParameters(input, 
gradOutput, scale)\n   if self.affine then\n      scale = scale or 1.0\n      local nBatch = input:size(1)\n      local nFeature = input:size(2)\n      local iH = input:size(3)\n      local iW = input:size(4)\n      self.buffer2:resizeAs(self.normalized):copy(self.normalized)\n      self.buffer2 = self.buffer2:cmul(gradOutput):view(nBatch, nFeature, iH*iW)\n      self.buffer:sum(self.buffer2, 1) -- sum over mini-batch\n      self.buffer2:sum(self.buffer, 3) -- sum over pixels\n      self.gradWeight:add(scale, self.buffer2)\n\n      self.buffer:sum(gradOutput:view(nBatch, nFeature, iH*iW), 1)\n      self.buffer2:sum(self.buffer, 3)\n      self.gradBias:add(scale, self.buffer2) -- sum over mini-batch\n   end\nend\n"
  },
  {
    "path": "Models/cudnnBinarySpatialConvolution.lua",
    "content": "local cudnnBinarySpatialConvolution, parent =\n    torch.class('cudnnBinarySpatialConvolution', 'cudnn.SpatialConvolution')\nlocal ffi = require 'ffi'\nlocal errcheck = cudnn.errcheck\n\nlocal autotunerCache = {}\nautotunerCache[1] = {} -- forward\nautotunerCache[2] = {} -- backwardFilter\nautotunerCache[3] = {} -- backwardData\n\nfunction cudnnBinarySpatialConvolution:__init(nInputPlane, nOutputPlane,\n                            kW, kH, dW, dH, padW, padH,stcWeights, groups)\n    local delayedReset = self.reset\n    self.reset = function() end\n    parent.__init(self, nInputPlane, nOutputPlane, kW, kH, dW, dH)\n    self.reset = delayedReset\n    self.padW = padW or 0\n    self.padH = padH or 0\n    self.groups = groups or 1\n    self.stcWeights = stcWeights or false\n    assert(nInputPlane % self.groups == 0,\n           'nInputPlane should be divisible by nGroups')\n    assert(nOutputPlane % self.groups == 0,\n           'nOutputPlane should be divisible by nGroups')\n    self.weight = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kH, kW)\n    self.weightB = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)\n    self.weightOrg = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)\n    self.randmat = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)\n    self.maskStc = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)\n    self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kH, kW)\n    self:reset()\n    -- should nil for serialization, the reset will still work\n    self.reset = nil\nend\n\nfunction cudnnBinarySpatialConvolution:binarized(trainFlag)\n  self.weightOrg:copy(self.weight)\n  self.binaryFlag = true\n  if not self.binaryFlag then\n    self.weight:copy(self.weightOrg)\n  else\n    self.weightB:copy(self.weight):add(1):div(2):clamp(0,1)\n\n    if not self.stcWeights or not trainFlag then\n      self.weightB:round():mul(2):add(-1)\n      --print(self.weightB)\n    else\n      
self.maskStc=self.weightB-self.randmat:rand(self.randmat:size())\n      self.weightB:copy(self.maskStc)\n\n    end\n  end\n\n  return  self.weightB\nend\n\n-- if you change the configuration of the module manually, call this\nfunction cudnnBinarySpatialConvolution:resetWeightDescriptors()\n    assert(torch.typename(self.weight) == 'torch.CudaTensor',\n           'Only Cuda supported duh!')\n    assert(torch.typename(self.bias) == 'torch.CudaTensor' or not self.bias,\n           'Only Cuda supported duh!')\n    -- for compatibility\n    self.groups = self.groups or 1\n    -- create filterDescriptor for weight\n    self.weightDesc = ffi.new('struct cudnnFilterStruct*[1]')\n    errcheck('cudnnCreateFilterDescriptor', self.weightDesc)\n    local desc = torch.IntTensor({self.nOutputPlane/self.groups,\n                              self.nInputPlane/self.groups,\n                              self.kH, self.kW})\n    errcheck('cudnnSetFilterNdDescriptor', self.weightDesc[0],\n             'CUDNN_DATA_FLOAT', 'CUDNN_TENSOR_NCHW', 4,\n             desc:data());\n    local function destroyWDesc(d)\n        errcheck('cudnnDestroyFilterDescriptor', d[0]);\n    end\n    ffi.gc(self.weightDesc, destroyWDesc)\n\n    -- create descriptor for bias\n    if self.bias then\n        self.biasDesc = cudnn.toDescriptor(self.bias:view(1, self.nOutputPlane,1,1))\n    end\nend\n\nfunction cudnnBinarySpatialConvolution:fastest(mode)\n    if mode == nil then mode = true end\n    self.fastest_mode = mode\n    self.iSize = self.iSize or torch.LongStorage(4)\n    self.iSize:fill(0)\n    return self\nend\n\nfunction cudnnBinarySpatialConvolution:setMode(fmode, bdmode, bwmode)\n    if fmode ~= nil then\n        self.fmode = fmode\n    end\n    if bdmode ~= nil then\n        self.bdmode = bdmode\n    end\n    if bwmode ~= nil then\n        self.bwmode = bwmode\n    end\n    self.iSize = self.iSize or torch.LongStorage(4)\n    self.iSize:fill(0)\n    return self\nend\n\nfunction 
cudnnBinarySpatialConvolution:resetMode()\n    self.fmode = nil\n    self.bdmode = nil\n    self.bwmode = nil\n    return self\nend\n\nfunction cudnnBinarySpatialConvolution:noBias()\n   self.bias = nil\n   self.gradBias = nil\n   return self\nend\n\nfunction cudnnBinarySpatialConvolution:createIODescriptors(input)\n    parent.createIODescriptors(self,input)\nend\n\nlocal one = torch.FloatTensor({1});\nlocal zero = torch.FloatTensor({0});\n\nlocal function makeContiguous(self, input, gradOutput)\n   if not input:isContiguous() then\n      self._input = self._input or input.new()\n      self._input:typeAs(input):resizeAs(input):copy(input)\n      input = self._input\n   end\n   if gradOutput and not gradOutput:isContiguous() then\n      self._gradOutput = self._gradOutput or gradOutput.new()\n      self._gradOutput:typeAs(gradOutput):resizeAs(gradOutput):copy(gradOutput)\n      gradOutput = self._gradOutput\n   end\n   return input, gradOutput\nend\n\nfunction cudnnBinarySpatialConvolution:updateOutput(input)\n    self.weightOrg:copy(self.weight)\n    self.weightB = self:binarized(self.train)\n    self.weight:copy(self.weightB)\n    parent.updateOutput(self,input)\n    self.weight:copy(self.weightOrg)\n    return self.output\nend\n\nfunction cudnnBinarySpatialConvolution:updateGradInput(input, gradOutput)\n    if not self.gradInput then return end\n    self.weight:copy(self.weightB)\n    parent.updateGradInput(self, input, gradOutput:contiguous(), scale)\n    self.weight:copy(self.weightOrg)\n    return self.gradInput\nend\n\nfunction cudnnBinarySpatialConvolution:accGradParameters(input, gradOutput, scale)\n    parent.accGradParameters(self, input, gradOutput:contiguous(), scale)\nend\n\nfunction cudnnBinarySpatialConvolution:clearDesc()\n    self.weightDesc = nil\n    self.biasDesc = nil\n    self.convDesc = nil\n    self.iDesc = nil\n    self.oDesc = nil\n    self.oDescForBias = nil\n    self.algType = nil\n    self.fwdAlgType = nil\n    self.bwdDataAlgType = 
nil\n    self.bwdFilterAlgType = nil\n    self.extraBuffer = nil\n    self.extraBufferSizeInBytes = nil\n    self.scaleT = nil\nend\n\nfunction cudnnBinarySpatialConvolution:write(f)\n    self:clearDesc()\n    local var = {}\n    for k,v in pairs(self) do\n        var[k] = v\n    end\n    f:writeObject(var)\nend\n\nfunction cudnnBinarySpatialConvolution:clearState()\n   self:clearDesc()\n   return nn.Module.clearState(self)\nend\n"
  },
  {
    "path": "README.md",
    "content": "Deep Networks on classification tasks using Torch\n=================================================\nThis is a complete training example for BinaryNets using Binary-Backpropagation algorithm as explained in\n\"Binarized Neural Networks: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Matthieu Courbariaux, Itay Hubara, Daniel Soudry, Ran El-Yaniv, Yoshua Bengio'\non following datasets: Cifar10/100, SVHN, MNIST\n\n## Data\nWe use dp library to extract all the data please view installation section\n\n## Dependencies\n* Torch (http://torch.ch)\n* \"DataProvider.torch\" (https://github.com/eladhoffer/DataProvider.torch) for DataProvider class.\n* \"cudnn.torch\" (https://github.com/soumith/cudnn.torch) for faster training. Can be avoided by changing \"cudnn\" to \"nn\" in models.\n* \"dp\" (https://github.com/nicholas-leonard/dp.git) for data extraction\n* \"unsup\" (https://github.com/koraykv/unsup.git) for data pre-processing\n\nTo install all dependencies (assuming torch is installed) use:\n```bash\nluarocks install https://raw.githubusercontent.com/eladhoffer/DataProvider.torch/master/dataprovider-scm-1.rockspec\nluarocks install cudnn\nluarocks install dp\nluarocks install unsup\n```\n\n## Training\nCreate pre-processing folder:\n```lua\ncd BinaryNet\nmkdir PreProcData\n```\n\nStart training using:\n```lua\nth Main_BinaryNet_Cifar10.lua -network BinaryNet_Cifar10_Model\n```\n\nor,\n\n```lua\nth Main_BinaryNet_MNIST.lua -network BinaryNet_MNIST_Model\n```\n\n## Run with Docker\nThe Docker is built from `nvidia/cuda:8.0-cudnn5-devel` with Torch commit `0219027e6c4644a0ba5c5bf137c989a0a8c9e01b`\n\n- To build image, run: `docker build -t binarynet:torch-gpu-cuda-8.0 -f Dockerfile/binarynet-torch-gpu-cuda-8.0 .` or to pull docker image: `docker pull hychiang/binarynet:torch-gpu-cuda-8.0`\n\n- To launch image with gpu, run: `docker run -it --gpus all binarynet:torch-gpu-cuda-8.0`\n\n- To train BNN with Cifar10: `th 
Main_BinaryNet_Cifar10.lua -network BinaryNet_Cifar10_Model`\n\n\n## Additional flags\n|Flag             | Default Value        |Description\n|:----------------|:--------------------:|:----------------------------------------------\n|modelsFolder     |  ./Models/           | Models Folder\n|network          |  Model.lua           | Model file - must return valid network.\n|LR               |  0.1                 | learning rate\n|LRDecay          |  0                   | learning rate decay (in # samples)\n|weightDecay      |  1e-4                | L2 penalty on the weights\n|momentum         |  0.9                 | momentum\n|batchSize        |  128                 | batch size\n|stcNeurons       |  true                | using stochastic binarization for the neurons or not\n|stcWeights       |  false               | using stochastic binarization for the weights or not\n|optimization     |  adam                | optimization method\n|SBN              |  true                | use shift based batch-normalization or not\n|runningVal       |  true                | use running mean and std or not\n|epoch            |  -1                  | number of epochs to train (-1 for unbounded)\n|threads          |  8                   | number of threads\n|type             |  cuda                | float or cuda\n|devid            |  1                   | device ID (if using CUDA)\n|load             |  none                |  load existing net weights\n|save             |  time-identifier     | save directory\n|dataset          |  Cifar10             | Dataset - Cifar10, Cifar100, STL10, SVHN, MNIST\n|dp_prepro        |  false               | preprocessing using dp lib\n|whiten           |  false               | whiten data\n|augment          |  false               | Augment training data\n|preProcDir       |  ./PreProcData/      | Data for pre-processing (means,Pinv,P)\n"
  },
  {
    "path": "SqrHingeEmbeddingCriterion.lua",
    "content": "--[[\nThis function implements the squared hinge loss criterion\n]]\nlocal SqrtHingeEmbeddingCriterion, parent = torch.class('SqrtHingeEmbeddingCriterion', 'nn.Criterion')\n\nfunction SqrtHingeEmbeddingCriterion:__init(margin)\n   parent.__init(self)\n   self.margin = margin or 1\n   self.sizeAverage = true\nend\n\nfunction SqrtHingeEmbeddingCriterion:updateOutput(input,y)\n   self.buffer = self.buffer or input.new()\n   if not torch.isTensor(y) then\n      self.ty = self.ty or input.new():resize(1)\n      self.ty[1]=y\n      y=self.ty\n   end\n\n   self.buffer:resizeAs(input):copy(input)\n   self.buffer:cmul(y):mul(-1):add(self.margin)\n   self.buffer[torch.le(self.buffer ,0)]=0\n   self.output=self.buffer:clone():pow(2):sum()\n\n   if (self.sizeAverage == nil or self.sizeAverage == true) then\n      self.output = self.output / input:nElement()\n   end\n\n   return self.output\nend\n\nfunction SqrtHingeEmbeddingCriterion:updateGradInput(input, y)\n   if not torch.isTensor(y) then self.ty[1]=y; y=self.ty end\n   self.gradInput:resizeAs(input):copy(y):mul(-2):cmul(self.buffer)\n   self.gradInput[torch.cmul(y,input):gt(self.margin)] = 0\n   if (self.sizeAverage == nil or self.sizeAverage == true) then\n      self.gradInput:mul(1 / input:nElement())\n   end\n   return self.gradInput\nend\n"
  },
  {
    "path": "adaMax_binary_clip_shift.lua",
    "content": "--[[ An implementation of Shift based AdaMax based on http://arxiv.org/pdf/1412.6980.pdf as described in the paper:\n   \"Binarized Neural Networks: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Matthieu Courbariaux, Itay Hubara, Daniel Soudry, Ran El-Yaniv, Yoshua Bengio\"\n\nNote that this function performs the weight clipping as well\n\nARGS:\n\n- 'opfunc' : a function that takes a single input (X), the point\n             of evaluation, and returns f(X) and df/dX\n- 'x'      : the initial point\n- 'config' : a table with configuration parameters for the optimizer\n- 'config.learningRate'      : learning rate\n- 'config.beta1'             : first moment coefficient\n- 'config.beta2'             : second moment coefficient\n- 'config.epsilon'           : for numerical stability\n- 'state'                    : a table describing the state of the optimizer; after each\n                              call the state is modified\n\nRETURN:\n- `x`     : the new x vector\n- `f(x)`  : the function, evaluated before the update\n\n]]\n\nfunction adaMax_binary_clip_shift(opfunc, x, config, state)\n    -- (0) get/update state\n    local config = config or {}\n    local state = state or config\n    local lr = config.learningRate or 0.002\n    local GLRvec = config.GLRvec or 1\n    local clipV = config.clipV or 0\n\n    local beta1 = config.beta1 or 0.9\n    local beta2 = config.beta2 or 0.999\n    local epsilon = config.epsilon or 2^-27\n\n    -- (1) evaluate f(x) and df/dx\n    local fx, dfdx = opfunc(x)\n    -- Initialization\n    state.t = state.t or 0\n    -- Exponential moving average of gradient values\n    state.m = state.m or x.new(dfdx:size()):zero()\n    -- Exponential moving average of squared gradient values\n    state.v = state.v or x.new(dfdx:size()):zero()\n    -- A tmp tensor to hold the sqrt(v) + epsilon\n    state.denom = state.denom or x.new(dfdx:size()):zero()\n\n    state.t = state.t + 1\n\n    -- Decay 
the first and second moment running average coefficient\n    state.m:mul(beta1):add(1-beta1, dfdx)\n    state.v:copy( torch.cmax(state.v:mul(beta2),dfdx:abs()) )\n    local biasCorrection1 = 1 - beta1^state.t\n\n    local stepSize = lr/biasCorrection1 --math.sqrt(biasCorrection2)/biasCorrection1\n\n    stepSize=math.pow(2,torch.round(math.log(stepSize)/(math.log(2))))\n    -- (2) update x\n    local tmp=torch.zeros(x:size())\n    if opt.type == 'cuda' then\n      tmp=tmp:cuda()\n    end\n\n\n    state.v:copy(torch.pow(2,torch.round(torch.log(state.v):div(math.log(2)))))\n    state.v:add(epsilon)\n    tmp:addcdiv(1, state.m, state.v)\n    -- Multiply by Glorot learning rate vector\n    x:addcmul(-stepSize, tmp, GLRvec)\n    -- Clip to [-1,1]\n    x[clipV:eq(1)]=x[clipV:eq(1)]:clamp(-1,1)\n    -- return x*, f(x) before optimization\n    return x, {fx}\nend\n"
  },
  {
    "path": "adam_binary_clip_b.lua",
    "content": "--[[ An implementation of Adam http://arxiv.org/pdf/1412.6980.pdf\n\nNote that this function performs the weight clipping as well\n\nARGS:\n\n- 'opfunc' : a function that takes a single input (X), the point\n             of evaluation, and returns f(X) and df/dX\n- 'x'      : the initial point\n- 'config' : a table with configuration parameters for the optimizer\n- 'config.learningRate'      : learning rate\n- 'config.beta1'             : first moment coefficient\n- 'config.beta2'             : second moment coefficient\n- 'config.epsilon'           : for numerical stability\n- 'config.GLRvec'            : per-parameter (Glorot) learning rate scaling vector\n- 'config.clipV'             : mask tensor; parameters whose mask entry is 1 are clamped to [-1,1]\n- 'state'                    : a table describing the state of the optimizer; after each\n                              call the state is modified\n\nRETURN:\n- `x`     : the new x vector\n- `f(x)`  : the function, evaluated before the update\n\n]]\n\nfunction adam_binary_clip_b(opfunc, x, config, state)\n    -- (0) get/update state\n    local config = config or {}\n    local state = state or config\n    local lr = config.learningRate or 0.001\n    local GLRvec = config.GLRvec or 1\n    -- Read the clip mask from config (it was previously referenced as an\n    -- undefined global), mirroring adaMax_binary_clip_shift.lua\n    local clipV = config.clipV or 0\n\n    local beta1 = config.beta1 or 0.9\n    local beta2 = config.beta2 or 0.999\n    local epsilon = config.epsilon or 1e-8\n\n    -- (1) evaluate f(x) and df/dx\n    local fx, dfdx = opfunc(x)\n    --print(lr,dfdx:size())\n    -- Initialization\n    state.t = state.t or 0\n    -- Exponential moving average of gradient values\n    state.m = state.m or x.new(dfdx:size()):zero()\n    -- Exponential moving average of squared gradient values\n    state.v = state.v or x.new(dfdx:size()):zero()\n    -- A tmp tensor to hold the sqrt(v) + epsilon\n    state.denom = state.denom or x.new(dfdx:size()):zero()\n\n    state.t = state.t + 1\n\n    -- Decay the first and second moment running average coefficient\n    state.m:mul(beta1):add(1-beta1, dfdx)\n    state.v:mul(beta2):addcmul(1-beta2, dfdx, dfdx)\n\n    state.denom:copy(state.v):sqrt():add(epsilon)\n\n    local biasCorrection1 = 1 - beta1^state.t\n    local biasCorrection2 = 1 - beta2^state.t\n    local stepSize = lr * math.sqrt(biasCorrection2)/biasCorrection1\n    -- (2) update x\n    local tmp=torch.zeros(x:size())\n    if opt.type == 'cuda' then\n      tmp=tmp:cuda()\n    end\n\n    tmp:addcdiv(1, state.m, state.denom)\n    x:addcmul(-stepSize, tmp, GLRvec)\n    -- Clip the masked (binary) weights to [-1,1]\n    x[clipV:eq(1)]=x[clipV:eq(1)]:clamp(-1,1)\n\n    return x, {fx}\nend\n"
  }
]