Repository: itayhubara/BinaryNet
Branch: master
Commit: c23b86285cd1
Files: 18
Total size: 98.9 KB

Directory structure:
gitextract_kj09dah_/
├── Data.lua
├── Dockerfile/
│   └── binarynet-torch-gpu-cuda-8.0
├── Main_BinaryNet_Cifar10.lua
├── Main_BinaryNet_MNIST.lua
├── Main_BinaryNet_SVHN.lua
├── Models/
│   ├── BatchNormalizationShiftPow2.lua
│   ├── BinarizedNeurons.lua
│   ├── BinaryLinear.lua
│   ├── BinaryNet_Cifar10_Model.lua
│   ├── BinaryNet_MNIST_Model.lua
│   ├── BinaryNet_SVHN_Model.lua
│   ├── BinarySpatialConvolution.lua
│   ├── SpatialBatchNormalizationShiftPow2.lua
│   └── cudnnBinarySpatialConvolution.lua
├── README.md
├── SqrHingeEmbeddingCriterion.lua
├── adaMax_binary_clip_shift.lua
└── adam_binary_clip_b.lua

================================================
FILE CONTENTS
================================================

================================================
FILE: Data.lua
================================================
--[[
This code creates the training, test and validation datasets and performs
different kinds of preprocessing.
It is based on Elad Hoffer's Data.lua file from the ConvNet-torch library
(https://github.com/eladhoffer/ConvNet-torch.git) and uses:
 - Elad Hoffer's DataProvider.torch library: https://github.com/eladhoffer/DataProvider.torch.git
 - Nicholas Leonard's dp library: https://github.com/nicholas-leonard/dp.git
 - Koray Kavukcuoglu's unsup library: https://github.com/koraykv/unsup.git
Returns a table {TrainData, TestData, ValidData, Classes}.
]]
require 'dp'
local DataProvider = require 'DataProvider'

-- Options are inherited from the calling script via the global `opt`.
local opt = opt or {}
local Dataset = opt.dataset or 'Cifar10'
local PreProcDir = opt.preProcDir or './PreProcData/'
local Whiten = opt.whiten or false
local NormelizeWhiten = opt.NormelizeWhiten or false
local DataPath = opt.datapath or '/home/itayh/Datasets/'
local normalization = opt.normalization or 'simple'
local format = opt.format or 'rgb'

local TestData
local TrainData
local ValidData
local Classes

if Dataset == 'Cifar100' then
    -- Cached (already whitened) tensors are reused when present.
    local file_valid = paths.concat(PreProcDir, format .. 'whiten_valid.t7')
    local file_train = paths.concat(PreProcDir, format .. 'whiten_train.t7')
    local file_test = paths.concat(PreProcDir, format .. 'whiten_test.t7')
    if (paths.filep(file_valid) and paths.filep(file_train) and paths.filep(file_test)) then
        ValidData = torch.load(file_valid)
        TrainData = torch.load(file_train)
        TestData = torch.load(file_test)
    else
        if paths.dirp(PreProcDir) == false then
            -- FIX: create the configured directory (was hard-coded 'mkdir PreProcData/Cifar100',
            -- which ignored a custom -preProcDir and failed when the parent dir was missing)
            sys.execute('mkdir -p ' .. PreProcDir)
        end
        local input_preprocess = {} -- FIX: was an accidental global
        table.insert(input_preprocess, dp.ZCA())
        local ds = dp.Cifar100{scale={0,1}, valid_ratio=0.1, input_preprocess=input_preprocess} -- FIX: was an accidental global
        ValidData = {data=ds:validSet():inputs():input():clone():float(),
                     label=ds:validSet():targets():input():clone():byte()}
        TrainData = {data=ds:trainSet():inputs():input():float(),
                     label=ds:trainSet():targets():input():byte()}
        TestData = {data=ds:testSet():inputs():input():float(),
                    label=ds:testSet():targets():input():byte()}
        collectgarbage()
        torch.save(file_valid, ValidData)
        torch.save(file_train, TrainData)
        torch.save(file_test, TestData)
    end
    -- NOTE(review): no Classes table is set for Cifar100, so the returned
    -- Classes field is nil for this dataset - confirm intended usage.
elseif Dataset == 'Cifar10' then
    local file_valid = paths.concat(PreProcDir, format .. 'whiten_valid.t7')
    local file_train = paths.concat(PreProcDir, format .. 'whiten_train.t7')
    local file_test = paths.concat(PreProcDir, format .. 'whiten_test.t7')
    if (paths.filep(file_valid) and paths.filep(file_train) and paths.filep(file_test)) then
        ValidData = torch.load(file_valid)
        TrainData = torch.load(file_train)
        TestData = torch.load(file_test)
    else
        if paths.dirp(PreProcDir) == false then
            sys.execute('mkdir -p ' .. PreProcDir) -- FIX: was hard-coded 'mkdir PreProcData/Cifar10'
        end
        local input_preprocess = {} -- FIX: was an accidental global
        table.insert(input_preprocess, dp.ZCA())
        local ds = dp.Cifar10{scale={0,1}, valid_ratio=0.1, input_preprocess=input_preprocess} -- FIX: was an accidental global
        ValidData = {data=ds:validSet():inputs():input():float(),
                     label=ds:validSet():targets():input():clone():byte()}
        TrainData = {data=ds:trainSet():inputs():input():float(),
                     label=ds:trainSet():targets():input():byte()}
        TestData = {data=ds:testSet():inputs():input():float(),
                    label=ds:testSet():targets():input():byte()}
        collectgarbage()
        torch.save(file_valid, ValidData)
        torch.save(file_train, TrainData)
        torch.save(file_test, TestData)
    end
    Classes = {'airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'}
elseif Dataset == 'MNIST' then
    local file_valid = paths.concat(PreProcDir, format .. '_valid.t7')
    local file_train = paths.concat(PreProcDir, format .. '_train.t7')
    local file_test = paths.concat(PreProcDir, format .. '_test.t7')
    if (paths.filep(file_valid) and paths.filep(file_train) and paths.filep(file_test)) then
        ValidData = torch.load(file_valid)
        TrainData = torch.load(file_train)
        TestData = torch.load(file_test)
    else
        if paths.dirp(PreProcDir) == false then
            sys.execute('mkdir -p ' .. PreProcDir) -- FIX: was hard-coded 'mkdir PreProcData/MNIST'
        end
        local ds = dp.Mnist{scale={0,1}} -- FIX: was an accidental global
        ValidData = {data=ds:validSet():inputs():input():clone():float(),
                     label=ds:validSet():targets():input():clone():byte()}
        TrainData = {data=ds:trainSet():inputs():input():float(),
                     label=ds:trainSet():targets():input():byte()}
        TestData = {data=ds:testSet():inputs():input():float(),
                    label=ds:testSet():targets():input():byte()}
        collectgarbage()
        torch.save(file_valid, ValidData)
        torch.save(file_train, TrainData)
        torch.save(file_test, TestData)
    end
    Classes = {1,2,3,4,5,6,7,8,9,0}
elseif Dataset == 'SVHN' then
    -- SVHN is preprocessed with GCN followed by LeCun-style LCN (via the dp lib).
    local LCNfile_valid = paths.concat(PreProcDir, format .. 'GCN_LCN_valid.t7')
    local LCNfile_train = paths.concat(PreProcDir, format .. 'GCN_LCN_train.t7')
    local LCNfile_test = paths.concat(PreProcDir, format .. 'GCN_LCN_test.t7')
    print(LCNfile_valid)
    if (paths.filep(LCNfile_valid) and paths.filep(LCNfile_train) and paths.filep(LCNfile_test)) then
        ValidData = torch.load(LCNfile_valid)
        TrainData = torch.load(LCNfile_train)
        TestData = torch.load(LCNfile_test)
    else
        if paths.dirp(PreProcDir) == false then
            sys.execute('mkdir -p ' .. PreProcDir) -- FIX: was hard-coded 'mkdir PreProcData/SVHN'
        end
        local input_preprocess = {}
        table.insert(input_preprocess, dp.GCN{batch_size=5000, use_std=true, sqrt_bias=10})
        table.insert(input_preprocess, dp.LeCunLCN{kernel_size=9, divide_by_std=true, batch_size=5000, progress=true}) --,kernel_size=31,kernel_std=32}
        local ds = dp.Svhn{scale={0,1}, input_preprocess=input_preprocess} -- FIX: was an accidental global
        -- Each split is rescaled so its maximum value is 1.
        ValidData = {data=ds:validSet():inputs():input():float(),
                     label=ds:validSet():targets():input():byte()}
        ValidData.data:div(ValidData.data:max())
        TrainData = {data=ds:trainSet():inputs():input():float(),
                     label=ds:trainSet():targets():input():byte()}
        TrainData.data:div(TrainData.data:max())
        TestData = {data=ds:testSet():inputs():input():float(),
                    label=ds:testSet():targets():input():byte()}
        TestData.data:div(TestData.data:max())
        collectgarbage()
        torch.save(LCNfile_valid, ValidData)
        torch.save(LCNfile_train, TrainData)
        torch.save(LCNfile_test, TestData)
    end
    Classes = {1,2,3,4,5,6,7,8,9,0}
end

TrainData.data = TrainData.data:float()
TestData.data = TestData.data:float()

local TrainDataProvider = DataProvider.Container{
    Name = 'TrainingData',
    CachePrefix = nil,
    CacheFiles = false,
    Source = {TrainData.data, TrainData.label},
    MaxNumItems = 1e6,
    CopyData = false,
    TensorType = 'torch.FloatTensor',
}
local TestDataProvider = DataProvider.Container{
    Name = 'TestData',
    CachePrefix = nil,
    CacheFiles = false,
    Source = {TestData.data, TestData.label},
    MaxNumItems = 1e6,
    CopyData = false,
    TensorType = 'torch.FloatTensor',
}
local ValidDataProvider = DataProvider.Container{
    Name = 'ValidData',
    CachePrefix = nil,
    CacheFiles = false,
    Source = {ValidData.data, ValidData.label},
    MaxNumItems = 1e6,
    CopyData = false,
    TensorType = 'torch.FloatTensor',
}

--Preprocess
if format == 'yuv' then
    require 'image'
    TrainDataProvider:apply(image.rgb2yuv)
    TestDataProvider:apply(image.rgb2yuv)
end

if Whiten then
    -- ZCA whitening: the transform is computed on the training set once and
    -- cached, then applied to the validation and test sets.
    require 'unsup'
    local meanfile = paths.concat(PreProcDir, format .. 'imageMean.t7')
    local mean, P, invP
    local Pfile = paths.concat(PreProcDir, format .. 'P.t7')
    local invPfile = paths.concat(PreProcDir, format .. 'invP.t7')
    if (paths.filep(Pfile) and paths.filep(invPfile) and paths.filep(meanfile)) then
        P = torch.load(Pfile)
        invP = torch.load(invPfile)
        mean = torch.load(meanfile)
        TrainDataProvider.Data = unsup.zca_whiten(TrainDataProvider.Data, mean, P, invP)
    else
        TrainDataProvider.Data, mean, P, invP = unsup.zca_whiten(TrainDataProvider.Data)
        torch.save(Pfile, P)
        torch.save(invPfile, invP)
        torch.save(meanfile, mean)
    end
    TestDataProvider.Data = unsup.zca_whiten(TestDataProvider.Data, mean, P, invP)
    ValidDataProvider.Data = unsup.zca_whiten(ValidDataProvider.Data, mean, P, invP)
elseif opt.dp_prepro then -- FIX: was the undefined global `dp_prepro` (always nil), so the
                          -- normalization branch ran even when dp already did GCN/LCN
    -- Do nothing since we use dp lib for GCN and LCN
else
    -- Mean/std normalization, with the statistics cached on disk.
    local meanfile = paths.concat(PreProcDir, format .. normalization .. 'Mean.t7')
    local stdfile = paths.concat(PreProcDir, format .. normalization .. 'Std.t7')
    local mean, std
    local loaded = false
    if paths.filep(meanfile) and paths.filep(stdfile) then
        mean = torch.load(meanfile)
        std = torch.load(stdfile)
        loaded = true
    end
    mean, std = TrainDataProvider:normalize(normalization, mean, std)
    TestDataProvider:normalize(normalization, mean, std)
    ValidDataProvider:normalize(normalization, mean, std)
    if not loaded then
        torch.save(meanfile, mean)
        torch.save(stdfile, std)
    end
end

return {
    TrainData = TrainDataProvider,
    TestData = TestDataProvider,
    ValidData = ValidDataProvider,
    Classes = Classes
}

================================================
FILE: Dockerfile/binarynet-torch-gpu-cuda-8.0
================================================
FROM nvidia/cuda:8.0-cudnn5-devel
WORKDIR /workspace

# Install dependencies
RUN apt-get update \
 && apt-get install -y \
 build-essential git gfortran \
 python3 python3-setuptools python3-dev \
 cmake curl wget unzip libreadline-dev libjpeg-dev libpng-dev ncurses-dev \
 imagemagick gnuplot gnuplot-x11 libssl-dev libzmq3-dev graphviz vim sudo tmux

# Install OpenBLAS
RUN apt-get -y install libopenblas-dev

# Install Torch commit no: 0219027e6c4644a0ba5c5bf137c989a0a8c9e01b
RUN git clone https://github.com/torch/distro.git torch --recursive
RUN cd torch \
 && /bin/bash install-deps \
 && ./install.sh

# get torch tutorials. comment out this line if no need
RUN git clone https://github.com/torch/tutorials.git

# Install dependency for [BinaryNet](https://github.com/itayhubara/BinaryNet)
RUN /workspace/torch/install/bin/luarocks install https://raw.githubusercontent.com/eladhoffer/DataProvider.torch/master/dataprovider-scm-1.rockspec
RUN /workspace/torch/install/bin/luarocks install cudnn
RUN /workspace/torch/install/bin/luarocks install dp
RUN /workspace/torch/install/bin/luarocks install unsup

# copy BinaryNet into the image
ADD . 
BinaryNet

================================================
FILE: Main_BinaryNet_Cifar10.lua
================================================
require 'torch'
require 'xlua'
require 'optim'
require 'gnuplot'
require 'pl'
require 'trepl'
require 'adaMax_binary_clip_shift'
require 'adam_binary_clip_b'
require 'nn'
require 'SqrHingeEmbeddingCriterion'
local DataProvider = require 'DataProvider' -- FIX: dependency made explicit (used by Forward below)
----------------------------------------------------------------------
cmd = torch.CmdLine()
cmd:addTime()
cmd:text()
cmd:text('Training a convolutional network for visual classification')
cmd:text()
cmd:text('==>Options')
cmd:text('===>Model And Training Regime')
cmd:option('-modelsFolder', './Models/', 'Models Folder')
cmd:option('-network', 'Model.lua', 'Model file - must return valid network.')
cmd:option('-LR', 2^-6, 'learning rate')
cmd:option('-LRDecay', 0, 'learning rate decay (in # samples)')
cmd:option('-weightDecay', 0.0, 'L2 penalty on the weights')
cmd:option('-momentum', 0.0, 'momentum')
cmd:option('-batchSize', 200, 'batch size')
cmd:option('-stcNeurons', true, 'use stochastic binarization for the neurons')
cmd:option('-stcWeights', false, 'use stochastic binarization for the weights')
cmd:option('-optimization', 'adam', 'optimization method')
cmd:option('-SBN', true, 'shift based batch-normalization')
cmd:option('-runningVal', false, 'use running mean and std')
cmd:option('-epoch', -1, 'number of epochs to train, -1 for unbounded')
cmd:text('===>Platform Optimization')
cmd:option('-threads', 8, 'number of threads')
cmd:option('-type', 'cuda', 'float or cuda')
cmd:option('-devid', 1, 'device ID (if using CUDA)')
cmd:option('-nGPU', 1, 'num of gpu devices used')
cmd:option('-constBatchSize', false, 'do not allow varying batch sizes - e.g for ccn2 kernel')
cmd:text('===>Save/Load Options')
cmd:option('-load', '', 'load existing net weights')
cmd:option('-save', os.date():gsub(' ',''), 'save directory')
cmd:text('===>Data Options')
cmd:option('-dataset', 'Cifar10', 'Dataset - Cifar10, Cifar100, STL10, SVHN, MNIST')
cmd:option('-normalization', 'simple', 'simple - whole sample, channel - by image channel, image - mean and std images')
cmd:option('-format', 'rgb', 'rgb or yuv')
cmd:option('-whiten', true, 'whiten data')
cmd:option('-dp_prepro', false, 'preprocessing using dp lib')
cmd:option('-augment', false, 'Augment training data')
cmd:option('-preProcDir', './PreProcData/', 'Data for pre-processing (means,P,invP)')
cmd:text('===>Misc')
cmd:option('-visualize', 0, 'visualizing results')

torch.manualSeed(432)
opt = cmd:parse(arg or {})
opt.network = opt.modelsFolder .. paths.basename(opt.network, '.lua')
opt.save = paths.concat('./Results', opt.save)
opt.preProcDir = paths.concat(opt.preProcDir, opt.dataset .. '/')
-- If you choose to use an exponentially decaying learning rate, uncomment this line
--opt.LRDecay=torch.pow((2e-6/opt.LR),(1./500));
--
os.execute('mkdir -p ' .. opt.preProcDir)
torch.setnumthreads(opt.threads)
torch.setdefaulttensortype('torch.FloatTensor')
if opt.augment then
    require 'image'
end
----------------------------------------------------------------------
-- Model + Loss:
local modelAll = require(opt.network)
model = modelAll.model
GLRvec = modelAll.lrs
clipV = modelAll.clipV
local loss = SqrtHingeEmbeddingCriterion(1)
local data = require 'Data'
local classes = data.Classes
----------------------------------------------------------------------
-- This matrix records the current confusion across classes
local confusion = optim.ConfusionMatrix(classes)
local AllowVarBatch = not opt.constBatchSize
----------------------------------------------------------------------
-- Output files configuration
os.execute('mkdir -p ' .. opt.save)
cmd:log(opt.save .. '/Log.txt', opt)
local netFilename = paths.concat(opt.save, 'Net')
local logFilename = paths.concat(opt.save, 'ErrorRate.log')
local optStateFilename = paths.concat(opt.save, 'optState')
local Log = optim.Logger(logFilename)
----------------------------------------------------------------------
local TensorType = 'torch.FloatTensor'
if paths.filep(opt.load) then
    model = torch.load(opt.load)
    print('==>Loaded model from: ' .. opt.load)
    print(model)
end
if opt.type == 'cuda' then
    require 'cutorch'
    cutorch.setDevice(opt.devid)
    cutorch.setHeapTracking(true)
    model:cuda()
    GLRvec = GLRvec:cuda()
    clipV = clipV:cuda()
    loss = loss:cuda()
    TensorType = 'torch.CudaTensor'
end

---Support for multiple GPUs - currently data parallel scheme
if opt.nGPU > 1 then
    local net = model
    model = nn.DataParallelTable(1)
    for i = 1, opt.nGPU do
        cutorch.setDevice(i)
        model:add(net:clone():cuda(), i) -- Use the ith GPU
    end
    cutorch.setDevice(opt.devid)
end

-- Optimization configuration
local Weights, Gradients = model:getParameters()
----------------------------------------------------------------------
print '==> Network'
print(model)
print('==>' .. Weights:nElement() .. ' Parameters')
print '==> Loss'
print(loss)
------------------Optimization Configuration--------------------------
local optimState = {
    learningRate = opt.LR,
    momentum = opt.momentum,
    weightDecay = opt.weightDecay,
    learningRateDecay = opt.LRDecay,
    GLRvec = GLRvec,
    clipV = clipV
}
----------------------------------------------------------------------
-- Random crop + horizontal flip augmentation; a no-op unless -augment is set.
local function SampleImages(images, labels)
    if not opt.augment then
        return images, labels
    else
        local sampled_imgs = images:clone()
        for i = 1, images:size(1) do
            local sz = math.random(9) - 1
            local hflip = math.random(2) == 1
            local startx = math.random(sz)
            local starty = math.random(sz)
            local img = images[i]:narrow(2, starty, 32 - sz):narrow(3, startx, 32 - sz)
            if hflip then img = image.hflip(img) end
            img = image.scale(img, 32, 32)
            sampled_imgs[i]:copy(img)
        end
        return sampled_imgs, labels
    end
end
------------------------------
-- One full pass over Data; updates the weights when train==true.
-- Returns the mean loss over the epoch.
local function Forward(Data, train)
    local MiniBatch = DataProvider.Container{
        Name = 'GPU_Batch',
        MaxNumItems = opt.batchSize,
        Source = Data,
        ExtractFunction = SampleImages,
        TensorType = TensorType
    }
    local yt = MiniBatch.Labels
    local x = MiniBatch.Data
    local SizeData = Data:size()
    if not AllowVarBatch then
        SizeData = math.floor(SizeData / opt.batchSize) * opt.batchSize
    end
    local NumSamples = 0
    local NumBatches = 0
    local lossVal = 0
    while NumSamples < SizeData do
        MiniBatch:getNextBatch()
        local y, currLoss
        NumSamples = NumSamples + x:size(1)
        NumBatches = NumBatches + 1
        if opt.nGPU > 1 then
            model:syncParameters()
        end
        y = model:forward(x)
        -- Targets as +-1 one-hot vectors for the squared hinge criterion.
        -- FIX: local (was an accidental global); #classes instead of a hard-coded 10
        local one_hot_yt = torch.zeros(yt:size(1), #classes)
        one_hot_yt:scatter(2, yt:long():view(-1, 1), 1)
        one_hot_yt = one_hot_yt:mul(2):float():add(-1)
        if opt.type == 'cuda' then
            one_hot_yt = one_hot_yt:cuda()
        end
        currLoss = loss:forward(y, one_hot_yt)
        if train then
            local function feval() -- FIX: local (was an accidental global)
                model:zeroGradParameters()
                local dE_dy = loss:backward(y, one_hot_yt)
                model:backward(x, dE_dy)
                return currLoss, Gradients
            end
            --_G.optim[opt.optimization](feval, Weights, optimState)
            -- If you choose to use a different optimization remember to clip the weights
            adaMax_binary_clip_shift(feval, Weights, optimState)
        end
        lossVal = currLoss + lossVal
        if type(y) == 'table' then --table results - always take first prediction
            y = y[1]
        end
        confusion:batchAdd(y, one_hot_yt)
        xlua.progress(NumSamples, SizeData)
        if math.fmod(NumBatches, 100) == 0 then
            collectgarbage()
        end
    end
    return (lossVal / math.ceil(SizeData / opt.batchSize))
end
------------------------------
local function Train(Data)
    model:training()
    return Forward(Data, true)
end

local function Test(Data)
    model:evaluate()
    return Forward(Data, false)
end
------------------------------
local epoch = 1
print '\n==> Starting Training\n'
while epoch ~= opt.epoch do
    data.TrainData:shuffleItems()
    print('Epoch ' .. epoch)
    --Train
    confusion:zero()
    local LossTrain = Train(data.TrainData)
    if epoch % 10 == 0 then
        torch.save(netFilename, model)
    end
    confusion:updateValids()
    local ErrTrain = (1 - confusion.totalValid)
    if #classes <= 10 then
        print(confusion)
    end
    print('Training Error = ' .. ErrTrain)
    print('Training Loss = ' .. LossTrain)
    --validation
    confusion:zero()
    local LossValid = Test(data.ValidData)
    confusion:updateValids()
    local ErrValid = (1 - confusion.totalValid)
    if #classes <= 10 then
        print(confusion)
    end
    print('Valid Error = ' .. ErrValid)
    print('Valid Loss = ' .. LossValid)
    --Test
    confusion:zero()
    local LossTest = Test(data.TestData)
    confusion:updateValids()
    local ErrTest = (1 - confusion.totalValid)
    if #classes <= 10 then
        print(confusion)
    end
    print('Test Error = ' .. ErrTest)
    print('Test Loss = ' .. LossTest)
    Log:add{['Training Error'] = ErrTrain, ['Valid Error'] = ErrValid, ['Test Error'] = ErrTest}
    -- the training stops at epoch 3 if visualize is set to 1
    if opt.visualize == 1 then
        -- FIX: style key was 'Validation Error', which never matched the logged key 'Valid Error'
        Log:style{['Training Error'] = '-', ['Valid Error'] = '-', ['Test Error'] = '-'}
        Log:plot()
    end
    --optimState.learningRate=optimState.learningRate*opt.LRDecay
    -- Halve the learning rate every 50 epochs.
    if epoch % 50 == 0 then
        optimState.learningRate = optimState.learningRate * 0.5
    else
        optimState.learningRate = optimState.learningRate --*opt.LRDecay
    end
    print('-------------------LR-------------------')
    print(optimState.learningRate)
    epoch = epoch + 1
end

================================================
FILE: Main_BinaryNet_MNIST.lua
================================================
require 'torch'
require 'xlua'
require 'optim'
require 'gnuplot'
require 'pl'
require 'trepl'
require 'adaMax_binary_clip_shift'
require 'nn'
require 'SqrHingeEmbeddingCriterion'
local DataProvider = require 'DataProvider' -- FIX: dependency made explicit (used by Forward below)
----------------------------------------------
cmd = torch.CmdLine()
cmd:addTime()
cmd:text()
cmd:text('Training a convolutional network for visual classification')
cmd:text()
cmd:text('==>Options')
cmd:text('===>Model And Training Regime')
cmd:option('-modelsFolder', './Models/', 'Models Folder')
cmd:option('-network', 'Model.lua', 'Model file - must return valid network.')
cmd:option('-LR', 2^-6, 'learning rate')
cmd:option('-LRDecay', 0, 'learning rate decay (in # samples)')
cmd:option('-weightDecay', 0.0, 'L2 penalty on the weights')
cmd:option('-momentum', 0.0, 'momentum')
cmd:option('-batchSize', 100, 'batch size')
cmd:option('-stcNeurons', true, 'use stochastic binarization for the neurons') -- FIX: help text was a copy-paste of 'batch size'
cmd:option('-stcWeights', false, 'use stochastic binarization for the weights') -- FIX: help text was a copy-paste of 'batch size'
cmd:option('-optimization', 'adam', 'optimization method')
cmd:option('-SBN', true, 'shift based batch-normalization')
cmd:option('-runningVal', true, 'use running mean and std')
cmd:option('-epoch', -1, 'number of epochs to train, -1 for unbounded')
cmd:text('===>Platform Optimization')
cmd:option('-threads', 8, 'number of threads')
cmd:option('-type', 'cuda', 'float or 
cuda')
cmd:option('-devid', 1, 'device ID (if using CUDA)')
cmd:option('-nGPU', 1, 'num of gpu devices used')
cmd:option('-constBatchSize', false, 'do not allow varying batch sizes - e.g for ccn2 kernel')
cmd:text('===>Save/Load Options')
cmd:option('-load', '', 'load existing net weights')
cmd:option('-save', os.date():gsub(' ',''), 'save directory')
cmd:text('===>Data Options')
cmd:option('-dataset', 'MNIST', 'Dataset - Cifar10, Cifar100, STL10, SVHN, MNIST')
cmd:option('-normalization', 'simple', 'simple - whole sample, channel - by image channel, image - mean and std images')
cmd:option('-format', 'rgb', 'rgb or yuv')
cmd:option('-whiten', false, 'whiten data')
cmd:option('-dp_prepro', false, 'preprocessing using dp lib')
cmd:option('-augment', false, 'Augment training data')
cmd:option('-preProcDir', './PreProcData/', 'Data for pre-processing (means,P,invP)')
cmd:text('===>Misc')
cmd:option('-visualize', 1, 'visualizing results')

torch.manualSeed(432)
opt = cmd:parse(arg or {})
opt.network = opt.modelsFolder .. paths.basename(opt.network, '.lua')
opt.save = paths.concat('./Results', opt.save)
opt.preProcDir = paths.concat(opt.preProcDir, opt.dataset .. '/')
-- If you choose to use an exponentially decaying learning rate, uncomment this line
--opt.LRDecay=torch.pow((2e-6/opt.LR),(1./500));
--
-- FIX: was os.execute('mk1ir -p ' ..) - 'mk1ir' is a typo for 'mkdir'
-- (the Cifar10 main uses the correct 'mkdir -p' here)
os.execute('mkdir -p ' .. opt.preProcDir)
torch.setnumthreads(opt.threads)
torch.setdefaulttensortype('torch.FloatTensor')
if opt.augment then
    require 'image'
end
----------------------------------------------------------------------
-- Model + Loss:
local modelAll = require(opt.network)
model = modelAll.model
GLRvec = modelAll.lrs
clipV = modelAll.clipV
local loss = SqrtHingeEmbeddingCriterion(1)
local data = require 'Data'
local classes = data.Classes
----------------------------------------------------------------------
-- This matrix records the current confusion across classes
local confusion = optim.ConfusionMatrix(classes)
local AllowVarBatch = not opt.constBatchSize
----------------------------------------------------------------------
-- Output files configuration
os.execute('mkdir -p ' .. opt.save)
cmd:log(opt.save .. '/Log.txt', opt)
local netFilename = paths.concat(opt.save, 'Net')
local logFilename = paths.concat(opt.save, 'ErrorRate.log')
local optStateFilename = paths.concat(opt.save, 'optState')
local Log = optim.Logger(logFilename)
----------------------------------------------------------------------
local TensorType = 'torch.FloatTensor'
if paths.filep(opt.load) then
    model = torch.load(opt.load)
    print('==>Loaded model from: ' .. opt.load)
    print(model)
end
if opt.type == 'cuda' then
    require 'cutorch'
    cutorch.setDevice(opt.devid)
    cutorch.setHeapTracking(true)
    model:cuda()
    GLRvec = GLRvec:cuda()
    clipV = clipV:cuda()
    loss = loss:cuda()
    TensorType = 'torch.CudaTensor'
end

---Support for multiple GPUs - currently data parallel scheme
if opt.nGPU > 1 then
    local net = model
    model = nn.DataParallelTable(1)
    for i = 1, opt.nGPU do
        cutorch.setDevice(i)
        model:add(net:clone():cuda(), i) -- Use the ith GPU
    end
    cutorch.setDevice(opt.devid)
end

-- Optimization configuration
local Weights, Gradients = model:getParameters()
----------------------------------------------------------------------
print '==> Network'
print(model)
print('==>' .. Weights:nElement() .. ' Parameters')
print '==> Loss'
print(loss)
------------------Optimization Configuration--------------------------
local optimState = {
    learningRate = opt.LR,
    momentum = opt.momentum,
    weightDecay = opt.weightDecay,
    learningRateDecay = opt.LRDecay,
    GLRvec = GLRvec,
    clipV = clipV
}
----------------------------------------------------------------------
-- Random crop + horizontal flip augmentation; a no-op unless -augment is set.
local function SampleImages(images, labels)
    if not opt.augment then
        return images, labels
    else
        local sampled_imgs = images:clone()
        for i = 1, images:size(1) do
            local sz = math.random(9) - 1
            local hflip = math.random(2) == 1
            local startx = math.random(sz)
            local starty = math.random(sz)
            local img = images[i]:narrow(2, starty, 32 - sz):narrow(3, startx, 32 - sz)
            if hflip then img = image.hflip(img) end
            img = image.scale(img, 32, 32)
            sampled_imgs[i]:copy(img)
        end
        return sampled_imgs, labels
    end
end
------------------------------
-- One full pass over Data; updates the weights when train==true.
-- Returns the mean loss over the epoch.
local function Forward(Data, train)
    local MiniBatch = DataProvider.Container{
        Name = 'GPU_Batch',
        MaxNumItems = opt.batchSize,
        Source = Data,
        ExtractFunction = SampleImages,
        TensorType = TensorType
    }
    local yt = MiniBatch.Labels
    local x = MiniBatch.Data
    local SizeData = Data:size()
    if not AllowVarBatch then
        SizeData = math.floor(SizeData / opt.batchSize) * opt.batchSize
    end
    local NumSamples = 0
    local NumBatches = 0
    local lossVal = 0
    while NumSamples < SizeData do
        MiniBatch:getNextBatch()
        local y, currLoss
        NumSamples = NumSamples + x:size(1)
        NumBatches = NumBatches + 1
        if opt.nGPU > 1 then
            model:syncParameters()
        end
        y = model:forward(x)
        -- Targets as +-1 one-hot vectors for the squared hinge criterion.
        -- FIX: local (was an accidental global); #classes instead of a hard-coded 10
        local one_hot_yt = torch.zeros(yt:size(1), #classes)
        one_hot_yt:scatter(2, yt:long():view(-1, 1), 1)
        one_hot_yt = one_hot_yt:mul(2):float():add(-1)
        if opt.type == 'cuda' then -- FIX: was an unconditional :cuda(), crashing with -type float
            one_hot_yt = one_hot_yt:cuda()
        end
        currLoss = loss:forward(y, one_hot_yt)
        if train then
            local function feval() -- FIX: local (was an accidental global)
                model:zeroGradParameters()
                local dE_dy = loss:backward(y, one_hot_yt)
                model:backward(x, dE_dy)
                return currLoss, Gradients
            end
            adaMax_binary_clip_shift(feval, Weights, optimState)
            -- Clip the real-valued weights of the binary layers to [-1, 1]
            -- (the redundant indLayer counter that duplicated the loop index was removed).
            for _, layer in ipairs(model.modules) do
                if layer.__typename == 'cudnnBinarySpatialConvolution'
                or layer.__typename == 'BinaryLinear' then
                    layer.weight:clamp(-1, 1)
                end
            end
        end
        lossVal = currLoss + lossVal
        if type(y) == 'table' then --table results - always take first prediction
            y = y[1]
        end
        confusion:batchAdd(y, one_hot_yt)
        xlua.progress(NumSamples, SizeData)
        if math.fmod(NumBatches, 100) == 0 then
            collectgarbage()
        end
    end
    return (lossVal / math.ceil(SizeData / opt.batchSize))
end
------------------------------
local function Train(Data)
    model:training()
    return Forward(Data, true)
end

local function Test(Data)
    model:evaluate()
    return Forward(Data, false)
end
------------------------------
-- FIX: removed a duplicated 'local epoch = 1 / print' pair
local epoch = 1
print '\n==> Starting Training\n'
while epoch ~= opt.epoch do
    data.TrainData:shuffleItems()
    print('Epoch ' .. epoch)
    --Train
    confusion:zero()
    local LossTrain = Train(data.TrainData)
    if epoch % 10 == 0 then
        torch.save(netFilename, model)
    end
    confusion:updateValids()
    local ErrTrain = (1 - confusion.totalValid)
    if #classes <= 10 then
        print(confusion)
    end
    print('Training Error = ' .. ErrTrain)
    print('Training Loss = ' .. LossTrain)
    --validation
    confusion:zero()
    local LossValid = Test(data.ValidData)
    confusion:updateValids()
    local ErrValid = (1 - confusion.totalValid)
    if #classes <= 10 then
        print(confusion)
    end
    print('Valid Error = ' .. ErrValid)
    print('Valid Loss = ' .. LossValid)
    --Test
    confusion:zero()
    local LossTest = Test(data.TestData)
    confusion:updateValids()
    local ErrTest = (1 - confusion.totalValid)
    if #classes <= 10 then
        print(confusion)
    end
    print('Test Error = ' .. ErrTest)
    print('Test Loss = ' .. 
LossTest)
    Log:add{['Training Error'] = ErrTrain, ['Valid Error'] = ErrValid, ['Test Error'] = ErrTest}
    if opt.visualize == 1 then
        -- FIX: style key was 'Validation Error', which never matched the logged key 'Valid Error'
        Log:style{['Training Error'] = '-', ['Valid Error'] = '-', ['Test Error'] = '-'}
        Log:plot()
    end
    -- Halve the learning rate every 20 epochs.
    if epoch % 20 == 0 then
        optimState.learningRate = optimState.learningRate * 0.5
    else
        optimState.learningRate = optimState.learningRate --*opt.LRDecay
    end
    print('-------------------LR-------------------')
    print(optimState.learningRate)
    epoch = epoch + 1
end

================================================
FILE: Main_BinaryNet_SVHN.lua
================================================
require 'torch'
require 'xlua'
require 'optim'
require 'gnuplot'
require 'pl'
require 'trepl'
require 'adaMax_binary_clip_shift'
require 'nn'
require 'SqrHingeEmbeddingCriterion'
local DataProvider = require 'DataProvider' -- FIX: dependency made explicit (used by Forward below)
----------------------------------------------------------------------
cmd = torch.CmdLine()
cmd:addTime()
cmd:text()
cmd:text('Training a convolutional network for visual classification')
cmd:text()
cmd:text('==>Options')
cmd:text('===>Model And Training Regime')
cmd:option('-modelsFolder', './Models/', 'Models Folder')
cmd:option('-network', 'Model.lua', 'Model file - must return valid network.')
cmd:option('-LR', 2^-7, 'learning rate')
cmd:option('-LRDecay', 0, 'learning rate decay (in # samples)')
cmd:option('-weightDecay', 0.0, 'L2 penalty on the weights')
cmd:option('-momentum', 0.0, 'momentum')
cmd:option('-batchSize', 200, 'batch size')
cmd:option('-stcNeurons', true, 'use stochastic binarization for the neurons') -- FIX: help text was a copy-paste of 'batch size'
cmd:option('-stcWeights', false, 'use stochastic binarization for the weights') -- FIX: help text was a copy-paste of 'batch size'
cmd:option('-optimization', 'adam', 'optimization method')
cmd:option('-SBN', true, 'shift based batch-normalization')
cmd:option('-runningVal', true, 'use running mean and std')
cmd:option('-epoch', -1, 'number of epochs to train, -1 for unbounded')
cmd:text('===>Platform Optimization')
cmd:option('-threads', 8, 'number of threads')
cmd:option('-type', 'cuda', 'float or cuda')
cmd:option('-devid', 1, 'device ID (if using CUDA)')
cmd:option('-nGPU', 1, 'num of gpu 
devices used')
cmd:option('-constBatchSize', false, 'do not allow varying batch sizes - e.g for ccn2 kernel')
cmd:text('===>Save/Load Options')
cmd:option('-load', '', 'load existing net weights')
cmd:option('-save', os.date():gsub(' ',''), 'save directory')
cmd:text('===>Data Options')
cmd:option('-dataset', 'SVHN', 'Dataset - Cifar10, Cifar100, STL10, SVHN, MNIST')
cmd:option('-normalization', 'simple', 'simple - whole sample, channel - by image channel, image - mean and std images')
cmd:option('-format', 'rgb', 'rgb or yuv')
cmd:option('-whiten', false, 'whiten data')
cmd:option('-dp_prepro', true, 'preprocessing using dp lib')
cmd:option('-augment', false, 'Augment training data')
cmd:option('-preProcDir', './PreProcData/', 'Data for pre-processing (means,P,invP)')
cmd:text('===>Misc')
cmd:option('-visualize', 1, 'visualizing results')

torch.manualSeed(432)
opt = cmd:parse(arg or {})
opt.network = opt.modelsFolder .. paths.basename(opt.network, '.lua')
opt.save = paths.concat('./Results', opt.save)
opt.preProcDir = paths.concat(opt.preProcDir, opt.dataset .. '/')
-- If you choose to use an exponentially decaying learning rate, uncomment this line
--opt.LRDecay=torch.pow((2e-6/opt.LR),(1./500));
--
-- FIX: was os.execute('mk1ir -p ' ..) - 'mk1ir' is a typo for 'mkdir'
-- (the Cifar10 main uses the correct 'mkdir -p' here)
os.execute('mkdir -p ' .. 
opt.preProcDir) torch.setnumthreads(opt.threads) torch.setdefaulttensortype('torch.FloatTensor') if opt.augment then require 'image' end ---------------------------------------------------------------------- -- Model + Loss: local modelAll = require(opt.network) model=modelAll.model GLRvec=modelAll.lrs clipV=modelAll.clipV local loss = SqrtHingeEmbeddingCriterion(1) --nn.ClassNLLCriterion() local data = require 'Data' local classes = data.Classes ---------------------------------------------------------------------- -- This matrix records the current confusion across classes local confusion = optim.ConfusionMatrix(classes) local AllowVarBatch = not opt.constBatchSize ---------------------------------------------------------------------- -- Output files configuration os.execute('mkdir -p ' .. opt.save) cmd:log(opt.save .. '/Log.txt', opt) local netFilename = paths.concat(opt.save, 'Net') local logFilename = paths.concat(opt.save,'ErrorRate.log') local optStateFilename = paths.concat(opt.save,'optState') local Log = optim.Logger(logFilename) ---------------------------------------------------------------------- local TensorType = 'torch.FloatTensor' if opt.type =='cuda' then require 'cutorch' cutorch.setDevice(opt.devid) cutorch.setHeapTracking(true) model:cuda() GLRvec=GLRvec:cuda() clipV=clipV:cuda() loss = loss:cuda() TensorType = 'torch.CudaTensor' end if paths.filep(opt.load) then model = torch.load(opt.load) print('==>Loaded model from: ' .. opt.load) print(model) end ---Support for multiple GPUs - currently data parallel scheme if opt.nGPU > 1 then local net = model model = nn.DataParallelTable(1) for i = 1, opt.nGPU do cutorch.setDevice(i) model:add(net:clone():cuda(), i) -- Use the ith GPU end cutorch.setDevice(opt.devid) end -- Optimization configuration local Weights,Gradients = model:getParameters() ---------------------------------------------------------------------- print '==> Network' print(model) print('==>' .. Weights:nElement() .. 
' Parameters') print '==> Loss' print(loss) ------------------Optimization Configuration-------------------------- local optimState = { learningRate = opt.LR, momentum = opt.momentum, weightDecay = opt.weightDecay, learningRateDecay = opt.LRDecay, GLRvec=GLRvec, clipV=clipV } ---------------------------------------------------------------------- local function SampleImages(images,labels) if not opt.augment then return images,labels else local sampled_imgs = images:clone() for i=1,images:size(1) do local sz = math.random(9) - 1 local hflip = math.random(2)==1 local startx = math.random(sz) local starty = math.random(sz) local img = images[i]:narrow(2,starty,32-sz):narrow(3,startx,32-sz) if hflip then img = image.hflip(img) end img = image.scale(img,32,32) sampled_imgs[i]:copy(img) end return sampled_imgs,labels end end ------------------------------ local function Forward(Data, train) local MiniBatch = DataProvider.Container{ Name = 'GPU_Batch', MaxNumItems = opt.batchSize, Source = Data, ExtractFunction = SampleImages, TensorType = TensorType } local yt = MiniBatch.Labels local x = MiniBatch.Data local SizeData = Data:size() if not AllowVarBatch then SizeData = math.floor(SizeData/opt.batchSize)*opt.batchSize end local NumSamples = 0 local NumBatches = 0 local lossVal = 0 while NumSamples < SizeData do MiniBatch:getNextBatch() local y, currLoss NumSamples = NumSamples + x:size(1) NumBatches = NumBatches + 1 if opt.nGPU > 1 then model:syncParameters() end y = model:forward(x) one_hot_yt=torch.zeros(yt:size(1),10) one_hot_yt:scatter(2, yt:long():view(-1,1), 1) one_hot_yt=one_hot_yt:mul(2):float():add(-1):cuda() currLoss = loss:forward(y,one_hot_yt) if train then function feval() model:zeroGradParameters() local dE_dy = loss:backward(y, one_hot_yt) model:backward(x, dE_dy) return currLoss, Gradients end adaMax_binary_clip_shift(feval, Weights, optimState) local indLayer=0 for i, layer in ipairs(model.modules) do indLayer=indLayer+1; if layer.__typename == 
'cudnnBinarySpatialConvolution' then model.modules[indLayer].weight:copy(model.modules[indLayer].weight:clamp(-1,1)) elseif layer.__typename == 'BinaryLinear' then model.modules[indLayer].weight:copy(model.modules[indLayer].weight:clamp(-1,1)) end end end lossVal = currLoss + lossVal if type(y) == 'table' then --table results - always take first prediction y = y[1] end confusion:batchAdd(y,one_hot_yt) xlua.progress(NumSamples, SizeData) if math.fmod(NumBatches,100)==0 then collectgarbage() end end return(lossVal/math.ceil(SizeData/opt.batchSize)) end ------------------------------ local function Train(Data) model:training() return Forward(Data, true) end local function Test(Data) model:evaluate() return Forward(Data, false) end ------------------------------ local epoch = 1 print '\n==> Starting Training\n' while epoch ~= opt.epoch do data.TrainData:shuffleItems() print('Epoch ' .. epoch) --Train confusion:zero() local LossTrain = Train(data.TrainData) if epoch%10==0 then torch.save(netFilename, model) end confusion:updateValids() local ErrTrain = (1-confusion.totalValid) if #classes <= 10 then print(confusion) end print('Training Error = ' .. ErrTrain) print('Training Loss = ' .. LossTrain) --validation confusion:zero() local LossValid = Test(data.ValidData) confusion:updateValids() local ErrValid = (1-confusion.totalValid) if #classes <= 10 then print(confusion) end print('Valid Error = ' .. ErrValid) print('Valid Loss = ' .. LossValid) --Test confusion:zero() local LossTest = Test(data.TestData) confusion:updateValids() local ErrTest = (1-confusion.totalValid) if #classes <= 10 then print(confusion) end print('Test Error = ' .. ErrTest) print('Test Loss = ' .. 
LossTest) Log:add{['Training Error']= ErrTrain, ['Valid Error'] = ErrValid, ['Test Error'] = ErrTest} if opt.visualize == 1 then Log:style{['Training Error'] = '-',['Validation Error'] = '-', ['Test Error'] = '-'} Log:plot() end if epoch%20==0 then optimState.learningRate=optimState.learningRate*0.5 else optimState.learningRate=optimState.learningRate end print('-------------------LR-------------------') print(optimState.learningRate) epoch = epoch + 1 end ================================================ FILE: Models/BatchNormalizationShiftPow2.lua ================================================ --[[ This file implements Shift based Batch Normalization based a variant of the vanilla BN as described in the paper: "Binarized Neural Networks: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Matthieu Courbariaux, Itay Hubara, Daniel Soudry, Ran El-Yaniv, Yoshua Bengio' The code is based on nn library --]] local BatchNormalizationShiftPow2,parent = torch.class('BatchNormalizationShiftPow2', 'nn.Module') function BatchNormalizationShiftPow2:__init(nOutput, runningVal, eps, momentum, affine) parent.__init(self) assert(nOutput and type(nOutput) == 'number', 'Missing argument #1: dimensionality of input. ') assert(nOutput ~= 0, 'To set affine=false call BatchNormalization' .. 
'(nOutput, eps, momentum, false) ')
    if affine ~= nil then
        assert(type(affine) == 'boolean', 'affine has to be true/false')
        self.affine = affine
    else
        self.affine = true
    end
    self.eps = eps or 1e-5
    self.train = true
    self.momentum = momentum or 0.125
    -- BUGFIX: 'runningVal or true' always evaluated to true because 'or'
    -- treats an explicit false the same as nil. Use a nil check so a caller
    -- can actually disable running statistics with runningVal=false.
    if runningVal == nil then
        self.runningVal = true
    else
        self.runningVal = runningVal
    end
    self.running_mean = torch.zeros(nOutput)
    -- running_std holds the running INVERSE std (see updateOutput: pow(-1)).
    self.running_std = torch.ones(nOutput)
    self.running_std_ap2 = torch.ones(nOutput)
    if self.affine then
        self.weight = torch.Tensor(nOutput)
        self.weightSign = torch.Tensor(nOutput)
        self.weight_ap2 = torch.Tensor(nOutput)
        self.bias = torch.Tensor(nOutput)
        self.gradWeight = torch.Tensor(nOutput)
        self.gradBias = torch.Tensor(nOutput)
        self:reset()
    end
end

-- Reinitialize gamma=1, beta=0 and the running statistics.
function BatchNormalizationShiftPow2:reset()
    self.weight:fill(1)
    self.bias:zero()
    self.running_mean:zero()
    self.running_std:fill(1)
end

-- Forward pass. In eval mode (train==false and runningVal) normalizes with the
-- running mean and the power-of-two-rounded running inverse std; in training
-- mode computes batch statistics, rounds them to powers of two (AP2), and
-- updates the running estimates. 2D mini-batch input only.
function BatchNormalizationShiftPow2:updateOutput(input)
    assert(input:dim() == 2, 'only mini-batch supported (2D tensor), got ' .. input:dim() .. 'D tensor instead')
    local nBatch = input:size(1)
    -- buffers that are reused
    self.buffer = self.buffer or input.new()
    self.buffer2 = self.buffer2 or input.new()
    self.centered = self.centered or input.new()
    self.centered:resizeAs(input)
    self.centerSign = self.centerSign or input.new()
    self.centerSign:resizeAs(input)
    self.centeredOrg = self.centeredOrg or input.new()
    self.centeredOrg:resizeAs(input)
    self.std = self.std or input.new()
    self.normalized = self.normalized or input.new()
    self.normalized:resizeAs(input)
    self.normalizedSign = self.normalizedSign or input.new()
    self.normalizedSign:resizeAs(input)
    self.output:resizeAs(input)
    self.gradInput:resizeAs(input)
    if self.train == false and self.runningVal == true then
        -- Inference path: subtract running mean, multiply by AP2(running inverse std).
        self.output:copy(input)
        self.buffer:repeatTensor(self.running_mean, nBatch, 1)
        self.output:add(-1, self.buffer)
        self.running_std_ap2:copy(torch.pow(2,torch.round(torch.log(self.running_std):div(math.log(2)))))
        self.buffer:repeatTensor(self.running_std_ap2, nBatch, 1)
        self.output:cmul(self.buffer)
    else -- training mode
        -- calculate mean over mini-batch
        self.buffer:mean(input, 1) -- E(x) = expectation of x.
        self.running_mean:mul(1 - self.momentum):add(self.momentum, self.buffer) -- add to running mean
        self.buffer:repeatTensor(self.buffer, nBatch, 1)
        -- subtract mean
        self.centered:add(input, -1, self.buffer) -- x - E(x)
        self.centeredOrg:copy(self.centered)
        -- AP2 approximation of the centered input (sign * nearest power of two).
        self.centerSign:copy(self.centered)
        self.centerSign:sign()
        self.centered:copy(torch.pow(2,torch.round(torch.log(self.centered:abs()):div(math.log(2))))):cmul(self.centerSign)
        -- calculate standard deviation over mini-batch
        self.buffer:copy(self.centered):cmul(self.centeredOrg) -- [x - E(x)]^2
        -- 1 / E([x - E(x)]^2)
        self.std:mean(self.buffer, 1):add(self.eps):sqrt():pow(-1)
        self.running_std:mul(1 - self.momentum):add(self.momentum, self.std) -- add to running stdv
        -- Round the inverse std itself to a power of two.
        self.std:copy(torch.pow(2,torch.round(torch.log(self.std):div(math.log(2)))))
        self.buffer:repeatTensor(self.std, nBatch, 1)
        -- divide standard-deviation + eps
        self.output:cmul(self.centeredOrg, self.buffer)
        -- Keep an AP2 copy of the normalized activations for accGradParameters.
        self.normalized:copy(self.output)
        self.normalizedSign:copy(self.normalized)
        self.normalizedSign:sign()
        self.normalized:copy(torch.pow(2,torch.round(torch.log(self.normalized:abs()):div(math.log(2)))):cmul(self.normalizedSign))
        --self.normalized[self.normalized:lt(0)]=1; -- Can improve results
    end
    if self.affine then
        -- multiply with gamma and add beta
        -- Gamma is applied through its AP2 approximation as well.
        self.weightSign:copy(self.weight)
        self.weightSign:sign()
        self.weight_ap2:copy(torch.pow(2,torch.round(torch.log(self.weight:clone():abs()):div(math.log(2))))):cmul(self.weightSign)
        --self.weight:fill(1) --Almost similar results
        self.buffer:repeatTensor(self.weight_ap2, nBatch, 1)
        self.output:cmul(self.buffer)
        self.buffer:repeatTensor(self.bias, nBatch, 1)
        self.output:add(self.buffer)
    end
    return self.output
end

-- Backward pass w.r.t. the input; reuses the buffers captured in updateOutput,
-- so it must run after a training-mode forward on the same batch.
function BatchNormalizationShiftPow2:updateGradInput(input, gradOutput)
    assert(input:dim() == 2, 'only mini-batch supported')
    assert(gradOutput:dim() == 2, 'only mini-batch supported')
    assert(self.train == true, 'should be in training mode when self.train is true')
    local nBatch = input:size(1)
    self.gradInput:cmul(self.centered, gradOutput)
    self.buffer:mean(self.gradInput, 1)
    self.gradInput:repeatTensor(self.buffer, nBatch, 1)
    self.gradInput:cmul(self.centered):mul(-1)
    self.buffer:repeatTensor(self.std, nBatch, 1)
    self.gradInput:cmul(self.buffer):cmul(self.buffer)
    self.buffer:mean(gradOutput, 1)
    self.buffer:repeatTensor(self.buffer, nBatch, 1)
    self.gradInput:add(gradOutput):add(-1, self.buffer)
    self.buffer:repeatTensor(self.std, nBatch, 1)
    self.gradInput:cmul(self.buffer)
    if self.affine then
        self.buffer:repeatTensor(self.weight_ap2, nBatch, 1)
        self.gradInput:cmul(self.buffer)
    end
    return self.gradInput
end

-- Accumulate gamma/beta gradients (gamma grad uses the AP2 normalized output).
function BatchNormalizationShiftPow2:accGradParameters(input, gradOutput, scale)
    if self.affine then
        scale = scale or 1.0
        self.buffer2:resizeAs(self.normalized):copy(self.normalized)
        self.buffer2:cmul(gradOutput)
        self.buffer:sum(self.buffer2, 1) -- sum over mini-batch
        self.gradWeight:add(scale, self.buffer)
        self.buffer:sum(gradOutput, 1) -- sum over mini-batch
        self.gradBias:add(scale, self.buffer)
    end
end
================================================ FILE: Models/BinarizedNeurons.lua ================================================
local BinarizedNeurons,parent = torch.class('BinarizedNeurons', 'nn.Module')

-- Binarization activation: forward emits +/-1, backward is the straight-through
-- estimator (gradient passed unchanged). stcFlag enables stochastic binarization
-- during training.
function BinarizedNeurons:__init(stcFlag)
    parent.__init(self)
    self.stcFlag = stcFlag
    self.randmat=torch.Tensor();
    self.outputR=torch.Tensor();
end

function BinarizedNeurons:updateOutput(input)
    self.randmat:resizeAs(input);
    self.outputR:resizeAs(input);
    self.output:resizeAs(input);
    -- Map input from [-1,1] to [0,1]; used as the "probability of +1".
    self.outputR:copy(input):add(1):div(2)
    if self.train and self.stcFlag then
        -- Stochastic: +1 with probability outputR, else -1.
        local mask=self.outputR-self.randmat:rand(self.randmat:size())
        self.output=mask:sign()
    else
        -- Deterministic: sign(input).
        self.output:copy(self.outputR):add(-0.5):sign()
    end
    return self.output
end

-- Straight-through estimator: gradient is passed through unchanged.
function BinarizedNeurons:updateGradInput(input, gradOutput)
    self.gradInput:resizeAs(gradOutput)
    self.gradInput:copy(gradOutput) --:mul(0.5)
    return self.gradInput
end
================================================ FILE: Models/BinaryLinear.lua ================================================
--require 'randomkit'
local BinaryLinear, parent = torch.class('BinaryLinear', 'nn.Linear')

-- Fully-connected layer with binarized (+/-1) weights in the forward/backward
-- passes while keeping a real-valued copy (weightOrg) for the optimizer update.
-- stcWeights enables stochastic weight binarization during training.
function BinaryLinear:__init(inputSize, outputSize,stcWeights)
    -- Temporarily disable reset() so nn.Linear's constructor does not
    -- initialize weights before our buffers exist.
    local delayedReset = self.reset
    self.reset = function() end
    parent.__init(self, inputSize, outputSize)
    self.reset = delayedReset
    self.weight = torch.Tensor(outputSize, inputSize)
    self.weightB = torch.Tensor(outputSize, inputSize)   -- binarized weights
    self.weightOrg = torch.Tensor(outputSize, inputSize) -- real-valued backup
    self.maskStc = torch.Tensor(outputSize, inputSize)
    self.randmat = torch.Tensor(outputSize, inputSize)
    self.bias = torch.Tensor(outputSize)
    self.gradWeight = torch.Tensor(outputSize, inputSize)
    self.gradBias = torch.Tensor(outputSize)
    self.stcWeights=stcWeights
    self:reset()
    -- should nil for serialization, the reset will still work
    self.reset = nil
end

-- Weights start uniform in [-1,1] (not the usual Glorot range); the bias uses
-- the standard nn.Linear stdv.
function BinaryLinear:reset(stdv)
    if stdv then
        stdv = stdv * math.sqrt(3)
    else
        stdv = 1./math.sqrt(self.weight:size(2))
    end
    if nn.oldSeed then
        for i=1,self.weight:size(1) do
            self.weight:select(1, i):apply(function()
                return torch.uniform(-1, 1)
            end)
            self.bias[i] = torch.uniform(-stdv, stdv)
        end
    else
        self.weight:uniform(-1, 1)
        self.bias:uniform(-stdv, stdv)
    end
    return self
end

-- Compute the binarized weights into weightB and back up the real weights.
-- Deterministic path: round((w+1)/2)*2-1 == sign(w).
function BinaryLinear:binarized(trainFlag)
    self.weightOrg:copy(self.weight)
    -- NOTE(review): binaryFlag is unconditionally set true, so the
    -- 'if not self.binaryFlag' branch below is dead code.
    self.binaryFlag = true
    if not self.binaryFlag then
        self.weight:copy(self.weightOrg)
    else
        self.weightB:copy(self.weight):add(1):div(2):clamp(0,1)
        if not self.stcWeights or not trainFlag then
            self.weightB:round():mul(2):add(-1)
        else
            -- Stochastic path.
            -- NOTE(review): unlike BinarizedNeurons, no :sign() is applied here,
            -- so weightB holds real values in (-1,1) rather than +/-1 — confirm intent.
            self.maskStc=self.weightB-self.randmat:rand(self.randmat:size())
            self.weightB:copy(self.maskStc)
        end
    end
    return self.weightB
end

-- Forward with binarized weights, then restore the real-valued weights.
function BinaryLinear:updateOutput(input)
    self.weightB = self:binarized(self.train)
    self.weight:copy(self.weightB)
    parent.updateOutput(self,input)
    self.weight:copy(self.weightOrg);
    return self.output
end

-- Gradients w.r.t. the input are computed through the binarized weights.
function BinaryLinear:updateGradInput(input, gradOutput)
    if self.gradInput then
        self.weight:copy(self.weightB)
        parent.updateGradInput(self,input, gradOutput)
        self.weight:copy(self.weightOrg);
        return self.gradInput
    end
end

function BinaryLinear:accGradParameters(input, gradOutput, scale)
    parent.accGradParameters(self,input, gradOutput, scale)
end

-- we do not need to accumulate parameters when sharing
BinaryLinear.sharedAccUpdateGradParameters = BinaryLinear.accUpdateGradParameters

function BinaryLinear:__tostring__()
    return torch.type(self) .. string.format('(%d -> %d)', self.weight:size(2), self.weight:size(1))
end
================================================ FILE: Models/BinaryNet_Cifar10_Model.lua ================================================
--[[This code specify the model for CIFAR 10 dataset. This model uses the Shift based batch-normalization algorithm. In this file we also secify the Glorot learning parameter and the which of the learnable parameter we clip ]]
require 'nn'
require './BinaryLinear.lua'
require './BinarizedNeurons'
-- Pick CUDA or CPU implementations of the binary convolution and pooling.
local SpatialConvolution
local SpatialMaxPooling
if opt.type =='cuda' then
    require 'cunn'
    require 'cudnn'
    require './cudnnBinarySpatialConvolution.lua'
    SpatialConvolution = cudnnBinarySpatialConvolution
    SpatialMaxPooling = cudnn.SpatialMaxPooling
else
    require './BinarySpatialConvolution.lua'
    SpatialConvolution = BinarySpatialConvolution
    SpatialMaxPooling = nn.SpatialMaxPooling
end
-- Shift-based BN (paper variant) vs. vanilla nn batch norm.
-- NOTE(review): BatchNormalization/SpatialBatchNormalization are assigned as
-- globals here (no 'local') — presumably shared with other model files; confirm.
if opt.SBN == true then
    require './BatchNormalizationShiftPow2.lua'
    require './SpatialBatchNormalizationShiftPow2.lua'
    BatchNormalization = BatchNormalizationShiftPow2
    SpatialBatchNormalization = SpatialBatchNormalizationShiftPow2
else
    BatchNormalization = nn.BatchNormalization
    SpatialBatchNormalization = nn.SpatialBatchNormalization
end
numHid=1024;
local model = nn.Sequential()
-- Convolution Layers
model:add(SpatialConvolution(3, 128, 3, 3 ,1,1,1,1,opt.stcWeights ))
model:add(SpatialBatchNormalization(128, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))
model:add(SpatialConvolution(128, 128, 3, 3,1,1,1,1,opt.stcWeights ))
model:add(SpatialMaxPooling(2, 2))
model:add(SpatialBatchNormalization(128, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))
model:add(SpatialConvolution(128, 256, 3, 3 ,1,1,1,1,opt.stcWeights ))
model:add(SpatialBatchNormalization(256, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))
model:add(SpatialConvolution(256, 256, 3, 3 ,1,1,1,1,opt.stcWeights ))
model:add(SpatialMaxPooling(2, 2))
model:add(SpatialBatchNormalization(256, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))
model:add(SpatialConvolution(256, 512, 3, 3,1,1,1,1,opt.stcWeights ))
model:add(SpatialBatchNormalization(512, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))
model:add(SpatialConvolution(512, 512, 3, 3,1,1,1,1,opt.stcWeights ))
model:add(SpatialMaxPooling(2, 2))
model:add(SpatialBatchNormalization(512, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))
-- Fully-connected classifier head (inputs are 512 maps of 4x4 after 3 poolings).
model:add(nn.View(512*4*4))
model:add(BinaryLinear(512*4*4,numHid,opt.stcWeights))
-- NOTE(review): this first FC batch norm omits opt.runningVal, unlike every
-- other BatchNormalization call in this file — confirm whether intentional.
model:add(BatchNormalization(numHid))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))
model:add(BinaryLinear(numHid,numHid,opt.stcWeights))
model:add(BatchNormalization(numHid, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))
model:add(BinaryLinear(numHid,10,opt.stcWeights))
model:add(nn.BatchNormalization(10))

-- Build per-parameter learning-rate (Glorot-based, rounded to a power of two)
-- and clip vectors over the flattened parameter tensor. The counter walks the
-- flat layout in the same order as model:getParameters().
local dE, param = model:getParameters()
local weight_size = dE:size(1)
local learningRates = torch.Tensor(weight_size):fill(0)
local clipvector = torch.Tensor(weight_size):fill(1)
local counter = 0
for i, layer in ipairs(model.modules) do
    if layer.__typename == 'BinaryLinear' then
        local weight_size = layer.weight:size(1)*layer.weight:size(2)
        local size_w=layer.weight:size();
        -- Glorot-style LR for fan_in+fan_out, rounded to nearest power of two.
        GLR=1/torch.sqrt(1.5/(size_w[1]+size_w[2]))
        GLR=(math.pow(2,torch.round(math.log(GLR)/(math.log(2)))))
        learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)
        clipvector[{{counter+1, counter+weight_size}}]:fill(1)
        counter = counter+weight_size
        local bias_size = layer.bias:size(1)
        learningRates[{{counter+1, counter+bias_size}}]:fill(GLR)
        clipvector[{{counter+1, counter+bias_size}}]:fill(0)
        counter = counter+bias_size
    elseif layer.__typename == 'BatchNormalizationShiftPow2' then
        local weight_size = layer.weight:size(1)
        local size_w=layer.weight:size();
        GLR=1/torch.sqrt(1.5/(size_w[1]))  -- computed but unused; LR stays 1 below
        learningRates[{{counter+1, counter+weight_size}}]:fill(1)
        clipvector[{{counter+1, counter+weight_size}}]:fill(0)
        counter = counter+weight_size
        local bias_size = layer.bias:size(1)
        learningRates[{{counter+1, counter+bias_size}}]:fill(1)
        clipvector[{{counter+1, counter+bias_size}}]:fill(0)
        counter = counter+bias_size
    elseif layer.__typename == 'nn.BatchNormalization' then
        local weight_size = layer.weight:size(1)
        learningRates[{{counter+1, counter+weight_size}}]:fill(1)
        clipvector[{{counter+1, counter+weight_size}}]:fill(0)
        counter = counter+weight_size
        local bias_size = layer.bias:size(1)
        learningRates[{{counter+1, counter+bias_size}}]:fill(1)
        clipvector[{{counter+1, counter+bias_size}}]:fill(0)
        counter = counter+bias_size
    elseif layer.__typename == 'SpatialBatchNormalizationShiftPow2' then
        local weight_size = layer.weight:size(1)
        local size_w=layer.weight:size();
        GLR=1/torch.sqrt(1.5/(size_w[1]))  -- computed but unused; LR stays 1 below
        learningRates[{{counter+1, counter+weight_size}}]:fill(1)
        clipvector[{{counter+1, counter+weight_size}}]:fill(0)
        counter = counter+weight_size
        local bias_size = layer.bias:size(1)
        learningRates[{{counter+1, counter+bias_size}}]:fill(1)
        clipvector[{{counter+1, counter+bias_size}}]:fill(0)
        counter = counter+bias_size
    elseif layer.__typename == 'nn.SpatialBatchNormalization' then
        local weight_size = layer.weight:size(1)
        local size_w=layer.weight:size();
        GLR=1/torch.sqrt(1.5/(size_w[1]))  -- computed but unused; LR stays 1 below
        learningRates[{{counter+1, counter+weight_size}}]:fill(1)
        clipvector[{{counter+1, counter+weight_size}}]:fill(0)
        counter = counter+weight_size
        local bias_size = layer.bias:size(1)
        learningRates[{{counter+1, counter+bias_size}}]:fill(1)
        clipvector[{{counter+1, counter+bias_size}}]:fill(0)
        counter = counter+bias_size
    elseif layer.__typename == 'cudnnBinarySpatialConvolution' then
        local size_w=layer.weight:size();
        local weight_size = size_w[1]*size_w[2]*size_w[3]*size_w[4]
        local filter_size=size_w[3]*size_w[4]
        GLR=1/torch.sqrt(1.5/(size_w[1]*filter_size+size_w[2]*filter_size))
        GLR=(math.pow(2,torch.round(math.log(GLR)/(math.log(2)))))
        learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)
        clipvector[{{counter+1, counter+weight_size}}]:fill(1)
        counter = counter+weight_size
        local bias_size = layer.bias:size(1)
        learningRates[{{counter+1, counter+bias_size}}]:fill(GLR)
        clipvector[{{counter+1, counter+bias_size}}]:fill(0)
        counter = counter+bias_size
    elseif layer.__typename == 'BinarySpatialConvolution' then
        local size_w=layer.weight:size();
        local weight_size = size_w[1]*size_w[2]*size_w[3]*size_w[4]
        local filter_size=size_w[3]*size_w[4]
        GLR=1/torch.sqrt(1.5/(size_w[1]*filter_size+size_w[2]*filter_size))
        GLR=(math.pow(2,torch.round(math.log(GLR)/(math.log(2)))))
        learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)
        clipvector[{{counter+1, counter+weight_size}}]:fill(1)
        counter = counter+weight_size
        local bias_size = layer.bias:size(1)
        learningRates[{{counter+1, counter+bias_size}}]:fill(GLR)
        clipvector[{{counter+1, counter+bias_size}}]:fill(0)
        counter = counter+bias_size
    end
end
-- clip all parameter
-- (Overrides the per-layer clip selection made above.)
clipvector:fill(1)
-- print(learningRates:eq(0):sum())
print(learningRates:ne(0):sum())
print(clipvector:ne(0):sum())
print(counter)
return {
    model = model,
    lrs = learningRates,
    clipV =clipvector,
}
================================================ FILE: Models/BinaryNet_MNIST_Model.lua ================================================
--[[This code specify the model
for MNIST dataset. This model uses the Shift based batch-normalization algorithm. In this file we also secify the Glorot learning parameter and which of the learnable parameter we clip ]]
require 'nn'
require './BinaryLinear.lua'
require './BinarizedNeurons'
if opt.type=='cuda' then
    require 'cunn'
    require 'cudnn'
end
-- Shift-based BN (paper variant) vs. vanilla nn batch norm.
local BatchNormalization;
if opt.SBN == true then
    require './BatchNormalizationShiftPow2'
    BatchNormalization = BatchNormalizationShiftPow2
else
    BatchNormalization = nn.BatchNormalization
end
-- MLP: 784 -> 2048 -> 2048 -> 2048 -> 10, binary weights + binarized activations.
local model = nn.Sequential()
local numHid =2048
-- Convolution Layers
model:add(nn.View(-1,784))
-- NOTE(review): this first layer omits opt.stcWeights, unlike the other
-- BinaryLinear calls below — confirm whether intentional.
model:add(BinaryLinear(784,numHid))
model:add(BatchNormalization(numHid, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))
model:add(BinaryLinear(numHid,numHid,opt.stcWeights))
model:add(BatchNormalization(numHid, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))
model:add(BinaryLinear(numHid,numHid,opt.stcWeights))
model:add(BatchNormalization(numHid, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))
model:add(BinaryLinear(numHid,10,opt.stcWeights))
model:add(nn.BatchNormalization(10))

-- Per-parameter Glorot learning-rate and clip vectors over the flat parameter
-- layout, walked in model:getParameters() order.
local dE, param = model:getParameters()
local weight_size = dE:size(1)
local learningRates = torch.Tensor(weight_size):fill(0)
local clipvector = torch.Tensor(weight_size):fill(0)
local counter = 0
for i, layer in ipairs(model.modules) do
    if layer.__typename == 'BinaryLinear' then
        local weight_size = layer.weight:size(1)*layer.weight:size(2)
        local size_w=layer.weight:size();
        -- NOTE(review): unlike the Cifar10/SVHN models, GLR is NOT rounded to a
        -- power of two here — confirm whether intentional.
        GLR=1/torch.sqrt(1.5/(size_w[1]+size_w[2]))
        learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)
        clipvector[{{counter+1, counter+weight_size}}]:fill(1)
        counter = counter+weight_size
        local bias_size = layer.bias:size(1)
        learningRates[{{counter+1, counter+bias_size}}]:fill(GLR)
        clipvector[{{counter+1, counter+bias_size}}]:fill(0)
        counter = counter+bias_size
    elseif layer.__typename == 'BatchNormalizationShiftPow2' then
        local weight_size = layer.weight:size(1)
        local size_w=layer.weight:size();
        GLR=1/torch.sqrt(1.5/(size_w[1]))
        learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)
        clipvector[{{counter+1, counter+weight_size}}]:fill(0)
        counter = counter+weight_size
        local bias_size = layer.bias:size(1)
        learningRates[{{counter+1, counter+bias_size}}]:fill(1)
        clipvector[{{counter+1, counter+bias_size}}]:fill(0)
        counter = counter+bias_size
    elseif layer.__typename == 'nn.BatchNormalization' then
        local weight_size = layer.weight:size(1)
        local size_w=layer.weight:size();
        GLR=1/torch.sqrt(1.5/(size_w[1]))
        learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)
        clipvector[{{counter+1, counter+weight_size}}]:fill(0)
        counter = counter+weight_size
        local bias_size = layer.bias:size(1)
        learningRates[{{counter+1, counter+bias_size}}]:fill(1)
        clipvector[{{counter+1, counter+bias_size}}]:fill(0)
        counter = counter+bias_size
    end
end
print(learningRates:eq(0):sum())
print(learningRates:ne(0):sum())
print(counter)
return {
    model = model,
    lrs = learningRates,
    clipV =clipvector,
}
================================================ FILE: Models/BinaryNet_SVHN_Model.lua ================================================
--[[This code specify the model for SVHN dataset. This model uses the Shift based batch-normalization algorithm.
In this file we also secify the Glorot learning parameter and which of the learnable parameter we clip ]]
require 'nn'
require './BinaryLinear.lua'
require './BinarizedNeurons'
-- Pick CUDA or CPU implementations of the binary convolution and pooling.
-- BUGFIX: pooling layers below used cudnn.SpatialMaxPooling unconditionally,
-- which breaks the CPU path; select the pooling module alongside the
-- convolution, exactly as BinaryNet_Cifar10_Model.lua does.
local SpatialConvolution
local SpatialMaxPooling
if opt.type =='cuda' then
    require 'cunn'
    require 'cudnn'
    require './cudnnBinarySpatialConvolution.lua'
    SpatialConvolution = cudnnBinarySpatialConvolution
    SpatialMaxPooling = cudnn.SpatialMaxPooling
else
    require './BinarySpatialConvolution.lua'
    SpatialConvolution = BinarySpatialConvolution
    SpatialMaxPooling = nn.SpatialMaxPooling
end
-- Shift-based BN (paper variant) vs. vanilla nn batch norm.
if opt.SBN == true then
    require './BatchNormalizationShiftPow2.lua'
    require './SpatialBatchNormalizationShiftPow2.lua'
    BatchNormalization = BatchNormalizationShiftPow2
    SpatialBatchNormalization = SpatialBatchNormalizationShiftPow2
else
    BatchNormalization = nn.BatchNormalization
    SpatialBatchNormalization = nn.SpatialBatchNormalization
end
numHid=1024;
local model = nn.Sequential()
-- Convolution Layers
model:add(SpatialConvolution(3, 64, 3, 3 ,1,1,1,1,opt.stcWeights ))
model:add(SpatialBatchNormalization(64, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))
model:add(SpatialConvolution(64, 64, 3, 3,1,1,1,1,opt.stcWeights ))
model:add(SpatialMaxPooling(2, 2))
model:add(SpatialBatchNormalization(64, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))
model:add(SpatialConvolution(64, 128, 3, 3 ,1,1,1,1,opt.stcWeights ))
model:add(SpatialBatchNormalization(128, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))
model:add(SpatialConvolution(128, 128, 3, 3 ,1,1,1,1,opt.stcWeights ))
model:add(SpatialMaxPooling(2, 2))
model:add(SpatialBatchNormalization(128, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))
model:add(SpatialConvolution(128, 256, 3, 3,1,1,1,1,opt.stcWeights ))
model:add(SpatialBatchNormalization(256, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))
model:add(SpatialConvolution(256, 256, 3, 3,1,1,1,1,opt.stcWeights ))
model:add(SpatialMaxPooling(2, 2))
model:add(SpatialBatchNormalization(256, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))
-- Fully-connected classifier head (256 maps of 4x4 after 3 poolings).
model:add(nn.View(256*4*4))
model:add(BinaryLinear(256*4*4,numHid,opt.stcWeights))
model:add(BatchNormalization(numHid, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))
model:add(BinaryLinear(numHid,numHid,opt.stcWeights))
model:add(BatchNormalization(numHid, opt.runningVal))
model:add(nn.HardTanh())
model:add(BinarizedNeurons(opt.stcNeurons))
model:add(BinaryLinear(numHid,10,opt.stcWeights))
model:add(nn.BatchNormalization(10))

-- Build per-parameter learning-rate (Glorot-based, rounded to a power of two)
-- and clip vectors over the flattened parameter tensor, walked in
-- model:getParameters() order.
local dE, param = model:getParameters()
local weight_size = dE:size(1)
local learningRates = torch.Tensor(weight_size):fill(0)
local clipvector = torch.Tensor(weight_size):fill(0)
local counter = 0
for i, layer in ipairs(model.modules) do
    if layer.__typename == 'BinaryLinear' then
        local weight_size = layer.weight:size(1)*layer.weight:size(2)
        local size_w=layer.weight:size();
        GLR=1/torch.sqrt(1.5/(size_w[1]+size_w[2]))
        GLR=(math.pow(2,torch.round(math.log(GLR)/(math.log(2)))))
        learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)
        clipvector[{{counter+1, counter+weight_size}}]:fill(1)
        counter = counter+weight_size
        local bias_size = layer.bias:size(1)
        learningRates[{{counter+1, counter+bias_size}}]:fill(GLR)
        clipvector[{{counter+1, counter+bias_size}}]:fill(0)
        counter = counter+bias_size
    elseif layer.__typename == 'BatchNormalizationShiftPow2' then
        local weight_size = layer.weight:size(1)
        local size_w=layer.weight:size();
        GLR=1/torch.sqrt(1.5/(size_w[1]))  -- computed but unused; LR stays 1 below
        learningRates[{{counter+1, counter+weight_size}}]:fill(1)
        clipvector[{{counter+1, counter+weight_size}}]:fill(0)
        counter = counter+weight_size
        local bias_size = layer.bias:size(1)
        learningRates[{{counter+1, counter+bias_size}}]:fill(1)
        clipvector[{{counter+1, counter+bias_size}}]:fill(0)
        counter = counter+bias_size
    elseif layer.__typename == 'nn.BatchNormalization' then
        local weight_size = layer.weight:size(1)
        local size_w=layer.weight:size();
        GLR=1/torch.sqrt(1.5/(size_w[1]))  -- computed but unused; LR stays 1 below
        learningRates[{{counter+1, counter+weight_size}}]:fill(1)
        clipvector[{{counter+1, counter+weight_size}}]:fill(0)
        counter = counter+weight_size
        local bias_size = layer.bias:size(1)
        learningRates[{{counter+1, counter+bias_size}}]:fill(1)
        clipvector[{{counter+1, counter+bias_size}}]:fill(0)
        counter = counter+bias_size
    -- BUGFIX: a second, byte-identical 'nn.SpatialBatchNormalization' elseif
    -- branch followed this one; it was unreachable dead code and was removed.
    elseif layer.__typename == 'nn.SpatialBatchNormalization' then
        local weight_size = layer.weight:size(1)
        local size_w=layer.weight:size();
        GLR=1/torch.sqrt(1.5/(size_w[1]))  -- computed but unused; LR stays 1 below
        learningRates[{{counter+1, counter+weight_size}}]:fill(1)
        clipvector[{{counter+1, counter+weight_size}}]:fill(0)
        counter = counter+weight_size
        local bias_size = layer.bias:size(1)
        learningRates[{{counter+1, counter+bias_size}}]:fill(1)
        clipvector[{{counter+1, counter+bias_size}}]:fill(0)
        counter = counter+bias_size
    elseif layer.__typename == 'SpatialBatchNormalizationShiftPow2' then
        local weight_size = layer.weight:size(1)
        local size_w=layer.weight:size();
        GLR=1/torch.sqrt(1.5/(size_w[1]))  -- computed but unused; LR stays 1 below
        learningRates[{{counter+1, counter+weight_size}}]:fill(1)
        clipvector[{{counter+1, counter+weight_size}}]:fill(0)
        counter = counter+weight_size
        local bias_size = layer.bias:size(1)
        learningRates[{{counter+1, counter+bias_size}}]:fill(1)
        clipvector[{{counter+1, counter+bias_size}}]:fill(0)
        counter = counter+bias_size
    elseif layer.__typename == 'cudnnBinarySpatialConvolution' then
        local size_w=layer.weight:size();
        local weight_size = size_w[1]*size_w[2]*size_w[3]*size_w[4]
        local filter_size=size_w[3]*size_w[4]
        GLR=1/torch.sqrt(1.5/(size_w[1]*filter_size+size_w[2]*filter_size))
        GLR=(math.pow(2,torch.round(math.log(GLR)/(math.log(2)))))
        learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)
        clipvector[{{counter+1, counter+weight_size}}]:fill(1)
        counter = counter+weight_size
        local bias_size = layer.bias:size(1)
        learningRates[{{counter+1, counter+bias_size}}]:fill(GLR)
        clipvector[{{counter+1, counter+bias_size}}]:fill(0)
        counter = counter+bias_size
    elseif layer.__typename == 'BinarySpatialConvolution' then
        local size_w=layer.weight:size();
        local weight_size = size_w[1]*size_w[2]*size_w[3]*size_w[4]
        local filter_size=size_w[3]*size_w[4]
        GLR=1/torch.sqrt(1.5/(size_w[1]*filter_size+size_w[2]*filter_size))
        GLR=(math.pow(2,torch.round(math.log(GLR)/(math.log(2)))))
        learningRates[{{counter+1, counter+weight_size}}]:fill(GLR)
        clipvector[{{counter+1, counter+weight_size}}]:fill(1)
        counter = counter+weight_size
        local bias_size = layer.bias:size(1)
        learningRates[{{counter+1, counter+bias_size}}]:fill(GLR)
        clipvector[{{counter+1, counter+bias_size}}]:fill(0)
        counter = counter+bias_size
    end
end
print(learningRates:eq(0):sum())
print(learningRates:ne(0):sum())
print(clipvector:ne(0):sum())
print(counter)
return {
    model = model,
    lrs = learningRates,
    clipV =clipvector,
}
================================================ FILE: Models/BinarySpatialConvolution.lua ================================================
local BinarySpatialConvolution, parent = torch.class('BinarySpatialConvolution', 'nn.SpatialConvolution')

-- CPU spatial convolution with binarized (+/-1) weights; keeps a real-valued
-- weight copy for optimizer updates (same scheme as BinaryLinear).
-- BUGFIX: every call site passes stcWeights as a 9th argument, but the
-- signature stopped at padH, so the flag was silently dropped and
-- 'self.stcWeights = stcWeights or false' read an undefined global (always
-- false). Accept stcWeights as a parameter; default stays false, so existing
-- 8-argument callers are unaffected.
function BinarySpatialConvolution:__init(nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH, stcWeights)
    local delayedReset = self.reset
    self.reset = function() end
    parent.__init(self, nInputPlane, nOutputPlane, kW, kH, dW, dH)
    self.reset = delayedReset
    self.padW = padW or 0
    self.padH = padH or 0
    self.stcWeights = stcWeights or false
    -- 'groups' is not a constructor parameter; an undefined global yields the
    -- default of a single group.
    self.groups = groups or 1
    assert(nInputPlane % self.groups ==
0, 'nInputPlane should be divisible by nGroups') assert(nOutputPlane % self.groups == 0, 'nOutputPlane should be divisible by nGroups') self.weight = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH) self.weightB = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH) self.weightOrg = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH) self.randmat = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH) self.maskStc = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH) self:reset() -- should nil for serialization, the reset will still work self.reset = nil self.iSize = torch.LongStorage(4):fill(0) end function BinarySpatialConvolution:reset(stdv) if stdv then stdv = stdv * math.sqrt(3) else stdv = 1/math.sqrt(self.kW*self.kH*self.nInputPlane) end if nn.oldSeed then self.weight:apply(function() return torch.uniform(-1, 1) end) if self.bias then self.bias:apply(function() return torch.uniform(-stdv, stdv) end) end else self.weight:uniform(-1, 1) if self.bias then self.bias:uniform(-stdv, stdv) end end end function BinarySpatialConvolution:binarized(trainFlag) self.weightOrg:copy(self.weight) self.binaryFlag = true if not self.binaryFlag then self.weight:copy(self.weightOrg) else self.weightB:copy(self.weight):add(1):div(2):clamp(0,1) if not self.stcWeights or not trainFlag then self.weightB:round():mul(2):add(-1) else self.maskStc=self.weightB-self.randmat:rand(self.randmat:size()) self.weightB:copy(self.maskStc) end end return self.weightB end local function backCompatibility(self) self.finput = self.finput or self.weight.new() self.fgradInput = self.fgradInput or self.weight.new() if self.padding then self.padW = self.padding self.padH = self.padding self.padding = nil else self.padW = self.padW or 0 self.padH = self.padH or 0 end if self.weight:dim() == 2 then self.weight = self.weight:view(self.nOutputPlane, self.nInputPlane, self.kH, self.kW) end if self.gradWeight and self.gradWeight:dim() == 2 then self.gradWeight = 
self.gradWeight:view(self.nOutputPlane, self.nInputPlane, self.kH, self.kW) end end local function makeContiguous(self, input, gradOutput) if not input:isContiguous() then self._input = self._input or input.new() self._input:resizeAs(input):copy(input) input = self._input end if gradOutput then if not gradOutput:isContiguous() then self._gradOutput = self._gradOutput or gradOutput.new() self._gradOutput:resizeAs(gradOutput):copy(gradOutput) gradOutput = self._gradOutput end end return input, gradOutput end -- function to re-view the weight layout in a way that would make the MM ops happy local function viewWeight(self) self.weight = self.weight:view(self.nOutputPlane, self.nInputPlane * self.kH * self.kW) if self.gradWeight and self.gradWeight:dim() > 0 then self.gradWeight = self.gradWeight:view(self.nOutputPlane, self.nInputPlane * self.kH * self.kW) end end local function unviewWeight(self) self.weight = self.weight:view(self.nOutputPlane, self.nInputPlane, self.kH, self.kW) if self.gradWeight and self.gradWeight:dim() > 0 then self.gradWeight = self.gradWeight:view(self.nOutputPlane, self.nInputPlane, self.kH, self.kW) end end function BinarySpatialConvolution:updateOutput(input) backCompatibility(self) viewWeight(self) input = makeContiguous(self, input) self.weightB = self:binarized(self.train) self.weight:copy(self.weightB) input.THNN.SpatialConvolutionMM_updateOutput( input:cdata(), self.output:cdata(), self.weight:cdata(), self.bias:cdata(), self.finput:cdata(), self.fgradInput:cdata(), self.kW, self.kH, self.dW, self.dH, self.padW, self.padH ) self.weight:copy(self.weightOrg) unviewWeight(self) return self.output end function BinarySpatialConvolution:updateGradInput(input, gradOutput) if self.gradInput then backCompatibility(self) viewWeight(self) input, gradOutput = makeContiguous(self, input, gradOutput) self.weight:copy(self.weightB) input.THNN.SpatialConvolutionMM_updateGradInput( input:cdata(), gradOutput:cdata(), self.gradInput:cdata(), 
self.weight:cdata(), -- self.bias:cdata(), -- removed from this commit https://github.com/torch/nn/commit/651103f3aabc2dd154d6bd95ad565d14009255e6 self.finput:cdata(), self.fgradInput:cdata(), self.kW, self.kH, self.dW, self.dH, self.padW, self.padH ) self.weight:copy(self.weightOrg) unviewWeight(self) return self.gradInput end end function BinarySpatialConvolution:accGradParameters(input, gradOutput, scale) scale = scale or 1 backCompatibility(self) input, gradOutput = makeContiguous(self, input, gradOutput) viewWeight(self) input.THNN.SpatialConvolutionMM_accGradParameters( input:cdata(), gradOutput:cdata(), self.gradWeight:cdata(), self.gradBias:cdata(), self.finput:cdata(), self.fgradInput:cdata(), self.kW, self.kH, self.dW, self.dH, self.padW, self.padH, scale ) unviewWeight(self) end function BinarySpatialConvolution:type(type,tensorCache) self.finput = self.finput and torch.Tensor() self.fgradInput = self.fgradInput and torch.Tensor() return parent.type(self,type,tensorCache) end function BinarySpatialConvolution:__tostring__() return parent.__tostring__(self) end function BinarySpatialConvolution:clearState() nn.utils.clear(self, 'finput', 'fgradInput', '_input', '_gradOutput') return parent.clearState(self) end ================================================ FILE: Models/SpatialBatchNormalizationShiftPow2.lua ================================================ --[[ This file implements Shift based Batch Normalization based a variant of the vanilla BN as described in the paper: "Binarized Neural Networks: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Matthieu Courbariaux, Itay Hubara, Daniel Soudry, Ran El-Yaniv, Yoshua Bengio' The code is based on nn library --]] local SpatialBatchNormalizationShiftPow2,parent = torch.class('SpatialBatchNormalizationShiftPow2', 'nn.Module') function SpatialBatchNormalizationShiftPow2:__init(nFeature, runningVal, eps, momentum) parent.__init(self) assert(nFeature and type(nFeature) == 
'number', 'Missing argument #1: Number of feature planes. ' .. 'Give 0 for no affine transform') self.eps = eps or 1e-5 self.train = true self.momentum = momentum or 0.125 self.runningVal = runningVal or true self.running_mean = torch.Tensor() self.running_std = torch.Tensor() self.running_std_ap2 = torch.Tensor() if nFeature > 0 then self.affine = true end if self.affine then self.weight = torch.Tensor(nFeature) self.weightSign = torch.Tensor(nFeature) self.weight_ap2 = torch.Tensor(nFeature) self.bias = torch.Tensor(nFeature) self.gradWeight = torch.Tensor(nFeature) self.gradBias = torch.Tensor(nFeature) self:reset() end end function SpatialBatchNormalizationShiftPow2:reset() self.weight:fill(1) self.bias:zero() end function SpatialBatchNormalizationShiftPow2:updateOutput(input) assert(input:dim() == 4, 'only mini-batch supported (4D tensor), got ' .. input:dim() .. 'D tensor instead') local nBatch = input:size(1) local nFeature = input:size(2) local iH = input:size(3) local iW = input:size(4) -- buffers that are reused self.buffer = self.buffer or input.new() self.buffer2 = self.buffer2 or input.new() self.centered = self.centered or input.new() self.centered:resizeAs(input) self.centeredOrg = self.centeredOrg or input.new() self.centeredOrg:resizeAs(input) self.centeredSign = self.centeredSign or input.new() self.centeredSign:resizeAs(input) self.std = self.std or input.new() self.normalized = self.normalized or input.new() self.normalized:resizeAs(input) self.normalizedSign = self.normalizedSign or input.new() self.normalizedSign:resizeAs(input) self.output:resizeAs(input) self.gradInput:resizeAs(input) if self.train == false and self.runningVal == true then assert(self.running_mean:nDimension() ~= 0, 'Module never run on training data. 
First run on some training data before evaluating.') self.output:copy(input) self.buffer:repeatTensor(self.running_mean:view(1, nFeature, 1, 1), nBatch, 1, iH, iW) self.output:add(-1, self.buffer) self.running_std_ap2:copy(torch.pow(2,torch.round(torch.log(self.running_std):div(math.log(2))))) self.buffer:repeatTensor(self.running_std_ap2:view(1, nFeature, 1, 1), nBatch, 1, iH, iW) self.output:cmul(self.buffer) else -- training mode if self.running_mean:nDimension() == 0 then self.running_mean:resize(nFeature):zero() end if self.running_std:nDimension() == 0 then self.running_std:resize(nFeature):zero() self.running_std_ap2:resize(nFeature):zero() end -- calculate mean over mini-batch, over feature-maps local in_folded = input:view(nBatch, nFeature, iH * iW) self.buffer:mean(in_folded, 1) self.buffer2:mean(self.buffer, 3) self.running_mean:mul(1 - self.momentum):add(self.momentum, self.buffer2) -- add to running mean self.buffer:repeatTensor(self.buffer2:view(1, nFeature, 1, 1), nBatch, 1, iH, iW) -- subtract mean self.centered:add(input, -1, self.buffer) -- x - E(x) self.centeredOrg:copy(self.centered) self.centeredSign:copy(self.centered) self.centeredSign:sign() self.centered:copy(torch.pow(2,torch.round(torch.log(self.centered:abs()):div(math.log(2))))):cmul(self.centeredSign) -- calculate standard deviation over mini-batch self.buffer:copy(self.centered):cmul(self.centeredOrg) --:abs() -- calculate standard deviation over mini-batch local buf_folded = self.buffer:view(nBatch,nFeature,iH*iW) self.std:mean(self.buffer2:mean(buf_folded, 1), 3) self.std:add(self.eps):sqrt():pow(-1) -- 1 / E([x - E(x)]^2) self.running_std:mul(1 - self.momentum):add(self.momentum, self.std) -- add to running stdv self.std:copy(torch.pow(2,torch.round(torch.log(self.std):div(math.log(2))))) self.buffer:repeatTensor(self.std:view(1, nFeature, 1, 1), nBatch, 1, iH, iW) -- divide standard-deviation + eps self.output:cmul(self.centeredOrg, self.buffer) self.normalized:copy(self.output) 
self.normalizedSign:copy(self.normalized) self.normalizedSign:sign() self.normalized:copy(torch.pow(2,torch.round(torch.log(self.normalized:abs()):div(math.log(2)))):cmul(self.normalizedSign)) -- self.normalized[self.normalized:lt(0)]=1; -- Can improve results end if self.affine then -- multiply with gamma and add beta self.weight_ap2:copy(self.weight) self.weightSign:copy(self.weight):sign() self.weight_ap2:copy(torch.pow(2,torch.round(torch.log(self.weight:clone():abs()):div(math.log(2))))):cmul(self.weightSign) --self.weight:fill(1) --Almost similar results self.buffer:repeatTensor(self.weight_ap2:view(1, nFeature, 1, 1),nBatch, 1, iH, iW) self.output:cmul(self.buffer) self.buffer:repeatTensor(self.bias:view(1, nFeature, 1, 1), nBatch, 1, iH, iW) self.output:add(self.buffer) end return self.output end function SpatialBatchNormalizationShiftPow2:updateGradInput(input, gradOutput) assert(input:dim() == 4, 'only mini-batch supported') assert(gradOutput:dim() == 4, 'only mini-batch supported') assert(self.train == true, 'should be in training mode when self.train is true') local nBatch = input:size(1) local nFeature = input:size(2) local iH = input:size(3) local iW = input:size(4) self.gradInput:cmul(self.centered, gradOutput) local gi_folded = self.gradInput:view(nBatch, nFeature, iH * iW) self.buffer2:mean(self.buffer:mean(gi_folded, 1), 3) self.gradInput:repeatTensor(self.buffer2:view(1, nFeature, 1, 1), nBatch, 1, iH, iW) self.gradInput:cmul(self.centered):mul(-1) self.buffer:repeatTensor(self.std:view(1, nFeature, 1, 1), nBatch, 1, iH, iW) self.gradInput:cmul(self.buffer):cmul(self.buffer) self.buffer:mean(gradOutput:view(nBatch, nFeature, iH*iW), 1) self.buffer2:mean(self.buffer, 3) self.buffer:repeatTensor(self.buffer2:view(1, nFeature, 1, 1), nBatch, 1, iH, iW) self.gradInput:add(gradOutput):add(-1, self.buffer) self.buffer:repeatTensor(self.std:view(1, nFeature, 1, 1), nBatch, 1, iH, iW) self.gradInput:cmul(self.buffer) if self.affine then 
self.buffer:repeatTensor(self.weight_ap2:view(1, nFeature, 1, 1), nBatch, 1, iH, iW) self.gradInput:cmul(self.buffer) end return self.gradInput end function SpatialBatchNormalizationShiftPow2:accGradParameters(input, gradOutput, scale) if self.affine then scale = scale or 1.0 local nBatch = input:size(1) local nFeature = input:size(2) local iH = input:size(3) local iW = input:size(4) self.buffer2:resizeAs(self.normalized):copy(self.normalized) self.buffer2 = self.buffer2:cmul(gradOutput):view(nBatch, nFeature, iH*iW) self.buffer:sum(self.buffer2, 1) -- sum over mini-batch self.buffer2:sum(self.buffer, 3) -- sum over pixels self.gradWeight:add(scale, self.buffer2) self.buffer:sum(gradOutput:view(nBatch, nFeature, iH*iW), 1) self.buffer2:sum(self.buffer, 3) self.gradBias:add(scale, self.buffer2) -- sum over mini-batch end end ================================================ FILE: Models/cudnnBinarySpatialConvolution.lua ================================================ local cudnnBinarySpatialConvolution, parent = torch.class('cudnnBinarySpatialConvolution', 'cudnn.SpatialConvolution') local ffi = require 'ffi' local errcheck = cudnn.errcheck local autotunerCache = {} autotunerCache[1] = {} -- forward autotunerCache[2] = {} -- backwardFilter autotunerCache[3] = {} -- backwardData function cudnnBinarySpatialConvolution:__init(nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH,stcWeights, groups) local delayedReset = self.reset self.reset = function() end parent.__init(self, nInputPlane, nOutputPlane, kW, kH, dW, dH) self.reset = delayedReset self.padW = padW or 0 self.padH = padH or 0 self.groups = groups or 1 self.stcWeights = stcWeights or false assert(nInputPlane % self.groups == 0, 'nInputPlane should be divisible by nGroups') assert(nOutputPlane % self.groups == 0, 'nOutputPlane should be divisible by nGroups') self.weight = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kH, kW) self.weightB = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH) 
self.weightOrg = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH) self.randmat = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH) self.maskStc = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH) self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kH, kW) self:reset() -- should nil for serialization, the reset will still work self.reset = nil end function cudnnBinarySpatialConvolution:binarized(trainFlag) self.weightOrg:copy(self.weight) self.binaryFlag = true if not self.binaryFlag then self.weight:copy(self.weightOrg) else self.weightB:copy(self.weight):add(1):div(2):clamp(0,1) if not self.stcWeights or not trainFlag then self.weightB:round():mul(2):add(-1) --print(self.weightB) else self.maskStc=self.weightB-self.randmat:rand(self.randmat:size()) self.weightB:copy(self.maskStc) end end return self.weightB end -- if you change the configuration of the module manually, call this function cudnnBinarySpatialConvolution:resetWeightDescriptors() assert(torch.typename(self.weight) == 'torch.CudaTensor', 'Only Cuda supported duh!') assert(torch.typename(self.bias) == 'torch.CudaTensor' or not self.bias, 'Only Cuda supported duh!') -- for compatibility self.groups = self.groups or 1 -- create filterDescriptor for weight self.weightDesc = ffi.new('struct cudnnFilterStruct*[1]') errcheck('cudnnCreateFilterDescriptor', self.weightDesc) local desc = torch.IntTensor({self.nOutputPlane/self.groups, self.nInputPlane/self.groups, self.kH, self.kW}) errcheck('cudnnSetFilterNdDescriptor', self.weightDesc[0], 'CUDNN_DATA_FLOAT', 'CUDNN_TENSOR_NCHW', 4, desc:data()); local function destroyWDesc(d) errcheck('cudnnDestroyFilterDescriptor', d[0]); end ffi.gc(self.weightDesc, destroyWDesc) -- create descriptor for bias if self.bias then self.biasDesc = cudnn.toDescriptor(self.bias:view(1, self.nOutputPlane,1,1)) end end function cudnnBinarySpatialConvolution:fastest(mode) if mode == nil then mode = true end self.fastest_mode = mode 
self.iSize = self.iSize or torch.LongStorage(4) self.iSize:fill(0) return self end function cudnnBinarySpatialConvolution:setMode(fmode, bdmode, bwmode) if fmode ~= nil then self.fmode = fmode end if bdmode ~= nil then self.bdmode = bdmode end if bwmode ~= nil then self.bwmode = bwmode end self.iSize = self.iSize or torch.LongStorage(4) self.iSize:fill(0) return self end function cudnnBinarySpatialConvolution:resetMode() self.fmode = nil self.bdmode = nil self.bwmode = nil return self end function cudnnBinarySpatialConvolution:noBias() self.bias = nil self.gradBias = nil return self end function cudnnBinarySpatialConvolution:createIODescriptors(input) parent.createIODescriptors(self,input) end local one = torch.FloatTensor({1}); local zero = torch.FloatTensor({0}); local function makeContiguous(self, input, gradOutput) if not input:isContiguous() then self._input = self._input or input.new() self._input:typeAs(input):resizeAs(input):copy(input) input = self._input end if gradOutput and not gradOutput:isContiguous() then self._gradOutput = self._gradOutput or gradOutput.new() self._gradOutput:typeAs(gradOutput):resizeAs(gradOutput):copy(gradOutput) gradOutput = self._gradOutput end return input, gradOutput end function cudnnBinarySpatialConvolution:updateOutput(input) self.weightOrg:copy(self.weight) self.weightB = self:binarized(self.train) self.weight:copy(self.weightB) parent.updateOutput(self,input) self.weight:copy(self.weightOrg) return self.output end function cudnnBinarySpatialConvolution:updateGradInput(input, gradOutput) if not self.gradInput then return end self.weight:copy(self.weightB) parent.updateGradInput(self, input, gradOutput:contiguous(), scale) self.weight:copy(self.weightOrg) return self.gradInput end function cudnnBinarySpatialConvolution:accGradParameters(input, gradOutput, scale) parent.accGradParameters(self, input, gradOutput:contiguous(), scale) end function cudnnBinarySpatialConvolution:clearDesc() self.weightDesc = nil self.biasDesc = 
nil self.convDesc = nil self.iDesc = nil self.oDesc = nil self.oDescForBias = nil self.algType = nil self.fwdAlgType = nil self.bwdDataAlgType = nil self.bwdFilterAlgType = nil self.extraBuffer = nil self.extraBufferSizeInBytes = nil self.scaleT = nil end function cudnnBinarySpatialConvolution:write(f) self:clearDesc() local var = {} for k,v in pairs(self) do var[k] = v end f:writeObject(var) end function cudnnBinarySpatialConvolution:clearState() self:clearDesc() return nn.Module.clearState(self) end ================================================ FILE: README.md ================================================ Deep Networks on classification tasks using Torch ================================================= This is a complete training example for BinaryNets using Binary-Backpropagation algorithm as explained in "Binarized Neural Networks: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1, Matthieu Courbariaux, Itay Hubara, Daniel Soudry, Ran El-Yaniv, Yoshua Bengio' on following datasets: Cifar10/100, SVHN, MNIST ## Data We use dp library to extract all the data please view installation section ## Dependencies * Torch (http://torch.ch) * "DataProvider.torch" (https://github.com/eladhoffer/DataProvider.torch) for DataProvider class. * "cudnn.torch" (https://github.com/soumith/cudnn.torch) for faster training. Can be avoided by changing "cudnn" to "nn" in models. 
* "dp" (https://github.com/nicholas-leonard/dp.git) for data extraction * "unsup" (https://github.com/koraykv/unsup.git) for data pre-processing To install all dependencies (assuming torch is installed) use: ```bash luarocks install https://raw.githubusercontent.com/eladhoffer/DataProvider.torch/master/dataprovider-scm-1.rockspec luarocks install cudnn luarocks install dp luarocks install unsup ``` ## Training Create pre-processing folder: ```lua cd BinaryNet mkdir PreProcData ``` Start training using: ```lua th Main_BinaryNet_Cifar10.lua -network BinaryNet_Cifar10_Model ``` or, ```lua th Main_BinaryNet_MNIST.lua -network BinaryNet_MNIST_Model ``` ## Run with Docker The Docker is built from `nvidia/cuda:8.0-cudnn5-devel` with Torch commit `0219027e6c4644a0ba5c5bf137c989a0a8c9e01b` - To build image, run: `docker build -t binarynet:torch-gpu-cuda-8.0 -f Dockerfile/binarynet-torch-gpu-cuda-8.0 .` or to pull docker image: `docker pull hychiang/binarynet:torch-gpu-cuda-8.0` - To launch image with gpu, run: `docker run -it --gpus all binarynet:torch-gpu-cuda-8.0` - To train BNN with Cifar10: `th Main_BinaryNet_Cifar10.lua -network BinaryNet_Cifar10_Model` ## Additional flags |Flag | Default Value |Description |:----------------|:--------------------:|:---------------------------------------------- |modelsFolder | ./Models/ | Models Folder |network | Model.lua | Model file - must return valid network. 
|LR               | 0.1                   | learning rate
|LRDecay          | 0                     | learning rate decay (in # samples)
|weightDecay      | 1e-4                  | L2 penalty on the weights
|momentum         | 0.9                   | momentum
|batchSize        | 128                   | batch size
|stcNeurons       | true                  | using stochastic binarization for the neurons or not
|stcWeights       | false                 | using stochastic binarization for the weights or not
|optimization     | adam                  | optimization method
|SBN              | true                  | use shift based batch-normalization or not
|runningVal       | true                  | use running mean and std or not
|epoch            | -1                    | number of epochs to train (-1 for unbounded)
|threads          | 8                     | number of threads
|type             | cuda                  | float or cuda
|devid            | 1                     | device ID (if using CUDA)
|load             | none                  | load existing net weights
|save             | time-identifier       | save directory
|dataset          | Cifar10               | Dataset - Cifar10, Cifar100, STL10, SVHN, MNIST
|dp_prepro        | false                 | preprocessing using dp lib
|whiten           | false                 | whiten data
|augment          | false                 | Augment training data
|preProcDir       | ./PreProcData/        | Data for pre-processing (means,Pinv,P)

================================================ FILE: SqrHingeEmbeddingCriterion.lua ================================================
--[[ This file implements the squared hinge loss criterion.
NOTE(review): the class is named SqrtHingeEmbeddingCriterion although the
file is SqrHingeEmbeddingCriterion.lua; the name is kept for callers. --]]
local SqrtHingeEmbeddingCriterion, parent = torch.class('SqrtHingeEmbeddingCriterion', 'nn.Criterion')

-- margin: hinge margin (default 1); loss is averaged over elements when
-- sizeAverage is true (the default).
function SqrtHingeEmbeddingCriterion:__init(margin)
   parent.__init(self)
   self.margin = margin or 1
   self.sizeAverage = true
end

-- loss = sum(max(0, margin - y*x)^2) [/ nElement if sizeAverage]
function SqrtHingeEmbeddingCriterion:updateOutput(input, y)
   self.buffer = self.buffer or input.new()
   if not torch.isTensor(y) then
      -- promote a scalar target to a 1-element tensor
      self.ty = self.ty or input.new():resize(1)
      self.ty[1] = y
      y = self.ty
   end
   self.buffer:resizeAs(input):copy(input)
   self.buffer:cmul(y):mul(-1):add(self.margin)
   self.buffer[torch.le(self.buffer, 0)] = 0 -- hinge: zero out satisfied margins
   self.output = self.buffer:clone():pow(2):sum()
   if (self.sizeAverage == nil or self.sizeAverage == true) then
      self.output = self.output / input:nElement()
   end
   return self.output
end

-- d/dx max(0, margin - y*x)^2 = -2*y*(margin - y*x) on the active set
function SqrtHingeEmbeddingCriterion:updateGradInput(input, y)
   if not torch.isTensor(y) then
      -- FIX: create self.ty if updateGradInput is called before updateOutput
      -- (previously dereferenced a nil field)
      self.ty = self.ty or input.new():resize(1)
      self.ty[1] = y
      y = self.ty
   end
   self.gradInput:resizeAs(input):copy(y):mul(-2):cmul(self.buffer)
   self.gradInput[torch.cmul(y, input):gt(self.margin)] = 0
   if (self.sizeAverage == nil or self.sizeAverage == true) then
      self.gradInput:mul(1 / input:nElement())
   end
   return self.gradInput
end

================================================ FILE: adaMax_binary_clip_shift.lua ================================================
--[[ An implementation of Shift-based AdaMax based on
http://arxiv.org/pdf/1412.6980.pdf as described in the paper:
"Binarized Neural Networks: Training Deep Neural Networks with Weights and
Activations Constrained to +1 or -1", Matthieu Courbariaux, Itay Hubara,
Daniel Soudry, Ran El-Yaniv, Yoshua Bengio.
Note that this function performs the weight clipping as well.

ARGS:
- 'opfunc' : a function that takes a single input (X), the point of
             evaluation, and returns f(X) and df/dX
- 'x'      : the initial point
- 'config' : a table with configuration parameters for the optimizer
- 'config.learningRate' : learning rate
- 'config.beta1'        : first moment coefficient
- 'config.beta2'        : second moment coefficient
- 'config.epsilon'      : for numerical stability
- 'config.GLRvec'       : per-parameter Glorot learning-rate scaling vector
- 'config.clipV'        : per-parameter mask; entries equal to 1 are clipped to [-1,1]
- 'state'  : a table describing the state of the optimizer; after each
             call the state is modified

RETURN:
- `x`     : the new x vector
- `f(x)`  : the function, evaluated before the update
]]
function adaMax_binary_clip_shift(opfunc, x, config, state)
   -- (0) get/update state
   local config = config or {}
   local state = state or config
   local lr = config.learningRate or 0.002
   local GLRvec = config.GLRvec or 1
   local clipV = config.clipV or 0
   local beta1 = config.beta1 or 0.9
   local beta2 = config.beta2 or 0.999
   local epsilon = config.epsilon or 2^-27

   -- (1) evaluate f(x) and df/dx
   local fx, dfdx = opfunc(x)

   -- Initialization
   state.t = state.t or 0
   -- Exponential moving average of gradient values
   state.m = state.m or x.new(dfdx:size()):zero()
   -- Exponential moving average of squared gradient values
   state.v = state.v or x.new(dfdx:size()):zero()
   -- A tmp tensor to hold the sqrt(v) + epsilon
   state.denom = state.denom or x.new(dfdx:size()):zero()

   state.t = state.t + 1
   -- Decay the first and second moment running average coefficient
   state.m:mul(beta1):add(1-beta1, dfdx)
   -- FIX: torch.abs(dfdx) allocates; the previous dfdx:abs() mutated the
   -- caller's gradient tensor in place.
   state.v:copy(torch.cmax(state.v:mul(beta2), torch.abs(dfdx)))

   local biasCorrection1 = 1 - beta1^state.t
   local stepSize = lr/biasCorrection1 --math.sqrt(biasCorrection2)/biasCorrection1
   -- round the step size to a power of two (shift-based update)
   stepSize = math.pow(2, torch.round(math.log(stepSize)/(math.log(2))))

   -- (2) update x
   local tmp = torch.zeros(x:size())
   if opt.type == 'cuda' then tmp = tmp:cuda() end
   -- round the second-moment estimate to powers of two as well
   state.v:copy(torch.pow(2, torch.round(torch.log(state.v):div(math.log(2)))))
   state.v:add(epsilon)
   tmp:addcdiv(1, state.m, state.v)
   -- Multiply by Glorot learning rate vector
   x:addcmul(-stepSize, tmp, GLRvec)
   -- Clip masked (binary-weight) entries to [-1,1]
   x[clipV:eq(1)] = x[clipV:eq(1)]:clamp(-1, 1)

   -- return x*, f(x) before optimization
   return x, {fx}
end

================================================ FILE: adam_binary_clip_b.lua ================================================
--[[ An implementation of Adam http://arxiv.org/pdf/1412.6980.pdf
Note that this function performs the weight clipping as well.

ARGS:
- 'opfunc' : a function that takes a single input (X), the point of
             evaluation, and returns f(X) and df/dX
- 'x'      : the initial point
- 'config' : a table with configuration parameters for the optimizer
- 'config.learningRate' : learning rate
- 'config.beta1'        : first moment coefficient
- 'config.beta2'        : second moment coefficient
- 'config.epsilon'      : for numerical stability
- 'config.GLRvec'       : per-parameter Glorot learning-rate scaling vector
- 'config.clipV'        : per-parameter mask; entries equal to 1 are clipped to [-1,1]
- 'state'  : a table describing the state of the optimizer; after each
             call the state is modified

RETURN:
- `x`     : the new x vector
- `f(x)`  : the function, evaluated before the update
]]
function adam_binary_clip_b(opfunc, x, config, state)
   -- (0) get/update state
   local config = config or {}
   local state = state or config
   local lr = config.learningRate or 0.001
   local GLRvec = config.GLRvec or 1
   -- FIX: clipV was previously read from a bare global inside the update
   -- step; take it from config like adaMax_binary_clip_shift does (the
   -- global is kept as a fallback for existing callers).
   local clipV = config.clipV or clipV
   local beta1 = config.beta1 or 0.9
   local beta2 = config.beta2 or 0.999
   local epsilon = config.epsilon or 1e-8

   -- (1) evaluate f(x) and df/dx
   local fx, dfdx = opfunc(x)
   --print(lr,dfdx:size())

   -- Initialization
   state.t = state.t or 0
   -- Exponential moving average of gradient values
   state.m = state.m or x.new(dfdx:size()):zero()
   -- Exponential moving average of squared gradient values
   state.v = state.v or x.new(dfdx:size()):zero()
   -- A tmp tensor to hold the sqrt(v) + epsilon
   state.denom = state.denom or x.new(dfdx:size()):zero()

   state.t = state.t + 1
   -- Decay the first and second moment running average coefficient
   state.m:mul(beta1):add(1-beta1, dfdx)
   state.v:mul(beta2):addcmul(1-beta2, dfdx, dfdx)
   state.denom:copy(state.v):sqrt():add(epsilon)

   local biasCorrection1 = 1 - beta1^state.t
   local biasCorrection2 = 1 - beta2^state.t
   local stepSize = lr * math.sqrt(biasCorrection2)/biasCorrection1

   -- (2) update x
   local tmp = torch.zeros(x:size())
   if opt.type == 'cuda' then tmp = tmp:cuda() end
   tmp:addcdiv(1, state.m, state.denom)
   -- Multiply by Glorot learning rate vector
   x:addcmul(-stepSize, tmp, GLRvec)
   -- Clip masked (binary-weight) entries to [-1,1]
   x[clipV:eq(1)] = x[clipV:eq(1)]:clamp(-1, 1)

   return x, {fx}
end