[
  {
    "path": ".gitmodules",
    "content": "[submodule \"matconvnet\"]\n\tpath = matconvnet\n\turl = https://github.com/vlfeat/matconvnet\n\tbranch = master\n"
  },
  {
    "path": "Datasets/cnn_hmdb51_of_setup_data.m",
    "content": "function imdb = cnn_hmdb51_of_setup_data(varargin)\n% CNN_UCF101_SETUP_DATA Initialize UCF101 - Action Recognition Data Set\n% http://crcv.ucf.edu/data/UCF101.php\n% this script requires UCF101 downloaded and frames extracted in frames\n% folder\n\n\nopts.dataDir = fullfile('data','HMDB51') ;\nopts.lite = false ;\n% opts = vl_argparse(opts, varargin) ;\n\n%% ------------------------------------------------------------------------\n%                                                  Load categories metadata\n% -------------------------------------------------------------------------\n% find images\nimagePath = fullfile(opts.dataDir, 'tvl1_flow', 'u', '*') ;\nimages = dir(imagePath) ;\n\nvideoNames = cell(1,numel(images)) ;\nframeNames = cell(1,numel(images)) ;\nnrFrames = zeros(1,numel(images)) ;\nfor i=1:numel(images)\n  \n  frames = dir(fullfile(opts.dataDir,'tvl1_flow','u',images(i).name,'frame*.jpg')) ;\n  framesc = cell(1,numel(frames)) ;\n  if ~isempty(numel(frames))\n    for j=1:numel(frames)\n      framesc{j} = frames(j).name ;\n    end\n    frameNames{i} = framesc ;\n    frameNames{i} = strcat(images(i).name,'/',framesc) ;\n    nrFrames(i) = numel(framesc) ;\n    videoNames{i} = images(i).name ; \n  end\nend\n\nvideoNames(nrFrames==0) = [] ;\nframeNames(nrFrames==0) = [] ;\n% nrFrames(nrFrames==0) = [] ;\n\n\nframeNamesuv = cell(1,numel(frameNames)) ;\nfor i=1:numel(frameNames)\n  nn = frameNames{i} ;\n  nn1 = strcat('u/',nn) ;\n  nn2 = strcat('v/',nn) ;\n  \n  frameNamesuv{i} = cell(1,2*numel(nn1)) ;\n  frameNamesuv{i}(1:2:end) = nn1 ;\n  frameNamesuv{i}(2:2:end) = nn2 ;\nend\n\n% find metadata\n% ncls = 51 ;\n\nmetaPath = fullfile(opts.dataDir, 'hmdb51_splits', '*.txt') ;\n\nsplits = dir(metaPath) ;\n\ncats = cell(1,numel(videoNames)) ;\nsets = zeros(3,numel(videoNames)) ;\ncatNames = cell(1,numel(splits)) ;\n\nfor i=1:numel(splits)\n  j = strfind(splits(i).name,'_test_') ;\n  splitno = str2double(splits(i).name(j+11)) ;\n  catNames{i} = 
splits(i).name(1:j-1) ;\n  t = importdata(fullfile(opts.dataDir, 'hmdb51_splits', splits(i).name)) ;\n  \n  vids = cell(1,numel(t.textdata)) ;\n  for k=1:numel(t.textdata)\n    vids{k} = t.textdata{k}(1:end-4) ;\n  end\n  \n  [ia,ib] = ismember(vids,videoNames) ;\n  assert(all(ia)) ;\n  sets(splitno,ib) = t.data' ;\n  cats(ib) = repmat(catNames(i),numel(ia),1) ;\nend\n\n[cu,~,labels] = unique(cats) ;\nsets(sets(:)==2) = 3 ;\n\nimdb.classes.name = cu ;\nimdb.images.name = videoNames ;\nimdb.images.names = frameNamesuv ;\nimdb.images.label = labels' ;\nimdb.images.sets = sets ;\nimdb.imageDir = fullfile(opts.dataDir, 'tvl1_flow') ;\n"
  },
  {
    "path": "Datasets/cnn_hmdb51_setup_data.m",
    "content": "function imdb = cnn_hmdb51_setup_data(varargin)\n% CNN_UCF101_SETUP_DATA Initialize UCF101 - Action Recognition Data Set\n% http://crcv.ucf.edu/data/UCF101.php\n% this script requires UCF101 downloaded and frames extracted in frames\n% folder\n\nopts.dataDir = fullfile('data','HMDB51') ;\nopts.lite = false ;\n% opts = vl_argparse(opts, varargin) ;\n\n%% ------------------------------------------------------------------------\n%                                                  Load categories metadata\n% -------------------------------------------------------------------------\n% find images\nimagePath = fullfile(opts.dataDir, 'frames', '*') ;\nimages = dir(imagePath) ;\n\nvideoNames = cell(1,numel(images)) ;\nframeNames = cell(1,numel(images)) ;\nnrFrames = zeros(1,numel(images)) ;\nfor i=1:numel(images)\n  \n  frames = dir(fullfile(opts.dataDir,'frames',images(i).name,'frame*.jpg')) ;\n  framesc = cell(1,numel(frames)) ;\n  if ~isempty(numel(frames))\n    for j=1:numel(frames)\n      framesc{j} = frames(j).name ;\n    end\n    frameNames{i} = strcat(images(i).name,'/',framesc) ;\n    nrFrames(i) = numel(framesc) ;\n    videoNames{i} = images(i).name ; \n  end\nend\n\nvideoNames(nrFrames==0) = [] ;\nframeNames(nrFrames==0) = [] ;\n% nrFrames(nrFrames==0) = [] ;\n\n\n% find metadata\n% ncls = 51 ;\n\n\nmetaPath = fullfile(opts.dataDir, 'hmdb51_splits', '*.txt') ;\n\nsplits = dir(metaPath) ;\n\n% splitFiles = cell(1,3*ncls) ;\ncats = cell(1,numel(videoNames)) ;\nsets = zeros(3,numel(videoNames)) ;\ncatNames = cell(1,numel(splits)) ;\n\nfor i=1:numel(splits)\n  j = strfind(splits(i).name,'_test_') ;\n  splitno = str2double(splits(i).name(j+11)) ;\n  catNames{i} = splits(i).name(1:j-1) ;\n  t = importdata(fullfile(opts.dataDir, 'hmdb51_splits', splits(i).name)) ;\n  \n  vids = cell(1,numel(t.textdata)) ;\n  for k=1:numel(t.textdata)\n    vids{k} = t.textdata{k}(1:end-4) ;\n  end\n  \n  [ia,ib] = ismember(vids,videoNames) ;\n  assert(all(ia)) ;\n  
sets(splitno,ib) = t.data' ;\n  cats(ib) = repmat(catNames(i),numel(ia),1) ;\nend\n\n[cu,~,labels] = unique(cats) ;\nsets(sets(:)==2) = 3 ;\n\nimdb.classes.name = cu ;\nimdb.images.name = videoNames ;\nimdb.images.names = frameNames ;\nimdb.images.label = labels' ;\nimdb.images.sets = sets ;\nimdb.imageDir = fullfile(opts.dataDir, 'frames') ;\n\n\n"
  },
  {
    "path": "Datasets/cnn_ucf101_of_setup_data.m",
    "content": "function imdb = cnn_ucf101_of_setup_data(varargin)\n% CNN_UCF101_SETUP_DATA Initialize UCF101 - Action Recognition Data Set\n% http://crcv.ucf.edu/data/UCF101.php\n% this script requires UCF101 downloaded and frames extracted in frames\n% folder\n\nopts.dataDir = fullfile('data','UCF101') ;\nopts.lite = false ;\nopts = vl_argparse(opts, varargin) ;\n\n%% ------------------------------------------------------------------------\n%                                                  Load categories metadata\n% -------------------------------------------------------------------------\n\n% find metadata\nmetaPath = fullfile(opts.dataDir, 'ucfTrainTestlist/classInd.txt') ;\n\nfprintf('using metadata %s\\n', metaPath) ;\ntmp = importdata(metaPath);\nnCls = numel(tmp);\n\nif nCls ~= 101\n  error('Wrong meta file %s',metaPath);\nend\n\ncats = cell(1,nCls);\nfor i=1:numel(tmp)\n  t = strsplit(tmp{i});\n  cats{i} = t{2};\nend\n\nimdb.classes.name = sort(cats) ;\nimdb.imageDir = fullfile(opts.dataDir, 'tvl1_flow', 'u') ;\n\n%% ------------------------------------------------------------------------\n%                                              load image names and labels\n% -------------------------------------------------------------------------\n\nfprintf('searching training images ...\\n') ;\nnames = {} ;\nname = {};\nlabels = {} ;\nfor d = dir(fullfile(imdb.imageDir, 'v_*'))'\n  [~,lab] = ismember(lower(d.name(3:end-8)), lower(cats)) ;\n  if lab==0\n    error('no class label found for %s',d.name);\n  end\n  ims = dir(fullfile(imdb.imageDir, d.name, '*.jpg')) ;\n  name{end+1} = d.name;\n  names{end+1} = strcat([d.name, filesep], {ims.name}) ;\n  labels{end+1} = lab ;\n  if mod(numel(names), 10) == 0, fprintf('.') ; end\n  if mod(numel(names), 500) == 0, fprintf('\\n') ; end\n  %fprintf('found %s with %d images\\n', d.name, numel(ims)) ;\nend\n% names = horzcat(names{:}) ;\n\nlabels = horzcat(labels{:}) ;\n% labels = [labels ; labels] ;\nlabels = labels(:)' 
;\n\nfor i=1:numel(names)\n  nn = names{i} ;\n  nn1 = strcat('u/',nn) ;\n  nn2 = strcat('v/',nn) ;\n  \n  names{i} = cell(1,2*numel(nn1)) ;\n  names{i}(1:2:end) = nn1 ;\n  names{i}(2:2:end) = nn2 ;\nend\n\nimdb.images.id = 1:numel(names) ;\nimdb.images.name = name ;\nimdb.images.names = names ;\nimdb.images.label = labels ;\nimdb.imageDir = fullfile(opts.dataDir, 'tvl1_flow') ;\n\n%% ------------------------------------------------------------------------\n%                                                 load train / test splits\n% -------------------------------------------------------------------------\n\nfprintf('labeling data...(this may take couple of minutes)') ;\nimdb.images.sets = zeros(3, numel(names)) ;\nsetNames = {'train','test'};\nsetVal = [1,3];\n\nfor s=1:numel(setNames)\n  for i=1:3\n    trainFl = fullfile(opts.dataDir, 'ucfTrainTestlist',sprintf('%slist%02d.txt',...\n      setNames{s},i)) ;\n    trainList = importdata(trainFl);\n    if isfield(trainList,'textdata')\n      trainList = trainList.textdata;\n    end\n    for j=1:numel(trainList)\n      tmp = strsplit(trainList{j},'/');\n      [~,lab] = ismember(lower(tmp{2}(1:end-4)), lower(name)) ;\n      if lab==0\n%         error('cannot find the video %s',tmp{2}(1:end-4));\n        warning('cannot find the video %s',tmp{2}(1:end-4));\n        continue ;\n      end\n%       if trainList.data(j) ~= labels(lab)\n%         error('Labels do not match for %s',tmp{2});\n%       end\n      imdb.images.sets(i,lab) = setVal(s);\n    end\n  end  \nend\nfprintf('\\n') ;\n%% ------------------------------------------------------------------------\n%                                                            Postprocessing\n% -------------------------------------------------------------------------\n\n% sort categories by WNID (to be compatible with other implementations)\n[imdb.classes.name,perm] = sort(imdb.classes.name) ;\nrelabel(perm) = 1:numel(imdb.classes.name) ;\nok = imdb.images.label >  0 
;\nimdb.images.label(ok) = relabel(imdb.images.label(ok)) ;\n\nif opts.lite\n  % pick a small number of images for the first 10 classes\n  % this cannot be done for test as we do not have test labels\n  clear keep ;\n  for i=1:10\n    sel = find(imdb.images.label == i) ;\n    train = sel(imdb.images.sets(1,sel) == 1) ;\n    test = sel(imdb.images.sets(1,sel) == 3) ;\n    keep{i} = [train test] ;\n  end\n  keep = keep{:};\n  imdb.images.id = imdb.images.id(keep) ;\n  imdb.images.name = imdb.images.name(keep) ;\n  imdb.images.names = imdb.images.names(keep) ;\n  imdb.images.sets = imdb.images.sets(1,keep) ;\n  imdb.images.label = imdb.images.label(keep) ;\nend\n"
  },
  {
    "path": "Datasets/cnn_ucf101_setup_data.m",
    "content": "function imdb = cnn_ucf101_setup_data(varargin)\n% CNN_UCF101_SETUP_DATA Initialize UCF101 - Action Recognition Data Set\n% http://crcv.ucf.edu/data/UCF101.php\n% this script requires UCF101 downloaded and frames extracted in frames\n% folder\n\nopts.dataDir = fullfile('data','UCF101') ;\nopts.lite = false ;\nopts = vl_argparse(opts, varargin) ;\n\n%% ------------------------------------------------------------------------\n%                                                  Load categories metadata\n% -------------------------------------------------------------------------\n\n% find metadata\nmetaPath = fullfile(opts.dataDir, 'ucfTrainTestlist/classInd.txt') ;\n\nfprintf('using metadata %s\\n', metaPath) ;\ntmp = importdata(metaPath);\nnCls = numel(tmp);\n\nif nCls ~= 101\n  error('Wrong meta file %s',metaPath);\nend\n\ncats = cell(1,nCls);\nfor i=1:numel(tmp)\n  t = strsplit(tmp{i});\n  cats{i} = t{2};\nend\n\nimdb.classes.name = cats ;\nimdb.imageDir = fullfile(opts.dataDir, 'frames') ;\n\n%% ------------------------------------------------------------------------\n%                                              load image names and labels\n% -------------------------------------------------------------------------\n\nfprintf('searching training images ...\\n') ;\nnames = {} ;\nname = {};\nlabels = {} ;\nfor d = dir(fullfile(imdb.imageDir, 'v_*'))'\n  [~,lab] = ismember(lower(d.name(3:end-8)), lower(cats)) ;\n  if lab==0\n    error('no class label found for %s',d.name);\n  end\n  ims = dir(fullfile(imdb.imageDir, d.name, '*.jpg')) ;\n  name{end+1} = d.name;\n  names{end+1} = strcat([d.name, filesep], {ims.name}) ;\n  labels{end+1} = lab ;\n  if mod(numel(names), 10) == 0, fprintf('.') ; end\n  if mod(numel(names), 500) == 0, fprintf('\\n') ; end\n  %fprintf('found %s with %d images\\n', d.name, numel(ims)) ;\nend\n% names = horzcat(names{:}) ;\nlabels = horzcat(labels{:}) ;\n\nimdb.images.id = 1:numel(names) ;\nimdb.images.name = name 
;\nimdb.images.names = names ;\nimdb.images.label = labels ;\n\n\n%% ------------------------------------------------------------------------\n%                                                 load train / test splits\n% -------------------------------------------------------------------------\n\nfprintf('labeling data...(this may take couple of minutes)') ;\nimdb.images.sets = zeros(3, numel(names)) ;\nsetNames = {'train','test'};\nsetVal = [1,3];\n\nfor s=1:numel(setNames)\n  for i=1:3\n    trainFl = fullfile(opts.dataDir, 'ucfTrainTestlist',sprintf('%slist%02d.txt',...\n      setNames{s},i)) ;\n    trainList = importdata(trainFl);\n    if isfield(trainList,'textdata')\n      trainList = trainList.textdata;\n    end\n    for j=1:numel(trainList)\n      tmp = strsplit(trainList{j},'/');\n      [~,lab] = ismember(lower(tmp{2}(1:end-4)), lower(name)) ;\n      if lab==0\n        error('cannot find the video %s',tmp{2});\n      end\n%       if trainList.data(j) ~= labels(lab)\n%         error('Labels do not match for %s',tmp{2});\n%       end\n      imdb.images.sets(i,lab) = setVal(s);\n    end\n  end  \nend\nfprintf('\\n') ;\n%% ------------------------------------------------------------------------\n%                                                            Postprocessing\n% -------------------------------------------------------------------------\n\n% sort categories by WNID (to be compatible with other implementations)\n[imdb.classes.name,perm] = sort(imdb.classes.name) ;\nrelabel(perm) = 1:numel(imdb.classes.name) ;\nok = imdb.images.label >  0 ;\nimdb.images.label(ok) = relabel(imdb.images.label(ok)) ;\n\nif opts.lite\n  % pick a small number of images for the first 10 classes\n  % this cannot be done for test as we do not have test labels\n  clear keep ;\n  for i=1:10\n    sel = find(imdb.images.label == i) ;\n    train = sel(imdb.images.sets(1,sel) == 1) ;\n    test = sel(imdb.images.sets(1,sel) == 3) ;\n    keep{i} = [train test] ;\n  end\n  keep = 
keep{:};\n  imdb.images.id = imdb.images.id(keep) ;\n  imdb.images.name = imdb.images.name(keep) ;\n  imdb.images.names = imdb.images.names(keep) ;\n  imdb.images.sets = imdb.images.sets(1,keep) ;\n  imdb.images.label = imdb.images.label(keep) ;\nend\n"
  },
  {
    "path": "Layers/AppRankPooling.m",
    "content": "classdef AppRankPooling < dagnn.ElementWise\n  % author: Hakan Bilen\n  % dagnn wrapper for approximate rank pooling\n  \n  properties\n    scale = 1 \n  end\n    \n  methods\n    function outputs = forward(obj, inputs, params)\n      outputs{1} = vl_nnarpooltemporal(inputs{1},inputs{2}) * obj.scale ;\n    end\n    \n    function [derInputs, derParams] = backward(obj, inputs, params, derOutputs)\n      derInputs = cell(1,2);\n      derInputs{1} = vl_nnarpooltemporal(inputs{1},inputs{2},derOutputs{1}) * obj.scale;\n      derParams = {} ;\n    end\n    \n    function outputSizes = getOutputSizes(obj, inputSizes)\n      % This is not correct, dim(4) depends on inputs{2}\n      outputSizes{1} = inputSizes{1} ;\n    end\n    \n    function obj = AppRankPooling(varargin)\n      obj.load(varargin) ;  \n    end  \n    \n  end\nend\n\n"
  },
  {
    "path": "Layers/BatchNormN.m",
    "content": "classdef BatchNormN < dagnn.ElementWise\n  properties\n    numChannels\n    epsilon = 1e-5\n    opts = {'NoCuDNN'} % ours seems slightly faster\n  end\n\n  properties (Transient)\n    moments\n  end\n\n  methods\n    function outputs = forward(obj, inputs, params)\n      if strcmp(obj.net.mode, 'test')\n        outputs{1} = vl_nnbnorm(inputs{1}, params{1}, params{2}, ...\n                                'moments', params{3}, ...\n                                'epsilon', obj.epsilon, ...\n                                obj.opts{:}) ;\n      else\n        [outputs{1},obj.moments] = ...\n            vl_nnbnorm(inputs{1}, params{1}, params{2}, ...\n                       'epsilon', obj.epsilon, ...\n                       obj.opts{:}) ;\n      end\n    end\n\n    function [derInputs, derParams] = backward(obj, inputs, params, derOutputs)\n      [derInputs{1}, derParams{1}, derParams{2}, derParams{3}] = ...\n        vl_nnbnorm(inputs{1}, params{1}, params{2}, derOutputs{1}, ...\n                   'epsilon', obj.epsilon, ...\n                   'moments', obj.moments, ...\n                   obj.opts{:}) ;\n      obj.moments = [] ;\n      % multiply the moments update by the number of images in the batch\n      % this is required to make the update additive for subbatches\n      % and will eventually be normalized away\n      % derParams{3} = derParams{3} * size(inputs{1},4) ;\n    end\n\n    % ---------------------------------------------------------------------\n    function obj = BatchNormN(varargin)\n      obj.load(varargin{:}) ;\n    end\n\n    function params = initParams(obj)\n      params{1} = ones(obj.numChannels,1,'single') ;\n      params{2} = zeros(obj.numChannels,1,'single') ;\n      params{3} = zeros(obj.numChannels,2,'single') ;\n    end\n\n    function attach(obj, net, index)\n      attach@dagnn.ElementWise(obj, net, index) ;\n      p = net.getParamIndex(net.layers(index).params{3}) ;\n      net.params(p).trainMethod = 'average' ;\n    
  net.params(p).learningRate = 0.1 ;\n    end\n  end\nend\n"
  },
  {
    "path": "Layers/ErrorMultiClass.m",
    "content": "classdef ErrorMultiClass < dagnn.Loss\n% author: Hakan Bilen\n% computes multi-class accuracy\n% inputs{1}->scores\n% inputs{2}->gt labels\n  properties\n    nImgPerClass = []\n    nCorPred = []\n    accuracy = []\n    resetLayer = false \n  end\n    \n  methods\n    function outputs = forward(obj, inputs, params)\n      \n      if numel(inputs)~=2\n        error('wrong number of inputs');\n      end\n      \n      nCls = size(inputs{1},3);\n      \n      if obj.resetLayer || isempty(obj.nImgPerClass)\n        obj.nImgPerClass = zeros(1,size(inputs{1},3));\n        obj.nCorPred = zeros(1,size(inputs{1},3));\n        obj.accuracy = zeros(1,size(inputs{1},3));\n        \n        if obj.resetLayer\n          obj.resetLayer = false ;\n          obj.average = 0 ;\n        end\n      end\n      \n      \n      [~,predictions] = max(gather(squeeze(inputs{1})),[],1);\n      \n      for c=1:nCls\n        obj.nImgPerClass(c) = obj.nImgPerClass(c) + sum(inputs{2}==c);\n        obj.nCorPred(c)     = obj.nCorPred(c) + sum(predictions==c & inputs{2}==c);\n      end\n      \n      ni = obj.nImgPerClass;\n      ni(ni==0) = 1;\n      \n      obj.accuracy = obj.nCorPred ./ ni;\n      obj.average = (1-mean(obj.accuracy));\n      outputs{1} =  obj.average;\n    end\n    \n    function [derInputs, derParams] = backward(obj, inputs, params, derOutputs)\n      derInputs = cell(1,2);\n      derParams = {} ;\n    end\n    \n    function reset(obj)\n      obj.resetLayer = true ;\n%       obj.nImgPerClass = [];\n%       obj.nCorPred = [];\n%       obj.accuracy = [];\n%       obj.average = 0;\n    end\n    \n    \n    function obj = ErrorMultiClass(varargin)\n      obj.load(varargin) ;\n      obj.loss = 'error_multi_class' ;\n    end\n  end\nend\n"
  },
  {
    "path": "Layers/L2Normalize.m",
    "content": "classdef L2Normalize < dagnn.ElementWise\n  % author: Hakan Bilen\n  % dagnn wrapper for l2 normalization\n  \n  properties\n    scale = 1;\n    clip = [-inf inf];\n    offset = 0;\n  end\n  \n  methods\n    function outputs = forward(obj, inputs, params)\n      outputs{1} = vl_nnl2norm(inputs{1},[obj.scale obj.clip obj.offset]);\n    end\n    \n    function [derInputs, derParams] = backward(obj, inputs, params, derOutputs)\n      derInputs{1} = vl_nnl2norm(inputs{1},[obj.scale obj.clip obj.offset],derOutputs{1});\n      derParams = {} ;\n    end\n    \n    function obj = L2Normalize(varargin)\n      obj.load(varargin) ;  \n    end  \n    \n  end\nend\n\n"
  },
  {
    "path": "Layers/LossNormalized.m",
    "content": "classdef LossNormalized < dagnn.Loss\n%   properties\n%     loss = 'softmaxlog'\n%     ignoreAverage = false\n%     opts = {}\n%   end\n%   properties (Transient)\n%     average = 0\n%     numAveraged = 0\n%   end\n\n  methods\n    function outputs = forward(obj, inputs, params)\n      outputs{1} = vl_nnloss(inputs{1}, inputs{2}, [], 'loss', obj.loss, obj.opts{:}) ;\n      obj.accumulateAverage(inputs, outputs);\n      if numel(size(inputs{1}))>3\n        bs = size(inputs{1},4) ;\n      else\n        bs = 1 ;\n      end\n      outputs{1} = outputs{1} / bs ;\n    end\n\n    function accumulateAverage(obj, inputs, outputs)\n      if obj.ignoreAverage, return; end;\n      n = obj.numAveraged ;\n      m = n + size(inputs{1}, 1) *  size(inputs{1}, 2) * size(inputs{1}, 4);\n      obj.average = bsxfun(@plus, n * obj.average, gather(outputs{1})) / m ;\n      obj.numAveraged = m ;\n    end\n\n    function [derInputs, derParams] = backward(obj, inputs, params, derOutputs)\n      if numel(size(inputs{1}))>3\n        bs = size(inputs{1},4) ;\n      else\n        bs = 1 ;\n      end\n      \n      derInputs{1} = vl_nnloss(inputs{1}, inputs{2}, derOutputs{1}, 'loss', obj.loss, obj.opts{:}) / bs;\n      derInputs{2} = [] ;\n      derParams = {} ;\n    end\n\n    function reset(obj)\n      obj.average = 0 ;\n      obj.numAveraged = 0 ;\n    end\n\n    function outputSizes = getOutputSizes(obj, inputSizes, paramSizes)\n      outputSizes{1} = [1 1 1 inputSizes{1}(4)] ;\n    end\n\n    function rfs = getReceptiveFields(obj)\n      % the receptive field depends on the dimension of the variables\n      % which is not known until the network is run\n      rfs(1,1).size = [NaN NaN] ;\n      rfs(1,1).stride = [NaN NaN] ;\n      rfs(1,1).offset = [NaN NaN] ;\n      rfs(2,1) = rfs(1,1) ;\n    end\n\n    function obj = LossNormalized(varargin)\n      obj.load(varargin) ;\n    end\n  end\nend\n"
  },
  {
    "path": "Layers/TemporalPooling.m",
    "content": "classdef TemporalPooling < dagnn.ElementWise\n  % author: Hakan Bilen\n  % dagnn wrapper for approximate rank pooling\n  \n  properties\n    method = 'max';\n  end\n \n  methods\n    function outputs = forward(obj, inputs, params)\n      outputs{1} = vl_nnpooltemporal(inputs{1},inputs{2},obj.method);\n    end\n    \n    function [derInputs, derParams] = backward(obj, inputs, params, derOutputs)\n      derInputs = cell(1,2);\n      derInputs{1} = vl_nnpooltemporal(inputs{1},inputs{2},obj.method,derOutputs{1});\n      derParams = {} ;\n    end\n    \n    function obj = TemporalPooling(varargin)\n      obj.load(varargin) ;  \n    end  \n    \n  end\nend\n\n"
  },
  {
    "path": "Layers/vl_nnarpooltemporal.m",
    "content": "function Y = vl_nnarpooltemporal(X,ids,dzdy)\n% author: Hakan Bilen\n% approximate rank pooling\n% ids indicates frame-video association (must be in range [1-N])\n\nsz = size(X);\nforward = logical(nargin<3);\n\nif numel(ids)~=size(X,4)\n  error('Error: ids dimension does not match with X!');\nend\n\nnVideos = max(ids);\n\nif forward\n  Y = zeros([sz(1:3),nVideos],'like',X);\nelse\n  Y = zeros(size(X),'like',X);\nend\n\nfor v=1:nVideos\n  % pool among frames\n  indv = find(ids==v);\n  if isempty(indv)\n    error('Error: No frames in video %d',v);\n  end\n  N = numel(indv);\n  % magic numbers\n  fw = zeros(1,N);\n  if N==1\n    fw = 1;\n  else\n    for i=1:N\n      fw(i) = sum((2*(i:N)-N-1) ./ (i:N));\n    end\n  end\n  \n  if forward\n    Y(:,:,:,v) =  sum(bsxfun(@times,X(:,:,:,indv),...\n      reshape(single(fw),[1 1 1 numel(indv)])),4);    \n  else\n    Y(:,:,:,indv) = (bsxfun(@times,repmat(dzdy(:,:,:,v),[1,1,1,numel(indv)]),...\n      reshape(fw,[1 1 1 numel(indv)]))) ;\n  end\nend\n%\n% if forward\n  %   fprintf(' fwd-arpool %.2f ',sqrt(sum(Y(:).^2)));\n  % else\n  %   fprintf(' back-arpool %f ',sqrt(sum(Y(:).^2)));\n% end\n\n"
  },
  {
    "path": "Layers/vl_nnl2norm.m",
    "content": "function y = vl_nnl2norm(x,param,dzdy)\n% author: Hakan Bilen\n% l2 normalize whole feature map\n\nsc = param(1);\nclip = param(2:3);\noffset = param(4);\n\nif nargin == 3\n  assert(all(size(x) == size(dzdy)));\nelse\n  dzdy = [];\nend\n\nx_sz = size(x);\nif ~all(x_sz([1 2]) == 1)\n  % Create an array of size #channels x #samples\n  x = reshape(x, prod(x_sz(1:3)), []);\nend\n\n\nx = x + offset;\n\nif isempty(dzdy)\n \n  y = (bsxfun(@times, x, sc./(sqrt(sum(x .* x)) + single(1e-12))));\n  % clip max values\n  if all(y(:)<clip(1) | y(:)>clip(2))\n    warning('Too small clipping interval');\n    fprintf('min %f max %f\\n',min(y(:)),max(y(:)));\n  end\n  \n  y(y(:)<clip(1)) = clip(1);\n  y(y(:)>clip(2)) = clip(2);\n  \n  \nelse\n  if ~all(x_sz([1 2]) == 1)\n    dzdy = reshape(dzdy, prod(x_sz(1:3)), []);\n  end\n  \n  len_ = 1./sqrt(sum(x.*x)+single(1e-12));\n  dzdy_ = bsxfun(@times,dzdy,len_.^3);\n  y = sc * (bsxfun(@times,dzdy,len_)-bsxfun(@times,x,sum(x.*dzdy_)));\nend\n\nif ~all(x_sz([1 2]) == 1)\n  y = reshape(y, x_sz);\nend\n% \n% if isempty(dzdy)\n%   fprintf(' fwd-l2 %.2f ',sqrt(sum(y(:).^2)));\n% else\n%   fprintf(' back-l2 %f dzdy %f ',sqrt(sum(y(:).^2)),sqrt(sum(dzdy(:).^2)));\n% end\n"
  },
  {
    "path": "Layers/vl_nnpooltemporal.m",
    "content": "function Y = vl_nnpooltemporal(X,ids,method,dzdy)\n% author: Hakan Bilen\n% temporal pooling along frames\n% ids indicates frame-video association\n% method 'max' or 'avg'\n\nsz = size(X);\nforward = logical(nargin<4);\nXp = permute(X,[4,1,2,3]);\n\nif numel(ids)~=size(X,4)\n  error('Error: ids dimension does not match with X!');\nend\n\nnVideos = max(ids);\n\nif forward\n  Yp = zeros([nVideos,sz(1:3)],'like',X);\n  for v=1:nVideos\n    % pool among frames\n    indv = find(ids==v);\n    Yp(v,:,:,:) = vl_nnpool(Xp(indv,:,:,:), [numel(indv),1], ...\n      'pad', 0, 'stride', [numel(indv),1], 'method', method) ;\n  end\nelse\n  dzdyp = permute(dzdy,[4,1,2,3]);\n  Yp = zeros(size(Xp),'like',Xp);\n  for v=1:nVideos\n    % pool among frames\n    indv = find(ids==v);\n    Yp(indv,:,:,:) = vl_nnpool(Xp(indv,:,:,:), [numel(indv),1], dzdyp(v,:,:,:), ...\n      'pad', 0, 'stride', [numel(indv),1], 'method', method) ;\n  end\n  \nend\n% permute back\nY = permute(Yp,[2,3,4,1]);\n\n% if forward\n%   fprintf(' fwd-ptemp %.2f ',sqrt(sum(Y(:).^2)));\n% else\n%   fprintf(' back-ptemp %.2f ',sqrt(sum(Y(:).^2)));\n% end\n"
  },
  {
    "path": "README.md",
    "content": "# Dynamic Image Networks for Action Recognition\n## Improved Results (see the extended version of CVPR paper)\n\n\nResNeXt-50        | HMDB51 (%) | UCF101 (%) |\n------------------|--------|--------|\nSI                |  53.5  |  87.6  |\nDI                |  57.3  |  86.6  |\nOF                |  55.8  |  84.9  |\nDOF               |  58.9  |  86.6  |\nSI+OF             |  67.5  |  93.9  |\nSI+DI             |  61.3  |  90.6  |\nOF+DOF            |  62.6  |  89.1  |\nSI+DI+OF+DOF      |  71.5  |  95.0  |\nSI+DI+OF+DOF+iDT  |  74.2  |  95.4  |\n\n* Results are in the standard average multi-class accuracy (%)\n* SI: RGB image\n* DI: dynamic RBG image\n* OF: optical flow \n* DOF: dynamic optical flow \n* iDT: improved trajectory features \n\n\n## Installation\n1. Clone the Dynamic Image Net repository:\n\n    ```Shell\n    git clone --recursive  https://github.com/hbilen/dynamic-image-nets\n    ```\n    \n2. Compile matconvnet toolbox: (see [http://www.vlfeat.org/matconvnet/install/](http://www.vlfeat.org/matconvnet/install/))\n\n3. Install additional matconvnet packages\n    \n  ```Shell\n    run matconvnet/matlab/vl_setupnn.m ;\n    vl_contrib install mcnExtraLayers ; vl_contrib setup mcnExtraLayers ;\n    vl_contrib install autonn ; vl_contrib setup autonn ;\n  ```\n\n4. Download your dataset : (e.g. UCF101 from [http://crcv.ucf.edu/data/UCF101.php](http://crcv.ucf.edu/data/UCF101.php))\n\n5. 
Convert videos to frames, resize them to 256x256 and store them in such a directory structure:\nAlternatively, you can download RGB and precomputed optical flow frames from [Christoph Feichtenhofer](http://ftp.tugraz.at/pub/feichtenhofer/tsfusion/data/) and copy RGB frames under \"UCF101/frames\" and optical flow frames under \"UCF101/tvl1_flow\".\n    \n    ```Shell\n    data/UCF101/ucfTrainTestlist/\n    ├── classInd.txt\n    ├── testlist01.txt\n    ├── testlist02.txt\n    ├── testlist03.txt\n    ├── trainlist01.txt\n    ├── trainlist02.txt\n    └── trainlist03.txt\n    data/UCF101/frames/\n    ├── v_ApplyEyeMakeup_g01_c01\n    │   ├── 00001.jpg\n    │   ├── 00002.jpg\n    │   ├── 00003.jpg\n    │   ├── 00004.jpg\n    │   ├── 00005.jpg\n    ```\n\n## Compute and Visualise Approximate Dynamic Images\n1. If you want to compute approximate dynamic images, get a list of ordered frames from a video and try\n  ```matlab\n  di = compute_approximate_dynamic_images(images) ;\n  ```\n\n2. If you want to visualise approximate dynamic images, get a list of ordered frames from a video and try\n  ```matlab\n  visualize_approximate_dynamic_images(images)\n  ```\n\n## Train a Dynamic Image Net\nYou can modify the options in `main_train.m` and train your model by running\n    ```matlab\n    main_train\n    ```\n    \nNote: If you want to train a model on a different dataset than UCF101 or HMDB51, you need to write a custom script `cnn_dataset_setup_data` to build your database (imdb).\n\n## Evaluation\n1. Download the CNN Models for the UCF101 dataset, that are used in the journal, from [here](http://groups.inf.ed.ac.uk/hbilen-data/data/resnext50_dicnn.tar).\n2. 
Choose the right model, split and input type (e.g.)\n    ```matlab\n    net = load('resnext50-rgb-arpool-split1.mat') ;\n    net = dagnn.DagNN.loadobj(net) ;\n    net.addLayer('errMC',ErrorMultiClass(),{'prediction','label'},'mcerr') ;\n    opts.network = net ;\n    opts.split = 1 ;\n    opts.train.gpus = 1 ;\n    opts.epochFactor = 0 ; \n    [net, info] = cnn_dicnn_rgb(opts)\n    ```\n\n## Citing Dynamic Image Networks\n\nIf you find the code useful, please cite:\n\n        @inproceedings{Bilen2016a,\n          author    = \"Bilen, H. and Fernando, B. and Gavves, E. and Vedaldi, A. and Gould, S.\",\n          title     = \"Dynamic Image Networks for Action Recognition\",\n          booktitle = \"CVPR\",\n          year      = \"2016\"\n        }\n        @journal{Bilen2017a,\n          author    = \"Bilen, H. and Fernando, B. and Gavves, E. and Vedaldi, A.\",\n          title     = \"Action Recognition with Dynamic Image Networks\",\n          journal   = \" IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)\",\n          year      = \"2017\"\n        }\n\n## License\nThe analysis work performed with the program(s) must be non-proprietary work. Licensee and its contract users must be or be affiliated with an academic facility. Licensee may additionally permit individuals who are students at such academic facility to access and use the program(s). Such students will be considered contract users of licensee. The program(s) may not be used for commercial competitive analysis (such as benchmarking) or for any commercial activity, including consulting.\n\n"
  },
  {
    "path": "dicnn/cnn_dicnn_of.m",
    "content": "function [net, info] = cnn_dicnn_of(varargin)\n%CNN_DICNN_OF Fine-tunes a pre-trained CNN with dynamic images on optical\n% (DOF in pami journal) flow frames on UCF101 dataset\n\n\nrun(fullfile(fileparts(mfilename('fullpath')), ...\n  '..', 'matconvnet', 'matlab', 'vl_setupnn.m')) ;\n\nrun(fullfile(fileparts(mfilename('fullpath')), ...\n  '..', 'matconvnet', 'contrib', 'mcnExtraLayers', 'setup_mcnExtraLayers.m')) ;\n\nrun(fullfile(fileparts(mfilename('fullpath')), ...\n  '..', 'matconvnet', 'contrib', 'autonn', 'setup_autonn.m')) ;\n\naddpath Layers Datasets\n\nopts.dataDir = fullfile('data','UCF101') ;\nopts.expDir  = fullfile('exp', 'UCF101') ;\nopts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat.mat') ;\n[opts, varargin] = vl_argparse(opts, varargin) ;\n\nopts.numFetchThreads = 8 ;\n\nopts.lite = false ;\nopts.imdbPath = fullfile(opts.dataDir, 'imdb-of.mat');\nopts.pool1Layer = 'conv0'; % before conv1\nopts.pool1Type = 'arpool'; % before conv1\nopts.pool2Layer = 'fc6'; % before conv1\nopts.DropOutRate = 0.85 ;\nopts.datasetFn = @cnn_ucf101_of_setup_data ;\nopts.networkFn = @cnn_init_resnext ;\nopts.network = [] ;\n\nopts.split = 1; % data split\nopts.reverseDyn = 0; % reverse video frames e.g.[N:-1:1]\nopts.numDynImgs = 10 ;\nopts.epochFactor = 5 ;\n\nopts.train = struct() ;\nopts.train.gpus = [];\nopts.train.batchSize = 128 ;\nopts.train.numSubBatches = 32 ;\nopts.train.solver = [] ;\nopts.train.prefetch = true ;\nopts.train.learningRate = 1e-2 ;\nopts.train.numEpochs = 30 ;\n% opts.train.savePreds = true ;\nopts.train.randomSeed = 0 ;\n\nopts = vl_argparse(opts, varargin) ;\nif ~isfield(opts.train, 'gpus'), opts.train.gpus = []; end;\n\n\n% -------------------------------------------------------------------------\n%                                                              Prepare data\n% -------------------------------------------------------------------------\n\nif exist(opts.imdbPath,'file')\n  imdb = load(opts.imdbPath) 
;\nelse\n  imdb = opts.datasetFn('dataDir', opts.dataDir, 'lite', opts.lite) ;\n  mkdir(opts.expDir) ;\n  save(opts.imdbPath, '-struct', 'imdb') ;\nend\n\n% UCF101 has 3 data splits\nif opts.split>3\n  error('split should be <=3');\nend\nimdb.images.set = imdb.images.sets(opts.split,:);\n\n% reverse frame order\nif opts.reverseDyn\n  for i=1:numel(imdb.images.names)\n    imdb.images.names{i} = imdb.images.names{i}(end:-1:1);\n  end\nend\n% -------------------------------------------------------------------------\n%                                                             Prepare model\n% -------------------------------------------------------------------------\nif isempty(opts.network)\n  net = load(opts.modelPath);\n  if isfield(net,'net')\n    net = net.net;\n  end\n  opts.nCls = max(imdb.images.label) ;\n  % net = dagnn.DagNN.loadobj(net) ;\n  net = opts.networkFn(net,opts) ;\n  \n  % two channels instead of 3 RGB\n  net.params(1).value = net.params(1).value(:,:,1:2,:) ;\n  \n  % Set the class names in the network\n  net.meta.classes.name = imdb.classes.name ;\n  net.meta.classes.description = imdb.classes.name ;\nelse\n  assert(isa(opts.network,'dagnn.DagNN')) ;\n  net = opts.network ;\nend\n\n% -------------------------------------------------------------------------\n%                                                                     Learn\n% -------------------------------------------------------------------------\nif opts.epochFactor>0\n  opts.train.train = repmat(find(imdb.images.set==1),[1 opts.epochFactor]) ;\nelse\n  opts.train.train = NaN ;\n  opts.train.numEpochs = 1 ;\nend\nopts.train.val = find(imdb.images.set==3) ;\n\n[net, info] = cnn_train_dicnn_dag(net, imdb, getBatchFn(opts, net.meta), ...\n                      'expDir', opts.expDir, ...\n                      opts.train) ;\n\n\n% -------------------------------------------------------------------------\n%                                                          Report accuracy\n% 
-------------------------------------------------------------------------\nerrlayer = net.getLayerIndex('errMC') ;\n\nif ~isnan(errlayer)\n  cats = imdb.classes.name ;\n  accs = net.layers(errlayer).block.accuracy ; \n  \n  if numel(cats)~=numel(accs)\n    error('wrong number of classes\\n') ;\n  end\n  \n  for i=1:numel(cats)\n    fprintf('%s acc %.1f\\n',cats{i},100*accs(i)) ;\n  end\n  fprintf('Mean accuracy %.1f\\n',100*mean(accs)) ;\nend\n% -------------------------------------------------------------------------\nfunction fn = getBatchFn(opts, meta)\n% -------------------------------------------------------------------------\nuseGpu = numel(opts.train.gpus) > 0 ;\n\nbopts.numThreads = opts.numFetchThreads ;\nbopts.imageSize = meta.normalization.imageSize ;\nif isfield(meta.normalization,'border')\n  bopts.border = meta.normalization.border ;  \nelse\n  bopts.border = meta.normalization.imageSize(1:2) ./ ...\n    meta.normalization.cropSize - meta.normalization.imageSize(1:2);\nend\n\nbopts.averageImage = 128 * ones([1 1 2],'single') ;\nbopts.numDynImgs = opts.numDynImgs ;\n\nfn = @(x,y) getDagNNBatch(bopts,useGpu,x,y) ;\n\n\n\n% -------------------------------------------------------------------------\nfunction inputs = getDagNNBatch(opts, useGpu, imdb, batch)\n% -------------------------------------------------------------------------\n\n% batch refers to videos (not for frames)\nif isempty(batch)\n  inputs = {'input', [], 'label', [], 'VideoId1', [], 'VideoId2', []};\n  return;\nend\n\nisVal = ~isempty(batch) && imdb.images.set(batch(1)) ~= 1 ;\n\nif ~isVal, transformation='multiScaleRegular'; else transformation='none';end\n\nnames = imdb.images.names(batch);\n\n\n% images = strcat([imdb.imageDir filesep], imdb.images.name(batch)) ;\n\nnamesM = {};\nnVids = numel(batch);\n\nVideoId1 = [];\nVideoId2 = [];\n\n% step-size\nstepSize = 6;\n\n% pool nFrames into a dynamic image\nnFrames = 10;\n% number of dynamic images to be max pooled later\nnDynImgs = 
opts.numDynImgs ;\nopts = rmfield(opts,'numDynImgs') ;\n\n\nc1 = 1;\nfor v=1:nVids\n  \n  name = names{v};\n  nFrms = numel(name)/2;\n\n  nSample = nFrames;\n  \n  if isVal\n    startF = 1 ;\n  else\n    startF = ceil(stepSize/2) ;\n  end\n  nr = numel(startF:stepSize:nFrms);\n  \n  % jitter by removing 50 % and limit a batch to nMaxs * nSamples images\n  if nr > 1 && (~isVal && nr>nDynImgs)\n    rat = min(nDynImgs,ceil(0.50*nr));\n    ri = randperm(nr);\n    ri = ri(1:rat);\n    r = zeros(1,nr);\n    r(ri) = 1;\n  else\n    r = ones(1,nr);\n  end\n  \n  c3 = 1;\n  c2 = 0;\n  \n  for f=startF:stepSize:nFrms\n    if r(c3)\n      idx = f:min(f+nSample-1,nFrms) ;\n      if numel(idx)<nFrames\n        idx = [idx idx(end) * ones(1,nFrames-numel(idx))];\n      end\n      idxu = 2*idx - 1;\n      idxv = 2*idx;\n      idxuv = zeros(1,2 * numel(idxu)) ;\n      idxuv(1:2:end) = idxu ;\n      idxuv(2:2:end) = idxv ;\n            \n      namesM{end+1} = name(idxuv);\n      VideoId1 = [VideoId1 c1 * ones(1,numel(idx))];\n      c1 = c1 + 1;\n      c2 = c2 + 1;\n    end\n    c3 = c3 + 1;\n  end\n  VideoId2 = [VideoId2 v * ones(1,c2) ] ;\nend\n\nimages = strcat([imdb.imageDir filesep], horzcat(namesM{:}) ) ;\n\nim = cnn_video_of_get_batch(images, VideoId1, opts, ...\n  'transformation', transformation, 'prefetch', nargout == 0) ;\n\nif nargout > 0\n  if useGpu\n    im = gpuArray(im) ;\n  end\n  inputs = {'input', im, 'label', imdb.images.label(batch), ...\n    'VideoId1', VideoId1, 'VideoId2', VideoId2};\n\nend\n"
  },
  {
    "path": "dicnn/cnn_dicnn_rgb.m",
    "content": "function [net, info] = cnn_dicnn_rgb(varargin)\n%CNN_DICNN_RGB Fine-tunes a pre-trained CNN with dynamic images on RGB frames\n% (DI in pami journal) on UCF101 dataset\n\n\nrun(fullfile(fileparts(mfilename('fullpath')), ...\n  '..', 'matconvnet', 'matlab', 'vl_setupnn.m')) ;\n\nrun(fullfile(fileparts(mfilename('fullpath')), ...\n  '..', 'matconvnet', 'contrib', 'mcnExtraLayers', 'setup_mcnExtraLayers.m')) ;\n\nrun(fullfile(fileparts(mfilename('fullpath')), ...\n  '..', 'matconvnet', 'contrib', 'autonn', 'setup_autonn.m')) ;\n\naddpath Layers Datasets\n\nopts.dataDir = fullfile('data','UCF101') ;\nopts.expDir  = fullfile('exp', 'UCF101') ;\nopts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat');\nopts.datasetFn = @cnn_ucf101_setup_data ;\nopts.networkFn = @cnn_init_resnext ;\nopts.network = [] ;\n\n[opts, varargin] = vl_argparse(opts, varargin) ;\n\nopts.numFetchThreads = 8 ;\n\nopts.lite = false ;\nopts.imdbPath = fullfile(opts.dataDir, 'imdb-rgb.mat');\nopts.pool1Layer = 'conv0'; % before conv1\nopts.pool1Type = 'arpool'; \nopts.pool2Layer = 'pool5'; \nopts.pool2Type = 'maxpool'; \nopts.DropOutRate = 0.5 ;\nopts.epochFactor = 5 ;\n\nopts.split = 1; % data split\nopts.reverseDyn = 0; % reverse video frames e.g.[N:-1:1]\nopts.train = struct() ;\nopts.train.gpus = [];\nopts.train.batchSize = 128 ;\nopts.train.numSubBatches = 16 ;\nopts.train.solver = [] ;\nopts.train.prefetch = true ;\nopts.train.numEpochs = 30 ;\nopts.train.randomSeed = 0 ;\n% resnet50\n% opts.train.learningRate = 1e-2 * [ones(1,2), 0.1*ones(1,1), 0.01*ones(1,1)];\n% caffe-ref\nopts.train.learningRate = 1e-3 * [ones(1,2), 0.1*ones(1,1), 0.01*ones(1,1)];\n\nopts = vl_argparse(opts, varargin) ;\nif ~isfield(opts.train, 'gpus'), opts.train.gpus = []; end\n% opts.train.numEpochs = numel(opts.train.learningRate);\n\n% -------------------------------------------------------------------------\n%                                                              Prepare data\n% 
-------------------------------------------------------------------------\n\nif exist(opts.imdbPath,'file')\n  imdb = load(opts.imdbPath) ;\nelse\n  imdb = opts.datasetFn('dataDir', opts.dataDir, 'lite', opts.lite) ;\n  mkdir(opts.expDir) ;\n  save(opts.imdbPath, '-struct', 'imdb') ;\nend\n\n% UCF101 has 3 data splits\nif opts.split>3\n  error('split should be <=3');\nend\nimdb.images.set = imdb.images.sets(opts.split,:);\n\n% reverse frame order\nif opts.reverseDyn\n  for i=1:numel(imdb.images.names)\n    imdb.images.names{i} = imdb.images.names{i}(end:-1:1);\n  end\nend\n\n% -------------------------------------------------------------------------\n%                                                             Prepare model\n% -------------------------------------------------------------------------\nif isempty(opts.network)\n  net = load(opts.modelPath);\n  if isfield(net,'net')\n    net = net.net;\n  end\n  opts.nCls = max(imdb.images.label) ;\n  net = opts.networkFn(net,opts);\n\n  if numel(net.meta.normalization.averageImage)>3\n    sz = size(net.meta.normalization.averageImage) ;\n    net.meta.normalization.averageImage = ...\n      mean(reshape(net.meta.normalization.averageImage,[sz(1)*sz(2) sz(3)]),1) ;\n  end\n\n  % Set the class names in the network\n  net.meta.classes.name = imdb.classes.name ;\n  net.meta.classes.description = imdb.classes.name ;\nelse\n  assert(isa(opts.network,'dagnn.DagNN')) ;\n  net = opts.network ;\nend\n% -------------------------------------------------------------------------\n%                                                                     Learn\n% -------------------------------------------------------------------------\nif opts.epochFactor>0\n  opts.train.train = repmat(find(imdb.images.set==1),[1 opts.epochFactor]) ;\nelse\n  opts.train.train = NaN ;\n  opts.train.numEpochs = 1 ;\nend\nopts.train.val = find(imdb.images.set==3) ;\n\n[net, info] = cnn_train_dicnn_dag(net, imdb, getBatchFn(opts, net.meta), ...\n  
'expDir', opts.expDir, ...\n  opts.train) ;\n\n% -------------------------------------------------------------------------\n%                                                          Report accuracy\n% -------------------------------------------------------------------------\nerrlayer = net.getLayerIndex('errMC') ;\n\nif ~isnan(errlayer)\n  cats = imdb.classes.name ;\n  accs = net.layers(errlayer).block.accuracy ; \n  \n  if numel(cats)~=numel(accs)\n    error('wrong number of classes\\n') ;\n  end\n  \n  for i=1:numel(cats)\n    fprintf('%s acc %.1f\\n',cats{i},100*accs(i)) ;\n  end\n  fprintf('Mean accuracy %.1f\\n',100*mean(accs)) ;\nend\n\n% -------------------------------------------------------------------------\nfunction fn = getBatchFn(opts, meta)\n% -------------------------------------------------------------------------\nuseGpu = numel(opts.train.gpus) > 0 ;\n\nbopts.numThreads = opts.numFetchThreads ;\nbopts.imageSize = meta.normalization.imageSize ;\nif isfield(meta.normalization,'border')\n  bopts.border = meta.normalization.border ;  \nelse\n  bopts.border = meta.normalization.imageSize(1:2) ./ ...\n    meta.normalization.cropSize - meta.normalization.imageSize(1:2);\n\nend\n\n% bopts.averageImage = []; \nbopts.averageImage = meta.normalization.averageImage ;\nbopts.interpolation = meta.normalization.interpolation ;\nbopts.keepAspect = meta.normalization.keepAspect ;\n% bopts.rgbVariance = meta.augmentation.rgbVariance ;\n% bopts.transformation = meta.augmentation.transformation ;\n\n\nfn = @(x,y) getDagNNBatch(bopts,useGpu,x,y) ;\n\n\n\n% -------------------------------------------------------------------------\nfunction inputs = getDagNNBatch(opts, useGpu, imdb, batch)\n% -------------------------------------------------------------------------\n\n% batch refers to videos (not for frames)\nif isempty(batch)\n  inputs = {'input', [], 'label', [], 'VideoId1', [], 'VideoId2', []};\n  return;\nend\n\nisVal = ~isempty(batch) && imdb.images.set(batch(1)) 
~= 1 ;\n\n% if ~isVal, transformation='stretch'; else transformation='none';end\nif ~isVal, transformation='multiScaleRegular'; else transformation='none';end\n\nnames = imdb.images.names(batch);\n\n\n% images = strcat([imdb.imageDir filesep], imdb.images.name(batch)) ;\n\nnamesM = {};\nnVids = numel(batch);\n\nVideoId1 = [];\nVideoId2 = [];\n\n% step-size\nstepSize = 6;\n\n% pool nFrames into a dynamic image\nnFrames = 10;\n% number of dynamic images to be max pooled later\nnDynImgs = 10;\n\n\nc1 = 1;\nfor v=1:nVids\n  \n  name = names{v};\n    \n  if isVal\n    startF = 1 ;\n  else\n    startF = ceil(stepSize/2) ;\n  end\n  \n  nFrms = numel(name);\n\n  nSample = nFrames;\n  nr = numel(startF:stepSize:nFrms);\n  \n  % jitter by removing 50 % and limit a batch to nMaxs * nSamples images\n  if nr > 1 && (~isVal && nr>nDynImgs)\n    rat = min(nDynImgs,ceil(0.50*nr));\n    ri = randperm(nr);\n    ri = ri(1:rat);\n    r = zeros(1,nr);\n    r(ri) = 1;\n  else\n    if nr>2*nDynImgs\n      rat = 2*nDynImgs;\n      ri = randperm(nr);\n      ri = ri(1:rat);\n      r = zeros(1,nr);\n      r(ri) = 1;\n    else\n      r = ones(1,nr);\n    end\n  end\n  \n  c3 = 1;\n  c2 = 0;\n  \n  for f=startF:stepSize:nFrms\n    if r(c3)\n      idx = f:min(f+nSample-1,nFrms) ;\n      if numel(idx)<nFrames\n        idx = [idx idx(end) * ones(1,nFrames-numel(idx))];\n      end\n      namesM{end+1} = name(idx);\n      VideoId1 = [VideoId1 c1 * ones(1,numel(idx))];\n      c1 = c1 + 1;\n      c2 = c2 + 1;\n    end\n    c3 = c3 + 1;\n  end\n  VideoId2 = [VideoId2 v * ones(1,c2) ] ;\nend\n\nimages = strcat([imdb.imageDir filesep], horzcat(namesM{:}) ) ;\n\nim = cnn_video_rgb_get_batch(images, VideoId1, opts, ...\n  'transformation', transformation, 'prefetch', nargout == 0) ;\n\nif nargout > 0\n  if useGpu\n    im = gpuArray(im) ;\n  end\n  inputs = {'input', im, 'label', imdb.images.label(batch), ...\n    'VideoId1', VideoId1, 'VideoId2', VideoId2};\nend\n"
  },
  {
    "path": "dicnn/cnn_init_cafferef.m",
    "content": "% -------------------------------------------------------------------------\nfunction net = cnn_init_cafferef(net,opts)\n% -------------------------------------------------------------------------\n\ndrop6p = find(cellfun(@(a) strcmp(a.name, 'dropout6'), net.layers)==1);\ndrop7p = find(cellfun(@(a) strcmp(a.name, 'dropout7'), net.layers)==1);\n\nif ~isempty(drop6p)\n  assert(~isempty(drop7p));\n  net.layers{drop6p}.rate = opts.DropOutRate;\n  net.layers{drop7p}.rate = opts.DropOutRate;\nelse\n  relu6p = find(cellfun(@(a) strcmp(a.name, 'relu6'), net.layers)==1);\n  relu7p = find(cellfun(@(a) strcmp(a.name, 'relu7'), net.layers)==1);\n\n  drop6 = struct('type','dropout','rate', opts.DropOutRate,'name','dropout6') ;\n  drop7 = struct('type','dropout','rate', opts.DropOutRate,'name','dropout7') ;\n  net.layers = [net.layers(1:relu6p) drop6 net.layers(relu6p+1:relu7p) drop7 net.layers(relu7p+1:end)];\nend\n\n% replace fc8\nfc8l = cellfun(@(a) strcmp(a.name, 'fc8'), net.layers)==1;\n\nnCls = opts.nCls ;\n% nCls = 101;\nsizeW = size(net.layers{fc8l}.weights{1});\n\nif sizeW(4)~=nCls\n  net.layers{fc8l}.weights = {zeros(sizeW(1),sizeW(2),sizeW(3),nCls,'single'), ...\n    zeros(1, nCls, 'single')};\nend\n\n% change loss\n% net.layers(end) = [];\nnet.layers{end} = struct('name','loss', 'type','softmaxloss') ;\n\n% convert to dagnn\nnet = dagnn.DagNN.fromSimpleNN(net, 'canonicalNames', true) ;\n\npoolLyr1 = find(arrayfun(@(a) strcmp(a.name, opts.pool1Layer), net.layers)==1);\nassert(~isempty(poolLyr1));\n% configure appr-rank-pool\nswitch opts.pool1Type\n  case 'arpool'\n    if strcmp(opts.pool1Layer,'conv1')\n      net.addLayer('arpool',AppRankPooling('scale',1),{net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImgN');\n      net.addLayer('l2normalize',L2Normalize('scale',6000,'clip',[-128 128]),...\n        'DynImgN','DynImg');\n    else\n      net.addLayer('arpool',AppRankPooling('scale',0.1),{net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImgN');\n      
net.addLayer('reluP',dagnn.ReLU(),...\n      {'DynImgN'},'DynImg');\n    end\n    net.setLayerInputs(opts.pool1Layer,{'DynImg'}) ;  \n  case 'ppool1'\n    if strcmp(opts.pool1Layer,'conv1')\n      net.addLayer('parampool',LinComb('pad',[1 1 10 1]),...\n      {net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImg',{'conv0f','conv0b'});\n    else\n      net.addLayer('parampool',LinComb('pad',[1 1 10 1]),...\n      {net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImgN',{'conv0f','conv0b'});\n    net.addLayer('reluP',dagnn.ReLU(),...\n      {'DynImgN'},'DynImg');\n    end\n    \n    net.layers(poolLyr1).inputs{1} = 'DynImg' ;\n%     net.params(end-1).value = 0.01 * randn(1,1,10,1,'single');\n    net.params(end-1).value = 0.1 * ones(1,1,10,1,'single');\n    net.params(end).value = zeros(1,1,'single');    \n    \n    net.params(end-1).learningRate = 0.1 ;\n    net.params(end).learningRate = 0.2 ;\n  case 'ppool2'\n    if strcmp(opts.pool1Layer,'conv1')\n      net.addLayer('parampool',LinComb('pad',[1 1 10 1]),...\n      {net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImg',{'conv0f','conv0b'});\n    else\n      net.addLayer('parampool',LinComb('pad',[1 1 10 1]),...\n      {net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImgN',{'conv0f','conv0b'});\n    net.addLayer('reluP',dagnn.ReLU(),...\n      {'DynImgN'},'DynImg');\n    end\n    \n    net.layers(poolLyr1).inputs{1} = 'DynImg' ;\n%     net.params(end-1).value = 0.01 * randn(1,1,10,1,'single');\n    net.params(end-1).value = 0.1 * ones(1,1,10,1,'single');\n    net.params(end).value = zeros(1,1,'single');    \n    \n    net.params(end-1).learningRate = 0.1 ;\n    net.params(end).learningRate = 0.2 ;\n  case 'none'\n    \n  otherwise\n    error('Unknown pool type %s', opts.pool1Type) ;\nend\n\n\n\n% second pool layer (max pooling)\npoolLyr2 = find(arrayfun(@(a) strcmp(a.name, opts.pool2Layer), net.layers)==1);\nnet.addLayer('tempPoolMax',TemporalPooling('method','max'),...\n  
{net.layers(poolLyr2(1)).inputs{1},'VideoId2'},'tempPoolMax');\n\nnet.layers(poolLyr2).inputs{1} = 'tempPoolMax';\n\n% add multi-class error\nnet.addLayer('errMC',ErrorMultiClass(),{'prediction','label'},'mcerr');\n\nnet_ = net.saveobj ;\nnet = dagnn.DagNN.loadobj(net_) ;\n\nnet.removeLayer('loss') ;\nnet.addLayer('loss', ...\n             LossNormalized('loss', 'softmaxlog') ,...\n             {'prediction', 'label'}, ...\n             'objective') ;\n           \n% replace standard matconvnet bnorm with my version\nbns = find(arrayfun(@(a) strcmp(class(a.block), 'dagnn.BatchNorm'), net.layers)==1);\nfor i=1:numel(bns)\n  bb = net.layers(bns(i)).block ;\n  net.layers(bns(i)).block = BatchNormN('numChannels',bb.numChannels,...\n  'epsilon',bb.epsilon,...\n  'opts',bb.opts) ;\nend\n"
  },
  {
    "path": "dicnn/cnn_init_resnext.m",
    "content": "% -------------------------------------------------------------------------\nfunction net = cnn_init_resnext(net,opts)\n% -------------------------------------------------------------------------\n% initialize classifier\nnet = dagnn.DagNN.loadobj(net) ;\n\n% convs = find(arrayfun(@(a) isa(a.block, 'dagnn.Conv'), net.layers)==1);\n\nfclayer = net.getLayer('classifier_0') ;\nsizeW = size(net.params(fclayer.paramIndexes(1)).value);\n\n% opts.nCls = 101;\nnCls = opts.nCls ;\nDropOutRate = opts.DropOutRate ; \n\n\nnet.params(fclayer.paramIndexes(1)).value = ...\n  0.01 * randn([sizeW(1:3),nCls],'single') ;\nnet.params(fclayer.paramIndexes(2)).value = zeros(nCls,1,'single') ;\n\n\n% change loss\nsoftmax = find(arrayfun(@(a) isa(a.block, 'dagnn.SoftMax'), net.layers)==1);\nif ~isempty(softmax)\n  net.removeLayer(net.layers(softmax(1)).name) ;\nend\n% convs = find(arrayfun(@(a) isa(a.block, 'dagnn.Conv'), net.layers)==1);\nfclayer = find(arrayfun(@(a) strcmp(a.name, 'classifier_0'), net.layers)==1);\nnet.renameVar(net.layers(fclayer(end)).name,'prediction') ;\nnet.renameVar('data','input') ;\n\n%------------------------------------------------------------------------%\n% configure appr-rank-pool\nswitch opts.pool1Type\n  case 'arpool'\n    if strcmp(opts.pool1Layer,'conv0')\n      poolLyr1 = 1 ;\n      net.addLayer('arpool',AppRankPooling('scale',0.1),{'input','VideoId1'},'DynImg');\n      net.setLayerInputs(net.layers(1).name,{'DynImg'}) ;\n    else\n      poolLyr1 = find(arrayfun(@(a) strcmp(a.name, opts.pool1Layer), net.layers)==1);\n      assert(~isempty(poolLyr1));\n      net.addLayer('arpool',AppRankPooling('scale',0.1),{net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImg');\n      net.setLayerInputs(opts.pool1Layer,{'DynImg'}) ;\n    end\n  case 'ppool1'\n    if strcmp(opts.pool1Layer,'conv0')\n      poolLyr1 = 1 ;\n    else\n      poolLyr1 = find(arrayfun(@(a) strcmp(a.name, opts.pool1Layer), net.layers)==1);\n    end\n    
net.addLayer('parampool',LinComb('pad',[1 1 10 1]),...\n      {'features_4_0_merge','VideoId1'},'DynImg0',{'conv0f','conv0b'});\n    \n%     net.params(end-1).value = 0.1 * ones(1,1,10,1,'single');\n    net.params(end-1).value = 0.1 * randn(1,1,10,1,'single');\n    net.params(end).value = zeros(1,1,'single');  \n    \n    net.addLayer('BnormDyn',dagnn.BatchNorm('numChannels',256),'DynImg0','DynImg',...\n      {'dym','dyb','dybx'}) ;\n    net.params(end-2).value =  ones(256,1,'single') ;\n    net.params(end-1).value =  zeros(256,1,'single') ;\n    net.params(end).value   =  zeros(256,2,'single') ;\n    \n%     net.addLayer('reluP',dagnn.ReLU(),...\n%       {'DynImg1'},'DynImg');\n    net.layers(16).inputs{1} = 'DynImg' ;\n    for i=numel(net.params)-4:numel(net.params),\n      net.params(i).learningRate = 0.1 * net.params(i).learningRate;\n    end\n  case 'none'\n  otherwise\n    error('Unknown pool type %s', opts.pool1Type) ;\nend\n\n\nnet.rebuild() ;\n%------------------------------------------------------------------------%\n% second pool layer (max pooling)\n% poolLyr2 = find(arrayfun(@(a) strcmp(a.name, 'pool5'), net.layers)==1);\npoolLyr2 = find(arrayfun(@(a) strcmp(a.name, 'features_7_1_merge'), net.layers)==1);\nnet.addLayer('tempPoolMax',TemporalPooling('method','max'),...\n  {net.layers(poolLyr2(1)).outputs{1},'VideoId2'},'tempPoolMax');\n\n% change the input of fc last layer\n% net.setLayerInputs(net.layers(convs(end)).name,'tempPoolMax') ;\n% net.addLayer('bnar',dagnn.BatchNorm('numChannels',2048),{'tempPoolMax'},...\n%   'tempPoolMaxbn',{'bnar_m','bnar_b','bnar_x'});\npoolLyr2next = find(arrayfun(@(a) strcmp(a.name, 'features_7_1_id_relu'), net.layers)==1);\nnet.setLayerInputs(net.layers(poolLyr2next(1)).name,{'tempPoolMax'}) ;\nnet.rebuild() ;\n%------------------------------------------------------------------------%\n% add drop-out layers\nif DropOutRate>0\n\n  pool5 = find(arrayfun(@(a) strcmp(a.name, 'features_8'), net.layers)==1);\n  oo = 
net.layers(pool5(1)).outputs{1};\n  net.addLayer('drop_pool5',dagnn.DropOut('rate',DropOutRate),...\n    oo,sprintf('drop_%s',oo),{});\n  net.setLayerInputs('classifier_permute',{sprintf('drop_%s',oo)}) ;\nend\n\n\n%------------------------------------------------------------------------%\n% add multi-class error\nnet.addLayer('errMC',ErrorMultiClass(),{'prediction','label'},'mcerr');\n\nnet.addLayer('loss', ...\n             LossNormalized('loss', 'softmaxlog') ,...\n             {'prediction', 'label'}, ...\n             'objective') ;\n\n%------------------------------------------------------------------------%\nnet.rebuild()\n\n% replace standard matconvnet bnorm with my version\nbns = find(arrayfun(@(a) strcmp(class(a.block), 'dagnn.BatchNorm'), net.layers)==1);\nfor i=1:numel(bns)\n  bb = net.layers(bns(i)).block ;\n  net.layers(bns(i)).block = BatchNormN('numChannels',bb.numChannels,...\n  'epsilon',bb.epsilon,...\n  'opts',bb.opts) ;\nend\n\n% dagMergeBatchNorm(net) ;\n% dagRemoveLayersOfType(net, 'dagnn.BatchNorm') ;\nnet_ = net.saveobj ;\nnet = dagnn.DagNN.loadobj(net_) ;\nnet.meta.normalization.border = [32 32] ;\n"
  },
  {
    "path": "dicnn/cnn_single_of.m",
    "content": "function [net, info] = cnn_single_of(varargin)\n%CNN_SINGLE_OF Demonstrates fine-tuning a pre-trained CNN with static \n% optical flow (OF in pami journal) on UCF101 dataset\n\nrun(fullfile(fileparts(mfilename('fullpath')), ...\n  '..', 'matconvnet', 'matlab', 'vl_setupnn.m')) ;\n\naddpath Layers Datasets\n\nopts.dataDir = fullfile('data','UCF101') ;\nopts.expDir  = fullfile('exp', 'UCF101') ;\nopts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat.mat') ;\n[opts, varargin] = vl_argparse(opts, varargin) ;\n\nopts.numFetchThreads = 8 ;\n\nopts.lite = false ;\nopts.imdbPath = fullfile(opts.dataDir, 'imdb-of.mat');\n\nopts.DropOutRate = 0.85 ;\nopts.datasetFn = @cnn_ucf101_of_setup_data ;\nopts.networkFn = @cnn_resnext_init ;\n\nopts.split = 1; % data split\nopts.reverseDyn = 0; % reverse video frames e.g.[N:-1:1]\nopts.numDynImgs = 10 ;\nopts.epochFactor = 5 ;\nopts.pool1Layer = 'conv0'; % before conv1\nopts.pool1Type = 'none' ;\nopts.pool2Layer = 'fc6' ;\n\nopts.train = struct() ;\nopts.train.gpus = [];\nopts.train.batchSize = 128 ;\nopts.train.numSubBatches = 32 ;\nopts.train.solver = [] ;\nopts.train.prefetch = true ;\nopts.train.learningRate = 1e-2 ;\nopts.train.numEpochs = 30 ;\n\nopts = vl_argparse(opts, varargin) ;\nif ~isfield(opts.train, 'gpus'), opts.train.gpus = []; end;\n\n\n% -------------------------------------------------------------------------\n%                                                              Prepare data\n% -------------------------------------------------------------------------\n\nif exist(opts.imdbPath,'file')\n  imdb = load(opts.imdbPath) ;\nelse\n  imdb = opts.datasetFn('dataDir', opts.dataDir, 'lite', opts.lite) ;\n  mkdir(opts.expDir) ;\n  save(opts.imdbPath, '-struct', 'imdb') ;\nend\n\n% UCF101 has 3 data splits\nif opts.split>3\n  error('split should be <=3');\nend\nimdb.images.set = imdb.images.sets(opts.split,:);\n\n% reverse frame order\nif opts.reverseDyn\n  for i=1:numel(imdb.images.names)\n    
imdb.images.names{i} = imdb.images.names{i}(end:-1:1);\n  end\nend\n% -------------------------------------------------------------------------\n%                                                             Prepare model\n% -------------------------------------------------------------------------\nnet = load(opts.modelPath);\nif isfield(net,'net')\n  net = net.net;\nend\nopts.nCls = max(imdb.images.label) ;\n% net = dagnn.DagNN.loadobj(net) ;\nnet = opts.networkFn(net,opts) ;\n\n% two channels instead of 3 RGB\nnet.params(1).value = net.params(1).value(:,:,1:2,:) ; \n\n% Set the class names in the network\nnet.meta.classes.name = imdb.classes.name ;\nnet.meta.classes.description = imdb.classes.name ;\n\n% -------------------------------------------------------------------------\n%                                                                     Learn\n% -------------------------------------------------------------------------\nif opts.epochFactor>0\n  opts.train.train = repmat(find(imdb.images.set==1),[1 opts.epochFactor]) ;\nelse\n  opts.train.train = NaN ;\nend\nopts.train.val = find(imdb.images.set==3) ;\n\n[net, info] = cnn_train_dicnn_dag(net, imdb, getBatchFn(opts, net.meta), ...\n                      'expDir', opts.expDir, ...\n                      opts.train) ;\n\n% -------------------------------------------------------------------------\n%                                                          Report accuracy\n% -------------------------------------------------------------------------\nerrlayer = net.getLayerIndex('errMC') ;\n\nif ~isnan(errlayer)\n  cats = imdb.classes.name ;\n  accs = net.layers(errlayer).block.accuracy ; \n  \n  if numel(cats)~=numel(accs)\n    error('wrong number of classes\\n') ;\n  end\n  \n  for i=1:numel(cats)\n    fprintf('%s acc %.1f\\n',cats{i},100*accs(i)) ;\n  end\n  fprintf('Mean accuracy %.1f\\n',100*mean(accs)) ;\nend\n\n% -------------------------------------------------------------------------\nfunction fn = 
getBatchFn(opts, meta)\n% -------------------------------------------------------------------------\nuseGpu = numel(opts.train.gpus) > 0 ;\n\nbopts.numThreads = opts.numFetchThreads ;\nbopts.imageSize = meta.normalization.imageSize ;\nif isfield(meta.normalization,'border')\n  bopts.border = meta.normalization.border ;  \nelse\n  bopts.border = meta.normalization.imageSize(1:2) ./ ...\n    meta.normalization.cropSize - meta.normalization.imageSize(1:2);\nend\n\nbopts.averageImage = 128 * ones([1 1 2],'single') ;\nbopts.numDynImgs = opts.numDynImgs ;\n% bopts.averageImage = meta.normalization.averageImage ;\n% bopts.rgbVariance = meta.augmentation.rgbVariance ;\n% bopts.transformation = meta.augmentation.transformation ;\nbopts.transformation = 'stretch' ;\nbopts.transformation = 'multiScaleRegular' ;\n\nfn = @(x,y) getDagNNBatch(bopts,useGpu,x,y) ;\n\n\n\n% -------------------------------------------------------------------------\nfunction inputs = getDagNNBatch(opts, useGpu, imdb, batch)\n% -------------------------------------------------------------------------\n\n% batch refers to videos (not for frames)\nif isempty(batch)\n  inputs = {'input', [], 'label', [], 'VideoId1', [], 'VideoId2', []};\n  return;\nend\n\nisVal = ~isempty(batch) && imdb.images.set(batch(1)) ~= 1 ;\n\nif ~isVal, transformation='multiScaleRegular'; else transformation='none';end\n\nnames = imdb.images.names(batch);\n\n\n% images = strcat([imdb.imageDir filesep], imdb.images.name(batch)) ;\n\nnamesM = {};\nnVids = numel(batch);\n\nVideoId1 = [];\nVideoId2 = [];\n\n% step-size\nstepSize = 6;\n% pool nFrames into a dynamic image\nnFrames = 1;\n% number of dynamic images to be max pooled later\nnDynImgs = opts.numDynImgs ;\nopts = rmfield(opts,'numDynImgs') ;\n\n\nc1 = 1;\nfor v=1:nVids\n  \n  name = names{v};\n  nFrms = numel(name)/2;\n\n  nSample = nFrames;\n  nr = numel(1:stepSize:nFrms);\n  \n  % jitter by removing 50 % and limit a batch to nMaxs * nSamples images\n  if nr > 1 && (~isVal 
&& nr>nDynImgs)\n    rat = min(nDynImgs,ceil(0.50*nr));\n    ri = randperm(nr);\n    ri = ri(1:rat);\n    r = zeros(1,nr);\n    r(ri) = 1;\n  else\n    r = ones(1,nr);\n  end\n  \n  c3 = 1;\n  c2 = 0;\n  \n  for f=1:stepSize:nFrms\n    if r(c3)\n      idx = f:min(f+nSample-1,nFrms) ;\n      if numel(idx)<nFrames\n        idx = [idx idx(end) * ones(1,nFrames-numel(idx))];\n      end\n      idxu = 2*idx - 1;\n      idxv = 2*idx;\n      idxuv = zeros(1,2 * numel(idxu)) ;\n      idxuv(1:2:end) = idxu ;\n      idxuv(2:2:end) = idxv ;\n            \n      namesM{end+1} = name(idxuv);\n      VideoId1 = [VideoId1 c1 * ones(1,numel(idx))];\n      c1 = c1 + 1;\n      c2 = c2 + 1;\n    end\n    c3 = c3 + 1;\n  end\n  VideoId2 = [VideoId2 v * ones(1,c2) ] ;\nend\n\nimages = strcat([imdb.imageDir filesep], horzcat(namesM{:}) ) ;\n\nim = cnn_video_of_get_batch(images, VideoId1, opts, ...\n  'transformation', transformation, 'prefetch', nargout == 0, ...\n  'subMean', false) ;\n\nif nargout > 0\n  if useGpu\n    im = gpuArray(im) ;\n  end\n  inputs = {'input', im, 'label', imdb.images.label(batch), ...\n    'VideoId2', VideoId2};\n\nend\n"
  },
  {
    "path": "dicnn/cnn_single_rgb.m",
    "content": "  function [net, info] = cnn_single_rgb(varargin)\n%CNN_SINGLE_RGB Demonstrates fine-tuning a pre-trained CNN with static \n% RGB frames (SI in pami journal) on UCF101 dataset\n\n\nrun(fullfile(fileparts(mfilename('fullpath')), ...\n  '..', 'matconvnet', 'matlab', 'vl_setupnn.m')) ;\n\naddpath Layers Datasets\n\nopts.dataDir = fullfile('data','UCF101') ;\nopts.expDir  = fullfile('exp', 'UCF101') ;\nopts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat');\nopts.datasetFn = @cnn_ucf101_setup_data ;\nopts.networkFn = @cnn_init_resnext ;\nopts.pool1Type = 'none' ;\nopts.pool1Layer = 'conv1' ;\nopts.pool2Layer = '' ;\n[opts, varargin] = vl_argparse(opts, varargin) ;\n\nopts.numFetchThreads = 8 ;\n\nopts.lite = false ;\nopts.imdbPath = fullfile(opts.dataDir, 'imdb-rgb.mat');\nopts.ARPoolLayer = 'conv0'; % before conv1\nopts.DropOutRate = 0.5 ;\nopts.epochFactor = 5 ;\n\nopts.split = 1; % data split\nopts.reverseDyn = 0; % reverse video frames e.g.[N:-1:1]\nopts.train = struct() ;\nopts.train.gpus = [];\nopts.train.batchSize = 128 ;\nopts.train.numSubBatches = 16 ;\nopts.train.solver = [] ;\nopts.train.prefetch = true ;\nopts.train.numEpochs = 30 ;\n% resnet50\nopts.train.learningRate = 1e-2 * [ones(1,2), 0.1*ones(1,1), 0.01*ones(1,1)];\n% caffe-ref\nopts.train.learningRate = 1e-4 * [ones(1,2), 0.1*ones(1,1), 0.01*ones(1,1)];\n\nopts = vl_argparse(opts, varargin) ;\nif ~isfield(opts.train, 'gpus'), opts.train.gpus = []; end;\n% opts.train.numEpochs = numel(opts.train.learningRate);\n\n% -------------------------------------------------------------------------\n%                                                              Prepare data\n% -------------------------------------------------------------------------\n\nif exist(opts.imdbPath,'file')\n  imdb = load(opts.imdbPath) ;\nelse\n  imdb = opts.datasetFn('dataDir', opts.dataDir, 'lite', opts.lite) ;\n  mkdir(opts.expDir) ;\n  save(opts.imdbPath, '-struct', 'imdb') ;\nend\n\n% UCF101 has 3 data 
splits\nif opts.split>3\n  error('split should be <=3');\nend\nimdb.images.set = imdb.images.sets(opts.split,:);\n\n% reverse frame order\nif opts.reverseDyn\n  for i=1:numel(imdb.images.names)\n    imdb.images.names{i} = imdb.images.names{i}(end:-1:1);\n  end\nend\n\n% -------------------------------------------------------------------------\n%                                                             Prepare model\n% -------------------------------------------------------------------------\nnet = load(opts.modelPath);\nif isfield(net,'net')\n  net = net.net;\nend\nopts.nCls = max(imdb.images.label) ;\nnet = opts.networkFn(net,opts);\n\nif numel(net.meta.normalization.averageImage)>3\n  sz = size(net.meta.normalization.averageImage) ;\n  net.meta.normalization.averageImage = ...\n    mean(reshape(net.meta.normalization.averageImage,[sz(1)*sz(2) sz(3)]),1) ;\nend\n\n% Set the class names in the network\nnet.meta.classes.name = imdb.classes.name ;\nnet.meta.classes.description = imdb.classes.name ;\n% -------------------------------------------------------------------------\n%                                                                     Learn\n% -------------------------------------------------------------------------\nif opts.epochFactor>0\n  opts.train.train = repmat(find(imdb.images.set==1),[1 opts.epochFactor]) ;\nelse\n  opts.train.train = NaN ;\nend\nopts.train.val = find(imdb.images.set==3) ;\n\n[net, info] = cnn_train_dicnn_dag(net, imdb, getBatchFn(opts, net.meta), ...\n                      'expDir', opts.expDir, ...\n                      opts.train) ;\n\n% -------------------------------------------------------------------------\n%                                                          Report accuracy\n% -------------------------------------------------------------------------\nerrlayer = net.getLayerIndex('errMC') ;\n\nif ~isnan(errlayer)\n  cats = imdb.classes.name ;\n  accs = net.layers(errlayer).block.accuracy ; \n  \n  if 
numel(cats)~=numel(accs)\n    error('wrong number of classes\\n') ;\n  end\n  \n  for i=1:numel(cats)\n    fprintf('%s acc %.1f\\n',cats{i},100*accs(i)) ;\n  end\n  fprintf('Mean accuracy %.1f\\n',100*mean(accs)) ;\nend\n\n% -------------------------------------------------------------------------\nfunction fn = getBatchFn(opts, meta)\n% -------------------------------------------------------------------------\nuseGpu = numel(opts.train.gpus) > 0 ;\n\nbopts.numThreads = opts.numFetchThreads ;\nbopts.imageSize = meta.normalization.imageSize ;\nif isfield(meta.normalization,'border')\n  bopts.border = meta.normalization.border ;  \nelse\n  bopts.border = meta.normalization.imageSize(1:2) ./ ...\n    meta.normalization.cropSize - meta.normalization.imageSize(1:2);\n\nend\n\n% bopts.averageImage = []; \nbopts.averageImage = meta.normalization.averageImage ;\nbopts.interpolation = meta.normalization.interpolation ;\nbopts.keepAspect = meta.normalization.keepAspect ;\n% bopts.rgbVariance = meta.augmentation.rgbVariance ;\n% bopts.transformation = meta.augmentation.transformation ;\n\n\nfn = @(x,y) getDagNNBatch(bopts,useGpu,x,y) ;\n\n\n\n% -------------------------------------------------------------------------\nfunction inputs = getDagNNBatch(opts, useGpu, imdb, batch)\n% -------------------------------------------------------------------------\n\n% batch refers to videos (not for frames)\nif isempty(batch)\n  inputs = {'input', [], 'label', [], 'VideoId1', [], 'VideoId2', []};\n  return;\nend\n\nisVal = ~isempty(batch) && imdb.images.set(batch(1)) ~= 1 ;\n\n% if ~isVal, transformation='stretch'; else transformation='none';end\nif ~isVal, transformation='multiScaleRegular'; else transformation='none';end\n\nnames = imdb.images.names(batch);\n\n\n% images = strcat([imdb.imageDir filesep], imdb.images.name(batch)) ;\n\nnamesM = {};\nnVids = numel(batch);\n\nVideoId1 = [];\nVideoId2 = [];\n\n% step-size\nstepSize = 6;\n% pool nFrames into a dynamic image\nnFrames = 1;\n% 
number of dynamic images to be max pooled later\nnDynImgs = 10;\n\n\nc1 = 1;\nfor v=1:nVids\n  \n  name = names{v};\n  nFrms = numel(name);\n\n  nSample = nFrames;\n  nr = numel(1:stepSize:nFrms);\n  \n  % jitter by removing 50 % and limit a batch to nMaxs * nSamples images\n  if nr > 1 && (~isVal && nr>nDynImgs)\n    rat = min(nDynImgs,ceil(0.50*nr));\n    ri = randperm(nr);\n    ri = ri(1:rat);\n    r = zeros(1,nr);\n    r(ri) = 1;\n  else\n    r = ones(1,nr);\n  end\n  \n  c3 = 1;\n  c2 = 0;\n  \n  for f=1:stepSize:nFrms\n    if r(c3)\n      idx = f:min(f+nSample-1,nFrms) ;\n      if numel(idx)<nFrames\n        idx = [idx idx(end) * ones(1,nFrames-numel(idx))];\n      end\n      namesM{end+1} = name(idx);\n      VideoId1 = [VideoId1 c1 * ones(1,numel(idx))];\n      c1 = c1 + 1;\n      c2 = c2 + 1;\n    end\n    c3 = c3 + 1;\n  end\n  VideoId2 = [VideoId2 v * ones(1,c2) ] ;\nend\n\nimages = strcat([imdb.imageDir filesep], horzcat(namesM{:}) ) ;\n\nim = cnn_video_rgb_get_batch(images, VideoId1, opts, ...\n  'transformation', transformation, 'prefetch', nargout == 0, ...\n  'subMean', false) ;\n\nif nargout > 0\n  if useGpu\n    im = gpuArray(im) ;\n  end\n  inputs = {'input', im, 'label', imdb.images.label(batch), ...\n    'VideoId2', VideoId2};\nend\n"
  },
  {
    "path": "dicnn/cnn_train_dicnn_dag.m",
    "content": "function [net,stats] = cnn_train_dicnn_dag(net, imdb, getBatch, varargin)\n%CNN_DICNN_TRAIN_DAG Demonstrates training a CNN using the DagNN wrapper\n%    CNN_TRAIN_DAG() is similar to CNN_TRAIN(), but works with\n%    the DagNN wrapper instead of the SimpleNN wrapper.\n\n% Copyright (C) 2014-16 Andrea Vedaldi.\n% All rights reserved.\n%\n% This file is part of the VLFeat library and is made available under\n% the terms of the BSD license (see the COPYING file).\naddpath(fullfile(vl_rootnn, 'examples'));\n\nopts.expDir = fullfile('data','exp') ;\nopts.continue = true ;\nopts.batchSize = 256 ;\nopts.numSubBatches = 1 ;\nopts.train = [] ;\nopts.val = [] ;\nopts.gpus = [] ;\nopts.prefetch = false ;\nopts.epochSize = inf;\nopts.numEpochs = 300 ;\nopts.learningRate = 0.001 ;\nopts.weightDecay = 0.0005 ;\n\nopts.solver = [] ;  % Empty array means use the default SGD solver\n[opts, varargin] = vl_argparse(opts, varargin) ;\nif ~isempty(opts.solver)\n  assert(isa(opts.solver, 'function_handle') && nargout(opts.solver) == 2,...\n    'Invalid solver; expected a function handle with two outputs.') ;\n  % Call without input arguments, to get default options\n  opts.solverOpts = opts.solver() ;\nend\n\nopts.momentum = 0.9 ;\nopts.saveSolverState = true ;\nopts.nesterovUpdate = false ;\nopts.randomSeed = 0 ;\nopts.profile = false ;\nopts.parameterServer.method = 'mmap' ;\nopts.parameterServer.prefix = 'mcn' ;\n\nopts.derOutputs = {'objective', 1} ;\nopts.extractStatsFn = @extractStats ;\nopts.plotStatistics = true;\nopts.postEpochFn = [] ;  % postEpochFn(net,params,state) called after each epoch; can return a new learning rate, 0 to stop, [] for no change\nopts = vl_argparse(opts, varargin) ;\n\nif ~exist(opts.expDir, 'dir'), mkdir(opts.expDir) ; end\nif isempty(opts.train), opts.train = find(imdb.images.set==1) ; end\nif isempty(opts.val), opts.val = find(imdb.images.set==2) ; end\nif isscalar(opts.train) && isnumeric(opts.train) && isnan(opts.train)\n  
opts.train = [] ;\nend\nif isscalar(opts.val) && isnumeric(opts.val) && isnan(opts.val)\n  opts.val = [] ;\nend\n\n% -------------------------------------------------------------------------\n%                                                            Initialization\n% -------------------------------------------------------------------------\n\nevaluateMode = isempty(opts.train) ;\nif ~evaluateMode\n  if isempty(opts.derOutputs)\n    error('DEROUTPUTS must be specified when training.\\n') ;\n  end\nend\n\n% -------------------------------------------------------------------------\n%                                                        Train and validate\n% -------------------------------------------------------------------------\n\nmodelPath = @(ep) fullfile(opts.expDir, sprintf('net-epoch-%d.mat', ep));\nmodelFigPath = fullfile(opts.expDir, 'net-train.pdf') ;\n\nstart = opts.continue * findLastCheckpoint(opts.expDir) ;\nif start >= 1\n  fprintf('%s: resuming by loading epoch %d\\n', mfilename, start) ;\n  [net, state, stats] = loadState(modelPath(start)) ;\nelse\n  state = [] ;\nend\n\nfor epoch=start+1:opts.numEpochs\n\n  % Set the random seed based on the epoch and opts.randomSeed.\n  % This is important for reproducibility, including when training\n  % is restarted from a checkpoint.\n\n  rng(epoch + opts.randomSeed) ;\n  prepareGPUs(opts, epoch == start+1) ;\n\n  % Train for one epoch.\n  params = opts ;\n  params.epoch = epoch ;\n  params.learningRate = opts.learningRate(min(epoch, numel(opts.learningRate))) ;\n  params.train = opts.train(randperm(numel(opts.train))) ; % shuffle\n  params.train = params.train(1:min(opts.epochSize, numel(opts.train)));\n  params.val = opts.val(randperm(numel(opts.val))) ;\n  params.imdb = imdb ;\n  params.getBatch = getBatch ;\n\n  if numel(opts.gpus) <= 1\n    [net, state] = processEpoch(net, state, params, 'train') ;\n    [net, state] = processEpoch(net, state, params, 'val') ;\n    if ~evaluateMode\n      
saveState(modelPath(epoch), net, state) ;\n    end\n    lastStats = state.stats ;\n  else\n    spmd\n      [net, state] = processEpoch(net, state, params, 'train') ;\n      [net, state] = processEpoch(net, state, params, 'val') ;\n      if labindex == 1 && ~evaluateMode\n        saveState(modelPath(epoch), net, state) ;\n      end\n      lastStats = state.stats ;\n    end\n    lastStats = accumulateStats(lastStats) ;\n  end\n\n  stats.train(epoch) = lastStats.train ;\n  stats.val(epoch) = lastStats.val ;\n  clear lastStats ;\n  saveStats(modelPath(epoch), stats) ;\n\n  if opts.plotStatistics\n    switchFigure(1) ; clf ;\n    plots = setdiff(...\n      cat(2,...\n      fieldnames(stats.train)', ...\n      fieldnames(stats.val)'), {'num', 'time'}) ;\n    for p = plots\n      p = char(p) ;\n      values = zeros(0, epoch) ;\n      leg = {} ;\n      for f = {'train', 'val'}\n        f = char(f) ;\n        if isfield(stats.(f), p)\n          tmp = [stats.(f).(p)] ;\n          values(end+1,:) = tmp(1,:)' ;\n          leg{end+1} = f ;\n        end\n      end\n      subplot(1,numel(plots),find(strcmp(p,plots))) ;\n      plot(1:epoch, values','o-') ;\n      xlabel('epoch') ;\n      title(p) ;\n      legend(leg{:}) ;\n      grid on ;\n    end\n    drawnow ;\n    print(1, modelFigPath, '-dpdf') ;\n  end\n  \n  if ~isempty(opts.postEpochFn)\n    if nargout(opts.postEpochFn) == 0\n      opts.postEpochFn(net, params, state) ;\n    else\n      lr = opts.postEpochFn(net, params, state) ;\n      if ~isempty(lr), opts.learningRate = lr; end\n      if opts.learningRate == 0, break; end\n    end\n  end\nend\n\n% With multiple GPUs, return one copy\nif isa(net, 'Composite'), net = net{1} ; end\n\n% -------------------------------------------------------------------------\nfunction [net, state] = processEpoch(net, state, params, mode)\n% -------------------------------------------------------------------------\n% Note that net is not strictly needed as an output argument as net\n% is a 
handle class. However, this fixes some aliasing issue in the\n% spmd caller.\n\n% initialize with momentum 0\nif isempty(state) || isempty(state.solverState)\n  state.solverState = cell(1, numel(net.params)) ;\n  state.solverState(:) = {0} ;\nend\n\n% move CNN  to GPU as needed\nnumGpus = numel(params.gpus) ;\nif numGpus >= 1\n  net.move('gpu') ;\n  for i = 1:numel(state.solverState)\n    s = state.solverState{i} ;\n    if isnumeric(s)\n      state.solverState{i} = gpuArray(s) ;\n    elseif isstruct(s)\n      state.solverState{i} = structfun(@gpuArray, s, 'UniformOutput', false) ;\n    end\n  end\nend\nif numGpus > 1\n  parserv = ParameterServer(params.parameterServer) ;\n  net.setParameterServer(parserv) ;\nelse\n  parserv = [] ;\nend\n\n% profile\nif params.profile\n  if numGpus <= 1\n    profile clear ;\n    profile on ;\n  else\n    mpiprofile reset ;\n    mpiprofile on ;\n  end\nend\n\nnum = 0 ;\nepoch = params.epoch ;\nsubset = params.(mode) ;\nadjustTime = 0 ;\n\nstats.num = 0 ; % return something even if subset = []\nstats.time = 0 ;\n\nstart = tic ;\nfor t=1:params.batchSize:numel(subset)\n  fprintf('%s: epoch %02d: %3d/%3d:', mode, epoch, ...\n          fix((t-1)/params.batchSize)+1, ceil(numel(subset)/params.batchSize)) ;\n  batchSize = min(params.batchSize, numel(subset) - t + 1) ;\n\n  for s=1:params.numSubBatches\n    % get this image batch and prefetch the next\n    batchStart = t + (labindex-1) + (s-1) * numlabs ;\n    batchEnd = min(t+params.batchSize-1, numel(subset)) ;\n    batch = subset(batchStart : params.numSubBatches * numlabs : batchEnd) ;\n    num = num + numel(batch) ;\n    if numel(batch) == 0, continue ; end\n\n    inputs = params.getBatch(params.imdb, batch) ;\n\n    if params.prefetch\n      if s == params.numSubBatches\n        batchStart = t + (labindex-1) + params.batchSize ;\n        batchEnd = min(t+2*params.batchSize-1, numel(subset)) ;\n      else\n        batchStart = batchStart + numlabs ;\n      end\n      nextBatch = 
subset(batchStart : params.numSubBatches * numlabs : batchEnd) ;\n      params.getBatch(params.imdb, nextBatch) ;\n    end\n\n    if strcmp(mode, 'train')\n      net.mode = 'normal' ;\n      net.accumulateParamDers = (s ~= 1) ;\n      net.eval(inputs, params.derOutputs, 'holdOn', s < params.numSubBatches) ;\n    else\n      net.mode = 'test' ;\n      net.eval(inputs) ;\n    end\n  end\n\n  % Accumulate gradient.\n  if strcmp(mode, 'train')\n    if ~isempty(parserv), parserv.sync() ; end\n    state = accumulateGradients(net, state, params, parserv) ;\n  end\n\n  % Get statistics.\n  time = toc(start) + adjustTime ;\n  batchTime = time - stats.time ;\n  stats.num = num ;\n  stats.time = time ;\n  stats = params.extractStatsFn(stats,net) ;\n  currentSpeed = batchSize / batchTime ;\n  averageSpeed = (t + batchSize - 1) / time ;\n  if t == 3*params.batchSize + 1\n    % compensate for the first three iterations, which are outliers\n    adjustTime = 4*batchTime - time ;\n    stats.time = time + adjustTime ;\n  end\n\n  fprintf(' %.1f (%.1f) Hz', averageSpeed, currentSpeed) ;\n  for f = setdiff(fieldnames(stats)', {'num', 'time'})\n    f = char(f) ;\n    fprintf(' %s: %.3f', f, stats.(f)) ;\n  end\n  fprintf('\\n') ;\nend\n\n% Save back to state.\nstate.stats.(mode) = stats ;\nif params.profile\n  if numGpus <= 1\n    state.prof.(mode) = profile('info') ;\n    profile off ;\n  else\n    state.prof.(mode) = mpiprofile('info');\n    mpiprofile off ;\n  end\nend\nif ~params.saveSolverState\n  state.solverState = [] ;\nelse\n  for i = 1:numel(state.solverState)\n    s = state.solverState{i} ;\n    if isnumeric(s)\n      state.solverState{i} = gather(s) ;\n    elseif isstruct(s)\n      state.solverState{i} = structfun(@gather, s, 'UniformOutput', false) ;\n    end\n  end\nend\n\nnet.reset() ;\nnet.move('cpu') ;\n\n% -------------------------------------------------------------------------\nfunction state = accumulateGradients(net, state, params, parserv)\n% 
-------------------------------------------------------------------------\nnumGpus = numel(params.gpus) ;\notherGpus = setdiff(1:numGpus, labindex) ;\n\nden = params.numSubBatches * max(numGpus,1) ;\n\nfor p=1:numel(net.params)\n\n  if ~isempty(parserv)\n    parDer = parserv.pullWithIndex(p) ;\n  else\n    parDer = net.params(p).der ;\n  end\n\n  switch net.params(p).trainMethod\n\n    case 'average' % mainly for batch normalization\n      thisLR = net.params(p).learningRate ;\n      net.params(p).value = vl_taccum(...\n          1 - thisLR, net.params(p).value, ...\n          (thisLR/den/net.params(p).fanout),  parDer) ;\n\n    case 'gradient'\n      thisDecay = params.weightDecay * net.params(p).weightDecay ;\n      thisLR = params.learningRate * net.params(p).learningRate ;\n\n      if thisLR>0 || thisDecay>0\n        % Normalize gradient and incorporate weight decay.\n        parDer = vl_taccum(1/den, parDer, ...\n                           thisDecay, net.params(p).value) ;\n\n        if isempty(params.solver)\n          % Default solver is the optimised SGD.\n          % Update momentum.\n          state.solverState{p} = vl_taccum(...\n            params.momentum, state.solverState{p}, ...\n            -1, parDer) ;\n\n          % Nesterov update (aka one step ahead).\n          if params.nesterovUpdate\n            delta = params.momentum * state.solverState{p} - parDer ;\n          else\n            delta = state.solverState{p} ;\n          end\n\n          % Update parameters.\n          net.params(p).value = vl_taccum(...\n            1,  net.params(p).value, thisLR, delta) ;\n\n        else\n          % call solver function to update weights\n          [net.params(p).value, state.solverState{p}] = ...\n            params.solver(net.params(p).value, state.solverState{p}, ...\n            parDer, params.solverOpts, thisLR) ;\n        end\n      end\n    otherwise\n      error('Unknown training method ''%s'' for parameter ''%s''.', ...\n        
net.params(p).trainMethod, ...\n        net.params(p).name) ;\n  end\nend\n\n% -------------------------------------------------------------------------\nfunction stats = accumulateStats(stats_)\n% -------------------------------------------------------------------------\n\nfor s = {'train', 'val'}\n  s = char(s) ;\n  total = 0 ;\n\n  % initialize stats stucture with same fields and same order as\n  % stats_{1}\n  stats__ = stats_{1} ;\n  names = fieldnames(stats__.(s))' ;\n  values = zeros(1, numel(names)) ;\n  fields = cat(1, names, num2cell(values)) ;\n  stats.(s) = struct(fields{:}) ;\n\n  for g = 1:numel(stats_)\n    stats__ = stats_{g} ;\n    num__ = stats__.(s).num ;\n    total = total + num__ ;\n\n    for f = setdiff(fieldnames(stats__.(s))', 'num')\n      f = char(f) ;\n      stats.(s).(f) = stats.(s).(f) + stats__.(s).(f) * num__ ;\n\n      if g == numel(stats_)\n        stats.(s).(f) = stats.(s).(f) / total ;\n      end\n    end\n  end\n  stats.(s).num = total ;\nend\n\n% -------------------------------------------------------------------------\nfunction stats = extractStats(stats, net)\n% -------------------------------------------------------------------------\nsel = find(cellfun(@(x) isa(x,'dagnn.Loss'), {net.layers.block})) ;\nfor i = 1:numel(sel)\n  if net.layers(sel(i)).block.ignoreAverage, continue; end;\n  stats.(net.layers(sel(i)).outputs{1}) = net.layers(sel(i)).block.average ;\nend\n\n% -------------------------------------------------------------------------\nfunction saveState(fileName, net_, state)\n% -------------------------------------------------------------------------\nnet = net_.saveobj() ;\nsave(fileName, 'net', 'state') ;\n\n% -------------------------------------------------------------------------\nfunction saveStats(fileName, stats)\n% -------------------------------------------------------------------------\nif exist(fileName)\n  save(fileName, 'stats', '-append') ;\nelse\n  save(fileName, 'stats') ;\nend\n\n% 
-------------------------------------------------------------------------\nfunction [net, state, stats] = loadState(fileName)\n% -------------------------------------------------------------------------\nload(fileName, 'net', 'state', 'stats') ;\nnet = dagnn.DagNN.loadobj(net) ;\nif isempty(whos('stats'))\n  error('Epoch ''%s'' was only partially saved. Delete this file and try again.', ...\n        fileName) ;\nend\n\n% -------------------------------------------------------------------------\nfunction epoch = findLastCheckpoint(modelDir)\n% -------------------------------------------------------------------------\nlist = dir(fullfile(modelDir, 'net-epoch-*.mat')) ;\ntokens = regexp({list.name}, 'net-epoch-([\\d]+).mat', 'tokens') ;\nepoch = cellfun(@(x) sscanf(x{1}{1}, '%d'), tokens) ;\nepoch = max([epoch 0]) ;\n\n% -------------------------------------------------------------------------\nfunction switchFigure(n)\n% -------------------------------------------------------------------------\nif get(0,'CurrentFigure') ~= n\n  try\n    set(0,'CurrentFigure',n) ;\n  catch\n    figure(n) ;\n  end\nend\n\n% -------------------------------------------------------------------------\nfunction clearMex()\n% -------------------------------------------------------------------------\nclear vl_tmove vl_imreadjpeg ;\n\n% -------------------------------------------------------------------------\nfunction prepareGPUs(opts, cold)\n% -------------------------------------------------------------------------\nnumGpus = numel(opts.gpus) ;\nif numGpus > 1\n  % check parallel pool integrity as it could have timed out\n  pool = gcp('nocreate') ;\n  if ~isempty(pool) && pool.NumWorkers ~= numGpus\n    delete(pool) ;\n  end\n  pool = gcp('nocreate') ;\n  if isempty(pool)\n    parpool('local', numGpus) ;\n    cold = true ;\n  end\n\nend\nif numGpus >= 1 && cold\n  fprintf('%s: resetting GPU\\n', mfilename)\n  clearMex() ;\n  if numGpus == 1\n    gpuDevice(opts.gpus)\n  else\n    spmd\n      
clearMex() ;\n      gpuDevice(opts.gpus(labindex))\n    end\n  end\nend\n"
  },
  {
    "path": "dicnn/cnn_video_of_get_batch.m",
    "content": "function imo = cnn_video_of_get_batch(images, vids, varargin)\n% CNN_VIDEO_OF_GET_BATCH  Load, preprocess, and pack images for CNN evaluation\n\n% video ids\n% use same spatial jittering for frames from the same video\n% NOTE: all the frames from a video should have the same size (wxh)\n\nopts.imageSize = [227, 227] ;\nopts.border = [29, 29] ;\nopts.keepAspect = true ;\nopts.numAugments = 1 ;\nopts.transformation = 'multiScaleRegular' ;\nopts.averageImage = [] ;\nopts.rgbVariance = zeros(0,2,'single') ;\nopts.interpolation = 'bilinear' ;\nopts.numThreads = 1 ;\nopts.prefetch = false ;\nopts.lazyResize = true ;\nopts.subMean = false; % subtract the mean from each video\nopts = vl_argparse(opts, varargin);\n\n% fetch is true if images is a list of filenames (instead of\n% a cell array of images)\nfetch = numel(images) >= 1 && ischar(images{1}) ;\n\n% prefetch is used to load images in a separate thread\nprefetch = fetch & opts.prefetch ;\n\nif prefetch\n  vl_imreadjpeg(images, 'numThreads', opts.numThreads, 'prefetch') ;\n  imo = [] ;\n  return ;\nend\nif fetch\n  im = vl_imreadjpeg(images,'numThreads', opts.numThreads) ;\nelse\n  im = images ;\nend\n\ntfs = [] ;\nswitch opts.transformation\n  case 'none'\n    tfs = [\n      .5 ;\n      .5 ;\n      0 ] ;\n  case 'f5'\n    tfs = [...\n      .5 0 0 1 1 .5 0 0 1 1 ;\n      .5 0 1 0 1 .5 0 1 0 1 ;\n      0 0 0 0 0  1 1 1 1 1] ;\n  case 'f25'\n    [tx,ty] = meshgrid(linspace(0,1,5)) ;\n    tfs = [tx(:)' ; ty(:)' ; zeros(1,numel(tx))] ;\n    tfs_ = tfs ;\n    tfs_(3,:) = 1 ;\n    tfs = [tfs,tfs_] ;\n  case 'stretch'\n  case 'multiScaleRegular'\n  otherwise\n    error('Uknown transformations %s', opts.transformation) ;\nend\n[~,transformations] = sort(rand(size(tfs,2), numel(images)), 1) ;\n\nif ~isempty(opts.rgbVariance) && isempty(opts.averageImage)\n  opts.averageImage = zeros(1,1,2) ;\nend\nif numel(opts.averageImage) == 2\n  opts.averageImage = reshape(opts.averageImage, 1,1,2) ;\nend\n\nimo = 
zeros(opts.imageSize(1), opts.imageSize(2), 2, ...\n  numel(images)/2*opts.numAugments, 'single') ;\n\nnVid = max(vids);\nsi = 1 ;\ncountv = 1;\nfor v=1:nVid\n  \n  vid = find(vids==v);\n  \n  for i=1:numel(images(vid))\n    \n    % acquire image\n    if isempty(im{i})\n      imt1 = imread(images{2*vid(i)-1}) ;\n      imt2 = imread(images{2*vid(i)}) ;\n    else\n      imt1 = im{2*vid(i)-1} ;\n      imt2 = im{2*vid(i)} ;\n    end\n    imt = single(cat(3,imt1,imt2)) ; % faster than im2single (and multiplies by 255)\n \n    % resize\n    w = size(imt,2) ;\n    h = size(imt,1) ;\n    factor = [(opts.imageSize(1)+opts.border(1))/h ...\n      (opts.imageSize(2)+opts.border(2))/w];\n    \n    if opts.keepAspect\n      factor = max(factor) ;\n    end\n    if any(abs(factor - 1) > 0.0001)\n      imt = imresize(imt, ...\n        'scale', factor, ...\n        'method', opts.interpolation) ;\n    end\n    \n    % crop & flip\n    if i==1\n      flip = rand > 0.5 ;\n      w = size(imt,2) ;\n      h = size(imt,1) ;\n      switch opts.transformation\n        case 'stretch'\n          sz = round(min(opts.imageSize(1:2)' .* (1-0.1+0.2*rand(2,1)), [w;h])) ;\n          dx = randi(w - sz(2) + 1, 1) ;\n          dy = randi(h - sz(1) + 1, 1) ;\n%           flip = rand > 0.5 ;\n        case 'multiScaleRegular'\n          reg_szs = [256, 224, 192, 168] ;          \n          sz(1) = reg_szs(randi(4)); sz(2) = reg_szs(randi(4));\n \n          dy = [0 h-sz(1) 0 h-sz(1)  floor((h-sz(1)+1)/2)] + 1;\n          dx = [0 w-sz(2) w-sz(2) 0 floor((w-sz(2)+1)/2)] + 1;\n          corner = randi(5);\n          dx = dx(corner); dy = dy(corner); \n        otherwise\n          tf = tfs(:, transformations(mod(0, numel(transformations)) + 1)) ;\n          sz = opts.imageSize(1:2) ;\n          dx = floor((w - sz(2)) * tf(2)) + 1 ;\n          dy = floor((h - sz(1)) * tf(1)) + 1 ;\n%           flip = tf(3) ;\n      end\n      \n    end\n    if opts.lazyResize\n      sx = round(linspace(dx, sz(2)+dx-1, 
opts.imageSize(2))) ;\n      sy = round(linspace(dy, sz(1)+dy-1, opts.imageSize(1))) ;\n    else\n      factor = [opts.imageSize(1)/sz(1) ...\n                  opts.imageSize(2)/sz(2)];\n      if any(abs(factor - 1) > 0.0001)\n        imt =   imresize(gather(imt(dy:sz(1)+dy-1,dx:sz(2)+dx-1,:)), [opts.imageSize(1:2)],...\n          'Antialiasing', false, 'Method', opts.interpolation);\n      end\n      sx = 1:opts.imageSize(2); sy = 1:opts.imageSize(1);\n    end\n    if flip\n      sx = fliplr(sx) ; \n      imo(:,:,1,si) = 255 - imt(sy,sx,1) ;\n      imo(:,:,2,si) = imt(sy,sx,2) ;\n    else \n      imo(:,:,:,si) = imt(sy,sx,:) ;\n    end\n    si = si + 1 ;\n  end\n\n  countv = countv + numel(images(vid));\nend\nif ~isempty(opts.averageImage) && numel(opts.averageImage)==2\n  if ~isempty(opts.rgbVariance)\n    imo = bsxfun(@minus, imo, opts.averageImage+reshape(opts.rgbVariance * randn(2,1), 1,1,3)) ;\n  else\n    imo = bsxfun(@minus, imo, opts.averageImage) ;\n  end\nend\n\n\n"
  },
  {
    "path": "dicnn/cnn_video_rgb_get_batch.m",
    "content": "function imo = cnn_video_rgb_get_batch(images, vids, varargin)\n% CNN_VIDEO_RGB_GET_BATCH  Load, preprocess, and pack images for CNN evaluation\n\n% video ids\n% use same spatial jittering for frames from the same video\n% NOTE: all the frames from a video should have the same size (wxh)\n\nopts.imageSize = [227, 227] ;\nopts.border = [29, 29] ;\nopts.keepAspect = true ;\nopts.numAugments = 1 ;\nopts.transformation = 'none' ;\nopts.averageImage = [] ;\nopts.rgbVariance = zeros(0,3,'single') ;\nopts.interpolation = 'bilinear' ;\nopts.numThreads = 1 ;\nopts.prefetch = false ;\nopts.subMean = false ; % subtract the mean from each video\nopts.lazyResize = true ;\n\nopts = vl_argparse(opts, varargin);\n\n% fetch is true if images is a list of filenames (instead of\n% a cell array of images)\nfetch = numel(images) >= 1 && ischar(images{1}) ;\n\n% prefetch is used to load images in a separate thread\nprefetch = fetch & opts.prefetch ;\n\nif prefetch\n  vl_imreadjpeg(images, 'numThreads', opts.numThreads, 'prefetch') ;\n  imo = [] ;\n  return ;\nend\nif fetch\n  im = vl_imreadjpeg(images,'numThreads', opts.numThreads) ;\nelse\n  im = images ;\nend\n\ntfs = [] ;\nswitch opts.transformation\n  case 'none'\n    tfs = [\n      .5 ;\n      .5 ;\n      0 ] ;\n  case 'f5'\n    tfs = [...\n      .5 0 0 1 1 .5 0 0 1 1 ;\n      .5 0 1 0 1 .5 0 1 0 1 ;\n      0 0 0 0 0  1 1 1 1 1] ;\n  case 'f25'\n    [tx,ty] = meshgrid(linspace(0,1,5)) ;\n    tfs = [tx(:)' ; ty(:)' ; zeros(1,numel(tx))] ;\n    tfs_ = tfs ;\n    tfs_(3,:) = 1 ;\n    tfs = [tfs,tfs_] ;\n  case 'stretch'\n  case 'multiScaleRegular'\n  otherwise\n    error('Uknown transformations %s', opts.transformation) ;\nend\n[~,transformations] = sort(rand(size(tfs,2), numel(images)), 1) ;\n\nif ~isempty(opts.rgbVariance) && isempty(opts.averageImage)\n  opts.averageImage = zeros(1,1,3) ;\nend\nif numel(opts.averageImage) == 3\n  opts.averageImage = reshape(opts.averageImage, 1,1,3) ;\nend\n\nimo = 
zeros(opts.imageSize(1), opts.imageSize(2), 3, ...\n  numel(images)*opts.numAugments, 'single') ;\n\nnVid = max(vids);\nsi = 1 ;\ncountv = 1;\nfor v=1:nVid\n  \n  vid = find(vids==v);\n  \n  for i=1:numel(images(vid))\n    \n    % acquire image\n    if isempty(im{i})\n      imt = imread(images{vid(i)}) ;\n      imt = single(imt) ; % faster than im2single (and multiplies by 255)\n    else\n      imt = im{vid(i)} ;\n    end\n    if size(imt,3) == 1\n      imt = cat(3, imt, imt, imt) ;\n    end\n    \n    % resize\n    w = size(imt,2) ;\n    h = size(imt,1) ;\n    factor = [(opts.imageSize(1)+opts.border(1))/h ...\n      (opts.imageSize(2)+opts.border(2))/w];\n    \n    if opts.keepAspect\n      factor = max(factor) ;\n    end\n    if any(abs(factor - 1) > 0.0001)\n      imt = imresize(imt, ...\n        'scale', factor, ...\n        'method', opts.interpolation) ;\n    end\n    \n    % crop & flip\n    if i==1\n      w = size(imt,2) ;\n      h = size(imt,1) ;\n      switch opts.transformation\n        case 'stretch'\n          sz = round(min(opts.imageSize(1:2)' .* (1-0.1+0.2*rand(2,1)), [w;h])) ;\n          dx = randi(w - sz(2) + 1, 1) ;\n          dy = randi(h - sz(1) + 1, 1) ;\n          flip = rand > 0.5 ;\n        case 'multiScaleRegular'\n          reg_szs = [256, 224, 192, 168] ;\n          sz(1) = reg_szs(randi(4)); sz(2) = reg_szs(randi(4));\n          \n          dy = [0 h-sz(1) 0 h-sz(1)  floor((h-sz(1)+1)/2)] + 1;\n          dx = [0 w-sz(2) w-sz(2) 0 floor((w-sz(2)+1)/2)] + 1;\n          corner = randi(5);\n          dx = dx(corner); dy = dy(corner);\n          flip = rand > 0.5 ;\n        otherwise\n          tf = tfs(:, transformations(mod(0, numel(transformations)) + 1)) ;\n          sz = opts.imageSize(1:2) ;\n          dx = floor((w - sz(2)) * tf(2)) + 1 ;\n          dy = floor((h - sz(1)) * tf(1)) + 1 ;\n          flip = tf(3) ;\n      end\n      \n    end\n    \n    if opts.lazyResize\n      sx = round(linspace(dx, sz(2)+dx-1, opts.imageSize(2))) 
;\n      sy = round(linspace(dy, sz(1)+dy-1, opts.imageSize(1))) ;\n    else\n      factor = [opts.imageSize(1)/sz(1) ...\n        opts.imageSize(2)/sz(2)];\n      if any(abs(factor - 1) > 0.0001)\n        imt =   imresize(gather(imt(dy:sz(1)+dy-1,dx:sz(2)+dx-1,:)), ...\n          opts.imageSize(1:2), 'Antialiasing', false, ...\n         'Method', opts.interpolation);\n      end\n      sx = 1:opts.imageSize(2); sy = 1:opts.imageSize(1);\n    end\n    \n    \n    if flip\n      sx = fliplr(sx) ;   \n    end\n    \n    imo(:,:,:,si) = imt(sy,sx,:) ;\n    si = si + 1 ;\n  end\n  countv = countv + numel(images(vid));\n\nend\n\nif ~isempty(opts.averageImage) && numel(opts.averageImage)==3\n  if ~isempty(opts.rgbVariance)\n    imo = bsxfun(@minus, imo, opts.averageImage+reshape(opts.rgbVariance * randn(3,1), 1,1,3)) ;\n  else\n    imo = bsxfun(@minus, imo, opts.averageImage) ;\n  end\nend\n"
  },
  {
    "path": "dicnn/compute_approximate_dynamic_images.m",
    "content": "function di = compute_approximate_dynamic_images(images)\n% Computes approximate dynamic images for a given array of images\n% IMAGES must be a tensor of H x W x D x N dimensionality or\n% cell of image names\n\n% For the exact dynamic images, use the code\n% http://users.cecs.anu.edu.au/~basura/dynamic_images/code.zip\n% Explained here http://arxiv.org/abs/1512.01848\n\nif isempty(images)\n  di = [] ;\n  return ;\nend\n\n\nif iscell(images)\n  imagesA = cell(1,numel(images)) ; \n  for i=1:numel(images)\n    if ~ischar(images{i})\n      error('images must be an array of images or cell of image names') ;\n    end\n    imagesA{i} = imread(images{i}) ;\n  end\n  images = cat(4,imagesA{:}) ;\nend\n\nN = size(images,4) ;\ndi = vl_nnarpooltemporal(single(images),ones(1,N)) ;\n\n\n"
  },
  {
    "path": "dicnn/visualize_approximate_dynamic_images.m",
    "content": "function visualize_approximate_dynamic_images(images)\n% VISUALIZE_DYNAMIC_IMAGES\n\ndi = compute_approximate_dynamic_images(images) ;\n\ndi = di - min(di(:)) ;\ndi = 255 * di ./ max(di(:)) ;\nimage(uint8(di)) ;\n"
  },
  {
    "path": "main_train.m",
    "content": "model = 'resnext50' ; % {'cafferef','resnext50','resnext101'}\ninput = 'rgb' ; % {'rgb','of'}\ndataset = 'ucf101' ; % {'ucf101','hmdb51'}  hmdb51 requires more iterations to train (add more epochs to learning rate)\nopts.train.batchSize = 128 ;\nopts.train.numSubBatches = 32 ; % increase the number (16,32) if it does not fit into gpu mem \nopts.epochFactor = 5 ;\nopts.split = 1 ;\n\nopts.train.gpus = 1 ;\n\nrun matconvnet/matlab/vl_setupnn.m ;\nvl_contrib install mcnExtraLayers ; vl_contrib setup mcnExtraLayers ;\nvl_contrib install autonn ; vl_contrib setup autonn ;\n\n% addpath(fullfile('matconvnet','contrib','mcnExtraLayers','matlab')) ;\n\nopts.expDir = ['exp/' model 'rgb-arpool-split' num2str(opts.split)] ;\nif strcmp(input,'rgb')  \n  opts.DropOutRate = 0.5 ;\n  trainfn = @cnn_dicnn_rgb ;\nelseif strcmp(input,'of')  \n  opts.DropOutRate = 0.8 ;\n  trainfn = @cnn_dicnn_of ;\nend\n\nif strcmp(model,'cafferef')  \n\n  opts.pool1Layer = 'conv1' ;\n  % download from http://www.vlfeat.org/matconvnet/models/imagenet-caffe-ref.mat\n  opts.modelPath = fullfile('models','imagenet-caffe-ref.mat') ;\n  opts.networkFn = @cnn_init_cafferef ;\n  \n  if strcmp(input,'rgb')  \n    opts.train.learningRate = 1e-3 * [ones(1,2) 0.1*ones(1,2)] ;\n  else\n    opts.train.learningRate = 3e-3 * [ones(1,10) 0.1*ones(1,2)] ;\n  end\n\n  opts.train.numEpochs = numel(opts.train.learningRate) ;\nelseif strcmp(model,'resnext50') || strcmp(model,'resnext101')\n  % download from http://www.robots.ox.ac.uk/~albanie/models/pytorch-imports/resnext_50_32x4d-pt-mcn.mat\n  % download from http://www.robots.ox.ac.uk/~albanie/models/pytorch-imports/resnext_101_32x4d-pt-mcn.mat\n  if strcmp(model,'resnext50')\n    opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat') ;\n  else\n    opts.modelPath = fullfile('models','resnext_101_32x4d-pt-mcn.mat') ;\n  end\n  % opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat') ; % stray override: would always load the resnext50 weights, breaking model='resnext101'\n  opts.networkFn = @cnn_init_resnext ;\n  if 
 strcmp(input,'rgb')  \n    opts.train.learningRate = 1e-2 * [ones(1,2) 0.1*ones(1,8) ] ;\n  else\n    opts.train.learningRate = 1e-2 * [ones(1,2) 0.1*ones(1,2) ] ;\n  end\nend\n\naddpath dicnn ;\n\n[net, info] = trainfn(opts)\n"
  },
  {
    "path": "utils/extract_frames.sh",
    "content": "#!/bin/bash\n\n# This script converts videos into frames\n# for different fps change (-r 1)\n\nfor f in *.avi\n  do g=`echo $f | sed 's/\\.avi//'`;\n  echo Processing $f; \n  mkdir -p frames/$g/ ;\n  ffmpeg -i $f frames/$g/image-%04d.jpeg ; \ndone\n"
  }
]