Repository: hbilen/dynamic-image-nets Branch: master Commit: 96b91afab109 Files: 28 Total size: 93.3 KB Directory structure: gitextract_6iyvfhqz/ ├── .gitmodules ├── Datasets/ │ ├── cnn_hmdb51_of_setup_data.m │ ├── cnn_hmdb51_setup_data.m │ ├── cnn_ucf101_of_setup_data.m │ └── cnn_ucf101_setup_data.m ├── Layers/ │ ├── AppRankPooling.m │ ├── BatchNormN.m │ ├── ErrorMultiClass.m │ ├── L2Normalize.m │ ├── LossNormalized.m │ ├── TemporalPooling.m │ ├── vl_nnarpooltemporal.m │ ├── vl_nnl2norm.m │ └── vl_nnpooltemporal.m ├── README.md ├── dicnn/ │ ├── cnn_dicnn_of.m │ ├── cnn_dicnn_rgb.m │ ├── cnn_init_cafferef.m │ ├── cnn_init_resnext.m │ ├── cnn_single_of.m │ ├── cnn_single_rgb.m │ ├── cnn_train_dicnn_dag.m │ ├── cnn_video_of_get_batch.m │ ├── cnn_video_rgb_get_batch.m │ ├── compute_approximate_dynamic_images.m │ └── visualize_approximate_dynamic_images.m ├── main_train.m └── utils/ └── extract_frames.sh ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitmodules ================================================ [submodule "matconvnet"] path = matconvnet url = https://github.com/vlfeat/matconvnet branch = master ================================================ FILE: Datasets/cnn_hmdb51_of_setup_data.m ================================================ function imdb = cnn_hmdb51_of_setup_data(varargin) % CNN_UCF101_SETUP_DATA Initialize UCF101 - Action Recognition Data Set % http://crcv.ucf.edu/data/UCF101.php % this script requires UCF101 downloaded and frames extracted in frames % folder opts.dataDir = fullfile('data','HMDB51') ; opts.lite = false ; % opts = vl_argparse(opts, varargin) ; %% ------------------------------------------------------------------------ % Load categories metadata % ------------------------------------------------------------------------- % find images imagePath = fullfile(opts.dataDir, 'tvl1_flow', 'u', '*') ; images = 
dir(imagePath) ; videoNames = cell(1,numel(images)) ; frameNames = cell(1,numel(images)) ; nrFrames = zeros(1,numel(images)) ; for i=1:numel(images) frames = dir(fullfile(opts.dataDir,'tvl1_flow','u',images(i).name,'frame*.jpg')) ; framesc = cell(1,numel(frames)) ; if ~isempty(numel(frames)) for j=1:numel(frames) framesc{j} = frames(j).name ; end frameNames{i} = framesc ; frameNames{i} = strcat(images(i).name,'/',framesc) ; nrFrames(i) = numel(framesc) ; videoNames{i} = images(i).name ; end end videoNames(nrFrames==0) = [] ; frameNames(nrFrames==0) = [] ; % nrFrames(nrFrames==0) = [] ; frameNamesuv = cell(1,numel(frameNames)) ; for i=1:numel(frameNames) nn = frameNames{i} ; nn1 = strcat('u/',nn) ; nn2 = strcat('v/',nn) ; frameNamesuv{i} = cell(1,2*numel(nn1)) ; frameNamesuv{i}(1:2:end) = nn1 ; frameNamesuv{i}(2:2:end) = nn2 ; end % find metadata % ncls = 51 ; metaPath = fullfile(opts.dataDir, 'hmdb51_splits', '*.txt') ; splits = dir(metaPath) ; cats = cell(1,numel(videoNames)) ; sets = zeros(3,numel(videoNames)) ; catNames = cell(1,numel(splits)) ; for i=1:numel(splits) j = strfind(splits(i).name,'_test_') ; splitno = str2double(splits(i).name(j+11)) ; catNames{i} = splits(i).name(1:j-1) ; t = importdata(fullfile(opts.dataDir, 'hmdb51_splits', splits(i).name)) ; vids = cell(1,numel(t.textdata)) ; for k=1:numel(t.textdata) vids{k} = t.textdata{k}(1:end-4) ; end [ia,ib] = ismember(vids,videoNames) ; assert(all(ia)) ; sets(splitno,ib) = t.data' ; cats(ib) = repmat(catNames(i),numel(ia),1) ; end [cu,~,labels] = unique(cats) ; sets(sets(:)==2) = 3 ; imdb.classes.name = cu ; imdb.images.name = videoNames ; imdb.images.names = frameNamesuv ; imdb.images.label = labels' ; imdb.images.sets = sets ; imdb.imageDir = fullfile(opts.dataDir, 'tvl1_flow') ; ================================================ FILE: Datasets/cnn_hmdb51_setup_data.m ================================================ function imdb = cnn_hmdb51_setup_data(varargin) % CNN_UCF101_SETUP_DATA Initialize UCF101 
- Action Recognition Data Set % http://crcv.ucf.edu/data/UCF101.php % this script requires UCF101 downloaded and frames extracted in frames % folder opts.dataDir = fullfile('data','HMDB51') ; opts.lite = false ; % opts = vl_argparse(opts, varargin) ; %% ------------------------------------------------------------------------ % Load categories metadata % ------------------------------------------------------------------------- % find images imagePath = fullfile(opts.dataDir, 'frames', '*') ; images = dir(imagePath) ; videoNames = cell(1,numel(images)) ; frameNames = cell(1,numel(images)) ; nrFrames = zeros(1,numel(images)) ; for i=1:numel(images) frames = dir(fullfile(opts.dataDir,'frames',images(i).name,'frame*.jpg')) ; framesc = cell(1,numel(frames)) ; if ~isempty(numel(frames)) for j=1:numel(frames) framesc{j} = frames(j).name ; end frameNames{i} = strcat(images(i).name,'/',framesc) ; nrFrames(i) = numel(framesc) ; videoNames{i} = images(i).name ; end end videoNames(nrFrames==0) = [] ; frameNames(nrFrames==0) = [] ; % nrFrames(nrFrames==0) = [] ; % find metadata % ncls = 51 ; metaPath = fullfile(opts.dataDir, 'hmdb51_splits', '*.txt') ; splits = dir(metaPath) ; % splitFiles = cell(1,3*ncls) ; cats = cell(1,numel(videoNames)) ; sets = zeros(3,numel(videoNames)) ; catNames = cell(1,numel(splits)) ; for i=1:numel(splits) j = strfind(splits(i).name,'_test_') ; splitno = str2double(splits(i).name(j+11)) ; catNames{i} = splits(i).name(1:j-1) ; t = importdata(fullfile(opts.dataDir, 'hmdb51_splits', splits(i).name)) ; vids = cell(1,numel(t.textdata)) ; for k=1:numel(t.textdata) vids{k} = t.textdata{k}(1:end-4) ; end [ia,ib] = ismember(vids,videoNames) ; assert(all(ia)) ; sets(splitno,ib) = t.data' ; cats(ib) = repmat(catNames(i),numel(ia),1) ; end [cu,~,labels] = unique(cats) ; sets(sets(:)==2) = 3 ; imdb.classes.name = cu ; imdb.images.name = videoNames ; imdb.images.names = frameNames ; imdb.images.label = labels' ; imdb.images.sets = sets ; imdb.imageDir = 
fullfile(opts.dataDir, 'frames') ; ================================================ FILE: Datasets/cnn_ucf101_of_setup_data.m ================================================ function imdb = cnn_ucf101_of_setup_data(varargin) % CNN_UCF101_SETUP_DATA Initialize UCF101 - Action Recognition Data Set % http://crcv.ucf.edu/data/UCF101.php % this script requires UCF101 downloaded and frames extracted in frames % folder opts.dataDir = fullfile('data','UCF101') ; opts.lite = false ; opts = vl_argparse(opts, varargin) ; %% ------------------------------------------------------------------------ % Load categories metadata % ------------------------------------------------------------------------- % find metadata metaPath = fullfile(opts.dataDir, 'ucfTrainTestlist/classInd.txt') ; fprintf('using metadata %s\n', metaPath) ; tmp = importdata(metaPath); nCls = numel(tmp); if nCls ~= 101 error('Wrong meta file %s',metaPath); end cats = cell(1,nCls); for i=1:numel(tmp) t = strsplit(tmp{i}); cats{i} = t{2}; end imdb.classes.name = sort(cats) ; imdb.imageDir = fullfile(opts.dataDir, 'tvl1_flow', 'u') ; %% ------------------------------------------------------------------------ % load image names and labels % ------------------------------------------------------------------------- fprintf('searching training images ...\n') ; names = {} ; name = {}; labels = {} ; for d = dir(fullfile(imdb.imageDir, 'v_*'))' [~,lab] = ismember(lower(d.name(3:end-8)), lower(cats)) ; if lab==0 error('no class label found for %s',d.name); end ims = dir(fullfile(imdb.imageDir, d.name, '*.jpg')) ; name{end+1} = d.name; names{end+1} = strcat([d.name, filesep], {ims.name}) ; labels{end+1} = lab ; if mod(numel(names), 10) == 0, fprintf('.') ; end if mod(numel(names), 500) == 0, fprintf('\n') ; end %fprintf('found %s with %d images\n', d.name, numel(ims)) ; end % names = horzcat(names{:}) ; labels = horzcat(labels{:}) ; % labels = [labels ; labels] ; labels = labels(:)' ; for i=1:numel(names) nn = names{i} ; 
nn1 = strcat('u/',nn) ; nn2 = strcat('v/',nn) ; names{i} = cell(1,2*numel(nn1)) ; names{i}(1:2:end) = nn1 ; names{i}(2:2:end) = nn2 ; end imdb.images.id = 1:numel(names) ; imdb.images.name = name ; imdb.images.names = names ; imdb.images.label = labels ; imdb.imageDir = fullfile(opts.dataDir, 'tvl1_flow') ; %% ------------------------------------------------------------------------ % load train / test splits % ------------------------------------------------------------------------- fprintf('labeling data...(this may take couple of minutes)') ; imdb.images.sets = zeros(3, numel(names)) ; setNames = {'train','test'}; setVal = [1,3]; for s=1:numel(setNames) for i=1:3 trainFl = fullfile(opts.dataDir, 'ucfTrainTestlist',sprintf('%slist%02d.txt',... setNames{s},i)) ; trainList = importdata(trainFl); if isfield(trainList,'textdata') trainList = trainList.textdata; end for j=1:numel(trainList) tmp = strsplit(trainList{j},'/'); [~,lab] = ismember(lower(tmp{2}(1:end-4)), lower(name)) ; if lab==0 % error('cannot find the video %s',tmp{2}(1:end-4)); warning('cannot find the video %s',tmp{2}(1:end-4)); continue ; end % if trainList.data(j) ~= labels(lab) % error('Labels do not match for %s',tmp{2}); % end imdb.images.sets(i,lab) = setVal(s); end end end fprintf('\n') ; %% ------------------------------------------------------------------------ % Postprocessing % ------------------------------------------------------------------------- % sort categories by WNID (to be compatible with other implementations) [imdb.classes.name,perm] = sort(imdb.classes.name) ; relabel(perm) = 1:numel(imdb.classes.name) ; ok = imdb.images.label > 0 ; imdb.images.label(ok) = relabel(imdb.images.label(ok)) ; if opts.lite % pick a small number of images for the first 10 classes % this cannot be done for test as we do not have test labels clear keep ; for i=1:10 sel = find(imdb.images.label == i) ; train = sel(imdb.images.sets(1,sel) == 1) ; test = sel(imdb.images.sets(1,sel) == 3) ; keep{i} = [train 
test] ; end keep = keep{:}; imdb.images.id = imdb.images.id(keep) ; imdb.images.name = imdb.images.name(keep) ; imdb.images.names = imdb.images.names(keep) ; imdb.images.sets = imdb.images.sets(1,keep) ; imdb.images.label = imdb.images.label(keep) ; end ================================================ FILE: Datasets/cnn_ucf101_setup_data.m ================================================ function imdb = cnn_ucf101_setup_data(varargin) % CNN_UCF101_SETUP_DATA Initialize UCF101 - Action Recognition Data Set % http://crcv.ucf.edu/data/UCF101.php % this script requires UCF101 downloaded and frames extracted in frames % folder opts.dataDir = fullfile('data','UCF101') ; opts.lite = false ; opts = vl_argparse(opts, varargin) ; %% ------------------------------------------------------------------------ % Load categories metadata % ------------------------------------------------------------------------- % find metadata metaPath = fullfile(opts.dataDir, 'ucfTrainTestlist/classInd.txt') ; fprintf('using metadata %s\n', metaPath) ; tmp = importdata(metaPath); nCls = numel(tmp); if nCls ~= 101 error('Wrong meta file %s',metaPath); end cats = cell(1,nCls); for i=1:numel(tmp) t = strsplit(tmp{i}); cats{i} = t{2}; end imdb.classes.name = cats ; imdb.imageDir = fullfile(opts.dataDir, 'frames') ; %% ------------------------------------------------------------------------ % load image names and labels % ------------------------------------------------------------------------- fprintf('searching training images ...\n') ; names = {} ; name = {}; labels = {} ; for d = dir(fullfile(imdb.imageDir, 'v_*'))' [~,lab] = ismember(lower(d.name(3:end-8)), lower(cats)) ; if lab==0 error('no class label found for %s',d.name); end ims = dir(fullfile(imdb.imageDir, d.name, '*.jpg')) ; name{end+1} = d.name; names{end+1} = strcat([d.name, filesep], {ims.name}) ; labels{end+1} = lab ; if mod(numel(names), 10) == 0, fprintf('.') ; end if mod(numel(names), 500) == 0, fprintf('\n') ; end %fprintf('found 
%s with %d images\n', d.name, numel(ims)) ; end % names = horzcat(names{:}) ; labels = horzcat(labels{:}) ; imdb.images.id = 1:numel(names) ; imdb.images.name = name ; imdb.images.names = names ; imdb.images.label = labels ; %% ------------------------------------------------------------------------ % load train / test splits % ------------------------------------------------------------------------- fprintf('labeling data...(this may take couple of minutes)') ; imdb.images.sets = zeros(3, numel(names)) ; setNames = {'train','test'}; setVal = [1,3]; for s=1:numel(setNames) for i=1:3 trainFl = fullfile(opts.dataDir, 'ucfTrainTestlist',sprintf('%slist%02d.txt',... setNames{s},i)) ; trainList = importdata(trainFl); if isfield(trainList,'textdata') trainList = trainList.textdata; end for j=1:numel(trainList) tmp = strsplit(trainList{j},'/'); [~,lab] = ismember(lower(tmp{2}(1:end-4)), lower(name)) ; if lab==0 error('cannot find the video %s',tmp{2}); end % if trainList.data(j) ~= labels(lab) % error('Labels do not match for %s',tmp{2}); % end imdb.images.sets(i,lab) = setVal(s); end end end fprintf('\n') ; %% ------------------------------------------------------------------------ % Postprocessing % ------------------------------------------------------------------------- % sort categories by WNID (to be compatible with other implementations) [imdb.classes.name,perm] = sort(imdb.classes.name) ; relabel(perm) = 1:numel(imdb.classes.name) ; ok = imdb.images.label > 0 ; imdb.images.label(ok) = relabel(imdb.images.label(ok)) ; if opts.lite % pick a small number of images for the first 10 classes % this cannot be done for test as we do not have test labels clear keep ; for i=1:10 sel = find(imdb.images.label == i) ; train = sel(imdb.images.sets(1,sel) == 1) ; test = sel(imdb.images.sets(1,sel) == 3) ; keep{i} = [train test] ; end keep = keep{:}; imdb.images.id = imdb.images.id(keep) ; imdb.images.name = imdb.images.name(keep) ; imdb.images.names = imdb.images.names(keep) ; 
imdb.images.sets = imdb.images.sets(1,keep) ; imdb.images.label = imdb.images.label(keep) ; end ================================================ FILE: Layers/AppRankPooling.m ================================================ classdef AppRankPooling < dagnn.ElementWise % author: Hakan Bilen % dagnn wrapper for approximate rank pooling properties scale = 1 end methods function outputs = forward(obj, inputs, params) outputs{1} = vl_nnarpooltemporal(inputs{1},inputs{2}) * obj.scale ; end function [derInputs, derParams] = backward(obj, inputs, params, derOutputs) derInputs = cell(1,2); derInputs{1} = vl_nnarpooltemporal(inputs{1},inputs{2},derOutputs{1}) * obj.scale; derParams = {} ; end function outputSizes = getOutputSizes(obj, inputSizes) % This is not correct, dim(4) depends on inputs{2} outputSizes{1} = inputSizes{1} ; end function obj = AppRankPooling(varargin) obj.load(varargin) ; end end end ================================================ FILE: Layers/BatchNormN.m ================================================ classdef BatchNormN < dagnn.ElementWise properties numChannels epsilon = 1e-5 opts = {'NoCuDNN'} % ours seems slightly faster end properties (Transient) moments end methods function outputs = forward(obj, inputs, params) if strcmp(obj.net.mode, 'test') outputs{1} = vl_nnbnorm(inputs{1}, params{1}, params{2}, ... 'moments', params{3}, ... 'epsilon', obj.epsilon, ... obj.opts{:}) ; else [outputs{1},obj.moments] = ... vl_nnbnorm(inputs{1}, params{1}, params{2}, ... 'epsilon', obj.epsilon, ... obj.opts{:}) ; end end function [derInputs, derParams] = backward(obj, inputs, params, derOutputs) [derInputs{1}, derParams{1}, derParams{2}, derParams{3}] = ... vl_nnbnorm(inputs{1}, params{1}, params{2}, derOutputs{1}, ... 'epsilon', obj.epsilon, ... 'moments', obj.moments, ... 
obj.opts{:}) ; obj.moments = [] ; % multiply the moments update by the number of images in the batch % this is required to make the update additive for subbatches % and will eventually be normalized away % derParams{3} = derParams{3} * size(inputs{1},4) ; end % --------------------------------------------------------------------- function obj = BatchNormN(varargin) obj.load(varargin{:}) ; end function params = initParams(obj) params{1} = ones(obj.numChannels,1,'single') ; params{2} = zeros(obj.numChannels,1,'single') ; params{3} = zeros(obj.numChannels,2,'single') ; end function attach(obj, net, index) attach@dagnn.ElementWise(obj, net, index) ; p = net.getParamIndex(net.layers(index).params{3}) ; net.params(p).trainMethod = 'average' ; net.params(p).learningRate = 0.1 ; end end end ================================================ FILE: Layers/ErrorMultiClass.m ================================================ classdef ErrorMultiClass < dagnn.Loss % author: Hakan Bilen % computes multi-class accuracy % inputs{1}->scores % inputs{2}->gt labels properties nImgPerClass = [] nCorPred = [] accuracy = [] resetLayer = false end methods function outputs = forward(obj, inputs, params) if numel(inputs)~=2 error('wrong number of inputs'); end nCls = size(inputs{1},3); if obj.resetLayer || isempty(obj.nImgPerClass) obj.nImgPerClass = zeros(1,size(inputs{1},3)); obj.nCorPred = zeros(1,size(inputs{1},3)); obj.accuracy = zeros(1,size(inputs{1},3)); if obj.resetLayer obj.resetLayer = false ; obj.average = 0 ; end end [~,predictions] = max(gather(squeeze(inputs{1})),[],1); for c=1:nCls obj.nImgPerClass(c) = obj.nImgPerClass(c) + sum(inputs{2}==c); obj.nCorPred(c) = obj.nCorPred(c) + sum(predictions==c & inputs{2}==c); end ni = obj.nImgPerClass; ni(ni==0) = 1; obj.accuracy = obj.nCorPred ./ ni; obj.average = (1-mean(obj.accuracy)); outputs{1} = obj.average; end function [derInputs, derParams] = backward(obj, inputs, params, derOutputs) derInputs = cell(1,2); derParams = {} ; end 
function reset(obj) obj.resetLayer = true ; % obj.nImgPerClass = []; % obj.nCorPred = []; % obj.accuracy = []; % obj.average = 0; end function obj = ErrorMultiClass(varargin) obj.load(varargin) ; obj.loss = 'error_multi_class' ; end end end ================================================ FILE: Layers/L2Normalize.m ================================================ classdef L2Normalize < dagnn.ElementWise % author: Hakan Bilen % dagnn wrapper for l2 normalization properties scale = 1; clip = [-inf inf]; offset = 0; end methods function outputs = forward(obj, inputs, params) outputs{1} = vl_nnl2norm(inputs{1},[obj.scale obj.clip obj.offset]); end function [derInputs, derParams] = backward(obj, inputs, params, derOutputs) derInputs{1} = vl_nnl2norm(inputs{1},[obj.scale obj.clip obj.offset],derOutputs{1}); derParams = {} ; end function obj = L2Normalize(varargin) obj.load(varargin) ; end end end ================================================ FILE: Layers/LossNormalized.m ================================================ classdef LossNormalized < dagnn.Loss % properties % loss = 'softmaxlog' % ignoreAverage = false % opts = {} % end % properties (Transient) % average = 0 % numAveraged = 0 % end methods function outputs = forward(obj, inputs, params) outputs{1} = vl_nnloss(inputs{1}, inputs{2}, [], 'loss', obj.loss, obj.opts{:}) ; obj.accumulateAverage(inputs, outputs); if numel(size(inputs{1}))>3 bs = size(inputs{1},4) ; else bs = 1 ; end outputs{1} = outputs{1} / bs ; end function accumulateAverage(obj, inputs, outputs) if obj.ignoreAverage, return; end; n = obj.numAveraged ; m = n + size(inputs{1}, 1) * size(inputs{1}, 2) * size(inputs{1}, 4); obj.average = bsxfun(@plus, n * obj.average, gather(outputs{1})) / m ; obj.numAveraged = m ; end function [derInputs, derParams] = backward(obj, inputs, params, derOutputs) if numel(size(inputs{1}))>3 bs = size(inputs{1},4) ; else bs = 1 ; end derInputs{1} = vl_nnloss(inputs{1}, inputs{2}, derOutputs{1}, 'loss', obj.loss, 
obj.opts{:}) / bs; derInputs{2} = [] ; derParams = {} ; end function reset(obj) obj.average = 0 ; obj.numAveraged = 0 ; end function outputSizes = getOutputSizes(obj, inputSizes, paramSizes) outputSizes{1} = [1 1 1 inputSizes{1}(4)] ; end function rfs = getReceptiveFields(obj) % the receptive field depends on the dimension of the variables % which is not known until the network is run rfs(1,1).size = [NaN NaN] ; rfs(1,1).stride = [NaN NaN] ; rfs(1,1).offset = [NaN NaN] ; rfs(2,1) = rfs(1,1) ; end function obj = LossNormalized(varargin) obj.load(varargin) ; end end end ================================================ FILE: Layers/TemporalPooling.m ================================================ classdef TemporalPooling < dagnn.ElementWise % author: Hakan Bilen % dagnn wrapper for approximate rank pooling properties method = 'max'; end methods function outputs = forward(obj, inputs, params) outputs{1} = vl_nnpooltemporal(inputs{1},inputs{2},obj.method); end function [derInputs, derParams] = backward(obj, inputs, params, derOutputs) derInputs = cell(1,2); derInputs{1} = vl_nnpooltemporal(inputs{1},inputs{2},obj.method,derOutputs{1}); derParams = {} ; end function obj = TemporalPooling(varargin) obj.load(varargin) ; end end end ================================================ FILE: Layers/vl_nnarpooltemporal.m ================================================ function Y = vl_nnarpooltemporal(X,ids,dzdy) % author: Hakan Bilen % approximate rank pooling % ids indicates frame-video association (must be in range [1-N]) sz = size(X); forward = logical(nargin<3); if numel(ids)~=size(X,4) error('Error: ids dimension does not match with X!'); end nVideos = max(ids); if forward Y = zeros([sz(1:3),nVideos],'like',X); else Y = zeros(size(X),'like',X); end for v=1:nVideos % pool among frames indv = find(ids==v); if isempty(indv) error('Error: No frames in video %d',v); end N = numel(indv); % magic numbers fw = zeros(1,N); if N==1 fw = 1; else for i=1:N fw(i) = sum((2*(i:N)-N-1) ./ 
(i:N)); end end if forward Y(:,:,:,v) = sum(bsxfun(@times,X(:,:,:,indv),... reshape(single(fw),[1 1 1 numel(indv)])),4); else Y(:,:,:,indv) = (bsxfun(@times,repmat(dzdy(:,:,:,v),[1,1,1,numel(indv)]),... reshape(fw,[1 1 1 numel(indv)]))) ; end end % % if forward % fprintf(' fwd-arpool %.2f ',sqrt(sum(Y(:).^2))); % else % fprintf(' back-arpool %f ',sqrt(sum(Y(:).^2))); % end ================================================ FILE: Layers/vl_nnl2norm.m ================================================ function y = vl_nnl2norm(x,param,dzdy) % author: Hakan Bilen % l2 normalize whole feature map sc = param(1); clip = param(2:3); offset = param(4); if nargin == 3 assert(all(size(x) == size(dzdy))); else dzdy = []; end x_sz = size(x); if ~all(x_sz([1 2]) == 1) % Create an array of size #channels x #samples x = reshape(x, prod(x_sz(1:3)), []); end x = x + offset; if isempty(dzdy) y = (bsxfun(@times, x, sc./(sqrt(sum(x .* x)) + single(1e-12)))); % clip max values if all(y(:)<clip(1)) || all(y(:)>clip(2)) warning('Too small clipping interval'); fprintf('min %f max %f\n',min(y(:)),max(y(:))); end y(y(:)<clip(1)) = clip(1); y(y(:)>clip(2)) = clip(2); else if ~all(x_sz([1 2]) == 1) dzdy = reshape(dzdy, prod(x_sz(1:3)), []); end len_ = 1./sqrt(sum(x.*x)+single(1e-12)); dzdy_ = bsxfun(@times,dzdy,len_.^3); y = sc * (bsxfun(@times,dzdy,len_)-bsxfun(@times,x,sum(x.*dzdy_))); end if ~all(x_sz([1 2]) == 1) y = reshape(y, x_sz); end % % if isempty(dzdy) % fprintf(' fwd-l2 %.2f ',sqrt(sum(y(:).^2))); % else % fprintf(' back-l2 %f dzdy %f ',sqrt(sum(y(:).^2)),sqrt(sum(dzdy(:).^2))); % end ================================================ FILE: Layers/vl_nnpooltemporal.m ================================================ function Y = vl_nnpooltemporal(X,ids,method,dzdy) % author: Hakan Bilen % temporal pooling along frames % ids indicates frame-video association % method 'max' or 'avg' sz = size(X); forward = logical(nargin<4); Xp = permute(X,[4,1,2,3]); if numel(ids)~=size(X,4) error('Error: ids dimension does not match with X!'); end
nVideos = max(ids); if forward Yp = zeros([nVideos,sz(1:3)],'like',X); for v=1:nVideos % pool among frames indv = find(ids==v); Yp(v,:,:,:) = vl_nnpool(Xp(indv,:,:,:), [numel(indv),1], ... 'pad', 0, 'stride', [numel(indv),1], 'method', method) ; end else dzdyp = permute(dzdy,[4,1,2,3]); Yp = zeros(size(Xp),'like',Xp); for v=1:nVideos % pool among frames indv = find(ids==v); Yp(indv,:,:,:) = vl_nnpool(Xp(indv,:,:,:), [numel(indv),1], dzdyp(v,:,:,:), ... 'pad', 0, 'stride', [numel(indv),1], 'method', method) ; end end % permute back Y = permute(Yp,[2,3,4,1]); % if forward % fprintf(' fwd-ptemp %.2f ',sqrt(sum(Y(:).^2))); % else % fprintf(' back-ptemp %.2f ',sqrt(sum(Y(:).^2))); % end ================================================ FILE: README.md ================================================ # Dynamic Image Networks for Action Recognition ## Improved Results (see the extended version of CVPR paper) ResNeXt-50 | HMDB51 (%) | UCF101 (%) | ------------------|--------|--------| SI | 53.5 | 87.6 | DI | 57.3 | 86.6 | OF | 55.8 | 84.9 | DOF | 58.9 | 86.6 | SI+OF | 67.5 | 93.9 | SI+DI | 61.3 | 90.6 | OF+DOF | 62.6 | 89.1 | SI+DI+OF+DOF | 71.5 | 95.0 | SI+DI+OF+DOF+iDT | 74.2 | 95.4 | * Results are in the standard average multi-class accuracy (%) * SI: RGB image * DI: dynamic RGB image * OF: optical flow * DOF: dynamic optical flow * iDT: improved trajectory features ## Installation 1. Clone the Dynamic Image Net repository: ```Shell git clone --recursive https://github.com/hbilen/dynamic-image-nets ``` 2. Compile matconvnet toolbox: (see [http://www.vlfeat.org/matconvnet/install/](http://www.vlfeat.org/matconvnet/install/)) 3. Install additional matconvnet packages ```Shell run matconvnet/matlab/vl_setupnn.m ; vl_contrib install mcnExtraLayers ; vl_contrib setup mcnExtraLayers ; vl_contrib install autonn ; vl_contrib setup autonn ; ``` 4. Download your dataset : (e.g. UCF101 from [http://crcv.ucf.edu/data/UCF101.php](http://crcv.ucf.edu/data/UCF101.php)) 5. 
Convert videos to frames, resize them to 256x256 and store them in such a directory structure: Alternatively, you can download RGB and precomputed optical flow frames from [Christoph Feichtenhofer](http://ftp.tugraz.at/pub/feichtenhofer/tsfusion/data/) and copy RGB frames under "UCF101/frames" and optical flow frames under "UCF101/tvl1_flow". ```Shell data/UCF101/ucfTrainTestlist/ ├── classInd.txt ├── testlist01.txt ├── testlist02.txt ├── testlist03.txt ├── trainlist01.txt ├── trainlist02.txt └── trainlist03.txt data/UCF101/frames/ ├── v_ApplyEyeMakeup_g01_c01 │ ├── 00001.jpg │ ├── 00002.jpg │ ├── 00003.jpg │ ├── 00004.jpg │ ├── 00005.jpg ``` ## Compute and Visualise Approximate Dynamic Images 1. If you want to compute approximate dynamic images, get a list of ordered frames from a video and try ```matlab di = compute_approximate_dynamic_images(images) ; ``` 2. If you want to visualise approximate dynamic images, get a list of ordered frames from a video and try ```matlab visualize_approximate_dynamic_images(images) ``` ## Train a Dynamic Image Net You can modify the options in `main_train.m` and train your model by running ```matlab main_train ``` Note: If you want to train a model on a different dataset than UCF101 or HMDB51, you need to write a custom script `cnn_dataset_setup_data` to build your database (imdb). ## Evaluation 1. Download the CNN Models for the UCF101 dataset, that are used in the journal, from [here](http://groups.inf.ed.ac.uk/hbilen-data/data/resnext50_dicnn.tar). 2. Choose the right model, split and input type (e.g.) ```matlab net = load('resnext50-rgb-arpool-split1.mat') ; net = dagnn.DagNN.loadobj(net) ; net.addLayer('errMC',ErrorMultiClass(),{'prediction','label'},'mcerr') ; opts.network = net ; opts.split = 1 ; opts.train.gpus = 1 ; opts.epochFactor = 0 ; [net, info] = cnn_dicnn_rgb(opts) ``` ## Citing Dynamic Image Networks If you find the code useful, please cite: @inproceedings{Bilen2016a, author = "Bilen, H. and Fernando, B. 
and Gavves, E. and Vedaldi, A. and Gould, S.", title = "Dynamic Image Networks for Action Recognition", booktitle = "CVPR", year = "2016" } @article{Bilen2017a, author = "Bilen, H. and Fernando, B. and Gavves, E. and Vedaldi, A.", title = "Action Recognition with Dynamic Image Networks", journal = "IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)", year = "2017" } ## License The analysis work performed with the program(s) must be non-proprietary work. Licensee and its contract users must be or be affiliated with an academic facility. Licensee may additionally permit individuals who are students at such academic facility to access and use the program(s). Such students will be considered contract users of licensee. The program(s) may not be used for commercial competitive analysis (such as benchmarking) or for any commercial activity, including consulting. ================================================ FILE: dicnn/cnn_dicnn_of.m ================================================ function [net, info] = cnn_dicnn_of(varargin) %CNN_DICNN_OF Fine-tunes a pre-trained CNN with dynamic images on optical % flow frames (DOF in pami journal) on UCF101 dataset run(fullfile(fileparts(mfilename('fullpath')), ... '..', 'matconvnet', 'matlab', 'vl_setupnn.m')) ; run(fullfile(fileparts(mfilename('fullpath')), ... '..', 'matconvnet', 'contrib', 'mcnExtraLayers', 'setup_mcnExtraLayers.m')) ; run(fullfile(fileparts(mfilename('fullpath')), ... 
'..', 'matconvnet', 'contrib', 'autonn', 'setup_autonn.m')) ; addpath Layers Datasets opts.dataDir = fullfile('data','UCF101') ; opts.expDir = fullfile('exp', 'UCF101') ; opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat.mat') ; [opts, varargin] = vl_argparse(opts, varargin) ; opts.numFetchThreads = 8 ; opts.lite = false ; opts.imdbPath = fullfile(opts.dataDir, 'imdb-of.mat'); opts.pool1Layer = 'conv0'; % before conv1 opts.pool1Type = 'arpool'; % before conv1 opts.pool2Layer = 'fc6'; % before conv1 opts.DropOutRate = 0.85 ; opts.datasetFn = @cnn_ucf101_of_setup_data ; opts.networkFn = @cnn_init_resnext ; opts.network = [] ; opts.split = 1; % data split opts.reverseDyn = 0; % reverse video frames e.g.[N:-1:1] opts.numDynImgs = 10 ; opts.epochFactor = 5 ; opts.train = struct() ; opts.train.gpus = []; opts.train.batchSize = 128 ; opts.train.numSubBatches = 32 ; opts.train.solver = [] ; opts.train.prefetch = true ; opts.train.learningRate = 1e-2 ; opts.train.numEpochs = 30 ; % opts.train.savePreds = true ; opts.train.randomSeed = 0 ; opts = vl_argparse(opts, varargin) ; if ~isfield(opts.train, 'gpus'), opts.train.gpus = []; end; % ------------------------------------------------------------------------- % Prepare data % ------------------------------------------------------------------------- if exist(opts.imdbPath,'file') imdb = load(opts.imdbPath) ; else imdb = opts.datasetFn('dataDir', opts.dataDir, 'lite', opts.lite) ; mkdir(opts.expDir) ; save(opts.imdbPath, '-struct', 'imdb') ; end % UCF101 has 3 data splits if opts.split>3 error('split should be <=3'); end imdb.images.set = imdb.images.sets(opts.split,:); % reverse frame order if opts.reverseDyn for i=1:numel(imdb.images.names) imdb.images.names{i} = imdb.images.names{i}(end:-1:1); end end % ------------------------------------------------------------------------- % Prepare model % ------------------------------------------------------------------------- if isempty(opts.network) net = 
load(opts.modelPath); if isfield(net,'net') net = net.net; end opts.nCls = max(imdb.images.label) ; % net = dagnn.DagNN.loadobj(net) ; net = opts.networkFn(net,opts) ; % two channels instead of 3 RGB net.params(1).value = net.params(1).value(:,:,1:2,:) ; % Set the class names in the network net.meta.classes.name = imdb.classes.name ; net.meta.classes.description = imdb.classes.name ; else assert(isa(opts.network,'dagnn.DagNN')) ; net = opts.network ; end % ------------------------------------------------------------------------- % Learn % ------------------------------------------------------------------------- if opts.epochFactor>0 opts.train.train = repmat(find(imdb.images.set==1),[1 opts.epochFactor]) ; else opts.train.train = NaN ; opts.train.numEpochs = 1 ; end opts.train.val = find(imdb.images.set==3) ; [net, info] = cnn_train_dicnn_dag(net, imdb, getBatchFn(opts, net.meta), ... 'expDir', opts.expDir, ... opts.train) ; % ------------------------------------------------------------------------- % Report accuracy % ------------------------------------------------------------------------- errlayer = net.getLayerIndex('errMC') ; if ~isnan(errlayer) cats = imdb.classes.name ; accs = net.layers(errlayer).block.accuracy ; if numel(cats)~=numel(accs) error('wrong number of classes\n') ; end for i=1:numel(cats) fprintf('%s acc %.1f\n',cats{i},100*accs(i)) ; end fprintf('Mean accuracy %.1f\n',100*mean(accs)) ; end % ------------------------------------------------------------------------- function fn = getBatchFn(opts, meta) % ------------------------------------------------------------------------- useGpu = numel(opts.train.gpus) > 0 ; bopts.numThreads = opts.numFetchThreads ; bopts.imageSize = meta.normalization.imageSize ; if isfield(meta.normalization,'border') bopts.border = meta.normalization.border ; else bopts.border = meta.normalization.imageSize(1:2) ./ ... 
meta.normalization.cropSize - meta.normalization.imageSize(1:2); end bopts.averageImage = 128 * ones([1 1 2],'single') ; bopts.numDynImgs = opts.numDynImgs ; fn = @(x,y) getDagNNBatch(bopts,useGpu,x,y) ; % ------------------------------------------------------------------------- function inputs = getDagNNBatch(opts, useGpu, imdb, batch) % ------------------------------------------------------------------------- % batch refers to videos (not for frames) if isempty(batch) inputs = {'input', [], 'label', [], 'VideoId1', [], 'VideoId2', []}; return; end isVal = ~isempty(batch) && imdb.images.set(batch(1)) ~= 1 ; if ~isVal, transformation='multiScaleRegular'; else transformation='none';end names = imdb.images.names(batch); % images = strcat([imdb.imageDir filesep], imdb.images.name(batch)) ; namesM = {}; nVids = numel(batch); VideoId1 = []; VideoId2 = []; % step-size stepSize = 6; % pool nFrames into a dynamic image nFrames = 10; % number of dynamic images to be max pooled later nDynImgs = opts.numDynImgs ; opts = rmfield(opts,'numDynImgs') ; c1 = 1; for v=1:nVids name = names{v}; nFrms = numel(name)/2; nSample = nFrames; if isVal startF = 1 ; else startF = ceil(stepSize/2) ; end nr = numel(startF:stepSize:nFrms); % jitter by removing 50 % and limit a batch to nMaxs * nSamples images if nr > 1 && (~isVal && nr>nDynImgs) rat = min(nDynImgs,ceil(0.50*nr)); ri = randperm(nr); ri = ri(1:rat); r = zeros(1,nr); r(ri) = 1; else r = ones(1,nr); end c3 = 1; c2 = 0; for f=startF:stepSize:nFrms if r(c3) idx = f:min(f+nSample-1,nFrms) ; if numel(idx) 0 if useGpu im = gpuArray(im) ; end inputs = {'input', im, 'label', imdb.images.label(batch), ... 
'VideoId1', VideoId1, 'VideoId2', VideoId2}; end ================================================ FILE: dicnn/cnn_dicnn_rgb.m ================================================ function [net, info] = cnn_dicnn_rgb(varargin) %CNN_DICNN_RGB Fine-tunes a pre-trained CNN with dynamic images on RGB frames % (DI in pami journal) on UCF101 dataset run(fullfile(fileparts(mfilename('fullpath')), ... '..', 'matconvnet', 'matlab', 'vl_setupnn.m')) ; run(fullfile(fileparts(mfilename('fullpath')), ... '..', 'matconvnet', 'contrib', 'mcnExtraLayers', 'setup_mcnExtraLayers.m')) ; run(fullfile(fileparts(mfilename('fullpath')), ... '..', 'matconvnet', 'contrib', 'autonn', 'setup_autonn.m')) ; addpath Layers Datasets opts.dataDir = fullfile('data','UCF101') ; opts.expDir = fullfile('exp', 'UCF101') ; opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat'); opts.datasetFn = @cnn_ucf101_setup_data ; opts.networkFn = @cnn_init_resnext ; opts.network = [] ; [opts, varargin] = vl_argparse(opts, varargin) ; opts.numFetchThreads = 8 ; opts.lite = false ; opts.imdbPath = fullfile(opts.dataDir, 'imdb-rgb.mat'); opts.pool1Layer = 'conv0'; % before conv1 opts.pool1Type = 'arpool'; opts.pool2Layer = 'pool5'; opts.pool2Type = 'maxpool'; opts.DropOutRate = 0.5 ; opts.epochFactor = 5 ; opts.split = 1; % data split opts.reverseDyn = 0; % reverse video frames e.g.[N:-1:1] opts.train = struct() ; opts.train.gpus = []; opts.train.batchSize = 128 ; opts.train.numSubBatches = 16 ; opts.train.solver = [] ; opts.train.prefetch = true ; opts.train.numEpochs = 30 ; opts.train.randomSeed = 0 ; % resnet50 % opts.train.learningRate = 1e-2 * [ones(1,2), 0.1*ones(1,1), 0.01*ones(1,1)]; % caffe-ref opts.train.learningRate = 1e-3 * [ones(1,2), 0.1*ones(1,1), 0.01*ones(1,1)]; opts = vl_argparse(opts, varargin) ; if ~isfield(opts.train, 'gpus'), opts.train.gpus = []; end % opts.train.numEpochs = numel(opts.train.learningRate); % ------------------------------------------------------------------------- % 
Prepare data % ------------------------------------------------------------------------- if exist(opts.imdbPath,'file') imdb = load(opts.imdbPath) ; else imdb = opts.datasetFn('dataDir', opts.dataDir, 'lite', opts.lite) ; mkdir(opts.expDir) ; save(opts.imdbPath, '-struct', 'imdb') ; end % UCF101 has 3 data splits if opts.split>3 error('split should be <=3'); end imdb.images.set = imdb.images.sets(opts.split,:); % reverse frame order if opts.reverseDyn for i=1:numel(imdb.images.names) imdb.images.names{i} = imdb.images.names{i}(end:-1:1); end end % ------------------------------------------------------------------------- % Prepare model % ------------------------------------------------------------------------- if isempty(opts.network) net = load(opts.modelPath); if isfield(net,'net') net = net.net; end opts.nCls = max(imdb.images.label) ; net = opts.networkFn(net,opts); if numel(net.meta.normalization.averageImage)>3 sz = size(net.meta.normalization.averageImage) ; net.meta.normalization.averageImage = ... mean(reshape(net.meta.normalization.averageImage,[sz(1)*sz(2) sz(3)]),1) ; end % Set the class names in the network net.meta.classes.name = imdb.classes.name ; net.meta.classes.description = imdb.classes.name ; else assert(isa(opts.network,'dagnn.DagNN')) ; net = opts.network ; end % ------------------------------------------------------------------------- % Learn % ------------------------------------------------------------------------- if opts.epochFactor>0 opts.train.train = repmat(find(imdb.images.set==1),[1 opts.epochFactor]) ; else opts.train.train = NaN ; opts.train.numEpochs = 1 ; end opts.train.val = find(imdb.images.set==3) ; [net, info] = cnn_train_dicnn_dag(net, imdb, getBatchFn(opts, net.meta), ... 'expDir', opts.expDir, ... 
opts.train) ; % ------------------------------------------------------------------------- % Report accuracy % ------------------------------------------------------------------------- errlayer = net.getLayerIndex('errMC') ; if ~isnan(errlayer) cats = imdb.classes.name ; accs = net.layers(errlayer).block.accuracy ; if numel(cats)~=numel(accs) error('wrong number of classes\n') ; end for i=1:numel(cats) fprintf('%s acc %.1f\n',cats{i},100*accs(i)) ; end fprintf('Mean accuracy %.1f\n',100*mean(accs)) ; end % ------------------------------------------------------------------------- function fn = getBatchFn(opts, meta) % ------------------------------------------------------------------------- useGpu = numel(opts.train.gpus) > 0 ; bopts.numThreads = opts.numFetchThreads ; bopts.imageSize = meta.normalization.imageSize ; if isfield(meta.normalization,'border') bopts.border = meta.normalization.border ; else bopts.border = meta.normalization.imageSize(1:2) ./ ... meta.normalization.cropSize - meta.normalization.imageSize(1:2); end % bopts.averageImage = []; bopts.averageImage = meta.normalization.averageImage ; bopts.interpolation = meta.normalization.interpolation ; bopts.keepAspect = meta.normalization.keepAspect ; % bopts.rgbVariance = meta.augmentation.rgbVariance ; % bopts.transformation = meta.augmentation.transformation ; fn = @(x,y) getDagNNBatch(bopts,useGpu,x,y) ; % ------------------------------------------------------------------------- function inputs = getDagNNBatch(opts, useGpu, imdb, batch) % ------------------------------------------------------------------------- % batch refers to videos (not for frames) if isempty(batch) inputs = {'input', [], 'label', [], 'VideoId1', [], 'VideoId2', []}; return; end isVal = ~isempty(batch) && imdb.images.set(batch(1)) ~= 1 ; % if ~isVal, transformation='stretch'; else transformation='none';end if ~isVal, transformation='multiScaleRegular'; else transformation='none';end names = imdb.images.names(batch); % images = 
strcat([imdb.imageDir filesep], imdb.images.name(batch)) ; namesM = {}; nVids = numel(batch); VideoId1 = []; VideoId2 = []; % step-size stepSize = 6; % pool nFrames into a dynamic image nFrames = 10; % number of dynamic images to be max pooled later nDynImgs = 10; c1 = 1; for v=1:nVids name = names{v}; if isVal startF = 1 ; else startF = ceil(stepSize/2) ; end nFrms = numel(name); nSample = nFrames; nr = numel(startF:stepSize:nFrms); % jitter by removing 50 % and limit a batch to nMaxs * nSamples images if nr > 1 && (~isVal && nr>nDynImgs) rat = min(nDynImgs,ceil(0.50*nr)); ri = randperm(nr); ri = ri(1:rat); r = zeros(1,nr); r(ri) = 1; else if nr>2*nDynImgs rat = 2*nDynImgs; ri = randperm(nr); ri = ri(1:rat); r = zeros(1,nr); r(ri) = 1; else r = ones(1,nr); end end c3 = 1; c2 = 0; for f=startF:stepSize:nFrms if r(c3) idx = f:min(f+nSample-1,nFrms) ; if numel(idx) 0 if useGpu im = gpuArray(im) ; end inputs = {'input', im, 'label', imdb.images.label(batch), ... 'VideoId1', VideoId1, 'VideoId2', VideoId2}; end ================================================ FILE: dicnn/cnn_init_cafferef.m ================================================ % ------------------------------------------------------------------------- function net = cnn_init_cafferef(net,opts) % ------------------------------------------------------------------------- drop6p = find(cellfun(@(a) strcmp(a.name, 'dropout6'), net.layers)==1); drop7p = find(cellfun(@(a) strcmp(a.name, 'dropout7'), net.layers)==1); if ~isempty(drop6p) assert(~isempty(drop7p)); net.layers{drop6p}.rate = opts.DropOutRate; net.layers{drop7p}.rate = opts.DropOutRate; else relu6p = find(cellfun(@(a) strcmp(a.name, 'relu6'), net.layers)==1); relu7p = find(cellfun(@(a) strcmp(a.name, 'relu7'), net.layers)==1); drop6 = struct('type','dropout','rate', opts.DropOutRate,'name','dropout6') ; drop7 = struct('type','dropout','rate', opts.DropOutRate,'name','dropout7') ; net.layers = [net.layers(1:relu6p) drop6 net.layers(relu6p+1:relu7p) drop7 
net.layers(relu7p+1:end)]; end % replace fc8 fc8l = cellfun(@(a) strcmp(a.name, 'fc8'), net.layers)==1; nCls = opts.nCls ; % nCls = 101; sizeW = size(net.layers{fc8l}.weights{1}); if sizeW(4)~=nCls net.layers{fc8l}.weights = {zeros(sizeW(1),sizeW(2),sizeW(3),nCls,'single'), ... zeros(1, nCls, 'single')}; end % change loss % net.layers(end) = []; net.layers{end} = struct('name','loss', 'type','softmaxloss') ; % convert to dagnn net = dagnn.DagNN.fromSimpleNN(net, 'canonicalNames', true) ; poolLyr1 = find(arrayfun(@(a) strcmp(a.name, opts.pool1Layer), net.layers)==1); assert(~isempty(poolLyr1)); % configure appr-rank-pool switch opts.pool1Type case 'arpool' if strcmp(opts.pool1Layer,'conv1') net.addLayer('arpool',AppRankPooling('scale',1),{net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImgN'); net.addLayer('l2normalize',L2Normalize('scale',6000,'clip',[-128 128]),... 'DynImgN','DynImg'); else net.addLayer('arpool',AppRankPooling('scale',0.1),{net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImgN'); net.addLayer('reluP',dagnn.ReLU(),... {'DynImgN'},'DynImg'); end net.setLayerInputs(opts.pool1Layer,{'DynImg'}) ; case 'ppool1' if strcmp(opts.pool1Layer,'conv1') net.addLayer('parampool',LinComb('pad',[1 1 10 1]),... {net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImg',{'conv0f','conv0b'}); else net.addLayer('parampool',LinComb('pad',[1 1 10 1]),... {net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImgN',{'conv0f','conv0b'}); net.addLayer('reluP',dagnn.ReLU(),... {'DynImgN'},'DynImg'); end net.layers(poolLyr1).inputs{1} = 'DynImg' ; % net.params(end-1).value = 0.01 * randn(1,1,10,1,'single'); net.params(end-1).value = 0.1 * ones(1,1,10,1,'single'); net.params(end).value = zeros(1,1,'single'); net.params(end-1).learningRate = 0.1 ; net.params(end).learningRate = 0.2 ; case 'ppool2' if strcmp(opts.pool1Layer,'conv1') net.addLayer('parampool',LinComb('pad',[1 1 10 1]),... 
{net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImg',{'conv0f','conv0b'}); else net.addLayer('parampool',LinComb('pad',[1 1 10 1]),... {net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImgN',{'conv0f','conv0b'}); net.addLayer('reluP',dagnn.ReLU(),... {'DynImgN'},'DynImg'); end net.layers(poolLyr1).inputs{1} = 'DynImg' ; % net.params(end-1).value = 0.01 * randn(1,1,10,1,'single'); net.params(end-1).value = 0.1 * ones(1,1,10,1,'single'); net.params(end).value = zeros(1,1,'single'); net.params(end-1).learningRate = 0.1 ; net.params(end).learningRate = 0.2 ; case 'none' otherwise error('Unknown pool type %s', opts.pool1Type) ; end % second pool layer (max pooling) poolLyr2 = find(arrayfun(@(a) strcmp(a.name, opts.pool2Layer), net.layers)==1); net.addLayer('tempPoolMax',TemporalPooling('method','max'),... {net.layers(poolLyr2(1)).inputs{1},'VideoId2'},'tempPoolMax'); net.layers(poolLyr2).inputs{1} = 'tempPoolMax'; % add multi-class error net.addLayer('errMC',ErrorMultiClass(),{'prediction','label'},'mcerr'); net_ = net.saveobj ; net = dagnn.DagNN.loadobj(net_) ; net.removeLayer('loss') ; net.addLayer('loss', ... LossNormalized('loss', 'softmaxlog') ,... {'prediction', 'label'}, ... 'objective') ; % replace standard matconvnet bnorm with my version bns = find(arrayfun(@(a) strcmp(class(a.block), 'dagnn.BatchNorm'), net.layers)==1); for i=1:numel(bns) bb = net.layers(bns(i)).block ; net.layers(bns(i)).block = BatchNormN('numChannels',bb.numChannels,... 'epsilon',bb.epsilon,... 
'opts',bb.opts) ; end ================================================ FILE: dicnn/cnn_init_resnext.m ================================================ % ------------------------------------------------------------------------- function net = cnn_init_resnext(net,opts) % ------------------------------------------------------------------------- % initialize classifier net = dagnn.DagNN.loadobj(net) ; % convs = find(arrayfun(@(a) isa(a.block, 'dagnn.Conv'), net.layers)==1); fclayer = net.getLayer('classifier_0') ; sizeW = size(net.params(fclayer.paramIndexes(1)).value); % opts.nCls = 101; nCls = opts.nCls ; DropOutRate = opts.DropOutRate ; net.params(fclayer.paramIndexes(1)).value = ... 0.01 * randn([sizeW(1:3),nCls],'single') ; net.params(fclayer.paramIndexes(2)).value = zeros(nCls,1,'single') ; % change loss softmax = find(arrayfun(@(a) isa(a.block, 'dagnn.SoftMax'), net.layers)==1); if ~isempty(softmax) net.removeLayer(net.layers(softmax(1)).name) ; end % convs = find(arrayfun(@(a) isa(a.block, 'dagnn.Conv'), net.layers)==1); fclayer = find(arrayfun(@(a) strcmp(a.name, 'classifier_0'), net.layers)==1); net.renameVar(net.layers(fclayer(end)).name,'prediction') ; net.renameVar('data','input') ; %------------------------------------------------------------------------% % configure appr-rank-pool switch opts.pool1Type case 'arpool' if strcmp(opts.pool1Layer,'conv0') poolLyr1 = 1 ; net.addLayer('arpool',AppRankPooling('scale',0.1),{'input','VideoId1'},'DynImg'); net.setLayerInputs(net.layers(1).name,{'DynImg'}) ; else poolLyr1 = find(arrayfun(@(a) strcmp(a.name, opts.pool1Layer), net.layers)==1); assert(~isempty(poolLyr1)); net.addLayer('arpool',AppRankPooling('scale',0.1),{net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImg'); net.setLayerInputs(opts.pool1Layer,{'DynImg'}) ; end case 'ppool1' if strcmp(opts.pool1Layer,'conv0') poolLyr1 = 1 ; else poolLyr1 = find(arrayfun(@(a) strcmp(a.name, opts.pool1Layer), net.layers)==1); end net.addLayer('parampool',LinComb('pad',[1 
1 10 1]),... {'features_4_0_merge','VideoId1'},'DynImg0',{'conv0f','conv0b'}); % net.params(end-1).value = 0.1 * ones(1,1,10,1,'single'); net.params(end-1).value = 0.1 * randn(1,1,10,1,'single'); net.params(end).value = zeros(1,1,'single'); net.addLayer('BnormDyn',dagnn.BatchNorm('numChannels',256),'DynImg0','DynImg',... {'dym','dyb','dybx'}) ; net.params(end-2).value = ones(256,1,'single') ; net.params(end-1).value = zeros(256,1,'single') ; net.params(end).value = zeros(256,2,'single') ; % net.addLayer('reluP',dagnn.ReLU(),... % {'DynImg1'},'DynImg'); net.layers(16).inputs{1} = 'DynImg' ; for i=numel(net.params)-4:numel(net.params), net.params(i).learningRate = 0.1 * net.params(i).learningRate; end case 'none' otherwise error('Unknown pool type %s', opts.pool1Type) ; end net.rebuild() ; %------------------------------------------------------------------------% % second pool layer (max pooling) % poolLyr2 = find(arrayfun(@(a) strcmp(a.name, 'pool5'), net.layers)==1); poolLyr2 = find(arrayfun(@(a) strcmp(a.name, 'features_7_1_merge'), net.layers)==1); net.addLayer('tempPoolMax',TemporalPooling('method','max'),... {net.layers(poolLyr2(1)).outputs{1},'VideoId2'},'tempPoolMax'); % change the input of fc last layer % net.setLayerInputs(net.layers(convs(end)).name,'tempPoolMax') ; % net.addLayer('bnar',dagnn.BatchNorm('numChannels',2048),{'tempPoolMax'},... % 'tempPoolMaxbn',{'bnar_m','bnar_b','bnar_x'}); poolLyr2next = find(arrayfun(@(a) strcmp(a.name, 'features_7_1_id_relu'), net.layers)==1); net.setLayerInputs(net.layers(poolLyr2next(1)).name,{'tempPoolMax'}) ; net.rebuild() ; %------------------------------------------------------------------------% % add drop-out layers if DropOutRate>0 pool5 = find(arrayfun(@(a) strcmp(a.name, 'features_8'), net.layers)==1); oo = net.layers(pool5(1)).outputs{1}; net.addLayer('drop_pool5',dagnn.DropOut('rate',DropOutRate),... 
oo,sprintf('drop_%s',oo),{}); net.setLayerInputs('classifier_permute',{sprintf('drop_%s',oo)}) ; end %------------------------------------------------------------------------% % add multi-class error net.addLayer('errMC',ErrorMultiClass(),{'prediction','label'},'mcerr'); net.addLayer('loss', ... LossNormalized('loss', 'softmaxlog') ,... {'prediction', 'label'}, ... 'objective') ; %------------------------------------------------------------------------% net.rebuild() % replace standard matconvnet bnorm with my version bns = find(arrayfun(@(a) strcmp(class(a.block), 'dagnn.BatchNorm'), net.layers)==1); for i=1:numel(bns) bb = net.layers(bns(i)).block ; net.layers(bns(i)).block = BatchNormN('numChannels',bb.numChannels,... 'epsilon',bb.epsilon,... 'opts',bb.opts) ; end % dagMergeBatchNorm(net) ; % dagRemoveLayersOfType(net, 'dagnn.BatchNorm') ; net_ = net.saveobj ; net = dagnn.DagNN.loadobj(net_) ; net.meta.normalization.border = [32 32] ; ================================================ FILE: dicnn/cnn_single_of.m ================================================ function [net, info] = cnn_single_of(varargin) %CNN_SINGLE_OF Demonstrates fine-tuning a pre-trained CNN with static % optical flow (OF in pami journal) on UCF101 dataset run(fullfile(fileparts(mfilename('fullpath')), ... 
'..', 'matconvnet', 'matlab', 'vl_setupnn.m')) ; addpath Layers Datasets opts.dataDir = fullfile('data','UCF101') ; opts.expDir = fullfile('exp', 'UCF101') ; opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat.mat') ; [opts, varargin] = vl_argparse(opts, varargin) ; opts.numFetchThreads = 8 ; opts.lite = false ; opts.imdbPath = fullfile(opts.dataDir, 'imdb-of.mat'); opts.DropOutRate = 0.85 ; opts.datasetFn = @cnn_ucf101_of_setup_data ; opts.networkFn = @cnn_resnext_init ; opts.split = 1; % data split opts.reverseDyn = 0; % reverse video frames e.g.[N:-1:1] opts.numDynImgs = 10 ; opts.epochFactor = 5 ; opts.pool1Layer = 'conv0'; % before conv1 opts.pool1Type = 'none' ; opts.pool2Layer = 'fc6' ; opts.train = struct() ; opts.train.gpus = []; opts.train.batchSize = 128 ; opts.train.numSubBatches = 32 ; opts.train.solver = [] ; opts.train.prefetch = true ; opts.train.learningRate = 1e-2 ; opts.train.numEpochs = 30 ; opts = vl_argparse(opts, varargin) ; if ~isfield(opts.train, 'gpus'), opts.train.gpus = []; end; % ------------------------------------------------------------------------- % Prepare data % ------------------------------------------------------------------------- if exist(opts.imdbPath,'file') imdb = load(opts.imdbPath) ; else imdb = opts.datasetFn('dataDir', opts.dataDir, 'lite', opts.lite) ; mkdir(opts.expDir) ; save(opts.imdbPath, '-struct', 'imdb') ; end % UCF101 has 3 data splits if opts.split>3 error('split should be <=3'); end imdb.images.set = imdb.images.sets(opts.split,:); % reverse frame order if opts.reverseDyn for i=1:numel(imdb.images.names) imdb.images.names{i} = imdb.images.names{i}(end:-1:1); end end % ------------------------------------------------------------------------- % Prepare model % ------------------------------------------------------------------------- net = load(opts.modelPath); if isfield(net,'net') net = net.net; end opts.nCls = max(imdb.images.label) ; % net = dagnn.DagNN.loadobj(net) ; net = 
opts.networkFn(net,opts) ; % two channels instead of 3 RGB net.params(1).value = net.params(1).value(:,:,1:2,:) ; % Set the class names in the network net.meta.classes.name = imdb.classes.name ; net.meta.classes.description = imdb.classes.name ; % ------------------------------------------------------------------------- % Learn % ------------------------------------------------------------------------- if opts.epochFactor>0 opts.train.train = repmat(find(imdb.images.set==1),[1 opts.epochFactor]) ; else opts.train.train = NaN ; end opts.train.val = find(imdb.images.set==3) ; [net, info] = cnn_train_dicnn_dag(net, imdb, getBatchFn(opts, net.meta), ... 'expDir', opts.expDir, ... opts.train) ; % ------------------------------------------------------------------------- % Report accuracy % ------------------------------------------------------------------------- errlayer = net.getLayerIndex('errMC') ; if ~isnan(errlayer) cats = imdb.classes.name ; accs = net.layers(errlayer).block.accuracy ; if numel(cats)~=numel(accs) error('wrong number of classes\n') ; end for i=1:numel(cats) fprintf('%s acc %.1f\n',cats{i},100*accs(i)) ; end fprintf('Mean accuracy %.1f\n',100*mean(accs)) ; end % ------------------------------------------------------------------------- function fn = getBatchFn(opts, meta) % ------------------------------------------------------------------------- useGpu = numel(opts.train.gpus) > 0 ; bopts.numThreads = opts.numFetchThreads ; bopts.imageSize = meta.normalization.imageSize ; if isfield(meta.normalization,'border') bopts.border = meta.normalization.border ; else bopts.border = meta.normalization.imageSize(1:2) ./ ... 
meta.normalization.cropSize - meta.normalization.imageSize(1:2); end bopts.averageImage = 128 * ones([1 1 2],'single') ; bopts.numDynImgs = opts.numDynImgs ; % bopts.averageImage = meta.normalization.averageImage ; % bopts.rgbVariance = meta.augmentation.rgbVariance ; % bopts.transformation = meta.augmentation.transformation ; bopts.transformation = 'stretch' ; bopts.transformation = 'multiScaleRegular' ; fn = @(x,y) getDagNNBatch(bopts,useGpu,x,y) ; % ------------------------------------------------------------------------- function inputs = getDagNNBatch(opts, useGpu, imdb, batch) % ------------------------------------------------------------------------- % batch refers to videos (not for frames) if isempty(batch) inputs = {'input', [], 'label', [], 'VideoId1', [], 'VideoId2', []}; return; end isVal = ~isempty(batch) && imdb.images.set(batch(1)) ~= 1 ; if ~isVal, transformation='multiScaleRegular'; else transformation='none';end names = imdb.images.names(batch); % images = strcat([imdb.imageDir filesep], imdb.images.name(batch)) ; namesM = {}; nVids = numel(batch); VideoId1 = []; VideoId2 = []; % step-size stepSize = 6; % pool nFrames into a dynamic image nFrames = 1; % number of dynamic images to be max pooled later nDynImgs = opts.numDynImgs ; opts = rmfield(opts,'numDynImgs') ; c1 = 1; for v=1:nVids name = names{v}; nFrms = numel(name)/2; nSample = nFrames; nr = numel(1:stepSize:nFrms); % jitter by removing 50 % and limit a batch to nMaxs * nSamples images if nr > 1 && (~isVal && nr>nDynImgs) rat = min(nDynImgs,ceil(0.50*nr)); ri = randperm(nr); ri = ri(1:rat); r = zeros(1,nr); r(ri) = 1; else r = ones(1,nr); end c3 = 1; c2 = 0; for f=1:stepSize:nFrms if r(c3) idx = f:min(f+nSample-1,nFrms) ; if numel(idx) 0 if useGpu im = gpuArray(im) ; end inputs = {'input', im, 'label', imdb.images.label(batch), ... 
'VideoId2', VideoId2}; end ================================================ FILE: dicnn/cnn_single_rgb.m ================================================ function [net, info] = cnn_single_rgb(varargin) %CNN_SINGLE_RGB Demonstrates fine-tuning a pre-trained CNN with static % RGB frames (SI in pami journal) on UCF101 dataset run(fullfile(fileparts(mfilename('fullpath')), ... '..', 'matconvnet', 'matlab', 'vl_setupnn.m')) ; addpath Layers Datasets opts.dataDir = fullfile('data','UCF101') ; opts.expDir = fullfile('exp', 'UCF101') ; opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat'); opts.datasetFn = @cnn_ucf101_setup_data ; opts.networkFn = @cnn_init_resnext ; opts.pool1Type = 'none' ; opts.pool1Layer = 'conv1' ; opts.pool2Layer = '' ; [opts, varargin] = vl_argparse(opts, varargin) ; opts.numFetchThreads = 8 ; opts.lite = false ; opts.imdbPath = fullfile(opts.dataDir, 'imdb-rgb.mat'); opts.ARPoolLayer = 'conv0'; % before conv1 opts.DropOutRate = 0.5 ; opts.epochFactor = 5 ; opts.split = 1; % data split opts.reverseDyn = 0; % reverse video frames e.g.[N:-1:1] opts.train = struct() ; opts.train.gpus = []; opts.train.batchSize = 128 ; opts.train.numSubBatches = 16 ; opts.train.solver = [] ; opts.train.prefetch = true ; opts.train.numEpochs = 30 ; % resnet50 opts.train.learningRate = 1e-2 * [ones(1,2), 0.1*ones(1,1), 0.01*ones(1,1)]; % caffe-ref opts.train.learningRate = 1e-4 * [ones(1,2), 0.1*ones(1,1), 0.01*ones(1,1)]; opts = vl_argparse(opts, varargin) ; if ~isfield(opts.train, 'gpus'), opts.train.gpus = []; end; % opts.train.numEpochs = numel(opts.train.learningRate); % ------------------------------------------------------------------------- % Prepare data % ------------------------------------------------------------------------- if exist(opts.imdbPath,'file') imdb = load(opts.imdbPath) ; else imdb = opts.datasetFn('dataDir', opts.dataDir, 'lite', opts.lite) ; mkdir(opts.expDir) ; save(opts.imdbPath, '-struct', 'imdb') ; end % UCF101 has 3 data splits 
if opts.split>3 error('split should be <=3'); end imdb.images.set = imdb.images.sets(opts.split,:); % reverse frame order if opts.reverseDyn for i=1:numel(imdb.images.names) imdb.images.names{i} = imdb.images.names{i}(end:-1:1); end end % ------------------------------------------------------------------------- % Prepare model % ------------------------------------------------------------------------- net = load(opts.modelPath); if isfield(net,'net') net = net.net; end opts.nCls = max(imdb.images.label) ; net = opts.networkFn(net,opts); if numel(net.meta.normalization.averageImage)>3 sz = size(net.meta.normalization.averageImage) ; net.meta.normalization.averageImage = ... mean(reshape(net.meta.normalization.averageImage,[sz(1)*sz(2) sz(3)]),1) ; end % Set the class names in the network net.meta.classes.name = imdb.classes.name ; net.meta.classes.description = imdb.classes.name ; % ------------------------------------------------------------------------- % Learn % ------------------------------------------------------------------------- if opts.epochFactor>0 opts.train.train = repmat(find(imdb.images.set==1),[1 opts.epochFactor]) ; else opts.train.train = NaN ; end opts.train.val = find(imdb.images.set==3) ; [net, info] = cnn_train_dicnn_dag(net, imdb, getBatchFn(opts, net.meta), ... 'expDir', opts.expDir, ... 
opts.train) ; % ------------------------------------------------------------------------- % Report accuracy % ------------------------------------------------------------------------- errlayer = net.getLayerIndex('errMC') ; if ~isnan(errlayer) cats = imdb.classes.name ; accs = net.layers(errlayer).block.accuracy ; if numel(cats)~=numel(accs) error('wrong number of classes\n') ; end for i=1:numel(cats) fprintf('%s acc %.1f\n',cats{i},100*accs(i)) ; end fprintf('Mean accuracy %.1f\n',100*mean(accs)) ; end % ------------------------------------------------------------------------- function fn = getBatchFn(opts, meta) % ------------------------------------------------------------------------- useGpu = numel(opts.train.gpus) > 0 ; bopts.numThreads = opts.numFetchThreads ; bopts.imageSize = meta.normalization.imageSize ; if isfield(meta.normalization,'border') bopts.border = meta.normalization.border ; else bopts.border = meta.normalization.imageSize(1:2) ./ ... meta.normalization.cropSize - meta.normalization.imageSize(1:2); end % bopts.averageImage = []; bopts.averageImage = meta.normalization.averageImage ; bopts.interpolation = meta.normalization.interpolation ; bopts.keepAspect = meta.normalization.keepAspect ; % bopts.rgbVariance = meta.augmentation.rgbVariance ; % bopts.transformation = meta.augmentation.transformation ; fn = @(x,y) getDagNNBatch(bopts,useGpu,x,y) ; % ------------------------------------------------------------------------- function inputs = getDagNNBatch(opts, useGpu, imdb, batch) % ------------------------------------------------------------------------- % batch refers to videos (not for frames) if isempty(batch) inputs = {'input', [], 'label', [], 'VideoId1', [], 'VideoId2', []}; return; end isVal = ~isempty(batch) && imdb.images.set(batch(1)) ~= 1 ; % if ~isVal, transformation='stretch'; else transformation='none';end if ~isVal, transformation='multiScaleRegular'; else transformation='none';end names = imdb.images.names(batch); % images = 
strcat([imdb.imageDir filesep], imdb.images.name(batch)) ; namesM = {}; nVids = numel(batch); VideoId1 = []; VideoId2 = []; % step-size stepSize = 6; % pool nFrames into a dynamic image nFrames = 1; % number of dynamic images to be max pooled later nDynImgs = 10; c1 = 1; for v=1:nVids name = names{v}; nFrms = numel(name); nSample = nFrames; nr = numel(1:stepSize:nFrms); % jitter by removing 50 % and limit a batch to nMaxs * nSamples images if nr > 1 && (~isVal && nr>nDynImgs) rat = min(nDynImgs,ceil(0.50*nr)); ri = randperm(nr); ri = ri(1:rat); r = zeros(1,nr); r(ri) = 1; else r = ones(1,nr); end c3 = 1; c2 = 0; for f=1:stepSize:nFrms if r(c3) idx = f:min(f+nSample-1,nFrms) ; if numel(idx) 0 if useGpu im = gpuArray(im) ; end inputs = {'input', im, 'label', imdb.images.label(batch), ... 'VideoId2', VideoId2}; end ================================================ FILE: dicnn/cnn_train_dicnn_dag.m ================================================ function [net,stats] = cnn_train_dicnn_dag(net, imdb, getBatch, varargin) %CNN_DICNN_TRAIN_DAG Demonstrates training a CNN using the DagNN wrapper % CNN_TRAIN_DAG() is similar to CNN_TRAIN(), but works with % the DagNN wrapper instead of the SimpleNN wrapper. % Copyright (C) 2014-16 Andrea Vedaldi. % All rights reserved. % % This file is part of the VLFeat library and is made available under % the terms of the BSD license (see the COPYING file). addpath(fullfile(vl_rootnn, 'examples')); opts.expDir = fullfile('data','exp') ; opts.continue = true ; opts.batchSize = 256 ; opts.numSubBatches = 1 ; opts.train = [] ; opts.val = [] ; opts.gpus = [] ; opts.prefetch = false ; opts.epochSize = inf; opts.numEpochs = 300 ; opts.learningRate = 0.001 ; opts.weightDecay = 0.0005 ; opts.solver = [] ; % Empty array means use the default SGD solver [opts, varargin] = vl_argparse(opts, varargin) ; if ~isempty(opts.solver) assert(isa(opts.solver, 'function_handle') && nargout(opts.solver) == 2,... 
'Invalid solver; expected a function handle with two outputs.') ; % Call without input arguments, to get default options opts.solverOpts = opts.solver() ; end opts.momentum = 0.9 ; opts.saveSolverState = true ; opts.nesterovUpdate = false ; opts.randomSeed = 0 ; opts.profile = false ; opts.parameterServer.method = 'mmap' ; opts.parameterServer.prefix = 'mcn' ; opts.derOutputs = {'objective', 1} ; opts.extractStatsFn = @extractStats ; opts.plotStatistics = true; opts.postEpochFn = [] ; % postEpochFn(net,params,state) called after each epoch; can return a new learning rate, 0 to stop, [] for no change opts = vl_argparse(opts, varargin) ; if ~exist(opts.expDir, 'dir'), mkdir(opts.expDir) ; end if isempty(opts.train), opts.train = find(imdb.images.set==1) ; end if isempty(opts.val), opts.val = find(imdb.images.set==2) ; end if isscalar(opts.train) && isnumeric(opts.train) && isnan(opts.train) opts.train = [] ; end if isscalar(opts.val) && isnumeric(opts.val) && isnan(opts.val) opts.val = [] ; end % ------------------------------------------------------------------------- % Initialization % ------------------------------------------------------------------------- evaluateMode = isempty(opts.train) ; if ~evaluateMode if isempty(opts.derOutputs) error('DEROUTPUTS must be specified when training.\n') ; end end % ------------------------------------------------------------------------- % Train and validate % ------------------------------------------------------------------------- modelPath = @(ep) fullfile(opts.expDir, sprintf('net-epoch-%d.mat', ep)); modelFigPath = fullfile(opts.expDir, 'net-train.pdf') ; start = opts.continue * findLastCheckpoint(opts.expDir) ; if start >= 1 fprintf('%s: resuming by loading epoch %d\n', mfilename, start) ; [net, state, stats] = loadState(modelPath(start)) ; else state = [] ; end for epoch=start+1:opts.numEpochs % Set the random seed based on the epoch and opts.randomSeed. 
% This is important for reproducibility, including when training % is restarted from a checkpoint. rng(epoch + opts.randomSeed) ; prepareGPUs(opts, epoch == start+1) ; % Train for one epoch. params = opts ; params.epoch = epoch ; params.learningRate = opts.learningRate(min(epoch, numel(opts.learningRate))) ; params.train = opts.train(randperm(numel(opts.train))) ; % shuffle params.train = params.train(1:min(opts.epochSize, numel(opts.train))); params.val = opts.val(randperm(numel(opts.val))) ; params.imdb = imdb ; params.getBatch = getBatch ; if numel(opts.gpus) <= 1 [net, state] = processEpoch(net, state, params, 'train') ; [net, state] = processEpoch(net, state, params, 'val') ; if ~evaluateMode saveState(modelPath(epoch), net, state) ; end lastStats = state.stats ; else spmd [net, state] = processEpoch(net, state, params, 'train') ; [net, state] = processEpoch(net, state, params, 'val') ; if labindex == 1 && ~evaluateMode saveState(modelPath(epoch), net, state) ; end lastStats = state.stats ; end lastStats = accumulateStats(lastStats) ; end stats.train(epoch) = lastStats.train ; stats.val(epoch) = lastStats.val ; clear lastStats ; saveStats(modelPath(epoch), stats) ; if opts.plotStatistics switchFigure(1) ; clf ; plots = setdiff(... cat(2,... fieldnames(stats.train)', ... 
fieldnames(stats.val)'), {'num', 'time'}) ; for p = plots p = char(p) ; values = zeros(0, epoch) ; leg = {} ; for f = {'train', 'val'} f = char(f) ; if isfield(stats.(f), p) tmp = [stats.(f).(p)] ; values(end+1,:) = tmp(1,:)' ; leg{end+1} = f ; end end subplot(1,numel(plots),find(strcmp(p,plots))) ; plot(1:epoch, values','o-') ; xlabel('epoch') ; title(p) ; legend(leg{:}) ; grid on ; end drawnow ; print(1, modelFigPath, '-dpdf') ; end if ~isempty(opts.postEpochFn) if nargout(opts.postEpochFn) == 0 opts.postEpochFn(net, params, state) ; else lr = opts.postEpochFn(net, params, state) ; if ~isempty(lr), opts.learningRate = lr; end if opts.learningRate == 0, break; end end end end % With multiple GPUs, return one copy if isa(net, 'Composite'), net = net{1} ; end % ------------------------------------------------------------------------- function [net, state] = processEpoch(net, state, params, mode) % ------------------------------------------------------------------------- % Note that net is not strictly needed as an output argument as net % is a handle class. However, this fixes some aliasing issue in the % spmd caller. 
% initialize with momentum 0 if isempty(state) || isempty(state.solverState) state.solverState = cell(1, numel(net.params)) ; state.solverState(:) = {0} ; end % move CNN to GPU as needed numGpus = numel(params.gpus) ; if numGpus >= 1 net.move('gpu') ; for i = 1:numel(state.solverState) s = state.solverState{i} ; if isnumeric(s) state.solverState{i} = gpuArray(s) ; elseif isstruct(s) state.solverState{i} = structfun(@gpuArray, s, 'UniformOutput', false) ; end end end if numGpus > 1 parserv = ParameterServer(params.parameterServer) ; net.setParameterServer(parserv) ; else parserv = [] ; end % profile if params.profile if numGpus <= 1 profile clear ; profile on ; else mpiprofile reset ; mpiprofile on ; end end num = 0 ; epoch = params.epoch ; subset = params.(mode) ; adjustTime = 0 ; stats.num = 0 ; % return something even if subset = [] stats.time = 0 ; start = tic ; for t=1:params.batchSize:numel(subset) fprintf('%s: epoch %02d: %3d/%3d:', mode, epoch, ... fix((t-1)/params.batchSize)+1, ceil(numel(subset)/params.batchSize)) ; batchSize = min(params.batchSize, numel(subset) - t + 1) ; for s=1:params.numSubBatches % get this image batch and prefetch the next batchStart = t + (labindex-1) + (s-1) * numlabs ; batchEnd = min(t+params.batchSize-1, numel(subset)) ; batch = subset(batchStart : params.numSubBatches * numlabs : batchEnd) ; num = num + numel(batch) ; if numel(batch) == 0, continue ; end inputs = params.getBatch(params.imdb, batch) ; if params.prefetch if s == params.numSubBatches batchStart = t + (labindex-1) + params.batchSize ; batchEnd = min(t+2*params.batchSize-1, numel(subset)) ; else batchStart = batchStart + numlabs ; end nextBatch = subset(batchStart : params.numSubBatches * numlabs : batchEnd) ; params.getBatch(params.imdb, nextBatch) ; end if strcmp(mode, 'train') net.mode = 'normal' ; net.accumulateParamDers = (s ~= 1) ; net.eval(inputs, params.derOutputs, 'holdOn', s < params.numSubBatches) ; else net.mode = 'test' ; net.eval(inputs) ; end end % 
Accumulate gradient. if strcmp(mode, 'train') if ~isempty(parserv), parserv.sync() ; end state = accumulateGradients(net, state, params, parserv) ; end % Get statistics. time = toc(start) + adjustTime ; batchTime = time - stats.time ; stats.num = num ; stats.time = time ; stats = params.extractStatsFn(stats,net) ; currentSpeed = batchSize / batchTime ; averageSpeed = (t + batchSize - 1) / time ; if t == 3*params.batchSize + 1 % compensate for the first three iterations, which are outliers adjustTime = 4*batchTime - time ; stats.time = time + adjustTime ; end fprintf(' %.1f (%.1f) Hz', averageSpeed, currentSpeed) ; for f = setdiff(fieldnames(stats)', {'num', 'time'}) f = char(f) ; fprintf(' %s: %.3f', f, stats.(f)) ; end fprintf('\n') ; end % Save back to state. state.stats.(mode) = stats ; if params.profile if numGpus <= 1 state.prof.(mode) = profile('info') ; profile off ; else state.prof.(mode) = mpiprofile('info'); mpiprofile off ; end end if ~params.saveSolverState state.solverState = [] ; else for i = 1:numel(state.solverState) s = state.solverState{i} ; if isnumeric(s) state.solverState{i} = gather(s) ; elseif isstruct(s) state.solverState{i} = structfun(@gather, s, 'UniformOutput', false) ; end end end net.reset() ; net.move('cpu') ; % ------------------------------------------------------------------------- function state = accumulateGradients(net, state, params, parserv) % ------------------------------------------------------------------------- numGpus = numel(params.gpus) ; otherGpus = setdiff(1:numGpus, labindex) ; den = params.numSubBatches * max(numGpus,1) ; for p=1:numel(net.params) if ~isempty(parserv) parDer = parserv.pullWithIndex(p) ; else parDer = net.params(p).der ; end switch net.params(p).trainMethod case 'average' % mainly for batch normalization thisLR = net.params(p).learningRate ; net.params(p).value = vl_taccum(... 1 - thisLR, net.params(p).value, ... 
(thisLR/den/net.params(p).fanout), parDer) ; case 'gradient' thisDecay = params.weightDecay * net.params(p).weightDecay ; thisLR = params.learningRate * net.params(p).learningRate ; if thisLR>0 || thisDecay>0 % Normalize gradient and incorporate weight decay. parDer = vl_taccum(1/den, parDer, ... thisDecay, net.params(p).value) ; if isempty(params.solver) % Default solver is the optimised SGD. % Update momentum. state.solverState{p} = vl_taccum(... params.momentum, state.solverState{p}, ... -1, parDer) ; % Nesterov update (aka one step ahead). if params.nesterovUpdate delta = params.momentum * state.solverState{p} - parDer ; else delta = state.solverState{p} ; end % Update parameters. net.params(p).value = vl_taccum(... 1, net.params(p).value, thisLR, delta) ; else % call solver function to update weights [net.params(p).value, state.solverState{p}] = ... params.solver(net.params(p).value, state.solverState{p}, ... parDer, params.solverOpts, thisLR) ; end end otherwise error('Unknown training method ''%s'' for parameter ''%s''.', ... net.params(p).trainMethod, ... 
net.params(p).name) ; end end % ------------------------------------------------------------------------- function stats = accumulateStats(stats_) % ------------------------------------------------------------------------- for s = {'train', 'val'} s = char(s) ; total = 0 ; % initialize stats stucture with same fields and same order as % stats_{1} stats__ = stats_{1} ; names = fieldnames(stats__.(s))' ; values = zeros(1, numel(names)) ; fields = cat(1, names, num2cell(values)) ; stats.(s) = struct(fields{:}) ; for g = 1:numel(stats_) stats__ = stats_{g} ; num__ = stats__.(s).num ; total = total + num__ ; for f = setdiff(fieldnames(stats__.(s))', 'num') f = char(f) ; stats.(s).(f) = stats.(s).(f) + stats__.(s).(f) * num__ ; if g == numel(stats_) stats.(s).(f) = stats.(s).(f) / total ; end end end stats.(s).num = total ; end % ------------------------------------------------------------------------- function stats = extractStats(stats, net) % ------------------------------------------------------------------------- sel = find(cellfun(@(x) isa(x,'dagnn.Loss'), {net.layers.block})) ; for i = 1:numel(sel) if net.layers(sel(i)).block.ignoreAverage, continue; end; stats.(net.layers(sel(i)).outputs{1}) = net.layers(sel(i)).block.average ; end % ------------------------------------------------------------------------- function saveState(fileName, net_, state) % ------------------------------------------------------------------------- net = net_.saveobj() ; save(fileName, 'net', 'state') ; % ------------------------------------------------------------------------- function saveStats(fileName, stats) % ------------------------------------------------------------------------- if exist(fileName) save(fileName, 'stats', '-append') ; else save(fileName, 'stats') ; end % ------------------------------------------------------------------------- function [net, state, stats] = loadState(fileName) % ------------------------------------------------------------------------- 
load(fileName, 'net', 'state', 'stats') ;
net = dagnn.DagNN.loadobj(net) ;
% A complete checkpoint contains 'net', 'state' (written by saveState) and
% 'stats' (appended later by saveStats); a missing 'stats' variable means
% the epoch file was only partially written.
if isempty(whos('stats'))
  error('Epoch ''%s'' was only partially saved. Delete this file and try again.', ...
    fileName) ;
end

% -------------------------------------------------------------------------
function epoch = findLastCheckpoint(modelDir)
% -------------------------------------------------------------------------
% Return the largest epoch index N for which net-epoch-N.mat exists in
% MODELDIR; 0 when no checkpoint is found.
list = dir(fullfile(modelDir, 'net-epoch-*.mat')) ;
tokens = regexp({list.name}, 'net-epoch-([\d]+).mat', 'tokens') ;
epoch = cellfun(@(x) sscanf(x{1}{1}, '%d'), tokens) ;
epoch = max([epoch 0]) ;

% -------------------------------------------------------------------------
function switchFigure(n)
% -------------------------------------------------------------------------
% Make figure N current without stealing focus; fall back to figure()
% when the figure window does not exist yet.
if get(0,'CurrentFigure') ~= n
  try
    set(0,'CurrentFigure',n) ;
  catch
    figure(n) ;
  end
end

% -------------------------------------------------------------------------
function clearMex()
% -------------------------------------------------------------------------
% Unload MEX files before a GPU reset (they may hold device memory).
clear vl_tmove vl_imreadjpeg ;

% -------------------------------------------------------------------------
function prepareGPUs(opts, cold)
% -------------------------------------------------------------------------
% Prepare the GPU(s) listed in OPTS.GPUS; COLD forces a device reset.
numGpus = numel(opts.gpus) ;
if numGpus > 1
  % check parallel pool integrity as it could have timed out
  pool = gcp('nocreate') ;
  if ~isempty(pool) && pool.NumWorkers ~= numGpus
    delete(pool) ;
  end
  pool = gcp('nocreate') ;
  if isempty(pool)
    parpool('local', numGpus) ;
    cold = true ;
  end
end
if numGpus >= 1 && cold
  fprintf('%s: resetting GPU\n', mfilename)
  clearMex() ;
  if numGpus == 1
    gpuDevice(opts.gpus)
  else
    spmd
      clearMex() ;
      gpuDevice(opts.gpus(labindex))
    end
  end
end

================================================ FILE: dicnn/cnn_video_of_get_batch.m ================================================

function imo = cnn_video_of_get_batch(images, vids, varargin)
% CNN_VIDEO_OF_GET_BATCH Load, preprocess, and pack images for CNN evaluation
%
% video ids
% use same spatial jittering for frames from the same video
% NOTE: all the frames from a video should have the same size (wxh)
% Default options. Flow frames arrive as interleaved u/v jpeg pairs, so
% IMAGES holds 2*numFrames entries and each output slot has 2 channels.
opts.imageSize = [227, 227] ;
opts.border = [29, 29] ;
opts.keepAspect = true ;
opts.numAugments = 1 ;
opts.transformation = 'multiScaleRegular' ;
opts.averageImage = [] ;
opts.rgbVariance = zeros(0,2,'single') ;
opts.interpolation = 'bilinear' ;
opts.numThreads = 1 ;
opts.prefetch = false ;
opts.lazyResize = true ;
opts.subMean = false; % subtract the mean from each video
opts = vl_argparse(opts, varargin);

% fetch is true if images is a list of filenames (instead of
% a cell array of images)
fetch = numel(images) >= 1 && ischar(images{1}) ;
% prefetch is used to load images in a separate thread
prefetch = fetch & opts.prefetch ;
if prefetch
  vl_imreadjpeg(images, 'numThreads', opts.numThreads, 'prefetch') ;
  imo = [] ;
  return ;
end
if fetch
  im = vl_imreadjpeg(images,'numThreads', opts.numThreads) ;
else
  im = images ;
end

% Tabulated crop/flip transformations; rows are (ty, tx, flip).
tfs = [] ;
switch opts.transformation
  case 'none'
    tfs = [ .5 ; .5 ; 0 ] ;
  case 'f5'
    tfs = [...
      .5 0 0 1 1 .5 0 0 1 1 ;
      .5 0 1 0 1 .5 0 1 0 1 ;
      0 0 0 0 0 1 1 1 1 1] ;
  case 'f25'
    [tx,ty] = meshgrid(linspace(0,1,5)) ;
    tfs = [tx(:)' ; ty(:)' ; zeros(1,numel(tx))] ;
    tfs_ = tfs ;
    tfs_(3,:) = 1 ;
    tfs = [tfs,tfs_] ;
  case 'stretch'
  case 'multiScaleRegular'
  otherwise
    error('Uknown transformations %s', opts.transformation) ;
end
[~,transformations] = sort(rand(size(tfs,2), numel(images)), 1) ;

if ~isempty(opts.rgbVariance) && isempty(opts.averageImage)
  opts.averageImage = zeros(1,1,2) ;
end
if numel(opts.averageImage) == 2
  opts.averageImage = reshape(opts.averageImage, 1,1,2) ;
end

% Output tensor: 2 flow channels (u,v); one slot per u/v frame pair.
imo = zeros(opts.imageSize(1), opts.imageSize(2), 2, ...
  numel(images)/2*opts.numAugments, 'single') ;

nVid = max(vids);
si = 1 ;
countv = 1;
for v=1:nVid
  vid = find(vids==v);
  for i=1:numel(images(vid))
    % acquire image
    if isempty(im{i})
      imt1 = imread(images{2*vid(i)-1}) ;
      imt2 = imread(images{2*vid(i)}) ;
    else
      imt1 = im{2*vid(i)-1} ;
      imt2 = im{2*vid(i)} ;
    end
    imt = single(cat(3,imt1,imt2)) ; % faster than im2single (and multiplies by 255)

    % resize
    w = size(imt,2) ;
    h = size(imt,1) ;
    factor = [(opts.imageSize(1)+opts.border(1))/h ...
      (opts.imageSize(2)+opts.border(2))/w];
    if opts.keepAspect
      factor = max(factor) ;
    end
    if any(abs(factor - 1) > 0.0001)
      imt = imresize(imt, ...
        'scale', factor, ...
        'method', opts.interpolation) ;
    end

    % crop & flip
    % Jitter parameters are drawn only once per video (i==1) so every
    % frame of the same video receives the same crop and flip.
    if i==1
      flip = rand > 0.5 ;
      w = size(imt,2) ;
      h = size(imt,1) ;
      switch opts.transformation
        case 'stretch'
          sz = round(min(opts.imageSize(1:2)' .* (1-0.1+0.2*rand(2,1)), [w;h])) ;
          dx = randi(w - sz(2) + 1, 1) ;
          dy = randi(h - sz(1) + 1, 1) ;
          % flip = rand > 0.5 ;
        case 'multiScaleRegular'
          % Regular multi-scale crops: 4 corners + centre at 4 sizes.
          reg_szs = [256, 224, 192, 168] ;
          sz(1) = reg_szs(randi(4));
          sz(2) = reg_szs(randi(4));
          dy = [0 h-sz(1) 0 h-sz(1) floor((h-sz(1)+1)/2)] + 1;
          dx = [0 w-sz(2) w-sz(2) 0 floor((w-sz(2)+1)/2)] + 1;
          corner = randi(5);
          dx = dx(corner);
          dy = dy(corner);
        otherwise
          tf = tfs(:, transformations(mod(0, numel(transformations)) + 1)) ;
          sz = opts.imageSize(1:2) ;
          dx = floor((w - sz(2)) * tf(2)) + 1 ;
          dy = floor((h - sz(1)) * tf(1)) + 1 ;
          % flip = tf(3) ;
      end
    end
    if opts.lazyResize
      % Crop-by-sampling: index grids instead of an explicit resize.
      sx = round(linspace(dx, sz(2)+dx-1, opts.imageSize(2))) ;
      sy = round(linspace(dy, sz(1)+dy-1, opts.imageSize(1))) ;
    else
      factor = [opts.imageSize(1)/sz(1) ...
        opts.imageSize(2)/sz(2)];
      if any(abs(factor - 1) > 0.0001)
        imt = imresize(gather(imt(dy:sz(1)+dy-1,dx:sz(2)+dx-1,:)), [opts.imageSize(1:2)],...
'Antialiasing', false, 'Method', opts.interpolation);
      end
      sx = 1:opts.imageSize(2);
      sy = 1:opts.imageSize(1);
    end
    if flip
      % Horizontal flip: mirror the columns and negate the u (horizontal)
      % flow component around the 255-complement encoding used by the jpegs.
      sx = fliplr(sx) ;
      imo(:,:,1,si) = 255 - imt(sy,sx,1) ;
      imo(:,:,2,si) = imt(sy,sx,2) ;
    else
      imo(:,:,:,si) = imt(sy,sx,:) ;
    end
    si = si + 1 ;
  end
  countv = countv + numel(images(vid));
end

% Subtract the mean flow image, optionally jittered with rgbVariance.
if ~isempty(opts.averageImage) && numel(opts.averageImage)==2
  if ~isempty(opts.rgbVariance)
    % BUG FIX: flow data has 2 channels, so the variance jitter must be
    % reshaped to 1x1x2; the previous reshape(...,1,1,3) errors on the
    % 2-element vector produced by opts.rgbVariance * randn(2,1).
    imo = bsxfun(@minus, imo, opts.averageImage+reshape(opts.rgbVariance * randn(2,1), 1,1,2)) ;
  else
    imo = bsxfun(@minus, imo, opts.averageImage) ;
  end
end

================================================ FILE: dicnn/cnn_video_rgb_get_batch.m ================================================

function imo = cnn_video_rgb_get_batch(images, vids, varargin)
% CNN_VIDEO_RGB_GET_BATCH Load, preprocess, and pack images for CNN evaluation
% video ids
% use same spatial jittering for frames from the same video
% NOTE: all the frames from a video should have the same size (wxh)
opts.imageSize = [227, 227] ;
opts.border = [29, 29] ;
opts.keepAspect = true ;
opts.numAugments = 1 ;
opts.transformation = 'none' ;
opts.averageImage = [] ;
opts.rgbVariance = zeros(0,3,'single') ;
opts.interpolation = 'bilinear' ;
opts.numThreads = 1 ;
opts.prefetch = false ;
opts.subMean = false ; % subtract the mean from each video
opts.lazyResize = true ;
opts = vl_argparse(opts, varargin);

% fetch is true if images is a list of filenames (instead of
% a cell array of images)
fetch = numel(images) >= 1 && ischar(images{1}) ;
% prefetch is used to load images in a separate thread
prefetch = fetch & opts.prefetch ;
if prefetch
  vl_imreadjpeg(images, 'numThreads', opts.numThreads, 'prefetch') ;
  imo = [] ;
  return ;
end
if fetch
  im = vl_imreadjpeg(images,'numThreads', opts.numThreads) ;
else
  im = images ;
end

tfs = [] ;
switch opts.transformation
  case 'none'
    tfs = [ .5 ; .5 ; 0 ] ;
  case 'f5'
    tfs = [...
      .5 0 0 1 1 .5 0 0 1 1 ;
      .5 0 1 0 1 .5 0 1 0 1 ;
      0 0 0 0 0 1 1 1 1 1] ;
  case 'f25'
    [tx,ty] = meshgrid(linspace(0,1,5)) ;
    tfs = [tx(:)' ; ty(:)' ; zeros(1,numel(tx))] ;
    tfs_ = tfs ;
    tfs_(3,:) = 1 ;
    tfs = [tfs,tfs_] ;
  case 'stretch'
  case 'multiScaleRegular'
  otherwise
    error('Uknown transformations %s', opts.transformation) ;
end
[~,transformations] = sort(rand(size(tfs,2), numel(images)), 1) ;

if ~isempty(opts.rgbVariance) && isempty(opts.averageImage)
  opts.averageImage = zeros(1,1,3) ;
end
if numel(opts.averageImage) == 3
  opts.averageImage = reshape(opts.averageImage, 1,1,3) ;
end

% Output tensor: one RGB slot per input frame.
imo = zeros(opts.imageSize(1), opts.imageSize(2), 3, ...
  numel(images)*opts.numAugments, 'single') ;

nVid = max(vids);
si = 1 ;
countv = 1;
for v=1:nVid
  vid = find(vids==v);
  for i=1:numel(images(vid))
    % acquire image
    if isempty(im{i})
      imt = imread(images{vid(i)}) ;
      imt = single(imt) ; % faster than im2single (and multiplies by 255)
    else
      imt = im{vid(i)} ;
    end
    % Replicate grayscale frames to three channels.
    if size(imt,3) == 1
      imt = cat(3, imt, imt, imt) ;
    end

    % resize
    w = size(imt,2) ;
    h = size(imt,1) ;
    factor = [(opts.imageSize(1)+opts.border(1))/h ...
      (opts.imageSize(2)+opts.border(2))/w];
    if opts.keepAspect
      factor = max(factor) ;
    end
    if any(abs(factor - 1) > 0.0001)
      imt = imresize(imt, ...
        'scale', factor, ...
        'method', opts.interpolation) ;
    end

    % crop & flip
    % Jitter parameters are sampled only once per video (i==1) so all
    % frames of the same video share the same crop and flip.
    if i==1
      w = size(imt,2) ;
      h = size(imt,1) ;
      switch opts.transformation
        case 'stretch'
          sz = round(min(opts.imageSize(1:2)' .* (1-0.1+0.2*rand(2,1)), [w;h])) ;
          dx = randi(w - sz(2) + 1, 1) ;
          dy = randi(h - sz(1) + 1, 1) ;
          flip = rand > 0.5 ;
        case 'multiScaleRegular'
          % Regular multi-scale crops: 4 corners + centre at 4 sizes.
          reg_szs = [256, 224, 192, 168] ;
          sz(1) = reg_szs(randi(4));
          sz(2) = reg_szs(randi(4));
          dy = [0 h-sz(1) 0 h-sz(1) floor((h-sz(1)+1)/2)] + 1;
          dx = [0 w-sz(2) w-sz(2) 0 floor((w-sz(2)+1)/2)] + 1;
          corner = randi(5);
          dx = dx(corner);
          dy = dy(corner);
          flip = rand > 0.5 ;
        otherwise
          tf = tfs(:, transformations(mod(0, numel(transformations)) + 1)) ;
          sz = opts.imageSize(1:2) ;
          dx = floor((w - sz(2)) * tf(2)) + 1 ;
          dy = floor((h - sz(1)) * tf(1)) + 1 ;
          flip = tf(3) ;
      end
    end
    if opts.lazyResize
      % Crop-by-sampling: index grids instead of an explicit resize.
      sx = round(linspace(dx, sz(2)+dx-1, opts.imageSize(2))) ;
      sy = round(linspace(dy, sz(1)+dy-1, opts.imageSize(1))) ;
    else
      factor = [opts.imageSize(1)/sz(1) ...
        opts.imageSize(2)/sz(2)];
      if any(abs(factor - 1) > 0.0001)
        imt = imresize(gather(imt(dy:sz(1)+dy-1,dx:sz(2)+dx-1,:)), ...
          opts.imageSize(1:2), 'Antialiasing', false, ...
          'Method', opts.interpolation);
      end
      sx = 1:opts.imageSize(2);
      sy = 1:opts.imageSize(1);
    end
    if flip
      sx = fliplr(sx) ;
    end
    imo(:,:,:,si) = imt(sy,sx,:) ;
    si = si + 1 ;
  end
  countv = countv + numel(images(vid));
end

% Subtract the mean image, optionally jittered with the PCA-style RGB
% variance (presumably opts.rgbVariance is 3x3 when supplied -- TODO confirm).
if ~isempty(opts.averageImage) && numel(opts.averageImage)==3
  if ~isempty(opts.rgbVariance)
    imo = bsxfun(@minus, imo, opts.averageImage+reshape(opts.rgbVariance * randn(3,1), 1,1,3)) ;
  else
    imo = bsxfun(@minus, imo, opts.averageImage) ;
  end
end

================================================ FILE: dicnn/compute_approximate_dynamic_images.m ================================================

function di = compute_approximate_dynamic_images(images)
% Computes approximate dynamic images for a given array of images
% IMAGES must be a tensor of H x W x D x N dimensionality or
% cell of image names
% For the exact dynamic images, use the code
% http://users.cecs.anu.edu.au/~basura/dynamic_images/code.zip
% Explained here http://arxiv.org/abs/1512.01848
if isempty(images)
  di = [] ;
  return ;
end
% A cell array of file names is loaded and stacked along the 4th dimension.
if iscell(images)
  imagesA = cell(1,numel(images)) ;
  for i=1:numel(images)
    if ~ischar(images{i})
      error('images must be an array of images or cell of image names') ;
    end
    imagesA{i} = imread(images{i}) ;
  end
  images = cat(4,imagesA{:}) ;
end
N = size(images,4) ;
% All N frames belong to a single video, i.e. one pooling segment.
di = vl_nnarpooltemporal(single(images),ones(1,N)) ;

================================================ FILE: dicnn/visualize_approximate_dynamic_images.m ================================================

function visualize_approximate_dynamic_images(images)
% VISUALIZE_DYNAMIC_IMAGES
% Compute the approximate dynamic image, rescale it to [0,255] and display.
di = compute_approximate_dynamic_images(images) ;
di = di - min(di(:)) ;
di = 255 * di ./ max(di(:)) ;
image(uint8(di)) ;

================================================ FILE: main_train.m ================================================

model = 'resnext50' ; % {'cafferef','resnext50','resnext101'}
input = 'rgb' ; % {'rgb','of'}
dataset = 'ucf101' ; % {'ucf101','hmdb51'} hmdb51 requires more iterations to train (add more epochs to learning
% rate)
opts.train.batchSize = 128 ;
opts.train.numSubBatches = 32 ; % increase the number (16,32) if it does not fit into gpu mem
opts.epochFactor = 5 ;
opts.split = 1 ;
opts.train.gpus = 1 ;

run matconvnet/matlab/vl_setupnn.m ;
vl_contrib install mcnExtraLayers ;
vl_contrib setup mcnExtraLayers ;
vl_contrib install autonn ;
vl_contrib setup autonn ;
% addpath(fullfile('matconvnet','contrib','mcnExtraLayers','matlab')) ;

% NOTE(review): expDir hard-codes 'rgb' even when input=='of' -- confirm
% whether optical-flow experiments should write to a separate directory.
opts.expDir = ['exp/' model 'rgb-arpool-split' num2str(opts.split)] ;

if strcmp(input,'rgb')
  opts.DropOutRate = 0.5 ;
  trainfn = @cnn_dicnn_rgb ;
elseif strcmp(input,'of')
  opts.DropOutRate = 0.8 ;
  trainfn = @cnn_dicnn_of ;
end

if strcmp(model,'cafferef')
  opts.pool1Layer = 'conv1' ;
  % download from http://www.vlfeat.org/matconvnet/models/imagenet-caffe-ref.mat
  opts.modelPath = fullfile('models','imagenet-caffe-ref.mat') ;
  opts.networkFn = @cnn_init_cafferef ;
  if strcmp(input,'rgb')
    opts.train.learningRate = 1e-3 * [ones(1,2) 0.1*ones(1,2)] ;
  else
    opts.train.learningRate = 3e-3 * [ones(1,10) 0.1*ones(1,2)] ;
  end
  opts.train.numEpochs = numel(opts.train.learningRate) ;
elseif strcmp(model,'resnext50') || strcmp(model,'resnext101')
  % download from http://www.robots.ox.ac.uk/~albanie/models/pytorch-imports/resnext_50_32x4d-pt-mcn.mat
  % download from http://www.robots.ox.ac.uk/~albanie/models/pytorch-imports/resnext_101_32x4d-pt-mcn.mat
  if strcmp(model,'resnext50')
    opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat') ;
  else
    opts.modelPath = fullfile('models','resnext_101_32x4d-pt-mcn.mat') ;
  end
  % BUG FIX: removed an unconditional overwrite that reset modelPath to the
  % resnext50 weights, which made the resnext101 branch load the wrong model.
  opts.networkFn = @cnn_init_resnext ;
  if strcmp(input,'rgb')
    opts.train.learningRate = 1e-2 * [ones(1,2) 0.1*ones(1,8) ] ;
  else
    opts.train.learningRate = 1e-2 * [ones(1,2) 0.1*ones(1,2) ] ;
  end
end

addpath dicnn ;
[net, info] = trainfn(opts)

================================================ FILE: utils/extract_frames.sh ================================================

# !/bin/bash
# This
# script converts videos into frames
# for different fps change (-r 1)
for f in *.avi
do
  # Strip the .avi suffix to name the per-video frame directory.
  g=$(basename "$f" .avi)
  echo "Processing $f"
  mkdir -p "frames/$g/"
  # BUG FIX: quote "$f" and the output path so file names containing
  # spaces or shell metacharacters do not break the loop.
  ffmpeg -i "$f" "frames/$g/image-%04d.jpeg"
done